datachain 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +17 -8
- datachain/catalog/catalog.py +5 -5
- datachain/cli.py +0 -2
- datachain/data_storage/schema.py +5 -5
- datachain/data_storage/sqlite.py +1 -1
- datachain/data_storage/warehouse.py +7 -7
- datachain/lib/arrow.py +25 -8
- datachain/lib/clip.py +6 -11
- datachain/lib/convert/__init__.py +0 -0
- datachain/lib/convert/flatten.py +67 -0
- datachain/lib/convert/type_converter.py +96 -0
- datachain/lib/convert/unflatten.py +69 -0
- datachain/lib/convert/values_to_tuples.py +85 -0
- datachain/lib/data_model.py +74 -0
- datachain/lib/dc.py +192 -167
- datachain/lib/feature_registry.py +36 -10
- datachain/lib/file.py +41 -41
- datachain/lib/gpt4_vision.py +1 -9
- datachain/lib/hf_image_to_text.py +9 -17
- datachain/lib/hf_pipeline.py +4 -12
- datachain/lib/image.py +2 -18
- datachain/lib/image_transform.py +0 -1
- datachain/lib/iptc_exif_xmp.py +8 -15
- datachain/lib/meta_formats.py +1 -5
- datachain/lib/model_store.py +77 -0
- datachain/lib/pytorch.py +9 -21
- datachain/lib/signal_schema.py +120 -58
- datachain/lib/text.py +5 -16
- datachain/lib/udf.py +114 -30
- datachain/lib/udf_signature.py +5 -5
- datachain/lib/webdataset.py +3 -4
- datachain/lib/webdataset_laion.py +2 -3
- datachain/node.py +4 -4
- datachain/query/batch.py +1 -1
- datachain/query/dataset.py +40 -60
- datachain/query/dispatch.py +28 -17
- datachain/query/udf.py +46 -26
- datachain/remote/studio.py +1 -9
- datachain/torch/__init__.py +21 -0
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/METADATA +13 -12
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/RECORD +45 -42
- datachain/image/__init__.py +0 -3
- datachain/lib/cached_stream.py +0 -38
- datachain/lib/claude.py +0 -69
- datachain/lib/feature.py +0 -412
- datachain/lib/feature_utils.py +0 -154
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/LICENSE +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/WHEEL +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/top_level.txt +0 -0
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
datachain/__init__.py,sha256=
|
|
1
|
+
datachain/__init__.py,sha256=L5IlHOD4AaHkV7P5dbUwdq90I3bGFLtOghoZ1WVFGcs,841
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
|
|
4
4
|
datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
|
|
5
|
-
datachain/cli.py,sha256=
|
|
5
|
+
datachain/cli.py,sha256=gikzwEXTDKyzY1xOAUziXN2-OVqnOhDMJTd7SHq0Jxc,32406
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
8
|
datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
|
|
9
9
|
datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
|
|
10
10
|
datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
|
|
11
|
-
datachain/node.py,sha256=
|
|
11
|
+
datachain/node.py,sha256=fsQDJUmRMSRHhL1u6qQlWgreHbH760Ls-yDzFLhbW-U,5724
|
|
12
12
|
datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
|
|
13
13
|
datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
|
|
14
14
|
datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
|
|
@@ -16,7 +16,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
16
16
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
17
17
|
datachain/utils.py,sha256=12yQAV8tfyCHqp_xJcJBeNnr1L_BO8e2bOPyXdM68gs,10759
|
|
18
18
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
19
|
-
datachain/catalog/catalog.py,sha256=
|
|
19
|
+
datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
|
|
20
20
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
21
21
|
datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
|
|
22
22
|
datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
|
|
@@ -32,51 +32,53 @@ datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-
|
|
|
32
32
|
datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
|
|
33
33
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
34
34
|
datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
|
|
35
|
-
datachain/data_storage/schema.py,sha256=
|
|
35
|
+
datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
|
|
36
36
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
37
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
38
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
39
|
-
datachain/image/__init__.py,sha256=g3l7vJFzg0-s5OAmBtGargsxt12TuKU4Ex6S0fOmEeY,101
|
|
37
|
+
datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
|
|
38
|
+
datachain/data_storage/warehouse.py,sha256=UbD37_jqaM4BY2SsQaTiJre-eSa7HcPejrTp936L080,33170
|
|
40
39
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
-
datachain/lib/arrow.py,sha256=
|
|
42
|
-
datachain/lib/
|
|
43
|
-
datachain/lib/
|
|
44
|
-
datachain/lib/
|
|
45
|
-
datachain/lib/
|
|
46
|
-
datachain/lib/
|
|
47
|
-
datachain/lib/
|
|
48
|
-
datachain/lib/
|
|
49
|
-
datachain/lib/
|
|
50
|
-
datachain/lib/
|
|
51
|
-
datachain/lib/
|
|
52
|
-
datachain/lib/
|
|
53
|
-
datachain/lib/
|
|
54
|
-
datachain/lib/
|
|
55
|
-
datachain/lib/
|
|
56
|
-
datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
|
|
57
|
-
datachain/lib/pytorch.py,sha256=Ea1sXhborF6zcywQjLpXgKnbr1lTez4Bfu3m0Gr78FI,5843
|
|
40
|
+
datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
|
|
41
|
+
datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
|
|
42
|
+
datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
|
|
43
|
+
datachain/lib/dc.py,sha256=Px7zj1mrAsL3sBLu1pezssBQkvY0YAoGJm4VbT2yRwc,34699
|
|
44
|
+
datachain/lib/feature_registry.py,sha256=LUrBvDom-k1shFuCv46-OdgntbIUQ5008oyIS0iPM6Q,2298
|
|
45
|
+
datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
|
|
46
|
+
datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
|
|
47
|
+
datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
|
|
48
|
+
datachain/lib/hf_pipeline.py,sha256=MBFzixVa25_6QVR9RyOq8Rr9UIQ-sFVcBHducx_sZcY,2069
|
|
49
|
+
datachain/lib/image.py,sha256=K0n_P7kmobWTgxe-rDbr5yY3vBrOPnseziE3DXwFFVo,2325
|
|
50
|
+
datachain/lib/image_transform.py,sha256=hfgvIrSMGBx_MEXECyvrFoO1NyPBHoDb28j2lT2dsf8,2953
|
|
51
|
+
datachain/lib/iptc_exif_xmp.py,sha256=rmlxjOmAP31OCgbGBAwIgd1F_6QVBoSWsOPG6UsBg_w,2007
|
|
52
|
+
datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU,6436
|
|
53
|
+
datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
|
|
54
|
+
datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
|
|
58
55
|
datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
|
|
59
|
-
datachain/lib/signal_schema.py,sha256=
|
|
60
|
-
datachain/lib/text.py,sha256=
|
|
61
|
-
datachain/lib/udf.py,sha256=
|
|
62
|
-
datachain/lib/udf_signature.py,sha256=
|
|
56
|
+
datachain/lib/signal_schema.py,sha256=xzVHauGrhGcS5aOE1UMqT5YjJeZIMAZYQq76tZShhnY,13550
|
|
57
|
+
datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
|
|
58
|
+
datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
|
|
59
|
+
datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
|
|
63
60
|
datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
|
|
64
61
|
datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
|
|
65
62
|
datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
|
-
datachain/lib/webdataset.py,sha256=
|
|
67
|
-
datachain/lib/webdataset_laion.py,sha256=
|
|
63
|
+
datachain/lib/webdataset.py,sha256=eqIDSqfBOhEK43JMp-6lYdYy2x3Ge5lwqR-hKGV8aG0,8259
|
|
64
|
+
datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
|
|
65
|
+
datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
|
+
datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
|
|
67
|
+
datachain/lib/convert/type_converter.py,sha256=W-wvCIcb6OwWjRJ3EWJE4-LbpoqxsRBd6gYNpFlm8qo,2643
|
|
68
|
+
datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
|
|
69
|
+
datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0QEckwXlKgFbLA,3088
|
|
68
70
|
datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
|
|
69
|
-
datachain/query/batch.py,sha256=
|
|
71
|
+
datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
|
|
70
72
|
datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
|
|
71
|
-
datachain/query/dataset.py,sha256=
|
|
72
|
-
datachain/query/dispatch.py,sha256=
|
|
73
|
+
datachain/query/dataset.py,sha256=Pmaz16phEummJpWJD3x-8SMMbCb6xcOtWTyMdsFOdOE,64414
|
|
74
|
+
datachain/query/dispatch.py,sha256=Qv5QpP5-K9JAmZLntifRzS5_AYHbK82Ahreo302Ntq8,13218
|
|
73
75
|
datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
|
|
74
76
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
75
77
|
datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
|
|
76
78
|
datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
|
|
77
|
-
datachain/query/udf.py,sha256=
|
|
79
|
+
datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
|
|
78
80
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
79
|
-
datachain/remote/studio.py,sha256=
|
|
81
|
+
datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
|
|
80
82
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
81
83
|
datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
|
|
82
84
|
datachain/sql/types.py,sha256=BzUm0nCcMPASvdqpQouX5bdVcK3G3DBfeeNhau7X_hA,10234
|
|
@@ -94,9 +96,10 @@ datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,
|
|
|
94
96
|
datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
|
|
95
97
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
96
98
|
datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
|
|
97
|
-
datachain
|
|
98
|
-
datachain-0.2.
|
|
99
|
-
datachain-0.2.
|
|
100
|
-
datachain-0.2.
|
|
101
|
-
datachain-0.2.
|
|
102
|
-
datachain-0.2.
|
|
99
|
+
datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
|
|
100
|
+
datachain-0.2.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
101
|
+
datachain-0.2.10.dist-info/METADATA,sha256=bWvqTD9c2joLmkDGpdcutjjF_s1LpccbSCLbkIaKQYQ,16732
|
|
102
|
+
datachain-0.2.10.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
103
|
+
datachain-0.2.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
104
|
+
datachain-0.2.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
105
|
+
datachain-0.2.10.dist-info/RECORD,,
|
datachain/image/__init__.py
DELETED
datachain/lib/cached_stream.py
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
from abc import ABC
|
|
2
|
-
from contextlib import AbstractContextManager
|
|
3
|
-
|
|
4
|
-
from datachain.cache import UniqueId
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class AbstractCachedStream(AbstractContextManager, ABC):
    """Shared state for context-manager streams bound to a catalog entry.

    Keeps the catalog, the entry's unique id and a mode string that starts
    out as "rb" and can be replaced through `set_mode`.
    """

    def __init__(self, catalog, uid: UniqueId):
        self.catalog, self.uid = catalog, uid
        self.mode = "rb"

    def set_mode(self, mode):
        """Replace the stored mode string with `mode`."""
        self.mode = mode
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class PreCachedStream(AbstractCachedStream):
    """Stream that downloads its entry first and then opens it from the cache."""

    def __init__(self, catalog, uid: UniqueId):
        super().__init__(catalog, uid)
        # The client is looked up from the catalog by the uid's storage.
        self.client = self.catalog.get_client(self.uid.storage)
        # Set by `__enter__`; stays None until the stream is entered.
        self.cached_file = None

    def get_path_in_cache(self):
        """Return the cache path computed from the hash of `self.uid`."""
        cache = self.catalog.cache
        return cache.path_from_checksum(self.uid.get_hash())

    def __enter__(self):
        """Download the entry, open the cached copy and return the file object."""
        self.client.download(self.uid)
        path = self.get_path_in_cache()
        handle = open(path, self.mode)
        self.cached_file = handle
        return handle

    def __exit__(self, *args):
        """Close the file object opened by `__enter__`."""
        self.cached_file.close()
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class PreDownloadStream(PreCachedStream):
    """Variant of `PreCachedStream` that removes the cache entry on exit."""

    def __exit__(self, *args):
        """Close the file via the parent class, then remove `self.uid` from
        the catalog cache.

        The removal runs in a `finally` clause so the cache entry is not left
        behind when closing the file raises; the original exception still
        propagates.
        """
        try:
            super().__exit__(*args)
        finally:
            self.catalog.cache.remove(self.uid)
|
datachain/lib/claude.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from typing import Callable, Literal, Optional
|
|
3
|
-
|
|
4
|
-
import anthropic
|
|
5
|
-
|
|
6
|
-
from datachain.lib.feature import Feature
|
|
7
|
-
from datachain.lib.file import File
|
|
8
|
-
|
|
9
|
-
# Model identifier constant. `claude_processor` repeats the same literal as the
# default of its `model` parameter rather than referencing this name.
default_model_name = "claude-3-haiku-20240307"
|
|
10
|
-
# Output-token constant. `claude_processor` hard-codes the same value (1024) as
# the default of its `max_tokens` parameter rather than referencing this name.
DEFAULT_OUTPUT_TOKENS = 1024
|
|
11
|
-
|
|
12
|
-
# These classes can be auto-generated:
|
|
13
|
-
# >> from anthropic.types.message import Message
|
|
14
|
-
# >> ClaudeMessage = pydantic_to_feature(Message)
|
|
15
|
-
# However, auto-generated pydantic classes do not work in multithreading mode.
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class UsageFr(Feature):
    # Token counters; used as the type of `ClaudeMessage.usage`.
    # Both fields default to zero.
    input_tokens: int = 0
    output_tokens: int = 0
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class TextBlockFr(Feature):
    # One content block; `ClaudeMessage.content` is a list of these.
    text: str = ""
    # Block kind tag, "text" unless overridden.
    type: str = "text"
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class ClaudeMessage(Feature):
    # Result type of the callable built by `claude_processor`, which fills it
    # from `response.model_dump()` or, on any exception, with `content=[]`.
    # `content` is the only field without a default.
    id: str = ""
    content: list[TextBlockFr]
    model: str = ""
    role: str = ""
    stop_reason: Optional[Literal["end_turn", "max_tokens", "stop_sequence"]] = None
    stop_sequence: Optional[str] = None
    type: Literal["message"] = "message"
    usage: UsageFr = UsageFr()
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def claude_processor(
    prompt: str,
    messages: Optional[list] = None,
    model: str = "claude-3-haiku-20240307",
    api_key: Optional[str] = "",
    max_retries: int = 5,
    temperature: float = 0.9,
    max_tokens: int = 1024,
    **kwargs,
) -> Callable:
    """Build a callable that sends one input through `anthropic` messages.

    Args:
        prompt: Passed as the `system` argument of `messages.create`.
        messages: Extra messages appended after the generated user message;
            a falsy value means none.
        model: Passed as `model` to `messages.create`.
        api_key: Key for the client; when falsy, the `ANTHROPIC_API_KEY`
            environment variable is used instead.
        max_retries: Passed to the `anthropic.Anthropic` constructor.
        temperature: Passed as `temperature` to `messages.create`.
        max_tokens: Passed as `max_tokens` to `messages.create`.
        **kwargs: Forwarded unchanged to `messages.create`.

    Returns:
        A function taking a `File` (its `get_value()` is used) or a raw value
        and returning a `ClaudeMessage`.
    """
    extra_messages = messages or []
    resolved_key = api_key or os.environ.get("ANTHROPIC_API_KEY")

    def claude_func(file) -> ClaudeMessage:
        try:
            if isinstance(file, File):
                data = file.get_value()
            else:
                data = file
            # A new client is created for every call.
            client = anthropic.Anthropic(
                api_key=resolved_key, max_retries=max_retries
            )
            conversation = [{"role": "user", "content": data}, *extra_messages]
            response = client.messages.create(
                model=model,
                system=prompt,
                messages=conversation,
                temperature=temperature,
                max_tokens=max_tokens,
                **kwargs,
            )
            return ClaudeMessage(**response.model_dump())
        except Exception:  # noqa: BLE001
            # Best effort: any failure yields a message with empty content.
            return ClaudeMessage(content=[])

    return claude_func
|
datachain/lib/feature.py
DELETED
|
@@ -1,412 +0,0 @@
|
|
|
1
|
-
import copy
|
|
2
|
-
import inspect
|
|
3
|
-
import re
|
|
4
|
-
import warnings
|
|
5
|
-
from collections.abc import Iterable, Sequence
|
|
6
|
-
from datetime import datetime
|
|
7
|
-
from enum import Enum
|
|
8
|
-
from functools import lru_cache
|
|
9
|
-
from types import GenericAlias
|
|
10
|
-
from typing import (
|
|
11
|
-
TYPE_CHECKING,
|
|
12
|
-
Any,
|
|
13
|
-
ClassVar,
|
|
14
|
-
Literal,
|
|
15
|
-
Union,
|
|
16
|
-
get_args,
|
|
17
|
-
get_origin,
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
import attrs
|
|
21
|
-
import numpy as np
|
|
22
|
-
import pandas as pd
|
|
23
|
-
from pydantic import BaseModel
|
|
24
|
-
from typing_extensions import Literal as LiteralEx
|
|
25
|
-
|
|
26
|
-
from datachain.lib.feature_registry import Registry
|
|
27
|
-
from datachain.query import C
|
|
28
|
-
from datachain.query.schema import DEFAULT_DELIMITER
|
|
29
|
-
from datachain.sql.types import (
|
|
30
|
-
JSON,
|
|
31
|
-
Array,
|
|
32
|
-
Binary,
|
|
33
|
-
Boolean,
|
|
34
|
-
DateTime,
|
|
35
|
-
Float,
|
|
36
|
-
Int,
|
|
37
|
-
Int32,
|
|
38
|
-
Int64,
|
|
39
|
-
NullType,
|
|
40
|
-
SQLType,
|
|
41
|
-
String,
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
if TYPE_CHECKING:
|
|
45
|
-
from datachain.catalog import Catalog
|
|
46
|
-
|
|
47
|
-
# Plain Python types (wrapped in `type[...]`) that `Feature.is_standard_type`
# checks against.
FeatureStandardType = Union[
    type[int],
    type[str],
    type[float],
    type[bool],
    type[list],
    type[dict],
    type[bytes],
    type[datetime],
]

FeatureType = Union[type["Feature"], FeatureStandardType]
FeatureTypeNames = "Feature, int, str, float, bool, list, dict, bytes, datetime"


# Python type -> SQL type, in the same key order as before.
TYPE_TO_DATACHAIN = {
    int: Int64,
    # All string-like markers share the String column type.
    **dict.fromkeys((str, Literal, LiteralEx, Enum), String),
    float: Float,
    bool: Boolean,
    datetime: DateTime,  # Note, list of datetime is not supported yet
    bytes: Binary,  # Note, list of bytes is not supported yet
    list: Array,
    dict: JSON,
}

# SQL type -> Python type.
DATACHAIN_TO_TYPE = {
    # Every integer width maps back to `int`.
    **dict.fromkeys((Int, Int32, Int64), int),
    String: str,
    Float: float,
    Boolean: bool,
    DateTime: datetime,
    Binary: bytes,
    Array(NullType): list,
    JSON: dict,
}


# numpy / pandas dtype -> SQL type.
NUMPY_TO_DATACHAIN = {
    # int8..int64 followed by uint8..uint64.
    **{
        np.dtype(f"{sign}int{bits}"): Int
        for sign in ("", "u")
        for bits in (8, 16, 32, 64)
    },
    **{np.dtype(f"float{bits}"): Float for bits in (16, 32, 64)},
    np.dtype("object"): String,
    pd.CategoricalDtype(): String,
}


# Disable Pydantic warning, see https://github.com/iterative/dvcx/issues/1285
warnings.filterwarnings(
    action="ignore",
    message="Field name .* shadows an attribute in parent",
    category=UserWarning,
)


# Optimization: results of `Feature.is_feature` for classes are memoized here
# so the subclass checks can be skipped within loops.
feature_classes_lookup: dict[type, bool] = {}
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
class Feature(BaseModel):
    """A base class for defining data classes that serve as inputs and outputs for
    DataChain processing functions like `map()`, `gen()`, etc. Inherits from
    `pydantic`'s BaseModel.
    """

    # When True, `get_file_signals()` yields the path of this class.
    _is_file: ClassVar[bool] = False
    # Version number appended to the class name by `_name()`.
    _version: ClassVar[int] = 1

    @classmethod
    def _is_hidden(cls):
        """Return True when the class name starts with an underscore."""
        return cls.__name__.startswith("_")

    def get_value(self, *args: Any, **kwargs: Any) -> Any:
        """Raise NotImplementedError; subclasses that carry a value override it."""
        name = self.__class__.__name__
        raise NotImplementedError(f"get_value() is not defined for feature '{name}'")

    @classmethod
    def _name(cls) -> str:
        """Return the versioned name in the form `ClassName@version`."""
        return f"{cls.__name__}@{cls._version}"

    @classmethod
    def __pydantic_init_subclass__(cls):
        """Register the subclass and install one class-level attribute per field.

        Each field name is bound to a `RestrictedAttribute` wrapping the value
        produced by `_resolve`, so `MyClass.field` resolves on the class only.
        """
        Registry.add(cls)
        for name, field_info in cls.model_fields.items():
            attr_value = _resolve(cls, name, field_info, prefix=[])
            setattr(cls, name, RestrictedAttribute(attr_value, cls, name))

    @classmethod
    def _prefix(cls) -> str:
        """Return the normalized (snake_case) form of the class name."""
        return cls._normalize(cls.__name__)

    @classmethod
    def _normalize(cls, name: str) -> str:
        """Convert `name` to snake_case.

        Raises:
            RuntimeError: if `name` contains `DEFAULT_DELIMITER`.
        """
        if DEFAULT_DELIMITER in name:
            raise RuntimeError(
                f"variable '{name}' cannot be used "
                f"because it contains {DEFAULT_DELIMITER}"
            )
        return Feature._to_snake_case(name)

    @staticmethod
    def _to_snake_case(name: str) -> str:
        """Convert a CamelCase name to snake_case."""
        s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
        return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()

    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
        """Do nothing in the base class."""

    @classmethod
    def get_file_signals(cls, path: list[str]) -> Iterable[list[str]]:
        """Yield the field paths, starting from `path`, whose class has
        `_is_file` set, descending into fields annotated with a Feature class.
        """
        if cls._is_file:
            yield path

        for name, f_info in cls.model_fields.items():
            anno = f_info.annotation
            if Feature.is_feature(anno):
                yield from anno.get_file_signals([*path, name])  # type: ignore[union-attr]

    @classmethod
    def is_feature(cls, anno) -> bool:
        """Return True when `anno` is a class derived from `Feature`.

        Results for classes are memoized in `feature_classes_lookup`.
        """
        if anno in feature_classes_lookup:
            # Optimization: Skip expensive subclass checks if already checked.
            return feature_classes_lookup[anno]
        is_class = inspect.isclass(anno)
        result = (
            is_class
            and not isinstance(anno, GenericAlias)
            and issubclass(anno, Feature)
        )
        if is_class:
            # Only cache types in the feature classes lookup dict (not instances).
            feature_classes_lookup[anno] = result
        return result

    @classmethod
    def is_standard_type(cls, t: type) -> bool:
        """Return True when `t` is a member of `FeatureStandardType` or the
        plain type that member wraps.
        """
        return any(
            t is ft or t is get_args(ft)[0] for ft in get_args(FeatureStandardType)
        )

    @classmethod
    def is_feature_type(cls, t: type) -> bool:
        """Return True for standard types, Feature classes, and single-argument
        `list[...]` of either (checked recursively).
        """
        if cls.is_standard_type(t):
            return True
        if get_origin(t) is list and len(get_args(t)) == 1:
            return cls.is_feature_type(get_args(t)[0])
        return cls.is_feature(t)

    def _flatten_fields_values(self, fields, model):
        """Yield the values of `model` for `fields` in order, expanding fields
        annotated with a Feature class in place.

        List and dict values are yielded as one item each, with contained
        Feature instances replaced by their `model_dump()`.
        """
        for name, f_info in fields.items():
            anno = f_info.annotation
            # Optimization: Access attributes directly to skip the model_dump() call.
            value = getattr(model, name)

            if isinstance(value, list):
                yield [
                    val.model_dump() if Feature.is_feature(type(val)) else val
                    for val in value
                ]
            elif isinstance(value, dict):
                yield {
                    key: val.model_dump() if Feature.is_feature(type(val)) else val
                    for key, val in value.items()
                }
            elif Feature.is_feature(anno):
                yield from self._flatten_fields_values(anno.model_fields, value)
            else:
                yield value

    def _flatten(self):
        """Return this instance's flattened field values as a tuple."""
        return tuple(self._flatten_fields_values(self.model_fields, self))

    @staticmethod
    def _flatten_list(objs):
        """Return the flattened values of every object in `objs` as one tuple."""
        return tuple(
            val
            for obj in objs
            for val in obj._flatten_fields_values(obj.model_fields, obj)
        )

    @classmethod
    def _unflatten_with_path(cls, dump, name_path: list[str]):
        """Rebuild an instance from `dump`, a mapping keyed by delimiter-joined
        normalized field paths, with `name_path` as the path of this object.
        """
        res = {}
        for name, f_info in cls.model_fields.items():
            anno = f_info.annotation
            path = [*name_path, cls._normalize(name)]

            # `is_feature` carries the GenericAlias guard; a bare
            # `isclass(anno) and issubclass(anno, Feature)` raises TypeError
            # for annotations such as `list[int]` on Python 3.9/3.10.
            if Feature.is_feature(anno):
                res[name] = anno._unflatten_with_path(dump, path)
            else:
                res[name] = dump[DEFAULT_DELIMITER.join(path)]
        return cls(**res)

    @classmethod
    def _unflatten(cls, dump):
        """Rebuild an instance from `dump` starting at the root path."""
        return cls._unflatten_with_path(dump, [])

    @classmethod
    def _unflatten_to_json(cls, row: Sequence[Any], pos=0) -> dict:
        """Return the nested dict built from `row` starting at index `pos`."""
        return cls._unflatten_to_json_pos(row, pos)[0]

    @classmethod
    def _unflatten_to_json_pos(cls, row: Sequence[Any], pos=0) -> tuple[dict, int]:
        """Build a nested dict from consecutive items of `row`.

        Returns:
            The dict and the index of the first item of `row` not consumed.
        """
        res = {}
        for name, f_info in cls.model_fields.items():
            anno = f_info.annotation
            # Same guard as in `_unflatten_with_path`; this also covers
            # parameterized generics other than list/dict (e.g. tuple[...]).
            if Feature.is_feature(anno):
                res[name], pos = anno._unflatten_to_json_pos(row, pos)
            else:
                res[name] = row[pos]
                pos += 1
        return res, pos

    @classmethod
    @lru_cache(maxsize=1000)
    def build_tree(cls):
        """Return `{field name: (annotation, subtree)}` where `subtree` is the
        nested tree for Feature-typed fields and None otherwise.
        """
        res = {}

        for name, f_info in cls.model_fields.items():
            anno = f_info.annotation
            subtree = anno.build_tree() if Feature.is_feature(anno) else None
            res[name] = (anno, subtree)

        return res
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
class RestrictedAttribute:
    """Descriptor implementing an attribute that can only be accessed through
    the defining class and not from subclasses or instances.

    Since it is a non-data descriptor, instance dicts have precedence over it.
    Cannot be used with slotted classes.
    """

    def __init__(self, value, cls=None, name=None):
        """Store the wrapped `value`.

        `cls` and `name` may be omitted when the descriptor is assigned in a
        class body, where `__set_name__` fills them in.
        """
        self.cls = cls
        self.value = value
        self.name = name

    def __get__(self, instance, owner):
        """Return the wrapped value for class-level access on the defining class.

        Raises:
            AttributeError: when accessed through any other class (subclasses
                included).
            RuntimeError: when accessed through an instance of the defining
                class.
        """
        # `owner` is the class the lookup went through, so its own name belongs
        # in the message; `type(owner)` would name the metaclass instead.
        owner_name = getattr(owner, "__name__", type(owner).__name__)
        if owner is not self.cls:
            raise AttributeError(
                f"'{owner_name}' object has no attribute '{self.name}'"
            )
        if instance is not None:
            raise RuntimeError(
                f"Invalid attempt to access class attribute '{self.name}' through "
                f"'{owner_name}' instance"
            )
        return self.value

    def __set_name__(self, cls, name):
        """Record the defining class and attribute name."""
        self.cls = cls
        self.name = name
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
@attrs.define
class FeatureAttributeWrapper:
    """Intermediate object returned by `_resolve` for Feature-typed fields.

    Attribute access on it resolves the next field of `cls`, extending
    `prefix` by one element each time.
    """

    cls: type[Feature]
    prefix: list[str]

    @property
    def name(self) -> str:
        """The prefix elements joined with `DEFAULT_DELIMITER`."""
        return DEFAULT_DELIMITER.join(self.prefix)

    def __getattr__(self, name):
        field_info = self.cls.model_fields.get(name)
        if not field_info:
            # Not a model field of the wrapped class.
            message = f"'{type(self).__name__}' object has no attribute '{name}'"
            raise AttributeError(message)
        return _resolve(self.cls, name, field_info, prefix=self.prefix)
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
def _resolve(cls, name, field_info, prefix: list[str]):
    """Turn a feature field into something usable in select(), join() and
    similar functions.

    Writing `MyClass.sub_attr1.sub_attr2.field` ends up as a DB column with a
    proper name (with default naming - `my_class__sub_attr1__sub_attr2__field`).

    Fields annotated with a Feature class produce a `FeatureAttributeWrapper`
    for further attribute access; any other field produces a `C` column whose
    type comes from `convert_type_to_datachain`, or `NullType` when that
    raises TypeError.
    """
    anno = field_info.annotation
    full_path = [*prefix, cls._normalize(name)]

    if cls.is_feature(anno):
        return FeatureAttributeWrapper(anno, full_path)

    try:
        sql_type = convert_type_to_datachain(anno)
    except TypeError:
        sql_type = NullType
    return C(DEFAULT_DELIMITER.join(full_path), sql_type)
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
def convert_type_to_datachain(typ):  # noqa: PLR0911
    """Map a Python type annotation to a DataChain SQL type.

    Handled, in order: `SQLType` subclasses (returned as is), `Enum`
    subclasses (`String`), keys of `TYPE_TO_DATACHAIN`, `Literal[...]`
    (`String`), single-argument list/tuple generics (`Array` of the converted
    item type, or `Array(JSON())` for Feature items), dict generics (`JSON`),
    a union of one type with None (the converted type) and the special case
    `Union[dict, list[dict]]` (`JSON`).

    Raises:
        TypeError: if the type is not recognized, or a list/tuple generic
            does not have exactly one type argument.
    """
    import types  # local: the module itself is not imported at file level

    if inspect.isclass(typ):
        if issubclass(typ, SQLType):
            return typ
        if issubclass(typ, Enum):
            # Return the SQL type, matching TYPE_TO_DATACHAIN[Enum]; the
            # Python `str` type is not usable as a column type.
            return String

    res = TYPE_TO_DATACHAIN.get(typ)
    if res:
        return res

    orig = get_origin(typ)

    if orig in (Literal, LiteralEx):
        return String

    args = get_args(typ)
    if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
        if args is None or len(args) != 1:
            raise TypeError(f"Cannot resolve type '{typ}' for flattening features")

        args0 = args[0]
        if Feature.is_feature(args0):
            return Array(JSON())

        next_type = convert_type_to_datachain(args0)
        return Array(next_type)

    if inspect.isclass(orig) and issubclass(dict, orig):
        return JSON

    # `X | None` (Python 3.10+) has origin `types.UnionType`, not `typing.Union`.
    is_union = orig is Union or orig is getattr(types, "UnionType", Union)

    if is_union and len(args) == 2 and (type(None) in args):
        # Convert the non-None member wherever it sits: `Union[None, X]`
        # keeps None first, so `args[0]` alone is not reliable.
        inner = args[1] if args[0] is type(None) else args[0]
        return convert_type_to_datachain(inner)

    # Special case for list in JSON: Union[dict, list[dict]]
    if is_union and len(args) >= 2:
        args_no_nones = [arg for arg in args if arg is not type(None)]
        if len(args_no_nones) == 2:
            args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
            if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
                arg = get_args(args_no_dicts[0])
                if len(arg) == 1 and arg[0] is dict:
                    return JSON

    raise TypeError(f"Cannot recognize type {typ}")
|