datachain 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (50) hide show
  1. datachain/__init__.py +17 -8
  2. datachain/catalog/catalog.py +5 -5
  3. datachain/cli.py +0 -2
  4. datachain/data_storage/schema.py +5 -5
  5. datachain/data_storage/sqlite.py +1 -1
  6. datachain/data_storage/warehouse.py +7 -7
  7. datachain/lib/arrow.py +25 -8
  8. datachain/lib/clip.py +6 -11
  9. datachain/lib/convert/__init__.py +0 -0
  10. datachain/lib/convert/flatten.py +67 -0
  11. datachain/lib/convert/type_converter.py +96 -0
  12. datachain/lib/convert/unflatten.py +69 -0
  13. datachain/lib/convert/values_to_tuples.py +85 -0
  14. datachain/lib/data_model.py +74 -0
  15. datachain/lib/dc.py +192 -167
  16. datachain/lib/feature_registry.py +36 -10
  17. datachain/lib/file.py +41 -41
  18. datachain/lib/gpt4_vision.py +1 -9
  19. datachain/lib/hf_image_to_text.py +9 -17
  20. datachain/lib/hf_pipeline.py +4 -12
  21. datachain/lib/image.py +2 -18
  22. datachain/lib/image_transform.py +0 -1
  23. datachain/lib/iptc_exif_xmp.py +8 -15
  24. datachain/lib/meta_formats.py +1 -5
  25. datachain/lib/model_store.py +77 -0
  26. datachain/lib/pytorch.py +9 -21
  27. datachain/lib/signal_schema.py +120 -58
  28. datachain/lib/text.py +5 -16
  29. datachain/lib/udf.py +114 -30
  30. datachain/lib/udf_signature.py +5 -5
  31. datachain/lib/webdataset.py +3 -4
  32. datachain/lib/webdataset_laion.py +2 -3
  33. datachain/node.py +4 -4
  34. datachain/query/batch.py +1 -1
  35. datachain/query/dataset.py +40 -60
  36. datachain/query/dispatch.py +28 -17
  37. datachain/query/udf.py +46 -26
  38. datachain/remote/studio.py +1 -9
  39. datachain/torch/__init__.py +21 -0
  40. {datachain-0.2.8.dist-info → datachain-0.2.10.dist-info}/METADATA +13 -12
  41. {datachain-0.2.8.dist-info → datachain-0.2.10.dist-info}/RECORD +45 -42
  42. datachain/image/__init__.py +0 -3
  43. datachain/lib/cached_stream.py +0 -38
  44. datachain/lib/claude.py +0 -69
  45. datachain/lib/feature.py +0 -412
  46. datachain/lib/feature_utils.py +0 -154
  47. {datachain-0.2.8.dist-info → datachain-0.2.10.dist-info}/LICENSE +0 -0
  48. {datachain-0.2.8.dist-info → datachain-0.2.10.dist-info}/WHEEL +0 -0
  49. {datachain-0.2.8.dist-info → datachain-0.2.10.dist-info}/entry_points.txt +0 -0
  50. {datachain-0.2.8.dist-info → datachain-0.2.10.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,14 @@
1
- datachain/__init__.py,sha256=WTZQycUOpP1b-Ry_Qje5HH0EE14ptne-ZiQQ5070UMA,798
1
+ datachain/__init__.py,sha256=L5IlHOD4AaHkV7P5dbUwdq90I3bGFLtOghoZ1WVFGcs,841
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
4
4
  datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
5
- datachain/cli.py,sha256=lInqYMhk8YuPY-ZWkfWZmE-ZmdIChJgbs305-a_MWpo,32457
5
+ datachain/cli.py,sha256=gikzwEXTDKyzY1xOAUziXN2-OVqnOhDMJTd7SHq0Jxc,32406
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
8
  datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
9
9
  datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
10
10
  datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
11
- datachain/node.py,sha256=fHe7k5ajI2g2qnzsG-_NQR_T-QdBYctVeEa8c8dsu_Y,5703
11
+ datachain/node.py,sha256=fsQDJUmRMSRHhL1u6qQlWgreHbH760Ls-yDzFLhbW-U,5724
12
12
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
13
13
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
14
14
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
@@ -16,7 +16,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
17
17
  datachain/utils.py,sha256=12yQAV8tfyCHqp_xJcJBeNnr1L_BO8e2bOPyXdM68gs,10759
18
18
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
19
- datachain/catalog/catalog.py,sha256=pulKGJgAmxqSmFqBhA-J0wCKdBqGX4vqpV0cAvV6vUw,79578
19
+ datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
20
20
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
21
21
  datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
22
22
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -32,51 +32,53 @@ datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-
32
32
  datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
33
33
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
34
34
  datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
35
- datachain/data_storage/schema.py,sha256=bY3q2OUaUraos0s5BnwWkhgce8YpeNmIl7M1ifshoes,8074
35
+ datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
36
36
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
37
- datachain/data_storage/sqlite.py,sha256=F68Q_AIqNAObZ5kJ0GnBqRC6e2D2sRehkQo8UzrHgtI,25079
38
- datachain/data_storage/warehouse.py,sha256=h35JiJoCGtwkMctis_x3NHxkwEejX5sIWvJOluZxrOI,33132
39
- datachain/image/__init__.py,sha256=g3l7vJFzg0-s5OAmBtGargsxt12TuKU4Ex6S0fOmEeY,101
37
+ datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
38
+ datachain/data_storage/warehouse.py,sha256=UbD37_jqaM4BY2SsQaTiJre-eSa7HcPejrTp936L080,33170
40
39
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- datachain/lib/arrow.py,sha256=FF3WWUOjB6Prw8ygfiLsrVfrdob0S01lPzEazuGqoO8,2556
42
- datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
43
- datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
44
- datachain/lib/clip.py,sha256=rDeZlFGs0DXBlpmh5ZQJhR9Sz13bWAZGQjfYm1hsUI4,5388
45
- datachain/lib/dc.py,sha256=D3cgib-U0Mo0x5wEK1_NfgymAldHqCvooZwtyohi53Q,34426
46
- datachain/lib/feature.py,sha256=iMwbMyQUyjRUeB-vhAucnx59kNSVvX_xEChTW5B9klY,12244
47
- datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
48
- datachain/lib/feature_utils.py,sha256=2yLdZd9o4AJ5QQX7kqgbCxCT78aT7HE12CLxQ6QRpbc,4982
49
- datachain/lib/file.py,sha256=LGBwC7tFU7VcSWk5kjPpEWPBQas5me69L2uTDNvYXGM,8326
50
- datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
51
- datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
52
- datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
53
- datachain/lib/image.py,sha256=ZYfDqr9p-RRmWBeWFQwXLS1J3vQS616ykfMUvQVpqBY,2717
54
- datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
55
- datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
56
- datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
57
- datachain/lib/pytorch.py,sha256=Ea1sXhborF6zcywQjLpXgKnbr1lTez4Bfu3m0Gr78FI,5843
40
+ datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
41
+ datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
42
+ datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
43
+ datachain/lib/dc.py,sha256=Px7zj1mrAsL3sBLu1pezssBQkvY0YAoGJm4VbT2yRwc,34699
44
+ datachain/lib/feature_registry.py,sha256=LUrBvDom-k1shFuCv46-OdgntbIUQ5008oyIS0iPM6Q,2298
45
+ datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
46
+ datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
47
+ datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
48
+ datachain/lib/hf_pipeline.py,sha256=MBFzixVa25_6QVR9RyOq8Rr9UIQ-sFVcBHducx_sZcY,2069
49
+ datachain/lib/image.py,sha256=K0n_P7kmobWTgxe-rDbr5yY3vBrOPnseziE3DXwFFVo,2325
50
+ datachain/lib/image_transform.py,sha256=hfgvIrSMGBx_MEXECyvrFoO1NyPBHoDb28j2lT2dsf8,2953
51
+ datachain/lib/iptc_exif_xmp.py,sha256=rmlxjOmAP31OCgbGBAwIgd1F_6QVBoSWsOPG6UsBg_w,2007
52
+ datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU,6436
53
+ datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
54
+ datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
58
55
  datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
59
- datachain/lib/signal_schema.py,sha256=hD56hyO1H3A5H2oyTUwPcNu6UOQ_XY0DeA0nrXBqFaU,11492
60
- datachain/lib/text.py,sha256=PUT1O0jNJoQGsuhff2LgDpzTWk2eMdwIKqEDBrE448M,1307
61
- datachain/lib/udf.py,sha256=axMvqYz4tdyg_C3nyuOcDsu3Aqr19jWv2vl54U_8LQM,6595
62
- datachain/lib/udf_signature.py,sha256=CUKgoVpM_N8CgvMncpAw2RYchoiJdAGdDSdluoP0hIk,7161
56
+ datachain/lib/signal_schema.py,sha256=xzVHauGrhGcS5aOE1UMqT5YjJeZIMAZYQq76tZShhnY,13550
57
+ datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
58
+ datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
59
+ datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
63
60
  datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
64
61
  datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
65
62
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
- datachain/lib/webdataset.py,sha256=GWB_pocfRZGoU4Lhd7Wh3hx2Rnm_fJWXX4S_zXJIEmk,8286
67
- datachain/lib/webdataset_laion.py,sha256=HAtSCbVvEQqzKkoRamRxDKaQALSB3QmJRU2yWRFNxwY,2147
63
+ datachain/lib/webdataset.py,sha256=eqIDSqfBOhEK43JMp-6lYdYy2x3Ge5lwqR-hKGV8aG0,8259
64
+ datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
65
+ datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
+ datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
67
+ datachain/lib/convert/type_converter.py,sha256=W-wvCIcb6OwWjRJ3EWJE4-LbpoqxsRBd6gYNpFlm8qo,2643
68
+ datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
69
+ datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0QEckwXlKgFbLA,3088
68
70
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
69
- datachain/query/batch.py,sha256=sOMxXbaNii7lVyFIEZ2noqbhy_S8qtZ-WWxrka72shc,3474
71
+ datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
70
72
  datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
71
- datachain/query/dataset.py,sha256=vpu2wQYC5uWc-LdZrNV-PV7xQapbYCtqyrXiiIa77DI,64982
72
- datachain/query/dispatch.py,sha256=ZeL5dga5d4cJDBftK7gAQ_mx4C7zq6t3z0Hdt7mcZYY,13094
73
+ datachain/query/dataset.py,sha256=Pmaz16phEummJpWJD3x-8SMMbCb6xcOtWTyMdsFOdOE,64414
74
+ datachain/query/dispatch.py,sha256=Qv5QpP5-K9JAmZLntifRzS5_AYHbK82Ahreo302Ntq8,13218
73
75
  datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
74
76
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
75
77
  datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
76
78
  datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
77
- datachain/query/udf.py,sha256=gnLDM7LKH8_bbdDeVHnlDKaBdbWc_NAbwvYCc4i-OlU,7101
79
+ datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
78
80
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
79
- datachain/remote/studio.py,sha256=bZb85WjtqMNFBoRuPbH-TEGpAyz0afROR7E9UgIef_Y,7438
81
+ datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
80
82
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
81
83
  datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
82
84
  datachain/sql/types.py,sha256=BzUm0nCcMPASvdqpQouX5bdVcK3G3DBfeeNhau7X_hA,10234
@@ -94,9 +96,10 @@ datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,
94
96
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
95
97
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
96
98
  datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
97
- datachain-0.2.8.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
98
- datachain-0.2.8.dist-info/METADATA,sha256=5NdR22k9zEgk8HrWm81W6nwm23g0se3SDs-y9CxmPU4,16475
99
- datachain-0.2.8.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
100
- datachain-0.2.8.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
101
- datachain-0.2.8.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
102
- datachain-0.2.8.dist-info/RECORD,,
99
+ datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
100
+ datachain-0.2.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
+ datachain-0.2.10.dist-info/METADATA,sha256=bWvqTD9c2joLmkDGpdcutjjF_s1LpccbSCLbkIaKQYQ,16732
102
+ datachain-0.2.10.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
103
+ datachain-0.2.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
+ datachain-0.2.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
+ datachain-0.2.10.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- from datachain.lib.image import ImageFile, convert_images
2
-
3
- __all__ = ["ImageFile", "convert_images"]
@@ -1,38 +0,0 @@
1
- from abc import ABC
2
- from contextlib import AbstractContextManager
3
-
4
- from datachain.cache import UniqueId
5
-
6
-
7
- class AbstractCachedStream(AbstractContextManager, ABC):
8
- def __init__(self, catalog, uid: UniqueId):
9
- self.catalog = catalog
10
- self.uid = uid
11
- self.mode = "rb"
12
-
13
- def set_mode(self, mode):
14
- self.mode = mode
15
-
16
-
17
- class PreCachedStream(AbstractCachedStream):
18
- def __init__(self, catalog, uid: UniqueId):
19
- super().__init__(catalog, uid)
20
- self.client = self.catalog.get_client(self.uid.storage)
21
- self.cached_file = None
22
-
23
- def get_path_in_cache(self):
24
- return self.catalog.cache.path_from_checksum(self.uid.get_hash())
25
-
26
- def __enter__(self):
27
- self.client.download(self.uid)
28
- self.cached_file = open(self.get_path_in_cache(), self.mode)
29
- return self.cached_file
30
-
31
- def __exit__(self, *args):
32
- self.cached_file.close()
33
-
34
-
35
- class PreDownloadStream(PreCachedStream):
36
- def __exit__(self, *args):
37
- super().__exit__(*args)
38
- self.catalog.cache.remove(self.uid)
datachain/lib/claude.py DELETED
@@ -1,69 +0,0 @@
1
- import os
2
- from typing import Callable, Literal, Optional
3
-
4
- import anthropic
5
-
6
- from datachain.lib.feature import Feature
7
- from datachain.lib.file import File
8
-
9
- default_model_name = "claude-3-haiku-20240307"
10
- DEFAULT_OUTPUT_TOKENS = 1024
11
-
12
- # This classes can be auto-generated:
13
- # >> from anthropic.types.message import Message
14
- # >> ClaudeMessage = pydantic_to_feature(Message)
15
- # However, auto-generated pydentic classes do not work in multithreading mode.
16
-
17
-
18
- class UsageFr(Feature):
19
- input_tokens: int = 0
20
- output_tokens: int = 0
21
-
22
-
23
- class TextBlockFr(Feature):
24
- text: str = ""
25
- type: str = "text"
26
-
27
-
28
- class ClaudeMessage(Feature):
29
- id: str = ""
30
- content: list[TextBlockFr]
31
- model: str = ""
32
- role: str = ""
33
- stop_reason: Optional[Literal["end_turn", "max_tokens", "stop_sequence"]] = None
34
- stop_sequence: Optional[str] = None
35
- type: Literal["message"] = "message"
36
- usage: UsageFr = UsageFr()
37
-
38
-
39
- def claude_processor(
40
- prompt: str,
41
- messages: Optional[list] = None,
42
- model: str = "claude-3-haiku-20240307",
43
- api_key: Optional[str] = "",
44
- max_retries: int = 5,
45
- temperature: float = 0.9,
46
- max_tokens: int = 1024,
47
- **kwargs,
48
- ) -> Callable:
49
- if not messages:
50
- messages = []
51
- api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
52
-
53
- def claude_func(file) -> ClaudeMessage:
54
- try:
55
- data = file.get_value() if isinstance(file, File) else file
56
- client = anthropic.Anthropic(api_key=api_key, max_retries=max_retries)
57
- response = client.messages.create(
58
- model=model,
59
- system=prompt,
60
- messages=[{"role": "user", "content": data}, *messages],
61
- temperature=temperature,
62
- max_tokens=max_tokens,
63
- **kwargs,
64
- )
65
- return ClaudeMessage(**response.model_dump())
66
- except Exception: # noqa: BLE001
67
- return ClaudeMessage(content=[])
68
-
69
- return claude_func
datachain/lib/feature.py DELETED
@@ -1,412 +0,0 @@
1
- import copy
2
- import inspect
3
- import re
4
- import warnings
5
- from collections.abc import Iterable, Sequence
6
- from datetime import datetime
7
- from enum import Enum
8
- from functools import lru_cache
9
- from types import GenericAlias
10
- from typing import (
11
- TYPE_CHECKING,
12
- Any,
13
- ClassVar,
14
- Literal,
15
- Union,
16
- get_args,
17
- get_origin,
18
- )
19
-
20
- import attrs
21
- import numpy as np
22
- import pandas as pd
23
- from pydantic import BaseModel
24
- from typing_extensions import Literal as LiteralEx
25
-
26
- from datachain.lib.feature_registry import Registry
27
- from datachain.query import C
28
- from datachain.query.schema import DEFAULT_DELIMITER
29
- from datachain.sql.types import (
30
- JSON,
31
- Array,
32
- Binary,
33
- Boolean,
34
- DateTime,
35
- Float,
36
- Int,
37
- Int32,
38
- Int64,
39
- NullType,
40
- SQLType,
41
- String,
42
- )
43
-
44
- if TYPE_CHECKING:
45
- from datachain.catalog import Catalog
46
-
47
- FeatureStandardType = Union[
48
- type[int],
49
- type[str],
50
- type[float],
51
- type[bool],
52
- type[list],
53
- type[dict],
54
- type[bytes],
55
- type[datetime],
56
- ]
57
-
58
- FeatureType = Union[type["Feature"], FeatureStandardType]
59
- FeatureTypeNames = "Feature, int, str, float, bool, list, dict, bytes, datetime"
60
-
61
-
62
- TYPE_TO_DATACHAIN = {
63
- int: Int64,
64
- str: String,
65
- Literal: String,
66
- LiteralEx: String,
67
- Enum: String,
68
- float: Float,
69
- bool: Boolean,
70
- datetime: DateTime, # Note, list of datetime is not supported yet
71
- bytes: Binary, # Note, list of bytes is not supported yet
72
- list: Array,
73
- dict: JSON,
74
- }
75
-
76
- DATACHAIN_TO_TYPE = {
77
- Int: int,
78
- Int32: int,
79
- Int64: int,
80
- String: str,
81
- Float: float,
82
- Boolean: bool,
83
- DateTime: datetime,
84
- Binary: bytes,
85
- Array(NullType): list,
86
- JSON: dict,
87
- }
88
-
89
-
90
- NUMPY_TO_DATACHAIN = {
91
- np.dtype("int8"): Int,
92
- np.dtype("int16"): Int,
93
- np.dtype("int32"): Int,
94
- np.dtype("int64"): Int,
95
- np.dtype("uint8"): Int,
96
- np.dtype("uint16"): Int,
97
- np.dtype("uint32"): Int,
98
- np.dtype("uint64"): Int,
99
- np.dtype("float16"): Float,
100
- np.dtype("float32"): Float,
101
- np.dtype("float64"): Float,
102
- np.dtype("object"): String,
103
- pd.CategoricalDtype(): String,
104
- }
105
-
106
-
107
- # Disable Pydantic warning, see https://github.com/iterative/dvcx/issues/1285
108
- warnings.filterwarnings(
109
- "ignore",
110
- message="Field name .* shadows an attribute in parent",
111
- category=UserWarning,
112
- )
113
-
114
-
115
- # Optimization: Store feature classes in this lookup variable so extra checks can be
116
- # skipped within loops.
117
- feature_classes_lookup: dict[type, bool] = {}
118
-
119
-
120
- class Feature(BaseModel):
121
- """A base class for defining data classes that serve as inputs and outputs for
122
- DataChain processing functions like `map()`, `gen()`, etc. Inherits from
123
- `pydantic`'s BaseModel.
124
- """
125
-
126
- _is_file: ClassVar[bool] = False
127
- _version: ClassVar[int] = 1
128
-
129
- @classmethod
130
- def _is_hidden(cls):
131
- return cls.__name__.startswith("_")
132
-
133
- def get_value(self, *args: Any, **kwargs: Any) -> Any:
134
- name = self.__class__.__name__
135
- raise NotImplementedError(f"get_value() is not defined for feature '{name}'")
136
-
137
- @classmethod
138
- def _name(cls) -> str:
139
- return f"{cls.__name__}@{cls._version}"
140
-
141
- @classmethod
142
- def __pydantic_init_subclass__(cls):
143
- Registry.add(cls)
144
- for name, field_info in cls.model_fields.items():
145
- attr_value = _resolve(cls, name, field_info, prefix=[])
146
- setattr(cls, name, RestrictedAttribute(attr_value, cls, name))
147
-
148
- @classmethod
149
- def _prefix(cls) -> str:
150
- return cls._normalize(cls.__name__)
151
-
152
- @classmethod
153
- def _normalize(cls, name: str) -> str:
154
- if DEFAULT_DELIMITER in name:
155
- raise RuntimeError(
156
- f"variable '{name}' cannot be used "
157
- f"because it contains {DEFAULT_DELIMITER}"
158
- )
159
- return Feature._to_snake_case(name)
160
-
161
- @staticmethod
162
- def _to_snake_case(name: str) -> str:
163
- """Convert a CamelCase name to snake_case."""
164
- s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
165
- return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
166
-
167
- def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
168
- pass
169
-
170
- @classmethod
171
- def get_file_signals(cls, path: list[str]) -> Iterable[list[str]]:
172
- if cls._is_file:
173
- yield path
174
-
175
- for name, f_info in cls.model_fields.items():
176
- anno = f_info.annotation
177
- if Feature.is_feature(anno):
178
- yield from anno.get_file_signals([*path, name]) # type: ignore[union-attr]
179
-
180
- @classmethod
181
- def is_feature(cls, anno) -> bool:
182
- if anno in feature_classes_lookup:
183
- # Optimization: Skip expensive subclass checks if already checked.
184
- return feature_classes_lookup[anno]
185
- is_class = inspect.isclass(anno)
186
- result = (
187
- is_class
188
- and not isinstance(anno, GenericAlias)
189
- and issubclass(anno, Feature)
190
- )
191
- if is_class:
192
- # Only cache types in the feature classes lookup dict (not instances).
193
- feature_classes_lookup[anno] = result
194
- return result
195
-
196
- @classmethod
197
- def is_standard_type(cls, t: type) -> bool:
198
- return any(
199
- t is ft or t is get_args(ft)[0] for ft in get_args(FeatureStandardType)
200
- )
201
-
202
- @classmethod
203
- def is_feature_type(cls, t: type) -> bool:
204
- if cls.is_standard_type(t):
205
- return True
206
- if get_origin(t) is list and len(get_args(t)) == 1:
207
- return cls.is_feature_type(get_args(t)[0])
208
- return cls.is_feature(t)
209
-
210
- def _flatten_fields_values(self, fields, model):
211
- for name, f_info in fields.items():
212
- anno = f_info.annotation
213
- # Optimization: Access attributes directly to skip the model_dump() call.
214
- value = getattr(model, name)
215
-
216
- if isinstance(value, list):
217
- yield [
218
- val.model_dump() if Feature.is_feature(type(val)) else val
219
- for val in value
220
- ]
221
- elif isinstance(value, dict):
222
- yield {
223
- key: val.model_dump() if Feature.is_feature(type(val)) else val
224
- for key, val in value.items()
225
- }
226
- elif Feature.is_feature(anno):
227
- yield from self._flatten_fields_values(anno.model_fields, value)
228
- else:
229
- yield value
230
-
231
- def _flatten(self):
232
- return tuple(self._flatten_fields_values(self.model_fields, self))
233
-
234
- @staticmethod
235
- def _flatten_list(objs):
236
- return tuple(
237
- val
238
- for obj in objs
239
- for val in obj._flatten_fields_values(obj.model_fields, obj)
240
- )
241
-
242
- @classmethod
243
- def _unflatten_with_path(cls, dump, name_path: list[str]):
244
- res = {}
245
- for name, f_info in cls.model_fields.items():
246
- anno = f_info.annotation
247
- name_norm = cls._normalize(name)
248
- lst = copy.copy(name_path)
249
-
250
- if inspect.isclass(anno) and issubclass(anno, Feature):
251
- lst.append(name_norm)
252
- val = anno._unflatten_with_path(dump, lst)
253
- res[name] = val
254
- else:
255
- lst.append(name_norm)
256
- curr_path = DEFAULT_DELIMITER.join(lst)
257
- res[name] = dump[curr_path]
258
- return cls(**res)
259
-
260
- @classmethod
261
- def _unflatten(cls, dump):
262
- return cls._unflatten_with_path(dump, [])
263
-
264
- @classmethod
265
- def _unflatten_to_json(cls, row: Sequence[Any], pos=0) -> dict:
266
- return cls._unflatten_to_json_pos(row, pos)[0]
267
-
268
- @classmethod
269
- def _unflatten_to_json_pos(cls, row: Sequence[Any], pos=0) -> tuple[dict, int]:
270
- res = {}
271
- for name, f_info in cls.model_fields.items():
272
- anno = f_info.annotation
273
- origin = get_origin(anno)
274
- if (
275
- origin not in (list, dict)
276
- and inspect.isclass(anno)
277
- and issubclass(anno, Feature)
278
- ):
279
- res[name], pos = anno._unflatten_to_json_pos(row, pos)
280
- else:
281
- res[name] = row[pos]
282
- pos += 1
283
- return res, pos
284
-
285
- @classmethod
286
- @lru_cache(maxsize=1000)
287
- def build_tree(cls):
288
- res = {}
289
-
290
- for name, f_info in cls.model_fields.items():
291
- anno = f_info.annotation
292
- subtree = anno.build_tree() if Feature.is_feature(anno) else None
293
- res[name] = (anno, subtree)
294
-
295
- return res
296
-
297
-
298
- class RestrictedAttribute:
299
- """Descriptor implementing an attribute that can only be accessed through
300
- the defining class and not from subclasses or instances.
301
-
302
- Since it is a non-data descriptor, instance dicts have precedence over it.
303
- Cannot be used with slotted classes.
304
- """
305
-
306
- def __init__(self, value, cls=None, name=None):
307
- self.cls = cls
308
- self.value = value
309
- self.name = name
310
-
311
- def __get__(self, instance, owner):
312
- if owner is not self.cls:
313
- raise AttributeError(
314
- f"'{type(owner).__name__}' object has no attribute '{self.name}'"
315
- )
316
- if instance is not None:
317
- raise RuntimeError(
318
- f"Invalid attempt to access class attribute '{self.name}' through "
319
- f"'{type(owner).__name__}' instance"
320
- )
321
- return self.value
322
-
323
- def __set_name__(self, cls, name):
324
- self.cls = cls
325
- self.name = name
326
-
327
-
328
- @attrs.define
329
- class FeatureAttributeWrapper:
330
- cls: type[Feature]
331
- prefix: list[str]
332
-
333
- @property
334
- def name(self) -> str:
335
- return DEFAULT_DELIMITER.join(self.prefix)
336
-
337
- def __getattr__(self, name):
338
- field_info = self.cls.model_fields.get(name)
339
- if field_info:
340
- return _resolve(self.cls, name, field_info, prefix=self.prefix)
341
- raise AttributeError(
342
- f"'{type(self).__name__}' object has no attribute '{name}'"
343
- )
344
-
345
-
346
- def _resolve(cls, name, field_info, prefix: list[str]):
347
- """Resolve feature attributes so they can be used in select(), join()
348
- and similar functions.
349
-
350
- Users just use `MyClass.sub_attr1.sub_attr2.field` and it will return a DB column
351
- with a proper name (with default naming - `my_class__sub_attr1__sub_attr2__field`).
352
- """
353
- anno = field_info.annotation
354
- norm_name = cls._normalize(name)
355
-
356
- if not cls.is_feature(anno):
357
- try:
358
- anno_sql_class = convert_type_to_datachain(anno)
359
- except TypeError:
360
- anno_sql_class = NullType
361
- new_prefix = copy.copy(prefix)
362
- new_prefix.append(norm_name)
363
- return C(DEFAULT_DELIMITER.join(new_prefix), anno_sql_class)
364
-
365
- return FeatureAttributeWrapper(anno, [*prefix, norm_name])
366
-
367
-
368
- def convert_type_to_datachain(typ): # noqa: PLR0911
369
- if inspect.isclass(typ):
370
- if issubclass(typ, SQLType):
371
- return typ
372
- if issubclass(typ, Enum):
373
- return str
374
-
375
- res = TYPE_TO_DATACHAIN.get(typ)
376
- if res:
377
- return res
378
-
379
- orig = get_origin(typ)
380
-
381
- if orig in (Literal, LiteralEx):
382
- return String
383
-
384
- args = get_args(typ)
385
- if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
386
- if args is None or len(args) != 1:
387
- raise TypeError(f"Cannot resolve type '{typ}' for flattening features")
388
-
389
- args0 = args[0]
390
- if Feature.is_feature(args0):
391
- return Array(JSON())
392
-
393
- next_type = convert_type_to_datachain(args0)
394
- return Array(next_type)
395
-
396
- if inspect.isclass(orig) and issubclass(dict, orig):
397
- return JSON
398
-
399
- if orig == Union and len(args) == 2 and (type(None) in args):
400
- return convert_type_to_datachain(args[0])
401
-
402
- # Special case for list in JSON: Union[dict, list[dict]]
403
- if orig == Union and len(args) >= 2:
404
- args_no_nones = [arg for arg in args if arg != type(None)]
405
- if len(args_no_nones) == 2:
406
- args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
407
- if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
408
- arg = get_args(args_no_dicts[0])
409
- if len(arg) == 1 and arg[0] is dict:
410
- return JSON
411
-
412
- raise TypeError(f"Cannot recognize type {typ}")