datachain 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +2 -92
- datachain/cli.py +0 -37
- datachain/lib/arrow.py +5 -5
- datachain/lib/clip.py +14 -3
- datachain/lib/convert/python_to_sql.py +9 -0
- datachain/lib/data_model.py +10 -1
- datachain/lib/dc.py +135 -39
- datachain/lib/hf.py +166 -0
- datachain/lib/image.py +9 -1
- datachain/lib/pytorch.py +1 -2
- datachain/lib/signal_schema.py +124 -20
- datachain/lib/text.py +4 -0
- datachain/lib/udf.py +14 -20
- datachain/lib/webdataset.py +1 -1
- datachain/query/dataset.py +24 -9
- datachain/query/session.py +5 -3
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/METADATA +19 -15
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/RECORD +22 -21
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/WHEEL +1 -1
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/LICENSE +0 -0
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.7
|
|
3
|
+
Version: 0.3.9
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -41,10 +41,11 @@ Requires-Dist: jmespath >=1.0
|
|
|
41
41
|
Requires-Dist: datamodel-code-generator >=0.25
|
|
42
42
|
Requires-Dist: Pillow <11,>=10.0.0
|
|
43
43
|
Requires-Dist: msgpack <2,>=1.0.4
|
|
44
|
+
Requires-Dist: psutil
|
|
44
45
|
Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
|
|
45
46
|
Provides-Extra: dev
|
|
46
47
|
Requires-Dist: datachain[docs,tests] ; extra == 'dev'
|
|
47
|
-
Requires-Dist: mypy ==1.11.
|
|
48
|
+
Requires-Dist: mypy ==1.11.2 ; extra == 'dev'
|
|
48
49
|
Requires-Dist: types-python-dateutil ; extra == 'dev'
|
|
49
50
|
Requires-Dist: types-pytz ; extra == 'dev'
|
|
50
51
|
Requires-Dist: types-PyYAML ; extra == 'dev'
|
|
@@ -64,11 +65,14 @@ Requires-Dist: accelerate ; extra == 'examples'
|
|
|
64
65
|
Requires-Dist: unstructured[pdf] ; extra == 'examples'
|
|
65
66
|
Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
|
|
66
67
|
Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
|
|
68
|
+
Provides-Extra: hf
|
|
69
|
+
Requires-Dist: numba >=0.60.0 ; extra == 'hf'
|
|
70
|
+
Requires-Dist: datasets[audio,vision] ; extra == 'hf'
|
|
67
71
|
Provides-Extra: remote
|
|
68
72
|
Requires-Dist: lz4 ; extra == 'remote'
|
|
69
73
|
Requires-Dist: requests >=2.22.0 ; extra == 'remote'
|
|
70
74
|
Provides-Extra: tests
|
|
71
|
-
Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
|
|
75
|
+
Requires-Dist: datachain[hf,remote,torch,vector] ; extra == 'tests'
|
|
72
76
|
Requires-Dist: pytest <9,>=8 ; extra == 'tests'
|
|
73
77
|
Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
|
|
74
78
|
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
|
|
@@ -83,6 +87,7 @@ Requires-Dist: hypothesis ; extra == 'tests'
|
|
|
83
87
|
Requires-Dist: open-clip-torch ; extra == 'tests'
|
|
84
88
|
Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
|
|
85
89
|
Requires-Dist: requests-mock ; extra == 'tests'
|
|
90
|
+
Requires-Dist: scipy ; extra == 'tests'
|
|
86
91
|
Provides-Extra: torch
|
|
87
92
|
Requires-Dist: torch >=2.1.0 ; extra == 'torch'
|
|
88
93
|
Requires-Dist: torchvision ; extra == 'torch'
|
|
@@ -110,31 +115,30 @@ AI 🔗 DataChain
|
|
|
110
115
|
|
|
111
116
|
DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
|
|
112
117
|
It is made to organize your unstructured data into datasets and wrangle it at scale on
|
|
113
|
-
your local machine.
|
|
118
|
+
your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
|
|
114
119
|
|
|
115
120
|
Key Features
|
|
116
121
|
============
|
|
117
122
|
|
|
118
123
|
📂 **Storage as a Source of Truth.**
|
|
119
|
-
- Process unstructured data without redundant copies
|
|
124
|
+
- Process unstructured data without redundant copies from S3, GCP, Azure, and local
|
|
120
125
|
file systems.
|
|
121
|
-
- Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
|
|
122
|
-
-
|
|
126
|
+
- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
|
|
127
|
+
- Unite files and metadata together into persistent, versioned, columnar datasets.
|
|
123
128
|
|
|
124
129
|
🐍 **Python-friendly data pipelines.**
|
|
125
130
|
- Operate on Python objects and object fields.
|
|
126
|
-
- Built-in parallelization and out-of-memory compute without
|
|
127
|
-
Spark jobs.
|
|
131
|
+
- Built-in parallelization and out-of-memory compute without SQL or Spark.
|
|
128
132
|
|
|
129
133
|
🧠 **Data Enrichment and Processing.**
|
|
130
|
-
- Generate metadata
|
|
131
|
-
- Filter, join, and group by
|
|
132
|
-
- Pass datasets to Pytorch and Tensorflow, or export back into storage.
|
|
134
|
+
- Generate metadata using local AI models and LLM APIs.
|
|
135
|
+
- Filter, join, and group by metadata. Search by vector embeddings.
|
|
136
|
+
- Pass datasets to Pytorch and Tensorflow, or export them back into storage.
|
|
133
137
|
|
|
134
138
|
🚀 **Efficiency.**
|
|
135
139
|
- Parallelization, out-of-memory workloads and data caching.
|
|
136
140
|
- Vectorized operations on Python object fields: sum, count, avg, etc.
|
|
137
|
-
-
|
|
141
|
+
- Optimized vector search.
|
|
138
142
|
|
|
139
143
|
|
|
140
144
|
Quick Start
|
|
@@ -159,7 +163,7 @@ where each image has a matching JSON file like `cat.1009.json`:
|
|
|
159
163
|
"inference": {"class": "dog", "confidence": 0.68}
|
|
160
164
|
}
|
|
161
165
|
|
|
162
|
-
Example of downloading only high-confidence cat images using JSON metadata:
|
|
166
|
+
Example of downloading only "high-confidence cat" inferred images using JSON metadata:
|
|
163
167
|
|
|
164
168
|
|
|
165
169
|
.. code:: py
|
|
@@ -229,7 +233,7 @@ detected are then copied to the local directory.
|
|
|
229
233
|
LLM judging chatbots
|
|
230
234
|
=============================
|
|
231
235
|
|
|
232
|
-
LLMs can work as
|
|
236
|
+
LLMs can work as universal classifiers. In the example below,
|
|
233
237
|
we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
|
|
234
238
|
Mistral API key at https://console.mistral.ai
|
|
235
239
|
|
|
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
|
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=biF8M8fQujtj5xs0VLi8S16eBtzG6kceWlO_NILbCsg,8197
|
|
4
4
|
datachain/cache.py,sha256=wznC2pge6RhlPTaJfBVGjmBc6bxWCPThu4aTFMltvFU,4076
|
|
5
|
-
datachain/cli.py,sha256=
|
|
5
|
+
datachain/cli.py,sha256=otR2eN0JL-JhZ9SOTPcPwt_-_TiT-vHifx2h4YzD6Tg,32052
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
8
|
datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
|
|
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
17
17
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
18
18
|
datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
|
|
19
19
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
20
|
-
datachain/catalog/catalog.py,sha256=
|
|
20
|
+
datachain/catalog/catalog.py,sha256=kGpp9IEyr1YS7QFWjLYprRT1gp7freyt-WLaLNzqUZg,77859
|
|
21
21
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
22
22
|
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
|
|
23
23
|
datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
|
|
@@ -38,42 +38,43 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
|
|
|
38
38
|
datachain/data_storage/sqlite.py,sha256=jLgkvikYkENQUO_ykoNFfsBc2ofZXwFHLMa1nyWP3aw,28316
|
|
39
39
|
datachain/data_storage/warehouse.py,sha256=cvlfa-nyIxqrrpSRtCdeVjlTwhn7rcIoWjOq91HhItU,33668
|
|
40
40
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
-
datachain/lib/arrow.py,sha256=
|
|
42
|
-
datachain/lib/clip.py,sha256=
|
|
43
|
-
datachain/lib/data_model.py,sha256=
|
|
41
|
+
datachain/lib/arrow.py,sha256=17-jHLdYhsSdO5kfKWpBS5OAWbMjNi5r8ao0zGXUBoA,4941
|
|
42
|
+
datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
|
|
43
|
+
datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
|
|
44
44
|
datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
|
|
45
|
-
datachain/lib/dc.py,sha256=
|
|
45
|
+
datachain/lib/dc.py,sha256=tY_ccOsv9njsXF23cwoZ7tSTCDKCfakyRvsIBLKE0SE,63976
|
|
46
46
|
datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
|
|
47
|
-
datachain/lib/
|
|
47
|
+
datachain/lib/hf.py,sha256=mYaHFPS4CW2-stRZHBMWW-NKN4dhrnhjZobBgRocnvo,5317
|
|
48
|
+
datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
|
|
48
49
|
datachain/lib/listing.py,sha256=nXLmGae_oQke4hnurzzWiHTEjHjWiqqHdB41Wb-hMTk,3521
|
|
49
50
|
datachain/lib/meta_formats.py,sha256=Hels85LJmNCz1aYVJvhymNdAt3qdJ2-qoxsIiUezrow,7198
|
|
50
51
|
datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
|
|
51
|
-
datachain/lib/pytorch.py,sha256=
|
|
52
|
+
datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
|
|
52
53
|
datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
|
|
53
|
-
datachain/lib/signal_schema.py,sha256=
|
|
54
|
-
datachain/lib/text.py,sha256=
|
|
55
|
-
datachain/lib/udf.py,sha256=
|
|
54
|
+
datachain/lib/signal_schema.py,sha256=rW1R6nIzdtmqWzpXk7aNAfrQD58_gbvkvEGyNTQ4WNM,20099
|
|
55
|
+
datachain/lib/text.py,sha256=vqs1SQdsw1vCzfvOanIeT4xY2R2TmPonElBgYDVeZmY,1241
|
|
56
|
+
datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
|
|
56
57
|
datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
|
|
57
58
|
datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
|
|
58
59
|
datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
59
|
-
datachain/lib/webdataset.py,sha256=
|
|
60
|
+
datachain/lib/webdataset.py,sha256=Q3UlCk66341sq-nvFbBCX4Cv3cYXBK9n12ejG4axPXE,8298
|
|
60
61
|
datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
|
|
61
62
|
datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
63
|
datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
|
|
63
|
-
datachain/lib/convert/python_to_sql.py,sha256=
|
|
64
|
+
datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
|
|
64
65
|
datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
|
|
65
66
|
datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
|
|
66
67
|
datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
|
|
67
68
|
datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
|
|
68
69
|
datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
|
|
69
70
|
datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
|
|
70
|
-
datachain/query/dataset.py,sha256=
|
|
71
|
+
datachain/query/dataset.py,sha256=mHqSyovJlCQ7pKVMQKKKCiTJs3bP1GDXLKpOSpzVxx8,61378
|
|
71
72
|
datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
|
|
72
73
|
datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
|
|
73
74
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
74
75
|
datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
|
|
75
76
|
datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
|
|
76
|
-
datachain/query/session.py,sha256=
|
|
77
|
+
datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
|
|
77
78
|
datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
|
|
78
79
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
79
80
|
datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
|
|
@@ -94,9 +95,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
|
|
|
94
95
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
95
96
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
96
97
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
97
|
-
datachain-0.3.
|
|
98
|
-
datachain-0.3.
|
|
99
|
-
datachain-0.3.
|
|
100
|
-
datachain-0.3.
|
|
101
|
-
datachain-0.3.
|
|
102
|
-
datachain-0.3.
|
|
98
|
+
datachain-0.3.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
99
|
+
datachain-0.3.9.dist-info/METADATA,sha256=r5uNlVdal7YrsX7nYE56c_Ak8YZIgXqCiSwNJF5KjlY,17015
|
|
100
|
+
datachain-0.3.9.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
|
|
101
|
+
datachain-0.3.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
102
|
+
datachain-0.3.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
103
|
+
datachain-0.3.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|