datachain 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (49) hide show
  1. datachain/__init__.py +3 -4
  2. datachain/cache.py +10 -4
  3. datachain/catalog/catalog.py +35 -15
  4. datachain/cli.py +37 -32
  5. datachain/data_storage/metastore.py +24 -0
  6. datachain/data_storage/warehouse.py +3 -1
  7. datachain/job.py +56 -0
  8. datachain/lib/arrow.py +19 -7
  9. datachain/lib/clip.py +89 -66
  10. datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
  11. datachain/lib/convert/sql_to_python.py +23 -0
  12. datachain/lib/convert/values_to_tuples.py +51 -33
  13. datachain/lib/data_model.py +6 -27
  14. datachain/lib/dataset_info.py +70 -0
  15. datachain/lib/dc.py +646 -152
  16. datachain/lib/file.py +117 -15
  17. datachain/lib/image.py +1 -1
  18. datachain/lib/meta_formats.py +14 -2
  19. datachain/lib/model_store.py +3 -2
  20. datachain/lib/pytorch.py +10 -7
  21. datachain/lib/signal_schema.py +39 -14
  22. datachain/lib/text.py +2 -1
  23. datachain/lib/udf.py +56 -5
  24. datachain/lib/udf_signature.py +1 -1
  25. datachain/lib/webdataset.py +4 -3
  26. datachain/node.py +11 -8
  27. datachain/query/dataset.py +66 -147
  28. datachain/query/dispatch.py +15 -13
  29. datachain/query/schema.py +2 -0
  30. datachain/query/session.py +4 -4
  31. datachain/sql/functions/array.py +12 -0
  32. datachain/sql/functions/string.py +8 -0
  33. datachain/torch/__init__.py +1 -1
  34. datachain/utils.py +45 -0
  35. datachain-0.2.12.dist-info/METADATA +412 -0
  36. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/RECORD +40 -45
  37. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
  38. datachain/lib/feature_registry.py +0 -77
  39. datachain/lib/gpt4_vision.py +0 -97
  40. datachain/lib/hf_image_to_text.py +0 -97
  41. datachain/lib/hf_pipeline.py +0 -90
  42. datachain/lib/image_transform.py +0 -103
  43. datachain/lib/iptc_exif_xmp.py +0 -76
  44. datachain/lib/unstructured.py +0 -41
  45. datachain/text/__init__.py +0 -3
  46. datachain-0.2.10.dist-info/METADATA +0 -430
  47. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
  48. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
  49. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,412 @@
1
+ Metadata-Version: 2.1
2
+ Name: datachain
3
+ Version: 0.2.12
4
+ Summary: Wrangle unstructured AI data at scale
5
+ Author-email: Dmitry Petrov <support@dvc.org>
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://datachain.dvc.ai
8
+ Project-URL: Issues, https://github.com/iterative/datachain/issues
9
+ Project-URL: Source, https://github.com/iterative/datachain
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Development Status :: 2 - Pre-Alpha
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/x-rst
18
+ License-File: LICENSE
19
+ Requires-Dist: pyyaml
20
+ Requires-Dist: tomlkit
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: numpy
23
+ Requires-Dist: pandas >=2.0.0
24
+ Requires-Dist: pyarrow
25
+ Requires-Dist: typing-extensions
26
+ Requires-Dist: python-dateutil >=2
27
+ Requires-Dist: attrs >=21.3.0
28
+ Requires-Dist: s3fs >=2024.2.0
29
+ Requires-Dist: gcsfs >=2024.2.0
30
+ Requires-Dist: adlfs >=2024.2.0
31
+ Requires-Dist: dvc-data <4,>=3.10
32
+ Requires-Dist: dvc-objects <6,>=4
33
+ Requires-Dist: shtab <2,>=1.3.4
34
+ Requires-Dist: sqlalchemy >=2
35
+ Requires-Dist: multiprocess ==0.70.16
36
+ Requires-Dist: dill ==0.3.8
37
+ Requires-Dist: cloudpickle
38
+ Requires-Dist: ujson >=5.9.0
39
+ Requires-Dist: pydantic <3,>=2
40
+ Requires-Dist: jmespath >=1.0
41
+ Requires-Dist: datamodel-code-generator >=0.25
42
+ Requires-Dist: Pillow <11,>=10.0.0
43
+ Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
44
+ Provides-Extra: dev
45
+ Requires-Dist: datachain[docs,tests] ; extra == 'dev'
46
+ Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
47
+ Requires-Dist: types-python-dateutil ; extra == 'dev'
48
+ Requires-Dist: types-PyYAML ; extra == 'dev'
49
+ Requires-Dist: types-requests ; extra == 'dev'
50
+ Requires-Dist: types-ujson ; extra == 'dev'
51
+ Provides-Extra: docs
52
+ Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
53
+ Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
54
+ Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
55
+ Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
56
+ Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
57
+ Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
58
+ Provides-Extra: remote
59
+ Requires-Dist: lz4 ; extra == 'remote'
60
+ Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
61
+ Requires-Dist: requests >=2.22.0 ; extra == 'remote'
62
+ Provides-Extra: tests
63
+ Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
64
+ Requires-Dist: pytest <9,>=8 ; extra == 'tests'
65
+ Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
66
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
67
+ Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
68
+ Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
69
+ Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
70
+ Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
71
+ Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
72
+ Requires-Dist: virtualenv ; extra == 'tests'
73
+ Requires-Dist: dulwich ; extra == 'tests'
74
+ Requires-Dist: hypothesis ; extra == 'tests'
75
+ Requires-Dist: open-clip-torch ; extra == 'tests'
76
+ Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
77
+ Requires-Dist: requests-mock ; extra == 'tests'
78
+ Provides-Extra: torch
79
+ Requires-Dist: torch >=2.1.0 ; extra == 'torch'
80
+ Requires-Dist: torchvision ; extra == 'torch'
81
+ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
82
+ Provides-Extra: vector
83
+ Requires-Dist: usearch ; extra == 'vector'
84
+
85
+ |PyPI| |Python Version| |Codecov| |Tests|
86
+
87
+ .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
88
+ :target: https://pypi.org/project/datachain/
89
+ :alt: PyPI
90
+ .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
91
+ :target: https://pypi.org/project/datachain
92
+ :alt: Python Version
93
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
94
+ :target: https://codecov.io/gh/iterative/datachain
95
+ :alt: Codecov
96
+ .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
97
+ :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
98
+ :alt: Tests
99
+
100
+ AI 🔗 DataChain
101
+ ----------------
102
+
103
+ DataChain is an open-source Python library for processing and curating unstructured
104
+ data at scale.
105
+
106
+ 🤖 AI-Driven Data Curation: Use local ML models and LLM API calls to enrich your data.
107
+
108
+ 🚀 GenAI Dataset scale: Handle tens of millions of files or file snippets.
109
+
110
+ 🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
111
+
112
+
113
+ To ensure efficiency, DataChain supports parallel processing, parallel data
114
+ downloads, and out-of-memory computing. It excels at optimizing batch operations.
115
+ While most GenAI tools focus on online, real-time applications, DataChain is designed
116
+ for offline data processing, data curation and ETL.
117
+
118
+ The typical use cases are Computer Vision data curation, LLM analytics
119
+ and validation.
120
+
121
+
122
+ .. code:: console
123
+
124
+ $ pip install datachain
125
+
126
+ |Flowchart|
127
+
128
+ Quick Start
129
+ -----------
130
+
131
+ Basic evaluation
132
+ ================
133
+
134
+ We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
135
+ - 50 files total in the example.
136
+ These dialogs involve users looking for better wireless plans chatting with a bot.
137
+ Our goal is to identify successful dialogs.
138
+
139
+ The data used in the examples is publicly available. Please feel free to run this code.
140
+
141
+ First, we'll use a simple sentiment analysis model. Please install transformers.
142
+
143
+ .. code:: shell
144
+
145
+ pip install transformers
146
+
147
+ The code below downloads files from the cloud and applies the function
148
+ `is_positive_dialogue_ending()` to each. All files with a positive sentiment
149
+ are copied to local directory `output/`.
150
+
151
+ .. code:: py
152
+
153
+ from transformers import pipeline
154
+ from datachain import DataChain, Column
155
+
156
+ classifier = pipeline("sentiment-analysis", device="cpu",
157
+ model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
158
+
159
+ def is_positive_dialogue_ending(file) -> bool:
160
+ dialogue_ending = file.read()[-512:]
161
+ return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
162
+
163
+ chain = (
164
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
165
+ object_name="file", type="text")
166
+ .settings(parallel=8, cache=True)
167
+ .map(is_positive=is_positive_dialogue_ending)
168
+ .save("file_response")
169
+ )
170
+
171
+ positive_chain = chain.filter(Column("is_positive") == True)
172
+ positive_chain.export_files("./output")
173
+
174
+ print(f"{positive_chain.count()} files were exported")
175
+
176
+
177
+
178
+ 13 files were exported
179
+
180
+ .. code:: shell
181
+
182
+ $ ls output/datachain-demo/chatbot-KiT/
183
+ 15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
184
+ $ ls output/datachain-demo/chatbot-KiT/ | wc -l
185
+ 13
186
+
187
+
188
+ LLM judging LLMs dialogs
189
+ ==========================
190
+
191
+ Finding good dialogs using an LLM can be more efficient. In this example,
192
+ we use Mistral with a free API. Please install the package and get a free
193
+ Mistral API key at https://console.mistral.ai
194
+
195
+ .. code:: shell
196
+
197
+ $ pip install mistralai
198
+ $ export MISTRAL_API_KEY=_your_key_
199
+
200
+ Below is a similar code example, but this time using an LLM to evaluate the dialogs.
201
+ Note that only 4 threads were used in this example (`parallel=4`) due to a limitation of
202
+ the free LLM service.
203
+
204
+ .. code:: py
205
+
206
+ from mistralai.client import MistralClient
207
+ from mistralai.models.chat_completion import ChatMessage
208
+ from datachain import File, DataChain, Column
209
+
210
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
211
+
212
+ def eval_dialogue(file: File) -> bool:
213
+ client = MistralClient()
214
+ response = client.chat(
215
+ model="open-mixtral-8x22b",
216
+ messages=[ChatMessage(role="system", content=PROMPT),
217
+ ChatMessage(role="user", content=file.read())])
218
+ result = response.choices[0].message.content
219
+ return result.lower().startswith("success")
220
+
221
+ chain = (
222
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
223
+ .settings(parallel=4, cache=True)
224
+ .map(is_success=eval_dialogue)
225
+ .save("mistral_files")
226
+ )
227
+
228
+ successful_chain = chain.filter(Column("is_success") == True)
229
+ successful_chain.export_files("./output_mistral")
230
+
231
+ print(f"{successful_chain.count()} files were exported")
232
+
233
+
234
+ With the current prompt, we found 31 files considered successful dialogs:
235
+
236
+ .. code:: shell
237
+
238
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
239
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
240
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
241
+ 31
242
+
243
+
244
+
245
+ Serializing Python-objects
246
+ ==========================
247
+
248
+ LLM responses contain valuable information for analytics, such as tokens used and the
249
+ model. Preserving this information can be beneficial.
250
+
251
+ Instead of extracting this information from the Mistral data structure (class
252
+ `ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
253
+
254
+
255
+ .. code:: py
256
+
257
+ from mistralai.client import MistralClient
258
+ from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
259
+ from datachain import File, DataChain, Column
260
+
261
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
262
+
263
+ def eval_dialog(file: File) -> ChatCompletionResponse:
264
+ client = MistralClient()
265
+ return client.chat(
266
+ model="open-mixtral-8x22b",
267
+ messages=[ChatMessage(role="system", content=PROMPT),
268
+ ChatMessage(role="user", content=file.read())])
269
+
270
+ chain = (
271
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
272
+ .settings(parallel=4, cache=True)
273
+ .map(response=eval_dialog)
274
+ .map(status=lambda response: response.choices[0].message.content.lower()[:7])
275
+ .save("response")
276
+ )
277
+
278
+ chain.select("file.name", "status", "response.usage").show(5)
279
+
280
+ success_rate = chain.filter(Column("status") == "success").count() / chain.count()
281
+ print(f"{100*success_rate:.1f}% dialogs were successful")
282
+
283
+ Output:
284
+
285
+ .. code:: shell
286
+
287
+ file status response response response
288
+ name usage usage usage
289
+ prompt_tokens total_tokens completion_tokens
290
+ 0 1.txt success 547 548 1
291
+ 1 10.txt failure 3576 3578 2
292
+ 2 11.txt failure 626 628 2
293
+ 3 12.txt failure 1144 1182 38
294
+ 4 13.txt success 1100 1101 1
295
+
296
+ [Limited by 5 rows]
297
+ 64.0% dialogs were successful
298
+
299
+
300
+ Complex Python data structures
301
+ =============================================
302
+
303
+ In the previous examples, a few datasets were saved in the embedded database
304
+ (`SQLite`_ in directory `.datachain`).
305
+ These datasets are versioned, and can be accessed using
306
+ `DataChain.from_dataset("dataset_name")`.
307
+
308
+ .. code:: py
309
+
310
+ chain = DataChain.from_dataset("response")
311
+
312
+ # Iterating one-by-one: supports out-of-memory workflow
313
+ for file, response in chain.limit(5).collect("file", "response"):
314
+ # You work with Python objects
315
+ assert isinstance(response, ChatCompletionResponse)
316
+
317
+ status = response.choices[0].message.content[:7]
318
+ tokens = response.usage.total_tokens
319
+ print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
320
+
321
+ Output:
322
+
323
+ .. code:: shell
324
+
325
+ gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
326
+ gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
327
+ gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
328
+ gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
329
+ gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
330
+
331
+
332
+ Vectorized analytics over Python objects
333
+ ========================================
334
+
335
+ Some operations can be efficiently run inside the DB without deserializing Python objects.
336
+ Let's calculate the cost of using LLM APIs in a vectorized way.
337
+ Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
338
+
339
+ .. code:: py
340
+
341
+ chain = DataChain.from_dataset("response")
342
+
343
+ cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
344
+ + chain.sum("response.usage.completion_tokens")*0.000006
345
+ print(f"Spent ${cost:.2f} on {chain.count()} calls")
346
+
347
+ Output:
348
+
349
+ .. code:: shell
350
+
351
+ Spent $0.08 on 50 calls
352
+
353
+
354
+ PyTorch data loader
355
+ ===================
356
+
357
+ Chain results can be exported or passed directly to a PyTorch dataloader.
358
+ For example, if we are interested in passing an image and a label based on file
359
+ name prefix, the following code will do it:
360
+
361
+ .. code:: py
362
+
363
+ from torch.utils.data import DataLoader
364
+ from transformers import CLIPProcessor
365
+
366
+ from datachain import C, DataChain
367
+
368
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
369
+
370
+ chain = (
371
+ DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
372
+ .map(label=lambda name: name.split(".")[0], params=["file.name"])
373
+ .select("file", "label").to_pytorch(
374
+ transform=processor.image_processor,
375
+ tokenizer=processor.tokenizer,
376
+ )
377
+ )
378
+ loader = DataLoader(chain, batch_size=1)
379
+
380
+
381
+ Tutorials
382
+ ---------
383
+
384
+ * `Getting Started`_
385
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
386
+
387
+ Contributions
388
+ -------------
389
+
390
+ Contributions are very welcome.
391
+ To learn more, see the `Contributor Guide`_.
392
+
393
+
394
+ Community and Support
395
+ ---------------------
396
+
397
+ * `Docs <https://datachain.dvc.ai/>`_
398
+ * `File an issue`_ if you encounter any problems
399
+ * `Discord Chat <https://dvc.org/chat>`_
400
+ * `Email <mailto:support@dvc.org>`_
401
+ * `Twitter <https://twitter.com/DVCorg>`_
402
+
403
+
404
+ .. _PyPI: https://pypi.org/
405
+ .. _file an issue: https://github.com/iterative/datachain/issues
406
+ .. github-only
407
+ .. _Contributor Guide: CONTRIBUTING.rst
408
+ .. _Pydantic: https://github.com/pydantic/pydantic
409
+ .. _SQLite: https://www.sqlite.org/
410
+ .. _Getting Started: https://datachain.dvc.ai/
411
+ .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
412
+ :alt: DataChain FlowChart
@@ -1,22 +1,23 @@
1
- datachain/__init__.py,sha256=L5IlHOD4AaHkV7P5dbUwdq90I3bGFLtOghoZ1WVFGcs,841
1
+ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
4
- datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
5
- datachain/cli.py,sha256=gikzwEXTDKyzY1xOAUziXN2-OVqnOhDMJTd7SHq0Jxc,32406
4
+ datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
5
+ datachain/cli.py,sha256=MSOID2t-kesk5Z80uoepN63rqvB7iZxaWYLqkiWehkQ,32628
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
8
  datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
9
9
  datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
10
+ datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
10
11
  datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
11
- datachain/node.py,sha256=fsQDJUmRMSRHhL1u6qQlWgreHbH760Ls-yDzFLhbW-U,5724
12
+ datachain/node.py,sha256=LwzSOSM9SbPLI5RvYDsiEkk7d5rbMX8huzM_m7uWKx4,5917
12
13
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
13
14
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
14
15
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
15
16
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
17
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
17
- datachain/utils.py,sha256=12yQAV8tfyCHqp_xJcJBeNnr1L_BO8e2bOPyXdM68gs,10759
18
+ datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
18
19
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
19
- datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
20
+ datachain/catalog/catalog.py,sha256=u8tvWooIon9ju59q8-Re_iqflgbCB-JMZD8n2UC4iag,80397
20
21
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
21
22
  datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
22
23
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -31,51 +32,46 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
31
32
  datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
32
33
  datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
33
34
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
34
- datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
35
+ datachain/data_storage/metastore.py,sha256=R1Jj8dOTAex8fjehewV2vUO4VhBSjj8JQI5mM3YhVEQ,54989
35
36
  datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
36
37
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
37
38
  datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
38
- datachain/data_storage/warehouse.py,sha256=UbD37_jqaM4BY2SsQaTiJre-eSa7HcPejrTp936L080,33170
39
+ datachain/data_storage/warehouse.py,sha256=FedcsvkAphpi2tUnlcrxO4mYumiCQAcrB5XRAK9tfXQ,33288
39
40
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
41
- datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
42
- datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
43
- datachain/lib/dc.py,sha256=Px7zj1mrAsL3sBLu1pezssBQkvY0YAoGJm4VbT2yRwc,34699
44
- datachain/lib/feature_registry.py,sha256=LUrBvDom-k1shFuCv46-OdgntbIUQ5008oyIS0iPM6Q,2298
45
- datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
46
- datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
47
- datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
48
- datachain/lib/hf_pipeline.py,sha256=MBFzixVa25_6QVR9RyOq8Rr9UIQ-sFVcBHducx_sZcY,2069
49
- datachain/lib/image.py,sha256=K0n_P7kmobWTgxe-rDbr5yY3vBrOPnseziE3DXwFFVo,2325
50
- datachain/lib/image_transform.py,sha256=hfgvIrSMGBx_MEXECyvrFoO1NyPBHoDb28j2lT2dsf8,2953
51
- datachain/lib/iptc_exif_xmp.py,sha256=rmlxjOmAP31OCgbGBAwIgd1F_6QVBoSWsOPG6UsBg_w,2007
52
- datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU,6436
53
- datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
54
- datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
41
+ datachain/lib/arrow.py,sha256=WBZ4iVU0CcmCgog1wS-Nrtqhzvf2I4_QqDJtzhaECeA,3641
42
+ datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
43
+ datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
44
+ datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
45
+ datachain/lib/dc.py,sha256=KboCSSyjZ69hIpyjgza4HindFwO7L1Usxa0769N57NA,50561
46
+ datachain/lib/file.py,sha256=xiLHaqyl4rqcBLGD62YD3aBIAOmX4EBVucxIncpRi80,11916
47
+ datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
48
+ datachain/lib/meta_formats.py,sha256=Z2NVH5X4N2rrj5kFxKsHKq3zD4kaRHbDCx3oiUEKYUk,6920
49
+ datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
50
+ datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
55
51
  datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
56
- datachain/lib/signal_schema.py,sha256=xzVHauGrhGcS5aOE1UMqT5YjJeZIMAZYQq76tZShhnY,13550
57
- datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
58
- datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
59
- datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
60
- datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
52
+ datachain/lib/signal_schema.py,sha256=lKGlpRRUHOUFLcpk-pLQd9kGAJ8FPy0Q2bk--UlVemU,14559
53
+ datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
54
+ datachain/lib/udf.py,sha256=mo3NoyYy7fY2UZtZOtAN_jR1e5a803b1dlnD5ztduzk,11454
55
+ datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
61
56
  datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
62
57
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
- datachain/lib/webdataset.py,sha256=eqIDSqfBOhEK43JMp-6lYdYy2x3Ge5lwqR-hKGV8aG0,8259
58
+ datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
64
59
  datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
65
60
  datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
61
  datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
67
- datachain/lib/convert/type_converter.py,sha256=W-wvCIcb6OwWjRJ3EWJE4-LbpoqxsRBd6gYNpFlm8qo,2643
62
+ datachain/lib/convert/python_to_sql.py,sha256=54G6dsMhxo1GKCzPziOqCKo2d4VRWmsJhJYRJxt1Thw,2615
63
+ datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
68
64
  datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
69
- datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0QEckwXlKgFbLA,3088
65
+ datachain/lib/convert/values_to_tuples.py,sha256=Bh8L4zA66XRhQxmONvLvn94_i8MBMYgfJ6A2i7l_6Jo,3592
70
66
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
71
67
  datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
72
68
  datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
73
- datachain/query/dataset.py,sha256=Pmaz16phEummJpWJD3x-8SMMbCb6xcOtWTyMdsFOdOE,64414
74
- datachain/query/dispatch.py,sha256=Qv5QpP5-K9JAmZLntifRzS5_AYHbK82Ahreo302Ntq8,13218
69
+ datachain/query/dataset.py,sha256=m0bDQK_xXB85KPdJpH3OHdW6WJd1_PMgi01GRcWiiSg,61280
70
+ datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
75
71
  datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
76
72
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
77
- datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
78
- datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
73
+ datachain/query/schema.py,sha256=hAvux_GxUmuG_PwtnKkkizld9f0Gvt2JBzbu3m74fvE,7840
74
+ datachain/query/session.py,sha256=am4XCNj8NlZPAYJSvh43C13dQ5NsfzzuyVDjPgYAgJE,3655
79
75
  datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
80
76
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
77
  datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -86,20 +82,19 @@ datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
86
82
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
87
83
  datachain/sql/default/base.py,sha256=h44005q3qtMc9cjWmRufWwcBr5CfK_dnvG4IrcSQs_8,536
88
84
  datachain/sql/functions/__init__.py,sha256=PP8XV1CC1naIu87fiExbJRpV0Rww47EcDrDIKJb_xBQ,368
89
- datachain/sql/functions/array.py,sha256=vgTXFmBTq5-QW3Z8oDo4cFNi0B8zBqQnCRTQQKlp_VU,899
85
+ datachain/sql/functions/array.py,sha256=rvH27SWN9gdh_mFnp0GIiXuCrNW6n8ZbY4I_JUS-_e0,1140
90
86
  datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
91
87
  datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
92
88
  datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
93
- datachain/sql/functions/string.py,sha256=DsyY6ZMAUqmZVRSla-BJLsLYNsIgLOh4XLR3yvYJUbE,505
89
+ datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
94
90
  datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
95
91
  datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
96
92
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
97
93
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
98
- datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
99
- datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
100
- datachain-0.2.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
- datachain-0.2.10.dist-info/METADATA,sha256=bWvqTD9c2joLmkDGpdcutjjF_s1LpccbSCLbkIaKQYQ,16732
102
- datachain-0.2.10.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
103
- datachain-0.2.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
- datachain-0.2.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
- datachain-0.2.10.dist-info/RECORD,,
94
+ datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
95
+ datachain-0.2.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
+ datachain-0.2.12.dist-info/METADATA,sha256=QfDhY5jkblcb94A5CxT-ELhDcwDzZq1ju4cPQXHDEkY,14333
97
+ datachain-0.2.12.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
98
+ datachain-0.2.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
+ datachain-0.2.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
+ datachain-0.2.12.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.3.0)
2
+ Generator: setuptools (71.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,77 +0,0 @@
1
- import logging
2
- from typing import Any, ClassVar, Optional
3
-
4
- from pydantic import BaseModel
5
-
6
- logger = logging.getLogger(__name__)
7
-
8
-
9
- class Registry:
10
- reg: ClassVar[dict[str, dict[int, Any]]] = {}
11
-
12
- @classmethod
13
- def get_version(cls, model: type[BaseModel]) -> int:
14
- if not hasattr(model, "_version"):
15
- return 0
16
- return model._version
17
-
18
- @classmethod
19
- def get_name(cls, model) -> str:
20
- if (version := cls.get_version(model)) > 0:
21
- return f"{model.__name__}@v{version}"
22
- return model.__name__
23
-
24
- @classmethod
25
- def add(cls, fr: type):
26
- if (model := Registry.to_pydantic(fr)) is None:
27
- return
28
-
29
- name = model.__name__
30
- if name not in cls.reg:
31
- cls.reg[name] = {}
32
- version = Registry.get_version(model)
33
- cls.reg[name][version] = model
34
-
35
- for f_info in model.model_fields.values():
36
- if (anno := Registry.to_pydantic(f_info.annotation)) is not None:
37
- cls.add(anno)
38
-
39
- @classmethod
40
- def get(cls, name: str, version: Optional[int] = None) -> Optional[type]:
41
- class_dict = cls.reg.get(name, None)
42
- if class_dict is None:
43
- return None
44
- if version is None:
45
- max_ver = max(class_dict.keys(), default=None)
46
- if max_ver is None:
47
- return None
48
- return class_dict[max_ver]
49
- return class_dict.get(version, None)
50
-
51
- @classmethod
52
- def parse_name_version(cls, fullname: str) -> tuple[str, int]:
53
- name = fullname
54
- version = 0
55
-
56
- if "@" in fullname:
57
- name, version_str = fullname.split("@")
58
- if version_str.strip() != "":
59
- version = int(version_str[1:])
60
-
61
- return name, version
62
-
63
- @classmethod
64
- def remove(cls, fr: type) -> None:
65
- version = fr._version # type: ignore[attr-defined]
66
- if fr.__name__ in cls.reg and version in cls.reg[fr.__name__]:
67
- del cls.reg[fr.__name__][version]
68
-
69
- @staticmethod
70
- def is_pydantic(val):
71
- return not hasattr(val, "__origin__") and issubclass(val, BaseModel)
72
-
73
- @staticmethod
74
- def to_pydantic(val) -> Optional[type[BaseModel]]:
75
- if val is None or not Registry.is_pydantic(val):
76
- return None
77
- return val