datachain 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (46) hide show
  1. datachain/__init__.py +3 -4
  2. datachain/cache.py +10 -4
  3. datachain/catalog/catalog.py +42 -16
  4. datachain/cli.py +48 -32
  5. datachain/data_storage/metastore.py +24 -0
  6. datachain/data_storage/warehouse.py +3 -1
  7. datachain/job.py +56 -0
  8. datachain/lib/arrow.py +19 -7
  9. datachain/lib/clip.py +89 -66
  10. datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
  11. datachain/lib/convert/sql_to_python.py +23 -0
  12. datachain/lib/convert/values_to_tuples.py +51 -33
  13. datachain/lib/data_model.py +6 -27
  14. datachain/lib/dataset_info.py +70 -0
  15. datachain/lib/dc.py +618 -156
  16. datachain/lib/file.py +130 -22
  17. datachain/lib/image.py +1 -1
  18. datachain/lib/meta_formats.py +14 -2
  19. datachain/lib/model_store.py +3 -2
  20. datachain/lib/pytorch.py +10 -7
  21. datachain/lib/signal_schema.py +19 -11
  22. datachain/lib/text.py +2 -1
  23. datachain/lib/udf.py +56 -5
  24. datachain/lib/udf_signature.py +1 -1
  25. datachain/node.py +11 -8
  26. datachain/query/dataset.py +62 -28
  27. datachain/query/schema.py +2 -0
  28. datachain/query/session.py +4 -4
  29. datachain/sql/functions/array.py +12 -0
  30. datachain/sql/functions/string.py +8 -0
  31. datachain/torch/__init__.py +1 -1
  32. datachain/utils.py +6 -0
  33. datachain-0.2.13.dist-info/METADATA +411 -0
  34. {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/RECORD +38 -42
  35. {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/WHEEL +1 -1
  36. datachain/lib/gpt4_vision.py +0 -97
  37. datachain/lib/hf_image_to_text.py +0 -97
  38. datachain/lib/hf_pipeline.py +0 -90
  39. datachain/lib/image_transform.py +0 -103
  40. datachain/lib/iptc_exif_xmp.py +0 -76
  41. datachain/lib/unstructured.py +0 -41
  42. datachain/text/__init__.py +0 -3
  43. datachain-0.2.11.dist-info/METADATA +0 -431
  44. {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/LICENSE +0 -0
  45. {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,411 @@
1
+ Metadata-Version: 2.1
2
+ Name: datachain
3
+ Version: 0.2.13
4
+ Summary: Wrangle unstructured AI data at scale
5
+ Author-email: Dmitry Petrov <support@dvc.org>
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://datachain.dvc.ai
8
+ Project-URL: Issues, https://github.com/iterative/datachain/issues
9
+ Project-URL: Source, https://github.com/iterative/datachain
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Development Status :: 2 - Pre-Alpha
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/x-rst
18
+ License-File: LICENSE
19
+ Requires-Dist: pyyaml
20
+ Requires-Dist: tomlkit
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: numpy
23
+ Requires-Dist: pandas >=2.0.0
24
+ Requires-Dist: pyarrow
25
+ Requires-Dist: typing-extensions
26
+ Requires-Dist: python-dateutil >=2
27
+ Requires-Dist: attrs >=21.3.0
28
+ Requires-Dist: s3fs >=2024.2.0
29
+ Requires-Dist: gcsfs >=2024.2.0
30
+ Requires-Dist: adlfs >=2024.2.0
31
+ Requires-Dist: dvc-data <4,>=3.10
32
+ Requires-Dist: dvc-objects <6,>=4
33
+ Requires-Dist: shtab <2,>=1.3.4
34
+ Requires-Dist: sqlalchemy >=2
35
+ Requires-Dist: multiprocess ==0.70.16
36
+ Requires-Dist: dill ==0.3.8
37
+ Requires-Dist: cloudpickle
38
+ Requires-Dist: ujson >=5.9.0
39
+ Requires-Dist: pydantic <3,>=2
40
+ Requires-Dist: jmespath >=1.0
41
+ Requires-Dist: datamodel-code-generator >=0.25
42
+ Requires-Dist: Pillow <11,>=10.0.0
43
+ Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
44
+ Provides-Extra: dev
45
+ Requires-Dist: datachain[docs,tests] ; extra == 'dev'
46
+ Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
47
+ Requires-Dist: types-python-dateutil ; extra == 'dev'
48
+ Requires-Dist: types-pytz ; extra == 'dev'
49
+ Requires-Dist: types-PyYAML ; extra == 'dev'
50
+ Requires-Dist: types-requests ; extra == 'dev'
51
+ Requires-Dist: types-ujson ; extra == 'dev'
52
+ Provides-Extra: docs
53
+ Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
54
+ Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
55
+ Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
56
+ Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
57
+ Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
58
+ Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
59
+ Provides-Extra: remote
60
+ Requires-Dist: lz4 ; extra == 'remote'
61
+ Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
62
+ Requires-Dist: requests >=2.22.0 ; extra == 'remote'
63
+ Provides-Extra: tests
64
+ Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
65
+ Requires-Dist: pytest <9,>=8 ; extra == 'tests'
66
+ Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
67
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
68
+ Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
69
+ Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
70
+ Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
71
+ Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
72
+ Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
73
+ Requires-Dist: virtualenv ; extra == 'tests'
74
+ Requires-Dist: dulwich ; extra == 'tests'
75
+ Requires-Dist: hypothesis ; extra == 'tests'
76
+ Requires-Dist: open-clip-torch ; extra == 'tests'
77
+ Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
78
+ Requires-Dist: requests-mock ; extra == 'tests'
79
+ Provides-Extra: torch
80
+ Requires-Dist: torch >=2.1.0 ; extra == 'torch'
81
+ Requires-Dist: torchvision ; extra == 'torch'
82
+ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
83
+ Provides-Extra: vector
84
+ Requires-Dist: usearch ; extra == 'vector'
85
+
86
+ |PyPI| |Python Version| |Codecov| |Tests|
87
+
88
+ .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
89
+ :target: https://pypi.org/project/datachain/
90
+ :alt: PyPI
91
+ .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
92
+ :target: https://pypi.org/project/datachain
93
+ :alt: Python Version
94
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
95
+ :target: https://codecov.io/gh/iterative/datachain
96
+ :alt: Codecov
97
+ .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
98
+ :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
99
+ :alt: Tests
100
+
101
+ AI 🔗 DataChain
102
+ ----------------
103
+
104
+ DataChain is an open-source Python library for processing and curating unstructured
105
+ data at scale.
106
+
107
+ 🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
108
+
109
+ 🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
110
+
111
+ 🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
112
+
113
+
114
+ DataChain supports parallel processing, parallel data
115
+ downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
116
+
117
+ The typical use cases include Computer Vision data curation, LLM analytics,
118
+ and validation of multimodal AI applications.
119
+
120
+
121
+ .. code:: console
122
+
123
+ $ pip install datachain
124
+
125
+ |Flowchart|
126
+
127
+ Quick Start
128
+ -----------
129
+
130
+ Data curation with a local model
131
+ =================================
132
+
133
+ We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
134
+ - 50 files total in this example.
135
+ These dialogs involve users chatting with a bot while looking for better wireless plans.
136
+ Our goal is to identify the successful dialogs.
137
+
138
+ The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
139
+
140
+ First, we'll show batch inference with a simple sentiment model using the `transformers` library:
141
+
142
+ .. code:: shell
143
+
144
+ pip install transformers
145
+
146
+ The code below downloads files from the cloud, and applies a user-defined function
147
+ to each one of them. All files with a positive sentiment
148
+ detected are then copied to the local directory.
149
+
150
+ .. code:: py
151
+
152
+ from transformers import pipeline
153
+ from datachain import DataChain, Column
154
+
155
+ classifier = pipeline("sentiment-analysis", device="cpu",
156
+ model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
157
+
158
+ def is_positive_dialogue_ending(file) -> bool:
159
+ dialogue_ending = file.read()[-512:]
160
+ return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
161
+
162
+ chain = (
163
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
164
+ object_name="file", type="text")
165
+ .settings(parallel=8, cache=True)
166
+ .map(is_positive=is_positive_dialogue_ending)
167
+ .save("file_response")
168
+ )
169
+
170
+ positive_chain = chain.filter(Column("is_positive") == True)
171
+ positive_chain.export_files("./output")
172
+
173
+ print(f"{positive_chain.count()} files were exported")
174
+
175
+
176
+
177
+ 13 files were exported
178
+
179
+ .. code:: shell
180
+
181
+ $ ls output/datachain-demo/chatbot-KiT/
182
+ 15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
183
+ $ ls output/datachain-demo/chatbot-KiT/ | wc -l
184
+ 13
185
+
186
+
187
+ LLM judging chatbots
188
+ =============================
189
+
190
+ LLMs can work as efficient universal classifiers. In the example below,
191
+ we employ a free API from Mistral to judge the chatbot performance. Please get a free
192
+ Mistral API key at https://console.mistral.ai
193
+
194
+ .. code:: shell
195
+
196
+ $ pip install mistralai
197
+ $ export MISTRAL_API_KEY=_your_key_
198
+
199
+ DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
200
+
201
+ .. code:: py
202
+
203
+ from mistralai.client import MistralClient
204
+ from mistralai.models.chat_completion import ChatMessage
205
+ from datachain import File, DataChain, Column
206
+
207
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
208
+
209
+ def eval_dialogue(file: File) -> bool:
210
+ client = MistralClient()
211
+ response = client.chat(
212
+ model="open-mixtral-8x22b",
213
+ messages=[ChatMessage(role="system", content=PROMPT),
214
+ ChatMessage(role="user", content=file.read())])
215
+ result = response.choices[0].message.content
216
+ return result.lower().startswith("success")
217
+
218
+ chain = (
219
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
220
+ .settings(parallel=4, cache=True)
221
+ .map(is_success=eval_dialogue)
222
+ .save("mistral_files")
223
+ )
224
+
225
+ successful_chain = chain.filter(Column("is_success") == True)
226
+ successful_chain.export_files("./output_mistral")
227
+
228
+ print(f"{successful_chain.count()} files were exported")
229
+
230
+
231
+ With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
232
+
233
+ .. code:: shell
234
+
235
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
236
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
237
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
238
+ 31
239
+
240
+
241
+
242
+ Serializing Python-objects
243
+ ==========================
244
+
245
+ LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
246
+ model performance parameters.
247
+
248
+ Instead of extracting this information from the Mistral response data structure (class
249
+ `ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
250
+
251
+
252
+ .. code:: py
253
+
254
+ from mistralai.client import MistralClient
255
+ from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
256
+ from datachain import File, DataChain, Column
257
+
258
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
259
+
260
+ def eval_dialog(file: File) -> ChatCompletionResponse:
261
+ client = MistralClient()
262
+ return client.chat(
263
+ model="open-mixtral-8x22b",
264
+ messages=[ChatMessage(role="system", content=PROMPT),
265
+ ChatMessage(role="user", content=file.read())])
266
+
267
+ chain = (
268
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
269
+ .settings(parallel=4, cache=True)
270
+ .map(response=eval_dialog)
271
+ .map(status=lambda response: response.choices[0].message.content.lower()[:7])
272
+ .save("response")
273
+ )
274
+
275
+ chain.select("file.name", "status", "response.usage").show(5)
276
+
277
+ success_rate = chain.filter(Column("status") == "success").count() / chain.count()
278
+ print(f"{100*success_rate:.1f}% dialogs were successful")
279
+
280
+ Output:
281
+
282
+ .. code:: shell
283
+
284
+ file status response response response
285
+ name usage usage usage
286
+ prompt_tokens total_tokens completion_tokens
287
+ 0 1.txt success 547 548 1
288
+ 1 10.txt failure 3576 3578 2
289
+ 2 11.txt failure 626 628 2
290
+ 3 12.txt failure 1144 1182 38
291
+ 4 13.txt success 1100 1101 1
292
+
293
+ [Limited by 5 rows]
294
+ 64.0% dialogs were successful
295
+
296
+
297
+ Iterating over Python data structures
298
+ =============================================
299
+
300
+ In the previous examples, datasets were saved in the embedded database
301
+ (`SQLite`_ in folder `.datachain` of the working directory).
302
+ These datasets were automatically versioned, and can be accessed using
303
+ `DataChain.from_dataset("dataset_name")`.
304
+
305
+ Here is how to retrieve a saved dataset and iterate over the objects:
306
+
307
+ .. code:: py
308
+
309
+ chain = DataChain.from_dataset("response")
310
+
311
+ # Iterating one-by-one: support out-of-memory workflow
312
+ for file, response in chain.limit(5).collect("file", "response"):
313
+ # verify the collected Python objects
314
+ assert isinstance(response, ChatCompletionResponse)
315
+
316
+ status = response.choices[0].message.content[:7]
317
+ tokens = response.usage.total_tokens
318
+ print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
319
+
320
+ Output:
321
+
322
+ .. code:: shell
323
+
324
+ gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
325
+ gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
326
+ gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
327
+ gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
328
+ gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
329
+
330
+
331
+ Vectorized analytics over Python objects
332
+ ========================================
333
+
334
+ Some operations can run inside the DB without deserialization.
335
+ For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
336
+
337
+ .. code:: py
338
+
339
+ chain = DataChain.from_dataset("response")
340
+
341
+ cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
342
+ + chain.sum("response.usage.completion_tokens")*0.000006
343
+ print(f"Spent ${cost:.2f} on {chain.count()} calls")
344
+
345
+ Output:
346
+
347
+ .. code:: shell
348
+
349
+ Spent $0.08 on 50 calls
350
+
351
+
352
+ PyTorch data loader
353
+ ===================
354
+
355
+ Chain results can be exported or passed directly to a PyTorch dataloader.
356
+ For example, if we are interested in passing an image and a label based on file
357
+ name suffix, the following code will do it:
358
+
359
+ .. code:: py
360
+
361
+ from torch.utils.data import DataLoader
362
+ from transformers import CLIPProcessor
363
+
364
+ from datachain import C, DataChain
365
+
366
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
367
+
368
+ chain = (
369
+ DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
370
+ .map(label=lambda name: name.split(".")[0], params=["file.name"])
371
+ .select("file", "label").to_pytorch(
372
+ transform=processor.image_processor,
373
+ tokenizer=processor.tokenizer,
374
+ )
375
+ )
376
+ loader = DataLoader(chain, batch_size=1)
377
+
378
+
379
+ Tutorials
380
+ ---------
381
+
382
+ * `Getting Started`_
383
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
384
+
385
+ Contributions
386
+ -------------
387
+
388
+ Contributions are very welcome.
389
+ To learn more, see the `Contributor Guide`_.
390
+
391
+
392
+ Community and Support
393
+ ---------------------
394
+
395
+ * `Docs <https://datachain.dvc.ai/>`_
396
+ * `File an issue`_ if you encounter any problems
397
+ * `Discord Chat <https://dvc.org/chat>`_
398
+ * `Email <mailto:support@dvc.org>`_
399
+ * `Twitter <https://twitter.com/DVCorg>`_
400
+
401
+
402
+ .. _PyPI: https://pypi.org/
403
+ .. _file an issue: https://github.com/iterative/datachain/issues
404
+ .. github-only
405
+ .. _Contributor Guide: CONTRIBUTING.rst
406
+ .. _Pydantic: https://github.com/pydantic/pydantic
407
+ .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
408
+ .. _SQLite: https://www.sqlite.org/
409
+ .. _Getting Started: https://datachain.dvc.ai/
410
+ .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
411
+ :alt: DataChain FlowChart
@@ -1,22 +1,23 @@
1
- datachain/__init__.py,sha256=L5IlHOD4AaHkV7P5dbUwdq90I3bGFLtOghoZ1WVFGcs,841
1
+ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
4
- datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
5
- datachain/cli.py,sha256=gikzwEXTDKyzY1xOAUziXN2-OVqnOhDMJTd7SHq0Jxc,32406
4
+ datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
5
+ datachain/cli.py,sha256=Twb6BXjNxfAAGj42dUOJ7Ah5etkrTDVfMzAmINWUSOI,33104
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
8
  datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
9
9
  datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
10
+ datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
10
11
  datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
11
- datachain/node.py,sha256=fsQDJUmRMSRHhL1u6qQlWgreHbH760Ls-yDzFLhbW-U,5724
12
+ datachain/node.py,sha256=LwzSOSM9SbPLI5RvYDsiEkk7d5rbMX8huzM_m7uWKx4,5917
12
13
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
13
14
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
14
15
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
15
16
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
17
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
17
- datachain/utils.py,sha256=AWUXRk7yvDpHcqzzPWwzv8HtF1-jDVEBHKxAgT7u02E,12288
18
+ datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
18
19
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
19
- datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
20
+ datachain/catalog/catalog.py,sha256=ab-PLPa9CMeHCo9asHjkqw4mZ6tHM4x8bsswfMtr65w,80575
20
21
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
21
22
  datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
22
23
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -31,50 +32,46 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
31
32
  datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
32
33
  datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
33
34
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
34
- datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
35
+ datachain/data_storage/metastore.py,sha256=R1Jj8dOTAex8fjehewV2vUO4VhBSjj8JQI5mM3YhVEQ,54989
35
36
  datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
36
37
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
37
38
  datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
38
- datachain/data_storage/warehouse.py,sha256=UbD37_jqaM4BY2SsQaTiJre-eSa7HcPejrTp936L080,33170
39
+ datachain/data_storage/warehouse.py,sha256=FedcsvkAphpi2tUnlcrxO4mYumiCQAcrB5XRAK9tfXQ,33288
39
40
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
41
- datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
42
- datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
43
- datachain/lib/dc.py,sha256=rd-7gVcMRZ2M-O8aQhNx85H31w-kRQHpXSwtf26dSk4,35849
44
- datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
45
- datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
46
- datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
47
- datachain/lib/hf_pipeline.py,sha256=MBFzixVa25_6QVR9RyOq8Rr9UIQ-sFVcBHducx_sZcY,2069
48
- datachain/lib/image.py,sha256=K0n_P7kmobWTgxe-rDbr5yY3vBrOPnseziE3DXwFFVo,2325
49
- datachain/lib/image_transform.py,sha256=hfgvIrSMGBx_MEXECyvrFoO1NyPBHoDb28j2lT2dsf8,2953
50
- datachain/lib/iptc_exif_xmp.py,sha256=rmlxjOmAP31OCgbGBAwIgd1F_6QVBoSWsOPG6UsBg_w,2007
51
- datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU,6436
52
- datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
53
- datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
41
+ datachain/lib/arrow.py,sha256=WBZ4iVU0CcmCgog1wS-Nrtqhzvf2I4_QqDJtzhaECeA,3641
42
+ datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
43
+ datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
44
+ datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
45
+ datachain/lib/dc.py,sha256=KboCSSyjZ69hIpyjgza4HindFwO7L1Usxa0769N57NA,50561
46
+ datachain/lib/file.py,sha256=Y1QQ1pBSESjlA9n6_ukc3YtugeiTeF12xcncyfdCL2k,12128
47
+ datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
48
+ datachain/lib/meta_formats.py,sha256=Z2NVH5X4N2rrj5kFxKsHKq3zD4kaRHbDCx3oiUEKYUk,6920
49
+ datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
50
+ datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
54
51
  datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
55
- datachain/lib/signal_schema.py,sha256=mRdq5qEGnFQgbSawzDPi2MCZ6PULTMigd51B2RuNxpg,14173
56
- datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
57
- datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
58
- datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
59
- datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
52
+ datachain/lib/signal_schema.py,sha256=lKGlpRRUHOUFLcpk-pLQd9kGAJ8FPy0Q2bk--UlVemU,14559
53
+ datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
54
+ datachain/lib/udf.py,sha256=mo3NoyYy7fY2UZtZOtAN_jR1e5a803b1dlnD5ztduzk,11454
55
+ datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
60
56
  datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
61
57
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
58
  datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
63
59
  datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
64
60
  datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
61
  datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
66
- datachain/lib/convert/type_converter.py,sha256=W-wvCIcb6OwWjRJ3EWJE4-LbpoqxsRBd6gYNpFlm8qo,2643
62
+ datachain/lib/convert/python_to_sql.py,sha256=54G6dsMhxo1GKCzPziOqCKo2d4VRWmsJhJYRJxt1Thw,2615
63
+ datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
67
64
  datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
68
- datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0QEckwXlKgFbLA,3088
65
+ datachain/lib/convert/values_to_tuples.py,sha256=Bh8L4zA66XRhQxmONvLvn94_i8MBMYgfJ6A2i7l_6Jo,3592
69
66
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
70
67
  datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
71
68
  datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
72
- datachain/query/dataset.py,sha256=P1KBv_R0YnKjNDHzOJwAx9qhwI08l0dLgaXfak3ps7k,60578
69
+ datachain/query/dataset.py,sha256=jOMdvsQIMZq1hYPfqR_iKzGSlJ8m-7Wz75QxdFHdfwY,61567
73
70
  datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
74
71
  datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
75
72
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
76
- datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
77
- datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
73
+ datachain/query/schema.py,sha256=hAvux_GxUmuG_PwtnKkkizld9f0Gvt2JBzbu3m74fvE,7840
74
+ datachain/query/session.py,sha256=am4XCNj8NlZPAYJSvh43C13dQ5NsfzzuyVDjPgYAgJE,3655
78
75
  datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
79
76
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
77
  datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -85,20 +82,19 @@ datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
85
82
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
86
83
  datachain/sql/default/base.py,sha256=h44005q3qtMc9cjWmRufWwcBr5CfK_dnvG4IrcSQs_8,536
87
84
  datachain/sql/functions/__init__.py,sha256=PP8XV1CC1naIu87fiExbJRpV0Rww47EcDrDIKJb_xBQ,368
88
- datachain/sql/functions/array.py,sha256=vgTXFmBTq5-QW3Z8oDo4cFNi0B8zBqQnCRTQQKlp_VU,899
85
+ datachain/sql/functions/array.py,sha256=rvH27SWN9gdh_mFnp0GIiXuCrNW6n8ZbY4I_JUS-_e0,1140
89
86
  datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
90
87
  datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
91
88
  datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
92
- datachain/sql/functions/string.py,sha256=DsyY6ZMAUqmZVRSla-BJLsLYNsIgLOh4XLR3yvYJUbE,505
89
+ datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
93
90
  datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
94
91
  datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
95
92
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
96
93
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
97
- datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
98
- datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
99
- datachain-0.2.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
100
- datachain-0.2.11.dist-info/METADATA,sha256=OVKgVc-Wc75AAQIY6hGL1CEBmnwksfgOXfiUen_xAOM,16759
101
- datachain-0.2.11.dist-info/WHEEL,sha256=FZ75kcLy9M91ncbIgG8dnpCncbiKXSRGJ_PFILs6SFg,91
102
- datachain-0.2.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
103
- datachain-0.2.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
104
- datachain-0.2.11.dist-info/RECORD,,
94
+ datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
95
+ datachain-0.2.13.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
+ datachain-0.2.13.dist-info/METADATA,sha256=jiEob-wl7pePOekp9tVY6h00czklAsktsmw910EvZbo,14619
97
+ datachain-0.2.13.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
98
+ datachain-0.2.13.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
+ datachain-0.2.13.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
+ datachain-0.2.13.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.0.1)
2
+ Generator: setuptools (71.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,97 +0,0 @@
1
- import base64
2
- import io
3
- import os
4
-
5
- import requests
6
- from PIL import Image, ImageOps, UnidentifiedImageError
7
-
8
- from datachain.query import Object, udf
9
- from datachain.sql.types import String
10
-
11
- DEFAULT_FIT_BOX = (500, 500)
12
- DEFAULT_TOKENS = 300
13
-
14
-
15
- def encode_image(raw):
16
- try:
17
- img = Image.open(raw)
18
- except UnidentifiedImageError:
19
- return None
20
- img.load()
21
- img = ImageOps.fit(img, DEFAULT_FIT_BOX)
22
- output = io.BytesIO()
23
- img.save(output, format="JPEG")
24
- hex_data = output.getvalue()
25
- return base64.b64encode(hex_data).decode("utf-8")
26
-
27
-
28
- @udf(
29
- params=(Object(encode_image),), # Columns consumed by the UDF.
30
- output={
31
- "description": String,
32
- "error": String,
33
- }, # Signals being returned by the UDF.
34
- method="image_description",
35
- )
36
- class DescribeImage:
37
- def __init__(
38
- self,
39
- prompt="What is in this image?",
40
- max_tokens=DEFAULT_TOKENS,
41
- key="",
42
- timeout=30,
43
- ):
44
- if not key:
45
- key = os.getenv("OPENAI_API_KEY", "")
46
- if not key:
47
- raise ValueError(
48
- "No key found. Please pass key or set the OPENAI_API_KEY "
49
- "environment variable."
50
- )
51
- self.prompt = prompt
52
- self.max_tokens = max_tokens
53
- self.headers = {
54
- "Content-Type": "application/json",
55
- "Authorization": f"Bearer {key}",
56
- }
57
- self.timeout = timeout
58
-
59
- def image_description(self, base64_image):
60
- if base64_image is None:
61
- return ("", "Unknown image format")
62
-
63
- payload = {
64
- "model": "gpt-4-vision-preview",
65
- "messages": [
66
- {
67
- "role": "user",
68
- "content": [
69
- {"type": "text", "text": self.prompt},
70
- {
71
- "type": "image_url",
72
- "image_url": {
73
- "url": f"data:image/jpeg;base64,{base64_image}"
74
- },
75
- },
76
- ],
77
- }
78
- ],
79
- "max_tokens": self.max_tokens,
80
- }
81
-
82
- response = requests.post(
83
- "https://api.openai.com/v1/chat/completions",
84
- headers=self.headers,
85
- json=payload,
86
- timeout=self.timeout,
87
- )
88
- json_response = response.json()
89
-
90
- if "error" in json_response:
91
- error = str(json_response["error"])
92
- openai_description = ""
93
- else:
94
- error = ""
95
- openai_description = json_response["choices"][0]["message"]["content"]
96
-
97
- return (openai_description, error)