datachain 0.2.11__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (46) hide show
  1. datachain/__init__.py +3 -4
  2. datachain/cache.py +10 -4
  3. datachain/catalog/catalog.py +35 -15
  4. datachain/cli.py +37 -32
  5. datachain/data_storage/metastore.py +24 -0
  6. datachain/data_storage/warehouse.py +3 -1
  7. datachain/job.py +56 -0
  8. datachain/lib/arrow.py +19 -7
  9. datachain/lib/clip.py +89 -66
  10. datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
  11. datachain/lib/convert/sql_to_python.py +23 -0
  12. datachain/lib/convert/values_to_tuples.py +51 -33
  13. datachain/lib/data_model.py +6 -27
  14. datachain/lib/dataset_info.py +70 -0
  15. datachain/lib/dc.py +618 -156
  16. datachain/lib/file.py +117 -15
  17. datachain/lib/image.py +1 -1
  18. datachain/lib/meta_formats.py +14 -2
  19. datachain/lib/model_store.py +3 -2
  20. datachain/lib/pytorch.py +10 -7
  21. datachain/lib/signal_schema.py +19 -11
  22. datachain/lib/text.py +2 -1
  23. datachain/lib/udf.py +56 -5
  24. datachain/lib/udf_signature.py +1 -1
  25. datachain/node.py +11 -8
  26. datachain/query/dataset.py +52 -26
  27. datachain/query/schema.py +2 -0
  28. datachain/query/session.py +4 -4
  29. datachain/sql/functions/array.py +12 -0
  30. datachain/sql/functions/string.py +8 -0
  31. datachain/torch/__init__.py +1 -1
  32. datachain/utils.py +6 -0
  33. datachain-0.2.12.dist-info/METADATA +412 -0
  34. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/RECORD +38 -42
  35. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
  36. datachain/lib/gpt4_vision.py +0 -97
  37. datachain/lib/hf_image_to_text.py +0 -97
  38. datachain/lib/hf_pipeline.py +0 -90
  39. datachain/lib/image_transform.py +0 -103
  40. datachain/lib/iptc_exif_xmp.py +0 -76
  41. datachain/lib/unstructured.py +0 -41
  42. datachain/text/__init__.py +0 -3
  43. datachain-0.2.11.dist-info/METADATA +0 -431
  44. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
  45. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,412 @@
1
+ Metadata-Version: 2.1
2
+ Name: datachain
3
+ Version: 0.2.12
4
+ Summary: Wrangle unstructured AI data at scale
5
+ Author-email: Dmitry Petrov <support@dvc.org>
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://datachain.dvc.ai
8
+ Project-URL: Issues, https://github.com/iterative/datachain/issues
9
+ Project-URL: Source, https://github.com/iterative/datachain
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Development Status :: 2 - Pre-Alpha
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/x-rst
18
+ License-File: LICENSE
19
+ Requires-Dist: pyyaml
20
+ Requires-Dist: tomlkit
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: numpy
23
+ Requires-Dist: pandas >=2.0.0
24
+ Requires-Dist: pyarrow
25
+ Requires-Dist: typing-extensions
26
+ Requires-Dist: python-dateutil >=2
27
+ Requires-Dist: attrs >=21.3.0
28
+ Requires-Dist: s3fs >=2024.2.0
29
+ Requires-Dist: gcsfs >=2024.2.0
30
+ Requires-Dist: adlfs >=2024.2.0
31
+ Requires-Dist: dvc-data <4,>=3.10
32
+ Requires-Dist: dvc-objects <6,>=4
33
+ Requires-Dist: shtab <2,>=1.3.4
34
+ Requires-Dist: sqlalchemy >=2
35
+ Requires-Dist: multiprocess ==0.70.16
36
+ Requires-Dist: dill ==0.3.8
37
+ Requires-Dist: cloudpickle
38
+ Requires-Dist: ujson >=5.9.0
39
+ Requires-Dist: pydantic <3,>=2
40
+ Requires-Dist: jmespath >=1.0
41
+ Requires-Dist: datamodel-code-generator >=0.25
42
+ Requires-Dist: Pillow <11,>=10.0.0
43
+ Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
44
+ Provides-Extra: dev
45
+ Requires-Dist: datachain[docs,tests] ; extra == 'dev'
46
+ Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
47
+ Requires-Dist: types-python-dateutil ; extra == 'dev'
48
+ Requires-Dist: types-PyYAML ; extra == 'dev'
49
+ Requires-Dist: types-requests ; extra == 'dev'
50
+ Requires-Dist: types-ujson ; extra == 'dev'
51
+ Provides-Extra: docs
52
+ Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
53
+ Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
54
+ Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
55
+ Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
56
+ Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
57
+ Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
58
+ Provides-Extra: remote
59
+ Requires-Dist: lz4 ; extra == 'remote'
60
+ Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
61
+ Requires-Dist: requests >=2.22.0 ; extra == 'remote'
62
+ Provides-Extra: tests
63
+ Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
64
+ Requires-Dist: pytest <9,>=8 ; extra == 'tests'
65
+ Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
66
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
67
+ Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
68
+ Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
69
+ Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
70
+ Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
71
+ Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
72
+ Requires-Dist: virtualenv ; extra == 'tests'
73
+ Requires-Dist: dulwich ; extra == 'tests'
74
+ Requires-Dist: hypothesis ; extra == 'tests'
75
+ Requires-Dist: open-clip-torch ; extra == 'tests'
76
+ Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
77
+ Requires-Dist: requests-mock ; extra == 'tests'
78
+ Provides-Extra: torch
79
+ Requires-Dist: torch >=2.1.0 ; extra == 'torch'
80
+ Requires-Dist: torchvision ; extra == 'torch'
81
+ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
82
+ Provides-Extra: vector
83
+ Requires-Dist: usearch ; extra == 'vector'
84
+
85
+ |PyPI| |Python Version| |Codecov| |Tests|
86
+
87
+ .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
88
+ :target: https://pypi.org/project/datachain/
89
+ :alt: PyPI
90
+ .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
91
+ :target: https://pypi.org/project/datachain
92
+ :alt: Python Version
93
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
94
+ :target: https://codecov.io/gh/iterative/datachain
95
+ :alt: Codecov
96
+ .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
97
+ :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
98
+ :alt: Tests
99
+
100
+ AI 🔗 DataChain
101
+ ----------------
102
+
103
+ DataChain is an open-source Python library for processing and curating unstructured
104
+ data at scale.
105
+
106
+ 🤖 AI-Driven Data Curation: Use local ML models and LLM API calls to enrich your data.
107
+
108
+ 🚀 GenAI Dataset scale: Handle 10s of millions of files or file snippets.
109
+
110
+ 🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
111
+
112
+
113
+ To ensure efficiency, Datachain supports parallel processing, parallel data
114
+ downloads, and out-of-memory computing. It excels at optimizing batch operations.
115
+ While most GenAI tools focus on online and real-time applications, DataChain is designed
116
+ for offline data processing, data curation and ETL.
117
+
118
+ The typical use cases are Computer Vision data curation, LLM analytics
119
+ and validation.
120
+
121
+
122
+ .. code:: console
123
+
124
+ $ pip install datachain
125
+
126
+ |Flowchart|
127
+
128
+ Quick Start
129
+ -----------
130
+
131
+ Basic evaluation
132
+ ================
133
+
134
+ We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
135
+ - 50 files total in the example.
136
+ These dialogs involve users looking for better wireless plans chatting with a bot.
137
+ Our goal is to identify successful dialogs.
138
+
139
+ The data used in the examples is publicly available. Please feel free to run this code.
140
+
141
+ First, we'll use a simple sentiment analysis model. Please install transformers.
142
+
143
+ .. code:: shell
144
+
145
+ pip install transformers
146
+
147
+ The code below downloads files from the cloud and applies the function
148
+ `is_positive_dialogue_ending()` to each. All files with a positive sentiment
149
+ are copied to local directory `output/`.
150
+
151
+ .. code:: py
152
+
153
+ from transformers import pipeline
154
+ from datachain import DataChain, Column
155
+
156
+ classifier = pipeline("sentiment-analysis", device="cpu",
157
+ model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
158
+
159
+ def is_positive_dialogue_ending(file) -> bool:
160
+ dialogue_ending = file.read()[-512:]
161
+ return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
162
+
163
+ chain = (
164
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
165
+ object_name="file", type="text")
166
+ .settings(parallel=8, cache=True)
167
+ .map(is_positive=is_positive_dialogue_ending)
168
+ .save("file_response")
169
+ )
170
+
171
+ positive_chain = chain.filter(Column("is_positive") == True)
172
+ positive_chain.export_files("./output1")
173
+
174
+ print(f"{positive_chain.count()} files were exported")
175
+
176
+
177
+
178
+ 13 files were exported
179
+
180
+ .. code:: shell
181
+
182
+ $ ls output/datachain-demo/chatbot-KiT/
183
+ 15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
184
+ $ ls output/datachain-demo/chatbot-KiT/ | wc -l
185
+ 13
186
+
187
+
188
+ LLM judging LLMs dialogs
189
+ ==========================
190
+
191
+ Finding good dialogs using an LLM can be more efficient. In this example,
192
+ we use Mistral with a free API. Please install the package and get a free
193
+ Mistral API key at https://console.mistral.ai
194
+
195
+ .. code:: shell
196
+
197
+ $ pip install mistralai
198
+ $ export MISTRAL_API_KEY=_your_key_
199
+
200
+ Below is a similar code example, but this time using an LLM to evaluate the dialogs.
201
+ Note, only 4 threads were used in this example `parallel=4` due to a limitation of
202
+ the free LLM service.
203
+
204
+ .. code:: py
205
+
206
+ from mistralai.client import MistralClient
207
+ from mistralai.models.chat_completion import ChatMessage
208
+ from datachain import File, DataChain, Column
209
+
210
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
211
+
212
+ def eval_dialogue(file: File) -> bool:
213
+ client = MistralClient()
214
+ response = client.chat(
215
+ model="open-mixtral-8x22b",
216
+ messages=[ChatMessage(role="system", content=PROMPT),
217
+ ChatMessage(role="user", content=file.read())])
218
+ result = response.choices[0].message.content
219
+ return result.lower().startswith("success")
220
+
221
+ chain = (
222
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
223
+ .settings(parallel=4, cache=True)
224
+ .map(is_success=eval_dialogue)
225
+ .save("mistral_files")
226
+ )
227
+
228
+ successful_chain = chain.filter(Column("is_success") == True)
229
+ successful_chain.export_files("./output_mistral")
230
+
231
+ print(f"{successful_chain.count()} files were exported")
232
+
233
+
234
+ With the current prompt, we found 31 files considered successful dialogs:
235
+
236
+ .. code:: shell
237
+
238
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
239
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
240
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
241
+ 31
242
+
243
+
244
+
245
+ Serializing Python-objects
246
+ ==========================
247
+
248
+ LLM responses contain valuable information for analytics, such as tokens used and the
249
+ model. Preserving this information can be beneficial.
250
+
251
+ Instead of extracting this information from the Mistral data structure (class
252
+ `ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
253
+
254
+
255
+ .. code:: py
256
+
257
+ from mistralai.client import MistralClient
258
+ from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
259
+ from datachain import File, DataChain, Column
260
+
261
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
262
+
263
+ def eval_dialog(file: File) -> ChatCompletionResponse:
264
+ client = MistralClient()
265
+ return client.chat(
266
+ model="open-mixtral-8x22b",
267
+ messages=[ChatMessage(role="system", content=PROMPT),
268
+ ChatMessage(role="user", content=file.read())])
269
+
270
+ chain = (
271
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
272
+ .settings(parallel=4, cache=True)
273
+ .map(response=eval_dialog)
274
+ .map(status=lambda response: response.choices[0].message.content.lower()[:7])
275
+ .save("response")
276
+ )
277
+
278
+ chain.select("file.name", "status", "response.usage").show(5)
279
+
280
+ success_rate = chain.filter(Column("status") == "success").count() / chain.count()
281
+ print(f"{100*success_rate:.1f}% dialogs were successful")
282
+
283
+ Output:
284
+
285
+ .. code:: shell
286
+
287
+ file status response response response
288
+ name usage usage usage
289
+ prompt_tokens total_tokens completion_tokens
290
+ 0 1.txt success 547 548 1
291
+ 1 10.txt failure 3576 3578 2
292
+ 2 11.txt failure 626 628 2
293
+ 3 12.txt failure 1144 1182 38
294
+ 4 13.txt success 1100 1101 1
295
+
296
+ [Limited by 5 rows]
297
+ 64.0% dialogs were successful
298
+
299
+
300
+ Complex Python data structures
301
+ =============================================
302
+
303
+ In the previous examples, a few datasets were saved in the embedded database
304
+ (`SQLite`_ in directory `.datachain`).
305
+ These datasets are versioned, and can be accessed using
306
+ `DataChain.from_dataset("dataset_name")`.
307
+
308
+ .. code:: py
309
+
310
+ chain = DataChain.from_dataset("response")
311
+
312
+ # Iterating one-by-one: out of memory
313
+ for file, response in chain.limit(5).collect("file", "response"):
314
+ # You work with Python objects
315
+ assert isinstance(response, ChatCompletionResponse)
316
+
317
+ status = response.choices[0].message.content[:7]
318
+ tokens = response.usage.total_tokens
319
+ print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
320
+
321
+ Output:
322
+
323
+ .. code:: shell
324
+
325
+ gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
326
+ gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
327
+ gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
328
+ gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
329
+ gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
330
+
331
+
332
+ Vectorized analytics over Python objects
333
+ ========================================
334
+
335
+ Some operations can be efficiently run inside the DB without deserializing Python objects.
336
+ Let's calculate the cost of using LLM APIs in a vectorized way.
337
+ Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
338
+
339
+ .. code:: py
340
+
341
+ chain = DataChain.from_dataset("mistral_dataset")
342
+
343
+ cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
344
+ + chain.sum("response.usage.completion_tokens")*0.000006
345
+ print(f"Spent ${cost:.2f} on {chain.count()} calls")
346
+
347
+ Output:
348
+
349
+ .. code:: shell
350
+
351
+ Spent $0.08 on 50 calls
352
+
353
+
354
+ PyTorch data loader
355
+ ===================
356
+
357
+ Chain results can be exported or passed directly to a PyTorch dataloader.
358
+ For example, if we are interested in passing an image and a label based on the file
359
+ name suffix, the following code will do it:
360
+
361
+ .. code:: py
362
+
363
+ from torch.utils.data import DataLoader
364
+ from transformers import CLIPProcessor
365
+
366
+ from datachain import C, DataChain
367
+
368
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
369
+
370
+ chain = (
371
+ DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
372
+ .map(label=lambda name: name.split(".")[0], params=["file.name"])
373
+ .select("file", "label").to_pytorch(
374
+ transform=processor.image_processor,
375
+ tokenizer=processor.tokenizer,
376
+ )
377
+ )
378
+ loader = DataLoader(chain, batch_size=1)
379
+
380
+
381
+ Tutorials
382
+ ---------
383
+
384
+ * `Getting Started`_
385
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
386
+
387
+ Contributions
388
+ -------------
389
+
390
+ Contributions are very welcome.
391
+ To learn more, see the `Contributor Guide`_.
392
+
393
+
394
+ Community and Support
395
+ ---------------------
396
+
397
+ * `Docs <https://datachain.dvc.ai/>`_
398
+ * `File an issue`_ if you encounter any problems
399
+ * `Discord Chat <https://dvc.org/chat>`_
400
+ * `Email <mailto:support@dvc.org>`_
401
+ * `Twitter <https://twitter.com/DVCorg>`_
402
+
403
+
404
+ .. _PyPI: https://pypi.org/
405
+ .. _file an issue: https://github.com/iterative/datachain/issues
406
+ .. github-only
407
+ .. _Contributor Guide: CONTRIBUTING.rst
408
+ .. _Pydantic: https://github.com/pydantic/pydantic
409
+ .. _SQLite: https://www.sqlite.org/
410
+ .. _Getting Started: https://datachain.dvc.ai/
411
+ .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
412
+ :alt: DataChain FlowChart
@@ -1,22 +1,23 @@
1
- datachain/__init__.py,sha256=L5IlHOD4AaHkV7P5dbUwdq90I3bGFLtOghoZ1WVFGcs,841
1
+ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
4
- datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
5
- datachain/cli.py,sha256=gikzwEXTDKyzY1xOAUziXN2-OVqnOhDMJTd7SHq0Jxc,32406
4
+ datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
5
+ datachain/cli.py,sha256=MSOID2t-kesk5Z80uoepN63rqvB7iZxaWYLqkiWehkQ,32628
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
8
  datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
9
9
  datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
10
+ datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
10
11
  datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
11
- datachain/node.py,sha256=fsQDJUmRMSRHhL1u6qQlWgreHbH760Ls-yDzFLhbW-U,5724
12
+ datachain/node.py,sha256=LwzSOSM9SbPLI5RvYDsiEkk7d5rbMX8huzM_m7uWKx4,5917
12
13
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
13
14
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
14
15
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
15
16
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
17
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
17
- datachain/utils.py,sha256=AWUXRk7yvDpHcqzzPWwzv8HtF1-jDVEBHKxAgT7u02E,12288
18
+ datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
18
19
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
19
- datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
20
+ datachain/catalog/catalog.py,sha256=u8tvWooIon9ju59q8-Re_iqflgbCB-JMZD8n2UC4iag,80397
20
21
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
21
22
  datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
22
23
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -31,50 +32,46 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
31
32
  datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
32
33
  datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
33
34
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
34
- datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
35
+ datachain/data_storage/metastore.py,sha256=R1Jj8dOTAex8fjehewV2vUO4VhBSjj8JQI5mM3YhVEQ,54989
35
36
  datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
36
37
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
37
38
  datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
38
- datachain/data_storage/warehouse.py,sha256=UbD37_jqaM4BY2SsQaTiJre-eSa7HcPejrTp936L080,33170
39
+ datachain/data_storage/warehouse.py,sha256=FedcsvkAphpi2tUnlcrxO4mYumiCQAcrB5XRAK9tfXQ,33288
39
40
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
41
- datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
42
- datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
43
- datachain/lib/dc.py,sha256=rd-7gVcMRZ2M-O8aQhNx85H31w-kRQHpXSwtf26dSk4,35849
44
- datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
45
- datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
46
- datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
47
- datachain/lib/hf_pipeline.py,sha256=MBFzixVa25_6QVR9RyOq8Rr9UIQ-sFVcBHducx_sZcY,2069
48
- datachain/lib/image.py,sha256=K0n_P7kmobWTgxe-rDbr5yY3vBrOPnseziE3DXwFFVo,2325
49
- datachain/lib/image_transform.py,sha256=hfgvIrSMGBx_MEXECyvrFoO1NyPBHoDb28j2lT2dsf8,2953
50
- datachain/lib/iptc_exif_xmp.py,sha256=rmlxjOmAP31OCgbGBAwIgd1F_6QVBoSWsOPG6UsBg_w,2007
51
- datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU,6436
52
- datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
53
- datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
41
+ datachain/lib/arrow.py,sha256=WBZ4iVU0CcmCgog1wS-Nrtqhzvf2I4_QqDJtzhaECeA,3641
42
+ datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
43
+ datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
44
+ datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
45
+ datachain/lib/dc.py,sha256=KboCSSyjZ69hIpyjgza4HindFwO7L1Usxa0769N57NA,50561
46
+ datachain/lib/file.py,sha256=xiLHaqyl4rqcBLGD62YD3aBIAOmX4EBVucxIncpRi80,11916
47
+ datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
48
+ datachain/lib/meta_formats.py,sha256=Z2NVH5X4N2rrj5kFxKsHKq3zD4kaRHbDCx3oiUEKYUk,6920
49
+ datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
50
+ datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
54
51
  datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
55
- datachain/lib/signal_schema.py,sha256=mRdq5qEGnFQgbSawzDPi2MCZ6PULTMigd51B2RuNxpg,14173
56
- datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
57
- datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
58
- datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
59
- datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
52
+ datachain/lib/signal_schema.py,sha256=lKGlpRRUHOUFLcpk-pLQd9kGAJ8FPy0Q2bk--UlVemU,14559
53
+ datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
54
+ datachain/lib/udf.py,sha256=mo3NoyYy7fY2UZtZOtAN_jR1e5a803b1dlnD5ztduzk,11454
55
+ datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
60
56
  datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
61
57
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
58
  datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
63
59
  datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
64
60
  datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
61
  datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
66
- datachain/lib/convert/type_converter.py,sha256=W-wvCIcb6OwWjRJ3EWJE4-LbpoqxsRBd6gYNpFlm8qo,2643
62
+ datachain/lib/convert/python_to_sql.py,sha256=54G6dsMhxo1GKCzPziOqCKo2d4VRWmsJhJYRJxt1Thw,2615
63
+ datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
67
64
  datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
68
- datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0QEckwXlKgFbLA,3088
65
+ datachain/lib/convert/values_to_tuples.py,sha256=Bh8L4zA66XRhQxmONvLvn94_i8MBMYgfJ6A2i7l_6Jo,3592
69
66
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
70
67
  datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
71
68
  datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
72
- datachain/query/dataset.py,sha256=P1KBv_R0YnKjNDHzOJwAx9qhwI08l0dLgaXfak3ps7k,60578
69
+ datachain/query/dataset.py,sha256=m0bDQK_xXB85KPdJpH3OHdW6WJd1_PMgi01GRcWiiSg,61280
73
70
  datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
74
71
  datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
75
72
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
76
- datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
77
- datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
73
+ datachain/query/schema.py,sha256=hAvux_GxUmuG_PwtnKkkizld9f0Gvt2JBzbu3m74fvE,7840
74
+ datachain/query/session.py,sha256=am4XCNj8NlZPAYJSvh43C13dQ5NsfzzuyVDjPgYAgJE,3655
78
75
  datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
79
76
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
77
  datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -85,20 +82,19 @@ datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
85
82
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
86
83
  datachain/sql/default/base.py,sha256=h44005q3qtMc9cjWmRufWwcBr5CfK_dnvG4IrcSQs_8,536
87
84
  datachain/sql/functions/__init__.py,sha256=PP8XV1CC1naIu87fiExbJRpV0Rww47EcDrDIKJb_xBQ,368
88
- datachain/sql/functions/array.py,sha256=vgTXFmBTq5-QW3Z8oDo4cFNi0B8zBqQnCRTQQKlp_VU,899
85
+ datachain/sql/functions/array.py,sha256=rvH27SWN9gdh_mFnp0GIiXuCrNW6n8ZbY4I_JUS-_e0,1140
89
86
  datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
90
87
  datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
91
88
  datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
92
- datachain/sql/functions/string.py,sha256=DsyY6ZMAUqmZVRSla-BJLsLYNsIgLOh4XLR3yvYJUbE,505
89
+ datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
93
90
  datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
94
91
  datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
95
92
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
96
93
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
97
- datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
98
- datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
99
- datachain-0.2.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
100
- datachain-0.2.11.dist-info/METADATA,sha256=OVKgVc-Wc75AAQIY6hGL1CEBmnwksfgOXfiUen_xAOM,16759
101
- datachain-0.2.11.dist-info/WHEEL,sha256=FZ75kcLy9M91ncbIgG8dnpCncbiKXSRGJ_PFILs6SFg,91
102
- datachain-0.2.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
103
- datachain-0.2.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
104
- datachain-0.2.11.dist-info/RECORD,,
94
+ datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
95
+ datachain-0.2.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
+ datachain-0.2.12.dist-info/METADATA,sha256=QfDhY5jkblcb94A5CxT-ELhDcwDzZq1ju4cPQXHDEkY,14333
97
+ datachain-0.2.12.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
98
+ datachain-0.2.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
+ datachain-0.2.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
+ datachain-0.2.12.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.0.1)
2
+ Generator: setuptools (71.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,97 +0,0 @@
1
- import base64
2
- import io
3
- import os
4
-
5
- import requests
6
- from PIL import Image, ImageOps, UnidentifiedImageError
7
-
8
- from datachain.query import Object, udf
9
- from datachain.sql.types import String
10
-
11
- DEFAULT_FIT_BOX = (500, 500)
12
- DEFAULT_TOKENS = 300
13
-
14
-
15
- def encode_image(raw):
16
- try:
17
- img = Image.open(raw)
18
- except UnidentifiedImageError:
19
- return None
20
- img.load()
21
- img = ImageOps.fit(img, DEFAULT_FIT_BOX)
22
- output = io.BytesIO()
23
- img.save(output, format="JPEG")
24
- hex_data = output.getvalue()
25
- return base64.b64encode(hex_data).decode("utf-8")
26
-
27
-
28
- @udf(
29
- params=(Object(encode_image),), # Columns consumed by the UDF.
30
- output={
31
- "description": String,
32
- "error": String,
33
- }, # Signals being returned by the UDF.
34
- method="image_description",
35
- )
36
- class DescribeImage:
37
- def __init__(
38
- self,
39
- prompt="What is in this image?",
40
- max_tokens=DEFAULT_TOKENS,
41
- key="",
42
- timeout=30,
43
- ):
44
- if not key:
45
- key = os.getenv("OPENAI_API_KEY", "")
46
- if not key:
47
- raise ValueError(
48
- "No key found. Please pass key or set the OPENAI_API_KEY "
49
- "environment variable."
50
- )
51
- self.prompt = prompt
52
- self.max_tokens = max_tokens
53
- self.headers = {
54
- "Content-Type": "application/json",
55
- "Authorization": f"Bearer {key}",
56
- }
57
- self.timeout = timeout
58
-
59
- def image_description(self, base64_image):
60
- if base64_image is None:
61
- return ("", "Unknown image format")
62
-
63
- payload = {
64
- "model": "gpt-4-vision-preview",
65
- "messages": [
66
- {
67
- "role": "user",
68
- "content": [
69
- {"type": "text", "text": self.prompt},
70
- {
71
- "type": "image_url",
72
- "image_url": {
73
- "url": f"data:image/jpeg;base64,{base64_image}"
74
- },
75
- },
76
- ],
77
- }
78
- ],
79
- "max_tokens": self.max_tokens,
80
- }
81
-
82
- response = requests.post(
83
- "https://api.openai.com/v1/chat/completions",
84
- headers=self.headers,
85
- json=payload,
86
- timeout=self.timeout,
87
- )
88
- json_response = response.json()
89
-
90
- if "error" in json_response:
91
- error = str(json_response["error"])
92
- openai_description = ""
93
- else:
94
- error = ""
95
- openai_description = json_response["choices"][0]["message"]["content"]
96
-
97
- return (openai_description, error)