datachain 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +3 -4
- datachain/cache.py +10 -4
- datachain/catalog/catalog.py +42 -16
- datachain/cli.py +48 -32
- datachain/data_storage/metastore.py +24 -0
- datachain/data_storage/warehouse.py +3 -1
- datachain/job.py +56 -0
- datachain/lib/arrow.py +19 -7
- datachain/lib/clip.py +89 -66
- datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
- datachain/lib/convert/sql_to_python.py +23 -0
- datachain/lib/convert/values_to_tuples.py +51 -33
- datachain/lib/data_model.py +6 -27
- datachain/lib/dataset_info.py +70 -0
- datachain/lib/dc.py +618 -156
- datachain/lib/file.py +130 -22
- datachain/lib/image.py +1 -1
- datachain/lib/meta_formats.py +14 -2
- datachain/lib/model_store.py +3 -2
- datachain/lib/pytorch.py +10 -7
- datachain/lib/signal_schema.py +19 -11
- datachain/lib/text.py +2 -1
- datachain/lib/udf.py +56 -5
- datachain/lib/udf_signature.py +1 -1
- datachain/node.py +11 -8
- datachain/query/dataset.py +62 -28
- datachain/query/schema.py +2 -0
- datachain/query/session.py +4 -4
- datachain/sql/functions/array.py +12 -0
- datachain/sql/functions/string.py +8 -0
- datachain/torch/__init__.py +1 -1
- datachain/utils.py +6 -0
- datachain-0.2.13.dist-info/METADATA +411 -0
- {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/RECORD +38 -42
- {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/WHEEL +1 -1
- datachain/lib/gpt4_vision.py +0 -97
- datachain/lib/hf_image_to_text.py +0 -97
- datachain/lib/hf_pipeline.py +0 -90
- datachain/lib/image_transform.py +0 -103
- datachain/lib/iptc_exif_xmp.py +0 -76
- datachain/lib/unstructured.py +0 -41
- datachain/text/__init__.py +0 -3
- datachain-0.2.11.dist-info/METADATA +0 -431
- {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/LICENSE +0 -0
- {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.11.dist-info → datachain-0.2.13.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datachain
|
|
3
|
+
Version: 0.2.13
|
|
4
|
+
Summary: Wrangle unstructured AI data at scale
|
|
5
|
+
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Documentation, https://datachain.dvc.ai
|
|
8
|
+
Project-URL: Issues, https://github.com/iterative/datachain/issues
|
|
9
|
+
Project-URL: Source, https://github.com/iterative/datachain
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/x-rst
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pyyaml
|
|
20
|
+
Requires-Dist: tomlkit
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: pandas >=2.0.0
|
|
24
|
+
Requires-Dist: pyarrow
|
|
25
|
+
Requires-Dist: typing-extensions
|
|
26
|
+
Requires-Dist: python-dateutil >=2
|
|
27
|
+
Requires-Dist: attrs >=21.3.0
|
|
28
|
+
Requires-Dist: s3fs >=2024.2.0
|
|
29
|
+
Requires-Dist: gcsfs >=2024.2.0
|
|
30
|
+
Requires-Dist: adlfs >=2024.2.0
|
|
31
|
+
Requires-Dist: dvc-data <4,>=3.10
|
|
32
|
+
Requires-Dist: dvc-objects <6,>=4
|
|
33
|
+
Requires-Dist: shtab <2,>=1.3.4
|
|
34
|
+
Requires-Dist: sqlalchemy >=2
|
|
35
|
+
Requires-Dist: multiprocess ==0.70.16
|
|
36
|
+
Requires-Dist: dill ==0.3.8
|
|
37
|
+
Requires-Dist: cloudpickle
|
|
38
|
+
Requires-Dist: ujson >=5.9.0
|
|
39
|
+
Requires-Dist: pydantic <3,>=2
|
|
40
|
+
Requires-Dist: jmespath >=1.0
|
|
41
|
+
Requires-Dist: datamodel-code-generator >=0.25
|
|
42
|
+
Requires-Dist: Pillow <11,>=10.0.0
|
|
43
|
+
Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
|
|
44
|
+
Provides-Extra: dev
|
|
45
|
+
Requires-Dist: datachain[docs,tests] ; extra == 'dev'
|
|
46
|
+
Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
|
|
47
|
+
Requires-Dist: types-python-dateutil ; extra == 'dev'
|
|
48
|
+
Requires-Dist: types-pytz ; extra == 'dev'
|
|
49
|
+
Requires-Dist: types-PyYAML ; extra == 'dev'
|
|
50
|
+
Requires-Dist: types-requests ; extra == 'dev'
|
|
51
|
+
Requires-Dist: types-ujson ; extra == 'dev'
|
|
52
|
+
Provides-Extra: docs
|
|
53
|
+
Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
|
|
54
|
+
Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
|
|
55
|
+
Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
|
|
56
|
+
Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
|
|
57
|
+
Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
|
|
58
|
+
Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
|
|
59
|
+
Provides-Extra: remote
|
|
60
|
+
Requires-Dist: lz4 ; extra == 'remote'
|
|
61
|
+
Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
|
|
62
|
+
Requires-Dist: requests >=2.22.0 ; extra == 'remote'
|
|
63
|
+
Provides-Extra: tests
|
|
64
|
+
Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
|
|
65
|
+
Requires-Dist: pytest <9,>=8 ; extra == 'tests'
|
|
66
|
+
Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
|
|
67
|
+
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
|
|
68
|
+
Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
|
|
69
|
+
Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
|
|
70
|
+
Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
|
|
71
|
+
Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
|
|
72
|
+
Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
|
|
73
|
+
Requires-Dist: virtualenv ; extra == 'tests'
|
|
74
|
+
Requires-Dist: dulwich ; extra == 'tests'
|
|
75
|
+
Requires-Dist: hypothesis ; extra == 'tests'
|
|
76
|
+
Requires-Dist: open-clip-torch ; extra == 'tests'
|
|
77
|
+
Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
|
|
78
|
+
Requires-Dist: requests-mock ; extra == 'tests'
|
|
79
|
+
Provides-Extra: torch
|
|
80
|
+
Requires-Dist: torch >=2.1.0 ; extra == 'torch'
|
|
81
|
+
Requires-Dist: torchvision ; extra == 'torch'
|
|
82
|
+
Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
|
|
83
|
+
Provides-Extra: vector
|
|
84
|
+
Requires-Dist: usearch ; extra == 'vector'
|
|
85
|
+
|
|
86
|
+
|PyPI| |Python Version| |Codecov| |Tests|
|
|
87
|
+
|
|
88
|
+
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
89
|
+
:target: https://pypi.org/project/datachain/
|
|
90
|
+
:alt: PyPI
|
|
91
|
+
.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
|
|
92
|
+
:target: https://pypi.org/project/datachain
|
|
93
|
+
:alt: Python Version
|
|
94
|
+
.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
|
|
95
|
+
:target: https://codecov.io/gh/iterative/datachain
|
|
96
|
+
:alt: Codecov
|
|
97
|
+
.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
|
|
98
|
+
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
99
|
+
:alt: Tests
|
|
100
|
+
|
|
101
|
+
AI 🔗 DataChain
|
|
102
|
+
----------------
|
|
103
|
+
|
|
104
|
+
DataChain is an open-source Python library for processing and curating unstructured
|
|
105
|
+
data at scale.
|
|
106
|
+
|
|
107
|
+
🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
|
|
108
|
+
|
|
109
|
+
🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
|
|
110
|
+
|
|
111
|
+
🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
DataChain supports parallel processing, parallel data
|
|
115
|
+
downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
|
|
116
|
+
|
|
117
|
+
The typical use cases include Computer Vision data curation, LLM analytics,
|
|
118
|
+
and validation of multimodal AI applications.
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
.. code:: console
|
|
122
|
+
|
|
123
|
+
$ pip install datachain
|
|
124
|
+
|
|
125
|
+
|Flowchart|
|
|
126
|
+
|
|
127
|
+
Quick Start
|
|
128
|
+
-----------
|
|
129
|
+
|
|
130
|
+
Data curation with a local model
|
|
131
|
+
=================================
|
|
132
|
+
|
|
133
|
+
We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
|
|
134
|
+
- 50 files total in this example.
|
|
135
|
+
These dialogs involve users chatting with a bot while looking for better wireless plans.
|
|
136
|
+
Our goal is to identify the successful dialogs.
|
|
137
|
+
|
|
138
|
+
The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
|
|
139
|
+
|
|
140
|
+
First, we'll show batch inference with a simple sentiment model using the `transformers` library:
|
|
141
|
+
|
|
142
|
+
.. code:: shell
|
|
143
|
+
|
|
144
|
+
pip install transformers
|
|
145
|
+
|
|
146
|
+
The code below downloads files from the cloud, and applies a user-defined function
|
|
147
|
+
to each one of them. All files with a positive sentiment
|
|
148
|
+
detected are then copied to the local directory.
|
|
149
|
+
|
|
150
|
+
.. code:: py
|
|
151
|
+
|
|
152
|
+
from transformers import pipeline
|
|
153
|
+
from datachain import DataChain, Column
|
|
154
|
+
|
|
155
|
+
classifier = pipeline("sentiment-analysis", device="cpu",
|
|
156
|
+
model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
|
|
157
|
+
|
|
158
|
+
def is_positive_dialogue_ending(file) -> bool:
|
|
159
|
+
dialogue_ending = file.read()[-512:]
|
|
160
|
+
return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
|
|
161
|
+
|
|
162
|
+
chain = (
|
|
163
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
|
|
164
|
+
object_name="file", type="text")
|
|
165
|
+
.settings(parallel=8, cache=True)
|
|
166
|
+
.map(is_positive=is_positive_dialogue_ending)
|
|
167
|
+
.save("file_response")
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
positive_chain = chain.filter(Column("is_positive") == True)
|
|
171
|
+
positive_chain.export_files("./output")
|
|
172
|
+
|
|
173
|
+
print(f"{positive_chain.count()} files were exported")
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
13 files were exported
|
|
178
|
+
|
|
179
|
+
.. code:: shell
|
|
180
|
+
|
|
181
|
+
$ ls output/datachain-demo/chatbot-KiT/
|
|
182
|
+
15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
|
|
183
|
+
$ ls output/datachain-demo/chatbot-KiT/ | wc -l
|
|
184
|
+
13
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
LLM judging chatbots
|
|
188
|
+
=============================
|
|
189
|
+
|
|
190
|
+
LLMs can work as efficient universal classifiers. In the example below,
|
|
191
|
+
we employ a free API from Mistral to judge the chatbot performance. Please get a free
|
|
192
|
+
Mistral API key at https://console.mistral.ai
|
|
193
|
+
|
|
194
|
+
.. code:: shell
|
|
195
|
+
|
|
196
|
+
$ pip install mistralai
|
|
197
|
+
$ export MISTRAL_API_KEY=_your_key_
|
|
198
|
+
|
|
199
|
+
DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
|
|
200
|
+
|
|
201
|
+
.. code:: py
|
|
202
|
+
|
|
203
|
+
from mistralai.client import MistralClient
|
|
204
|
+
from mistralai.models.chat_completion import ChatMessage
|
|
205
|
+
from datachain import File, DataChain, Column
|
|
206
|
+
|
|
207
|
+
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
208
|
+
|
|
209
|
+
def eval_dialogue(file: File) -> bool:
|
|
210
|
+
client = MistralClient()
|
|
211
|
+
response = client.chat(
|
|
212
|
+
model="open-mixtral-8x22b",
|
|
213
|
+
messages=[ChatMessage(role="system", content=PROMPT),
|
|
214
|
+
ChatMessage(role="user", content=file.read())])
|
|
215
|
+
result = response.choices[0].message.content
|
|
216
|
+
return result.lower().startswith("success")
|
|
217
|
+
|
|
218
|
+
chain = (
|
|
219
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
220
|
+
.settings(parallel=4, cache=True)
|
|
221
|
+
.map(is_success=eval_dialogue)
|
|
222
|
+
.save("mistral_files")
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
successful_chain = chain.filter(Column("is_success") == True)
|
|
226
|
+
successful_chain.export_files("./output_mistral")
|
|
227
|
+
|
|
228
|
+
print(f"{successful_chain.count()} files were exported")
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
|
|
232
|
+
|
|
233
|
+
.. code:: shell
|
|
234
|
+
|
|
235
|
+
$ ls output_mistral/datachain-demo/chatbot-KiT/
|
|
236
|
+
1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
|
|
237
|
+
$ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
|
|
238
|
+
31
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
Serializing Python-objects
|
|
243
|
+
==========================
|
|
244
|
+
|
|
245
|
+
LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
|
|
246
|
+
model performance parameters.
|
|
247
|
+
|
|
248
|
+
Instead of extracting this information from the Mistral response data structure (class
|
|
249
|
+
`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
.. code:: py
|
|
253
|
+
|
|
254
|
+
from mistralai.client import MistralClient
|
|
255
|
+
from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
|
|
256
|
+
from datachain import File, DataChain, Column
|
|
257
|
+
|
|
258
|
+
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
259
|
+
|
|
260
|
+
def eval_dialog(file: File) -> ChatCompletionResponse:
|
|
261
|
+
client = MistralClient()
|
|
262
|
+
return client.chat(
|
|
263
|
+
model="open-mixtral-8x22b",
|
|
264
|
+
messages=[ChatMessage(role="system", content=PROMPT),
|
|
265
|
+
ChatMessage(role="user", content=file.read())])
|
|
266
|
+
|
|
267
|
+
chain = (
|
|
268
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
269
|
+
.settings(parallel=4, cache=True)
|
|
270
|
+
.map(response=eval_dialog)
|
|
271
|
+
.map(status=lambda response: response.choices[0].message.content.lower()[:7])
|
|
272
|
+
.save("response")
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
chain.select("file.name", "status", "response.usage").show(5)
|
|
276
|
+
|
|
277
|
+
success_rate = chain.filter(Column("status") == "success").count() / chain.count()
|
|
278
|
+
print(f"{100*success_rate:.1f}% dialogs were successful")
|
|
279
|
+
|
|
280
|
+
Output:
|
|
281
|
+
|
|
282
|
+
.. code:: shell
|
|
283
|
+
|
|
284
|
+
file status response response response
|
|
285
|
+
name usage usage usage
|
|
286
|
+
prompt_tokens total_tokens completion_tokens
|
|
287
|
+
0 1.txt success 547 548 1
|
|
288
|
+
1 10.txt failure 3576 3578 2
|
|
289
|
+
2 11.txt failure 626 628 2
|
|
290
|
+
3 12.txt failure 1144 1182 38
|
|
291
|
+
4 13.txt success 1100 1101 1
|
|
292
|
+
|
|
293
|
+
[Limited by 5 rows]
|
|
294
|
+
64.0% dialogs were successful
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
Iterating over Python data structures
|
|
298
|
+
=============================================
|
|
299
|
+
|
|
300
|
+
In the previous examples, datasets were saved in the embedded database
|
|
301
|
+
(`SQLite`_ in folder `.datachain` of the working directory).
|
|
302
|
+
These datasets were automatically versioned, and can be accessed using
|
|
303
|
+
`DataChain.from_dataset("dataset_name")`.
|
|
304
|
+
|
|
305
|
+
Here is how to retrieve a saved dataset and iterate over the objects:
|
|
306
|
+
|
|
307
|
+
.. code:: py
|
|
308
|
+
|
|
309
|
+
chain = DataChain.from_dataset("response")
|
|
310
|
+
|
|
311
|
+
# Iterating one-by-one: support out-of-memory workflow
|
|
312
|
+
for file, response in chain.limit(5).collect("file", "response"):
|
|
313
|
+
# verify the collected Python objects
|
|
314
|
+
assert isinstance(response, ChatCompletionResponse)
|
|
315
|
+
|
|
316
|
+
status = response.choices[0].message.content[:7]
|
|
317
|
+
tokens = response.usage.total_tokens
|
|
318
|
+
print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
|
|
319
|
+
|
|
320
|
+
Output:
|
|
321
|
+
|
|
322
|
+
.. code:: shell
|
|
323
|
+
|
|
324
|
+
gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
|
|
325
|
+
gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
|
|
326
|
+
gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
|
|
327
|
+
gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
|
|
328
|
+
gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
Vectorized analytics over Python objects
|
|
332
|
+
========================================
|
|
333
|
+
|
|
334
|
+
Some operations can run inside the DB without deserialization.
|
|
335
|
+
For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
|
|
336
|
+
|
|
337
|
+
.. code:: py
|
|
338
|
+
|
|
339
|
+
chain = DataChain.from_dataset("mistral_dataset")
|
|
340
|
+
|
|
341
|
+
cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
|
|
342
|
+
+ chain.sum("response.usage.completion_tokens")*0.000006
|
|
343
|
+
print(f"Spent ${cost:.2f} on {chain.count()} calls")
|
|
344
|
+
|
|
345
|
+
Output:
|
|
346
|
+
|
|
347
|
+
.. code:: shell
|
|
348
|
+
|
|
349
|
+
Spent $0.08 on 50 calls
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
PyTorch data loader
|
|
353
|
+
===================
|
|
354
|
+
|
|
355
|
+
Chain results can be exported or passed directly to a PyTorch dataloader.
|
|
356
|
+
For example, if we are interested in passing an image and a label based on the file
|
|
357
|
+
name prefix, the following code will do it:
|
|
358
|
+
|
|
359
|
+
.. code:: py
|
|
360
|
+
|
|
361
|
+
from torch.utils.data import DataLoader
|
|
362
|
+
from transformers import CLIPProcessor
|
|
363
|
+
|
|
364
|
+
from datachain import C, DataChain
|
|
365
|
+
|
|
366
|
+
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
367
|
+
|
|
368
|
+
chain = (
|
|
369
|
+
DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
|
|
370
|
+
.map(label=lambda name: name.split(".")[0], params=["file.name"])
|
|
371
|
+
.select("file", "label").to_pytorch(
|
|
372
|
+
transform=processor.image_processor,
|
|
373
|
+
tokenizer=processor.tokenizer,
|
|
374
|
+
)
|
|
375
|
+
)
|
|
376
|
+
loader = DataLoader(chain, batch_size=1)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
Tutorials
|
|
380
|
+
---------
|
|
381
|
+
|
|
382
|
+
* `Getting Started`_
|
|
383
|
+
* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
|
|
384
|
+
|
|
385
|
+
Contributions
|
|
386
|
+
-------------
|
|
387
|
+
|
|
388
|
+
Contributions are very welcome.
|
|
389
|
+
To learn more, see the `Contributor Guide`_.
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
Community and Support
|
|
393
|
+
---------------------
|
|
394
|
+
|
|
395
|
+
* `Docs <https://datachain.dvc.ai/>`_
|
|
396
|
+
* `File an issue`_ if you encounter any problems
|
|
397
|
+
* `Discord Chat <https://dvc.org/chat>`_
|
|
398
|
+
* `Email <mailto:support@dvc.org>`_
|
|
399
|
+
* `Twitter <https://twitter.com/DVCorg>`_
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
.. _PyPI: https://pypi.org/
|
|
403
|
+
.. _file an issue: https://github.com/iterative/datachain/issues
|
|
404
|
+
.. github-only
|
|
405
|
+
.. _Contributor Guide: CONTRIBUTING.rst
|
|
406
|
+
.. _Pydantic: https://github.com/pydantic/pydantic
|
|
407
|
+
.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
|
|
408
|
+
.. _SQLite: https://www.sqlite.org/
|
|
409
|
+
.. _Getting Started: https://datachain.dvc.ai/
|
|
410
|
+
.. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
|
|
411
|
+
:alt: DataChain FlowChart
|
|
@@ -1,22 +1,23 @@
|
|
|
1
|
-
datachain/__init__.py,sha256=
|
|
1
|
+
datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
|
|
4
|
-
datachain/cache.py,sha256=
|
|
5
|
-
datachain/cli.py,sha256=
|
|
4
|
+
datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
|
|
5
|
+
datachain/cli.py,sha256=Twb6BXjNxfAAGj42dUOJ7Ah5etkrTDVfMzAmINWUSOI,33104
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
8
|
datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
|
|
9
9
|
datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
|
|
10
|
+
datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
|
|
10
11
|
datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
|
|
11
|
-
datachain/node.py,sha256=
|
|
12
|
+
datachain/node.py,sha256=LwzSOSM9SbPLI5RvYDsiEkk7d5rbMX8huzM_m7uWKx4,5917
|
|
12
13
|
datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
|
|
13
14
|
datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
|
|
14
15
|
datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
|
|
15
16
|
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
17
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
17
|
-
datachain/utils.py,sha256=
|
|
18
|
+
datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
|
|
18
19
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
19
|
-
datachain/catalog/catalog.py,sha256=
|
|
20
|
+
datachain/catalog/catalog.py,sha256=ab-PLPa9CMeHCo9asHjkqw4mZ6tHM4x8bsswfMtr65w,80575
|
|
20
21
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
21
22
|
datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
|
|
22
23
|
datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
|
|
@@ -31,50 +32,46 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
|
|
|
31
32
|
datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
|
|
32
33
|
datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
|
|
33
34
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
34
|
-
datachain/data_storage/metastore.py,sha256=
|
|
35
|
+
datachain/data_storage/metastore.py,sha256=R1Jj8dOTAex8fjehewV2vUO4VhBSjj8JQI5mM3YhVEQ,54989
|
|
35
36
|
datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
|
|
36
37
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
37
38
|
datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
|
|
38
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
39
|
+
datachain/data_storage/warehouse.py,sha256=FedcsvkAphpi2tUnlcrxO4mYumiCQAcrB5XRAK9tfXQ,33288
|
|
39
40
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
-
datachain/lib/arrow.py,sha256=
|
|
41
|
-
datachain/lib/clip.py,sha256=
|
|
42
|
-
datachain/lib/data_model.py,sha256=
|
|
43
|
-
datachain/lib/
|
|
44
|
-
datachain/lib/
|
|
45
|
-
datachain/lib/
|
|
46
|
-
datachain/lib/
|
|
47
|
-
datachain/lib/
|
|
48
|
-
datachain/lib/
|
|
49
|
-
datachain/lib/
|
|
50
|
-
datachain/lib/iptc_exif_xmp.py,sha256=rmlxjOmAP31OCgbGBAwIgd1F_6QVBoSWsOPG6UsBg_w,2007
|
|
51
|
-
datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU,6436
|
|
52
|
-
datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
|
|
53
|
-
datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
|
|
41
|
+
datachain/lib/arrow.py,sha256=WBZ4iVU0CcmCgog1wS-Nrtqhzvf2I4_QqDJtzhaECeA,3641
|
|
42
|
+
datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
|
|
43
|
+
datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
|
|
44
|
+
datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
|
|
45
|
+
datachain/lib/dc.py,sha256=KboCSSyjZ69hIpyjgza4HindFwO7L1Usxa0769N57NA,50561
|
|
46
|
+
datachain/lib/file.py,sha256=Y1QQ1pBSESjlA9n6_ukc3YtugeiTeF12xcncyfdCL2k,12128
|
|
47
|
+
datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
|
|
48
|
+
datachain/lib/meta_formats.py,sha256=Z2NVH5X4N2rrj5kFxKsHKq3zD4kaRHbDCx3oiUEKYUk,6920
|
|
49
|
+
datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
|
|
50
|
+
datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
|
|
54
51
|
datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
|
|
55
|
-
datachain/lib/signal_schema.py,sha256=
|
|
56
|
-
datachain/lib/text.py,sha256=
|
|
57
|
-
datachain/lib/udf.py,sha256=
|
|
58
|
-
datachain/lib/udf_signature.py,sha256=
|
|
59
|
-
datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
|
|
52
|
+
datachain/lib/signal_schema.py,sha256=lKGlpRRUHOUFLcpk-pLQd9kGAJ8FPy0Q2bk--UlVemU,14559
|
|
53
|
+
datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
|
|
54
|
+
datachain/lib/udf.py,sha256=mo3NoyYy7fY2UZtZOtAN_jR1e5a803b1dlnD5ztduzk,11454
|
|
55
|
+
datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
|
|
60
56
|
datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
|
|
61
57
|
datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
58
|
datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
|
|
63
59
|
datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
|
|
64
60
|
datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
65
61
|
datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
|
|
66
|
-
datachain/lib/convert/
|
|
62
|
+
datachain/lib/convert/python_to_sql.py,sha256=54G6dsMhxo1GKCzPziOqCKo2d4VRWmsJhJYRJxt1Thw,2615
|
|
63
|
+
datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
|
|
67
64
|
datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
|
|
68
|
-
datachain/lib/convert/values_to_tuples.py,sha256=
|
|
65
|
+
datachain/lib/convert/values_to_tuples.py,sha256=Bh8L4zA66XRhQxmONvLvn94_i8MBMYgfJ6A2i7l_6Jo,3592
|
|
69
66
|
datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
|
|
70
67
|
datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
|
|
71
68
|
datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
|
|
72
|
-
datachain/query/dataset.py,sha256=
|
|
69
|
+
datachain/query/dataset.py,sha256=jOMdvsQIMZq1hYPfqR_iKzGSlJ8m-7Wz75QxdFHdfwY,61567
|
|
73
70
|
datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
|
|
74
71
|
datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
|
|
75
72
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
76
|
-
datachain/query/schema.py,sha256=
|
|
77
|
-
datachain/query/session.py,sha256=
|
|
73
|
+
datachain/query/schema.py,sha256=hAvux_GxUmuG_PwtnKkkizld9f0Gvt2JBzbu3m74fvE,7840
|
|
74
|
+
datachain/query/session.py,sha256=am4XCNj8NlZPAYJSvh43C13dQ5NsfzzuyVDjPgYAgJE,3655
|
|
78
75
|
datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
|
|
79
76
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
80
77
|
datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
|
|
@@ -85,20 +82,19 @@ datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
|
|
|
85
82
|
datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
|
|
86
83
|
datachain/sql/default/base.py,sha256=h44005q3qtMc9cjWmRufWwcBr5CfK_dnvG4IrcSQs_8,536
|
|
87
84
|
datachain/sql/functions/__init__.py,sha256=PP8XV1CC1naIu87fiExbJRpV0Rww47EcDrDIKJb_xBQ,368
|
|
88
|
-
datachain/sql/functions/array.py,sha256=
|
|
85
|
+
datachain/sql/functions/array.py,sha256=rvH27SWN9gdh_mFnp0GIiXuCrNW6n8ZbY4I_JUS-_e0,1140
|
|
89
86
|
datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
|
|
90
87
|
datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
|
|
91
88
|
datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
|
|
92
|
-
datachain/sql/functions/string.py,sha256=
|
|
89
|
+
datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
|
|
93
90
|
datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
|
|
94
91
|
datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
|
|
95
92
|
datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
|
|
96
93
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
97
|
-
datachain/
|
|
98
|
-
datachain/
|
|
99
|
-
datachain-0.2.
|
|
100
|
-
datachain-0.2.
|
|
101
|
-
datachain-0.2.
|
|
102
|
-
datachain-0.2.
|
|
103
|
-
datachain-0.2.
|
|
104
|
-
datachain-0.2.11.dist-info/RECORD,,
|
|
94
|
+
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
95
|
+
datachain-0.2.13.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
96
|
+
datachain-0.2.13.dist-info/METADATA,sha256=jiEob-wl7pePOekp9tVY6h00czklAsktsmw910EvZbo,14619
|
|
97
|
+
datachain-0.2.13.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
|
|
98
|
+
datachain-0.2.13.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
99
|
+
datachain-0.2.13.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
100
|
+
datachain-0.2.13.dist-info/RECORD,,
|
datachain/lib/gpt4_vision.py
DELETED
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
import base64
|
|
2
|
-
import io
|
|
3
|
-
import os
|
|
4
|
-
|
|
5
|
-
import requests
|
|
6
|
-
from PIL import Image, ImageOps, UnidentifiedImageError
|
|
7
|
-
|
|
8
|
-
from datachain.query import Object, udf
|
|
9
|
-
from datachain.sql.types import String
|
|
10
|
-
|
|
11
|
-
DEFAULT_FIT_BOX = (500, 500)
|
|
12
|
-
DEFAULT_TOKENS = 300
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def encode_image(raw):
|
|
16
|
-
try:
|
|
17
|
-
img = Image.open(raw)
|
|
18
|
-
except UnidentifiedImageError:
|
|
19
|
-
return None
|
|
20
|
-
img.load()
|
|
21
|
-
img = ImageOps.fit(img, DEFAULT_FIT_BOX)
|
|
22
|
-
output = io.BytesIO()
|
|
23
|
-
img.save(output, format="JPEG")
|
|
24
|
-
hex_data = output.getvalue()
|
|
25
|
-
return base64.b64encode(hex_data).decode("utf-8")
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
@udf(
|
|
29
|
-
params=(Object(encode_image),), # Columns consumed by the UDF.
|
|
30
|
-
output={
|
|
31
|
-
"description": String,
|
|
32
|
-
"error": String,
|
|
33
|
-
}, # Signals being returned by the UDF.
|
|
34
|
-
method="image_description",
|
|
35
|
-
)
|
|
36
|
-
class DescribeImage:
|
|
37
|
-
def __init__(
|
|
38
|
-
self,
|
|
39
|
-
prompt="What is in this image?",
|
|
40
|
-
max_tokens=DEFAULT_TOKENS,
|
|
41
|
-
key="",
|
|
42
|
-
timeout=30,
|
|
43
|
-
):
|
|
44
|
-
if not key:
|
|
45
|
-
key = os.getenv("OPENAI_API_KEY", "")
|
|
46
|
-
if not key:
|
|
47
|
-
raise ValueError(
|
|
48
|
-
"No key found. Please pass key or set the OPENAI_API_KEY "
|
|
49
|
-
"environment variable."
|
|
50
|
-
)
|
|
51
|
-
self.prompt = prompt
|
|
52
|
-
self.max_tokens = max_tokens
|
|
53
|
-
self.headers = {
|
|
54
|
-
"Content-Type": "application/json",
|
|
55
|
-
"Authorization": f"Bearer {key}",
|
|
56
|
-
}
|
|
57
|
-
self.timeout = timeout
|
|
58
|
-
|
|
59
|
-
def image_description(self, base64_image):
|
|
60
|
-
if base64_image is None:
|
|
61
|
-
return ("", "Unknown image format")
|
|
62
|
-
|
|
63
|
-
payload = {
|
|
64
|
-
"model": "gpt-4-vision-preview",
|
|
65
|
-
"messages": [
|
|
66
|
-
{
|
|
67
|
-
"role": "user",
|
|
68
|
-
"content": [
|
|
69
|
-
{"type": "text", "text": self.prompt},
|
|
70
|
-
{
|
|
71
|
-
"type": "image_url",
|
|
72
|
-
"image_url": {
|
|
73
|
-
"url": f"data:image/jpeg;base64,{base64_image}"
|
|
74
|
-
},
|
|
75
|
-
},
|
|
76
|
-
],
|
|
77
|
-
}
|
|
78
|
-
],
|
|
79
|
-
"max_tokens": self.max_tokens,
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
response = requests.post(
|
|
83
|
-
"https://api.openai.com/v1/chat/completions",
|
|
84
|
-
headers=self.headers,
|
|
85
|
-
json=payload,
|
|
86
|
-
timeout=self.timeout,
|
|
87
|
-
)
|
|
88
|
-
json_response = response.json()
|
|
89
|
-
|
|
90
|
-
if "error" in json_response:
|
|
91
|
-
error = str(json_response["error"])
|
|
92
|
-
openai_description = ""
|
|
93
|
-
else:
|
|
94
|
-
error = ""
|
|
95
|
-
openai_description = json_response["choices"][0]["message"]["content"]
|
|
96
|
-
|
|
97
|
-
return (openai_description, error)
|