datachain 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic; see the registry's advisory for details.

datachain/__init__.py CHANGED
@@ -0,0 +1,29 @@
1
+ from datachain.lib.dc import C, DataChain
2
+ from datachain.lib.feature import Feature
3
+ from datachain.lib.feature_utils import pydantic_to_feature
4
+ from datachain.lib.file import File, FileError, FileFeature, IndexedFile, TarVFile
5
+ from datachain.lib.udf import Aggregator, Generator, Mapper
6
+ from datachain.lib.utils import AbstractUDF, DataChainError
7
+ from datachain.query.dataset import UDF as BaseUDF # noqa: N811
8
+ from datachain.query.schema import Column
9
+ from datachain.query.session import Session
10
+
11
+ __all__ = [
12
+ "AbstractUDF",
13
+ "Aggregator",
14
+ "BaseUDF",
15
+ "C",
16
+ "Column",
17
+ "DataChain",
18
+ "DataChainError",
19
+ "Feature",
20
+ "File",
21
+ "FileError",
22
+ "FileFeature",
23
+ "Generator",
24
+ "IndexedFile",
25
+ "Mapper",
26
+ "Session",
27
+ "TarVFile",
28
+ "pydantic_to_feature",
29
+ ]
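With these re-exports, the main entry points can be imported straight from the package root. A small sketch of what the new __init__.py enables (the bucket path is illustrative, borrowed from the README below):

    from datachain import C, Column, DataChain, Feature

    chain = DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")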
datachain/image/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from datachain.lib.image import ImageFile, convert_images
2
+
3
+ __all__ = ["ImageFile", "convert_images"]
datachain/lib/feature.py CHANGED
@@ -4,6 +4,7 @@ import re
4
4
  import warnings
5
5
  from collections.abc import Iterable, Sequence
6
6
  from datetime import datetime
7
+ from enum import Enum
7
8
  from functools import lru_cache
8
9
  from types import GenericAlias
9
10
  from typing import (
@@ -63,6 +64,7 @@ TYPE_TO_DATACHAIN = {
63
64
  str: String,
64
65
  Literal: String,
65
66
  LiteralEx: String,
67
+ Enum: String,
66
68
  float: Float,
67
69
  bool: Boolean,
68
70
  datetime: DateTime, # Note, list of datetime is not supported yet
@@ -364,8 +366,11 @@ def _resolve(cls, name, field_info, prefix: list[str]):
364
366
 
365
367
 
366
368
  def convert_type_to_datachain(typ): # noqa: PLR0911
367
- if inspect.isclass(typ) and issubclass(typ, SQLType):
368
- return typ
369
+ if inspect.isclass(typ):
370
+ if issubclass(typ, SQLType):
371
+ return typ
372
+ if issubclass(typ, Enum):
373
+ return str
369
374
 
370
375
  res = TYPE_TO_DATACHAIN.get(typ)
371
376
  if res:
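The effect of this change, as a small sketch (the Color class is illustrative, not from the package):

    from enum import Enum

    from datachain.lib.feature import convert_type_to_datachain

    class Color(Enum):
        RED = "red"

    # Enum subclasses now map to str instead of falling through to a TypeError
    assert convert_type_to_datachain(Color) is str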
datachain/lib/feature_utils.py CHANGED
@@ -1,5 +1,7 @@
1
+ import inspect
1
2
  import string
2
3
  from collections.abc import Sequence
4
+ from enum import Enum
3
5
  from typing import Any, Union, get_args, get_origin
4
6
 
5
7
  from pydantic import BaseModel, create_model
@@ -35,23 +37,7 @@ def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
35
37
  for name, field_info in data_cls.model_fields.items():
36
38
  anno = field_info.annotation
37
39
  if anno not in TYPE_TO_DATACHAIN:
38
- orig = get_origin(anno)
39
- if orig is list:
40
- anno = get_args(anno) # type: ignore[assignment]
41
- if isinstance(anno, Sequence):
42
- anno = anno[0] # type: ignore[unreachable]
43
- is_list = True
44
- else:
45
- is_list = False
46
-
47
- try:
48
- convert_type_to_datachain(anno)
49
- except TypeError:
50
- if not Feature.is_feature(anno): # type: ignore[arg-type]
51
- anno = pydantic_to_feature(anno) # type: ignore[arg-type]
52
-
53
- if is_list:
54
- anno = list[anno] # type: ignore[valid-type]
40
+ anno = _to_feature_type(anno)
55
41
  fields[name] = (anno, field_info.default)
56
42
 
57
43
  cls = create_model(
@@ -63,6 +49,38 @@ def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
63
49
  return cls
64
50
 
65
51
 
52
+ def _to_feature_type(anno):
53
+ if inspect.isclass(anno) and issubclass(anno, Enum):
54
+ return str
55
+
56
+ orig = get_origin(anno)
57
+ if orig is list:
58
+ anno = get_args(anno) # type: ignore[assignment]
59
+ if isinstance(anno, Sequence):
60
+ anno = anno[0] # type: ignore[unreachable]
61
+ is_list = True
62
+ else:
63
+ is_list = False
64
+
65
+ try:
66
+ convert_type_to_datachain(anno)
67
+ except TypeError:
68
+ if not Feature.is_feature(anno): # type: ignore[arg-type]
69
+ orig = get_origin(anno)
70
+ if orig in TYPE_TO_DATACHAIN:
71
+ anno = _to_feature_type(anno)
72
+ else:
73
+ if orig == Union:
74
+ args = get_args(anno)
75
+ if len(args) == 2 and (type(None) in args):
76
+ return _to_feature_type(args[0])
77
+
78
+ anno = pydantic_to_feature(anno) # type: ignore[arg-type]
79
+ if is_list:
80
+ anno = list[anno] # type: ignore[valid-type]
81
+ return anno
82
+
83
+
66
84
  def features_to_tuples(
67
85
  ds_name: str = "",
68
86
  output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
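A sketch of what the refactored _to_feature_type handles (the Reply model is illustrative, not from the package): Enum fields now map to str, and Optional[X] unwraps to X:

    from enum import Enum
    from typing import Optional

    from pydantic import BaseModel

    from datachain.lib.feature_utils import pydantic_to_feature

    class Status(Enum):
        OK = "ok"
        FAILED = "failed"

    class Reply(BaseModel):
        status: Status = Status.OK   # Enum -> stored as str
        note: Optional[str] = None   # Optional[str] -> str

    ReplyFeature = pydantic_to_feature(Reply)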
datachain/text/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from datachain.lib.text import convert_text
2
+
3
+ __all__ = ["convert_text"]
datachain-0.2.7.dist-info/METADATA ADDED
@@ -0,0 +1,429 @@
1
+ Metadata-Version: 2.1
2
+ Name: datachain
3
+ Version: 0.2.7
4
+ Summary: Wrangle unstructured AI data at scale
5
+ Author-email: Dmitry Petrov <support@dvc.org>
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://datachain.dvc.ai
8
+ Project-URL: Issues, https://github.com/iterative/dvcx/issues
9
+ Project-URL: Source, https://github.com/iterative/dvcx
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Development Status :: 2 - Pre-Alpha
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/x-rst
18
+ License-File: LICENSE
19
+ Requires-Dist: pyyaml
20
+ Requires-Dist: tomlkit
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: numpy
23
+ Requires-Dist: pandas >=2.0.0
24
+ Requires-Dist: pyarrow
25
+ Requires-Dist: typing-extensions
26
+ Requires-Dist: python-dateutil >=2
27
+ Requires-Dist: attrs >=21.3.0
28
+ Requires-Dist: s3fs >=2024.2.0
29
+ Requires-Dist: gcsfs >=2024.2.0
30
+ Requires-Dist: adlfs >=2024.2.0
31
+ Requires-Dist: dvc-data <4,>=3.10
32
+ Requires-Dist: dvc-objects <6,>=4
33
+ Requires-Dist: shtab <2,>=1.3.4
34
+ Requires-Dist: sqlalchemy >=2
35
+ Requires-Dist: multiprocess ==0.70.16
36
+ Requires-Dist: dill ==0.3.8
37
+ Requires-Dist: ujson >=5.9.0
38
+ Requires-Dist: pydantic <3,>=2
39
+ Requires-Dist: jmespath >=1.0
40
+ Requires-Dist: datamodel-code-generator >=0.25
41
+ Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
42
+ Provides-Extra: cv
43
+ Requires-Dist: Pillow <11,>=10.0.0 ; extra == 'cv'
44
+ Requires-Dist: torch >=2.1.0 ; extra == 'cv'
45
+ Requires-Dist: torchvision ; extra == 'cv'
46
+ Requires-Dist: transformers >=4.36.0 ; extra == 'cv'
47
+ Provides-Extra: dev
48
+ Requires-Dist: datachain[docs,tests] ; extra == 'dev'
49
+ Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
50
+ Requires-Dist: types-python-dateutil ; extra == 'dev'
51
+ Requires-Dist: types-PyYAML ; extra == 'dev'
52
+ Requires-Dist: types-requests ; extra == 'dev'
53
+ Requires-Dist: types-ujson ; extra == 'dev'
54
+ Provides-Extra: docs
55
+ Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
56
+ Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
57
+ Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
58
+ Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
59
+ Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
60
+ Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
61
+ Provides-Extra: remote
62
+ Requires-Dist: lz4 ; extra == 'remote'
63
+ Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
64
+ Requires-Dist: requests >=2.22.0 ; extra == 'remote'
65
+ Provides-Extra: tests
66
+ Requires-Dist: datachain[cv,remote,vector] ; extra == 'tests'
67
+ Requires-Dist: pytest <9,>=8 ; extra == 'tests'
68
+ Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
69
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
70
+ Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
71
+ Requires-Dist: pytest-servers[all] >=0.5.4 ; extra == 'tests'
72
+ Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
73
+ Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
74
+ Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
75
+ Requires-Dist: virtualenv ; extra == 'tests'
76
+ Requires-Dist: dulwich ; extra == 'tests'
77
+ Requires-Dist: hypothesis ; extra == 'tests'
78
+ Requires-Dist: open-clip-torch ; extra == 'tests'
79
+ Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
80
+ Requires-Dist: requests-mock ; extra == 'tests'
81
+ Provides-Extra: vector
82
+ Requires-Dist: usearch ; extra == 'vector'
83
+
84
+ |PyPI| |Python Version| |Codecov| |Tests|
85
+
86
+ .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
87
+ :target: https://pypi.org/project/datachain/
88
+ :alt: PyPI
89
+ .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
90
+ :target: https://pypi.org/project/datachain
91
+ :alt: Python Version
92
+ .. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
93
+ :target: https://app.codecov.io/gh/iterative/dvcx
94
+ :alt: Codecov
95
+ .. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
96
+ :target: https://github.com/iterative/dvcx/actions?workflow=Tests
97
+ :alt: Tests
98
+
99
+ AI 🔗 DataChain
100
+ ----------------
101
+
102
+ DataChain is an open-source Python data processing library for wrangling unstructured AI data at scale.
103
+
104
+ DataChain enables multimodal API calls and local AI inferences to run in parallel over many samples as chained operations. The resulting datasets can be saved, versioned, and sent directly to PyTorch and TensorFlow for training. DataChain can persist features of Python objects returned by AI models, and enables vectorized analytical operations over them.
105
+
106
+ The typical use cases are data curation, LLM analytics and validation, image segmentation, pose detection, and GenAI alignment. DataChain is especially helpful when batch operations can be optimized, for instance when synchronous API calls can be parallelized or when an LLM API offers batch processing.
107
+
108
+ .. code:: console
109
+
110
+ $ pip install datachain
111
+
112
+ Operation basics
113
+ ----------------
114
+
115
+ DataChain is built by composing wrangling operations.
116
+
117
+ For example, let us consider a dataset from the Karlsruhe Institute of Technology detailing dialogs between users and customer service chatbots. We can use the chain to read data from the cloud, map it onto parallel API calls for LLM evaluation, and organize the output into a dataset:
118
+
119
+ .. code:: py
120
+
121
+ # pip install mistralai
122
+ # this example requires a free Mistral API key, get yours at https://console.mistral.ai
123
+ # add the key to your shell environment: $ export MISTRAL_API_KEY=<your key>
128
+
129
+ import os
130
+
131
+ from mistralai.client import MistralClient
132
+ from mistralai.models.chat_completion import ChatMessage
133
+
134
+ from datachain.lib.dc import DataChain, Column
135
+
136
+ PROMPT = "Was this bot dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
137
+
138
+ model = "mistral-large-latest"
139
+ api_key = os.environ["MISTRAL_API_KEY"]
140
+
141
+ chain = (
142
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
143
+ .limit(5)
144
+ .settings(cache=True, parallel=5)
145
+ .map(
146
+ mistral_response=lambda file: MistralClient(api_key=api_key)
147
+ .chat(
148
+ model=model,
149
+ response_format={"type": "json_object"},
150
+ messages=[
151
+ ChatMessage(role="user", content=f"{PROMPT}: {file.get_value()}")
152
+ ],
153
+ )
154
+ .choices[0]
155
+ .message.content,
156
+ )
157
+ .save()
158
+ )
159
+
160
+ try:
161
+ print(chain.select("mistral_response").results())
162
+ except Exception as e:
163
+ print(f"do you have the right Mistral API key? {e}")
164
+
165
+
166
+ .. code:: shell
167
+
168
+ [('{"result": "Yes"}',), ('{"result": "No"}',), ... , ('{"result": "Yes"}',)]
169
+
170
+ Now we have parallel-processed an LLM API-based query over cloud data and persisted the results.
171
+
172
+ Vectorized analytics
173
+ --------------------
174
+
175
+ DataChain internally represents datasets as tables, so analytical queries on the chain are automatically vectorized:
176
+
177
+ .. code:: py
178
+
179
+ failed_dialogs = chain.filter(Column("mistral_response") == '{"result": "No"}')
180
+ failure_rate = failed_dialogs.count() / chain.count()
181
+ print(f"Chatbot dialog failure rate: {100*failure_rate:.2f}%")
182
+
183
+
184
+ .. code:: shell
185
+
186
+ "40.00%"
187
+
188
+ Note that DataChain represents file samples as pointers into their respective storage locations. This means a newly created dataset version does not duplicate files in storage, and storage remains the single source of truth for the original samples.
189
+
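+ A quick way to inspect these pointers, assuming the nested ``file.*`` columns used with ``merge`` below can also be selected:
+
+ .. code:: py
+
+ print(chain.select("file.name", "file.size").limit(3).results())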
190
+ Handling Python objects
191
+ -----------------------
192
+ In addition to storing primitive Python data types, the chain is also capable of using data models.
193
+
194
+ For example, instead of collecting just a text response from Mistral API, we might be interested in more fields of the Mistral response object. For this task, we can define a Pydantic-like model and populate it from the API replies:
195
+
196
+ .. code:: py
197
+
198
+ import os
199
+
200
+ from mistralai.client import MistralClient
201
+ from mistralai.models.chat_completion import ChatMessage
202
+
203
+ from datachain.lib.dc import DataChain
204
+ from datachain.lib.feature import Feature
205
+
206
+
207
+ PROMPT = (
208
+ "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
209
+ )
210
+
211
+ model = "mistral-large-latest"
212
+ api_key = os.environ["MISTRAL_API_KEY"]
213
+
214
+
215
+ ### Define the data model ###
216
+ class Usage(Feature):
217
+ prompt_tokens: int = 0
218
+ completion_tokens: int = 0
219
+
220
+
221
+ class MyChatMessage(Feature):
222
+ role: str = ""
223
+ content: str = ""
224
+
225
+
226
+ class CompletionResponseChoice(Feature):
227
+ message: MyChatMessage = MyChatMessage()
228
+
229
+
230
+ class MistralModel(Feature):
231
+ id: str = ""
232
+ choices: list[CompletionResponseChoice]
233
+ usage: Usage = Usage()
234
+
235
+
236
+ ### Populate model instances ###
237
+ chain = (
238
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
239
+ .limit(5)
240
+ .settings(cache=True, parallel=5)
241
+ .map(
242
+ mistral_response=lambda file: MistralModel(
243
+ **MistralClient(api_key=api_key)
244
+ .chat(
245
+ model=model,
246
+ response_format={"type": "json_object"},
247
+ messages=[
248
+ ChatMessage(role="user", content=f"{PROMPT}: {file.get_value()}")
249
+ ],
250
+ )
251
+ .dict()
252
+ ),
253
+ output=MistralModel,
254
+ )
255
+ .save("dialog-eval")
256
+ )
257
+
258
+ After the chain execution, we can collect the objects (the ``collect`` call below is an assumed accessor for the saved column; the exact API may differ):
259
+
260
+ .. code:: py
261
+
262
+ responses = chain.collect("mistral_response")
+ for obj in responses:
263
+ assert isinstance(obj, MistralModel)
264
+ print(obj.dict())
265
+
266
+ .. code:: shell
267
+
268
+ {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 610, 'completion_tokens': 6}}
269
+ {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "No"}'}}], 'usage': {'prompt_tokens': 3983, 'completion_tokens': 6}}
270
+ {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 706, 'completion_tokens': 6}}
271
+ {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "No"}'}}], 'usage': {'prompt_tokens': 1250, 'completion_tokens': 6}}
272
+ {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 1217, 'completion_tokens': 6}}
273
+
274
+
275
+ Dataset persistence
276
+ --------------------
277
+
278
+ The “save” operation makes the chain’s dataset persistent in the current working directory of the query; a hidden folder ``.datachain/`` holds the records. A persistent dataset can be accessed later to start a derivative chain:
279
+
280
+ .. code:: py
281
+
282
+ DataChain.from_dataset("dialog-eval").limit(2).save("dialog-eval")
283
+
284
+ Persistent datasets are immutable and automatically versioned. Versions can be listed from the shell:
285
+
286
+ .. code:: shell
287
+
288
+ $ datachain ls-datasets
289
+
290
+ dialog-eval (v1)
291
+ dialog-eval (v2)
292
+
293
+ By default, when a persistent dataset is loaded, the latest version is fetched, but another version can be requested:
294
+
295
+ .. code:: py
296
+
297
+ ds = DataChain.from_dataset("dialog-eval", version=1)
298
+
299
+ Chain optimization and execution
300
+ --------------------------------
301
+
302
+ DataChain avoids redundant operations: execution is triggered only when a downstream operation requests the processed results. Still, it would be inefficient to re-run, say, LLM queries every time you just want to collect several objects.
303
+
304
+ The “save” operation pins execution results and automatically reuses them whenever downstream functions ask for data. Saving without an explicit name generates an auto-named dataset, which serves the same purpose.
305
+
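+ A minimal sketch of this behavior (the derived dataset name and the filter are illustrative):
+
+ .. code:: py
+
+ chain = DataChain.from_dataset("dialog-eval")  # lazy: nothing is executed yet
+ chain = chain.filter(Column("mistral_response") != "").save("dialog-nonempty")
+ print(chain.count())  # answered from the saved dataset; the filter is not re-run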
306
+
307
+ Matching data with metadata
308
+ ----------------------------
309
+ It is common for AI data to come with pre-computed metadata (annotations, classes, etc.).
310
+
311
+ The DataChain library understands common metadata formats (JSON, CSV, and Parquet), and can unite data samples from storage with side-loaded metadata. The metadata schema can be set explicitly or inferred.
312
+
313
+ Here is an example of reading a CSV file where the schema is heuristically derived from the header:
314
+
315
+ .. code:: py
316
+
317
+ from datachain.lib.dc import DataChain
318
+ csv_dataset = DataChain.from_csv("gs://datachain-demo/chatbot-csv/")
319
+
320
+ print(csv_dataset.to_pandas())
321
+
322
+ Reading metadata from JSON format is a more complicated scenario because a JSON-annotated dataset typically references data samples (e.g. images) in annotation arrays somewhere within JSON files.
323
+
324
+ Here is an example from the MS COCO “captions” JSON, which employs separate sections for image metadata and captions:
325
+
326
+ .. code:: json
327
+
328
+
329
+ {
330
+ "images": [
331
+ {
332
+ "license": 4,
333
+ "file_name": "000000397133.jpg",
334
+ "coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg",
335
+ "height": 427,
336
+ "width": 640,
337
+ "date_captured": "2013-11-14 17:02:52",
338
+ "flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg",
339
+ "id": 397133
340
+ },
341
+ ...
342
+ ],
343
+ "annotations": [
344
+ {
345
+ "image_id" : "179765",
346
+ "id" : 38,
347
+ "caption" : "A black Honda motorcycle parked in front of a garage."
348
+ },
349
+ ...
350
+ ],
351
+ ...
352
+ }
353
+
354
+ To deal with this layout, we can take the following steps:
355
+
356
+ 1. Generate a dataset of raw image files from storage
357
+ 2. Generate a meta-information dataset from the JSON section “images”
358
+ 3. Join these datasets via the matching file names
359
+
360
+ .. code:: python
361
+
362
+
363
+ from datachain.lib.dc import DataChain
364
+
365
+ images = DataChain.from_storage("gs://datachain-demo/coco2017/images/val/")
366
+ meta = DataChain.from_json("gs://datachain-demo/coco2017/annotations_captions", jmespath="images")
367
+
368
+ images_with_meta = images.merge(meta, on="file.name", right_on="images.file_name")
369
+
370
+ print(images_with_meta.limit(1).results())
371
+
372
+ .. code:: shell
373
+
374
+
375
+ Processed: 5000 rows [00:00, 15481.66 rows/s]
376
+ Processed: 1 rows [00:00, 1291.75 rows/s]
377
+ Processed: 1 rows [00:00, 4.70 rows/s]
378
+ Generated: 5000 rows [00:00, 27128.67 rows/s]
379
+ [(1, 2336066478558845549, '', 0, 'coco2017/images/val', '000000000139.jpg', 'CNvXoemj8IYDEAE=', '1719096046021595', 1, datetime.datetime(2024, 6, 22, 22, 40, 46, 70000, tzinfo=datetime.timezone.utc), 161811, '', '', None, 'gs://datachain-demo', 'gs://datachain-demo', 'coco2017/images/val', '000000000139.jpg', 161811, '1719096046021595', 'CNvXoemj8IYDEAE=', 1, datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), None, '', 4146, 6967063844996569113, 2, '000000000139.jpg', 'http://images.cocodataset.org/val2017/000000000139.jpg', 426, 640, '2013-11-21 01:34:01', 'http://farm9.staticflickr.com/8035/8024364858_9c41dc1666_z.jpg', 139)]
380
+
381
+ Passing data to training
382
+ ------------------------
383
+
384
+ Chain results can be exported or passed directly to a PyTorch data loader. For example, if we are interested in passing three columns to training, the following PyTorch code will do it:
385
+
386
+ .. code:: py
387
+
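+ # assumed context: "train" is a chain prepared in earlier steps; "preprocess",
+ # "clip", "model", train() and torch's DataLoader come from the CLIP setup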
388
+ ds = train.select("file", "caption_choices", "label_ind").to_pytorch(
389
+ transform=preprocess,
390
+ tokenizer=clip.tokenize,
391
+ )
392
+
393
+ loader = DataLoader(ds, batch_size=2)
394
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
395
+ train(loader, model, optimizer)
396
+
397
+ Tutorials
398
+ ------------------
399
+
400
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
401
+
402
+ Contributions
403
+ --------------------
404
+
405
+ Contributions are very welcome.
406
+ To learn more, see the `Contributor Guide`_.
407
+
408
+
409
+ License
410
+ -------
411
+
412
+ Distributed under the terms of the `Apache 2.0 license`_,
413
+ *DataChain* is free and open source software.
414
+
415
+
416
+ Issues
417
+ ------
418
+
419
+ If you encounter any problems,
420
+ please `file an issue`_ along with a detailed description.
421
+
422
+
423
+ .. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
424
+ .. _PyPI: https://pypi.org/
425
+ .. _file an issue: https://github.com/iterative/dvcx/issues
426
+ .. _pip: https://pip.pypa.io/
427
+ .. github-only
428
+ .. _Contributor Guide: CONTRIBUTING.rst
429
+ .. _Pydantic: https://github.com/pydantic/pydantic
{datachain-0.2.5.dist-info → datachain-0.2.7.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
1
- datachain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
1
+ datachain/__init__.py,sha256=WTZQycUOpP1b-Ry_Qje5HH0EE14ptne-ZiQQ5070UMA,798
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
4
4
  datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
@@ -36,15 +36,16 @@ datachain/data_storage/schema.py,sha256=bY3q2OUaUraos0s5BnwWkhgce8YpeNmIl7M1ifsh
36
36
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
37
37
  datachain/data_storage/sqlite.py,sha256=F68Q_AIqNAObZ5kJ0GnBqRC6e2D2sRehkQo8UzrHgtI,25079
38
38
  datachain/data_storage/warehouse.py,sha256=h35JiJoCGtwkMctis_x3NHxkwEejX5sIWvJOluZxrOI,33132
39
+ datachain/image/__init__.py,sha256=g3l7vJFzg0-s5OAmBtGargsxt12TuKU4Ex6S0fOmEeY,101
39
40
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
41
  datachain/lib/arrow.py,sha256=FF3WWUOjB6Prw8ygfiLsrVfrdob0S01lPzEazuGqoO8,2556
41
42
  datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
42
43
  datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
43
44
  datachain/lib/clip.py,sha256=rDeZlFGs0DXBlpmh5ZQJhR9Sz13bWAZGQjfYm1hsUI4,5388
44
45
  datachain/lib/dc.py,sha256=D3cgib-U0Mo0x5wEK1_NfgymAldHqCvooZwtyohi53Q,34426
45
- datachain/lib/feature.py,sha256=QDloA9HE7URf9J_veKrguYBvSg-0cbXZFTswNxrKsB8,12135
46
+ datachain/lib/feature.py,sha256=iMwbMyQUyjRUeB-vhAucnx59kNSVvX_xEChTW5B9klY,12244
46
47
  datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
47
- datachain/lib/feature_utils.py,sha256=oqRO_Mu3epOr1HPTxAJ8TxsJshUfKJQtulCDgHtInMI,4557
48
+ datachain/lib/feature_utils.py,sha256=2yLdZd9o4AJ5QQX7kqgbCxCT78aT7HE12CLxQ6QRpbc,4982
48
49
  datachain/lib/file.py,sha256=LGBwC7tFU7VcSWk5kjPpEWPBQas5me69L2uTDNvYXGM,8326
49
50
  datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
50
51
  datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
@@ -92,9 +93,10 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
92
93
  datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
93
94
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
94
95
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
95
- datachain-0.2.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
- datachain-0.2.5.dist-info/METADATA,sha256=VDc20_FTRJRF63521iwyb67LMCYVXn2BqeER4IVc840,14810
97
- datachain-0.2.5.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
98
- datachain-0.2.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
- datachain-0.2.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
- datachain-0.2.5.dist-info/RECORD,,
96
+ datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
97
+ datachain-0.2.7.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
98
+ datachain-0.2.7.dist-info/METADATA,sha256=wCM5xqbN0jL3rMZscamMVXYzPRyobgZouv98pQzPK5U,16475
99
+ datachain-0.2.7.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
100
+ datachain-0.2.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
101
+ datachain-0.2.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
102
+ datachain-0.2.7.dist-info/RECORD,,
datachain-0.2.5.dist-info/METADATA DELETED
@@ -1,376 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: datachain
3
- Version: 0.2.5
4
- Summary: Wrangle unstructured AI data at scale
5
- Author-email: Dmitry Petrov <support@dvc.org>
6
- License: Apache-2.0
7
- Project-URL: Documentation, https://datachain.dvc.ai
8
- Project-URL: Issues, https://github.com/iterative/dvcx/issues
9
- Project-URL: Source, https://github.com/iterative/dvcx
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.9
12
- Classifier: Programming Language :: Python :: 3.10
13
- Classifier: Programming Language :: Python :: 3.11
14
- Classifier: Programming Language :: Python :: 3.12
15
- Classifier: Development Status :: 2 - Pre-Alpha
16
- Requires-Python: >=3.9
17
- Description-Content-Type: text/x-rst
18
- License-File: LICENSE
19
- Requires-Dist: pyyaml
20
- Requires-Dist: tomlkit
21
- Requires-Dist: tqdm
22
- Requires-Dist: numpy
23
- Requires-Dist: pandas >=2.0.0
24
- Requires-Dist: pyarrow
25
- Requires-Dist: typing-extensions
26
- Requires-Dist: python-dateutil >=2
27
- Requires-Dist: attrs >=21.3.0
28
- Requires-Dist: s3fs >=2024.2.0
29
- Requires-Dist: gcsfs >=2024.2.0
30
- Requires-Dist: adlfs >=2024.2.0
31
- Requires-Dist: dvc-data <4,>=3.10
32
- Requires-Dist: dvc-objects <6,>=4
33
- Requires-Dist: shtab <2,>=1.3.4
34
- Requires-Dist: sqlalchemy >=2
35
- Requires-Dist: multiprocess ==0.70.16
36
- Requires-Dist: dill ==0.3.8
37
- Requires-Dist: ujson >=5.9.0
38
- Requires-Dist: pydantic <3,>=2
39
- Requires-Dist: jmespath >=1.0
40
- Requires-Dist: datamodel-code-generator >=0.25
41
- Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
42
- Provides-Extra: cv
43
- Requires-Dist: Pillow <11,>=10.0.0 ; extra == 'cv'
44
- Requires-Dist: torch >=2.1.0 ; extra == 'cv'
45
- Requires-Dist: torchvision ; extra == 'cv'
46
- Requires-Dist: transformers >=4.36.0 ; extra == 'cv'
47
- Provides-Extra: dev
48
- Requires-Dist: datachain[docs,tests] ; extra == 'dev'
49
- Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
50
- Requires-Dist: types-python-dateutil ; extra == 'dev'
51
- Requires-Dist: types-PyYAML ; extra == 'dev'
52
- Requires-Dist: types-requests ; extra == 'dev'
53
- Requires-Dist: types-ujson ; extra == 'dev'
54
- Provides-Extra: docs
55
- Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
56
- Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
57
- Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
58
- Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
59
- Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
60
- Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
61
- Provides-Extra: remote
62
- Requires-Dist: lz4 ; extra == 'remote'
63
- Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
64
- Requires-Dist: requests >=2.22.0 ; extra == 'remote'
65
- Provides-Extra: tests
66
- Requires-Dist: datachain[cv,remote,vector] ; extra == 'tests'
67
- Requires-Dist: pytest <9,>=8 ; extra == 'tests'
68
- Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
69
- Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
70
- Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
71
- Requires-Dist: pytest-servers[all] >=0.5.4 ; extra == 'tests'
72
- Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
73
- Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
74
- Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
75
- Requires-Dist: virtualenv ; extra == 'tests'
76
- Requires-Dist: dulwich ; extra == 'tests'
77
- Requires-Dist: hypothesis ; extra == 'tests'
78
- Requires-Dist: open-clip-torch ; extra == 'tests'
79
- Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
80
- Requires-Dist: requests-mock ; extra == 'tests'
81
- Provides-Extra: vector
82
- Requires-Dist: usearch ; extra == 'vector'
83
-
84
- |PyPI| |Python Version| |Codecov| |Tests| |License|
85
-
86
- .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
87
- :target: https://pypi.org/project/datachain/
88
- :alt: PyPI
89
- .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
90
- :target: https://pypi.org/project/datachain
91
- :alt: Python Version
92
- .. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
93
- :target: https://app.codecov.io/gh/iterative/dvcx
94
- :alt: Codecov
95
- .. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
96
- :target: https://github.com/iterative/dvcx/actions?workflow=Tests
97
- :alt: Tests
98
- .. |License| image:: https://img.shields.io/pypi/l/datachain
99
- :target: https://opensource.org/licenses/Apache-2.0
100
- :alt: License
101
-
102
- AI 🔗 DataChain
103
- ----------------
104
-
105
- DataChain is an open-source Python data processing library for wrangling unstructured AI data at scale.
106
-
107
- It enables batch LLM API calls and local language and vision AI model inferences to run in parallel over many samples as chained operations resolving to table-like datasets. These datasets can be saved, versioned, and sent directly to PyTorch and TensorFlow for training. DataChain employs rigorous `Pydantic`_ data structures, promoting better data processing practices and enabling vectorized analytical operations normally found in databases.
108
-
109
- DataChain fills the gap between dataframe libraries, data warehouses, and Python-based multimodal AI applications. Our primary use cases include massive data curation, LLM analytics and validation, batch image segmentation and pose detection, and GenAI data alignment.
110
-
111
- .. code:: console
112
-
113
- $ pip install datachain
114
-
115
- Basic operation
116
- ---------------
117
-
118
- DataChain is built by composing wrangling operations.
119
-
120
- For example, it can be instructed to read files from the cloud, map them onto a modern AI service returning a Python object, parallelize API calls, save the result as a dataset, and export a column:
121
-
122
- .. code:: py
123
-
124
- import os
125
- import anthropic
- import datachain as dc
126
-
127
- from anthropic.types.message import Message
128
- ClaudeModel = dc.pydantic_to_feature(Message)
129
- PROMPT = "summarize this book in less than 200 words"
130
- service = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
131
- source = "gs://datachain-demo/mybooks/"
132
-
133
- chain = dc.DataChain(source) \
134
- .filter(File.name.glob("*.txt")) \
135
- .settings(parallel=4) \
136
- .map( \
137
- claude = lambda file: \
138
- ClaudeModel(**service.messages.create( \
139
- model="claude-3-haiku-20240307", \
140
- system=PROMPT, \
141
- messages=[{"role": "user", \
142
- "content": file.get_value()}] \
143
- ), \
144
- ).model_dump() \
145
- ) \
146
- .save("mydataset")
147
-
148
- dc.DataChain("mydataset").export("./", "claude.response") # export summaries
149
-
150
- Dataset persistence
151
- -------------------
152
-
153
- In the example above, the chain resolves to a saved dataset “mydataset”. DataChain datasets are immutable and versioned. A saved dataset version can be used as a data source:
154
-
155
- .. code:: py
156
-
157
- ds = dc.DataChain("mydataset", version = 1)
158
-
159
- Note that DataChain represents file samples as pointers into their respective storage locations. This means a newly created dataset version does not duplicate files in storage, and storage remains the single source of truth for the original samples.
160
-
161
- Vectorized analytics
162
- ---------------------
163
- Since datasets are internally represented as tables, analytical queries can be vectorized:
164
-
165
- .. code:: py
166
-
167
- rate = ds.filter(chain.response == "Success").count() / chain.count() # ??
168
- print(f"API class success rate: {100*rate:.2f}%")
169
- >> 74.68%
170
-
171
- price_input = 0.25
172
- price_output = 1.25
173
- price=(ds.sum(C.claude.usage.input_tokens)*price_input \
174
- + ds.sum(C.claude.usage.output_tokens)*price_output)/1_000_000
175
- print(f"Cost of API calls: ${price:.2f}")
176
- >> Cost of API calls: $1.42
177
-
178
-
179
- Importing metadata
180
- ------------------------
181
-
182
- It is common for AI data to come together with metadata (annotations, classes, etc.).
183
- DataChain understands many metadata formats, and can connect data samples in storage with external metadata (e.g. CSV columns) to form a single dataset:
184
-
185
- .. code:: py
186
-
187
- from dc import parse_csv
188
-
189
- files = dc.DataChain("gs://datachain-demo/myimages/")
190
- metadata = dc.DataChain("gs://datachain-demo/myimagesmetadata.csv") \
191
- .gen(meta=parse_csv) # TBD, also dependent on dropping file
192
- dataset = files.merge(metadata, on="file.name", right_on="name")
193
-
194
- print(dataset.select("file.name", "class", "prob").limit(5).to_pandas())
195
- ....
196
- ....
197
- ....
198
- ....
199
- ....
200
-
201
- Nested annotations (like JSON) can be unrolled into rows and columns in the way that best fits the application. For example, the MS COCO dataset includes JSON annotations detailing segmentations. To build a dataset consisting of all segmented objects in all COCO images:
202
-
203
- .. code:: py
204
-
205
- image_files = dc.DataChain("gs://datachain-demo/coco/images/")
206
- image_meta = dc.DataChain("gs://datachain-demo/coco.json") \
207
- .gen(meta=parse_json, key="images") # list of images
208
- images = image_files.merge(image_meta, on = "file.name", right_on="file_name")
209
- objects_meta = dc.DataChain("gs://datachain-demo/coco.json") \
210
- .gen(meta=parse_json, key="annotations") # annotated objects
211
-
212
- objects = images.full_merge(objects_meta, on="id", right_on="image_id")
213
-
214
- Generating metadata
215
- ---------------------
216
-
217
- A typical step in data curation is to create features from data samples for future selection. DataChain represents the newly created metadata as columns, which makes it easy to create new features and filter on them:
218
-
219
- .. code:: py
220
-
221
- import json
-
- from fashion_clip.fashion_clip import FashionCLIP
222
- from sqlalchemy import JSON
223
- from tabulate import tabulate
224
-
225
- from datachain.lib.param import Image
226
- from datachain.query import C, DatasetQuery, udf
227
-
228
-
229
- @udf(
230
- params=(Image(),),
231
- output={"fclip": JSON},
232
- method="fashion_clip",
233
- batch=10,
234
- )
235
- class MyFashionClip:
236
- def __init__(self):
237
- self.fclip = FashionCLIP("fashion-clip")
238
-
239
- def fashion_clip(self, inputs):
240
- embeddings = self.fclip.encode_images(
241
- [input[0] for input in inputs], batch_size=1
242
- )
243
- return [(json.dumps(emb),) for emb in embeddings.tolist()]
244
-
245
- chain = dc.DataChain("gs://datachain-demo/zalando/images/").filter(
246
- C.name.glob("*.jpg")
247
- ).limit(5).add_signals(MyFashionClip).save("zalando_hd_emb")
248
-
249
- test_image = "cs://datachain-demo/zalando/test/banner.jpg"
250
- test_embedding = MyFashionClip.fashion_clip.encode_images(Image(test_image))
251
-
252
- best_matches = chain.filter(similarity_search(test_embedding)).limit(5)
253
-
254
- print(best_matches.to_result())
255
-
256
-
257
- Delta updates
258
- -------------
259
-
260
- DataChain is capable of “delta updates” – that is, batch-processing only the newly added data samples. For example, let us copy some images into a local folder and run a chain to generate captions with a locally served captioning model from HuggingFace:
261
-
262
- .. code:: console
263
-
264
- > mkdir /tmp/demo-images/
265
- > datachain cp gs://datachain-demo/images/ /tmp/demo-images
266
-
267
-
268
- .. code:: py
269
-
270
- import torch
271
-
272
- from datachain.lib.hf_image_to_text import LLaVAdescribe
273
- from datachain.query import C, DatasetQuery
274
-
275
- source = "/tmp/demo-images"
276
-
277
- if torch.cuda.is_available():
278
- device = "cuda"
279
- else:
280
- device = "cpu"
281
-
282
- if __name__ == "__main__":
283
- results = (
284
- DatasetQuery(
285
- source,
286
- anon=True,
287
- )
288
- .filter(C.name.glob("*.jpg"))
289
- .add_signals(
290
- LLaVAdescribe(
291
- device=device,
292
- model=model,
293
- ),
294
- parallel=False,
295
- )
296
- .save("annotated-images")
297
- )
298
-
299
- Now let us add a few more images to the same folder:
300
-
301
- .. code:: console
302
-
303
- > datachain cp gs://datachain-demo/extra-images/ /tmp/demo-images
304
-
305
- and calculate updates only for the delta:
306
-
307
- .. code:: py
308
-
309
- processed = dc.DataChain("annotated-images")
310
- delta = dc.DataChain("/tmp/demo-images").subtract(processed)
311
-
312
- Passing data to training
313
- ------------------------
314
-
315
- Datasets can be exported to CSV or WebDataset formats. However, a much better way to pass data to training, one that avoids data copies and re-sharding, is to wrap a DataChain dataset into a PyTorch class and let the library take care of file downloads and caching under the hood:
316
-
317
- .. code:: py
318
-
319
- ds = dc.DataChain("gs://datachain-demo/name-labeled/images/")
320
- .filter(C.name.glob("*.jpg"))
321
- .map(lambda name: (name[:3],), output={"label": str}, parallel=4)
322
- )
323
-
324
- train_loader = DataLoader(
325
- ds.to_pytorch(
326
- ImageReader(),
327
- LabelReader("label", classes=CLASSES),
328
- transform=transform,
329
- ),
330
- batch_size=16,
331
- parallel=2,
332
- )
333
-
334
- Tutorials
335
- ------------------
336
-
337
- * `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvcx/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
338
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
339
-
340
- 💻  More examples
341
- ------------------
342
-
343
- * Curating images to train a custom CLIP model without re-sharding the Webdataset files
344
- * Batch-transforming and indexing images to create a searchable merchandise catalog
345
- * Evaluating an LLM application at scale
346
- * Ranking the LLM retrieval strategies
347
- * Delta updates in batch processing
348
-
349
- Contributions
350
- --------------------
351
-
352
- Contributions are very welcome.
353
- To learn more, see the `Contributor Guide`_.
354
-
355
-
356
- License
357
- -------
358
-
359
- Distributed under the terms of the `Apache 2.0 license`_,
360
- *DataChain* is free and open source software.
361
-
362
-
363
- Issues
364
- ------
365
-
366
- If you encounter any problems,
367
- please `file an issue`_ along with a detailed description.
368
-
369
-
370
- .. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
371
- .. _PyPI: https://pypi.org/
372
- .. _file an issue: https://github.com/iterative/dvcx/issues
373
- .. _pip: https://pip.pypa.io/
374
- .. github-only
375
- .. _Contributor Guide: CONTRIBUTING.rst
376
- .. _Pydantic: https://github.com/pydantic/pydantic