datachain 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +34 -0
- datachain/lib/dc.py +12 -20
- datachain/lib/feature.py +7 -2
- datachain/lib/feature_utils.py +35 -20
- datachain/lib/file.py +0 -3
- datachain/lib/pytorch.py +9 -2
- datachain/lib/signal_schema.py +49 -9
- datachain/lib/udf.py +0 -1
- datachain-0.2.6.dist-info/METADATA +429 -0
- {datachain-0.2.4.dist-info → datachain-0.2.6.dist-info}/RECORD +14 -14
- datachain-0.2.4.dist-info/METADATA +0 -371
- {datachain-0.2.4.dist-info → datachain-0.2.6.dist-info}/LICENSE +0 -0
- {datachain-0.2.4.dist-info → datachain-0.2.6.dist-info}/WHEEL +0 -0
- {datachain-0.2.4.dist-info → datachain-0.2.6.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.4.dist-info → datachain-0.2.6.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -0,0 +1,34 @@
+from datachain.lib.dc import C, DataChain
+from datachain.lib.feature import Feature
+from datachain.lib.feature_utils import pydantic_to_feature
+from datachain.lib.file import File, FileError, FileFeature, IndexedFile, TarVFile
+from datachain.lib.image import ImageFile, convert_images
+from datachain.lib.text import convert_text
+from datachain.lib.udf import Aggregator, Generator, Mapper
+from datachain.lib.utils import AbstractUDF, DataChainError
+from datachain.query.dataset import UDF as BaseUDF  # noqa: N811
+from datachain.query.schema import Column
+from datachain.query.session import Session
+
+__all__ = [
+    "AbstractUDF",
+    "Aggregator",
+    "BaseUDF",
+    "C",
+    "Column",
+    "DataChain",
+    "DataChainError",
+    "Feature",
+    "File",
+    "FileError",
+    "FileFeature",
+    "Generator",
+    "ImageFile",
+    "IndexedFile",
+    "Mapper",
+    "Session",
+    "TarVFile",
+    "convert_images",
+    "convert_text",
+    "pydantic_to_feature",
+]
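The hunk above shows that 0.2.6 re-exports the chaining API from the package root, so the pieces used in the README examples quoted later in this diff can be imported directly from `datachain`. A minimal, hedged sketch of what that enables (the bucket path is a placeholder, and the call pattern follows the README examples rather than any documented guarantee):

.. code:: py

    from datachain import C, DataChain

    # Placeholder storage location; from_storage() and glob-based filtering are
    # used here the same way as in the README examples further down.
    chain = (
        DataChain.from_storage("gs://my-bucket/texts/")
        .filter(C.name.glob("*.txt"))
        .limit(10)
    )
    print(chain.count())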
datachain/lib/dc.py
CHANGED
@@ -137,6 +137,7 @@ class DataChain(DatasetQuery):
             indexing_column_types=File._datachain_column_types,
         )
         self._settings = Settings()
+        self._setup = {}
 
         if self.feature_schema:
             self.signals_schema = SignalSchema.deserialize(self.feature_schema)
@@ -536,9 +537,9 @@ class DataChain(DatasetQuery):
         name = self.name or ""
 
         sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
-        params_schema = self.signals_schema.slice(sign.params)
+        params_schema = self.signals_schema.slice(sign.params, self._setup)
 
-        return UDFBase._create(target_class, sign, params_schema
+        return UDFBase._create(target_class, sign, params_schema)
 
     def _extend_features(self, method_name, *args, **kwargs):
         super_func = getattr(super(), method_name)
@@ -569,18 +570,6 @@ class DataChain(DatasetQuery):
         chain.signals_schema = new_schema
         return chain
 
-    def get_values(self, *cols: str) -> Iterator[list]:
-        """Iterate over rows, getting feature values and applying reader calls.
-
-        If columns are specified - limit them to specified columns.
-        """
-        for features in self.iterate(*cols):
-            yield [fr.get_value() if isinstance(fr, Feature) else fr for fr in features]  # type: ignore[union-attr,call-arg]
-
-    def get_one_value(self, col: str) -> Iterator:
-        for item in self.get_values(col):
-            yield item[0]
-
     def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
         """Iterate over rows.
 
@@ -604,12 +593,6 @@ class DataChain(DatasetQuery):
     def collect_one(self, col: str) -> list[FeatureType]:
         return list(self.iterate_one(col))
 
-    def collect_values(self, *cols: str) -> list[list]:
-        return list(self.get_values(*cols))
-
-    def collect_one_value(self, col: str) -> list:
-        return list(self.get_one_value(col))
-
     def to_pytorch(self, **kwargs):
         """Convert to pytorch dataset format."""
 
@@ -931,3 +914,12 @@ class DataChain(DatasetQuery):
 
     def max(self, fr: FeatureType):  # type: ignore[override]
         return self._extend_features("max", fr)
+
+    def setup(self, **kwargs) -> "Self":
+        intersection = set(self._setup.keys()) & set(kwargs.keys())
+        if intersection:
+            keys = ", ".join(intersection)
+            raise DatasetPrepareError(self.name, f"this value(s) already setup: {keys}")
+
+        self._setup = self._setup | kwargs
+        return self
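The new `setup()` method stores named factories on the chain and forwards them to `SignalSchema.slice()`, so a UDF can receive a heavyweight resource (an API client, a model) that is built once instead of once per row. A hedged sketch of how this could combine with the Mistral example from the README later in this diff; the mapper signature is inferred from this diff, not taken from documented API:

.. code:: py

    import os

    from mistralai.client import MistralClient
    from mistralai.models.chat_completion import ChatMessage

    from datachain.lib.dc import DataChain

    chain = (
        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
        # Registered once; the factory is only called lazily when rows are
        # materialized (see _init_setup_values in signal_schema.py below).
        .setup(client=lambda: MistralClient(api_key=os.environ["MISTRAL_API_KEY"]))
        .map(
            # "client" is injected from setup(), "file" comes from the dataset.
            mistral_response=lambda client, file: client.chat(
                model="mistral-large-latest",
                messages=[ChatMessage(role="user", content=file.get_value())],
            )
            .choices[0]
            .message.content,
        )
        .save("dialog-eval-setup")
    )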
datachain/lib/feature.py
CHANGED
@@ -4,6 +4,7 @@ import re
 import warnings
 from collections.abc import Iterable, Sequence
 from datetime import datetime
+from enum import Enum
 from functools import lru_cache
 from types import GenericAlias
 from typing import (
@@ -63,6 +64,7 @@ TYPE_TO_DATACHAIN = {
     str: String,
     Literal: String,
     LiteralEx: String,
+    Enum: String,
     float: Float,
     bool: Boolean,
     datetime: DateTime,  # Note, list of datetime is not supported yet
@@ -364,8 +366,11 @@ def _resolve(cls, name, field_info, prefix: list[str]):
 
 
 def convert_type_to_datachain(typ):  # noqa: PLR0911
-    if inspect.isclass(typ)
-
+    if inspect.isclass(typ):
+        if issubclass(typ, SQLType):
+            return typ
+        if issubclass(typ, Enum):
+            return str
 
     res = TYPE_TO_DATACHAIN.get(typ)
     if res:
datachain/lib/feature_utils.py
CHANGED
@@ -1,5 +1,7 @@
+import inspect
 import string
 from collections.abc import Sequence
+from enum import Enum
 from typing import Any, Union, get_args, get_origin
 
 from pydantic import BaseModel, create_model
@@ -12,9 +14,6 @@ from datachain.lib.feature import (
     convert_type_to_datachain,
 )
 from datachain.lib.utils import DataChainParamsError
-from datachain.query.schema import Column
-
-FeatureLike = Union[type["Feature"], Column, str]
 
 AUTO_FEATURE_PREFIX = "_auto_fr"
 SUFFIX_SYMBOLS = string.digits + string.ascii_lowercase
@@ -38,23 +37,7 @@ def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
     for name, field_info in data_cls.model_fields.items():
         anno = field_info.annotation
         if anno not in TYPE_TO_DATACHAIN:
-
-            if orig is list:
-                anno = get_args(anno)  # type: ignore[assignment]
-                if isinstance(anno, Sequence):
-                    anno = anno[0]  # type: ignore[unreachable]
-                is_list = True
-            else:
-                is_list = False
-
-            try:
-                convert_type_to_datachain(anno)
-            except TypeError:
-                if not Feature.is_feature(anno):  # type: ignore[arg-type]
-                    anno = pydantic_to_feature(anno)  # type: ignore[arg-type]
-
-            if is_list:
-                anno = list[anno]  # type: ignore[valid-type]
+            anno = _to_feature_type(anno)
         fields[name] = (anno, field_info.default)
 
     cls = create_model(
@@ -66,6 +49,38 @@ def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
     return cls
 
 
+def _to_feature_type(anno):
+    if inspect.isclass(anno) and issubclass(anno, Enum):
+        return str
+
+    orig = get_origin(anno)
+    if orig is list:
+        anno = get_args(anno)  # type: ignore[assignment]
+        if isinstance(anno, Sequence):
+            anno = anno[0]  # type: ignore[unreachable]
+        is_list = True
+    else:
+        is_list = False
+
+    try:
+        convert_type_to_datachain(anno)
+    except TypeError:
+        if not Feature.is_feature(anno):  # type: ignore[arg-type]
+            orig = get_origin(anno)
+            if orig in TYPE_TO_DATACHAIN:
+                anno = _to_feature_type(anno)
+            else:
+                if orig == Union:
+                    args = get_args(anno)
+                    if len(args) == 2 and (type(None) in args):
+                        return _to_feature_type(args[0])
+
+                anno = pydantic_to_feature(anno)  # type: ignore[arg-type]
+    if is_list:
+        anno = list[anno]  # type: ignore[valid-type]
+    return anno
+
+
 def features_to_tuples(
     ds_name: str = "",
     output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
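The new `_to_feature_type()` helper above recursively unwraps `list[...]`, two-argument `Union`s with `None` (i.e. `Optional[...]`) and `Enum` annotations before falling back to a nested `pydantic_to_feature()` conversion, which is what lets `Enum` fields land as strings in the schema. A small illustrative model, with hypothetical names that are not part of the package:

.. code:: py

    from enum import Enum
    from typing import Optional

    from pydantic import BaseModel

    from datachain.lib.feature_utils import pydantic_to_feature


    class Status(Enum):
        OK = "ok"
        FAILED = "failed"


    class Report(BaseModel):
        title: str
        status: Status                 # Enum annotations map to str per the diff above
        score: Optional[float] = None  # Optional[...] is handled by the Union branch

    # Converts the pydantic model into a Feature class usable in a chain.
    ReportFeature = pydantic_to_feature(Report)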
datachain/lib/file.py
CHANGED
datachain/lib/pytorch.py
CHANGED
@@ -8,6 +8,7 @@ from torch.utils.data import IterableDataset, get_worker_info
 
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
+from datachain.lib.feature import Feature
 from datachain.lib.text import convert_text
 
 if TYPE_CHECKING:
@@ -100,8 +101,14 @@ class PytorchDataset(IterableDataset):
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)
         ds = ds.chunk(total_rank, total_workers)
-        stream = ds.
-        for
+        stream = ds.iterate()
+        for row_features in stream:
+            row = []
+            for fr in row_features:
+                if isinstance(fr, Feature):
+                    row.append(fr.get_value())  # type: ignore[unreachable]
+                else:
+                    row.append(fr)
             # Apply transforms
             if self.transform:
                 try:
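With the change above, `PytorchDataset` unwraps `Feature` objects to their underlying values via `get_value()` while streaming rows, so a chain can be handed to a standard `DataLoader`. A brief sketch mirroring the README example further down in this diff; the bucket path and transform are placeholders:

.. code:: py

    from torch.utils.data import DataLoader
    from torchvision import transforms

    from datachain.lib.dc import DataChain

    preprocess = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])

    # Placeholder bucket; any storage location indexed by DataChain would do.
    chain = DataChain.from_storage("gs://datachain-demo/coco2017/images/val/")

    # to_pytorch() wraps the chain in an IterableDataset; Feature objects in
    # each row are now unwrapped to plain values, per the diff above.
    loader = DataLoader(chain.to_pytorch(transform=preprocess), batch_size=16)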
datachain/lib/signal_schema.py
CHANGED
@@ -1,7 +1,7 @@
 import copy
 from collections.abc import Iterator, Sequence
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Optional, Union, get_args, get_origin
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin
 
 from pydantic import create_model
 
@@ -42,6 +42,11 @@ class SignalResolvingError(SignalSchemaError):
         super().__init__(f"cannot resolve signal name{name}: {msg}")
 
 
+class SetupError(SignalSchemaError):
+    def __init__(self, name: str, msg: str):
+        super().__init__(f"cannot setup value '{name}': {msg}")
+
+
 class SignalResolvingTypeError(SignalResolvingError):
     def __init__(self, method: str, field):
         super().__init__(
@@ -52,9 +57,31 @@ class SignalResolvingTypeError(SignalResolvingError):
 
 
 class SignalSchema:
-    def __init__(
+    def __init__(
+        self,
+        values: dict[str, FeatureType],
+        setup: Optional[dict[str, Callable]] = None,
+    ):
         self.values = values
-        self.tree = self._build_tree()
+        self.tree = self._build_tree(values)
+
+        self.setup_func = setup or {}
+        self.setup_values = None
+        for key, func in self.setup_func.items():
+            if not callable(func):
+                raise SetupError(key, "value must be function or callable class")
+
+    def _init_setup_values(self):
+        if self.setup_values is not None:
+            return self.setup_values
+
+        res = {}
+        for key, func in self.setup_func.items():
+            try:
+                res[key] = func()
+            except Exception as ex:
+                raise SetupError(key, f"error when call function: '{ex}'") from ex
+        self.setup_values = res
 
     @staticmethod
     def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
@@ -111,16 +138,22 @@
     def to_udf_spec(self) -> dict[str, Any]:
         res = {}
         for path, type_, has_subtree, _ in self.get_flat_tree():
+            if path[0] in self.setup_func:
+                continue
             if not has_subtree:
                 db_name = DEFAULT_DELIMITER.join(path)
                 res[db_name] = convert_type_to_datachain(type_)
         return res
 
     def row_to_objs(self, row: Sequence[Any]) -> list[FeatureType]:
+        self._init_setup_values()
+
         objs = []
         pos = 0
-        for fr_type in self.values.
-        if
+        for name, fr_type in self.values.items():
+            if val := self.setup_values.get(name, None):  # type: ignore[attr-defined]
+                objs.append(val)
+            elif Feature.is_feature(fr_type):
                 j, pos = fr_type._unflatten_to_json_pos(row, pos)  # type: ignore[union-attr]
                 objs.append(fr_type(**j))
             else:
@@ -135,8 +168,14 @@
             if Feature.is_feature(fr)
         )
 
-    def slice(
-
+    def slice(
+        self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None
+    ) -> "SignalSchema":
+        setup = setup or {}
+        setup_no_types = dict.fromkeys(setup.keys(), str)
+        union = self.values | setup_no_types
+        schema = {k: union[k] for k in keys if k in union}
+        return SignalSchema(schema, setup)
 
     def row_to_features(self, row: Sequence, catalog: "Catalog") -> list[FeatureType]:
         res = []
@@ -235,10 +274,11 @@
             **fields,
         )
 
-
+    @staticmethod
+    def _build_tree(values: dict[str, FeatureType]) -> dict[str, Any]:
         res = {}
 
-        for name, val in
+        for name, val in values.items():
             subtree = val.build_tree() if Feature.is_feature(val) else None  # type: ignore[union-attr]
             res[name] = (val, subtree)
 
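Taken together, the `SignalSchema` changes mean setup entries are excluded from the UDF's database spec (`to_udf_spec`) and are materialized lazily: `_init_setup_values()` calls each registered factory once, and `row_to_objs()` injects the computed value by parameter name instead of reading it from the row. A small sketch of that behavior in isolation; constructing `SignalSchema` directly like this is illustrative only, and the printed results are approximate:

.. code:: py

    from datachain.lib.signal_schema import SignalSchema

    # One real signal plus one setup value; the factory runs once, lazily.
    schema = SignalSchema(
        {"api_key": str, "name": str},
        setup={"api_key": lambda: "secret-token"},
    )

    # "api_key" is skipped in the UDF spec because it is not a database column...
    print(schema.to_udf_spec())            # roughly: {'name': String}

    # ...but row_to_objs() injects the computed value alongside the row data.
    print(schema.row_to_objs(["cat.jpg"]))  # roughly: ['secret-token', 'cat.jpg']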
datachain/lib/udf.py
CHANGED
datachain-0.2.6.dist-info/METADATA
ADDED
@@ -0,0 +1,429 @@
+Metadata-Version: 2.1
+Name: datachain
+Version: 0.2.6
+Summary: Wrangle unstructured AI data at scale
+Author-email: Dmitry Petrov <support@dvc.org>
+License: Apache-2.0
+Project-URL: Documentation, https://datachain.dvc.ai
+Project-URL: Issues, https://github.com/iterative/dvcx/issues
+Project-URL: Source, https://github.com/iterative/dvcx
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Development Status :: 2 - Pre-Alpha
+Requires-Python: >=3.9
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+Requires-Dist: pyyaml
+Requires-Dist: tomlkit
+Requires-Dist: tqdm
+Requires-Dist: numpy
+Requires-Dist: pandas >=2.0.0
+Requires-Dist: pyarrow
+Requires-Dist: typing-extensions
+Requires-Dist: python-dateutil >=2
+Requires-Dist: attrs >=21.3.0
+Requires-Dist: s3fs >=2024.2.0
+Requires-Dist: gcsfs >=2024.2.0
+Requires-Dist: adlfs >=2024.2.0
+Requires-Dist: dvc-data <4,>=3.10
+Requires-Dist: dvc-objects <6,>=4
+Requires-Dist: shtab <2,>=1.3.4
+Requires-Dist: sqlalchemy >=2
+Requires-Dist: multiprocess ==0.70.16
+Requires-Dist: dill ==0.3.8
+Requires-Dist: ujson >=5.9.0
+Requires-Dist: pydantic <3,>=2
+Requires-Dist: jmespath >=1.0
+Requires-Dist: datamodel-code-generator >=0.25
+Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
+Provides-Extra: cv
+Requires-Dist: Pillow <11,>=10.0.0 ; extra == 'cv'
+Requires-Dist: torch >=2.1.0 ; extra == 'cv'
+Requires-Dist: torchvision ; extra == 'cv'
+Requires-Dist: transformers >=4.36.0 ; extra == 'cv'
+Provides-Extra: dev
+Requires-Dist: datachain[docs,tests] ; extra == 'dev'
+Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
+Requires-Dist: types-python-dateutil ; extra == 'dev'
+Requires-Dist: types-PyYAML ; extra == 'dev'
+Requires-Dist: types-requests ; extra == 'dev'
+Requires-Dist: types-ujson ; extra == 'dev'
+Provides-Extra: docs
+Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
+Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
+Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
+Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
+Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
+Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
+Provides-Extra: remote
+Requires-Dist: lz4 ; extra == 'remote'
+Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
+Requires-Dist: requests >=2.22.0 ; extra == 'remote'
+Provides-Extra: tests
+Requires-Dist: datachain[cv,remote,vector] ; extra == 'tests'
+Requires-Dist: pytest <9,>=8 ; extra == 'tests'
+Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
+Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
+Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
+Requires-Dist: pytest-servers[all] >=0.5.4 ; extra == 'tests'
+Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
+Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
+Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
+Requires-Dist: virtualenv ; extra == 'tests'
+Requires-Dist: dulwich ; extra == 'tests'
+Requires-Dist: hypothesis ; extra == 'tests'
+Requires-Dist: open-clip-torch ; extra == 'tests'
+Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
+Requires-Dist: requests-mock ; extra == 'tests'
+Provides-Extra: vector
+Requires-Dist: usearch ; extra == 'vector'
+
+|PyPI| |Python Version| |Codecov| |Tests|
+
+.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
+   :target: https://pypi.org/project/datachain/
+   :alt: PyPI
+.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
+   :target: https://pypi.org/project/datachain
+   :alt: Python Version
+.. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
+   :target: https://app.codecov.io/gh/iterative/dvcx
+   :alt: Codecov
+.. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
+   :target: https://github.com/iterative/dvcx/actions?workflow=Tests
+   :alt: Tests
+
+AI 🔗 DataChain
+----------------
+
+DataChain is an open-source Python data processing library for wrangling unstructured AI data at scale.
+
+Datachain enables multimodal API calls and local AI inferences to run in parallel over many samples as chained operations. The resulting datasets can be saved, versioned, and sent directly to PyTorch and TensorFlow for training. Datachain can persist features of Python objects returned by AI models, and enables vectorized analytical operations over them.
+
+The typical use cases are data curation, LLM analytics and validation, image segmentation, pose detection, and GenAI alignment. Datachain is especially helpful if batch operations can be optimized – for instance, when synchronous API calls can be parallelized or where an LLM API offers batch processing.
+
+.. code:: console
+
+   $ pip install datachain
+
+Operation basics
+----------------
+
+DataChain is built by composing wrangling operations.
+
+For example, let us consider a dataset from Karlsruhe Institute of Technology detailing dialogs between users and customer service chatbots. We can use the chain to read data from the cloud, map it onto the parallel API calls for LLM evaluation, and organize the output into a dataset :
+
+.. code:: py
+
+   # pip install mistralai
+   # this example requires a free Mistral API key, get yours at https://console.mistral.ai
+   # add the key to your shell environment: $ export MISTRAL_API_KEY= your key
+
+   # pip install mistralai
+   # this example requires a free Mistral API key, get yours at https://console.mistral.ai
+   # add the key to your shell environment: $ export MISTRAL_API_KEY= your key
+
+   import os
+
+   from mistralai.client import MistralClient
+   from mistralai.models.chat_completion import ChatMessage
+
+   from datachain.lib.dc import DataChain, Column
+
+   PROMPT = "Was this bot dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
+
+   model = "mistral-large-latest"
+   api_key = os.environ["MISTRAL_API_KEY"]
+
+   chain = (
+       DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
+       .limit(5)
+       .settings(cache=True, parallel=5)
+       .map(
+           mistral_response=lambda file: MistralClient(api_key=api_key)
+           .chat(
+               model=model,
+               response_format={"type": "json_object"},
+               messages=[
+                   ChatMessage(role="user", content=f"{PROMPT}: {file.get_value()}")
+               ],
+           )
+           .choices[0]
+           .message.content,
+       )
+       .save()
+   )
+
+   try:
+       print(chain.select("mistral_response").results())
+   except Exception as e:
+       print(f"do you have the right Mistral API key? {e}")
+
+
+.. code:: shell
+
+   [('{"result": "Yes"}',), ('{"result": "No"}',), ... , ('{"result": "Yes"}',)]
+
+Now we have parallel-processed an LLM API-based query over cloud data and persisted the results.
+
+Vectorized analytics
+--------------------
+
+Datachain internally represents datasets as tables, so analytical queries on the chain are automatically vectorized:
+
+.. code:: py
+
+   failed_dialogs = chain.filter(Column("mistral_response") == '{"result": "No"}')
+   success_rate = failed_dialogs.count() / chain.count()
+   print(f"Chatbot dialog success rate: {100*success_rate:.2f}%")
+
+
+.. code:: shell
+
+   "40.00%"
+
+Note that DataChain represents file samples as pointers into their respective storage locations. This means a newly created dataset version does not duplicate files in storage, and storage remains the single source of truth for the original samples
+
+Handling Python objects
+-----------------------
+In addition to storing primitive Python data types, chain is also capable of using data models.
+
+For example, instead of collecting just a text response from Mistral API, we might be interested in more fields of the Mistral response object. For this task, we can define a Pydantic-like model and populate it from the API replies:
+
+.. code:: py
+
+   import os
+
+   from mistralai.client import MistralClient
+   from mistralai.models.chat_completion import ChatMessage
+
+   from datachain.lib.dc import DataChain
+   from datachain.lib.feature import Feature
+
+
+   PROMPT = (
+       "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
+   )
+
+   model = "mistral-large-latest"
+   api_key = os.environ["MISTRAL_API_KEY"]
+
+
+   ## define the data model ###
+   class Usage(Feature):
+       prompt_tokens: int = 0
+       completion_tokens: int = 0
+
+
+   class MyChatMessage(Feature):
+       role: str = ""
+       content: str = ""
+
+
+   class CompletionResponseChoice(Feature):
+       message: MyChatMessage = MyChatMessage()
+
+
+   class MistralModel(Feature):
+       id: str = ""
+       choices: list[CompletionResponseChoice]
+       usage: Usage = Usage()
+
+
+   ## Populate model instances ###
+   chain = (
+       DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
+       .limit(5)
+       .settings(cache=True, parallel=5)
+       .map(
+           mistral_response=lambda file: MistralModel(
+               **MistralClient(api_key=api_key)
+               .chat(
+                   model=model,
+                   response_format={"type": "json_object"},
+                   messages=[
+                       ChatMessage(role="user", content=f"{PROMPT}: {file.get_value()}")
+                   ],
+               )
+               .dict()
+           ),
+           output=MistralModel,
+       )
+       .save("dialog-eval")
+   )
+
+After the chain execution, we can collect the objects:
+
+.. code:: py
+
+   for obj in responses:
+       assert isinstance(obj, MistralModel)
+       print(obj.dict())
+
+.. code:: shell
+
+   {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 610, 'completion_tokens': 6}}
+   {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "No"}'}}], 'usage': {'prompt_tokens': 3983, 'completion_tokens': 6}}
+   {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 706, 'completion_tokens': 6}}
+   {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "No"}'}}], 'usage': {'prompt_tokens': 1250, 'completion_tokens': 6}}
+   {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 1217, 'completion_tokens': 6}}
+
+
+Dataset persistence
+--------------------
+
+The “save” operation makes chain dataset persistent in the current (working) directory of the query. A hidden folder .datachain/ holds the records. A persistent dataset can be accessed later to start a derivative chain:
+
+.. code:: py
+
+   DataChain.from_dataset("dialog-eval").limit(2).save("dialog-eval")
+
+Persistent datasets are immutable and automatically versioned. Versions can be listed from shell:
+
+.. code:: shell
+
+   $ datachain ls-datasets
+
+   dialog-rate (v1)
+   dialog-rate (v2)
+
+By default, when a persistent dataset is loaded, the latest version is fetched but another version can be requested:
+
+.. code:: py
+
+   ds = DataChain.from_dataset("dialog-eval", version = 1)
+
+Chain optimization and execution
+--------------------------------
+
+Datachain avoids redundant operations. Execution is triggered only when a downstream operation requests the processed results. However, it would be inefficient to run, say, LLM queries again every time you just want to collect several objects.
+
+“Save” operation nails execution results and automatically refers to them every time the downstream functions ask for data. Saving without an explicit name generates an auto-named dataset which serves the same purpose.
+
+
+Matching data with metadata
+----------------------------
+It is common for AI data to come with pre-computed metadata (annotations, classes, etc).
+
+DataChain library understands common metadata formats (JSON, CSV and parquet), and can unite data samples from storage with side-loaded metadata. The schema for metadata can be set explicitly or be inferred.
+
+Here is an example of reading a CSV file where schema is heuristically derived from the header:
+
+.. code:: py
+
+   from datachain.lib.dc import DataChain
+   csv_dataset = DataChain.from_csv("gs://datachain-demo/chatbot-csv/")
+
+   print(csv_dataset.to_pandas())
+
+Reading metadata from JSON format is a more complicated scenario because a JSON-annotated dataset typically references data samples (e.g. images) in annotation arrays somewhere within JSON files.
+
+Here is an example from MS COCO “captions” JSON which employs separate sections for image meta and captions:
+
+.. code:: json
+
+
+   {
+     "images": [
+       {
+         "license": 4,
+         "file_name": "000000397133.jpg",
+         "coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg",
+         "height": 427,
+         "width": 640,
+         "date_captured": "2013-11-14 17:02:52",
+         "flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg",
+         "id": 397133
+       },
+       ...
+     ],
+     "annotations": [
+       {
+         "image_id" : "179765",
+         "id" : 38,
+         "caption" : "A black Honda motorcycle parked in front of a garage."
+       },
+       ...
+     ],
+     ...
+   }
+
+To deal with this layout, we can take the following steps:
+
+1. Generate a dataset of raw image files from storage
+2. Generate a meta-information dataset from the JSON section “images”
+3. Join these datasets via the matching id keys
+
+.. code:: python
+
+
+   from datachain.lib.dc import DataChain
+
+   images = DataChain.from_storage("gs://datachain-demo/coco2017/images/val/")
+   meta = DataChain.from_json("gs://datachain-demo/coco2017/annotations_captions", jmespath = "images")
+
+   images_with_meta = images.merge(meta, on="file.name", right_on="images.file_name")
+
+   print(images_with_meta.limit(1).results())
+
+.. code:: shell
+
+
+   Processed: 5000 rows [00:00, 15481.66 rows/s]
+   Processed: 1 rows [00:00, 1291.75 rows/s]
+   Processed: 1 rows [00:00, 4.70 rows/s]
+   Generated: 5000 rows [00:00, 27128.67 rows/s]
+   [(1, 2336066478558845549, '', 0, 'coco2017/images/val', '000000000139.jpg', 'CNvXoemj8IYDEAE=', '1719096046021595', 1, datetime.datetime(2024, 6, 22, 22, 40, 46, 70000, tzinfo=datetime.timezone.utc), 161811, '', '', None, 'gs://datachain-demo', 'gs://datachain-demo', 'coco2017/images/val', '000000000139.jpg', 161811, '1719096046021595', 'CNvXoemj8IYDEAE=', 1, datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), None, '', 4146, 6967063844996569113, 2, '000000000139.jpg', 'http://images.cocodataset.org/val2017/000000000139.jpg', 426, 640, '2013-11-21 01:34:01', 'http://farm9.staticflickr.com/8035/8024364858_9c41dc1666_z.jpg', 139)]
+
+Passing data to training
+------------------------
+
+Chain results can be exported or passed directly to Pytorch dataloader. For example, if we are interested in passing three columns to training, the following Pytorch code will do it:
+
+.. code:: py
+
+   ds = train.select("file", "caption_choices", "label_ind").to_pytorch(
+       transform=preprocess,
+       tokenizer=clip.tokenize,
+   )
+
+   loader = DataLoader(ds, batch_size=2)
+   optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
+   train(loader, model, optimizer)
+
+Tutorials
+------------------
+
+* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
+
+Contributions
+--------------------
+
+Contributions are very welcome.
+To learn more, see the `Contributor Guide`_.
+
+
+License
+-------
+
+Distributed under the terms of the `Apache 2.0 license`_,
+*DataChain* is free and open source software.
+
+
+Issues
+------
+
+If you encounter any problems,
+please `file an issue`_ along with a detailed description.
+
+
+.. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
+.. _PyPI: https://pypi.org/
+.. _file an issue: https://github.com/iterative/dvcx/issues
+.. _pip: https://pip.pypa.io/
+.. github-only
+.. _Contributor Guide: CONTRIBUTING.rst
+.. _Pydantic: https://github.com/pydantic/pydantic
{datachain-0.2.4.dist-info → datachain-0.2.6.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-datachain/__init__.py,sha256=
+datachain/__init__.py,sha256=wvf70BnoqaeYdxZYyZIw_wkykA7ZbFwvOZ2gRk-ZY5o,959
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
 datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
@@ -41,11 +41,11 @@ datachain/lib/arrow.py,sha256=FF3WWUOjB6Prw8ygfiLsrVfrdob0S01lPzEazuGqoO8,2556
 datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
 datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
 datachain/lib/clip.py,sha256=rDeZlFGs0DXBlpmh5ZQJhR9Sz13bWAZGQjfYm1hsUI4,5388
-datachain/lib/dc.py,sha256=
-datachain/lib/feature.py,sha256=
+datachain/lib/dc.py,sha256=D3cgib-U0Mo0x5wEK1_NfgymAldHqCvooZwtyohi53Q,34426
+datachain/lib/feature.py,sha256=iMwbMyQUyjRUeB-vhAucnx59kNSVvX_xEChTW5B9klY,12244
 datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
-datachain/lib/feature_utils.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/feature_utils.py,sha256=2yLdZd9o4AJ5QQX7kqgbCxCT78aT7HE12CLxQ6QRpbc,4982
+datachain/lib/file.py,sha256=LGBwC7tFU7VcSWk5kjPpEWPBQas5me69L2uTDNvYXGM,8326
 datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
 datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
 datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
@@ -53,11 +53,11 @@ datachain/lib/image.py,sha256=ZYfDqr9p-RRmWBeWFQwXLS1J3vQS616ykfMUvQVpqBY,2717
 datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
 datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
 datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
-datachain/lib/pytorch.py,sha256=
+datachain/lib/pytorch.py,sha256=Ea1sXhborF6zcywQjLpXgKnbr1lTez4Bfu3m0Gr78FI,5843
 datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=hD56hyO1H3A5H2oyTUwPcNu6UOQ_XY0DeA0nrXBqFaU,11492
 datachain/lib/text.py,sha256=PUT1O0jNJoQGsuhff2LgDpzTWk2eMdwIKqEDBrE448M,1307
-datachain/lib/udf.py,sha256=
+datachain/lib/udf.py,sha256=axMvqYz4tdyg_C3nyuOcDsu3Aqr19jWv2vl54U_8LQM,6595
 datachain/lib/udf_signature.py,sha256=CUKgoVpM_N8CgvMncpAw2RYchoiJdAGdDSdluoP0hIk,7161
 datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
@@ -92,9 +92,9 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
 datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
 datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
+datachain-0.2.6.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.6.dist-info/METADATA,sha256=j3Pq4f0toq25yDr9FAQhJaygkE7St8BTZVcigAw47t4,16475
+datachain-0.2.6.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
+datachain-0.2.6.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.6.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.6.dist-info/RECORD,,
datachain-0.2.4.dist-info/METADATA
DELETED
@@ -1,371 +0,0 @@
-Metadata-Version: 2.1
-Name: datachain
-Version: 0.2.4
-Summary: Wrangle unstructured AI data at scale
-Author-email: Dmitry Petrov <support@dvc.org>
-License: Apache-2.0
-Project-URL: Documentation, https://datachain.dvc.ai
-Project-URL: Issues, https://github.com/iterative/dvcx/issues
-Project-URL: Source, https://github.com/iterative/dvcx
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
-Description-Content-Type: text/x-rst
-License-File: LICENSE
-Requires-Dist: pyyaml
-Requires-Dist: tomlkit
-Requires-Dist: tqdm
-Requires-Dist: numpy
-Requires-Dist: pandas >=2.0.0
-Requires-Dist: pyarrow
-Requires-Dist: typing-extensions
-Requires-Dist: python-dateutil >=2
-Requires-Dist: attrs >=21.3.0
-Requires-Dist: s3fs >=2024.2.0
-Requires-Dist: gcsfs >=2024.2.0
-Requires-Dist: adlfs >=2024.2.0
-Requires-Dist: dvc-data <4,>=3.10
-Requires-Dist: dvc-objects <6,>=4
-Requires-Dist: shtab <2,>=1.3.4
-Requires-Dist: sqlalchemy >=2
-Requires-Dist: multiprocess ==0.70.16
-Requires-Dist: dill ==0.3.8
-Requires-Dist: ujson >=5.9.0
-Requires-Dist: pydantic <3,>=2
-Requires-Dist: jmespath >=1.0
-Requires-Dist: datamodel-code-generator >=0.25
-Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
-Provides-Extra: cv
-Requires-Dist: Pillow <11,>=10.0.0 ; extra == 'cv'
-Requires-Dist: torch >=2.1.0 ; extra == 'cv'
-Requires-Dist: torchvision ; extra == 'cv'
-Requires-Dist: transformers >=4.36.0 ; extra == 'cv'
-Provides-Extra: dev
-Requires-Dist: datachain[docs,tests] ; extra == 'dev'
-Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
-Requires-Dist: types-python-dateutil ; extra == 'dev'
-Requires-Dist: types-PyYAML ; extra == 'dev'
-Requires-Dist: types-requests ; extra == 'dev'
-Requires-Dist: types-ujson ; extra == 'dev'
-Provides-Extra: docs
-Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
-Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
-Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
-Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
-Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
-Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
-Provides-Extra: remote
-Requires-Dist: datachain[pandas] ; extra == 'remote'
-Requires-Dist: lz4 ; extra == 'remote'
-Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
-Requires-Dist: requests >=2.22.0 ; extra == 'remote'
-Provides-Extra: tests
-Requires-Dist: datachain[cv,pandas,remote,vector] ; extra == 'tests'
-Requires-Dist: pytest <9,>=8 ; extra == 'tests'
-Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
-Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
-Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
-Requires-Dist: pytest-servers[all] >=0.5.4 ; extra == 'tests'
-Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
-Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
-Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
-Requires-Dist: virtualenv ; extra == 'tests'
-Requires-Dist: dulwich ; extra == 'tests'
-Requires-Dist: hypothesis ; extra == 'tests'
-Requires-Dist: open-clip-torch ; extra == 'tests'
-Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
-Requires-Dist: requests-mock ; extra == 'tests'
-Provides-Extra: vector
-Requires-Dist: usearch ; extra == 'vector'
-
-|PyPI| |Python Version| |Codecov| |Tests| |License|
-
-.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
-   :target: https://pypi.org/project/datachain/
-   :alt: PyPI
-.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
-   :target: https://pypi.org/project/datachain
-   :alt: Python Version
-.. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
-   :target: https://app.codecov.io/gh/iterative/dvcx
-   :alt: Codecov
-.. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
-   :target: https://github.com/iterative/dvcx/actions?workflow=Tests
-   :alt: Tests
-.. |License| image:: https://img.shields.io/pypi/l/datachain
-   :target: https://opensource.org/licenses/Apache-2.0
-   :alt: License
-
-AI 🔗 DataChain
-----------------
-
-DataChain is an open-source Python data processing library for wrangling unstructured AI data at scale.
-
-It enables batch LLM API calls and local language and vision AI model inferences to run in parallel over many samples as chained operations resolving to table-like datasets. These datasets can be saved, versioned, and sent directly to PyTorch and TensorFlow for training. DataChain employs rigorous `Pydantic`_ data structures, promoting better data processing practices and enabling vectorized analytical operations normally found in databases.
-
-The DataChain fills the gap between dataframe libraries, data warehouses, and Python-based multimodal AI applications. Our primary use cases include massive data curation, LLM analytics and validation, batch image segmentation and pose detection, GenAI data alignment, etc.
-
-.. code:: console
-
-   $ pip install datachain
-
-Basic operation
----------------
-
-DataChain is built by composing wrangling operations.
-
-For example, it can be instructed to read files from the cloud, map them onto a modern AI service returning a Python object, parallelize API calls, save the result as a dataset, and export a column:
-
-.. code:: py
-
-   import os
-   import datachain as dc
-
-   from anthropic.types.message import Message
-   ClaudeModel = dc.pydantic_to_feature(Message)
-   PROMPT = "summarize this book in less than 200 words"
-   service = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
-   source = "gs://datachain-demo/mybooks/"
-
-   chain = dc.DataChain(source) \
-       .filter(File.name.glob("*.txt")) \
-       .settings(parallel=4) \
-       .map( \
-           claude = lambda file: \
-               ClaudeModel(**service.messages.create( \
-                   model="claude-3-haiku-20240307", \
-                   system=PROMPT, \
-                   messages=[{"role": "user", \
-                              "content": file.get_value()}] \
-               ), \
-           ).model_dump() \
-       ) \
-       .save("mydataset")
-
-   dc.DataChain("mydataset").export("./", "claude.response")  # export summaries
-
-Dataset persistence
--------------------
-
-In the example above, the chain resolves to a saved dataset “mydataset”. DataChain datasets are immutable and versioned. A saved dataset version can be used as a data source:
-
-.. code:: py
-
-   ds = dc.DataChain("mydataset", version = 1)
-
-Note that DataChain represents file samples as pointers into their respective storage locations. This means a newly created dataset version does not duplicate files in storage, and storage remains the single source of truth for the original samples
-
-Vectorized analytics
----------------------
-Since datasets are internally represented as tables, analytical queries can be vectorized:
-
-.. code:: py
-
-   rate = ds.filter(chain.response == "Success").count() / chain.count()  # ??
-   print(f"API class success rate: {100*rate:.2f}%")
-   >> 74.68%
-
-   price_input = 0.25
-   price_output = 1.25
-   price=(ds.sum(C.claude.usage.input_tokens)*price_input \
-       + ds.sum(C.claude.usage.output_tokens)*price_output)/1_000_000
-   print(f"Cost of API calls: ${price:.2f}")
-   >> Cost of API calls: $1.42
-
-
-Importing metadata
-------------------------
-
-It is common for AI data to come together with metadata (annotations, classes, etc).
-DataChain understands many metadata formats, and can connect data samples in storage with external metadata (e.g. CSV columns) to form a single dataset:
-
-.. code:: py
-
-   from dc import parse_csv
-
-   files = dc.DataChain("gs://datachain-demo/myimages/")
-   metadata = dc.DataChain("gs://datachain-demo/myimagesmetadata.csv") \
-       .gen(meta=parse_csv)  # TBD, also dependent on dropping file
-   dataset = chain1.merge(chain2, on = "file.name", right_on="name"])
-
-   print(dataset.select("file.name", "class", "prob").limit(5).to_pandas())
-   ....
-   ....
-   ....
-   ....
-   ....
-
-Nested annotations (like JSON) can be unrolled into rows and columns in the way that best fits the application. For example, the MS COCO dataset includes JSON annotations detailing segmentations. To build a dataset consisting of all segmented objects in all COCO images:
-
-.. code:: py
-
-   image_files = dc.DataChain("gs://datachain-demo/coco/images/")
-   image_meta = dc.DataChain("gs://datachain-demo/coco.json") \
-       .gen(meta=parse_json, key="images")  # list of images
-   images = image_files.merge(image_meta, on = "file.name", right_on="file_name")
-   objects_meta = dc.DataChain("gs://datachain-demo/coco.json") \
-       .gen(meta=parse_json, key="annotations")  # annotated objects
-
-   objects = image.full_merge(objects_meta, on = "id", right_on = "image_id")
-
-Generating metadata
----------------------
-
-A typical step in data curation is to create features from data samples for future selection. DataChain represents the newly created metadata as columns, which makes it easy to create new features and filter on them:
-
-.. code:: py
-
-   from fashion_clip.fashion_clip import FashionCLIP
-   from sqlalchemy import JSON
-   from tabulate import tabulate
-
-   from datachain.lib.param import Image
-   from datachain.query import C, DatasetQuery, udf
-
-
-   @udf(
-       params=(Image(),),
-       output={"fclip": JSON},
-       method="fashion_clip",
-       batch=10,
-   )
-   class MyFashionClip:
-       def __init__(self):
-           self.fclip = FashionCLIP("fashion-clip")
-
-       def fashion_clip(self, inputs):
-           embeddings = self.fclip.encode_images(
-               [input[0] for input in inputs], batch_size=1
-           )
-           return [(json.dumps(emb),) for emb in embeddings.tolist()]
-
-   chain = dc.DataChain("gs://datachain-demo/zalando/images/").filter(
-       C.name.glob("*.jpg")
-   ).limit(5).add_signals(MyFashionClip).save("zalando_hd_emb")
-
-   test_image = "cs://datachain-demo/zalando/test/banner.jpg"
-   test_embedding = MyFashionClip.fashion_clip.encode_images(Image(test_image))
-
-   best_matches = chain.filter(similarity_search(test_embeding)).limit(5)
-
-   print best_matches.to_result()
-
-
-Delta updates
--------------
-
-DataChain is capable of “delta updates” – that is, batch-processing only the newly added data samples. For example, let us copy some images into a local folder and run a chain to generate captions with a locally served captioning model from HuggingFace:
-
-.. code:: console
-
-   > mkdir demo-images/
-   > datachain cp gs://datachain-demo/images/ /tmp/demo-images
-
-
-.. code:: py
-
-   import torch
-
-   from datachain.lib.hf_image_to_text import LLaVAdescribe
-   from datachain.query import C, DatasetQuery
-
-   source = "/tmp/demo-images"
-
-   if torch.cuda.is_available():
-       device = "cuda"
-   else:
-       device = "cpu"
-
-   if __name__ == "__main__":
-       results = (
-           DatasetQuery(
-               source,
-               anon=True,
-           )
-           .filter(C.name.glob("*.jpg"))
-           .add_signals(
-               LLaVAdescribe(
-                   device=device,
-                   model=model,
-               ),
-               parallel=False,
-           )
-           .save("annotated-images")
-       )
-
-Now let us add few more more images to the same folder:
-
-.. code:: console
-
-   > datachain cp gs://datachain-demo/extra-images/ /tmp/demo-images
-
-and calculate updates only for the delta:
-
-.. code:: py
-
-   processed = dc.DataChain("annotated-images")
-   delta = dc.dataChain("/tmp/demo-images").subtract(processed)
-
-Passing data to training
-------------------------
-
-Datasets can be exported to CSV or webdataset formats. However, a much better way to pass data to training which avoids data copies and re-sharding is to wrap a DataChain dataset into a PyTorch class, and let the library take care of file downloads and caching under the hood:
-
-.. code:: py
-
-   ds = dc.DataChain("gs://datachain-demo/name-labeled/images/")
-       .filter(C.name.glob("*.jpg"))
-       .map(lambda name: (name[:3],), output={"label": str}, parallel=4)
-   )
-
-   train_loader = DataLoader(
-       ds.to_pytorch(
-           ImageReader(),
-           LabelReader("label", classes=CLASSES),
-           transform=transform,
-       ),
-       batch_size=16,
-       parallel=2,
-   )
-
-💻 More examples
-------------------
-
-* Curating images to train a custom CLIP model without re-sharding the Webdataset files
-* Batch-transforming and indexing images to create a searchable merchandise catalog
-* Evaluating an LLM application at scale
-* Ranking the LLM retrieval strategies
-* Delta updates in batch processing
-
-Contributions
---------------------
-
-Contributions are very welcome.
-To learn more, see the `Contributor Guide`_.
-
-
-License
--------
-
-Distributed under the terms of the `Apache 2.0 license`_,
-*DataChain* is free and open source software.
-
-
-Issues
-------
-
-If you encounter any problems,
-please `file an issue`_ along with a detailed description.
-
-
-.. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
-.. _PyPI: https://pypi.org/
-.. _file an issue: https://github.com/iterative/dvcx/issues
-.. _pip: https://pip.pypa.io/
-.. github-only
-.. _Contributor Guide: CONTRIBUTING.rst
-.. _Pydantic: https://github.com/pydantic/pydantic
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|