datachain 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/lib/dc.py +12 -20
- datachain/lib/feature_utils.py +0 -3
- datachain/lib/file.py +0 -3
- datachain/lib/pytorch.py +9 -2
- datachain/lib/signal_schema.py +49 -9
- datachain/lib/udf.py +0 -1
- {datachain-0.2.4.dist-info → datachain-0.2.5.dist-info}/METADATA +8 -3
- {datachain-0.2.4.dist-info → datachain-0.2.5.dist-info}/RECORD +12 -12
- {datachain-0.2.4.dist-info → datachain-0.2.5.dist-info}/LICENSE +0 -0
- {datachain-0.2.4.dist-info → datachain-0.2.5.dist-info}/WHEEL +0 -0
- {datachain-0.2.4.dist-info → datachain-0.2.5.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.4.dist-info → datachain-0.2.5.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
|
@@ -137,6 +137,7 @@ class DataChain(DatasetQuery):
|
|
|
137
137
|
indexing_column_types=File._datachain_column_types,
|
|
138
138
|
)
|
|
139
139
|
self._settings = Settings()
|
|
140
|
+
self._setup = {}
|
|
140
141
|
|
|
141
142
|
if self.feature_schema:
|
|
142
143
|
self.signals_schema = SignalSchema.deserialize(self.feature_schema)
|
|
@@ -536,9 +537,9 @@ class DataChain(DatasetQuery):
|
|
|
536
537
|
name = self.name or ""
|
|
537
538
|
|
|
538
539
|
sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
|
|
539
|
-
params_schema = self.signals_schema.slice(sign.params)
|
|
540
|
+
params_schema = self.signals_schema.slice(sign.params, self._setup)
|
|
540
541
|
|
|
541
|
-
return UDFBase._create(target_class, sign, params_schema
|
|
542
|
+
return UDFBase._create(target_class, sign, params_schema)
|
|
542
543
|
|
|
543
544
|
def _extend_features(self, method_name, *args, **kwargs):
|
|
544
545
|
super_func = getattr(super(), method_name)
|
|
@@ -569,18 +570,6 @@ class DataChain(DatasetQuery):
|
|
|
569
570
|
chain.signals_schema = new_schema
|
|
570
571
|
return chain
|
|
571
572
|
|
|
572
|
-
def get_values(self, *cols: str) -> Iterator[list]:
|
|
573
|
-
"""Iterate over rows, getting feature values and applying reader calls.
|
|
574
|
-
|
|
575
|
-
If columns are specified - limit them to specified columns.
|
|
576
|
-
"""
|
|
577
|
-
for features in self.iterate(*cols):
|
|
578
|
-
yield [fr.get_value() if isinstance(fr, Feature) else fr for fr in features] # type: ignore[union-attr,call-arg]
|
|
579
|
-
|
|
580
|
-
def get_one_value(self, col: str) -> Iterator:
|
|
581
|
-
for item in self.get_values(col):
|
|
582
|
-
yield item[0]
|
|
583
|
-
|
|
584
573
|
def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
|
|
585
574
|
"""Iterate over rows.
|
|
586
575
|
|
|
@@ -604,12 +593,6 @@ class DataChain(DatasetQuery):
|
|
|
604
593
|
def collect_one(self, col: str) -> list[FeatureType]:
|
|
605
594
|
return list(self.iterate_one(col))
|
|
606
595
|
|
|
607
|
-
def collect_values(self, *cols: str) -> list[list]:
|
|
608
|
-
return list(self.get_values(*cols))
|
|
609
|
-
|
|
610
|
-
def collect_one_value(self, col: str) -> list:
|
|
611
|
-
return list(self.get_one_value(col))
|
|
612
|
-
|
|
613
596
|
def to_pytorch(self, **kwargs):
|
|
614
597
|
"""Convert to pytorch dataset format."""
|
|
615
598
|
|
|
@@ -931,3 +914,12 @@ class DataChain(DatasetQuery):
|
|
|
931
914
|
|
|
932
915
|
def max(self, fr: FeatureType): # type: ignore[override]
|
|
933
916
|
return self._extend_features("max", fr)
|
|
917
|
+
|
|
918
|
+
def setup(self, **kwargs) -> "Self":
|
|
919
|
+
intersection = set(self._setup.keys()) & set(kwargs.keys())
|
|
920
|
+
if intersection:
|
|
921
|
+
keys = ", ".join(intersection)
|
|
922
|
+
raise DatasetPrepareError(self.name, f"this value(s) already setup: {keys}")
|
|
923
|
+
|
|
924
|
+
self._setup = self._setup | kwargs
|
|
925
|
+
return self
|
datachain/lib/feature_utils.py
CHANGED
|
@@ -12,9 +12,6 @@ from datachain.lib.feature import (
|
|
|
12
12
|
convert_type_to_datachain,
|
|
13
13
|
)
|
|
14
14
|
from datachain.lib.utils import DataChainParamsError
|
|
15
|
-
from datachain.query.schema import Column
|
|
16
|
-
|
|
17
|
-
FeatureLike = Union[type["Feature"], Column, str]
|
|
18
15
|
|
|
19
16
|
AUTO_FEATURE_PREFIX = "_auto_fr"
|
|
20
17
|
SUFFIX_SYMBOLS = string.digits + string.ascii_lowercase
|
datachain/lib/file.py
CHANGED
datachain/lib/pytorch.py
CHANGED
|
@@ -8,6 +8,7 @@ from torch.utils.data import IterableDataset, get_worker_info
|
|
|
8
8
|
|
|
9
9
|
from datachain.catalog import Catalog, get_catalog
|
|
10
10
|
from datachain.lib.dc import DataChain
|
|
11
|
+
from datachain.lib.feature import Feature
|
|
11
12
|
from datachain.lib.text import convert_text
|
|
12
13
|
|
|
13
14
|
if TYPE_CHECKING:
|
|
@@ -100,8 +101,14 @@ class PytorchDataset(IterableDataset):
|
|
|
100
101
|
if self.num_samples > 0:
|
|
101
102
|
ds = ds.sample(self.num_samples)
|
|
102
103
|
ds = ds.chunk(total_rank, total_workers)
|
|
103
|
-
stream = ds.get_values()
|
|
104
|
-
for
|
|
104
|
+
stream = ds.iterate()
|
|
105
|
+
for row_features in stream:
|
|
106
|
+
row = []
|
|
107
|
+
for fr in row_features:
|
|
108
|
+
if isinstance(fr, Feature):
|
|
109
|
+
row.append(fr.get_value()) # type: ignore[unreachable]
|
|
110
|
+
else:
|
|
111
|
+
row.append(fr)
|
|
105
112
|
# Apply transforms
|
|
106
113
|
if self.transform:
|
|
107
114
|
try:
|
datachain/lib/signal_schema.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
from collections.abc import Iterator, Sequence
|
|
3
3
|
from datetime import datetime
|
|
4
|
-
from typing import TYPE_CHECKING, Any, Optional, Union, get_args, get_origin
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin
|
|
5
5
|
|
|
6
6
|
from pydantic import create_model
|
|
7
7
|
|
|
@@ -42,6 +42,11 @@ class SignalResolvingError(SignalSchemaError):
|
|
|
42
42
|
super().__init__(f"cannot resolve signal name{name}: {msg}")
|
|
43
43
|
|
|
44
44
|
|
|
45
|
+
class SetupError(SignalSchemaError):
|
|
46
|
+
def __init__(self, name: str, msg: str):
|
|
47
|
+
super().__init__(f"cannot setup value '{name}': {msg}")
|
|
48
|
+
|
|
49
|
+
|
|
45
50
|
class SignalResolvingTypeError(SignalResolvingError):
|
|
46
51
|
def __init__(self, method: str, field):
|
|
47
52
|
super().__init__(
|
|
@@ -52,9 +57,31 @@ class SignalResolvingTypeError(SignalResolvingError):
|
|
|
52
57
|
|
|
53
58
|
|
|
54
59
|
class SignalSchema:
|
|
55
|
-
def __init__(
|
|
60
|
+
def __init__(
|
|
61
|
+
self,
|
|
62
|
+
values: dict[str, FeatureType],
|
|
63
|
+
setup: Optional[dict[str, Callable]] = None,
|
|
64
|
+
):
|
|
56
65
|
self.values = values
|
|
57
|
-
self.tree = self._build_tree()
|
|
66
|
+
self.tree = self._build_tree(values)
|
|
67
|
+
|
|
68
|
+
self.setup_func = setup or {}
|
|
69
|
+
self.setup_values = None
|
|
70
|
+
for key, func in self.setup_func.items():
|
|
71
|
+
if not callable(func):
|
|
72
|
+
raise SetupError(key, "value must be function or callable class")
|
|
73
|
+
|
|
74
|
+
def _init_setup_values(self):
|
|
75
|
+
if self.setup_values is not None:
|
|
76
|
+
return self.setup_values
|
|
77
|
+
|
|
78
|
+
res = {}
|
|
79
|
+
for key, func in self.setup_func.items():
|
|
80
|
+
try:
|
|
81
|
+
res[key] = func()
|
|
82
|
+
except Exception as ex:
|
|
83
|
+
raise SetupError(key, f"error when call function: '{ex}'") from ex
|
|
84
|
+
self.setup_values = res
|
|
58
85
|
|
|
59
86
|
@staticmethod
|
|
60
87
|
def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
|
|
@@ -111,16 +138,22 @@ class SignalSchema:
|
|
|
111
138
|
def to_udf_spec(self) -> dict[str, Any]:
|
|
112
139
|
res = {}
|
|
113
140
|
for path, type_, has_subtree, _ in self.get_flat_tree():
|
|
141
|
+
if path[0] in self.setup_func:
|
|
142
|
+
continue
|
|
114
143
|
if not has_subtree:
|
|
115
144
|
db_name = DEFAULT_DELIMITER.join(path)
|
|
116
145
|
res[db_name] = convert_type_to_datachain(type_)
|
|
117
146
|
return res
|
|
118
147
|
|
|
119
148
|
def row_to_objs(self, row: Sequence[Any]) -> list[FeatureType]:
|
|
149
|
+
self._init_setup_values()
|
|
150
|
+
|
|
120
151
|
objs = []
|
|
121
152
|
pos = 0
|
|
122
|
-
for fr_type in self.values.values():
|
|
123
|
-
if Feature.is_feature(fr_type):
|
|
153
|
+
for name, fr_type in self.values.items():
|
|
154
|
+
if val := self.setup_values.get(name, None): # type: ignore[attr-defined]
|
|
155
|
+
objs.append(val)
|
|
156
|
+
elif Feature.is_feature(fr_type):
|
|
124
157
|
j, pos = fr_type._unflatten_to_json_pos(row, pos) # type: ignore[union-attr]
|
|
125
158
|
objs.append(fr_type(**j))
|
|
126
159
|
else:
|
|
@@ -135,8 +168,14 @@ class SignalSchema:
|
|
|
135
168
|
if Feature.is_feature(fr)
|
|
136
169
|
)
|
|
137
170
|
|
|
138
|
-
def slice(
|
|
139
|
-
|
|
171
|
+
def slice(
|
|
172
|
+
self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None
|
|
173
|
+
) -> "SignalSchema":
|
|
174
|
+
setup = setup or {}
|
|
175
|
+
setup_no_types = dict.fromkeys(setup.keys(), str)
|
|
176
|
+
union = self.values | setup_no_types
|
|
177
|
+
schema = {k: union[k] for k in keys if k in union}
|
|
178
|
+
return SignalSchema(schema, setup)
|
|
140
179
|
|
|
141
180
|
def row_to_features(self, row: Sequence, catalog: "Catalog") -> list[FeatureType]:
|
|
142
181
|
res = []
|
|
@@ -235,10 +274,11 @@ class SignalSchema:
|
|
|
235
274
|
**fields,
|
|
236
275
|
)
|
|
237
276
|
|
|
238
|
-
|
|
277
|
+
@staticmethod
|
|
278
|
+
def _build_tree(values: dict[str, FeatureType]) -> dict[str, Any]:
|
|
239
279
|
res = {}
|
|
240
280
|
|
|
241
|
-
for name, val in self.values.items():
|
|
281
|
+
for name, val in values.items():
|
|
242
282
|
subtree = val.build_tree() if Feature.is_feature(val) else None # type: ignore[union-attr]
|
|
243
283
|
res[name] = (val, subtree)
|
|
244
284
|
|
datachain/lib/udf.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -59,12 +59,11 @@ Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
|
|
|
59
59
|
Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
|
|
60
60
|
Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
|
|
61
61
|
Provides-Extra: remote
|
|
62
|
-
Requires-Dist: datachain[pandas] ; extra == 'remote'
|
|
63
62
|
Requires-Dist: lz4 ; extra == 'remote'
|
|
64
63
|
Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
|
|
65
64
|
Requires-Dist: requests >=2.22.0 ; extra == 'remote'
|
|
66
65
|
Provides-Extra: tests
|
|
67
|
-
Requires-Dist: datachain[cv,
|
|
66
|
+
Requires-Dist: datachain[cv,remote,vector] ; extra == 'tests'
|
|
68
67
|
Requires-Dist: pytest <9,>=8 ; extra == 'tests'
|
|
69
68
|
Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
|
|
70
69
|
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
|
|
@@ -332,6 +331,12 @@ Datasets can be exported to CSV or webdataset formats. However, a much better wa
|
|
|
332
331
|
parallel=2,
|
|
333
332
|
)
|
|
334
333
|
|
|
334
|
+
Tutorials
|
|
335
|
+
------------------
|
|
336
|
+
|
|
337
|
+
* `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvcx/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
|
|
338
|
+
* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
|
|
339
|
+
|
|
335
340
|
💻 More examples
|
|
336
341
|
------------------
|
|
337
342
|
|
|
@@ -41,11 +41,11 @@ datachain/lib/arrow.py,sha256=FF3WWUOjB6Prw8ygfiLsrVfrdob0S01lPzEazuGqoO8,2556
|
|
|
41
41
|
datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
|
|
42
42
|
datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
|
|
43
43
|
datachain/lib/clip.py,sha256=rDeZlFGs0DXBlpmh5ZQJhR9Sz13bWAZGQjfYm1hsUI4,5388
|
|
44
|
-
datachain/lib/dc.py,sha256=
|
|
44
|
+
datachain/lib/dc.py,sha256=D3cgib-U0Mo0x5wEK1_NfgymAldHqCvooZwtyohi53Q,34426
|
|
45
45
|
datachain/lib/feature.py,sha256=QDloA9HE7URf9J_veKrguYBvSg-0cbXZFTswNxrKsB8,12135
|
|
46
46
|
datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
|
|
47
|
-
datachain/lib/feature_utils.py,sha256=
|
|
48
|
-
datachain/lib/file.py,sha256=
|
|
47
|
+
datachain/lib/feature_utils.py,sha256=oqRO_Mu3epOr1HPTxAJ8TxsJshUfKJQtulCDgHtInMI,4557
|
|
48
|
+
datachain/lib/file.py,sha256=LGBwC7tFU7VcSWk5kjPpEWPBQas5me69L2uTDNvYXGM,8326
|
|
49
49
|
datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
|
|
50
50
|
datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
|
|
51
51
|
datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
|
|
@@ -53,11 +53,11 @@ datachain/lib/image.py,sha256=ZYfDqr9p-RRmWBeWFQwXLS1J3vQS616ykfMUvQVpqBY,2717
|
|
|
53
53
|
datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
|
|
54
54
|
datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
|
|
55
55
|
datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
|
|
56
|
-
datachain/lib/pytorch.py,sha256=
|
|
56
|
+
datachain/lib/pytorch.py,sha256=Ea1sXhborF6zcywQjLpXgKnbr1lTez4Bfu3m0Gr78FI,5843
|
|
57
57
|
datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
|
|
58
|
-
datachain/lib/signal_schema.py,sha256=
|
|
58
|
+
datachain/lib/signal_schema.py,sha256=hD56hyO1H3A5H2oyTUwPcNu6UOQ_XY0DeA0nrXBqFaU,11492
|
|
59
59
|
datachain/lib/text.py,sha256=PUT1O0jNJoQGsuhff2LgDpzTWk2eMdwIKqEDBrE448M,1307
|
|
60
|
-
datachain/lib/udf.py,sha256=
|
|
60
|
+
datachain/lib/udf.py,sha256=axMvqYz4tdyg_C3nyuOcDsu3Aqr19jWv2vl54U_8LQM,6595
|
|
61
61
|
datachain/lib/udf_signature.py,sha256=CUKgoVpM_N8CgvMncpAw2RYchoiJdAGdDSdluoP0hIk,7161
|
|
62
62
|
datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
|
|
63
63
|
datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
|
|
@@ -92,9 +92,9 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
|
|
|
92
92
|
datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
|
|
93
93
|
datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
|
|
94
94
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
95
|
-
datachain-0.2.
|
|
96
|
-
datachain-0.2.
|
|
97
|
-
datachain-0.2.
|
|
98
|
-
datachain-0.2.
|
|
99
|
-
datachain-0.2.
|
|
100
|
-
datachain-0.2.
|
|
95
|
+
datachain-0.2.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
96
|
+
datachain-0.2.5.dist-info/METADATA,sha256=VDc20_FTRJRF63521iwyb67LMCYVXn2BqeER4IVc840,14810
|
|
97
|
+
datachain-0.2.5.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
98
|
+
datachain-0.2.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
99
|
+
datachain-0.2.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
100
|
+
datachain-0.2.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|