datachain-0.2.3-py3-none-any.whl → datachain-0.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/lib/dc.py CHANGED
@@ -137,6 +137,7 @@ class DataChain(DatasetQuery):
137
137
  indexing_column_types=File._datachain_column_types,
138
138
  )
139
139
  self._settings = Settings()
140
+ self._setup = {}
140
141
 
141
142
  if self.feature_schema:
142
143
  self.signals_schema = SignalSchema.deserialize(self.feature_schema)
@@ -536,9 +537,9 @@ class DataChain(DatasetQuery):
536
537
  name = self.name or ""
537
538
 
538
539
  sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
539
- params_schema = self.signals_schema.slice(sign.params)
540
+ params_schema = self.signals_schema.slice(sign.params, self._setup)
540
541
 
541
- return UDFBase._create(target_class, sign, params_schema, self.catalog)
542
+ return UDFBase._create(target_class, sign, params_schema)
542
543
 
543
544
  def _extend_features(self, method_name, *args, **kwargs):
544
545
  super_func = getattr(super(), method_name)
@@ -569,18 +570,6 @@ class DataChain(DatasetQuery):
569
570
  chain.signals_schema = new_schema
570
571
  return chain
571
572
 
572
- def get_values(self, *cols: str) -> Iterator[list]:
573
- """Iterate over rows, getting feature values and applying reader calls.
574
-
575
- If columns are specified - limit them to specified columns.
576
- """
577
- for features in self.iterate(*cols):
578
- yield [fr.get_value() if isinstance(fr, Feature) else fr for fr in features] # type: ignore[union-attr,call-arg]
579
-
580
- def get_one_value(self, col: str) -> Iterator:
581
- for item in self.get_values(col):
582
- yield item[0]
583
-
584
573
  def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
585
574
  """Iterate over rows.
586
575
 
@@ -604,12 +593,6 @@ class DataChain(DatasetQuery):
604
593
  def collect_one(self, col: str) -> list[FeatureType]:
605
594
  return list(self.iterate_one(col))
606
595
 
607
- def collect_values(self, *cols: str) -> list[list]:
608
- return list(self.get_values(*cols))
609
-
610
- def collect_one_value(self, col: str) -> list:
611
- return list(self.get_one_value(col))
612
-
613
596
  def to_pytorch(self, **kwargs):
614
597
  """Convert to pytorch dataset format."""
615
598
 
@@ -931,3 +914,12 @@ class DataChain(DatasetQuery):
931
914
 
932
915
  def max(self, fr: FeatureType): # type: ignore[override]
933
916
  return self._extend_features("max", fr)
917
+
918
+ def setup(self, **kwargs) -> "Self":
919
+ intersection = set(self._setup.keys()) & set(kwargs.keys())
920
+ if intersection:
921
+ keys = ", ".join(intersection)
922
+ raise DatasetPrepareError(self.name, f"this value(s) already setup: {keys}")
923
+
924
+ self._setup = self._setup | kwargs
925
+ return self
datachain/lib/feature_utils.py CHANGED
@@ -12,9 +12,6 @@ from datachain.lib.feature import (
12
12
  convert_type_to_datachain,
13
13
  )
14
14
  from datachain.lib.utils import DataChainParamsError
15
- from datachain.query.schema import Column
16
-
17
- FeatureLike = Union[type["Feature"], Column, str]
18
15
 
19
16
  AUTO_FEATURE_PREFIX = "_auto_fr"
20
17
  SUFFIX_SYMBOLS = string.digits + string.ascii_lowercase
datachain/lib/file.py CHANGED
@@ -238,9 +238,6 @@ class File(FileFeature):
238
238
  return self._catalog.get_client(self.source).fs
239
239
 
240
240
 
241
- BinaryFile = File
242
-
243
-
244
241
  class TextFile(File):
245
242
  def __init__(self, **kwargs):
246
243
  super().__init__(**kwargs)
datachain/lib/pytorch.py CHANGED
@@ -8,6 +8,7 @@ from torch.utils.data import IterableDataset, get_worker_info
8
8
 
9
9
  from datachain.catalog import Catalog, get_catalog
10
10
  from datachain.lib.dc import DataChain
11
+ from datachain.lib.feature import Feature
11
12
  from datachain.lib.text import convert_text
12
13
 
13
14
  if TYPE_CHECKING:
@@ -100,8 +101,14 @@ class PytorchDataset(IterableDataset):
100
101
  if self.num_samples > 0:
101
102
  ds = ds.sample(self.num_samples)
102
103
  ds = ds.chunk(total_rank, total_workers)
103
- stream = ds.get_values()
104
- for row in stream:
104
+ stream = ds.iterate()
105
+ for row_features in stream:
106
+ row = []
107
+ for fr in row_features:
108
+ if isinstance(fr, Feature):
109
+ row.append(fr.get_value()) # type: ignore[unreachable]
110
+ else:
111
+ row.append(fr)
105
112
  # Apply transforms
106
113
  if self.transform:
107
114
  try:
datachain/lib/signal_schema.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import copy
2
2
  from collections.abc import Iterator, Sequence
3
3
  from datetime import datetime
4
- from typing import TYPE_CHECKING, Any, Optional, Union, get_args, get_origin
4
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin
5
5
 
6
6
  from pydantic import create_model
7
7
 
@@ -42,6 +42,11 @@ class SignalResolvingError(SignalSchemaError):
42
42
  super().__init__(f"cannot resolve signal name{name}: {msg}")
43
43
 
44
44
 
45
+ class SetupError(SignalSchemaError):
46
+ def __init__(self, name: str, msg: str):
47
+ super().__init__(f"cannot setup value '{name}': {msg}")
48
+
49
+
45
50
  class SignalResolvingTypeError(SignalResolvingError):
46
51
  def __init__(self, method: str, field):
47
52
  super().__init__(
@@ -52,9 +57,31 @@ class SignalResolvingTypeError(SignalResolvingError):
52
57
 
53
58
 
54
59
  class SignalSchema:
55
- def __init__(self, values: dict[str, FeatureType]):
60
+ def __init__(
61
+ self,
62
+ values: dict[str, FeatureType],
63
+ setup: Optional[dict[str, Callable]] = None,
64
+ ):
56
65
  self.values = values
57
- self.tree = self._build_tree()
66
+ self.tree = self._build_tree(values)
67
+
68
+ self.setup_func = setup or {}
69
+ self.setup_values = None
70
+ for key, func in self.setup_func.items():
71
+ if not callable(func):
72
+ raise SetupError(key, "value must be function or callable class")
73
+
74
+ def _init_setup_values(self):
75
+ if self.setup_values is not None:
76
+ return self.setup_values
77
+
78
+ res = {}
79
+ for key, func in self.setup_func.items():
80
+ try:
81
+ res[key] = func()
82
+ except Exception as ex:
83
+ raise SetupError(key, f"error when call function: '{ex}'") from ex
84
+ self.setup_values = res
58
85
 
59
86
  @staticmethod
60
87
  def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
@@ -111,16 +138,22 @@ class SignalSchema:
111
138
  def to_udf_spec(self) -> dict[str, Any]:
112
139
  res = {}
113
140
  for path, type_, has_subtree, _ in self.get_flat_tree():
141
+ if path[0] in self.setup_func:
142
+ continue
114
143
  if not has_subtree:
115
144
  db_name = DEFAULT_DELIMITER.join(path)
116
145
  res[db_name] = convert_type_to_datachain(type_)
117
146
  return res
118
147
 
119
148
  def row_to_objs(self, row: Sequence[Any]) -> list[FeatureType]:
149
+ self._init_setup_values()
150
+
120
151
  objs = []
121
152
  pos = 0
122
- for fr_type in self.values.values():
123
- if Feature.is_feature(fr_type):
153
+ for name, fr_type in self.values.items():
154
+ if val := self.setup_values.get(name, None): # type: ignore[attr-defined]
155
+ objs.append(val)
156
+ elif Feature.is_feature(fr_type):
124
157
  j, pos = fr_type._unflatten_to_json_pos(row, pos) # type: ignore[union-attr]
125
158
  objs.append(fr_type(**j))
126
159
  else:
@@ -135,8 +168,14 @@ class SignalSchema:
135
168
  if Feature.is_feature(fr)
136
169
  )
137
170
 
138
- def slice(self, keys: Sequence[str]) -> "SignalSchema":
139
- return SignalSchema({k: self.values[k] for k in keys if k in self.values})
171
+ def slice(
172
+ self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None
173
+ ) -> "SignalSchema":
174
+ setup = setup or {}
175
+ setup_no_types = dict.fromkeys(setup.keys(), str)
176
+ union = self.values | setup_no_types
177
+ schema = {k: union[k] for k in keys if k in union}
178
+ return SignalSchema(schema, setup)
140
179
 
141
180
  def row_to_features(self, row: Sequence, catalog: "Catalog") -> list[FeatureType]:
142
181
  res = []
@@ -235,10 +274,11 @@ class SignalSchema:
235
274
  **fields,
236
275
  )
237
276
 
238
- def _build_tree(self) -> dict[str, Any]:
277
+ @staticmethod
278
+ def _build_tree(values: dict[str, FeatureType]) -> dict[str, Any]:
239
279
  res = {}
240
280
 
241
- for name, val in self.values.items():
281
+ for name, val in values.items():
242
282
  subtree = val.build_tree() if Feature.is_feature(val) else None # type: ignore[union-attr]
243
283
  res[name] = (val, subtree)
244
284
 
datachain/lib/udf.py CHANGED
@@ -64,7 +64,6 @@ class UDFBase(AbstractUDF):
64
64
  target_class: type["UDFBase"],
65
65
  sign: UdfSignature,
66
66
  params: SignalSchema,
67
- catalog,
68
67
  ) -> "UDFBase":
69
68
  if isinstance(sign.func, AbstractUDF):
70
69
  if not isinstance(sign.func, target_class): # type: ignore[unreachable]
{datachain-0.2.3.dist-info → datachain-0.2.5.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -59,12 +59,11 @@ Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
59
59
  Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
60
60
  Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
61
61
  Provides-Extra: remote
62
- Requires-Dist: datachain[pandas] ; extra == 'remote'
63
62
  Requires-Dist: lz4 ; extra == 'remote'
64
63
  Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
65
64
  Requires-Dist: requests >=2.22.0 ; extra == 'remote'
66
65
  Provides-Extra: tests
67
- Requires-Dist: datachain[cv,pandas,remote,vector] ; extra == 'tests'
66
+ Requires-Dist: datachain[cv,remote,vector] ; extra == 'tests'
68
67
  Requires-Dist: pytest <9,>=8 ; extra == 'tests'
69
68
  Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
70
69
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
@@ -332,6 +331,12 @@ Datasets can be exported to CSV or webdataset formats. However, a much better wa
332
331
  parallel=2,
333
332
  )
334
333
 
334
+ Tutorials
335
+ ------------------
336
+
337
+ * `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvcx/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
338
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
339
+
335
340
  💻  More examples
336
341
  ------------------
337
342
 
{datachain-0.2.3.dist-info → datachain-0.2.5.dist-info}/RECORD RENAMED
@@ -41,11 +41,11 @@ datachain/lib/arrow.py,sha256=FF3WWUOjB6Prw8ygfiLsrVfrdob0S01lPzEazuGqoO8,2556
41
41
  datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
42
42
  datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
43
43
  datachain/lib/clip.py,sha256=rDeZlFGs0DXBlpmh5ZQJhR9Sz13bWAZGQjfYm1hsUI4,5388
44
- datachain/lib/dc.py,sha256=Sf99R0oOqf7tlS2gieaG56z3bF7YVcMjhJOZrFRfFs8,34778
44
+ datachain/lib/dc.py,sha256=D3cgib-U0Mo0x5wEK1_NfgymAldHqCvooZwtyohi53Q,34426
45
45
  datachain/lib/feature.py,sha256=QDloA9HE7URf9J_veKrguYBvSg-0cbXZFTswNxrKsB8,12135
46
46
  datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
47
- datachain/lib/feature_utils.py,sha256=F4ZENO6tTQvd36a-O1AurYjFSUpoyZaT4qgXsKjQDts,4650
48
- datachain/lib/file.py,sha256=TdhsPYmG0Atkd_QAO997oA8AuM854wNbjjLLT1uiD2M,8346
47
+ datachain/lib/feature_utils.py,sha256=oqRO_Mu3epOr1HPTxAJ8TxsJshUfKJQtulCDgHtInMI,4557
48
+ datachain/lib/file.py,sha256=LGBwC7tFU7VcSWk5kjPpEWPBQas5me69L2uTDNvYXGM,8326
49
49
  datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
50
50
  datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
51
51
  datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
@@ -53,11 +53,11 @@ datachain/lib/image.py,sha256=ZYfDqr9p-RRmWBeWFQwXLS1J3vQS616ykfMUvQVpqBY,2717
53
53
  datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
54
54
  datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
55
55
  datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
56
- datachain/lib/pytorch.py,sha256=Z7iZCsqJzUT0PynVo23Xu4Fx7qIuuEZyH83R1tR5mfI,5561
56
+ datachain/lib/pytorch.py,sha256=Ea1sXhborF6zcywQjLpXgKnbr1lTez4Bfu3m0Gr78FI,5843
57
57
  datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
58
- datachain/lib/signal_schema.py,sha256=KTegbx-yMvtaKEoUxLgDx5MxMA8De-nmdtqnV1932N8,10151
58
+ datachain/lib/signal_schema.py,sha256=hD56hyO1H3A5H2oyTUwPcNu6UOQ_XY0DeA0nrXBqFaU,11492
59
59
  datachain/lib/text.py,sha256=PUT1O0jNJoQGsuhff2LgDpzTWk2eMdwIKqEDBrE448M,1307
60
- datachain/lib/udf.py,sha256=kMlOsHCVybnnq4AMtYqjylZH7x2tGE62FsDPOu9qhWM,6612
60
+ datachain/lib/udf.py,sha256=axMvqYz4tdyg_C3nyuOcDsu3Aqr19jWv2vl54U_8LQM,6595
61
61
  datachain/lib/udf_signature.py,sha256=CUKgoVpM_N8CgvMncpAw2RYchoiJdAGdDSdluoP0hIk,7161
62
62
  datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
63
63
  datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
@@ -92,9 +92,9 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
92
92
  datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
93
93
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
94
94
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
95
- datachain-0.2.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
- datachain-0.2.3.dist-info/METADATA,sha256=NmviJ7UsETesadrJjeyoYjeNqul6GMd9D4zDZLk23Co,14399
97
- datachain-0.2.3.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
98
- datachain-0.2.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
- datachain-0.2.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
- datachain-0.2.3.dist-info/RECORD,,
95
+ datachain-0.2.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
+ datachain-0.2.5.dist-info/METADATA,sha256=VDc20_FTRJRF63521iwyb67LMCYVXn2BqeER4IVc840,14810
97
+ datachain-0.2.5.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
98
+ datachain-0.2.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
+ datachain-0.2.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
+ datachain-0.2.5.dist-info/RECORD,,