datachain 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


datachain/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '0.1.10'
-__version_tuple__ = version_tuple = (0, 1, 10)
+__version__ = version = '0.1.11'
+__version_tuple__ = version_tuple = (0, 1, 11)
datachain/catalog/catalog.py CHANGED
@@ -1580,10 +1580,54 @@ class Catalog:

         return dst

-    def open_object(self, row: RowDict, use_cache: bool = True, **config: Any):
+    def get_file_signals(
+        self, dataset_name: str, dataset_version: int, row: RowDict
+    ) -> Optional[dict]:
+        """
+        Returns file signals from a dataset row.
+        Note that signal names are returned without their prefix: if the row
+        had 'laion__file__source', the result will have just 'source'.
+        Example output:
+            {
+                "source": "s3://ldb-public",
+                "parent": "animals/dogs",
+                "name": "dog.jpg",
+                ...
+            }
+        """
+        from datachain.lib.signal_schema import SignalSchema
+
+        version = self.get_dataset(dataset_name).get_version(dataset_version)
+
+        file_signals_values = SignalSchema.deserialize(
+            version.feature_schema
+        ).get_file_signals_values(row)
+        if not file_signals_values:
+            return None
+
+        # there can be multiple file signals in a schema; we take the first
+        # one for now. In the future we might add the ability to choose
+        # which one to open the object from
+        return next(iter(file_signals_values.values()))
+
+    def open_object(
+        self,
+        dataset_name: str,
+        dataset_version: int,
+        row: RowDict,
+        use_cache: bool = True,
+        **config: Any,
+    ):
+        file_signals = self.get_file_signals(dataset_name, dataset_version, row)
+        if not file_signals:
+            raise RuntimeError("Cannot open object without file signals")
+
         config = config or self.client_config
-        client = self.get_client(row["source"], **config)
-        return client.open_object(self._get_row_uid(row), use_cache=use_cache)
+        client = self.get_client(file_signals["source"], **config)
+        return client.open_object(
+            self._get_row_uid(file_signals),  # type: ignore [arg-type]
+            use_cache=use_cache,
+        )

     def _get_row_uid(self, row: RowDict) -> UniqueId:
         return UniqueId(
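For orientation, a sketch of what these methods see, using a hypothetical dataset whose schema has a single file signal under `laion.file` (bucket and column names are made up):

    # Flattened row as stored in the warehouse (hypothetical values):
    row = {
        "laion__file__source": "s3://ldb-public",
        "laion__file__parent": "animals/dogs",
        "laion__file__name": "dog.jpg",
    }
    # get_file_signals() strips the "laion__file__" prefix and returns
    # {"source": "s3://ldb-public", "parent": "animals/dogs", "name": "dog.jpg"};
    # open_object() then picks a storage client from file_signals["source"]
    # and builds a UniqueId from the remaining fields to locate the blob.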
datachain/data_storage/metastore.py CHANGED
@@ -1142,6 +1142,8 @@ class AbstractDBMetastore(AbstractMetastore):
             if field == "schema":
                 dataset_version.update(**{field: DatasetRecord.parse_schema(value)})
                 values[field] = json.dumps(value) if value else None
+            elif field == "feature_schema":
+                values[field] = json.dumps(value) if value else None
             elif field == "preview" and isinstance(value, list):
                 values[field] = json.dumps(value, cls=JSONSerialize)
             else:
datachain/dataset.py CHANGED
@@ -157,7 +157,7 @@ class DatasetVersion:
     dataset_id: int
     version: int
     status: int
-    feature_schema: Optional[str]
+    feature_schema: dict
     created_at: datetime
     finished_at: Optional[datetime]
     error_message: str
@@ -199,7 +199,7 @@ class DatasetVersion:
             dataset_id,
             version,
             status,
-            feature_schema,
+            json.loads(feature_schema) if feature_schema else {},
             created_at,
             finished_at,
             error_message,
@@ -263,9 +263,9 @@ class DatasetRecord:
     labels: list[str]
     shadow: bool
     schema: dict[str, Union[SQLType, type[SQLType]]]
+    feature_schema: dict
     versions: list[DatasetVersion]
     status: int = DatasetStatus.CREATED
-    feature_schema: Optional[dict] = None
     created_at: Optional[datetime] = None
     finished_at: Optional[datetime] = None
     error_message: str = ""
@@ -320,8 +320,6 @@ class DatasetRecord:
         version_job_id: Optional[str] = None,
         version_is_job_result: bool = False,
     ) -> "DatasetRecord":
-        fr_schema = json.loads(feature_schema) if feature_schema else {}
-
         labels_lst: list[str] = json.loads(labels) if labels else []
         schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
         version_schema_dct: dict[str, str] = (
@@ -333,7 +331,7 @@ class DatasetRecord:
             version_dataset_id,
             version,
             version_status,
-            fr_schema,
+            version_feature_schema,
             version_created_at,
             version_finished_at,
             version_error_message,
@@ -356,9 +354,9 @@ class DatasetRecord:
             labels_lst,
             bool(shadow),
             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
+            json.loads(feature_schema) if feature_schema else {},
             [dataset_version],
             status,
-            fr_schema,
             created_at,
             finished_at,
             error_message,
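Net effect of the metastore and dataset changes above: `feature_schema` is JSON text at rest and a plain dict in memory. A minimal round-trip sketch (the schema contents are hypothetical):

    import json

    feature_schema = {"file": "File@1"}  # hypothetical serialized signal schema

    # write side (AbstractDBMetastore.update_dataset_version):
    stored = json.dumps(feature_schema) if feature_schema else None

    # read side (DatasetVersion.parse / DatasetRecord.parse):
    loaded = json.loads(stored) if stored else {}
    assert loaded == feature_schema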
datachain/lib/dc.py CHANGED
@@ -6,6 +6,7 @@ import sqlalchemy
 from datachain.lib.feature import Feature, FeatureType
 from datachain.lib.feature_utils import features_to_tuples
 from datachain.lib.file import File, get_file
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import (
@@ -219,6 +220,89 @@ class DataChain(DatasetQuery):
         """
         return DataChain(name=name, version=version)

+    @classmethod
+    def from_csv(
+        cls,
+        path,
+        type: Literal["binary", "text", "image"] = "text",
+        anon: bool = False,
+        spec: Optional[FeatureType] = None,
+        schema_from: Optional[str] = "auto",
+        show_schema: Optional[bool] = False,
+    ) -> "DataChain":
+        """Get data from CSV. It returns the chain itself.
+
+        Parameters
+        ----------
+        path : storage URI with directory. URI must start with storage prefix such
+            as `s3://`, `gs://`, `az://` or "file:///"
+        type : read file as "binary", "text", or "image" data. Default is "text".
+        anon : use anonymous mode to access the storage.
+        spec : optional Data Model
+        schema_from : path to sample to infer spec from
+        show_schema : print auto-generated schema
+
+        Examples
+        --------
+
+        >>> chain = DataChain.from_csv("gs://csv")
+        """
+        if schema_from == "auto":
+            schema_from = path
+
+        chain = DataChain.from_storage(path=path, type=type, anon=anon)
+        return chain.gen(
+            csv=read_meta(
+                schema_from=schema_from,
+                meta_type="csv",
+                spec=spec,
+                show_schema=show_schema,
+            )
+        )
+
+    @classmethod
+    def from_json(
+        cls,
+        path,
+        type: Literal["binary", "text", "image"] = "text",
+        anon: bool = False,
+        spec: Optional[FeatureType] = None,
+        schema_from: Optional[str] = "auto",
+        jmespath: Optional[str] = None,
+        show_schema: Optional[bool] = False,
+    ) -> "DataChain":
+        """Get data from JSON. It returns the chain itself.
+
+        Parameters
+        ----------
+        path : storage URI with directory. URI must start with storage prefix such
+            as `s3://`, `gs://`, `az://` or "file:///"
+        type : read file as "binary", "text", or "image" data. Default is "text".
+        anon : use anonymous mode to access the storage.
+        spec : optional Data Model
+        schema_from : path to sample to infer spec from
+        show_schema : print auto-generated schema
+        jmespath : JMESPath expression to reduce JSON
+
+        Examples
+        --------
+
+        >>> chain = DataChain.from_json("gs://json")
+        """
+        if schema_from == "auto":
+            schema_from = path
+
+        chain = DataChain.from_storage(path=path, type=type, anon=anon)
+        return chain.gen(
+            json=read_meta(
+                schema_from=schema_from,
+                meta_type="json",
+                spec=spec,
+                show_schema=show_schema,
+                jmespath=jmespath,
+            )
+        )
+
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None
     ) -> "DataChain":
@@ -408,7 +492,7 @@ class DataChain(DatasetQuery):
         chain.signals_schema = new_schema
         return chain

-    def get_values(self) -> Iterator[Sequence]:
+    def get_values(self) -> Iterator[list]:
         """Iterate over rows, getting feature values and applying reader calls."""
         for features in self.iterate():
             yield [fr.get_value() if isinstance(fr, Feature) else fr for fr in features]
@@ -607,3 +691,35 @@ class DataChain(DatasetQuery):

     def max(self, fr: FeatureType):  # type: ignore[override]
         return self._extend_features("max", fr)
+
+    @detach
+    def gen_random(self) -> "DataChain":
+        from random import getrandbits
+
+        from datachain.data_storage.warehouse import RANDOM_BITS
+
+        if "random" not in self.signals_schema.values:
+            chain = self.map(random=lambda: getrandbits(RANDOM_BITS), output=int).save()
+            return chain.select_except("random")
+
+        return self
+
+    @detach
+    def shuffle(self) -> "DataChain":
+        """Return results in deterministic random order."""
+        chain = self.gen_random()
+        return DatasetQuery.shuffle(chain)
+
+    @detach
+    def chunk(self, index: int, total: int) -> "DataChain":
+        """Split a query into smaller chunks for e.g. parallelization.
+
+        Example:
+            >>> dc = DataChain(...)
+            >>> chunk_1 = dc.chunk(0, 2)
+            >>> chunk_2 = dc.chunk(1, 2)
+
+        Note:
+            Bear in mind that `index` is 0-indexed but `total` isn't.
+            Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
+        """
+        chain = self.gen_random()
+        return DatasetQuery.chunk(chain, index, total)
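For example, fanning one saved dataset out across three workers would look roughly like this (dataset name hypothetical):

    dc = DataChain(name="my-dataset")  # hypothetical saved dataset
    total = 3
    parts = [dc.chunk(i, total) for i in range(total)]  # chunks 0/3, 1/3, 2/3
    # each worker processes one element of `parts`; gen_random() guarantees
    # the `random` column that shuffle orders by and chunk relies on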
datachain/lib/feature.py CHANGED
@@ -78,16 +78,6 @@ DATACHAIN_TO_TYPE = {
     JSON: dict,
 }

-NAMES_TO_TYPES = {
-    "int": int,
-    "str": str,
-    "float": float,
-    "bool": bool,
-    "list": list,
-    "dict": dict,
-    "bytes": bytes,
-    "datetime": datetime,
-}

 NUMPY_TO_DATACHAIN = {
     np.dtype("int8"): Int,
datachain/lib/meta_formats.py ADDED
@@ -0,0 +1,164 @@
+# pip install datamodel-code-generator
+# pip install jmespath
+#
+import csv
+import io
+import json
+import subprocess
+import sys
+import uuid
+from collections.abc import Iterator
+from typing import Any, Callable
+
+import jmespath as jsp
+
+from datachain.lib.feature_utils import pydantic_to_feature  # noqa: F401
+from datachain.lib.file import File
+
+# from datachain.lib.dc import C, DataChain
+
+
+def generate_uuid():
+    return uuid.uuid4()  # generates a random UUID
+
+
+# JSON decoder
+def load_json_from_string(json_string):
+    try:
+        data = json.loads(json_string)
+        print("Successfully parsed JSON", file=sys.stderr)
+        return data
+    except json.JSONDecodeError:
+        print("Failed to decode JSON: the string is not formatted correctly.")
+        return None
+
+
+# Read valid JSON and return a data object sample
+def process_json(data_string, jmespath):
+    json_dict = load_json_from_string(data_string)
+    if jmespath:
+        json_dict = jsp.search(jmespath, json_dict)
+    # we allow non-list JSONs here to print the root schema,
+    # but if a jmespath expression is given, we assume a list
+    if not isinstance(json_dict, list):
+        if jmespath:
+            raise ValueError("JMESPath expression must resolve to a list")
+        return None
+    json_dict = json_dict[0]  # sample the first object
+    return json.dumps(json_dict)
+
+
+# Print a dynamic datamodel-codegen output from JSON or CSV on stdout
+def read_schema(source_file, data_type="csv", expr=None):
+    data_string = ""
+    uid_str = str(generate_uuid()).replace("-", "")  # comply with Python class names
+    # using a uuid to get around issue #1617
+    model_name = f"Model{uid_str}"
+    try:
+        with source_file.open() as fd:  # CSV can be larger than memory
+            if data_type == "csv":
+                data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
+                data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
+            else:
+                data_string = fd.read()  # other meta must fit into RAM
+    except OSError as e:
+        print(f"An unexpected file error occurred: {e}")
+        return
+    if data_type == "json":
+        data_string = process_json(data_string, expr)
+    command = [
+        "datamodel-codegen",
+        "--input-file-type",
+        data_type,
+        "--class-name",
+        model_name,
+    ]
+    try:
+        result = subprocess.run(
+            command,  # noqa: S603
+            input=data_string,
+            text=True,
+            capture_output=True,
+            check=True,
+        )
+        model_output = (
+            result.stdout
+        )  # this will contain the output from datamodel-codegen
+    except subprocess.CalledProcessError as e:
+        model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
+    print(f"{model_output}")
+    print("\n" + f"spec=pydantic_to_feature({model_name})" + "\n")
+
+
+#
+# UDF mapper which calls the chain in its setup to infer the dynamic schema
+#
+def read_meta(
+    spec=None, schema_from=None, meta_type="json", jmespath=None, show_schema=False
+) -> Callable:
+    from datachain.lib.dc import DataChain
+
+    # ugly hack: datachain is run redirecting printed outputs to a variable
+    if schema_from:
+        captured_output = io.StringIO()
+        current_stdout = sys.stdout
+        sys.stdout = captured_output
+        try:
+            chain = (
+                DataChain.from_storage(schema_from)
+                .limit(1)
+                .map(  # dummy column created (#1615)
+                    meta_schema=lambda file: read_schema(
+                        file, data_type=meta_type, expr=jmespath
+                    ),
+                    output=str,
+                )
+            )
+            # dummy executor (#1616)
+            chain.save()
+        finally:
+            sys.stdout = current_stdout
+        model_output = captured_output.getvalue()
+        captured_output.close()
+        if show_schema:
+            print(f"{model_output}")
+        # below, 'spec' should be a Feature dynamically converted from the
+        # Pydantic datamodel
+        if not spec:
+            local_vars: dict[str, Any] = {}
+            exec(model_output, globals(), local_vars)  # noqa: S102
+            spec = local_vars["spec"]
+
+    if not spec and not schema_from:
+        raise ValueError(
+            "Must provide a static schema in spec: or metadata sample in schema_from:"
+        )
+
+    #
+    # UDF mapper parsing a JSON or CSV file using the schema spec
+    #
+    def parse_data(
+        file: File, data_model=spec, meta_type=meta_type, jmespath=jmespath
+    ) -> Iterator[spec]:
+        if meta_type == "csv":
+            # TODO: if the schema is statically given, should allow CSV without headers
+            with file.open() as fd:
+                reader = csv.DictReader(fd)
+                for row in reader:  # CSV can be larger than memory
+                    json_string = json.dumps(row)
+                    yield data_model.model_validate_json(json_string)
+        if meta_type == "json":
+            try:
+                with file.open() as fd:  # JSON must fit into RAM
+                    data_string = fd.read()
+            except OSError as e:
+                print(f"An unexpected file error occurred: {e}")
+                return
+            json_object = load_json_from_string(data_string)
+            if jmespath:
+                json_object = jsp.search(jmespath, json_object)
+            if not isinstance(json_object, list):
+                raise ValueError("JSON must resolve to a list of objects")
+            for json_dict in json_object:
+                json_string = json.dumps(json_dict)
+                yield data_model.model_validate_json(json_string)
+
+    return parse_data
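For intuition, the subprocess call in `read_schema` amounts to piping a data sample into the datamodel-codegen CLI; a standalone sketch (the sample data is made up):

    import subprocess

    sample = '{"name": "dog.jpg", "score": 0.9}'  # hypothetical JSON sample
    result = subprocess.run(
        ["datamodel-codegen", "--input-file-type", "json", "--class-name", "ModelX"],
        input=sample,
        text=True,
        capture_output=True,
        check=True,
    )
    print(result.stdout)  # a generated Pydantic model class named ModelX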
datachain/lib/pytorch.py CHANGED
@@ -1,6 +1,6 @@
 import logging
 from collections.abc import Iterator
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Callable, Optional

 from torch import float32
 from torch.distributed import get_rank, get_world_size
@@ -8,6 +8,7 @@ from torch.utils.data import IterableDataset, get_worker_info

 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
+from datachain.lib.text import convert_text

 if TYPE_CHECKING:
     from torchvision.transforms.v2 import Transform
@@ -17,14 +18,25 @@ logger = logging.getLogger("datachain")


 try:
+    from PIL import Image
     from torchvision.transforms import v2

     DEFAULT_TRANSFORM = v2.Compose([v2.ToImage(), v2.ToDtype(float32, scale=True)])
 except ImportError:
-    logger.warning("Missing dependency torchvision for computer vision transforms.")
+    logger.warning(
+        "Missing dependencies for computer vision:\n"
+        "To install run:\n\n"
+        "  pip install 'datachain[cv]'\n"
+    )
+    Image = None  # type: ignore[assignment]
+    v2 = None
     DEFAULT_TRANSFORM = None


+def label_to_int(value: str, classes: list) -> int:
+    return classes.index(value)
+
+
 class PytorchDataset(IterableDataset):
     def __init__(
         self,
@@ -32,6 +44,8 @@ class PytorchDataset(IterableDataset):
         version: Optional[int] = None,
         catalog: Optional["Catalog"] = None,
         transform: Optional["Transform"] = DEFAULT_TRANSFORM,
+        tokenizer: Optional[Callable] = None,
+        tokenizer_kwargs: Optional[dict[str, Any]] = None,
         num_samples: int = 0,
     ):
         """
@@ -41,13 +55,17 @@ class PytorchDataset(IterableDataset):
             name (str): Name of DataChain dataset to stream.
             version (int): Version of DataChain dataset to stream.
             catalog (Catalog): DataChain catalog to which dataset belongs.
-            transform (Transform): Torchvision v2 transforms to apply to the dataset.
+            transform (Transform): Torchvision transforms to apply to the dataset.
+            tokenizer (Callable): Tokenizer to use to tokenize text values.
+            tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
             num_samples (int): Number of random samples to draw for each epoch.
                 This argument is ignored if `num_samples=0` (the default).
         """
         self.name = name
         self.version = version
         self.transform = transform
+        self.tokenizer = tokenizer
+        self.tokenizer_kwargs = tokenizer_kwargs or {}
         self.num_samples = num_samples
         if catalog is None:
             catalog = get_catalog()
@@ -87,10 +105,21 @@ class PytorchDataset(IterableDataset):
             # Apply transforms
             if self.transform:
                 try:
-                    row = self.transform(row)
+                    if v2 and isinstance(self.transform, v2.Transform):
+                        row = self.transform(row)
+                    elif Image:
+                        for i, val in enumerate(row):
+                            if isinstance(val, Image.Image):
+                                row[i] = self.transform(val)
                 except ValueError:
                     logger.warning("Skipping transform due to unsupported data types.")
                     self.transform = None
+            if self.tokenizer:
+                for i, val in enumerate(row):
+                    if isinstance(val, str):
+                        row[i] = convert_text(
+                            val, self.tokenizer, self.tokenizer_kwargs
+                        )
             yield row

     @staticmethod
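A usage sketch for the new tokenizer hooks (the dataset name is hypothetical; any callable with a Hugging Face-style interface is assumed to work, since values are routed through `convert_text`):

    from transformers import AutoTokenizer

    from datachain.lib.pytorch import PytorchDataset

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    ds = PytorchDataset(
        "my-text-dataset",                     # hypothetical dataset name
        tokenizer=tokenizer,
        tokenizer_kwargs={"max_length": 512},  # forwarded to convert_text()
    )
    # while iterating, every str value in a row is replaced with
    # convert_text(value, tokenizer, tokenizer_kwargs)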
datachain/lib/signal_schema.py CHANGED
@@ -1,19 +1,45 @@
 import copy
 from collections.abc import Sequence
-from typing import Any, Optional, Union, get_args, get_origin
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, Optional, Union, get_args, get_origin

-from datachain.catalog import Catalog
 from datachain.lib.feature import (
     DATACHAIN_TO_TYPE,
     DEFAULT_DELIMITER,
-    NAMES_TO_TYPES,
     Feature,
     FeatureType,
     convert_type_to_datachain,
 )
 from datachain.lib.feature_registry import Registry
-from datachain.lib.file import File
+from datachain.lib.file import File, ImageFile, TextFile
 from datachain.lib.utils import DataChainParamsError
+from datachain.lib.webdataset import TarStream, WDSAllFile, WDSBasic
+from datachain.lib.webdataset_laion import Laion, LaionParquet, WDSLaion
+
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
+
+# TODO: fix hardcoded Feature class names with https://github.com/iterative/dvcx/issues/1625
+NAMES_TO_TYPES = {
+    "int": int,
+    "str": str,
+    "float": float,
+    "bool": bool,
+    "list": list,
+    "dict": dict,
+    "bytes": bytes,
+    "datetime": datetime,
+    "WDSLaion": WDSLaion,
+    "Laion": Laion,
+    "LaionParquet": LaionParquet,
+    "File": File,
+    "ImageFile": ImageFile,
+    "TextFile": TextFile,
+    "TarStream": TarStream,
+    "WDSBasic": WDSBasic,
+    "WDSAllFile": WDSAllFile,
+}


 class SignalSchemaError(DataChainParamsError):
@@ -74,7 +100,7 @@ class SignalSchema:
         signals: dict[str, FeatureType] = {}
         for signal, type_name in schema.items():
             try:
-                fr = NAMES_TO_TYPES.get(type_name, None)
+                fr = NAMES_TO_TYPES.get(type_name)
                 if not fr:
                     type_name, version = Registry.parse_name_version(type_name)
                     fr = Registry.get(type_name, version)
@@ -137,7 +163,7 @@ class SignalSchema:
     def slice(self, keys: Sequence[str]) -> "SignalSchema":
         return SignalSchema({k: v for k, v in self.values.items() if k in keys})

-    def row_to_features(self, row: Sequence, catalog: Catalog) -> list[FeatureType]:
+    def row_to_features(self, row: Sequence, catalog: "Catalog") -> list[FeatureType]:
         res = []
         pos = 0
         for fr_cls in self.values.values():
@@ -279,3 +305,34 @@ class SignalSchema:
         for signal in signals:
             res.append(".".join(signal))
         return res
+
+    def get_file_signals_values(self, row: dict[str, Any]) -> dict[str, Any]:
+        """
+        Returns values with clean field names (without prefix) for all file
+        signals found in this schema for a given row.
+        Output example:
+            {
+                "laion.file": {
+                    "source": "s3://ldb-public",
+                    "name": "dog.jpg",
+                    ...
+                },
+                "meta.file": {
+                    "source": "s3://datacomp",
+                    "name": "cat.jpg",
+                    ...
+                }
+            }
+        """
+        res = {}
+
+        for file_signals in self.get_file_signals():
+            prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
+            res[file_signals] = {
+                c_name.removeprefix(prefix): c_value
+                for c_name, c_value in row.items()
+                if c_name.startswith(prefix)
+                and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
+            }
+
+        return res
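Concretely, with the default `__` delimiter the method strips one prefix level and skips nested sub-signals; a sketch over a hypothetical row:

    row = {
        "laion__file__source": "s3://ldb-public",
        "laion__file__name": "dog.jpg",
        "laion__file__location__lat": 1.23,  # nested: filtered out by the
                                             # "no remaining delimiter" check
    }
    # assuming get_file_signals() yields "laion.file", the result is:
    # {"laion.file": {"source": "s3://ldb-public", "name": "dog.jpg"}}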
datachain/query/dataset.py CHANGED
@@ -1462,7 +1462,7 @@ class DatasetQuery:

         return cls.from_dataframe(pd_df, *args, **kwargs)

-    def shuffle(self) -> "DatasetQuery":
+    def shuffle(self) -> "Self":
         # TODO: implement shuffle based on a seed and/or a generated random column
         return self.order_by(C.random)
 
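The `Self` return type keeps subclass chains typed as the subclass; a minimal sketch with stand-in names:

    from typing import Self  # assumption: Python 3.11+, else typing_extensions.Self

    class Query:
        def shuffle(self) -> Self:  # typed as the class of the bound instance
            return self

    class Chain(Query):
        pass

    c = Chain().shuffle()  # statically typed as Chain, not Query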
datachain-0.1.10.dist-info/METADATA → datachain-0.1.11.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.1.10
+Version: 0.1.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -35,6 +35,8 @@ Requires-Dist: multiprocess ==0.70.16
 Requires-Dist: dill ==0.3.8
 Requires-Dist: ujson >=5.9.0
 Requires-Dist: pydantic <3,>=2
+Requires-Dist: jmespath >=1.0
+Requires-Dist: datamodel-code-generator >=0.25
 Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
 Provides-Extra: cv
 Requires-Dist: Pillow <11,>=10.0.0 ; extra == 'cv'
datachain-0.1.10.dist-info/RECORD → datachain-0.1.11.dist-info/RECORD RENAMED
@@ -1,12 +1,12 @@
 datachain/__init__.py,sha256=9a0qX6tqyA9KC3ahLmGarqlRTZJXhM7HijAWpfUaOnQ,102
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
-datachain/_version.py,sha256=0iLmzkTe5cfY4SBtaPpUzHn9tXwbwplszcfp5pHW6nU,413
+datachain/_version.py,sha256=HreDwlLXV189L3kiBj3huM_kqWD1usijlC8LN1YXcCM,413
 datachain/asyn.py,sha256=opARBVZJxTKU3EGYd-8gcpNXoshuCfVz_b0ut3oxC50,7641
 datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
 datachain/cli.py,sha256=1mBozBJS9Nq-EeahxwyKH8ef64E2v93o0CAEzxjcbkY,32209
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
-datachain/dataset.py,sha256=VojzbJxxmGQmL38kxp-hQNVPv2drZIR2SD2oHEo4Cqo,14512
+datachain/dataset.py,sha256=4ksFJlfo_CEmt5xqXPca-hhQL1syFpKxCl_ZOhTS30s,14506
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
 datachain/listing.py,sha256=-Cm74Ne2Q36QuCpA22feDA_v-7uPqkwAOg-QzkiZAGQ,8243
 datachain/node.py,sha256=jCBvwiEUYSKQa27Tb6RORgaUjoiz7mOX63NQmP7JQY0,5703
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=FW1LR5qCL5BtCYk-B-6LUCCMq8zOobkKKMrLqfFfCAg,13535
 datachain/catalog/__init__.py,sha256=Gkto1V7rUbVjJmgMEnB_VpVeHOfV47IQh1fSjEKnit4,409
-datachain/catalog/catalog.py,sha256=fSs4RDMA4Hl9svy3GoVBo-DMIwPJP6HUw_YndKRwYQY,77109
+datachain/catalog/catalog.py,sha256=7ZqCsyr7W4enOIX6jiLJbBfFZvjkqjI1E_NOyL3V3AA,78585
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=FTI9s1b8iX0_TffSAx1mwm-ucsRV14NHX-F1xtTXRSE,7310
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -32,7 +32,7 @@ datachain/data_storage/__init__.py,sha256=arlkQIj2J0ozcT_GvNDxm6PLT9NeabHvIsxPND
 datachain/data_storage/db_engine.py,sha256=mxOoWP4ntBMgLeTAk4dlEeIJArAz4x_tFrHytcAfLpo,3341
 datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=K-bW_um9qkDAZUCs9DJV5W3FlpeblQ9f4ulqgWQ4Isg,53528
+datachain/data_storage/metastore.py,sha256=GnJH2NlFngdj30aK9CSaimJNnh_x_pSjntWUnvQuI2A,53649
 datachain/data_storage/schema.py,sha256=pF3KBi-8Pz3n5jRYoJpDR3gF8qUFdyAu2XR58J4Fyuo,8724
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=eHTiJ0VIxU-chnhKNTN14EsaSnw5LAaxTLi9aMCZpl4,24978
@@ -40,8 +40,8 @@ datachain/data_storage/warehouse.py,sha256=sQLOrv6DH8UcWH1aqlg3YJKmaHr696XkVafBx
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/cached_stream.py,sha256=BQI6gpJ2y7_-jqQo_0VB9ntbkOVISvj9wlDwGDQbqw8,3537
 datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
-datachain/lib/dc.py,sha256=9lL6fNEkPxzkPE8fZqksCmRk3NBbmqMMZFi55j6OXGU,22224
-datachain/lib/feature.py,sha256=7ZZzGkafxKeYUfPN84hgQgdf8LzX54ikrne7itbTreI,14369
+datachain/lib/dc.py,sha256=kyuSg-l7HciqFaunqPx41WKyAeuJ2H2tpWJplCXhZJc,26086
+datachain/lib/feature.py,sha256=C5lxQ_Ef4rL0-mef4A4EeoqB0rcNZ0ExRE26ehx20RM,14196
 datachain/lib/feature_registry.py,sha256=hg_S_9JPEYaQ-8PI64mU0sEhSJ-rcrKtwQk5TPBotEw,1570
 datachain/lib/feature_utils.py,sha256=6wbKZ2xq08b751EFBRJy1OZLqWYd_gxq9A_Em_aMFk4,4713
 datachain/lib/file.py,sha256=ZNGzmJSq7PNVxLhGLNdR9YSYkP-1ZeqY_yhDMcDNfkI,8586
@@ -51,11 +51,12 @@ datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,
 datachain/lib/image.py,sha256=gb-My4rx5zMwOlDkcu_2G8GtRAMfsRvd7-QWUBErDw8,3486
 datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
 datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
+datachain/lib/meta_formats.py,sha256=-JAS47NOO6rx1vmr0Cy-G_txxmTvMflXfzJiFD7rWlQ,5742
 datachain/lib/parquet.py,sha256=_MbRBzcgLLLegjKZNGF9Rm9IkYRSy0IqOksVjL1nntg,917
-datachain/lib/pytorch.py,sha256=fdclv6ZqbCgzCDdMZFv3IBrRQT3V1nI8xRaGF4Oa44c,4224
+datachain/lib/pytorch.py,sha256=oU16XXAyAmiiabe1IoQoID00-u3uZ5GhCN48uAl6WDs,5421
 datachain/lib/reader.py,sha256=rPXXNoTUdm6PQwkAlaU-nOBreP_q4ett_EjFStrA_W0,1727
 datachain/lib/settings.py,sha256=mVtzyA_y9JA-6chMv1baggDvgeFsaUszySp660Gu4gw,2854
-datachain/lib/signal_schema.py,sha256=ST6lw5YGAwOWjZlwFxw1Qjfx0WTXt0lvCpb2Lk9Kn1E,10039
+datachain/lib/signal_schema.py,sha256=WPKHzgZ6HatbDQ2IN_L0JPi46n6acfHpkq91DYdlgSg,11753
 datachain/lib/text.py,sha256=EEZrYohADi5rAGg3aLLRwtvyAV9js_yWAGhr2C3QbwI,2424
 datachain/lib/udf.py,sha256=PeZ-UbprfxlmgVbzH4FtNib3kIhTi9C869QM8RuM5dw,6292
 datachain/lib/udf_signature.py,sha256=1cOMcGXHbdBjyBRkvNxIEt9A_CoyiADxio2wkYu8U5M,7140
@@ -67,7 +68,7 @@ datachain/lib/webdataset_laion.py,sha256=tHn3Zhqx7Eb5Ywy_mobs6jDI0o_pFUbsuHqv0W_
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=sOMxXbaNii7lVyFIEZ2noqbhy_S8qtZ-WWxrka72shc,3474
 datachain/query/builtins.py,sha256=RyVEPZEuC7K1vlulrsaUjATLG_tZEvYYW7N5i6Fg-tQ,2781
-datachain/query/dataset.py,sha256=shLAtpERSu-ZbeV0EWtE32oa0i7d4O3ma8WL38i5ba8,66869
+datachain/query/dataset.py,sha256=2DZAaEwX9gQlQgrRY3t-ymXN9SUkN_3XN0AfMFT6Mto,66861
 datachain/query/dispatch.py,sha256=9zcwKkLIuK5-xyRSQNw3yTqYLMHVbuZIn6KcB0g_ZBQ,13107
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/schema.py,sha256=CGu9NBIFvX4iHQnaThLLxwWndxqkyUtYmo2JBgnZ4YQ,7660
@@ -91,9 +92,9 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
 datachain/sql/sqlite/base.py,sha256=XVxn4pB-N4pPfiby5uVvfH7feNzRKlBNzsc5eyKPvhI,10965
 datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
 datachain/sql/sqlite/vector.py,sha256=stBeEW6fbVbILmAtV4khjXdJIGT13HkRWJeCoqIOk50,315
-datachain-0.1.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.1.10.dist-info/METADATA,sha256=PwiflznodH7Q2esyrf7GsTY45-O3fqWxjBbHzjVfLIk,13895
-datachain-0.1.10.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
-datachain-0.1.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.1.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.1.10.dist-info/RECORD,,
+datachain-0.1.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.1.11.dist-info/METADATA,sha256=BFTmlt8_vtCHF80AHQcIQkE9YMCigp7k1jcAZV1D7j4,13972
+datachain-0.1.11.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
+datachain-0.1.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.1.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.1.11.dist-info/RECORD,,