datachain 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of datachain might be problematic.

@@ -621,10 +621,6 @@ class Catalog:
         code_ast.body[-1:] = new_expressions
         return code_ast

-    def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
-        config = config or self.client_config
-        return Client.parse_url(uri, self.cache, **config)
-
     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
         Return the client corresponding to the given source `uri`.
@@ -651,17 +647,16 @@ class Catalog:
         partial_path: Optional[str]

         client_config = client_config or self.client_config
-        client, path = self.parse_url(source, **client_config)
+        uri, path = Client.parse_url(source)
+        client = Client.get_client(source, self.cache, **client_config)
         stem = os.path.basename(os.path.normpath(path))
         prefix = (
             posixpath.dirname(path)
             if glob.has_magic(stem) or client.fs.isfile(source)
             else path
         )
-        storage_dataset_name = Storage.dataset_name(
-            client.uri, posixpath.join(prefix, "")
-        )
-        source_metastore = self.metastore.clone(client.uri)
+        storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
+        source_metastore = self.metastore.clone(uri)

         columns = [
             Column("path", String),
@@ -675,15 +670,13 @@ class Catalog:
         ]

         if skip_indexing:
-            source_metastore.create_storage_if_not_registered(client.uri)
-            storage = source_metastore.get_storage(client.uri)
-            source_metastore.init_partial_id(client.uri)
-            partial_id = source_metastore.get_next_partial_id(client.uri)
+            source_metastore.create_storage_if_not_registered(uri)
+            storage = source_metastore.get_storage(uri)
+            source_metastore.init_partial_id(uri)
+            partial_id = source_metastore.get_next_partial_id(uri)

-            source_metastore = self.metastore.clone(
-                uri=client.uri, partial_id=partial_id
-            )
-            source_metastore.init(client.uri)
+            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
+            source_metastore.init(uri)

             source_warehouse = self.warehouse.clone()
             dataset = self.create_dataset(
@@ -701,20 +694,16 @@ class Catalog:
             in_progress,
             partial_id,
             partial_path,
-        ) = source_metastore.register_storage_for_indexing(
-            client.uri, force_update, prefix
-        )
+        ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
         if in_progress:
             raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")

         if not need_index:
             assert partial_id is not None
             assert partial_path is not None
-            source_metastore = self.metastore.clone(
-                uri=client.uri, partial_id=partial_id
-            )
+            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
             source_warehouse = self.warehouse.clone()
-            dataset = self.get_dataset(Storage.dataset_name(client.uri, partial_path))
+            dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
             lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
             logger.debug(
                 "Using cached listing %s. Valid till: %s",
@@ -731,11 +720,11 @@ class Catalog:

             return lst, path

-        source_metastore.init_partial_id(client.uri)
-        partial_id = source_metastore.get_next_partial_id(client.uri)
+        source_metastore.init_partial_id(uri)
+        partial_id = source_metastore.get_next_partial_id(uri)

-        source_metastore.init(client.uri)
-        source_metastore = self.metastore.clone(uri=client.uri, partial_id=partial_id)
+        source_metastore.init(uri)
+        source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)

         source_warehouse = self.warehouse.clone()

@@ -1370,7 +1359,7 @@ class Catalog:

     def signed_url(self, source: str, path: str, client_config=None) -> str:
         client_config = client_config or self.client_config
-        client, _ = self.parse_url(source, **client_config)
+        client = Client.get_client(source, self.cache, **client_config)
         return client.url(path)

     def export_dataset_table(
datachain/client/fsspec.py CHANGED
@@ -116,15 +116,16 @@ class Client(ABC):
         return DATA_SOURCE_URI_PATTERN.match(name) is not None

     @staticmethod
-    def parse_url(
-        source: str,
-        cache: DataChainCache,
-        **kwargs,
-    ) -> tuple["Client", str]:
+    def parse_url(source: str) -> tuple[StorageURI, str]:
+        cls = Client.get_implementation(source)
+        storage_name, rel_path = cls.split_url(source)
+        return cls.get_uri(storage_name), rel_path
+
+    @staticmethod
+    def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
-        storage_url, rel_path = cls.split_url(source)
-        client = cls.from_name(storage_url, cache, kwargs)
-        return client, rel_path
+        storage_url, _ = cls.split_url(source)
+        return cls.from_name(storage_url, cache, kwargs)

     @classmethod
     def create_fs(cls, **kwargs) -> "AbstractFileSystem":
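In short, the old `Client.parse_url(source, cache, **kwargs)` that returned a client plus a relative path is split into a cache-free `parse_url` (storage URI and path only) and a `get_client` factory. A minimal migration sketch, mirroring how the catalog code above now calls it (the `catalog` variable and the S3 path are illustrative, not part of the diff):

    from datachain.client import Client

    source = "s3://my-bucket/images/"          # hypothetical source URI
    uri, rel_path = Client.parse_url(source)   # no cache argument anymore
    client = Client.get_client(source, catalog.cache, **catalog.client_config)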
datachain/dataset.py CHANGED
@@ -112,7 +112,7 @@ class DatasetDependency:

         if is_listing_dataset(dataset_name):
             dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
-            dependency_name = listing_uri_from_name(dataset_name)
+            dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))

         return cls(
             id,
datachain/lib/arrow.py CHANGED
@@ -13,8 +13,10 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
+    from datasets.features.features import Features
     from pydantic import BaseModel

+    from datachain.lib.data_model import DataType
     from datachain.lib.dc import DataChain


@@ -46,7 +48,10 @@ class ArrowGenerator(Generator):
         self.kwargs = kwargs

     def process(self, file: File):
-        if self.nrows:
+        if file._caching_enabled:
+            path = file.get_local_path(download=True)
+            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+        elif self.nrows:
             path = _nrows_file(file, self.nrows)
             ds = dataset(path, schema=self.input_schema, **self.kwargs)
         else:
@@ -54,6 +59,7 @@
             ds = dataset(
                 path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
             )
+        hf_schema = _get_hf_schema(ds.schema)
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
             for record_batch in ds.to_batches():
@@ -62,9 +68,17 @@
                 if self.output_schema:
                     fields = self.output_schema.model_fields
                     vals_dict = {}
-                    for (field, field_info), val in zip(fields.items(), vals):
-                        if ModelStore.is_pydantic(field_info.annotation):
-                            vals_dict[field] = field_info.annotation(**val)  # type: ignore[misc]
+                    for i, ((field, field_info), val) in enumerate(
+                        zip(fields.items(), vals)
+                    ):
+                        anno = field_info.annotation
+                        if hf_schema:
+                            from datachain.lib.hf import convert_feature
+
+                            feat = list(hf_schema[0].values())[i]
+                            vals_dict[field] = convert_feature(val, feat, anno)
+                        elif ModelStore.is_pydantic(anno):
+                            vals_dict[field] = anno(**val)  # type: ignore[misc]
                         else:
                             vals_dict[field] = val
                     vals = [self.output_schema(**vals_dict)]
@@ -91,26 +105,36 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
             "Error generating output from Arrow schema - "
             f"Schema has {len(schema)} columns but got {len(col_names)} column names."
         )
-    default_column = 0
+    if not col_names:
+        col_names = schema.names
+    columns = _convert_col_names(col_names)  # type: ignore[arg-type]
+    hf_schema = _get_hf_schema(schema)
+    if hf_schema:
+        return {
+            column: hf_type for hf_type, column in zip(hf_schema[1].values(), columns)
+        }
     output = {}
-    for i, field in enumerate(schema):
-        if col_names:
-            column = col_names[i]
-        else:
-            column = field.name
-        column = column.lower()
-        column = re.sub("[^0-9a-z_]+", "", column)
-        if not column:
-            column = f"c{default_column}"
-            default_column += 1
+    for field, column in zip(schema, columns):
         dtype = arrow_type_mapper(field.type, column)  # type: ignore[assignment]
         if field.nullable and not ModelStore.is_pydantic(dtype):
            dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
-
     return output


+def _convert_col_names(col_names: Sequence[str]) -> list[str]:
+    default_column = 0
+    converted_col_names = []
+    for column in col_names:
+        column = column.lower()
+        column = re.sub("[^0-9a-z_]+", "", column)
+        if not column:
+            column = f"c{default_column}"
+            default_column += 1
+        converted_col_names.append(column)
+    return converted_col_names
+
+
 def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
@@ -156,3 +180,14 @@ def _nrows_file(file: File, nrows: int) -> str:
         writer.write(line)
         writer.write("\n")
     return tf.name
+
+
+def _get_hf_schema(
+    schema: "pa.Schema",
+) -> Optional[tuple["Features", dict[str, "DataType"]]]:
+    if schema.metadata and b"huggingface" in schema.metadata:
+        from datachain.lib.hf import get_output_schema, schema_from_arrow
+
+        features = schema_from_arrow(schema)
+        return features, get_output_schema(features)
+    return None
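Taken together, the arrow.py changes let a tabular read run against a locally cached copy of the file and recover Hugging Face feature types from parquet metadata. A hedged usage sketch, assuming the `from_storage`/`settings`/`parse_tabular` chain behaves as elsewhere in the 0.3.x API (the bucket path is illustrative):

    from datachain.lib.dc import DataChain

    chain = (
        DataChain.from_storage("s3://my-bucket/data/", type="binary")  # hypothetical bucket
        .settings(cache=True)   # ArrowGenerator then reads via get_local_path(download=True)
        .parse_tabular(format="parquet")
    )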
datachain/lib/dc.py CHANGED
@@ -408,7 +408,11 @@ class DataChain(DatasetQuery):
                 in_memory=in_memory,
             )
             .gen(
-                list_bucket(list_uri, client_config=session.catalog.client_config),
+                list_bucket(
+                    list_uri,
+                    session.catalog.cache,
+                    client_config=session.catalog.client_config,
+                ),
                 output={f"{object_name}": File},
             )
             .save(list_dataset_name, listing=True)
@@ -1523,7 +1527,8 @@
             output = {"split": str}

         model_name = model_name or object_name or ""
-        output = output | get_output_schema(next(iter(ds_dict.values())), model_name)
+        hf_features = next(iter(ds_dict.values())).features
+        output = output | get_output_schema(hf_features, model_name)
         model = dict_to_data_model(model_name, output)
         if object_name:
             output = {object_name: model}
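The Hugging Face entry point now pulls `features` off the first split before building the output schema, matching the new `get_output_schema(features, model_name)` signature. A rough sketch of that call, with the dataset name purely illustrative:

    from datasets import load_dataset
    from datachain.lib.hf import get_output_schema

    ds = load_dataset("beans", split="train")              # illustrative HF dataset
    output = {"split": str} | get_output_schema(ds.features, "beans")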
datachain/lib/file.py CHANGED
@@ -1,5 +1,6 @@
 import io
 import json
+import logging
 import os
 import posixpath
 from abc import ABC, abstractmethod
@@ -15,6 +16,9 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from PIL import Image
 from pydantic import Field, field_validator

+if TYPE_CHECKING:
+    from typing_extensions import Self
+
 from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
@@ -25,6 +29,8 @@ from datachain.utils import TIME_ZERO
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

+logger = logging.getLogger("datachain")
+
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]

@@ -251,14 +257,18 @@ class File(DataModel):
         dump = self.model_dump()
         return UniqueId(*(dump[k] for k in self._unique_id_keys))

-    def get_local_path(self) -> Optional[str]:
+    def get_local_path(self, download: bool = False) -> Optional[str]:
         """Returns path to a file in a local cache.
         Return None if file is not cached. Throws an exception if cache is not setup."""
         if self._catalog is None:
             raise RuntimeError(
                 "cannot resolve local file path because catalog is not setup"
             )
-        return self._catalog.cache.get_path(self.get_uid())
+        uid = self.get_uid()
+        if download:
+            client = self._catalog.get_client(self.source)
+            client.download(uid, callback=self._download_cb)
+        return self._catalog.cache.get_path(uid)

     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
@@ -313,6 +323,70 @@
         """Returns `fsspec` filesystem for the file."""
         return self._catalog.get_client(self.source).fs

+    def resolve(self) -> "Self":
+        """
+        Resolve a File object by checking its existence and updating its metadata.
+
+        Returns:
+            File: The resolved File object with updated metadata.
+        """
+        if self._catalog is None:
+            raise RuntimeError("Cannot resolve file: catalog is not set")
+
+        try:
+            client = self._catalog.get_client(self.source)
+        except NotImplementedError as e:
+            raise RuntimeError(
+                f"Unsupported protocol for file source: {self.source}"
+            ) from e
+
+        try:
+            info = client.fs.info(client.get_full_path(self.path))
+            converted_info = client.info_to_file(info, self.source)
+            return type(self)(
+                path=self.path,
+                source=self.source,
+                size=converted_info.size,
+                etag=converted_info.etag,
+                version=converted_info.version,
+                is_latest=converted_info.is_latest,
+                last_modified=converted_info.last_modified,
+                location=self.location,
+            )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logger.warning("File system error when resolving %s: %s", self.path, str(e))
+
+        return type(self)(
+            path=self.path,
+            source=self.source,
+            size=0,
+            etag="",
+            version="",
+            is_latest=True,
+            last_modified=TIME_ZERO,
+            location=self.location,
+        )
+
+
+def resolve(file: File) -> File:
+    """
+    Resolve a File object by checking its existence and updating its metadata.
+
+    This function is a wrapper around the File.resolve() method, designed to be
+    used as a mapper in DataChain operations.
+
+    Args:
+        file (File): The File object to resolve.
+
+    Returns:
+        File: The resolved File object with updated metadata.
+
+    Raises:
+        RuntimeError: If the file's catalog is not set or if
+            the file source protocol is unsupported.
+    """
+    return file.resolve()
+

 class TextFile(File):
     """`DataModel` for reading text files."""
datachain/lib/hf.py CHANGED
@@ -15,7 +15,7 @@ try:
         Value,
         load_dataset,
     )
-    from datasets.features.features import string_to_arrow
+    from datasets.features.features import Features, string_to_arrow
     from datasets.features.image import image_to_bytes

 except ImportError as exc:
@@ -36,6 +36,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
+    import pyarrow as pa
     from pydantic import BaseModel


@@ -71,6 +72,15 @@ class HFGenerator(Generator):
         *args,
         **kwargs,
     ):
+        """
+        Generator for chain from huggingface datasets.
+
+        Parameters:
+
+        ds : Path or name of the dataset to read from Hugging Face Hub,
+            or an instance of `datasets.Dataset`-like object.
+        output_schema : Pydantic model for validation.
+        """
         super().__init__()
         self.ds = ds
         self.output_schema = output_schema
@@ -92,7 +102,7 @@
                     output_dict["split"] = split
                 for name, feat in ds.features.items():
                     anno = self.output_schema.model_fields[name].annotation
-                    output_dict[name] = _convert_feature(row[name], feat, anno)
+                    output_dict[name] = convert_feature(row[name], feat, anno)
                 yield self.output_schema(**output_dict)
                 pbar.update(1)

@@ -106,7 +116,7 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
     return {"": ds}


-def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
+def convert_feature(val: Any, feat: Any, anno: Any) -> Any:  # noqa: PLR0911
     if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
         return val
     if isinstance(feat, ClassLabel):
@@ -117,20 +127,23 @@ def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
             for sname in val:
                 sfeat = feat.feature[sname]
                 sanno = anno.model_fields[sname].annotation
-                sdict[sname] = [_convert_feature(v, sfeat, sanno) for v in val[sname]]
+                sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
             return anno(**sdict)
         return val
     if isinstance(feat, Image):
+        if isinstance(val, dict):
+            return HFImage(img=val["bytes"])
         return HFImage(img=image_to_bytes(val))
     if isinstance(feat, Audio):
         return HFAudio(**val)


 def get_output_schema(
-    ds: Union[Dataset, IterableDataset], model_name: str = ""
+    features: Features, model_name: str = "", stream: bool = True
 ) -> dict[str, DataType]:
+    """Generate UDF output schema from huggingface datasets features."""
     fields_dict = {}
-    for name, val in ds.features.items():
+    for name, val in features.items():
         fields_dict[name] = _feature_to_chain_type(name, val)  # type: ignore[assignment]
     return fields_dict  # type: ignore[return-value]

@@ -165,3 +178,7 @@ def _feature_to_chain_type(name: str, val: Any) -> type:  # noqa: PLR0911
     if isinstance(val, Audio):
         return HFAudio
     raise TypeError(f"Unknown huggingface datasets type {type(val)}")
+
+
+def schema_from_arrow(schema: "pa.Schema"):
+    return Features.from_arrow_schema(schema)
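The now-public `convert_feature` and the new `schema_from_arrow` helper are what arrow.py uses to round-trip Hugging Face feature types through parquet. A minimal sketch of that flow under the same metadata check (the parquet path is illustrative):

    import pyarrow.dataset as pa_ds
    from datachain.lib.hf import get_output_schema, schema_from_arrow

    ds = pa_ds.dataset("data.parquet")                 # hypothetical HF-exported parquet
    if ds.schema.metadata and b"huggingface" in ds.schema.metadata:
        features = schema_from_arrow(ds.schema)        # datasets.Features
        output = get_output_schema(features)           # {column: DataChain type}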
datachain/lib/listing.py CHANGED
@@ -20,7 +20,7 @@ LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name


-def list_bucket(uri: str, client_config=None) -> Callable:
+def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
     from bucket where each File represents one bucket entry.
@@ -28,7 +28,8 @@ def list_bucket(uri: str, client_config=None) -> Callable:

    def list_func() -> Iterator[File]:
        config = client_config or {}
-        client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
+        client = Client.get_client(uri, cache, **config)  # type: ignore[arg-type]
+        _, path = Client.parse_url(uri)
        for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
            yield from entries

@@ -76,16 +77,17 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
-    client, path = Client.parse_url(uri, cache, **client_config)
+    client = Client.get_client(uri, cache, **client_config)
+    storage_uri, path = Client.parse_url(uri)

     # clean path without globs
     lst_uri_path = (
         posixpath.dirname(path) if uses_glob(path) or client.fs.isfile(uri) else path
     )

-    lst_uri = f"{client.uri}/{lst_uri_path.lstrip('/')}"
+    lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
     ds_name = (
-        f"{LISTING_PREFIX}{client.uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
+        f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )

     return ds_name, lst_uri, path
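Callers of `list_bucket` and `parse_listing_uri` now pass the catalog cache explicitly, mirroring `Client.get_client`. A hedged sketch of the new calls (the `catalog` object is illustrative):

    from datachain.lib.listing import list_bucket, parse_listing_uri

    uri = "gs://my-bucket/raw/"   # hypothetical bucket
    ds_name, lst_uri, path = parse_listing_uri(uri, catalog.cache, catalog.client_config)
    list_func = list_bucket(uri, catalog.cache, client_config=catalog.client_config)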
datachain/lib/listing_info.py CHANGED
@@ -13,8 +13,8 @@ class ListingInfo(DatasetInfo):

     @property
     def storage_uri(self) -> str:
-        client, _ = Client.parse_url(self.uri, None)  # type: ignore[arg-type]
-        return client.uri
+        uri, _ = Client.parse_url(self.uri)
+        return uri

     @property
     def expires(self) -> Optional[datetime]:
datachain/lib/signal_schema.py CHANGED
@@ -386,11 +386,20 @@ class SignalSchema:
             else:
                 json, pos = unflatten_to_json_pos(fr, row, pos)  # type: ignore[union-attr]
                 obj = fr(**json)
-                if isinstance(obj, File):
-                    obj._set_stream(catalog, caching_enabled=cache)
+                SignalSchema._set_file_stream(obj, catalog, cache)
                 res.append(obj)
         return res

+    @staticmethod
+    def _set_file_stream(
+        obj: BaseModel, catalog: "Catalog", cache: bool = False
+    ) -> None:
+        if isinstance(obj, File):
+            obj._set_stream(catalog, caching_enabled=cache)
+        for field, finfo in obj.model_fields.items():
+            if ModelStore.is_pydantic(finfo.annotation):
+                SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
+
     def db_signals(
         self, name: Optional[str] = None, as_columns=False
     ) -> Union[list[str], list[Column]]:
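`_set_file_stream` now recurses into nested Pydantic fields, so a `File` wrapped inside another signal model also gets its catalog and stream attached when rows are materialized. A hedged illustration of the kind of schema this enables (the model and field names are made up):

    from datachain.lib.data_model import DataModel
    from datachain.lib.file import File

    class Page(DataModel):   # hypothetical nested signal
        file: File           # nested File now also receives _set_stream(...)
        number: int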
datachain/lib/tar.py ADDED
@@ -0,0 +1,33 @@
+import hashlib
+import tarfile
+from collections.abc import Iterator
+
+from datachain.lib.file import File, TarVFile
+
+
+def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
+    new_parent = parent.get_full_name()
+    etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
+    etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
+    return File(
+        source=parent.source,
+        path=f"{new_parent}/{info.name}",
+        version=parent.version,
+        size=info.size,
+        etag=etag,
+        location=[
+            {
+                "vtype": TarVFile.get_vtype(),
+                "parent": parent.model_dump_custom(),
+                "size": info.size,
+                "offset": info.offset_data,
+            }
+        ],
+    )
+
+
+def process_tar(file: File) -> Iterator[File]:
+    with file.open() as fd:
+        with tarfile.open(fileobj=fd) as tar:
+            for entry in tar.getmembers():
+                yield build_tar_member(file, entry)
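`process_tar` is written as a generator UDF, so tar archives can be exploded into per-member `File` rows directly in a chain. A hedged usage sketch (the bucket path and the presence of a `file` signal are illustrative):

    from datachain.lib.dc import DataChain
    from datachain.lib.tar import process_tar

    chain = (
        DataChain.from_storage("s3://my-bucket/shards/", type="binary")  # hypothetical tar location
        .gen(file=process_tar)   # one File row per tar member, readable through its TarVFile location
    )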
datachain/lib/webdataset.py CHANGED
@@ -1,4 +1,3 @@
-import hashlib
 import json
 import tarfile
 import warnings
@@ -17,7 +16,8 @@ from typing import (
 from pydantic import Field

 from datachain.lib.data_model import DataModel
-from datachain.lib.file import File, TarVFile
+from datachain.lib.file import File
+from datachain.lib.tar import build_tar_member
 from datachain.lib.utils import DataChainError

 # The `json` method of the Pydantic `BaseModel` class has been deprecated
@@ -176,34 +176,11 @@ class Builder:
             self._tar_stream, self._core_extensions, self.state.stem
         )

-        file = self.build_file_record()
+        file = build_tar_member(self._tar_stream, self.state.core_file)
         wds = self._wds_class(**self.state.data | {"file": file})
         self.state = BuilderState()
         return wds

-    def build_file_record(self):
-        new_parent = self._tar_stream.get_full_name()
-        core_file = self.state.core_file
-        etag_string = "-".join(
-            [self._tar_stream.etag, core_file.name, str(core_file.mtime)]
-        )
-        etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
-        return File(
-            source=self._tar_stream.source,
-            path=f"{new_parent}/{core_file.name}",
-            version=self._tar_stream.version,
-            size=core_file.size,
-            etag=etag,
-            location=[
-                {
-                    "vtype": TarVFile.get_vtype(),
-                    "parent": self._tar_stream.model_dump_custom(),
-                    "size": core_file.size,
-                    "offset": core_file.offset_data,
-                }
-            ],
-        )
-
     def _get_type(self, ext):
         field = self._wds_class.model_fields.get(ext, None)
         if field is None:
@@ -217,39 +194,6 @@ class Builder:
         return anno


-class TarStream(File):
-    @staticmethod
-    def to_text(data):
-        return data.decode("utf-8")
-
-    _DATA_CONVERTERS: ClassVar[dict[type, Any]] = {
-        str: lambda data: TarStream.to_text(data),
-        int: lambda data: int(TarStream.to_text(data)),
-        float: lambda data: float(TarStream.to_text(data)),
-        bytes: lambda data: data,
-        dict: lambda data: json.loads(TarStream.to_text(data)),
-    }
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self._tar = None
-
-    def open(self):
-        self._tar = tarfile.open(fileobj=super().open())  # noqa: SIM115
-        return self
-
-    def getmembers(self) -> list[tarfile.TarInfo]:
-        return self._tar.getmembers()
-
-    def read_member(self, member: tarfile.TarInfo, type):
-        fd = self._tar.extractfile(member)
-        data = fd.read()
-        converter = self._DATA_CONVERTERS.get(type, None)
-        if not converter:
-            raise ValueError("")
-        return converter(data)
-
-
 def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
     builder = Builder(stream, core_extensions, spec, tar, encoding)

datachain/query/dataset.py CHANGED
@@ -37,6 +37,7 @@ from tqdm import tqdm

 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
 from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
+from datachain.client import Client
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -194,7 +195,7 @@ class IndexingStep(StartingStep):

     def apply(self):
         self.catalog.index([self.path], **self.kwargs)
-        uri, path = self.parse_path()
+        uri, path = Client.parse_url(self.path)
         _partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
             uri, path
         )
@@ -216,11 +217,6 @@

         return step_result(q, dataset_rows.c, dependencies=[storage.uri])

-    def parse_path(self):
-        client_config = self.kwargs.get("client_config") or {}
-        client, path = self.catalog.parse_url(self.path, **client_config)
-        return client.uri, path
-

 def generator_then_call(generator, func: Callable):
     """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.14
+Version: 0.3.15
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -5,7 +5,7 @@ datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
 datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
-datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
+datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
 datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
@@ -17,13 +17,13 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=7yl_WMGS6CfOc_G2MCbVVkdAfAlcZb2gC_PvXzBnoJ0,69344
+datachain/catalog/catalog.py,sha256=kPg5ILeCWSjXCj3ewUZY6kzj36HTEqajB3mJDkbs-Vo,69023
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=S93K9bS76MGcLYgWKVZiPVivbMElJ9Fq1w67I8BCR-g,13311
+datachain/client/fsspec.py,sha256=0i4EJIwdx_UNZlbSsUeohWjgVg4B5xoGxTYZKwXS22U,13459
 datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
 datachain/client/local.py,sha256=LTyISV4oNSOPUdsai5eNZYCGXNCn8rNGuAI0bdgbtnU,5006
@@ -38,27 +38,28 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
 datachain/data_storage/sqlite.py,sha256=yooLHQXrpoqDguGlF0SGcCiMU1T82OEc4wr1ra8eBHo,28285
 datachain/data_storage/warehouse.py,sha256=Pq6Nt3fyz1WFv6Mdtv2ZUr0_GFCNbafbtS4PdibblUg,32507
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
+datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=C-sfWRinV8pDK2P6UHLbScOahTlTiVQpoxUUdVllF2k,68710
-datachain/lib/file.py,sha256=rXmyzUFgnLQ4J3CyOCcg-guhzAz4x9Ug595FbNn4Y2E,11398
-datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
+datachain/lib/dc.py,sha256=HERJNR4TISbaAtSLARV72INgKPfQRItyd1l28P-GtzU,68871
+datachain/lib/file.py,sha256=elQLorLbIkusuQSVfiuC_KrGSZI8cGm-iT8fHmckJlo,13774
+datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=mt-dsYfYFMPHN3zXnkohBHuueY-4tiNGPkcDYkKB0lY,3887
-datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
+datachain/lib/listing.py,sha256=e4O1gs3rKJ0eGwb0hSEfD-l9U7x-f-TYqYGF7Ni-x38,3973
+datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
 datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=vb4yCC90_pEngiu9Irc02kCPyqBxkrFDL4TKr7UMY5U,23808
+datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
+datachain/lib/tar.py,sha256=d7FpYyxbHCL1twRt_Oe9QoPbZa2Tn5lj7iWP0HvvRn0,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=ZzGLtOUA-QjP4kttGgNqhrioDuDnomWFlsow4fLdezQ,8717
+datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
 datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
@@ -69,7 +70,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
-datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
+datachain/query/dataset.py,sha256=9lhcgccavqypVParE4pvd_Hgg8gmoDAN6m1IkpSwXhE,58219
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -96,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.14.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.14.dist-info/METADATA,sha256=bItmxEsx2MEsJ78Mu1yjO-PX-RkDuWHMESoPuGiJgxw,17073
-datachain-0.3.14.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
-datachain-0.3.14.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.14.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.14.dist-info/RECORD,,
+datachain-0.3.15.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.15.dist-info/METADATA,sha256=E3jImGtRTyvMPTSqFsgwhsHsnZn_9SRVeThmrDXRuf0,17073
+datachain-0.3.15.dist-info/WHEEL,sha256=5Mi1sN9lKoFv_gxcPtisEVrJZihrm_beibeg5R6xb4I,91
+datachain-0.3.15.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.15.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.15.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (74.1.2)
+Generator: setuptools (75.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any