datachain 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of datachain might be problematic.

@@ -621,10 +621,6 @@ class Catalog:
         code_ast.body[-1:] = new_expressions
         return code_ast

-    def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
-        config = config or self.client_config
-        return Client.parse_url(uri, self.cache, **config)
-
     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
         Return the client corresponding to the given source `uri`.
@@ -651,17 +647,16 @@ class Catalog:
         partial_path: Optional[str]

         client_config = client_config or self.client_config
-        client, path = self.parse_url(source, **client_config)
+        uri, path = Client.parse_url(source)
+        client = Client.get_client(source, self.cache, **client_config)
         stem = os.path.basename(os.path.normpath(path))
         prefix = (
             posixpath.dirname(path)
             if glob.has_magic(stem) or client.fs.isfile(source)
             else path
         )
-        storage_dataset_name = Storage.dataset_name(
-            client.uri, posixpath.join(prefix, "")
-        )
-        source_metastore = self.metastore.clone(client.uri)
+        storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
+        source_metastore = self.metastore.clone(uri)

         columns = [
             Column("path", String),
@@ -675,15 +670,13 @@ class Catalog:
         ]

         if skip_indexing:
-            source_metastore.create_storage_if_not_registered(client.uri)
-            storage = source_metastore.get_storage(client.uri)
-            source_metastore.init_partial_id(client.uri)
-            partial_id = source_metastore.get_next_partial_id(client.uri)
+            source_metastore.create_storage_if_not_registered(uri)
+            storage = source_metastore.get_storage(uri)
+            source_metastore.init_partial_id(uri)
+            partial_id = source_metastore.get_next_partial_id(uri)

-            source_metastore = self.metastore.clone(
-                uri=client.uri, partial_id=partial_id
-            )
-            source_metastore.init(client.uri)
+            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
+            source_metastore.init(uri)

             source_warehouse = self.warehouse.clone()
             dataset = self.create_dataset(
@@ -701,20 +694,16 @@ class Catalog:
             in_progress,
             partial_id,
             partial_path,
-        ) = source_metastore.register_storage_for_indexing(
-            client.uri, force_update, prefix
-        )
+        ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
         if in_progress:
             raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")

         if not need_index:
             assert partial_id is not None
             assert partial_path is not None
-            source_metastore = self.metastore.clone(
-                uri=client.uri, partial_id=partial_id
-            )
+            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
             source_warehouse = self.warehouse.clone()
-            dataset = self.get_dataset(Storage.dataset_name(client.uri, partial_path))
+            dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
             lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
             logger.debug(
                 "Using cached listing %s. Valid till: %s",
@@ -731,11 +720,11 @@ class Catalog:

             return lst, path

-        source_metastore.init_partial_id(client.uri)
-        partial_id = source_metastore.get_next_partial_id(client.uri)
+        source_metastore.init_partial_id(uri)
+        partial_id = source_metastore.get_next_partial_id(uri)

-        source_metastore.init(client.uri)
-        source_metastore = self.metastore.clone(uri=client.uri, partial_id=partial_id)
+        source_metastore.init(uri)
+        source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)

         source_warehouse = self.warehouse.clone()

@@ -1370,7 +1359,7 @@ class Catalog:

     def signed_url(self, source: str, path: str, client_config=None) -> str:
         client_config = client_config or self.client_config
-        client, _ = self.parse_url(source, **client_config)
+        client = Client.get_client(source, self.cache, **client_config)
         return client.url(path)

     def export_dataset_table(
datachain/client/fsspec.py CHANGED
@@ -116,15 +116,16 @@ class Client(ABC):
         return DATA_SOURCE_URI_PATTERN.match(name) is not None

     @staticmethod
-    def parse_url(
-        source: str,
-        cache: DataChainCache,
-        **kwargs,
-    ) -> tuple["Client", str]:
+    def parse_url(source: str) -> tuple[StorageURI, str]:
+        cls = Client.get_implementation(source)
+        storage_name, rel_path = cls.split_url(source)
+        return cls.get_uri(storage_name), rel_path
+
+    @staticmethod
+    def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
-        storage_url, rel_path = cls.split_url(source)
-        client = cls.from_name(storage_url, cache, kwargs)
-        return client, rel_path
+        storage_url, _ = cls.split_url(source)
+        return cls.from_name(storage_url, cache, kwargs)

     @classmethod
     def create_fs(cls, **kwargs) -> "AbstractFileSystem":
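In short, the old `Client.parse_url(source, cache, **kwargs)` that returned a client plus a relative path is split into a cache-free `parse_url` (storage URI and path only) and a `get_client` factory. A minimal migration sketch, mirroring how the catalog code above now calls it (the `catalog` variable and the S3 path are illustrative, not part of the diff):

    from datachain.client import Client

    source = "s3://my-bucket/images/"          # hypothetical source URI
    uri, rel_path = Client.parse_url(source)   # no cache argument anymore
    client = Client.get_client(source, catalog.cache, **catalog.client_config)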
datachain/dataset.py CHANGED
@@ -112,7 +112,7 @@ class DatasetDependency:

         if is_listing_dataset(dataset_name):
             dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
-            dependency_name = listing_uri_from_name(dataset_name)
+            dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))

         return cls(
             id,
datachain/lib/arrow.py CHANGED
@@ -13,8 +13,10 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
+    from datasets.features.features import Features
     from pydantic import BaseModel

+    from datachain.lib.data_model import DataType
     from datachain.lib.dc import DataChain


@@ -46,7 +48,10 @@ class ArrowGenerator(Generator):
         self.kwargs = kwargs

     def process(self, file: File):
-        if self.nrows:
+        if file._caching_enabled:
+            path = file.get_local_path(download=True)
+            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+        elif self.nrows:
             path = _nrows_file(file, self.nrows)
             ds = dataset(path, schema=self.input_schema, **self.kwargs)
         else:
@@ -54,6 +59,7 @@
             ds = dataset(
                 path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
             )
+        hf_schema = _get_hf_schema(ds.schema)
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
             for record_batch in ds.to_batches():
@@ -62,9 +68,17 @@
                 if self.output_schema:
                     fields = self.output_schema.model_fields
                     vals_dict = {}
-                    for (field, field_info), val in zip(fields.items(), vals):
-                        if ModelStore.is_pydantic(field_info.annotation):
-                            vals_dict[field] = field_info.annotation(**val)  # type: ignore[misc]
+                    for i, ((field, field_info), val) in enumerate(
+                        zip(fields.items(), vals)
+                    ):
+                        anno = field_info.annotation
+                        if hf_schema:
+                            from datachain.lib.hf import convert_feature
+
+                            feat = list(hf_schema[0].values())[i]
+                            vals_dict[field] = convert_feature(val, feat, anno)
+                        elif ModelStore.is_pydantic(anno):
+                            vals_dict[field] = anno(**val)  # type: ignore[misc]
                         else:
                             vals_dict[field] = val
                     vals = [self.output_schema(**vals_dict)]
@@ -91,26 +105,36 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
             "Error generating output from Arrow schema - "
             f"Schema has {len(schema)} columns but got {len(col_names)} column names."
         )
-    default_column = 0
+    if not col_names:
+        col_names = schema.names
+    columns = _convert_col_names(col_names)  # type: ignore[arg-type]
+    hf_schema = _get_hf_schema(schema)
+    if hf_schema:
+        return {
+            column: hf_type for hf_type, column in zip(hf_schema[1].values(), columns)
+        }
     output = {}
-    for i, field in enumerate(schema):
-        if col_names:
-            column = col_names[i]
-        else:
-            column = field.name
-        column = column.lower()
-        column = re.sub("[^0-9a-z_]+", "", column)
-        if not column:
-            column = f"c{default_column}"
-            default_column += 1
+    for field, column in zip(schema, columns):
         dtype = arrow_type_mapper(field.type, column)  # type: ignore[assignment]
         if field.nullable and not ModelStore.is_pydantic(dtype):
            dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
-
     return output


+def _convert_col_names(col_names: Sequence[str]) -> list[str]:
+    default_column = 0
+    converted_col_names = []
+    for column in col_names:
+        column = column.lower()
+        column = re.sub("[^0-9a-z_]+", "", column)
+        if not column:
+            column = f"c{default_column}"
+            default_column += 1
+        converted_col_names.append(column)
+    return converted_col_names
+
+
 def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
@@ -156,3 +180,14 @@ def _nrows_file(file: File, nrows: int) -> str:
         writer.write(line)
         writer.write("\n")
     return tf.name
+
+
+def _get_hf_schema(
+    schema: "pa.Schema",
+) -> Optional[tuple["Features", dict[str, "DataType"]]]:
+    if schema.metadata and b"huggingface" in schema.metadata:
+        from datachain.lib.hf import get_output_schema, schema_from_arrow
+
+        features = schema_from_arrow(schema)
+        return features, get_output_schema(features)
+    return None
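Taken together, the arrow.py changes let a tabular read run against a locally cached copy of the file and recover Hugging Face feature types from parquet metadata. A hedged usage sketch, assuming the `from_storage`/`settings`/`parse_tabular` chain behaves as elsewhere in the 0.3.x API (the bucket path is illustrative):

    from datachain.lib.dc import DataChain

    chain = (
        DataChain.from_storage("s3://my-bucket/data/", type="binary")  # hypothetical bucket
        .settings(cache=True)   # ArrowGenerator then reads via get_local_path(download=True)
        .parse_tabular(format="parquet")
    )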
datachain/lib/dc.py CHANGED
@@ -408,7 +408,11 @@ class DataChain(DatasetQuery):
                 in_memory=in_memory,
             )
             .gen(
-                list_bucket(list_uri, client_config=session.catalog.client_config),
+                list_bucket(
+                    list_uri,
+                    session.catalog.cache,
+                    client_config=session.catalog.client_config,
+                ),
                 output={f"{object_name}": File},
             )
             .save(list_dataset_name, listing=True)
@@ -1523,7 +1527,8 @@
             output = {"split": str}

         model_name = model_name or object_name or ""
-        output = output | get_output_schema(next(iter(ds_dict.values())), model_name)
+        hf_features = next(iter(ds_dict.values())).features
+        output = output | get_output_schema(hf_features, model_name)
         model = dict_to_data_model(model_name, output)
         if object_name:
             output = {object_name: model}
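The Hugging Face entry point now pulls `features` off the first split before building the output schema, matching the new `get_output_schema(features, model_name)` signature. A rough sketch of that call, with the dataset name purely illustrative:

    from datasets import load_dataset
    from datachain.lib.hf import get_output_schema

    ds = load_dataset("beans", split="train")              # illustrative HF dataset
    output = {"split": str} | get_output_schema(ds.features, "beans")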
datachain/lib/file.py CHANGED
@@ -1,5 +1,6 @@
 import io
 import json
+import logging
 import os
 import posixpath
 from abc import ABC, abstractmethod
@@ -15,6 +16,9 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from PIL import Image
 from pydantic import Field, field_validator

+if TYPE_CHECKING:
+    from typing_extensions import Self
+
 from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
@@ -25,6 +29,8 @@ from datachain.utils import TIME_ZERO
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

+logger = logging.getLogger("datachain")
+
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]

@@ -251,14 +257,18 @@ class File(DataModel):
         dump = self.model_dump()
         return UniqueId(*(dump[k] for k in self._unique_id_keys))

-    def get_local_path(self) -> Optional[str]:
+    def get_local_path(self, download: bool = False) -> Optional[str]:
         """Returns path to a file in a local cache.
         Return None if file is not cached. Throws an exception if cache is not setup."""
         if self._catalog is None:
             raise RuntimeError(
                 "cannot resolve local file path because catalog is not setup"
             )
-        return self._catalog.cache.get_path(self.get_uid())
+        uid = self.get_uid()
+        if download:
+            client = self._catalog.get_client(self.source)
+            client.download(uid, callback=self._download_cb)
+        return self._catalog.cache.get_path(uid)

     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
@@ -313,6 +323,70 @@
         """Returns `fsspec` filesystem for the file."""
         return self._catalog.get_client(self.source).fs

+    def resolve(self) -> "Self":
+        """
+        Resolve a File object by checking its existence and updating its metadata.
+
+        Returns:
+            File: The resolved File object with updated metadata.
+        """
+        if self._catalog is None:
+            raise RuntimeError("Cannot resolve file: catalog is not set")
+
+        try:
+            client = self._catalog.get_client(self.source)
+        except NotImplementedError as e:
+            raise RuntimeError(
+                f"Unsupported protocol for file source: {self.source}"
+            ) from e
+
+        try:
+            info = client.fs.info(client.get_full_path(self.path))
+            converted_info = client.info_to_file(info, self.source)
+            return type(self)(
+                path=self.path,
+                source=self.source,
+                size=converted_info.size,
+                etag=converted_info.etag,
+                version=converted_info.version,
+                is_latest=converted_info.is_latest,
+                last_modified=converted_info.last_modified,
+                location=self.location,
+            )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logger.warning("File system error when resolving %s: %s", self.path, str(e))
+
+        return type(self)(
+            path=self.path,
+            source=self.source,
+            size=0,
+            etag="",
+            version="",
+            is_latest=True,
+            last_modified=TIME_ZERO,
+            location=self.location,
+        )
+
+
+def resolve(file: File) -> File:
+    """
+    Resolve a File object by checking its existence and updating its metadata.
+
+    This function is a wrapper around the File.resolve() method, designed to be
+    used as a mapper in DataChain operations.
+
+    Args:
+        file (File): The File object to resolve.
+
+    Returns:
+        File: The resolved File object with updated metadata.
+
+    Raises:
+        RuntimeError: If the file's catalog is not set or if
+            the file source protocol is unsupported.
+    """
+    return file.resolve()
+

 class TextFile(File):
     """`DataModel` for reading text files."""
datachain/lib/hf.py CHANGED
@@ -15,7 +15,7 @@ try:
         Value,
         load_dataset,
     )
-    from datasets.features.features import string_to_arrow
+    from datasets.features.features import Features, string_to_arrow
     from datasets.features.image import image_to_bytes

 except ImportError as exc:
@@ -36,6 +36,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
+    import pyarrow as pa
     from pydantic import BaseModel


@@ -71,6 +72,15 @@ class HFGenerator(Generator):
         *args,
         **kwargs,
     ):
+        """
+        Generator for chain from huggingface datasets.
+
+        Parameters:
+
+        ds : Path or name of the dataset to read from Hugging Face Hub,
+            or an instance of `datasets.Dataset`-like object.
+        output_schema : Pydantic model for validation.
+        """
         super().__init__()
         self.ds = ds
         self.output_schema = output_schema
@@ -92,7 +102,7 @@
                     output_dict["split"] = split
                 for name, feat in ds.features.items():
                     anno = self.output_schema.model_fields[name].annotation
-                    output_dict[name] = _convert_feature(row[name], feat, anno)
+                    output_dict[name] = convert_feature(row[name], feat, anno)
                 yield self.output_schema(**output_dict)
                 pbar.update(1)

@@ -106,7 +116,7 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
     return {"": ds}


-def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
+def convert_feature(val: Any, feat: Any, anno: Any) -> Any:  # noqa: PLR0911
     if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
         return val
     if isinstance(feat, ClassLabel):
@@ -117,20 +127,23 @@ def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
             for sname in val:
                 sfeat = feat.feature[sname]
                 sanno = anno.model_fields[sname].annotation
-                sdict[sname] = [_convert_feature(v, sfeat, sanno) for v in val[sname]]
+                sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
             return anno(**sdict)
         return val
     if isinstance(feat, Image):
+        if isinstance(val, dict):
+            return HFImage(img=val["bytes"])
         return HFImage(img=image_to_bytes(val))
     if isinstance(feat, Audio):
         return HFAudio(**val)


 def get_output_schema(
-    ds: Union[Dataset, IterableDataset], model_name: str = ""
+    features: Features, model_name: str = "", stream: bool = True
 ) -> dict[str, DataType]:
+    """Generate UDF output schema from huggingface datasets features."""
     fields_dict = {}
-    for name, val in ds.features.items():
+    for name, val in features.items():
         fields_dict[name] = _feature_to_chain_type(name, val)  # type: ignore[assignment]
     return fields_dict  # type: ignore[return-value]

@@ -165,3 +178,7 @@ def _feature_to_chain_type(name: str, val: Any) -> type:  # noqa: PLR0911
     if isinstance(val, Audio):
         return HFAudio
     raise TypeError(f"Unknown huggingface datasets type {type(val)}")
+
+
+def schema_from_arrow(schema: "pa.Schema"):
+    return Features.from_arrow_schema(schema)
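The now-public `convert_feature` and the new `schema_from_arrow` helper are what arrow.py uses to round-trip Hugging Face feature types through parquet. A minimal sketch of that flow under the same metadata check (the parquet path is illustrative):

    import pyarrow.dataset as pa_ds
    from datachain.lib.hf import get_output_schema, schema_from_arrow

    ds = pa_ds.dataset("data.parquet")                 # hypothetical HF-exported parquet
    if ds.schema.metadata and b"huggingface" in ds.schema.metadata:
        features = schema_from_arrow(ds.schema)        # datasets.Features
        output = get_output_schema(features)           # {column: DataChain type}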
datachain/lib/listing.py CHANGED
@@ -20,7 +20,7 @@ LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name


-def list_bucket(uri: str, client_config=None) -> Callable:
+def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
     from bucket where each File represents one bucket entry.
@@ -28,7 +28,8 @@ def list_bucket(uri: str, client_config=None) -> Callable:

    def list_func() -> Iterator[File]:
        config = client_config or {}
-        client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
+        client = Client.get_client(uri, cache, **config)  # type: ignore[arg-type]
+        _, path = Client.parse_url(uri)
        for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
            yield from entries

@@ -76,16 +77,17 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
-    client, path = Client.parse_url(uri, cache, **client_config)
+    client = Client.get_client(uri, cache, **client_config)
+    storage_uri, path = Client.parse_url(uri)

     # clean path without globs
     lst_uri_path = (
         posixpath.dirname(path) if uses_glob(path) or client.fs.isfile(uri) else path
     )

-    lst_uri = f"{client.uri}/{lst_uri_path.lstrip('/')}"
+    lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
     ds_name = (
-        f"{LISTING_PREFIX}{client.uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
+        f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )

     return ds_name, lst_uri, path
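Callers of `list_bucket` and `parse_listing_uri` now pass the catalog cache explicitly, mirroring `Client.get_client`. A hedged sketch of the new calls (the `catalog` object is illustrative):

    from datachain.lib.listing import list_bucket, parse_listing_uri

    uri = "gs://my-bucket/raw/"   # hypothetical bucket
    ds_name, lst_uri, path = parse_listing_uri(uri, catalog.cache, catalog.client_config)
    list_func = list_bucket(uri, catalog.cache, client_config=catalog.client_config)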
datachain/lib/listing_info.py CHANGED
@@ -13,8 +13,8 @@ class ListingInfo(DatasetInfo):

     @property
     def storage_uri(self) -> str:
-        client, _ = Client.parse_url(self.uri, None)  # type: ignore[arg-type]
-        return client.uri
+        uri, _ = Client.parse_url(self.uri)
+        return uri

     @property
     def expires(self) -> Optional[datetime]:
datachain/lib/signal_schema.py CHANGED
@@ -386,11 +386,20 @@ class SignalSchema:
             else:
                 json, pos = unflatten_to_json_pos(fr, row, pos)  # type: ignore[union-attr]
                 obj = fr(**json)
-                if isinstance(obj, File):
-                    obj._set_stream(catalog, caching_enabled=cache)
+                SignalSchema._set_file_stream(obj, catalog, cache)
                 res.append(obj)
         return res

+    @staticmethod
+    def _set_file_stream(
+        obj: BaseModel, catalog: "Catalog", cache: bool = False
+    ) -> None:
+        if isinstance(obj, File):
+            obj._set_stream(catalog, caching_enabled=cache)
+        for field, finfo in obj.model_fields.items():
+            if ModelStore.is_pydantic(finfo.annotation):
+                SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
+
     def db_signals(
         self, name: Optional[str] = None, as_columns=False
     ) -> Union[list[str], list[Column]]:
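`_set_file_stream` now recurses into nested Pydantic fields, so a `File` wrapped inside another signal model also gets its catalog and stream attached when rows are materialized. A hedged illustration of the kind of schema this enables (the model and field names are made up):

    from datachain.lib.data_model import DataModel
    from datachain.lib.file import File

    class Page(DataModel):   # hypothetical nested signal
        file: File           # nested File now also receives _set_stream(...)
        number: int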
datachain/lib/tar.py ADDED
@@ -0,0 +1,33 @@
+import hashlib
+import tarfile
+from collections.abc import Iterator
+
+from datachain.lib.file import File, TarVFile
+
+
+def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
+    new_parent = parent.get_full_name()
+    etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
+    etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
+    return File(
+        source=parent.source,
+        path=f"{new_parent}/{info.name}",
+        version=parent.version,
+        size=info.size,
+        etag=etag,
+        location=[
+            {
+                "vtype": TarVFile.get_vtype(),
+                "parent": parent.model_dump_custom(),
+                "size": info.size,
+                "offset": info.offset_data,
+            }
+        ],
+    )
+
+
+def process_tar(file: File) -> Iterator[File]:
+    with file.open() as fd:
+        with tarfile.open(fileobj=fd) as tar:
+            for entry in tar.getmembers():
+                yield build_tar_member(file, entry)
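`process_tar` is written as a generator UDF, so tar archives can be exploded into per-member `File` rows directly in a chain. A hedged usage sketch (the bucket path and the presence of a `file` signal are illustrative):

    from datachain.lib.dc import DataChain
    from datachain.lib.tar import process_tar

    chain = (
        DataChain.from_storage("s3://my-bucket/shards/", type="binary")  # hypothetical tar location
        .gen(file=process_tar)   # one File row per tar member, readable through its TarVFile location
    )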
datachain/lib/webdataset.py CHANGED
@@ -1,4 +1,3 @@
-import hashlib
 import json
 import tarfile
 import warnings
@@ -17,7 +16,8 @@ from typing import (
 from pydantic import Field

 from datachain.lib.data_model import DataModel
-from datachain.lib.file import File, TarVFile
+from datachain.lib.file import File
+from datachain.lib.tar import build_tar_member
 from datachain.lib.utils import DataChainError

 # The `json` method of the Pydantic `BaseModel` class has been deprecated
@@ -176,34 +176,11 @@ class Builder:
             self._tar_stream, self._core_extensions, self.state.stem
         )

-        file = self.build_file_record()
+        file = build_tar_member(self._tar_stream, self.state.core_file)
         wds = self._wds_class(**self.state.data | {"file": file})
         self.state = BuilderState()
         return wds

-    def build_file_record(self):
-        new_parent = self._tar_stream.get_full_name()
-        core_file = self.state.core_file
-        etag_string = "-".join(
-            [self._tar_stream.etag, core_file.name, str(core_file.mtime)]
-        )
-        etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
-        return File(
-            source=self._tar_stream.source,
-            path=f"{new_parent}/{core_file.name}",
-            version=self._tar_stream.version,
-            size=core_file.size,
-            etag=etag,
-            location=[
-                {
-                    "vtype": TarVFile.get_vtype(),
-                    "parent": self._tar_stream.model_dump_custom(),
-                    "size": core_file.size,
-                    "offset": core_file.offset_data,
-                }
-            ],
-        )
-
     def _get_type(self, ext):
         field = self._wds_class.model_fields.get(ext, None)
         if field is None:
@@ -217,39 +194,6 @@ class Builder:
         return anno


-class TarStream(File):
-    @staticmethod
-    def to_text(data):
-        return data.decode("utf-8")
-
-    _DATA_CONVERTERS: ClassVar[dict[type, Any]] = {
-        str: lambda data: TarStream.to_text(data),
-        int: lambda data: int(TarStream.to_text(data)),
-        float: lambda data: float(TarStream.to_text(data)),
-        bytes: lambda data: data,
-        dict: lambda data: json.loads(TarStream.to_text(data)),
-    }
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self._tar = None
-
-    def open(self):
-        self._tar = tarfile.open(fileobj=super().open())  # noqa: SIM115
-        return self
-
-    def getmembers(self) -> list[tarfile.TarInfo]:
-        return self._tar.getmembers()
-
-    def read_member(self, member: tarfile.TarInfo, type):
-        fd = self._tar.extractfile(member)
-        data = fd.read()
-        converter = self._DATA_CONVERTERS.get(type, None)
-        if not converter:
-            raise ValueError("")
-        return converter(data)
-
-
 def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
     builder = Builder(stream, core_extensions, spec, tar, encoding)

datachain/query/dataset.py CHANGED
@@ -37,6 +37,7 @@ from tqdm import tqdm

 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
 from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
+from datachain.client import Client
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -194,7 +195,7 @@ class IndexingStep(StartingStep):

     def apply(self):
         self.catalog.index([self.path], **self.kwargs)
-        uri, path = self.parse_path()
+        uri, path = Client.parse_url(self.path)
         _partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
             uri, path
         )
@@ -216,11 +217,6 @@

         return step_result(q, dataset_rows.c, dependencies=[storage.uri])

-    def parse_path(self):
-        client_config = self.kwargs.get("client_config") or {}
-        client, path = self.catalog.parse_url(self.path, **client_config)
-        return client.uri, path
-

 def generator_then_call(generator, func: Callable):
     """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.14
+Version: 0.3.15
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -5,7 +5,7 @@ datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
 datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
-datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
+datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
 datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
@@ -17,13 +17,13 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=7yl_WMGS6CfOc_G2MCbVVkdAfAlcZb2gC_PvXzBnoJ0,69344
+datachain/catalog/catalog.py,sha256=kPg5ILeCWSjXCj3ewUZY6kzj36HTEqajB3mJDkbs-Vo,69023
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=S93K9bS76MGcLYgWKVZiPVivbMElJ9Fq1w67I8BCR-g,13311
+datachain/client/fsspec.py,sha256=0i4EJIwdx_UNZlbSsUeohWjgVg4B5xoGxTYZKwXS22U,13459
 datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
 datachain/client/local.py,sha256=LTyISV4oNSOPUdsai5eNZYCGXNCn8rNGuAI0bdgbtnU,5006
@@ -38,27 +38,28 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
 datachain/data_storage/sqlite.py,sha256=yooLHQXrpoqDguGlF0SGcCiMU1T82OEc4wr1ra8eBHo,28285
 datachain/data_storage/warehouse.py,sha256=Pq6Nt3fyz1WFv6Mdtv2ZUr0_GFCNbafbtS4PdibblUg,32507
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
+datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=C-sfWRinV8pDK2P6UHLbScOahTlTiVQpoxUUdVllF2k,68710
-datachain/lib/file.py,sha256=rXmyzUFgnLQ4J3CyOCcg-guhzAz4x9Ug595FbNn4Y2E,11398
-datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
+datachain/lib/dc.py,sha256=HERJNR4TISbaAtSLARV72INgKPfQRItyd1l28P-GtzU,68871
+datachain/lib/file.py,sha256=elQLorLbIkusuQSVfiuC_KrGSZI8cGm-iT8fHmckJlo,13774
+datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=mt-dsYfYFMPHN3zXnkohBHuueY-4tiNGPkcDYkKB0lY,3887
-datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
+datachain/lib/listing.py,sha256=e4O1gs3rKJ0eGwb0hSEfD-l9U7x-f-TYqYGF7Ni-x38,3973
+datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
 datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=vb4yCC90_pEngiu9Irc02kCPyqBxkrFDL4TKr7UMY5U,23808
+datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
+datachain/lib/tar.py,sha256=d7FpYyxbHCL1twRt_Oe9QoPbZa2Tn5lj7iWP0HvvRn0,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=ZzGLtOUA-QjP4kttGgNqhrioDuDnomWFlsow4fLdezQ,8717
+datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
 datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
@@ -69,7 +70,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
-datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
+datachain/query/dataset.py,sha256=9lhcgccavqypVParE4pvd_Hgg8gmoDAN6m1IkpSwXhE,58219
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -96,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.14.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.14.dist-info/METADATA,sha256=bItmxEsx2MEsJ78Mu1yjO-PX-RkDuWHMESoPuGiJgxw,17073
-datachain-0.3.14.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
-datachain-0.3.14.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.14.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.14.dist-info/RECORD,,
+datachain-0.3.15.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.15.dist-info/METADATA,sha256=E3jImGtRTyvMPTSqFsgwhsHsnZn_9SRVeThmrDXRuf0,17073
+datachain-0.3.15.dist-info/WHEEL,sha256=5Mi1sN9lKoFv_gxcPtisEVrJZihrm_beibeg5R6xb4I,91
+datachain-0.3.15.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.15.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.15.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (74.1.2)
+Generator: setuptools (75.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any