datachain 0.3.13__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/asyn.py +4 -9
- datachain/catalog/catalog.py +2 -2
- datachain/client/azure.py +1 -13
- datachain/client/fsspec.py +7 -7
- datachain/client/gcs.py +2 -13
- datachain/client/hf.py +0 -10
- datachain/client/local.py +3 -12
- datachain/client/s3.py +9 -19
- datachain/data_storage/sqlite.py +10 -1
- datachain/data_storage/warehouse.py +11 -17
- datachain/lib/listing.py +1 -2
- datachain/lib/model_store.py +2 -2
- datachain/lib/pytorch.py +32 -26
- datachain/lib/signal_schema.py +146 -58
- datachain/listing.py +6 -8
- datachain/node.py +0 -43
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/METADATA +1 -1
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/RECORD +22 -22
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/LICENSE +0 -0
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/WHEEL +0 -0
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/top_level.txt +0 -0
datachain/asyn.py
CHANGED
@@ -1,14 +1,8 @@
 import asyncio
-from collections.abc import Awaitable, Coroutine, Iterable
+from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
 from heapq import heappop, heappush
-from typing import (
-    Any,
-    Callable,
-    Generic,
-    Optional,
-    TypeVar,
-)
+from typing import Any, Callable, Generic, Optional, TypeVar

 from fsspec.asyn import get_loop

@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20

 InputT = TypeVar("InputT", contravariant=True)  # noqa: PLC0105
 ResultT = TypeVar("ResultT", covariant=True)  # noqa: PLC0105
+T = TypeVar("T")


 class AsyncMapper(Generic[InputT, ResultT]):
@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
             self._push_result(self._next_yield, None)


-def iter_over_async(ait, loop):
+def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
     """Wrap an asynchronous iterator into a synchronous one"""
     ait = ait.__aiter__()
datachain/catalog/catalog.py
CHANGED
@@ -1390,12 +1390,12 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

-    def dataset_stats(self, name: str, version: int) -> DatasetStats:
+    def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
         """
         dataset = self.get_dataset(name)
-        dataset_version = dataset.get_version(version)
+        dataset_version = dataset.get_version(version or dataset.latest_version)
         return DatasetStats(
             num_objects=dataset_version.num_objects,
             size=dataset_version.size,
datachain/client/azure.py
CHANGED
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -14,17 +13,6 @@ class AzureClient(Client):
     PREFIX = "az://"
     protocol = "az"

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        version_id = v.get("version_id")
-        return Entry.from_file(
-            path=path,
-            etag=v.get("etag", "").strip('"'),
-            version=version_id or "",
-            is_latest=version_id is None or bool(v.get("is_current_version")),
-            last_modified=v["last_modified"],
-            size=v.get("size", ""),
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         version_id = v.get("version_id")
         return File(
@@ -57,7 +45,7 @@ class AzureClient(Client):
                     continue
                 info = (await self.fs._details([b]))[0]
                 entries.append(
-                    self.convert_info(info, self.rel_path(info["name"]))
+                    self.info_to_file(info, self.rel_path(info["name"]))
                 )
             if entries:
                 await result_queue.put(entries)
datachain/client/fsspec.py
CHANGED
@@ -29,7 +29,7 @@ from tqdm import tqdm
 from datachain.cache import DataChainCache, UniqueId
 from datachain.client.fileslice import FileSlice, FileWrapper
 from datachain.error import ClientError as DataChainClientError
-from datachain.node import Entry
+from datachain.lib.file import File
 from datachain.nodes_fetcher import NodesFetcher
 from datachain.nodes_thread_pool import NodeChunk
 from datachain.storage import StorageURI
@@ -45,7 +45,7 @@ DELIMITER = "/"  # Path delimiter.

 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")

-ResultQueue = asyncio.Queue[Optional[Sequence[Entry]]]
+ResultQueue = asyncio.Queue[Optional[Sequence[File]]]


 def _is_win_local_path(uri: str) -> bool:
@@ -188,7 +188,7 @@ class Client(ABC):

     async def get_current_etag(self, uid: UniqueId) -> str:
         info = await self.fs._info(self.get_full_path(uid.path))
-        return self.convert_info(info, "").etag
+        return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
         return await self.fs._size(path)
@@ -198,7 +198,7 @@ class Client(ABC):

     async def scandir(
         self, start_prefix: str, method: str = "default"
-    ) -> AsyncIterator[Sequence[Entry]]:
+    ) -> AsyncIterator[Sequence[File]]:
         try:
             impl = getattr(self, f"_fetch_{method}")
         except AttributeError:
@@ -264,7 +264,7 @@ class Client(ABC):
     ) -> None:
         await self._fetch_nested(start_prefix, result_queue)

-    async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
+    async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
         path = f"{self.name}/{prefix}"
         infos = await self.ls_dir(path)
         files = []
@@ -277,7 +277,7 @@ class Client(ABC):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.convert_info(info, subprefix))
+                files.append(self.info_to_file(info, subprefix))
         if files:
             await result_queue.put(files)
         found_count = len(subdirs) + len(files)
@@ -303,7 +303,7 @@ class Client(ABC):
         return f"{self.PREFIX}{self.name}/{rel_path}"

     @abstractmethod
-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry: ...
+    def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...

     def fetch_nodes(
         self,
datachain/client/gcs.py
CHANGED
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -108,19 +107,9 @@ class GCSClient(Client):
         finally:
             await page_queue.put(None)

-    def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
+    def _entry_from_dict(self, d: dict[str, Any]) -> File:
         info = self.fs._process_object(self.name, d)
-        return self.convert_info(info, self.rel_path(info["name"]))
-
-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v.get("etag", ""),
-            version=v.get("generation", ""),
-            is_latest=not v.get("timeDeleted"),
-            last_modified=self.parse_timestamp(v["updated"]),
-            size=v.get("size", ""),
-        )
+        return self.info_to_file(info, self.rel_path(info["name"]))

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
datachain/client/hf.py
CHANGED
@@ -5,7 +5,6 @@ from typing import Any, cast
 from huggingface_hub import HfFileSystem

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import Client

@@ -22,15 +21,6 @@ class HfClient(Client):

         return cast(HfFileSystem, super().create_fs(**kwargs))

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            size=v["size"],
-            version=v["last_commit"].oid,
-            etag=v.get("blob_id", ""),
-            last_modified=v["last_commit"].date,
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
             path=path,
datachain/client/local.py
CHANGED
@@ -7,8 +7,8 @@ from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

+from datachain.cache import UniqueId
 from datachain.lib.file import File
-from datachain.node import Entry
 from datachain.storage import StorageURI

 from .fsspec import Client
@@ -114,9 +114,9 @@ class FileClient(Client):
             use_symlinks=use_symlinks,
         )

-    async def get_current_etag(self, uid) -> str:
+    async def get_current_etag(self, uid: UniqueId) -> str:
         info = self.fs.info(self.get_full_path(uid.path))
-        return self.convert_info(info, "").etag
+        return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
         return self.fs.size(path)
@@ -136,15 +136,6 @@ class FileClient(Client):
             full_path += "/"
         return full_path

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v["mtime"].hex(),
-            is_latest=True,
-            last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
-            size=v.get("size", ""),
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
             source=self.uri,
datachain/client/s3.py
CHANGED
@@ -1,12 +1,11 @@
 import asyncio
-from typing import Any, cast
+from typing import Any, Optional, cast

 from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -111,8 +110,9 @@ class ClientS3(Client):
     ) -> None:
         await self._fetch_flat(start_prefix, result_queue)

-    def _entry_from_boto(self, v, bucket, versions=False):
-        return Entry.from_file(
+    def _entry_from_boto(self, v, bucket, versions=False) -> File:
+        return File(
+            source=self.uri,
             path=v["Key"],
             etag=v.get("ETag", "").strip('"'),
             version=ClientS3.clean_s3_version(v.get("VersionId", "")),
@@ -125,8 +125,8 @@ class ClientS3(Client):
         self,
         prefix,
         pbar,
-        result_queue,
-    ):
+        result_queue: ResultQueue,
+    ) -> set[str]:
         if prefix:
             prefix = prefix.lstrip(DELIMITER) + DELIMITER
         files = []
@@ -141,7 +141,7 @@ class ClientS3(Client):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.convert_info(info, subprefix))
+                files.append(self.info_to_file(info, subprefix))
             pbar.update()
             found = True
         if not found:
@@ -152,18 +152,8 @@ class ClientS3(Client):
         return subdirs

     @staticmethod
-    def clean_s3_version(ver):
-        return ver if ver != "null" else ""
-
-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v.get("ETag", "").strip('"'),
-            version=ClientS3.clean_s3_version(v.get("VersionId", "")),
-            is_latest=v.get("IsLatest", True),
-            last_modified=v.get("LastModified", ""),
-            size=v["size"],
-        )
+    def clean_s3_version(ver: Optional[str]) -> str:
+        return ver if (ver is not None and ver != "null") else ""

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
datachain/data_storage/sqlite.py
CHANGED
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
     from sqlalchemy.sql.elements import ColumnElement
     from sqlalchemy.types import TypeEngine

+    from datachain.lib.file import File
+

 logger = logging.getLogger("datachain")

@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
 quote = sqlite_dialect.identifier_preparer.quote


+def _get_in_memory_uri():
+    return "file::memory:?cache=shared"
+
+
 def get_retry_sleep_sec(retry_count: int) -> int:
     return RETRY_START_SEC * (RETRY_FACTOR**retry_count)

@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         if db_file == ":memory:":
             # Enable multithreaded usage of the same in-memory db
             db = sqlite3.connect(
-                "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
+                _get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
             )
         else:
             db = sqlite3.connect(
@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):

         self.db.execute(insert_query)

+    def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
+        return (e.model_dump() for e in entries)
+
     def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
         rows = list(rows)
         if not rows:
datachain/data_storage/warehouse.py
CHANGED

@@ -20,7 +20,7 @@ from datachain.client import Client
 from datachain.data_storage.schema import convert_rows_custom_column_types
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import DatasetRecord
-from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
+from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
 from datachain.storage import StorageURI
@@ -34,6 +34,7 @@ if TYPE_CHECKING:
     from datachain.data_storage import AbstractIDGenerator, schema
     from datachain.data_storage.db_engine import DatabaseEngine
     from datachain.data_storage.schema import DataTable
+    from datachain.lib.file import File

 try:
     import numpy as np
@@ -401,25 +402,18 @@ class AbstractWarehouse(ABC, Serializable):
         expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
             sa.func.count(table.c.sys__id),
         )
-
-
-
-
+        size_columns = [
+            c for c in table.columns if c.name == "size" or c.name.endswith("__size")
+        ]
+        if size_columns:
+            expressions = (*expressions, sa.func.sum(sum(size_columns)))
         query = select(*expressions)
         ((nrows, *rest),) = self.db.execute(query)
-        return nrows, rest[0] if rest else
-
-    def prepare_entries(
-        self, uri: str, entries: Iterable[Entry]
-    ) -> list[dict[str, Any]]:
-        """
-        Prepares bucket listing entry (row) for inserting into database
-        """
-
-        def _prepare_entry(entry: Entry):
-            return attrs.asdict(entry) | {"source": uri}
+        return nrows, rest[0] if rest else 0

-
+    @abstractmethod
+    def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
+        """Convert File entries so they can be passed on to `insert_rows()`"""

     @abstractmethod
     def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
datachain/lib/listing.py
CHANGED
@@ -30,8 +30,7 @@ def list_bucket(uri: str, client_config=None) -> Callable:
         config = client_config or {}
         client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
         for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
-            for entry in entries:
-                yield entry.to_file(client.uri)
+            yield from entries

     return list_func

datachain/lib/model_store.py
CHANGED
@@ -1,6 +1,6 @@
 import inspect
 import logging
-from typing import ClassVar, Optional
+from typing import Any, ClassVar, Optional

 from pydantic import BaseModel

@@ -69,7 +69,7 @@ class ModelStore:
             del cls.store[fr.__name__][version]

     @staticmethod
-    def is_pydantic(val):
+    def is_pydantic(val: Any) -> bool:
         return (
             not hasattr(val, "__origin__")
             and inspect.isclass(val)
datachain/lib/pytorch.py
CHANGED
@@ -7,6 +7,7 @@ from torch import float32
 from torch.distributed import get_rank, get_world_size
 from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
+from tqdm import tqdm

 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
@@ -93,33 +94,38 @@ class PytorchDataset(IterableDataset):
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)
         ds = ds.chunk(total_rank, total_workers)
-        for
-
-        for
-
-
-
-
-
-
-
-
+        desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
+        with tqdm(desc=desc, unit=" rows") as pbar:
+            for row_features in ds.collect():
+                row = []
+                for fr in row_features:
+                    if hasattr(fr, "read"):
+                        row.append(fr.read())  # type: ignore[unreachable]
+                    else:
+                        row.append(fr)
+                # Apply transforms
+                if self.transform:
+                    try:
+                        if isinstance(self.transform, v2.Transform):
+                            row = self.transform(row)
+                        for i, val in enumerate(row):
+                            if isinstance(val, Image.Image):
+                                row[i] = self.transform(val)
+                    except ValueError:
+                        logger.warning(
+                            "Skipping transform due to unsupported data types."
+                        )
+                        self.transform = None
+                if self.tokenizer:
                     for i, val in enumerate(row):
-            if isinstance(val,
-
-
-
-
-
-
-                isinstance(val, list) and isinstance(val[0], str)
-            ):
-                row[i] = convert_text(
-                    val, self.tokenizer, self.tokenizer_kwargs
-                ).squeeze(0)  # type: ignore[union-attr]
-            yield row
+                        if isinstance(val, str) or (
+                            isinstance(val, list) and isinstance(val[0], str)
+                        ):
+                            row[i] = convert_text(
+                                val, self.tokenizer, self.tokenizer_kwargs
+                            ).squeeze(0)  # type: ignore[union-attr]
+                yield row
+                pbar.update(1)

     @staticmethod
     def get_rank_and_workers() -> tuple[int, int]:
datachain/lib/signal_schema.py
CHANGED
@@ -4,11 +4,14 @@ from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from datetime import datetime
 from inspect import isclass
-from typing import (
+from typing import (  # noqa: UP035
     TYPE_CHECKING,
     Annotated,
     Any,
     Callable,
+    Dict,
+    Final,
+    List,
     Literal,
     Optional,
     Union,
@@ -42,8 +45,13 @@ NAMES_TO_TYPES = {
     "dict": dict,
     "bytes": bytes,
     "datetime": datetime,
-    "
+    "Final": Final,
     "Union": Union,
+    "Optional": Optional,
+    "List": list,
+    "Dict": dict,
+    "Literal": Any,
+    "Any": Any,
 }


@@ -146,35 +154,11 @@ class SignalSchema:
         return SignalSchema(signals)

     @staticmethod
-    def
-
-        based on whether the type is Optional or not."""
-        orig = get_origin(fr_type)
-        args = get_args(fr_type)
-        # Check if fr_type is Optional
-        if orig == Union and len(args) == 2 and (type(None) in args):
-            fr_type = args[0]
-            orig = get_origin(fr_type)
-        if orig in (Literal, LiteralEx):
-            # Literal has no __name__ in Python 3.9
-            type_name = "Literal"
-        elif orig == Union:
-            # Union also has no __name__ in Python 3.9
-            type_name = "Union"
-        else:
-            type_name = str(fr_type.__name__)  # type: ignore[union-attr]
-        return type_name, fr_type
-
-    @staticmethod
-    def serialize_custom_model_fields(
-        name: str, fr: type, custom_types: dict[str, Any]
+    def _serialize_custom_model_fields(
+        version_name: str, fr: type[BaseModel], custom_types: dict[str, Any]
     ) -> str:
         """This serializes any custom type information to the provided custom_types
-        dict, and returns the name of the type
-        if hasattr(fr, "__origin__") or not issubclass(fr, BaseModel):
-            # Don't store non-feature types.
-            return name
-        version_name = ModelStore.get_name(fr)
+        dict, and returns the name of the type serialized."""
         if version_name in custom_types:
             # This type is already stored in custom_types.
             return version_name
@@ -183,37 +167,102 @@ class SignalSchema:
             field_type = info.annotation
             # All fields should be typed.
             assert field_type
-
-                field_type
-            )
-            # Serialize this type to custom_types if it is a custom type as well.
-            fields[field_name] = SignalSchema.serialize_custom_model_fields(
-                field_type_name, field_type, custom_types
-            )
+            fields[field_name] = SignalSchema._serialize_type(field_type, custom_types)
         custom_types[version_name] = fields
         return version_name

+    @staticmethod
+    def _serialize_type(fr: type, custom_types: dict[str, Any]) -> str:
+        """Serialize a given type to a string, including automatic ModelStore
+        registration, and save this type and subtypes to custom_types as well."""
+        subtypes: list[Any] = []
+        type_name = SignalSchema._type_to_str(fr, subtypes)
+        # Iterate over all subtypes (includes the input type).
+        for st in subtypes:
+            if st is None or not ModelStore.is_pydantic(st):
+                continue
+            # Register and save feature types.
+            ModelStore.register(st)
+            st_version_name = ModelStore.get_name(st)
+            if st is fr:
+                # If the main type is Pydantic, then use the ModelStore version name.
+                type_name = st_version_name
+            # Save this type to custom_types.
+            SignalSchema._serialize_custom_model_fields(
+                st_version_name, st, custom_types
+            )
+        return type_name
+
     def serialize(self) -> dict[str, Any]:
         signals: dict[str, Any] = {}
         custom_types: dict[str, Any] = {}
         for name, fr_type in self.values.items():
-
-            ModelStore.register(fr)
-            signals[name] = ModelStore.get_name(fr)
-            type_name, fr_type = SignalSchema._get_name_original_type(fr)
-        else:
-            type_name, fr_type = SignalSchema._get_name_original_type(fr_type)
-            signals[name] = type_name
-        self.serialize_custom_model_fields(type_name, fr_type, custom_types)
+            signals[name] = self._serialize_type(fr_type, custom_types)
         if custom_types:
             signals["_custom_types"] = custom_types
         return signals

     @staticmethod
-    def
+    def _split_subtypes(type_name: str) -> list[str]:
+        """This splits a list of subtypes, including proper square bracket handling."""
+        start = 0
+        depth = 0
+        subtypes = []
+        for i, c in enumerate(type_name):
+            if c == "[":
+                depth += 1
+            elif c == "]":
+                if depth == 0:
+                    raise TypeError(
+                        "Extra closing square bracket when parsing subtype list"
+                    )
+                depth -= 1
+            elif c == "," and depth == 0:
+                subtypes.append(type_name[start:i].strip())
+                start = i + 1
+        if depth > 0:
+            raise TypeError("Unclosed square bracket when parsing subtype list")
+        subtypes.append(type_name[start:].strip())
+        return subtypes
+
+    @staticmethod
+    def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:  # noqa: PLR0911
         """Convert a string-based type back into a python type."""
+        type_name = type_name.strip()
+        if not type_name:
+            raise TypeError("Type cannot be empty")
+        if type_name == "NoneType":
+            return None
+
+        bracket_idx = type_name.find("[")
+        subtypes: Optional[tuple[Optional[type], ...]] = None
+        if bracket_idx > -1:
+            if bracket_idx == 0:
+                raise TypeError("Type cannot start with '['")
+            close_bracket_idx = type_name.rfind("]")
+            if close_bracket_idx == -1:
+                raise TypeError("Unclosed square bracket when parsing type")
+            if close_bracket_idx < bracket_idx:
+                raise TypeError("Square brackets are out of order when parsing type")
+            if close_bracket_idx == bracket_idx + 1:
+                raise TypeError("Empty square brackets when parsing type")
+            subtype_names = SignalSchema._split_subtypes(
+                type_name[bracket_idx + 1 : close_bracket_idx]
+            )
+            # Types like Union require the parameters to be a tuple of types.
+            subtypes = tuple(
+                SignalSchema._resolve_type(st, custom_types) for st in subtype_names
+            )
+            type_name = type_name[:bracket_idx].strip()
+
         fr = NAMES_TO_TYPES.get(type_name)
         if fr:
+            if subtypes:
+                if len(subtypes) == 1:
+                    # Types like Optional require there to be only one argument.
+                    return fr[subtypes[0]]  # type: ignore[index]
+                # Other types like Union require the parameters to be a tuple of types.
+                return fr[subtypes]  # type: ignore[index]
             return fr  # type: ignore[return-value]

         model_name, version = ModelStore.parse_name_version(type_name)
@@ -228,7 +277,14 @@ class SignalSchema:
                 for field_name, field_type_str in fields.items()
             }
             return create_feature_model(type_name, fields)
-
+        # This can occur if a third-party or custom type is used, which is not available
+        # when deserializing.
+        warnings.warn(
+            f"Could not resolve type: '{type_name}'.",
+            SignalSchemaWarning,
+            stacklevel=2,
+        )
+        return Any  # type: ignore[return-value]

     @staticmethod
     def deserialize(schema: dict[str, Any]) -> "SignalSchema":
@@ -242,9 +298,14 @@ class SignalSchema:
                 # This entry is used as a lookup for custom types,
                 # and is not an actual field.
                 continue
+            if not isinstance(type_name, str):
+                raise SignalSchemaError(
+                    f"cannot deserialize '{type_name}': "
+                    "serialized types must be a string"
+                )
             try:
                 fr = SignalSchema._resolve_type(type_name, custom_types)
-                if fr is
+                if fr is Any:
                     # Skip if the type is not found, so all data can be displayed.
                     warnings.warn(
                         f"In signal '{signal}': "
@@ -258,7 +319,7 @@ class SignalSchema:
                 raise SignalSchemaError(
                     f"cannot deserialize '{signal}': {err}"
                 ) from err
-            signals[signal] = fr
+            signals[signal] = fr  # type: ignore[assignment]

         return SignalSchema(signals)

@@ -509,31 +570,58 @@ class SignalSchema:
         return self.values.pop(name)

     @staticmethod
-    def _type_to_str(type_):  # noqa: PLR0911
+    def _type_to_str(type_: Optional[type], subtypes: Optional[list] = None) -> str:  # noqa: PLR0911
+        """Convert a type to a string-based representation."""
+        if type_ is None:
+            return "NoneType"
+
         origin = get_origin(type_)

         if origin == Union:
             args = get_args(type_)
-            formatted_types = ", ".join(
+            formatted_types = ", ".join(
+                SignalSchema._type_to_str(arg, subtypes) for arg in args
+            )
             return f"Union[{formatted_types}]"
         if origin == Optional:
             args = get_args(type_)
-            type_str = SignalSchema._type_to_str(args[0])
+            type_str = SignalSchema._type_to_str(args[0], subtypes)
             return f"Optional[{type_str}]"
-        if origin
+        if origin in (list, List):  # noqa: UP006
             args = get_args(type_)
-            type_str = SignalSchema._type_to_str(args[0])
+            type_str = SignalSchema._type_to_str(args[0], subtypes)
             return f"list[{type_str}]"
-        if origin
+        if origin in (dict, Dict):  # noqa: UP006
             args = get_args(type_)
-            type_str =
-
+            type_str = (
+                SignalSchema._type_to_str(args[0], subtypes) if len(args) > 0 else ""
+            )
+            vals = (
+                f", {SignalSchema._type_to_str(args[1], subtypes)}"
+                if len(args) > 1
+                else ""
+            )
             return f"dict[{type_str}{vals}]"
         if origin == Annotated:
             args = get_args(type_)
-            return SignalSchema._type_to_str(args[0])
-        if origin in (Literal, LiteralEx):
+            return SignalSchema._type_to_str(args[0], subtypes)
+        if origin in (Literal, LiteralEx) or type_ in (Literal, LiteralEx):
             return "Literal"
+        if Any in (origin, type_):
+            return "Any"
+        if Final in (origin, type_):
+            return "Final"
+        if subtypes is not None:
+            # Include this type in the list of all subtypes, if requested.
+            subtypes.append(type_)
+        if not hasattr(type_, "__name__"):
+            # This can happen for some third-party or custom types, mostly on Python 3.9
+            warnings.warn(
+                f"Unable to determine name of type '{type_}'.",
+                SignalSchemaWarning,
+                stacklevel=2,
+            )
+            return "Any"
         return type_.__name__

     @staticmethod
datachain/listing.py
CHANGED
@@ -9,7 +9,8 @@ from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm

-from datachain.node import DirType, Entry, Node, NodeWithPath
+from datachain.lib.file import File
+from datachain.node import DirType, Node, NodeWithPath
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number

@@ -80,16 +81,13 @@ class Listing:
         finally:
             fetch_listing.insert_entries_done()

-    def insert_entry(self, entry: Entry) -> None:
-        self.warehouse.insert_rows(
-            self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(self.client.uri, [entry]),
-        )
+    def insert_entry(self, entry: File) -> None:
+        self.insert_entries([entry])

-    def insert_entries(self, entries: Iterable[Entry]) -> None:
+    def insert_entries(self, entries: Iterable[File]) -> None:
         self.warehouse.insert_rows(
             self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(self.client.uri, entries),
+            self.warehouse.prepare_entries(entries),
         )

     def insert_entries_done(self) -> None:
datachain/node.py
CHANGED
@@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any, Optional
 import attrs

 from datachain.cache import UniqueId
-from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str

@@ -139,48 +138,6 @@ class Node:
         return split[0]


-@attrs.define
-class Entry:
-    path: str = ""
-    etag: str = ""
-    version: str = ""
-    is_latest: bool = True
-    last_modified: Optional[datetime] = None
-    size: int = 0
-    location: Optional[str] = None
-
-    @classmethod
-    def from_file(cls, path: str, **kwargs) -> "Entry":
-        return cls(path=path, **kwargs)
-
-    @property
-    def full_path(self) -> str:
-        return self.path
-
-    @property
-    def name(self):
-        return self.path.rsplit("/", 1)[-1]
-
-    @property
-    def parent(self):
-        split = self.path.rsplit("/", 1)
-        if len(split) <= 1:
-            return ""
-        return split[0]
-
-    def to_file(self, source: str) -> File:
-        return File(
-            source=source,
-            path=self.path,
-            size=self.size,
-            version=self.version,
-            etag=self.etag,
-            is_latest=self.is_latest,
-            last_modified=self.last_modified,
-            location=self.location,
-        )
-
-
 def get_path(parent: str, name: str):
     return f"{parent}/{name}" if parent else name
{datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
-datachain/asyn.py,sha256=
+datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
 datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
 datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
@@ -8,8 +8,8 @@ datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
 datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
-datachain/listing.py,sha256=
-datachain/node.py,sha256=
+datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
+datachain/node.py,sha256=2pF3Y9oYzElfiUBcw2LIv7LNNt--V4E-K021zjv0b0I,4748
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
@@ -17,17 +17,17 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=7yl_WMGS6CfOc_G2MCbVVkdAfAlcZb2gC_PvXzBnoJ0,69344
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
-datachain/client/azure.py,sha256=
+datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
-datachain/client/gcs.py,sha256=
-datachain/client/hf.py,sha256=
-datachain/client/local.py,sha256=
-datachain/client/s3.py,sha256=
+datachain/client/fsspec.py,sha256=S93K9bS76MGcLYgWKVZiPVivbMElJ9Fq1w67I8BCR-g,13311
+datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
+datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
+datachain/client/local.py,sha256=LTyISV4oNSOPUdsai5eNZYCGXNCn8rNGuAI0bdgbtnU,5006
+datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
@@ -35,8 +35,8 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
 datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=yooLHQXrpoqDguGlF0SGcCiMU1T82OEc4wr1ra8eBHo,28285
+datachain/data_storage/warehouse.py,sha256=Pq6Nt3fyz1WFv6Mdtv2ZUr0_GFCNbafbtS4PdibblUg,32507
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
@@ -46,13 +46,13 @@ datachain/lib/dc.py,sha256=C-sfWRinV8pDK2P6UHLbScOahTlTiVQpoxUUdVllF2k,68710
 datachain/lib/file.py,sha256=rXmyzUFgnLQ4J3CyOCcg-guhzAz4x9Ug595FbNn4Y2E,11398
 datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=
+datachain/lib/listing.py,sha256=mt-dsYfYFMPHN3zXnkohBHuueY-4tiNGPkcDYkKB0lY,3887
 datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
 datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
-datachain/lib/model_store.py,sha256=
-datachain/lib/pytorch.py,sha256=
+datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
+datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=vb4yCC90_pEngiu9Irc02kCPyqBxkrFDL4TKr7UMY5U,23808
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -96,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.14.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.14.dist-info/METADATA,sha256=bItmxEsx2MEsJ78Mu1yjO-PX-RkDuWHMESoPuGiJgxw,17073
+datachain-0.3.14.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.14.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.14.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.14.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes