datachain 0.2.18__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- datachain/cache.py +5 -10
- datachain/catalog/catalog.py +10 -20
- datachain/client/azure.py +5 -12
- datachain/client/fsspec.py +6 -10
- datachain/client/gcs.py +4 -14
- datachain/client/local.py +4 -11
- datachain/client/s3.py +4 -8
- datachain/data_storage/schema.py +7 -15
- datachain/data_storage/warehouse.py +34 -45
- datachain/lib/dc.py +8 -6
- datachain/lib/file.py +19 -18
- datachain/lib/udf.py +21 -14
- datachain/lib/webdataset.py +2 -3
- datachain/listing.py +14 -20
- datachain/node.py +32 -21
- datachain/query/batch.py +45 -41
- datachain/query/builtins.py +5 -12
- datachain/query/dataset.py +15 -8
- datachain/query/dispatch.py +53 -68
- datachain/query/queue.py +120 -0
- datachain/query/schema.py +3 -7
- datachain/query/udf.py +23 -8
- datachain/utils.py +17 -2
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/METADATA +1 -1
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/RECORD +29 -28
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/LICENSE +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/WHEEL +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/top_level.txt +0 -0
datachain/lib/file.py
CHANGED
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from datetime import datetime
 from io import BytesIO
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
@@ -111,8 +111,7 @@ class File(DataModel):
     """`DataModel` for reading binary files."""
 
     source: str = Field(default="")
-    parent: str = Field(default="")
-    name: str
+    path: str
     size: int = Field(default=0)
     version: str = Field(default="")
     etag: str = Field(default="")
@@ -123,8 +122,7 @@ class File(DataModel):
 
     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
-        "parent": String,
-        "name": String,
+        "path": String,
         "size": Int,
         "version": String,
         "etag": String,
@@ -136,8 +134,7 @@ class File(DataModel):
 
     _unique_id_keys: ClassVar[list[str]] = [
        "source",
-        "parent",
-        "name",
+        "path",
        "size",
        "etag",
        "version",
@@ -168,11 +165,9 @@ class File(DataModel):
     def validate_location(cls, v):
         return File._validate_dict(v)
 
-    @field_validator("parent", "name", mode="before")
+    @field_validator("path", mode="before")
     @classmethod
     def validate_path(cls, path):
-        if path == "":
-            return ""
         return Path(path).as_posix()
 
     def model_dump_custom(self):
@@ -185,6 +180,14 @@ class File(DataModel):
         self._catalog = None
         self._caching_enabled = False
 
+    @property
+    def name(self):
+        return PurePosixPath(self.path).name
+
+    @property
+    def parent(self):
+        return str(PurePosixPath(self.path).parent)
+
     @contextmanager
     def open(self, mode: Literal["rb", "r"] = "rb"):
         """Open the file and return a file object."""
@@ -261,19 +264,19 @@ class File(DataModel):
 
     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
-        return Path(self.name).suffix
+        return PurePosixPath(self.path).suffix
 
     def get_file_ext(self):
         """Returns last part of file name without `.`."""
-        return Path(self.name).suffix.strip(".")
+        return PurePosixPath(self.path).suffix.strip(".")
 
     def get_file_stem(self):
         """Returns file name without extension."""
-        return Path(self.name).stem
+        return PurePosixPath(self.path).stem
 
     def get_full_name(self):
         """Returns name with parent directories."""
-        return f"{self.parent}/{self.name}" if self.parent else self.name
+        return self.path
 
     def get_uri(self):
         """Returns file URI."""
@@ -355,8 +358,7 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
 
     def get_file_type(
         source: str,
-        parent: str,
-        name: str,
+        path: str,
         size: int,
         version: str,
         etag: str,
@@ -367,8 +369,7 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
     ) -> file:  # type: ignore[valid-type]
         return file(
             source=source,
-            parent=parent,
-            name=name,
+            path=path,
             size=size,
             version=version,
             etag=etag,
datachain/lib/udf.py
CHANGED
@@ -1,6 +1,5 @@
 import sys
 import traceback
-from collections.abc import Iterable, Iterator
 from typing import TYPE_CHECKING, Callable, Optional
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -14,16 +13,19 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query.batch import RowBatch
+from datachain.query.batch import UDFInputBatch
 from datachain.query.schema import ColumnParameter
 from datachain.query.udf import UDFBase as _UDFBase
-from datachain.query.udf import UDFProperties, UDFResult
+from datachain.query.udf import UDFProperties
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable, Iterator, Sequence
+
     from typing_extensions import Self
 
     from datachain.catalog import Catalog
-    from datachain.query.batch import BatchingResult
+    from datachain.query.batch import RowsOutput, UDFInput
+    from datachain.query.udf import UDFResult
 
 
 class UdfError(DataChainParamsError):
@@ -42,22 +44,27 @@ class UDFAdapter(_UDFBase):
 
     def run(
         self,
-        udf_inputs: "Iterable[BatchingResult]",
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterator[Iterable[UDFResult]]:
+    ) -> "Iterator[Iterable[UDFResult]]":
         self.inner._catalog = catalog
         if hasattr(self.inner, "setup") and callable(self.inner.setup):
             self.inner.setup()
 
-        for batch in udf_inputs:
-            n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
-            output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
-            processed_cb.relative_update(n_rows)
-            yield output
+        yield from super().run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            is_generator,
+            cache,
+            download_cb,
+            processed_cb,
+        )
 
         if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
             self.inner.teardown()
@@ -65,12 +72,12 @@ class UDFAdapter(_UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "BatchingResult",
+        arg: "UDFInput",
        is_generator: bool = False,
        cache: bool = False,
        cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg, RowBatch):
+    ) -> "Iterable[UDFResult]":
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
datachain/lib/webdataset.py
CHANGED
@@ -119,7 +119,7 @@ class Builder:
         return self._tar.extractfile(item).read().decode(self._encoding)
 
     def add(self, file: tarfile.TarInfo):
-        fstream = File(name=file.name)
+        fstream = File(path=file.name)
         ext = fstream.get_file_ext()
         stem = fstream.get_file_stem()
 
@@ -176,9 +176,8 @@ class Builder:
         )
         etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
         return File(
-            name=core_file.name,
             source=self._tar_stream.source,
-            parent=new_parent,
+            path=f"{new_parent}/{core_file.name}",
             version=self._tar_stream.version,
             size=core_file.size,
             etag=etag,
datachain/listing.py
CHANGED
@@ -5,11 +5,12 @@ from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
 
 from fsspec.asyn import get_loop, sync
-from sqlalchemy import Column, case
+from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm
 
 from datachain.node import DirType, Entry, Node, NodeWithPath
+from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number
 
 if TYPE_CHECKING:
@@ -129,7 +130,7 @@ class Listing:
         dir_path = []
         if not copy_dir_contents:
             dir_path.append(node.name)
-        subtree_nodes = src.find(sort=["parent", "name"])
+        subtree_nodes = src.find(sort=["path"])
         all_nodes.extend(
             NodeWithPath(n.n, path=dir_path + n.path) for n in subtree_nodes
         )
@@ -148,8 +149,7 @@ class Listing:
         elif from_dataset:
             node_path = [
                 src.listing.client.name,
-                node.parent,
-                node.name,
+                node.path,
             ]
         else:
             node_path = [node.name]
@@ -201,25 +201,19 @@ class Listing:
         dr = self.dataset_rows
         conds = []
         if names:
-            f = Column("name").op("GLOB")
-            conds.extend(f(name) for name in names)
+            for name in names:
+                conds.append(pathfunc.name(Column("path")).op("GLOB")(name))
         if inames:
-            f = func.lower(Column("name")).op("GLOB")
-            conds.extend(f(iname.lower()) for iname in inames)
+            for iname in inames:
+                conds.append(
+                    func.lower(pathfunc.name(Column("path"))).op("GLOB")(iname.lower())
+                )
         if paths:
-            node_path = case(
-                (Column("parent") == "", Column("name")),
-                else_=Column("parent") + "/" + Column("name"),
-            )
-            f = node_path.op("GLOB")
-            conds.extend(f(path) for path in paths)
+            for path in paths:
+                conds.append(Column("path").op("GLOB")(path))
         if ipaths:
-            node_path = case(
-                (Column("parent") == "", Column("name")),
-                else_=Column("parent") + "/" + Column("name"),
-            )
-            f = func.lower(node_path).op("GLOB")
-            conds.extend(f(ipath.lower()) for ipath in ipaths)
+            for ipath in ipaths:
+                conds.append(func.lower(Column("path")).op("GLOB")(ipath.lower()))
 
         if size is not None:
             size_limit = suffix_to_number(size)
datachain/node.py
CHANGED
@@ -50,8 +50,7 @@ class Node:
     sys__rand: int = -1
     vtype: str = ""
     dir_type: Optional[int] = None
-    parent: str = ""
-    name: str = ""
+    path: str = ""
     etag: str = ""
     version: Optional[str] = None
     is_latest: bool = True
@@ -62,10 +61,6 @@ class Node:
     location: Optional[str] = None
     source: StorageURI = StorageURI("")
 
-    @property
-    def path(self) -> str:
-        return f"{self.parent}/{self.name}" if self.parent else self.name
-
     @property
     def is_dir(self) -> bool:
         return self.dir_type == DirType.DIR
@@ -107,13 +102,12 @@ class Node:
             return self.path + "/"
         return self.path
 
-    def as_uid(self, storage: Optional[StorageURI] = None):
+    def as_uid(self, storage: Optional[StorageURI] = None) -> UniqueId:
         if storage is None:
             storage = self.source
         return UniqueId(
             storage=storage,
-            parent=self.parent,
-            name=self.name,
+            path=self.path,
             size=self.size,
             version=self.version or "",
             etag=self.etag,
@@ -129,20 +123,30 @@ class Node:
         return cls(**kw)
 
     @classmethod
-    def from_dir(cls, parent, name, **kwargs) -> "Node":
-        return cls(sys__id=-1, dir_type=DirType.DIR, parent=parent, name=name, **kwargs)
+    def from_dir(cls, path, **kwargs) -> "Node":
+        return cls(sys__id=-1, dir_type=DirType.DIR, path=path, **kwargs)
 
     @classmethod
     def root(cls) -> "Node":
         return cls(sys__id=-1, dir_type=DirType.DIR)
 
+    @property
+    def name(self):
+        return self.path.rsplit("/", 1)[-1]
+
+    @property
+    def parent(self):
+        split = self.path.rsplit("/", 1)
+        if len(split) <= 1:
+            return ""
+        return split[0]
+
 
 @attrs.define
 class Entry:
     vtype: str = ""
     dir_type: Optional[int] = None
-    parent: str = ""
-    name: str = ""
+    path: str = ""
     etag: str = ""
     version: str = ""
     is_latest: bool = True
@@ -157,27 +161,34 @@ class Entry:
         return self.dir_type == DirType.DIR
 
     @classmethod
-    def from_dir(cls, parent: str, name: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.DIR, parent=parent, name=name, **kwargs)
+    def from_dir(cls, path: str, **kwargs) -> "Entry":
+        return cls(dir_type=DirType.DIR, path=path, **kwargs)
 
     @classmethod
-    def from_file(cls, parent: str, name: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.FILE, parent=parent, name=name, **kwargs)
+    def from_file(cls, path: str, **kwargs) -> "Entry":
+        return cls(dir_type=DirType.FILE, path=path, **kwargs)
 
     @classmethod
     def root(cls):
         return cls(dir_type=DirType.DIR)
 
-    @property
-    def path(self) -> str:
-        return f"{self.parent}/{self.name}" if self.parent else self.name
-
     @property
     def full_path(self) -> str:
         if self.is_dir and self.path:
             return self.path + "/"
         return self.path
 
+    @property
+    def name(self):
+        return self.path.rsplit("/", 1)[-1]
+
+    @property
+    def parent(self):
+        split = self.path.rsplit("/", 1)
+        if len(split) <= 1:
+            return ""
+        return split[0]
+
 
 def get_path(parent: str, name: str):
     return f"{parent}/{name}" if parent else name
datachain/query/batch.py
CHANGED
@@ -5,21 +5,29 @@ from collections.abc import Generator, Sequence
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable, Optional, Union
 
-import sqlalchemy as sa
-
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
 from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 
 if TYPE_CHECKING:
+    from sqlalchemy import Select
+
     from datachain.dataset import RowDict
 
 
 @dataclass
-class RowBatch:
+class RowsOutputBatch:
+    rows: Sequence[Sequence]
+
+
+RowsOutput = Union[Sequence, RowsOutputBatch]
+
+
+@dataclass
+class UDFInputBatch:
     rows: Sequence["RowDict"]
 
 
-BatchingResult = Union["RowDict", RowBatch]
+UDFInput = Union["RowDict", UDFInputBatch]
 
 
 class BatchingStrategy(ABC):
@@ -28,9 +36,9 @@ class BatchingStrategy(ABC):
     @abstractmethod
     def __call__(
         self,
-        execute: Callable,
-        query: "sa.Select",
-    ) -> Generator[BatchingResult, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""
 
 
@@ -42,10 +50,10 @@ class NoBatching(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query: "sa.Select",
-    ) -> Generator["RowDict", None, None]:
-        return execute(query, limit=query._limit, order_by=query._order_by_clauses)
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[Sequence, None, None]:
+        return execute(query)
 
 
 class Batch(BatchingStrategy):
@@ -59,31 +67,24 @@ class Batch(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query: "sa.Select",
-    ) -> Generator[RowBatch, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
 
         # select rows in batches
-        results: list["RowDict"] = []
-
-        with contextlib.closing(
-            execute(
-                query,
-                page_size=page_size,
-                limit=query._limit,
-                order_by=query._order_by_clauses,
-            )
-        ) as rows:
+        results: list[Sequence] = []
+
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
             for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield RowBatch(batch)
+                    yield RowsOutputBatch(batch)
 
         if len(results) > 0:
-            yield RowBatch(results)
+            yield RowsOutputBatch(results)
 
 
 class Partition(BatchingStrategy):
@@ -95,27 +96,30 @@ class Partition(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query: "sa.Select",
-    ) -> Generator[RowBatch, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         current_partition: Optional[int] = None
-        batch: list["RowDict"] = []
-
-        with contextlib.closing(
-            execute(
-                query,
-                order_by=(PARTITION_COLUMN_ID, "sys__id", *query._order_by_clauses),
-                limit=query._limit,
-            )
-        ) as rows:
+        batch: list[Sequence] = []
+
+        query_fields = [str(c.name) for c in query.selected_columns]
+        partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)
+
+        ordered_query = query.order_by(None).order_by(
+            PARTITION_COLUMN_ID,
+            "sys__id",
+            *query._order_by_clauses,
+        )
+
+        with contextlib.closing(execute(ordered_query)) as rows:
             for row in rows:
-                partition = row[PARTITION_COLUMN_ID]
+                partition = row[partition_column_idx]
                 if current_partition != partition:
                     current_partition = partition
                     if len(batch) > 0:
-                        yield RowBatch(batch)
+                        yield RowsOutputBatch(batch)
                     batch = []
                 batch.append(row)
 
         if len(batch) > 0:
-            yield RowBatch(batch)
+            yield RowsOutputBatch(batch)
datachain/query/builtins.py
CHANGED
@@ -20,8 +20,7 @@ def load_tar(raw):
 @udf(
     (
         C.source,
-        C.name,
-        C.parent,
+        C.path,
         C.size,
         C.vtype,
         C.dir_type,
@@ -37,8 +36,7 @@ def load_tar(raw):
 )
 def index_tar(
     source,
-    name,
-    parent,
+    parent_path,
     size,
     vtype,
     dir_type,
@@ -52,9 +50,8 @@ def index_tar(
 ):
     # generate original tar files as well, along with subobjects
     yield DatasetRow.create(
-        name,
         source=source,
-        parent=parent,
+        path=parent_path,
         size=size,
         vtype=vtype,
         dir_type=dir_type,
@@ -66,15 +63,12 @@ def index_tar(
         etag=etag,
     )
 
-    parent_path = name if not parent else f"{parent}/{name}"
     for info in tar_entries:
         if info.isfile():
             full_path = f"{parent_path}/{info.name}"
-            parent_dir, subobject_name = full_path.rsplit("/", 1)
             yield DatasetRow.create(
-                subobject_name,
                 source=source,
-                parent=parent_dir,
+                path=full_path,
                 size=info.size,
                 vtype="tar",
                 location={
@@ -83,8 +77,7 @@ def index_tar(
                     "size": info.size,
                     "parent": {
                         "source": source,
-                        "parent": parent,
-                        "name": name,
+                        "path": parent_path,
                         "version": version,
                         "size": size,
                         "etag": etag,
datachain/query/dataset.py
CHANGED
@@ -307,7 +307,7 @@ class Subtract(DatasetDiffOperation):
 class Changed(DatasetDiffOperation):
     """
     Calculates rows that are changed in a source query compared to target query
-    Changed means it has same source + parent + name but different last_modified
+    Changed means it has same source + path but different last_modified
     Example:
     >>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
     >>> ds_updated = (
@@ -461,6 +461,8 @@ class UDFStep(Step, ABC):
 
         processes = determine_processes(self.parallel)
 
+        udf_fields = [str(c.name) for c in query.selected_columns]
+
         try:
             if workers:
                 from datachain.catalog.loader import get_distributed_class
@@ -473,6 +475,7 @@ class UDFStep(Step, ABC):
                     query,
                     workers,
                     processes,
+                    udf_fields=udf_fields,
                     is_generator=self.is_generator,
                     use_partitioning=use_partitioning,
                     cache=self.cache,
@@ -489,6 +492,7 @@ class UDFStep(Step, ABC):
                 "warehouse_clone_params": self.catalog.warehouse.clone_params(),
                 "table": udf_table,
                 "query": query,
+                "udf_fields": udf_fields,
                 "batching": batching,
                 "processes": processes,
                 "is_generator": self.is_generator,
@@ -528,6 +532,7 @@ class UDFStep(Step, ABC):
             generated_cb = get_generated_callback(self.is_generator)
             try:
                 udf_results = udf.run(
+                    udf_fields,
                     udf_inputs,
                     self.catalog,
                     self.is_generator,
@@ -1244,21 +1249,23 @@ class DatasetQuery:
         actual_params = [normalize_param(p) for p in params]
         try:
             query = self.apply_steps().select()
+            query_fields = [str(c.name) for c in query.selected_columns]
 
-            def row_iter() -> Generator["RowDict", None, None]:
+            def row_iter() -> Generator[Sequence, None, None]:
                 # warehouse isn't threadsafe, we need to clone() it
                 # in the thread that uses the results
                 with self.catalog.warehouse.clone() as warehouse:
-                    gen = warehouse.dataset_select_paginated(
-                        query, limit=query._limit, order_by=query._order_by_clauses
-                    )
+                    gen = warehouse.dataset_select_paginated(query)
                     with contextlib.closing(gen) as rows:
                         yield from rows
 
-            async def get_params(row: "RowDict") -> tuple:
+            async def get_params(row: Sequence) -> tuple:
+                row_dict = RowDict(zip(query_fields, row))
                 return tuple(
                     [
-                        await p.get_value_async(self.catalog, row, mapper, **kwargs)
+                        await p.get_value_async(
+                            self.catalog, row_dict, mapper, **kwargs
+                        )
                         for p in actual_params
                     ]
                 )
@@ -1526,7 +1533,7 @@ class DatasetQuery:
 
     @detach
     def subtract(self, dq: "DatasetQuery") -> "Self":
-        return self._subtract(dq, on=["source", "parent", "name"])
+        return self._subtract(dq, on=["source", "path"])
 
     @detach
     def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":