datachain 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/asyn.py +16 -6
- datachain/cache.py +32 -10
- datachain/catalog/catalog.py +17 -1
- datachain/client/azure.py +6 -2
- datachain/client/fsspec.py +1 -1
- datachain/client/gcs.py +6 -2
- datachain/client/s3.py +22 -4
- datachain/data_storage/db_engine.py +9 -0
- datachain/data_storage/schema.py +4 -10
- datachain/data_storage/sqlite.py +7 -1
- datachain/data_storage/warehouse.py +6 -4
- datachain/{lib/diff.py → diff/__init__.py} +116 -12
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +31 -9
- datachain/lib/arrow.py +3 -1
- datachain/lib/dc.py +5 -3
- datachain/lib/file.py +15 -4
- datachain/lib/hf.py +1 -1
- datachain/lib/pytorch.py +57 -13
- datachain/lib/udf.py +82 -40
- datachain/listing.py +1 -0
- datachain/progress.py +18 -1
- datachain/query/dataset.py +122 -93
- datachain/query/dispatch.py +22 -16
- datachain/utils.py +13 -2
- {datachain-0.8.4.dist-info → datachain-0.8.6.dist-info}/METADATA +6 -6
- {datachain-0.8.4.dist-info → datachain-0.8.6.dist-info}/RECORD +31 -31
- {datachain-0.8.4.dist-info → datachain-0.8.6.dist-info}/WHEEL +1 -1
- {datachain-0.8.4.dist-info → datachain-0.8.6.dist-info}/LICENSE +0 -0
- {datachain-0.8.4.dist-info → datachain-0.8.6.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.4.dist-info → datachain-0.8.6.dist-info}/top_level.txt +0 -0
datachain/asyn.py
CHANGED
@@ -8,12 +8,14 @@ from collections.abc import (
     Iterable,
     Iterator,
 )
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, wait
 from heapq import heappop, heappush
 from typing import Any, Callable, Generic, Optional, TypeVar
 
 from fsspec.asyn import get_loop
 
+from datachain.utils import safe_closing
+
 ASYNC_WORKERS = 20
 
 InputT = TypeVar("InputT", contravariant=True)  # noqa: PLC0105
@@ -56,6 +58,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
         self.pool = ThreadPoolExecutor(workers)
         self._tasks: set[asyncio.Task] = set()
         self._shutdown_producer = threading.Event()
+        self._producer_is_shutdown = threading.Event()
 
     def start_task(self, coro: Coroutine) -> asyncio.Task:
         task = self.loop.create_task(coro)
@@ -64,11 +67,16 @@ class AsyncMapper(Generic[InputT, ResultT]):
         return task
 
     def _produce(self) -> None:
-
-
-
-
-
+        try:
+            with safe_closing(self.iterable):
+                for item in self.iterable:
+                    if self._shutdown_producer.is_set():
+                        return
+                    coro = self.work_queue.put(item)
+                    fut = asyncio.run_coroutine_threadsafe(coro, self.loop)
+                    fut.result()  # wait until the item is in the queue
+        finally:
+            self._producer_is_shutdown.set()
 
     async def produce(self) -> None:
         await self.to_thread(self._produce)
@@ -179,6 +187,8 @@ class AsyncMapper(Generic[InputT, ResultT]):
         self.shutdown_producer()
         if not async_run.done():
             async_run.cancel()
+            wait([async_run])
+        self._producer_is_shutdown.wait()
 
     def __iter__(self):
         return self.iterate()
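The `_produce`/shutdown change above adds a second event so the consumer can block until the producer thread has actually exited, instead of only cancelling the asyncio task. A minimal standalone sketch of that two-event handshake (the class and names below are illustrative, not datachain API):

    import threading

    class Producer:
        def __init__(self):
            self._shutdown_producer = threading.Event()     # consumer asks producer to stop
            self._producer_is_shutdown = threading.Event()  # producer confirms it has stopped

        def _produce(self, items):
            try:
                for item in items:
                    if self._shutdown_producer.is_set():
                        return
                    # ... hand `item` over to the consumer here ...
            finally:
                # always signal, even if iteration raised
                self._producer_is_shutdown.set()

        def stop(self):
            self._shutdown_producer.set()
            self._producer_is_shutdown.wait()  # do not return until the thread really stopped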
datachain/cache.py
CHANGED
@@ -1,8 +1,12 @@
 import os
+from collections.abc import Iterator
+from contextlib import contextmanager
+from tempfile import mkdtemp
 from typing import TYPE_CHECKING, Optional
 
 from dvc_data.hashfile.db.local import LocalHashFileDB
 from dvc_objects.fs.local import LocalFileSystem
+from dvc_objects.fs.utils import remove
 from fsspec.callbacks import Callback, TqdmCallback
 
 from .progress import Tqdm
@@ -20,6 +24,23 @@ def try_scandir(path):
         pass
 
 
+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "DataChainCache":
+    cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
+    return DataChainCache(cache_dir, tmp_dir=tmp_dir)
+
+
+@contextmanager
+def temporary_cache(
+    tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
+) -> Iterator["DataChainCache"]:
+    cache = get_temp_cache(tmp_dir, prefix=prefix)
+    try:
+        yield cache
+    finally:
+        if delete:
+            cache.destroy()
+
+
 class DataChainCache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
@@ -28,6 +49,9 @@ class DataChainCache:
             tmp_dir=tmp_dir,
         )
 
+    def __eq__(self, other) -> bool:
+        return self.odb == other.odb
+
     @property
     def cache_dir(self):
         return self.odb.path
@@ -63,7 +87,7 @@ class DataChainCache:
         if size < 0:
             size = await client.get_size(from_path, version_id=file.version)
         cb = callback or TqdmCallback(
-            tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
+            tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True, "leave": False},
             tqdm_cls=Tqdm,
             size=size,
         )
@@ -82,20 +106,18 @@ class DataChainCache:
             os.unlink(tmp_info)
 
     def store_data(self, file: "File", contents: bytes) -> None:
-
-
-
-            # Create the file only if it's not already in cache
-            os.makedirs(os.path.dirname(dst), exist_ok=True)
-            with open(dst, mode="wb") as f:
-                f.write(contents)
-
-    def clear(self):
+        self.odb.add_bytes(file.get_hash(), contents)
+
+    def clear(self) -> None:
         """
         Completely clear the cache.
        """
         self.odb.clear()
 
+    def destroy(self) -> None:
+        # `clear` leaves the prefix directory structure intact.
+        remove(self.cache_dir)
+
     def get_total_size(self) -> int:
         total = 0
         for subdir in try_scandir(self.odb.path):
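The new `get_temp_cache`/`temporary_cache` helpers create a throwaway cache directory under a given tmp dir and, by default, remove it on exit via the new `destroy` method. A small usage sketch based on the hunk above; the tmp directory choice is a placeholder:

    import tempfile

    from datachain.cache import temporary_cache

    with temporary_cache(tempfile.gettempdir(), prefix="prefetch-") as cache:
        print(cache.cache_dir)  # unique mkdtemp() directory for this run
    # delete=True (the default) calls cache.destroy() on exit and removes the directory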
datachain/catalog/catalog.py
CHANGED
@@ -405,6 +405,7 @@ def get_download_bar(bar_format: str, total_size: int):
         unit_scale=True,
         unit_divisor=1000,
         total=total_size,
+        leave=False,
     )
 
 
@@ -429,6 +430,7 @@ def instantiate_node_groups(
             unit_scale=True,
             unit_divisor=1000,
             total=total_files,
+            leave=False,
         )
     )
 
@@ -534,6 +536,12 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
+def clone_catalog_with_cache(catalog: "Catalog", cache: "DataChainCache") -> "Catalog":
+    clone = catalog.copy()
+    clone.cache = cache
+    return clone
+
+
 class Catalog:
     def __init__(
         self,
@@ -1242,10 +1250,17 @@
         path: str,
         version_id: Optional[str] = None,
         client_config=None,
+        content_disposition: Optional[str] = None,
+        **kwargs,
     ) -> str:
         client_config = client_config or self.client_config
         client = Client.get_client(source, self.cache, **client_config)
-        return client.url(
+        return client.url(
+            path,
+            version_id=version_id,
+            content_disposition=content_disposition,
+            **kwargs,
+        )
 
     def export_dataset_table(
         self,
@@ -1437,6 +1452,7 @@
             unit_scale=True,
             unit_divisor=1000,
             total=ds_stats.num_objects,  # type: ignore [union-attr]
+            leave=False,
        )
 
         schema = DatasetRecord.parse_schema(remote_ds_version.schema)
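`clone_catalog_with_cache` pairs naturally with the temporary cache helper above: copy the catalog, point the copy at a scratch cache, and leave the original cache untouched. A hedged sketch; it assumes an existing `catalog` object and a writable tmp directory:

    import tempfile

    from datachain.cache import temporary_cache
    from datachain.catalog.catalog import clone_catalog_with_cache

    def with_scratch_cache(catalog, tmp_dir=tempfile.gettempdir()):
        with temporary_cache(tmp_dir, prefix="job-") as scratch:
            tmp_catalog = clone_catalog_with_cache(catalog, scratch)
            ...  # work with tmp_catalog; downloads land in the scratch cache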
datachain/client/azure.py
CHANGED
@@ -31,8 +31,12 @@ class AzureClient(Client):
         Generate a signed URL for the given path.
         """
         version_id = kwargs.pop("version_id", None)
+        content_disposition = kwargs.pop("content_disposition", None)
         result = self.fs.sign(
-            self.get_full_path(path, version_id),
+            self.get_full_path(path, version_id),
+            expiration=expires,
+            content_disposition=content_disposition,
+            **kwargs,
         )
         return result + (f"&versionid={version_id}" if version_id else "")
 
@@ -42,7 +46,7 @@
         prefix = prefix.lstrip(DELIMITER) + DELIMITER
         found = False
         try:
-            with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
+            with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
                 async with self.fs.service_client.get_container_client(
                     container=self.name
                 ) as container_client:
datachain/client/fsspec.py
CHANGED
@@ -249,7 +249,7 @@ class Client(ABC):
         await main_task
 
     async def _fetch_nested(self, start_prefix: str, result_queue: ResultQueue) -> None:
-        progress_bar = tqdm(desc=f"Listing {self.uri}", unit=" objects")
+        progress_bar = tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False)
         loop = get_loop()
 
         queue: asyncio.Queue[str] = asyncio.Queue()
datachain/client/gcs.py
CHANGED
@@ -39,11 +39,15 @@ class GCSClient(Client):
         (see https://cloud.google.com/storage/docs/access-public-data#api-link).
         """
         version_id = kwargs.pop("version_id", None)
+        content_disposition = kwargs.pop("content_disposition", None)
         if self.fs.storage_options.get("token") == "anon":
             query = f"?generation={version_id}" if version_id else ""
             return f"https://storage.googleapis.com/{self.name}/{path}{query}"
         return self.fs.sign(
-            self.get_full_path(path, version_id),
+            self.get_full_path(path, version_id),
+            expiration=expires,
+            response_disposition=content_disposition,
+            **kwargs,
         )
 
     @staticmethod
@@ -83,7 +87,7 @@
         self, page_queue: PageQueue, result_queue: ResultQueue
     ) -> bool:
         found = False
-        with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
+        with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
             while (page := await page_queue.get()) is not None:
                 if page:
                     found = True
datachain/client/s3.py
CHANGED
@@ -1,4 +1,5 @@
 import asyncio
+import os
 from typing import Any, Optional, cast
 from urllib.parse import parse_qs, urlsplit, urlunsplit
 
@@ -31,9 +32,11 @@ class ClientS3(Client):
         if "aws_token" in kwargs:
             kwargs.setdefault("token", kwargs.pop("aws_token"))
 
-        #
-
-
+        # remove this `if` when https://github.com/fsspec/s3fs/pull/929 lands
+        if not os.environ.get("AWS_REGION") and not os.environ.get("AWS_ENDPOINT_URL"):
+            # caching bucket regions to use the right one in signed urls, otherwise
+            # it tries to randomly guess and creates wrong signature
+            kwargs.setdefault("cache_regions", True)
 
         # We want to use newer v4 signature version since regions added after
         # 2014 are not going to support v2 which is the older one.
@@ -51,6 +54,21 @@
 
         return cast(S3FileSystem, super().create_fs(**kwargs))
 
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate a signed URL for the given path.
+        """
+        version_id = kwargs.pop("version_id", None)
+        content_disposition = kwargs.pop("content_disposition", None)
+        if content_disposition:
+            kwargs["ResponseContentDisposition"] = content_disposition
+
+        return self.fs.sign(
+            self.get_full_path(path, version_id),
+            expiration=expires,
+            **kwargs,
+        )
+
     async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
         async def get_pages(it, page_queue):
             try:
@@ -61,7 +79,7 @@
 
         async def process_pages(page_queue, result_queue):
             found = False
-            with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
+            with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
                 while (res := await page_queue.get()) is not None:
                     if res:
                         found = True
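With this release all three cloud clients accept a `content_disposition` argument when signing URLs: S3 maps it to `ResponseContentDisposition`, GCS passes it as `response_disposition`, and Azure forwards it to `sign`. A hedged sketch of the S3 variant shown above; the bucket, object path, and cache argument are illustrative, and `Client.get_client` is used exactly as in the catalog.py hunk earlier:

    from datachain.client.fsspec import Client

    def presigned_download_url(cache, source="s3://my-bucket"):
        # bucket and object key are placeholders for the sketch
        client = Client.get_client(source, cache)
        return client.url(
            "reports/summary.csv",
            expires=600,  # seconds until the signed URL expires
            content_disposition='attachment; filename="summary.csv"',
        )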
datachain/data_storage/db_engine.py
CHANGED
@@ -79,6 +79,15 @@ class DatabaseEngine(ABC, Serializable):
         conn: Optional[Any] = None,
     ) -> Iterator[tuple[Any, ...]]: ...
 
+    def get_table(self, name: str) -> "Table":
+        table = self.metadata.tables.get(name)
+        if table is None:
+            sa.Table(name, self.metadata, autoload_with=self.engine)
+            # ^^^ This table may not be correctly initialised on some dialects
+            # Grab it from metadata instead.
+            table = self.metadata.tables[name]
+        return table
+
     @abstractmethod
     def executemany(
         self, query, params, cursor: Optional[Any] = None
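The new `DatabaseEngine.get_table` centralizes the reflection trick previously buried in `DataTable.get_table`: reflect once with `autoload_with`, then always read the table back from `MetaData.tables`. A standalone SQLAlchemy sketch of the same pattern, using an in-memory SQLite database and an illustrative table:

    import sqlalchemy as sa

    engine = sa.create_engine("sqlite:///:memory:")
    metadata = sa.MetaData()

    with engine.begin() as conn:
        conn.execute(sa.text("CREATE TABLE items (id INTEGER PRIMARY KEY, name TEXT)"))

    def get_table(name: str) -> sa.Table:
        table = metadata.tables.get(name)
        if table is None:
            sa.Table(name, metadata, autoload_with=engine)
            # the object returned by reflection may not be fully initialised on
            # some dialects, so grab the registered instance from metadata instead
            table = metadata.tables[name]
        return table

    print(get_table("items").columns.keys())  # ['id', 'name']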
datachain/data_storage/schema.py
CHANGED
@@ -16,7 +16,6 @@ from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType, UInt64
 
 if TYPE_CHECKING:
-    from sqlalchemy import Engine
     from sqlalchemy.engine.interfaces import Dialect
     from sqlalchemy.sql.base import (
         ColumnCollection,
@@ -25,6 +24,8 @@ if TYPE_CHECKING:
     )
     from sqlalchemy.sql.elements import ColumnElement
 
+    from datachain.data_storage.db_engine import DatabaseEngine
+
 
 DEFAULT_DELIMITER = "__"
 
@@ -150,14 +151,12 @@ class DataTable:
     def __init__(
         self,
         name: str,
-        engine: "Engine",
-        metadata: Optional["sa.MetaData"] = None,
+        engine: "DatabaseEngine",
         column_types: Optional[dict[str, SQLType]] = None,
         object_name: str = "file",
     ):
         self.name: str = name
         self.engine = engine
-        self.metadata: sa.MetaData = metadata if metadata is not None else sa.MetaData()
         self.column_types: dict[str, SQLType] = column_types or {}
         self.object_name = object_name
 
@@ -211,12 +210,7 @@ class DataTable:
         return sa.Table(name, metadata, *columns)
 
     def get_table(self) -> "sa.Table":
-        table = self.metadata.tables.get(self.name)
-        if table is None:
-            sa.Table(self.name, self.metadata, autoload_with=self.engine)
-            # ^^^ This table may not be correctly initialised on some dialects
-            # Grab it from metadata instead.
-            table = self.metadata.tables[self.name]
+        table = self.engine.get_table(self.name)
 
         column_types = self.column_types | {c.name: c.type for c in self.sys_columns()}
         # adjusting types for custom columns to be instances of SQLType if possible
datachain/data_storage/sqlite.py
CHANGED
@@ -186,6 +186,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         self.db_file = db_file
         self.is_closed = False
 
+    def get_table(self, name: str) -> Table:
+        if self.is_closed:
+            # Reconnect in case of being closed previously.
+            self._reconnect()
+        return super().get_table(name)
+
     @retry_sqlite_locks
     def execute(
         self,
@@ -670,7 +676,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         ]
         table = self.create_udf_table(columns)
 
-        with tqdm(desc="Preparing", unit=" rows") as pbar:
+        with tqdm(desc="Preparing", unit=" rows", leave=False) as pbar:
             self.copy_table(table, query, progress_cb=pbar.update)
 
         return table
datachain/data_storage/warehouse.py
CHANGED
@@ -191,8 +191,7 @@ class AbstractWarehouse(ABC, Serializable):
         table_name = self.dataset_table_name(dataset.name, version)
         return self.schema.dataset_row_cls(
             table_name,
-            self.db.engine,
-            self.db.metadata,
+            self.db,
             dataset.get_schema(version),
             object_name=object_name,
         )
@@ -904,8 +903,11 @@ class AbstractWarehouse(ABC, Serializable):
         This should be implemented to ensure that the provided tables
         are cleaned up as soon as they are no longer needed.
         """
-
-
+        to_drop = set(names)
+        with tqdm(
+            desc="Cleanup", unit=" tables", total=len(to_drop), leave=False
+        ) as pbar:
+            for name in to_drop:
                 self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
                 pbar.update(1)
 
datachain/{lib/diff.py → diff/__init__.py}
CHANGED
@@ -1,6 +1,7 @@
 import random
 import string
 from collections.abc import Sequence
+from enum import Enum
 from typing import TYPE_CHECKING, Optional, Union
 
 import sqlalchemy as sa
@@ -16,7 +17,22 @@ if TYPE_CHECKING:
 C = Column
 
 
-def compare(  # noqa: PLR0912, PLR0915, C901
+def get_status_col_name() -> str:
+    """Returns new unique status col name"""
+    return "diff_" + "".join(
+        random.choice(string.ascii_letters)  # noqa: S311
+        for _ in range(10)
+    )
+
+
+class CompareStatus(str, Enum):
+    ADDED = "A"
+    DELETED = "D"
+    MODIFIED = "M"
+    SAME = "S"
+
+
+def _compare(  # noqa: PLR0912, PLR0915, C901
     left: "DataChain",
     right: "DataChain",
     on: Union[str, Sequence[str]],
@@ -72,13 +88,10 @@ def compare(  # noqa: PLR0912, PLR0915, C901
             "At least one of added, deleted, modified, same flags must be set"
         )
 
-    # we still need status column for internal implementation even if not
-    # needed in output
     need_status_col = bool(status_col)
-
-
-
-    )
+    # we still need status column for internal implementation even if not
+    # needed in the output
+    status_col = status_col or get_status_col_name()
 
     # calculate on and compare column names
     right_on = right_on or on
@@ -112,7 +125,7 @@ def compare(  # noqa: PLR0912, PLR0915, C901
                 for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
             ]
         )
-        diff_cond.append((added_cond,
+        diff_cond.append((added_cond, CompareStatus.ADDED))
     if modified and compare:
         modified_cond = sa.or_(
             *[
@@ -120,7 +133,7 @@ def compare(  # noqa: PLR0912, PLR0915, C901
                 for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
            ]
         )
-        diff_cond.append((modified_cond,
+        diff_cond.append((modified_cond, CompareStatus.MODIFIED))
     if same and compare:
         same_cond = sa.and_(
             *[
@@ -128,9 +141,11 @@ def compare(  # noqa: PLR0912, PLR0915, C901
                 for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
             ]
         )
-        diff_cond.append((same_cond,
+        diff_cond.append((same_cond, CompareStatus.SAME))
 
-    diff = sa.case(*diff_cond, else_=None if compare else
+    diff = sa.case(*diff_cond, else_=None if compare else CompareStatus.MODIFIED).label(
+        status_col
+    )
     diff.type = String()
 
     left_right_merge = left.merge(
@@ -145,7 +160,7 @@ def compare(  # noqa: PLR0912, PLR0915, C901
         )
     )
 
-    diff_col = sa.literal(
+    diff_col = sa.literal(CompareStatus.DELETED).label(status_col)
     diff_col.type = String()
 
     right_left_merge = right.merge(
@@ -195,3 +210,92 @@ def compare(  # noqa: PLR0912, PLR0915, C901
         res = res.select_except(C(status_col))
 
     return left._evolve(query=res, signal_schema=schema)
+
+
+def compare_and_split(
+    left: "DataChain",
+    right: "DataChain",
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    added: bool = True,
+    deleted: bool = True,
+    modified: bool = True,
+    same: bool = False,
+) -> dict[str, "DataChain"]:
+    """Comparing two chains and returning multiple chains, one for each of `added`,
+    `deleted`, `modified` and `same` status. Result is returned in form of
+    dictionary where each item represents one of the statuses and key values
+    are `A`, `D`, `M`, `S` corresponding. Note that status column is not in the
+    resulting chains.
+
+    Parameters:
+        left: Chain to calculate diff on.
+        right: Chain to calculate diff from.
+        on: Column or list of columns to match on. If both chains have the
+            same columns then this column is enough for the match. Otherwise,
+            `right_on` parameter has to specify the columns for the other chain.
+            This value is used to find corresponding row in other dataset. If not
+            found there, row is considered as added (or removed if vice versa), and
+            if found then row can be either modified or same.
+        right_on: Optional column or list of columns
+            for the `other` to match.
+        compare: Column or list of columns to compare on. If both chains have
+            the same columns then this column is enough for the compare. Otherwise,
+            `right_compare` parameter has to specify the columns for the other
+            chain. This value is used to see if row is modified or same. If
+            not set, all columns will be used for comparison
+        right_compare: Optional column or list of columns
+            for the `other` to compare to.
+        added (bool): Whether to return chain containing only added rows.
+        deleted (bool): Whether to return chain containing only deleted rows.
+        modified (bool): Whether to return chain containing only modified rows.
+        same (bool): Whether to return chain containing only same rows.
+
+    Example:
+        ```py
+        chains = compare(
+            persons,
+            new_persons,
+            on=["id"],
+            right_on=["other_id"],
+            compare=["name"],
+            added=True,
+            deleted=True,
+            modified=True,
+            same=True,
+        )
+        ```
+    """
+    status_col = get_status_col_name()
+
+    res = _compare(
+        left,
+        right,
+        on,
+        right_on=right_on,
+        compare=compare,
+        right_compare=right_compare,
+        added=added,
+        deleted=deleted,
+        modified=modified,
+        same=same,
+        status_col=status_col,
+    )
+
+    chains = {}
+
+    def filter_by_status(compare_status) -> "DataChain":
+        return res.filter(C(status_col) == compare_status).select_except(status_col)
+
+    if added:
+        chains[CompareStatus.ADDED.value] = filter_by_status(CompareStatus.ADDED)
+    if deleted:
+        chains[CompareStatus.DELETED.value] = filter_by_status(CompareStatus.DELETED)
+    if modified:
+        chains[CompareStatus.MODIFIED.value] = filter_by_status(CompareStatus.MODIFIED)
+    if same:
+        chains[CompareStatus.SAME.value] = filter_by_status(CompareStatus.SAME)
+
+    return chains
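`compare_and_split` returns a dict keyed by the short `CompareStatus` codes ("A", "D", "M", "S"), with the internal status column already stripped from each chain. A hedged usage sketch; the dataset names and columns are illustrative, and `DataChain.from_dataset` is assumed to be available as in earlier releases:

    from datachain import DataChain
    from datachain.diff import CompareStatus, compare_and_split

    persons = DataChain.from_dataset("persons")
    new_persons = DataChain.from_dataset("new_persons")

    chains = compare_and_split(
        persons,
        new_persons,
        on=["id"],
        compare=["name"],
        added=True,
        deleted=True,
        modified=True,
    )
    added_rows = chains[CompareStatus.ADDED.value]        # "A": rows with no match in new_persons
    modified_rows = chains[CompareStatus.MODIFIED.value]  # "M": matched rows whose name differs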
datachain/func/__init__.py
CHANGED
@@ -16,7 +16,7 @@ from .aggregate import (
     sum,
 )
 from .array import cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import case, greatest, least
+from .conditional import case, greatest, ifelse, least
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .random import rand
 from .string import byte_hamming_distance
@@ -40,6 +40,7 @@ __all__ = [
     "euclidean_distance",
     "first",
     "greatest",
+    "ifelse",
     "int_hash_64",
     "least",
     "length",
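The conditional module's own diff (+31 -9) falls outside this section, but the re-export above means `ifelse` is now importable from the public `datachain.func` namespace alongside `case`, `greatest`, and `least`. A minimal, hedged sketch; the `ifelse(condition, if_value, else_value)` shape is assumed and should be checked against `datachain/func/conditional.py` in this release:

    from datachain import C, DataChain, func

    # assumed signature: func.ifelse(condition, if_value, else_value)
    labelled = DataChain.from_values(num=[1, 5, 10]).mutate(
        is_big=func.ifelse(C("num") > 3, 1, 0)
    )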