datachain 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: the registry flags this version of datachain as possibly problematic; see the registry page for details.
- datachain/cache.py +14 -55
- datachain/catalog/catalog.py +21 -55
- datachain/cli.py +7 -26
- datachain/client/fsspec.py +29 -63
- datachain/client/local.py +2 -3
- datachain/data_storage/metastore.py +7 -66
- datachain/data_storage/sqlite.py +5 -2
- datachain/data_storage/warehouse.py +0 -22
- datachain/lib/arrow.py +2 -1
- datachain/lib/dc.py +5 -2
- datachain/lib/file.py +41 -23
- datachain/lib/listing.py +3 -0
- datachain/lib/tar.py +2 -1
- datachain/listing.py +4 -4
- datachain/node.py +23 -9
- datachain/nodes_fetcher.py +12 -5
- datachain/nodes_thread_pool.py +1 -1
- datachain/progress.py +2 -12
- datachain/query/__init__.py +0 -2
- datachain/query/dataset.py +26 -144
- datachain/query/dispatch.py +2 -15
- datachain/query/schema.py +36 -24
- datachain/query/udf.py +2 -148
- datachain/sql/types.py +4 -2
- datachain/telemetry.py +37 -0
- datachain/utils.py +11 -40
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/METADATA +5 -3
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/RECORD +32 -32
- datachain/query/builtins.py +0 -96
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/LICENSE +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/WHEEL +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -1,56 +1,15 @@
-import hashlib
-import json
 import os
-from datetime import datetime
-from functools import partial
 from typing import TYPE_CHECKING, Optional

-import attrs
 from dvc_data.hashfile.db.local import LocalHashFileDB
 from dvc_objects.fs.local import LocalFileSystem
 from fsspec.callbacks import Callback, TqdmCallback

-from datachain.utils import TIME_ZERO
-
 from .progress import Tqdm

 if TYPE_CHECKING:
     from datachain.client import Client
-    from datachain.…
-
-sha256 = partial(hashlib.sha256, usedforsecurity=False)
-
-
-@attrs.frozen
-class UniqueId:
-    storage: "StorageURI"
-    path: str
-    size: int
-    etag: str
-    version: str = ""
-    is_latest: bool = True
-    location: Optional[str] = None
-    last_modified: datetime = TIME_ZERO
-
-    def get_parsed_location(self) -> Optional[dict]:
-        if not self.location:
-            return None
-
-        loc_stack = (
-            json.loads(self.location)
-            if isinstance(self.location, str)
-            else self.location
-        )
-        if len(loc_stack) > 1:
-            raise NotImplementedError("Nested v-objects are not supported yet.")
-
-        return loc_stack[0]
-
-    def get_hash(self) -> str:
-        fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
-        if self.location:
-            fingerprint += f"/{self.location}"
-        return sha256(fingerprint.encode()).hexdigest()
+    from datachain.lib.file import File


 def try_scandir(path):
@@ -77,30 +36,30 @@ class DataChainCache:
     def tmp_dir(self):
         return self.odb.tmp_dir

-    def get_path(self, …
-        if self.contains(…
-            return self.path_from_checksum(…
+    def get_path(self, file: "File") -> Optional[str]:
+        if self.contains(file):
+            return self.path_from_checksum(file.get_hash())
         return None

-    def contains(self, …
-        return self.odb.exists(…
+    def contains(self, file: "File") -> bool:
+        return self.odb.exists(file.get_hash())

     def path_from_checksum(self, checksum: str) -> str:
         assert checksum
         return self.odb.oid_to_path(checksum)

-    def remove(self, …
-        self.odb.delete(…
+    def remove(self, file: "File") -> None:
+        self.odb.delete(file.get_hash())

     async def download(
-        self, …
+        self, file: "File", client: "Client", callback: Optional[Callback] = None
     ) -> None:
-        from_path = f"{…
+        from_path = f"{file.source}/{file.path}"
         from dvc_objects.fs.utils import tmp_fname

         odb_fs = self.odb.fs
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
-        size = …
+        size = file.size
         if size < 0:
             size = await client.get_size(from_path)
         cb = callback or TqdmCallback(
@@ -115,13 +74,13 @@ class DataChainCache:
         cb.close()

         try:
-            oid = …
+            oid = file.get_hash()
             self.odb.add(tmp_info, self.odb.fs, oid)
         finally:
             os.unlink(tmp_info)

-    def store_data(self, …
-        checksum = …
+    def store_data(self, file: "File", contents: bytes) -> None:
+        checksum = file.get_hash()
         dst = self.path_from_checksum(checksum)
         if not os.path.exists(dst):
             # Create the file only if it's not already in cache
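The net effect of this change is that DataChainCache is keyed by datachain.lib.file.File objects (hashed via File.get_hash()) instead of the removed UniqueId value class. Below is a minimal sketch of how the new methods fit together; it assumes datachain 0.3.18 is installed, and the bucket URI, the File field values, and the catalog.cache / catalog.get_client() access pattern are illustrative assumptions, not code taken from the release.

# Sketch only: the File-keyed cache API introduced in this release.
# The File field values and the catalog access pattern are assumptions.
from datachain.catalog import get_catalog
from datachain.lib.file import File

catalog = get_catalog()
cache = catalog.cache                        # DataChainCache instance

file = File(source="s3://my-bucket", path="images/cat.jpg")

if not cache.contains(file):                 # keyed by file.get_hash()
    client = catalog.get_client(file.source)
    client.download(file)                    # fills the cache via cache.download(file, client)

print(cache.get_path(file))                  # local cached path, or None if missing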
datachain/catalog/catalog.py
CHANGED
@@ -34,7 +34,7 @@ import yaml
 from sqlalchemy import Column
 from tqdm import tqdm

-from datachain.cache import DataChainCache…
+from datachain.cache import DataChainCache
 from datachain.client import Client
 from datachain.config import get_remote_config, read_config
 from datachain.dataset import (
@@ -68,8 +68,6 @@ from datachain.utils import (
     DataChainDir,
     batched,
     datachain_paths_join,
-    import_object,
-    parse_params_string,
 )

 from .datasource import DataSource
@@ -621,13 +619,13 @@ class Catalog:
         code_ast.body[-1:] = new_expressions
         return code_ast

-    def get_client(self, uri: …
+    def get_client(self, uri: str, **config: Any) -> Client:
         """
         Return the client corresponding to the given source `uri`.
         """
         config = config or self.client_config
         cls = Client.get_implementation(uri)
-        return cls.from_source(uri, self.cache, **config)
+        return cls.from_source(StorageURI(uri), self.cache, **config)

     def enlist_source(
         self,
@@ -843,7 +841,7 @@ class Catalog:
         from datachain.query import DatasetQuery

         def _row_to_node(d: dict[str, Any]) -> Node:
-            del d["…
+            del d["file__source"]
             return Node.from_dict(d)

         enlisted_sources: list[tuple[bool, bool, Any]] = []
@@ -1148,30 +1146,28 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")

-        from datachain.…
+        from datachain.lib.dc import DataChain
+        from datachain.query.session import Session
+
+        session = Session.get(catalog=self, client_config=client_config)

-…
+        chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
-… (2 lines not captured)
-                    catalog=self,
-                    client_config=client_config,
+                dc = DataChain.from_dataset(
+                    source[len(DATASET_PREFIX) :], session=session
                 )
             else:
-… (2 lines not captured)
-                    catalog=self,
-                    client_config=client_config,
-                    recursive=recursive,
+                dc = DataChain.from_storage(
+                    source, session=session, recursive=recursive
                 )

-…
+            chains.append(dc)

         # create union of all dataset queries created from sources
-…
+        dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
         try:
-…
+            dc.save(name)
         except Exception as e:  # noqa: BLE001
             try:
                 ds = self.get_dataset(name)
@@ -1435,7 +1431,7 @@ class Catalog:

     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
-    ) -> Optional[…
+    ) -> Optional[RowDict]:
         """
         Function that returns file signals from dataset row.
         Note that signal names are without prefix, so if there was 'laion__file__source'
@@ -1452,7 +1448,7 @@ class Catalog:

         version = self.get_dataset(dataset_name).get_version(dataset_version)

-        file_signals_values = …
+        file_signals_values = RowDict()

         schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
@@ -1480,6 +1476,8 @@ class Catalog:
         use_cache: bool = True,
         **config: Any,
     ):
+        from datachain.lib.file import File
+
         file_signals = self.get_file_signals(dataset_name, dataset_version, row)
         if not file_signals:
             raise RuntimeError("Cannot open object without file signals")
@@ -1487,22 +1485,10 @@ class Catalog:
         config = config or self.client_config
         client = self.get_client(file_signals["source"], **config)
         return client.open_object(
-…
+            File._from_row(file_signals),
             use_cache=use_cache,
         )

-    def _get_row_uid(self, row: RowDict) -> UniqueId:
-        return UniqueId(
-            row["source"],
-            row["path"],
-            row["size"],
-            row["etag"],
-            row["version"],
-            row["is_latest"],
-            row["location"],
-            row["last_modified"],
-        )
-
     def ls(
         self,
         sources: list[str],
@@ -1731,26 +1717,6 @@ class Catalog:
             output, sources, client_config=client_config, recursive=recursive
         )

-    def apply_udf(
-        self,
-        udf_location: str,
-        source: str,
-        target_name: str,
-        parallel: Optional[int] = None,
-        params: Optional[str] = None,
-    ):
-        from datachain.query import DatasetQuery
-
-        if source.startswith(DATASET_PREFIX):
-            ds = DatasetQuery(name=source[len(DATASET_PREFIX) :], catalog=self)
-        else:
-            ds = DatasetQuery(path=source, catalog=self)
-        udf = import_object(udf_location)
-        if params:
-            args, kwargs = parse_params_string(params)
-            udf = udf(*args, **kwargs)
-        ds.add_signals(udf, parallel=parallel).save(target_name)
-
     def query(
         self,
         query_script: str,
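Catalog.create_dataset_from_sources now builds one DataChain per source and unions them through the public API instead of constructing DatasetQuery objects directly. A rough equivalent using the public API is sketched below; the bucket URIs and dataset name are placeholders, and running it requires datachain 0.3.18 plus access to the listed storage.

# Sketch of the new "union of sources" flow, mirroring what the catalog
# does internally in this release. URIs and the dataset name are placeholders.
from functools import reduce

from datachain.lib.dc import DataChain

sources = ["s3://bucket-a/images/", "s3://bucket-b/images/"]

chains = [DataChain.from_storage(src, recursive=True) for src in sources]

# Union all per-source chains and persist the result as a named dataset.
combined = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
combined.save("all-images")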
datachain/cli.py
CHANGED
@@ -15,6 +15,7 @@ import shtab
 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.lib.dc import DataChain
+from datachain.telemetry import telemetry
 from datachain.utils import DataChainDir

 if TYPE_CHECKING:
@@ -494,27 +495,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Query parameters",
     )

-    apply_udf_parser = subp.add_parser(
-        "apply-udf", parents=[parent_parser], description="Apply UDF"
-    )
-    apply_udf_parser.add_argument("udf", type=str, help="UDF location")
-    apply_udf_parser.add_argument("source", type=str, help="Source storage or dataset")
-    apply_udf_parser.add_argument("target", type=str, help="Target dataset name")
-    apply_udf_parser.add_argument(
-        "--parallel",
-        nargs="?",
-        type=int,
-        const=-1,
-        default=None,
-        metavar="N",
-        help=(
-            "Use multiprocessing to run the UDF with N worker processes. "
-            "N defaults to the CPU count."
-        ),
-    )
-    apply_udf_parser.add_argument(
-        "--udf-params", type=str, default=None, help="UDF class parameters"
-    )
     subp.add_parser(
         "clear-cache", parents=[parent_parser], description="Clear the local file cache"
     )
@@ -893,6 +873,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09…
         # This also sets this environment variable for any subprocesses
         os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"

+    error = None
     try:
         catalog = get_catalog(client_config=client_config)
         if args.command == "cp":
@@ -1016,10 +997,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09…
                 parallel=args.parallel,
                 params=args.param,
             )
-        elif args.command == "apply-udf":
-            catalog.apply_udf(
-                args.udf, args.source, args.target, args.parallel, args.udf_params
-            )
         elif args.command == "clear-cache":
             clear_cache(catalog)
         elif args.command == "gc":
@@ -1028,14 +1005,16 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09…
             print(f"invalid command: {args.command}", file=sys.stderr)
             return 1
         return 0
-    except BrokenPipeError:
+    except BrokenPipeError as exc:
         # Python flushes standard streams on exit; redirect remaining output
         # to devnull to avoid another BrokenPipeError at shutdown
         # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
+        error = str(exc)
         devnull = os.open(os.devnull, os.O_WRONLY)
         os.dup2(devnull, sys.stdout.fileno())
         return 141  # 128 + 13 (SIGPIPE)
     except (KeyboardInterrupt, Exception) as exc:
+        error = str(exc)
         if isinstance(exc, KeyboardInterrupt):
             msg = "Operation cancelled by the user"
         else:
@@ -1053,3 +1032,5 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09…
             pdb.post_mortem()
         return 1
+    finally:
+        telemetry.send_cli_call(args.command, error=error)
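The release also adds a datachain/telemetry.py module, and main() now records the error message and reports every invocation through telemetry.send_cli_call() in a finally block. A condensed sketch of that control flow is below; run_command and the sample command handling are hypothetical stand-ins for the real dispatch, not code from the release.

# Condensed sketch of the CLI's telemetry wiring in this release.
# `run_command` is a hypothetical stand-in for the real command dispatch.
import sys

from datachain.telemetry import telemetry


def run_command(command: str) -> int:
    # Hypothetical dispatch: succeed for one known command, fail otherwise.
    if command == "clear-cache":
        return 0
    raise ValueError(f"invalid command: {command}")


def main(command: str) -> int:
    error = None
    try:
        return run_command(command)
    except Exception as exc:  # noqa: BLE001
        error = str(exc)
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    finally:
        # Reported once per invocation, on success and on failure alike.
        telemetry.send_cli_call(command, error=error)


if __name__ == "__main__":
    sys.exit(main(sys.argv[1] if len(sys.argv) > 1 else "clear-cache"))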
datachain/client/fsspec.py
CHANGED
@@ -3,7 +3,6 @@ import functools
 import logging
 import multiprocessing
 import os
-import posixpath
 import re
 import sys
 from abc import ABC, abstractmethod
@@ -26,8 +25,8 @@ from fsspec.asyn import get_loop, sync
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from tqdm import tqdm

-from datachain.cache import DataChainCache…
-from datachain.client.fileslice import …
+from datachain.cache import DataChainCache
+from datachain.client.fileslice import FileWrapper
 from datachain.error import ClientError as DataChainClientError
 from datachain.lib.file import File
 from datachain.nodes_fetcher import NodesFetcher
@@ -187,8 +186,8 @@ class Client(ABC):
     def url(self, path: str, expires: int = 3600, **kwargs) -> str:
         return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)

-    async def get_current_etag(self, …
-        info = await self.fs._info(self.get_full_path(…
+    async def get_current_etag(self, file: "File") -> str:
+        info = await self.fs._info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
@@ -317,7 +316,7 @@ class Client(ABC):

     def instantiate_object(
         self,
-…
+        file: "File",
         dst: str,
         progress_bar: tqdm,
         force: bool = False,
@@ -328,10 +327,10 @@ class Client(ABC):
         else:
             progress_bar.close()
             raise FileExistsError(f"Path {dst} already exists")
-        self.do_instantiate_object(…
+        self.do_instantiate_object(file, dst)

-    def do_instantiate_object(self, …
-        src = self.cache.get_path(…
+    def do_instantiate_object(self, file: "File", dst: str) -> None:
+        src = self.cache.get_path(file)
         assert src is not None

         try:
@@ -341,66 +340,33 @@ class Client(ABC):
             copy2(src, dst)

     def open_object(
-        self, …
+        self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
     ) -> BinaryIO:
         """Open a file, including files in tar archives."""
-…
-        if use_cache and (cache_path := self.cache.get_path(uid)):
+        if use_cache and (cache_path := self.cache.get_path(file)):
             return open(cache_path, mode="rb")  # noqa: SIM115
-… (8 lines not captured)
-            offset = location["offset"]
-            size = location["size"]
-            parent = location["parent"]
-
-            parent_uid = UniqueId(
-                parent["source"],
-                parent["path"],
-                parent["size"],
-                parent["etag"],
-                location=parent["location"],
-            )
-            f = self.open_object(parent_uid, use_cache=use_cache)
-            return FileSlice(f, offset, size, posixpath.basename(uid.path))
-
-    def download(self, uid: UniqueId, *, callback: Callback = DEFAULT_CALLBACK) -> None:
-        sync(get_loop(), functools.partial(self._download, uid, callback=callback))
-
-    async def _download(self, uid: UniqueId, *, callback: "Callback" = None) -> None:
-        if self.cache.contains(uid):
+        assert not file.location
+        return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
+
+    def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
+        sync(get_loop(), functools.partial(self._download, file, callback=callback))
+
+    async def _download(self, file: File, *, callback: "Callback" = None) -> None:
+        if self.cache.contains(file):
             # Already in cache, so there's nothing to do.
             return
-        await self._put_in_cache(…
+        await self._put_in_cache(file, callback=callback)

-    def put_in_cache(self, …
-        sync(get_loop(), functools.partial(self._put_in_cache, …
+    def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+        sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))

-    async def _put_in_cache(…
-… (4 lines not captured)
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(
-                None, functools.partial(self._download_from_tar, uid, callback=callback)
-            )
-            return
-        if uid.etag:
-            etag = await self.get_current_etag(uid)
-            if uid.etag != etag:
+    async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+        assert not file.location
+        if file.etag:
+            etag = await self.get_current_etag(file)
+            if file.etag != etag:
                 raise FileNotFoundError(
-                    f"Invalid etag for {…
-                    f"expected {…
+                    f"Invalid etag for {file.source}/{file.path}: "
+                    f"expected {file.etag}, got {etag}"
                 )
-        await self.cache.download(…
-…
-    def _download_from_tar(self, uid, *, callback: "Callback" = None):
-        with self._open_tar(uid, use_cache=False) as f:
-            contents = f.read()
-            self.cache.store_data(uid, contents)
+        await self.cache.download(file, self, callback=callback)
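With these changes, Client.open_object(), download() and the cache-fill helpers are keyed by File as well, and the tar-member (location) handling has moved out of the client (open_object now asserts that file.location is empty). A small sketch of reading an object through a client follows; constructing File by hand, the bucket URI, and the field values are assumptions made for illustration only.

# Sketch: reading an object through the fsspec-based client using the new
# File-based signatures. The URI and File fields are placeholders.
from datachain.catalog import get_catalog
from datachain.lib.file import File

catalog = get_catalog()
client = catalog.get_client("s3://my-bucket")

file = File(source="s3://my-bucket", path="docs/report.pdf")

client.download(file)                        # no-op if already in the cache

f = client.open_object(file, use_cache=True)
try:
    header = f.read(4)                       # reads from the cached copy
finally:
    f.close()
print(header)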
datachain/client/local.py
CHANGED
@@ -7,7 +7,6 @@ from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

-from datachain.cache import UniqueId
 from datachain.lib.file import File
 from datachain.storage import StorageURI

@@ -114,8 +113,8 @@ class FileClient(Client):
             use_symlinks=use_symlinks,
         )

-    async def get_current_etag(self, …
-        info = self.fs.info(self.get_full_path(…
+    async def get_current_etag(self, file: "File") -> str:
+        info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
datachain/data_storage/metastore.py
CHANGED
@@ -297,39 +297,6 @@ class AbstractMetastore(ABC, Serializable):
     #
     # Dataset dependencies
     #
-
-    def add_dependency(
-        self,
-        dependency: DatasetDependency,
-        source_dataset_name: str,
-        source_dataset_version: int,
-    ) -> None:
-        """Add dependency to dataset or storage."""
-        if dependency.is_dataset:
-            self.add_dataset_dependency(
-                source_dataset_name,
-                source_dataset_version,
-                dependency.dataset_name,
-                int(dependency.version),
-            )
-        else:
-            self.add_storage_dependency(
-                source_dataset_name,
-                source_dataset_version,
-                StorageURI(dependency.name),
-                dependency.version,
-            )
-
-    @abstractmethod
-    def add_storage_dependency(
-        self,
-        source_dataset_name: str,
-        source_dataset_version: int,
-        storage_uri: StorageURI,
-        storage_timestamp_str: Optional[str] = None,
-    ) -> None:
-        """Adds storage dependency to dataset."""
-
     @abstractmethod
     def add_dataset_dependency(
         self,
@@ -1268,32 +1235,6 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Dataset dependencies
     #
-
-    def _insert_dataset_dependency(self, data: dict[str, Any]) -> None:
-        """Method for inserting dependencies."""
-        self.db.execute(self._datasets_dependencies_insert().values(**data))
-
-    def add_storage_dependency(
-        self,
-        source_dataset_name: str,
-        source_dataset_version: int,
-        storage_uri: StorageURI,
-        storage_timestamp_str: Optional[str] = None,
-    ) -> None:
-        source_dataset = self.get_dataset(source_dataset_name)
-        storage = self.get_storage(storage_uri)
-
-        self._insert_dataset_dependency(
-            {
-                "source_dataset_id": source_dataset.id,
-                "source_dataset_version_id": (
-                    source_dataset.get_version(source_dataset_version).id
-                ),
-                "bucket_id": storage.id,
-                "bucket_version": storage_timestamp_str,
-            }
-        )
-
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
@@ -1305,15 +1246,15 @@ class AbstractDBMetastore(AbstractMetastore):
         source_dataset = self.get_dataset(source_dataset_name)
         dataset = self.get_dataset(dataset_name)

-        self.…
-… (3 lines not captured)
+        self.db.execute(
+            self._datasets_dependencies_insert().values(
+                source_dataset_id=source_dataset.id,
+                source_dataset_version_id=(
                     source_dataset.get_version(source_dataset_version).id
                 ),
-… (3 lines not captured)
+                dataset_id=dataset.id,
+                dataset_version_id=dataset.get_version(dataset_version).id,
+            )
         )

     def update_dataset_dependency_source(
datachain/data_storage/sqlite.py
CHANGED
@@ -651,11 +651,14 @@ class SQLiteWarehouse(AbstractWarehouse):
         self, dataset: DatasetRecord, version: int
     ) -> list[StorageURI]:
         dr = self.dataset_rows(dataset, version)
-        query = dr.select(dr.c.…
+        query = dr.select(dr.c.file__source).distinct()
         cur = self.db.cursor()
         cur.row_factory = sqlite3.Row  # type: ignore[assignment]

-        return […
+        return [
+            StorageURI(row["file__source"])
+            for row in self.db.execute(query, cursor=cur)
+        ]

     def merge_dataset_rows(
         self,
datachain/data_storage/warehouse.py
CHANGED
@@ -942,28 +942,6 @@ class AbstractWarehouse(ABC, Serializable):
             self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
             pbar.update(1)

-    def changed_query(
-        self,
-        source_query: sa.sql.selectable.Select,
-        target_query: sa.sql.selectable.Select,
-    ) -> sa.sql.selectable.Select:
-        sq = source_query.alias("source_query")
-        tq = target_query.alias("target_query")
-
-        source_target_join = sa.join(
-            sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
-        )
-
-        return (
-            select(*sq.c)
-            .select_from(source_target_join)
-            .where(
-                (sq.c.last_modified > tq.c.last_modified)
-                & (sq.c.is_latest == true())
-                & (tq.c.is_latest == true())
-            )
-        )
-

 def _random_string(length: int) -> str:
     return "".join(
datachain/lib/arrow.py
CHANGED
@@ -49,7 +49,8 @@ class ArrowGenerator(Generator):

     def process(self, file: File):
         if file._caching_enabled:
-…
+            file.ensure_cached()
+            path = file.get_local_path()
             ds = dataset(path, schema=self.input_schema, **self.kwargs)
         elif self.nrows:
             path = _nrows_file(file, self.nrows)