datachain 0.3.17__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of datachain might be problematic.
- datachain/__init__.py +5 -2
- datachain/cache.py +14 -55
- datachain/catalog/catalog.py +17 -97
- datachain/cli.py +7 -2
- datachain/client/fsspec.py +29 -63
- datachain/client/local.py +2 -3
- datachain/dataset.py +7 -2
- datachain/error.py +6 -4
- datachain/lib/arrow.py +10 -4
- datachain/lib/dc.py +6 -2
- datachain/lib/file.py +64 -28
- datachain/lib/listing.py +2 -0
- datachain/listing.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_fetcher.py +12 -5
- datachain/nodes_thread_pool.py +1 -1
- datachain/progress.py +2 -12
- datachain/query/dataset.py +6 -40
- datachain/query/dispatch.py +2 -15
- datachain/query/schema.py +25 -24
- datachain/query/udf.py +0 -106
- datachain/sql/types.py +4 -2
- datachain/telemetry.py +37 -0
- datachain/utils.py +11 -0
- {datachain-0.3.17.dist-info → datachain-0.3.19.dist-info}/METADATA +5 -4
- {datachain-0.3.17.dist-info → datachain-0.3.19.dist-info}/RECORD +30 -29
- {datachain-0.3.17.dist-info → datachain-0.3.19.dist-info}/LICENSE +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.19.dist-info}/WHEEL +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.19.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.19.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -1,21 +1,23 @@
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
 from datachain.lib.dc import C, Column, DataChain, Sys
 from datachain.lib.file import (
+    ArrowRow,
     File,
     FileError,
     ImageFile,
-    IndexedFile,
     TarVFile,
     TextFile,
 )
 from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Aggregator, Generator, Mapper
 from datachain.lib.utils import AbstractUDF, DataChainError
+from datachain.query import metrics, param
 from datachain.query.session import Session

 __all__ = [
     "AbstractUDF",
     "Aggregator",
+    "ArrowRow",
     "C",
     "Column",
     "DataChain",
@@ -26,7 +28,6 @@ __all__ = [
     "FileError",
     "Generator",
     "ImageFile",
-    "IndexedFile",
     "Mapper",
     "ModelStore",
     "Session",
@@ -34,4 +35,6 @@ __all__ = [
     "TarVFile",
     "TextFile",
     "is_chain_type",
+    "metrics",
+    "param",
 ]
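
The public surface shifts accordingly: ArrowRow, metrics and param are now importable from the package root, while IndexedFile no longer is. A minimal sketch of the new import surface (the try/except is only illustrative):

from datachain import ArrowRow, metrics, param  # new in 0.3.19

try:
    from datachain import IndexedFile  # removed from the package root in 0.3.19
except ImportError:
    IndexedFile = None
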
datachain/cache.py
CHANGED
@@ -1,56 +1,15 @@
-import hashlib
-import json
 import os
-from datetime import datetime
-from functools import partial
 from typing import TYPE_CHECKING, Optional

-import attrs
 from dvc_data.hashfile.db.local import LocalHashFileDB
 from dvc_objects.fs.local import LocalFileSystem
 from fsspec.callbacks import Callback, TqdmCallback

-from datachain.utils import TIME_ZERO
-
 from .progress import Tqdm

 if TYPE_CHECKING:
     from datachain.client import Client
-    from datachain.
-
-sha256 = partial(hashlib.sha256, usedforsecurity=False)
-
-
-@attrs.frozen
-class UniqueId:
-    storage: "StorageURI"
-    path: str
-    size: int
-    etag: str
-    version: str = ""
-    is_latest: bool = True
-    location: Optional[str] = None
-    last_modified: datetime = TIME_ZERO
-
-    def get_parsed_location(self) -> Optional[dict]:
-        if not self.location:
-            return None
-
-        loc_stack = (
-            json.loads(self.location)
-            if isinstance(self.location, str)
-            else self.location
-        )
-        if len(loc_stack) > 1:
-            raise NotImplementedError("Nested v-objects are not supported yet.")
-
-        return loc_stack[0]
-
-    def get_hash(self) -> str:
-        fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
-        if self.location:
-            fingerprint += f"/{self.location}"
-        return sha256(fingerprint.encode()).hexdigest()
+    from datachain.lib.file import File


 def try_scandir(path):
@@ -77,30 +36,30 @@ class DataChainCache:
     def tmp_dir(self):
         return self.odb.tmp_dir

-    def get_path(self,
-        if self.contains(
-            return self.path_from_checksum(
+    def get_path(self, file: "File") -> Optional[str]:
+        if self.contains(file):
+            return self.path_from_checksum(file.get_hash())
         return None

-    def contains(self,
-        return self.odb.exists(
+    def contains(self, file: "File") -> bool:
+        return self.odb.exists(file.get_hash())

     def path_from_checksum(self, checksum: str) -> str:
         assert checksum
         return self.odb.oid_to_path(checksum)

-    def remove(self,
-        self.odb.delete(
+    def remove(self, file: "File") -> None:
+        self.odb.delete(file.get_hash())

     async def download(
-        self,
+        self, file: "File", client: "Client", callback: Optional[Callback] = None
     ) -> None:
-        from_path = f"{
+        from_path = f"{file.source}/{file.path}"
         from dvc_objects.fs.utils import tmp_fname

         odb_fs = self.odb.fs
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
-        size =
+        size = file.size
         if size < 0:
             size = await client.get_size(from_path)
         cb = callback or TqdmCallback(
@@ -115,13 +74,13 @@ class DataChainCache:
         cb.close()

         try:
-            oid =
+            oid = file.get_hash()
            self.odb.add(tmp_info, self.odb.fs, oid)
         finally:
             os.unlink(tmp_info)

-    def store_data(self,
-        checksum =
+    def store_data(self, file: "File", contents: bytes) -> None:
+        checksum = file.get_hash()
         dst = self.path_from_checksum(checksum)
         if not os.path.exists(dst):
             # Create the file only if it's not already in cache
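
The net effect is that UniqueId is gone and the cache is keyed directly by datachain.lib.file.File through File.get_hash(). A minimal usage sketch; the get_catalog().cache accessor and the File field values are assumptions for illustration:

from datachain.catalog import get_catalog
from datachain.lib.file import File

cache = get_catalog().cache               # DataChainCache instance (assumed accessor)
file = File(source="s3://bucket", path="images/cat.jpg", size=1024, etag="abc123")

if cache.contains(file):                  # keyed by file.get_hash()
    local_path = cache.get_path(file)     # returns None when the object is not cached
else:
    cache.store_data(file, b"raw bytes")  # or: await cache.download(file, client)
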
datachain/catalog/catalog.py
CHANGED
@@ -1,4 +1,3 @@
-import ast
 import glob
 import io
 import json
@@ -34,7 +33,7 @@ import yaml
 from sqlalchemy import Column
 from tqdm import tqdm

-from datachain.cache import DataChainCache
+from datachain.cache import DataChainCache
 from datachain.client import Client
 from datachain.config import get_remote_config, read_config
 from datachain.dataset import (
@@ -53,9 +52,9 @@ from datachain.error import (
     DataChainError,
     DatasetInvalidVersionError,
     DatasetNotFoundError,
+    DatasetVersionNotFoundError,
     PendingIndexingError,
     QueryScriptCancelError,
-    QueryScriptCompileError,
     QueryScriptRunError,
 )
 from datachain.listing import Listing
@@ -588,44 +587,13 @@ class Catalog:
     def generate_query_dataset_name(cls) -> str:
         return f"{QUERY_DATASET_PREFIX}_{uuid4().hex}"

-    def
-        if code_ast.body:
-            last_expr = code_ast.body[-1]
-            if isinstance(last_expr, ast.Expr):
-                new_expressions = [
-                    ast.Import(
-                        names=[ast.alias(name="datachain.query.dataset", asname=None)]
-                    ),
-                    ast.Expr(
-                        value=ast.Call(
-                            func=ast.Attribute(
-                                value=ast.Attribute(
-                                    value=ast.Attribute(
-                                        value=ast.Name(id="datachain", ctx=ast.Load()),
-                                        attr="query",
-                                        ctx=ast.Load(),
-                                    ),
-                                    attr="dataset",
-                                    ctx=ast.Load(),
-                                ),
-                                attr="query_wrapper",
-                                ctx=ast.Load(),
-                            ),
-                            args=[last_expr],
-                            keywords=[],
-                        )
-                    ),
-                ]
-                code_ast.body[-1:] = new_expressions
-        return code_ast
-
-    def get_client(self, uri: StorageURI, **config: Any) -> Client:
+    def get_client(self, uri: str, **config: Any) -> Client:
         """
         Return the client corresponding to the given source `uri`.
         """
         config = config or self.client_config
         cls = Client.get_implementation(uri)
-        return cls.from_source(uri, self.cache, **config)
+        return cls.from_source(StorageURI(uri), self.cache, **config)

     def enlist_source(
         self,
@@ -1218,7 +1186,9 @@ class Catalog:

         dataset_version = dataset.get_version(version)
         if not dataset_version:
-            raise
+            raise DatasetVersionNotFoundError(
+                f"Dataset {dataset.name} does not have version {version}"
+            )

         if not dataset_version.is_final_status():
             raise ValueError("Cannot register dataset version in non final status")
@@ -1431,7 +1401,7 @@ class Catalog:

     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
-    ) -> Optional[
+    ) -> Optional[RowDict]:
         """
         Function that returns file signals from dataset row.
         Note that signal names are without prefix, so if there was 'laion__file__source'
@@ -1448,7 +1418,7 @@ class Catalog:

         version = self.get_dataset(dataset_name).get_version(dataset_version)

-        file_signals_values =
+        file_signals_values = RowDict()

         schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
@@ -1476,6 +1446,8 @@ class Catalog:
         use_cache: bool = True,
         **config: Any,
     ):
+        from datachain.lib.file import File
+
         file_signals = self.get_file_signals(dataset_name, dataset_version, row)
         if not file_signals:
             raise RuntimeError("Cannot open object without file signals")
@@ -1483,22 +1455,10 @@ class Catalog:
         config = config or self.client_config
         client = self.get_client(file_signals["source"], **config)
         return client.open_object(
-
+            File._from_row(file_signals),
             use_cache=use_cache,
         )

-    def _get_row_uid(self, row: RowDict) -> UniqueId:
-        return UniqueId(
-            row["source"],
-            row["path"],
-            row["size"],
-            row["etag"],
-            row["version"],
-            row["is_latest"],
-            row["location"],
-            row["last_modified"],
-        )
-
     def ls(
         self,
         sources: list[str],
@@ -1591,7 +1551,7 @@ class Catalog:

         try:
             remote_dataset_version = remote_dataset.get_version(version)
-        except (
+        except (DatasetVersionNotFoundError, StopIteration) as exc:
             raise DataChainError(
                 f"Dataset {remote_dataset_name} doesn't have version {version}"
                 " on server"
@@ -1732,64 +1692,24 @@ class Catalog:
         query_script: str,
         env: Optional[Mapping[str, str]] = None,
         python_executable: str = sys.executable,
-
-        capture_output: bool = True,
+        capture_output: bool = False,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
-        _execute_last_expression: bool = False,
     ) -> None:
-        ""
-        Method to run custom user Python script to run a query and, as result,
-        creates new dataset from the results of a query.
-        Returns tuple of result dataset and script output.
-
-        Constraints on query script:
-        1. datachain.query.DatasetQuery should be used in order to create query
-        for a dataset
-        2. There should not be any .save() call on DatasetQuery since the idea
-        is to create only one dataset as the outcome of the script
-        3. Last statement must be an instance of DatasetQuery
-
-        If save is set to True, we are creating new dataset with results
-        from dataset query. If it's set to False, we will just print results
-        without saving anything
-
-        Example of query script:
-        from datachain.query import DatasetQuery, C
-        DatasetQuery('s3://ldb-public/remote/datasets/mnist-tiny/').filter(
-        C.size > 1000
-        )
-        """
-        if _execute_last_expression:
-            try:
-                code_ast = ast.parse(query_script)
-                code_ast = self.attach_query_wrapper(code_ast)
-                query_script_compiled = ast.unparse(code_ast)
-            except Exception as exc:
-                raise QueryScriptCompileError(
-                    f"Query script failed to compile, reason: {exc}"
-                ) from exc
-        else:
-            query_script_compiled = query_script
-            assert not save
-
+        cmd = [python_executable, "-c", query_script]
         env = dict(env or os.environ)
         env.update(
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
-                "PYTHONPATH": os.getcwd(),  # For local imports
-                "DATACHAIN_QUERY_SAVE": "1" if save else "",
-                "PYTHONUNBUFFERED": "1",
                 "DATACHAIN_JOB_ID": job_id or "",
             },
         )
-        popen_kwargs = {}
+        popen_kwargs: dict[str, Any] = {}
         if capture_output:
             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}

-        cmd =
-        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # type: ignore[call-overload] # noqa: S603
+        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
             if capture_output:
                 args = (proc.stdout, output_hook)
                 thread = Thread(target=_process_stream, args=args, daemon=True)
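
After this change, Catalog.query() no longer rewrites the script's AST or injects a query wrapper; it simply runs the script in a subprocess and passes parameters through the environment. A roughly equivalent standalone sketch (the script body and parameter values are illustrative):

import json
import os
import subprocess
import sys

script = "import datachain; print(datachain.__name__)"
env = dict(os.environ)
env.update(
    {
        "DATACHAIN_QUERY_PARAMS": json.dumps({"limit": "10"}),
        "DATACHAIN_JOB_ID": "",
    }
)
subprocess.run([sys.executable, "-c", script], env=env, check=True)
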
datachain/cli.py
CHANGED
@@ -15,6 +15,7 @@ import shtab
 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.lib.dc import DataChain
+from datachain.telemetry import telemetry
 from datachain.utils import DataChainDir

 if TYPE_CHECKING:
@@ -803,7 +804,6 @@ def query(
     catalog.query(
         script_content,
         python_executable=python_executable,
-        capture_output=False,
         params=params,
         job_id=job_id,
     )
@@ -872,6 +872,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
     # This also sets this environment variable for any subprocesses
     os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"

+    error = None
     try:
         catalog = get_catalog(client_config=client_config)
         if args.command == "cp":
@@ -1003,14 +1004,16 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
             print(f"invalid command: {args.command}", file=sys.stderr)
             return 1
         return 0
-    except BrokenPipeError:
+    except BrokenPipeError as exc:
         # Python flushes standard streams on exit; redirect remaining output
         # to devnull to avoid another BrokenPipeError at shutdown
         # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
+        error = str(exc)
         devnull = os.open(os.devnull, os.O_WRONLY)
         os.dup2(devnull, sys.stdout.fileno())
         return 141  # 128 + 13 (SIGPIPE)
     except (KeyboardInterrupt, Exception) as exc:
+        error = str(exc)
         if isinstance(exc, KeyboardInterrupt):
             msg = "Operation cancelled by the user"
         else:
@@ -1028,3 +1031,5 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09

         pdb.post_mortem()
         return 1
+    finally:
+        telemetry.send_cli_call(args.command, error=error)
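
The telemetry hook follows a simple pattern: one error slot is filled by whichever exception handler runs, and the event is sent exactly once from finally. A stripped-down, self-contained sketch of that control flow; run() and the command name are placeholders, while the send_cli_call signature is taken from the diff above:

from datachain.telemetry import telemetry

def run(command: str) -> int:
    # Placeholder for the real per-command dispatch in main().
    return 0

error = None
command = "ls"
try:
    code = run(command)
except (KeyboardInterrupt, Exception) as exc:   # BrokenPipeError is handled the same way
    error = str(exc)
    code = 1
finally:
    telemetry.send_cli_call(command, error=error)
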
datachain/client/fsspec.py
CHANGED
@@ -3,7 +3,6 @@ import functools
 import logging
 import multiprocessing
 import os
-import posixpath
 import re
 import sys
 from abc import ABC, abstractmethod
@@ -26,8 +25,8 @@ from fsspec.asyn import get_loop, sync
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from tqdm import tqdm

-from datachain.cache import DataChainCache
-from datachain.client.fileslice import
+from datachain.cache import DataChainCache
+from datachain.client.fileslice import FileWrapper
 from datachain.error import ClientError as DataChainClientError
 from datachain.lib.file import File
 from datachain.nodes_fetcher import NodesFetcher
@@ -187,8 +186,8 @@ class Client(ABC):
     def url(self, path: str, expires: int = 3600, **kwargs) -> str:
         return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)

-    async def get_current_etag(self,
-        info = await self.fs._info(self.get_full_path(
+    async def get_current_etag(self, file: "File") -> str:
+        info = await self.fs._info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
@@ -317,7 +316,7 @@ class Client(ABC):

     def instantiate_object(
         self,
-
+        file: "File",
         dst: str,
         progress_bar: tqdm,
         force: bool = False,
@@ -328,10 +327,10 @@ class Client(ABC):
         else:
             progress_bar.close()
             raise FileExistsError(f"Path {dst} already exists")
-        self.do_instantiate_object(
+        self.do_instantiate_object(file, dst)

-    def do_instantiate_object(self,
-        src = self.cache.get_path(
+    def do_instantiate_object(self, file: "File", dst: str) -> None:
+        src = self.cache.get_path(file)
         assert src is not None

         try:
@@ -341,66 +340,33 @@ class Client(ABC):
             copy2(src, dst)

     def open_object(
-        self,
+        self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
     ) -> BinaryIO:
         """Open a file, including files in tar archives."""
-
-        if use_cache and (cache_path := self.cache.get_path(uid)):
+        if use_cache and (cache_path := self.cache.get_path(file)):
             return open(cache_path, mode="rb")  # noqa: SIM115
-
-
-
-
-
-
-
-
-        offset = location["offset"]
-        size = location["size"]
-        parent = location["parent"]
-
-        parent_uid = UniqueId(
-            parent["source"],
-            parent["path"],
-            parent["size"],
-            parent["etag"],
-            location=parent["location"],
-        )
-        f = self.open_object(parent_uid, use_cache=use_cache)
-        return FileSlice(f, offset, size, posixpath.basename(uid.path))
-
-    def download(self, uid: UniqueId, *, callback: Callback = DEFAULT_CALLBACK) -> None:
-        sync(get_loop(), functools.partial(self._download, uid, callback=callback))
-
-    async def _download(self, uid: UniqueId, *, callback: "Callback" = None) -> None:
-        if self.cache.contains(uid):
+        assert not file.location
+        return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
+
+    def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
+        sync(get_loop(), functools.partial(self._download, file, callback=callback))
+
+    async def _download(self, file: File, *, callback: "Callback" = None) -> None:
+        if self.cache.contains(file):
             # Already in cache, so there's nothing to do.
             return
-        await self._put_in_cache(
+        await self._put_in_cache(file, callback=callback)

-    def put_in_cache(self,
-        sync(get_loop(), functools.partial(self._put_in_cache,
+    def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+        sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))

-    async def _put_in_cache(
-
-
-
-
-        loop = asyncio.get_running_loop()
-        await loop.run_in_executor(
-            None, functools.partial(self._download_from_tar, uid, callback=callback)
-        )
-        return
-        if uid.etag:
-            etag = await self.get_current_etag(uid)
-            if uid.etag != etag:
+    async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+        assert not file.location
+        if file.etag:
+            etag = await self.get_current_etag(file)
+            if file.etag != etag:
                 raise FileNotFoundError(
-                    f"Invalid etag for {
-                    f"expected {
+                    f"Invalid etag for {file.source}/{file.path}: "
+                    f"expected {file.etag}, got {etag}"
                 )
-        await self.cache.download(
-
-    def _download_from_tar(self, uid, *, callback: "Callback" = None):
-        with self._open_tar(uid, use_cache=False) as f:
-            contents = f.read()
-            self.cache.store_data(uid, contents)
+        await self.cache.download(file, self, callback=callback)
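
Client methods now take File objects as well, and plain (non-virtual) files are streamed through FileWrapper instead of the removed tar/FileSlice path. A rough usage sketch; the get_catalog() accessor and the File field values are assumptions for illustration:

from datachain.catalog import get_catalog
from datachain.lib.file import File

catalog = get_catalog()
client = catalog.get_client("s3://bucket")                # uri is now a plain str
file = File(source="s3://bucket", path="images/cat.jpg")  # illustrative fields

client.download(file)                                     # no-op if already cached
with client.open_object(file, use_cache=True) as f:
    header = f.read(1024)                                 # cache hit or a wrapped fs stream
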
datachain/client/local.py
CHANGED
@@ -7,7 +7,6 @@ from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

-from datachain.cache import UniqueId
 from datachain.lib.file import File
 from datachain.storage import StorageURI

@@ -114,8 +113,8 @@ class FileClient(Client):
             use_symlinks=use_symlinks,
         )

-    async def get_current_etag(self,
-        info = self.fs.info(self.get_full_path(
+    async def get_current_etag(self, file: "File") -> str:
+        info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:

datachain/dataset.py
CHANGED
@@ -12,6 +12,7 @@ from typing import (
 from urllib.parse import urlparse

 from datachain.client import Client
+from datachain.error import DatasetVersionNotFoundError
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

 if TYPE_CHECKING:
@@ -417,7 +418,9 @@ class DatasetRecord:

     def get_version(self, version: int) -> DatasetVersion:
         if not self.has_version(version):
-            raise
+            raise DatasetVersionNotFoundError(
+                f"Dataset {self.name} does not have version {version}"
+            )
         return next(
             v
             for v in self.versions  # type: ignore [union-attr]
@@ -435,7 +438,9 @@ class DatasetRecord:
         Get identifier in the form my-dataset@v3
         """
         if not self.has_version(version):
-            raise
+            raise DatasetVersionNotFoundError(
+                f"Dataset {self.name} doesn't have a version {version}"
+            )
         return f"{self.name}@v{version}"

     def uri(self, version: int) -> str:
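
Instead of a bare raise, missing versions now surface as a dedicated exception, so callers can tell "the dataset exists but not at this version" apart from other failures. A small sketch, assuming the dataset name and version are illustrative:

from datachain.catalog import get_catalog
from datachain.error import DatasetVersionNotFoundError

dataset = get_catalog().get_dataset("my-dataset")   # DatasetRecord
try:
    version = dataset.get_version(3)
except DatasetVersionNotFoundError as exc:
    print(f"version missing: {exc}")
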
datachain/error.py
CHANGED
@@ -10,6 +10,10 @@ class DatasetNotFoundError(NotFoundError):
     pass


+class DatasetVersionNotFoundError(NotFoundError):
+    pass
+
+
 class DatasetInvalidVersionError(Exception):
     pass

@@ -32,14 +36,12 @@ class QueryScriptRunError(Exception):
     Attributes:
         message Explanation of the error
         return_code Code returned by the subprocess
-        output STDOUT + STDERR output of the subprocess
     """

-    def __init__(self, message: str, return_code: int = 0
+    def __init__(self, message: str, return_code: int = 0):
         self.message = message
         self.return_code = return_code
-
-        super().__init__(self.message)
+        super().__init__(message)


 class QueryScriptCancelError(QueryScriptRunError):

datachain/lib/arrow.py
CHANGED
@@ -4,11 +4,11 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Optional

 import pyarrow as pa
-from pyarrow.dataset import dataset
+from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm import tqdm

 from datachain.lib.data_model import dict_to_data_model
-from datachain.lib.file import
+from datachain.lib.file import ArrowRow, File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator

@@ -49,7 +49,8 @@ class ArrowGenerator(Generator):

     def process(self, file: File):
         if file._caching_enabled:
-
+            file.ensure_cached()
+            path = file.get_local_path()
             ds = dataset(path, schema=self.input_schema, **self.kwargs)
         elif self.nrows:
             path = _nrows_file(file, self.nrows)
@@ -83,7 +84,12 @@ class ArrowGenerator(Generator):
             vals_dict[field] = val
         vals = [self.output_schema(**vals_dict)]
         if self.source:
-
+            kwargs: dict = self.kwargs
+            # Can't serialize CsvFileFormat; may lose formatting options.
+            if isinstance(kwargs.get("format"), CsvFileFormat):
+                kwargs["format"] = "csv"
+            arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
+            yield [arrow_file, *vals]
         else:
             yield vals
         index += 1
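
When source is enabled, each generated row now carries an ArrowRow recording the originating file, the row index, and the (serializable) dataset kwargs; a CsvFileFormat kwarg is downgraded to the plain "csv" string, as in the diff above. A minimal construction sketch with illustrative values:

from datachain.lib.file import ArrowRow, File

src = File(path="data/part-0.parquet")                               # illustrative source file
row_ref = ArrowRow(file=src, index=42, kwargs={"format": "parquet"})
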