datachain 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/catalog/catalog.py +5 -1
- datachain/cli/__init__.py +11 -9
- datachain/cli/commands/query.py +1 -0
- datachain/cli/parser/__init__.py +9 -1
- datachain/cli/parser/job.py +6 -1
- datachain/data_storage/job.py +1 -0
- datachain/data_storage/metastore.py +82 -71
- datachain/data_storage/warehouse.py +46 -34
- datachain/lib/arrow.py +23 -1
- datachain/lib/dc/csv.py +1 -0
- datachain/lib/dc/datachain.py +30 -13
- datachain/lib/listing.py +2 -0
- datachain/lib/udf.py +17 -5
- datachain/query/batch.py +40 -39
- datachain/query/dataset.py +33 -32
- datachain/query/dispatch.py +137 -75
- datachain/query/metrics.py +1 -2
- datachain/query/queue.py +1 -11
- datachain/query/udf.py +1 -1
- datachain/query/utils.py +8 -14
- datachain/remote/studio.py +2 -0
- datachain/studio.py +3 -0
- datachain/utils.py +3 -0
- {datachain-0.16.3.dist-info → datachain-0.16.5.dist-info}/METADATA +1 -1
- {datachain-0.16.3.dist-info → datachain-0.16.5.dist-info}/RECORD +29 -29
- {datachain-0.16.3.dist-info → datachain-0.16.5.dist-info}/WHEEL +1 -1
- {datachain-0.16.3.dist-info → datachain-0.16.5.dist-info}/entry_points.txt +0 -0
- {datachain-0.16.3.dist-info → datachain-0.16.5.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.16.3.dist-info → datachain-0.16.5.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -79,6 +79,7 @@ DATASET_INTERNAL_ERROR_MESSAGE = "Internal error on creating dataset"
 QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 # exit code we use if query script was canceled
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
+QUERY_SCRIPT_SIGTERM_EXIT_CODE = -15  # if query script was terminated by SIGTERM
 
 # dataset pull
 PULL_DATASET_MAX_THREADS = 5
@@ -1645,7 +1646,10 @@ class Catalog:
         thread.join()  # wait for the reader thread
 
         logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
-        if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
+        if proc.returncode in (
+            QUERY_SCRIPT_CANCELED_EXIT_CODE,
+            QUERY_SCRIPT_SIGTERM_EXIT_CODE,
+        ):
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
                 return_code=proc.returncode,
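For context on the new constant: on POSIX systems, `subprocess` reports a child killed by a signal as a negative return code, so a query script terminated by SIGTERM (signal 15) surfaces as `returncode == -15`, which the updated check now treats the same as an explicit cancel. A minimal, self-contained illustration (not datachain code):

import signal
import subprocess
import sys

# Spawn a long-running child, send SIGTERM, and observe the return code.
proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
proc.send_signal(signal.SIGTERM)
proc.wait()
print(proc.returncode)  # -15 on POSIX, i.e. QUERY_SCRIPT_SIGTERM_EXIT_CODE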
datachain/cli/__init__.py
CHANGED
@@ -34,8 +34,10 @@ def main(argv: Optional[list[str]] = None) -> int:
     datachain_parser = get_parser()
     args = datachain_parser.parse_args(argv)
 
-    if args.command in ("internal-run-udf", "internal-run-udf-worker"):
-        return handle_udf(args.command)
+    if args.command == "internal-run-udf":
+        return handle_udf()
+    if args.command == "internal-run-udf-worker":
+        return handle_udf_runner(args.fd)
 
     if args.command is None:
         datachain_parser.print_help(sys.stderr)
@@ -303,13 +305,13 @@ def handle_general_exception(exc, args, logging_level):
     return error, 1
 
 
-def handle_udf(command: str) -> int:
-    if command == "internal-run-udf":
-        from datachain.query.dispatch import udf_entrypoint
+def handle_udf() -> int:
+    from datachain.query.dispatch import udf_entrypoint
 
-        return udf_entrypoint()
+    return udf_entrypoint()
 
-    if command == "internal-run-udf-worker":
-        from datachain.query.dispatch import udf_worker_entrypoint
 
-        return udf_worker_entrypoint()
+def handle_udf_runner(fd: Optional[int] = None) -> int:
+    from datachain.query.dispatch import udf_worker_entrypoint
+
+    return udf_worker_entrypoint(fd)
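The split into `handle_udf` and `handle_udf_runner` pairs with the new `--fd` option: the worker is evidently handed an inherited file descriptor to write results to. A hypothetical launch sketch (the real launch code lives in `datachain/query/dispatch.py`, which this diff also touches but does not show here; the `pass_fds` mechanics are an assumption):

import os
import subprocess

# Create a pipe and hand the write end to the worker subcommand.
read_fd, write_fd = os.pipe()
proc = subprocess.Popen(
    ["datachain", "internal-run-udf-worker", "--fd", str(write_fd)],
    pass_fds=(write_fd,),  # keep the descriptor open across exec
)
os.close(write_fd)  # the parent keeps only the read end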
datachain/cli/commands/query.py
CHANGED
datachain/cli/parser/__init__.py
CHANGED
@@ -549,7 +549,15 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     add_anon_arg(parse_gc)
 
     subp.add_parser("internal-run-udf", parents=[parent_parser])
-    subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
+    run_udf_worker = subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
+    run_udf_worker.add_argument(
+        "--fd",
+        type=int,
+        action="store",
+        default=None,
+        help="File descriptor to write results to",
+    )
+
     add_completion_parser(subp, [parent_parser])
     return parser
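A quick sketch of how the new flag parses, using `get_parser()` from this module (the subcommand is internal, so this is illustration only):

from datachain.cli.parser import get_parser

args = get_parser().parse_args(["internal-run-udf-worker", "--fd", "3"])
assert args.fd == 3  # parsed as an int

args = get_parser().parse_args(["internal-run-udf-worker"])
assert args.fd is None  # defaults to None when the flag is omitted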
datachain/cli/parser/job.py
CHANGED
@@ -13,7 +13,7 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
     )
     jobs_subparser = jobs_parser.add_subparsers(
         dest="cmd",
-        help="Use `datachain
+        help="Use `datachain job CMD --help` to display command-specific help",
     )
 
     studio_run_help = "Run a job in Studio"
@@ -66,6 +66,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         action="store",
         help="Python version for the job (e.g., 3.9, 3.10, 3.11)",
     )
+    studio_run_parser.add_argument(
+        "--repository",
+        action="store",
+        help="Repository URL to clone before running the job",
+    )
     studio_run_parser.add_argument(
         "--req-file",
         action="store",
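A hedged sketch of the new flag in use; the rest of the `datachain job run` signature is not part of this diff, so the subcommand spelling and the positional query-script argument here are assumptions:

from datachain.cli.parser import get_parser

args = get_parser().parse_args(
    ["job", "run", "query.py", "--repository", "https://example.com/org/repo.git"]
)
print(args.repository)  # https://example.com/org/repo.git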
datachain/data_storage/job.py
CHANGED
datachain/data_storage/metastore.py
CHANGED
@@ -254,6 +254,7 @@ class AbstractMetastore(ABC, Serializable):
         name: str,
         query: str,
         query_type: JobQueryType = JobQueryType.PYTHON,
+        status: JobStatus = JobStatus.CREATED,
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
@@ -264,33 +265,35 @@ class AbstractMetastore(ABC, Serializable):
         """
 
     @abstractmethod
-    def set_job_status(
+    def get_job(self, job_id: str) -> Optional[Job]:
+        """Returns the job with the given ID."""
+
+    @abstractmethod
+    def update_job(
         self,
         job_id: str,
-        status: JobStatus,
+        status: Optional[JobStatus] = None,
+        exit_code: Optional[int] = None,
         error_message: Optional[str] = None,
         error_stack: Optional[str] = None,
+        finished_at: Optional[datetime] = None,
         metrics: Optional[dict[str, Any]] = None,
-    ) -> None:
-        """Set the status of the given job."""
+    ) -> Optional["Job"]:
+        """Updates job fields."""
 
     @abstractmethod
-    def get_job_status(self, job_id: str) -> Optional[JobStatus]:
-        """Returns the status of the given job."""
-
-    @abstractmethod
-    def set_job_and_dataset_status(
+    def set_job_status(
         self,
         job_id: str,
-        job_status: JobStatus,
-        dataset_status: DatasetStatus,
+        status: JobStatus,
+        error_message: Optional[str] = None,
+        error_stack: Optional[str] = None,
     ) -> None:
-        """Set the status of the given job and dataset."""
+        """Set the status of the given job."""
 
     @abstractmethod
-    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
-        """Returns dataset names and versions for the job."""
-        raise NotImplementedError
+    def get_job_status(self, job_id: str) -> Optional[JobStatus]:
+        """Returns the status of the given job."""
 
 
 class AbstractDBMetastore(AbstractMetastore):
@@ -651,30 +654,31 @@ class AbstractDBMetastore(AbstractMetastore):
         dataset_version = dataset.get_version(version)
 
         values = {}
+        version_values: dict = {}
         for field, value in kwargs.items():
             if field in self._dataset_version_fields[1:]:
                 if field == "schema":
-                    dataset_version.update(**{field: DatasetRecord.parse_schema(value)})
                     values[field] = json.dumps(value) if value else None
+                    version_values[field] = DatasetRecord.parse_schema(value)
                 elif field == "feature_schema":
                     values[field] = json.dumps(value) if value else None
+                    version_values[field] = value
                 elif field == "preview" and isinstance(value, list):
                     values[field] = json.dumps(value, cls=JSONSerialize)
+                    version_values[field] = value
                 else:
                     values[field] = value
-                    dataset_version.update(**{field: value})
-
-        if not values:
-            # Nothing to update
-            return dataset_version
+                    version_values[field] = value
 
-        dv = self._datasets_versions
-        self.db.execute(
-            self._datasets_versions_update()
-            .where(dv.c.dataset_id == dataset.id and dv.c.version == version)
-            .values(values),
-            conn=conn,
-        )  # type: ignore [attr-defined]
+        if values:
+            dv = self._datasets_versions
+            self.db.execute(
+                self._datasets_versions_update()
+                .where(dv.c.dataset_id == dataset.id and dv.c.version == version)
+                .values(values),
+                conn=conn,
+            )  # type: ignore [attr-defined]
+        dataset_version.update(**version_values)
 
         return dataset_version
 
@@ -702,7 +706,7 @@ class AbstractDBMetastore(AbstractMetastore):
         dataset_fields: list[str],
         dataset_version_fields: list[str],
         isouter: bool = True,
-    ):
+    ) -> "Select":
         if not (
             self.db.has_table(self._datasets.name)
             and self.db.has_table(self._datasets_versions.name)
@@ -719,12 +723,12 @@ class AbstractDBMetastore(AbstractMetastore):
         j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
         return query.select_from(j)
 
-    def _base_dataset_query(self):
+    def _base_dataset_query(self) -> "Select":
         return self._get_dataset_query(
             self._dataset_fields, self._dataset_version_fields
         )
 
-    def _base_list_datasets_query(self):
+    def _base_list_datasets_query(self) -> "Select":
        return self._get_dataset_query(
            self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
        )
@@ -1018,6 +1022,7 @@ class AbstractDBMetastore(AbstractMetastore):
         name: str,
         query: str,
         query_type: JobQueryType = JobQueryType.PYTHON,
+        status: JobStatus = JobStatus.CREATED,
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
@@ -1032,7 +1037,7 @@ class AbstractDBMetastore(AbstractMetastore):
             self._jobs_insert().values(
                 id=job_id,
                 name=name,
-                status=JobStatus.CREATED,
+                status=status,
                 created_at=datetime.now(timezone.utc),
                 query=query,
                 query_type=query_type.value,
@@ -1047,25 +1052,65 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         return job_id
 
+    def get_job(self, job_id: str, conn=None) -> Optional[Job]:
+        """Returns the job with the given ID."""
+        query = self._jobs_select(self._jobs).where(self._jobs.c.id == job_id)
+        results = list(self.db.execute(query, conn=conn))
+        if not results:
+            return None
+        return self._parse_job(results[0])
+
+    def update_job(
+        self,
+        job_id: str,
+        status: Optional[JobStatus] = None,
+        exit_code: Optional[int] = None,
+        error_message: Optional[str] = None,
+        error_stack: Optional[str] = None,
+        finished_at: Optional[datetime] = None,
+        metrics: Optional[dict[str, Any]] = None,
+        conn: Optional[Any] = None,
+    ) -> Optional["Job"]:
+        """Updates job fields."""
+        values: dict = {}
+        if status is not None:
+            values["status"] = status
+        if exit_code is not None:
+            values["exit_code"] = exit_code
+        if error_message is not None:
+            values["error_message"] = error_message
+        if error_stack is not None:
+            values["error_stack"] = error_stack
+        if finished_at is not None:
+            values["finished_at"] = finished_at
+        if metrics:
+            values["metrics"] = json.dumps(metrics)
+
+        if values:
+            j = self._jobs
+            self.db.execute(
+                self._jobs_update().where(j.c.id == job_id).values(**values),
+                conn=conn,
+            )  # type: ignore [attr-defined]
+
+        return self.get_job(job_id, conn=conn)
+
     def set_job_status(
         self,
         job_id: str,
         status: JobStatus,
         error_message: Optional[str] = None,
         error_stack: Optional[str] = None,
-        metrics: Optional[dict[str, Any]] = None,
         conn: Optional[Any] = None,
     ) -> None:
         """Set the status of the given job."""
-        values: dict = {"status": status
-        if status
+        values: dict = {"status": status}
+        if status in JobStatus.finished():
             values["finished_at"] = datetime.now(timezone.utc)
         if error_message:
             values["error_message"] = error_message
         if error_stack:
             values["error_stack"] = error_stack
-        if metrics:
-            values["metrics"] = json.dumps(metrics)
         self.db.execute(
             self._jobs_update(self._jobs.c.id == job_id).values(**values),
             conn=conn,
@@ -1086,37 +1131,3 @@ class AbstractDBMetastore(AbstractMetastore):
         if not results:
             return None
         return results[0][0]
-
-    def set_job_and_dataset_status(
-        self,
-        job_id: str,
-        job_status: JobStatus,
-        dataset_status: DatasetStatus,
-    ) -> None:
-        """Set the status of the given job and dataset."""
-        with self.db.transaction() as conn:
-            self.set_job_status(job_id, status=job_status, conn=conn)
-            dv = self._datasets_versions
-            query = (
-                self._datasets_versions_update()
-                .where(
-                    (dv.c.job_id == job_id) & (dv.c.status != DatasetStatus.COMPLETE)
-                )
-                .values(status=dataset_status)
-            )
-            self.db.execute(query, conn=conn)  # type: ignore[attr-defined]
-
-    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
-        """Returns dataset names and versions for the job."""
-        dv = self._datasets_versions
-        ds = self._datasets
-
-        join_condition = dv.c.dataset_id == ds.c.id
-
-        query = (
-            self._datasets_versions_select(ds.c.name, dv.c.version)
-            .select_from(dv.join(ds, join_condition))
-            .where(dv.c.job_id == job_id)
-        )
-
-        return list(self.db.execute(query))
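Taken together, the metastore changes replace the coupled job-and-dataset status updates with `get_job` plus a generic partial `update_job`. A hedged usage sketch (assuming a concrete metastore instance; the `JobStatus` member names are assumptions, since the enum itself is not shown in this diff):

from datetime import datetime, timezone

from datachain.data_storage.job import JobStatus


def finish_job(metastore, job_id: str, exit_code: int) -> None:
    # Only explicitly passed fields are written; update_job returns the
    # refreshed Job record, or None if the id is unknown.
    job = metastore.update_job(
        job_id,
        status=JobStatus.COMPLETE if exit_code == 0 else JobStatus.FAILED,  # assumed members
        exit_code=exit_code,
        finished_at=datetime.now(timezone.utc),
    )
    if job is not None:
        print(job.status, job.finished_at)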
datachain/data_storage/warehouse.py
CHANGED
@@ -11,16 +11,15 @@ from urllib.parse import urlparse
 
 import attrs
 import sqlalchemy as sa
-from sqlalchemy import Table, case, select
-from sqlalchemy.sql import func
 from sqlalchemy.sql.expression import true
-from tqdm.auto import tqdm
 
 from datachain.client import Client
 from datachain.data_storage.schema import convert_rows_custom_column_types
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import DatasetRecord, StorageURI
 from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
+from datachain.query.batch import RowsOutput
+from datachain.query.utils import get_query_id_column
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
 from datachain.utils import sql_escape_like
@@ -31,7 +30,6 @@ if TYPE_CHECKING:
         _FromClauseArgument,
         _OnClauseArgument,
     )
-    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine
 
     from datachain.data_storage import schema
@@ -199,13 +197,13 @@ class AbstractWarehouse(ABC, Serializable):
     # Query Execution
     #
 
-    def query_count(self, query: sa.
+    def query_count(self, query: sa.Select) -> int:
         """Count the number of rows in a query."""
-        count_query = sa.select(func.count(1)).select_from(query.subquery())
+        count_query = sa.select(sa.func.count(1)).select_from(query.subquery())
         return next(self.db.execute(count_query))[0]
 
     def table_rows_count(self, table) -> int:
-        count_query = sa.select(func.count(1)).select_from(table)
+        count_query = sa.select(sa.func.count(1)).select_from(table)
         return next(self.db.execute(count_query))[0]
 
     def dataset_select_paginated(
@@ -278,7 +276,7 @@ class AbstractWarehouse(ABC, Serializable):
         name: str,
         columns: Sequence["sa.Column"] = (),
         if_not_exists: bool = True,
-    ) -> Table:
+    ) -> sa.Table:
         """Creates a dataset rows table for the given dataset name and columns"""
 
     def drop_dataset_rows_table(
@@ -289,7 +287,7 @@ class AbstractWarehouse(ABC, Serializable):
     ) -> None:
         """Drops a dataset rows table for the given dataset name."""
         table_name = self.dataset_table_name(dataset.name, version)
-        table = Table(table_name, self.db.metadata)
+        table = sa.Table(table_name, self.db.metadata)
         self.db.drop_table(table, if_exists=if_exists)
 
     @abstractmethod
@@ -309,7 +307,7 @@ class AbstractWarehouse(ABC, Serializable):
 
     def dataset_rows_select(
         self,
-        query: sa.
+        query: sa.Select,
         **kwargs,
     ) -> Iterator[tuple[Any, ...]]:
         """
@@ -320,6 +318,24 @@ class AbstractWarehouse(ABC, Serializable):
             query.selected_columns, rows, self.db.dialect
         )
 
+    def dataset_rows_select_from_ids(
+        self,
+        query: sa.Select,
+        ids: Iterable[RowsOutput],
+        is_batched: bool,
+    ) -> Iterator[RowsOutput]:
+        """
+        Fetch dataset rows from database using a list of IDs.
+        """
+        if (id_col := get_query_id_column(query)) is None:
+            raise RuntimeError("sys__id column not found in query")
+
+        if is_batched:
+            for batch in ids:
+                yield list(self.dataset_rows_select(query.where(id_col.in_(batch))))
+        else:
+            yield from self.dataset_rows_select(query.where(id_col.in_(ids)))
+
     @abstractmethod
     def get_dataset_sources(
         self, dataset: DatasetRecord, version: int
@@ -341,7 +357,7 @@ class AbstractWarehouse(ABC, Serializable):
         """Returns total number of rows in a dataset"""
         dr = self.dataset_rows(dataset, version)
         table = dr.get_table()
-        query = select(sa.func.count(table.c.sys__id))
+        query = sa.select(sa.func.count(table.c.sys__id))
         (res,) = self.db.execute(query)
         return res[0]
 
@@ -364,7 +380,7 @@ class AbstractWarehouse(ABC, Serializable):
         ]
         if size_columns:
             expressions = (*expressions, sa.func.sum(sum(size_columns)))
-        query = select(*expressions)
+        query = sa.select(*expressions)
         ((nrows, *rest),) = self.db.execute(query)
         return nrows, rest[0] if rest else 0
 
@@ -373,10 +389,10 @@ class AbstractWarehouse(ABC, Serializable):
         """Convert File entries so they can be passed on to `insert_rows()`"""
 
     @abstractmethod
-    def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
+    def insert_rows(self, table: sa.Table, rows: Iterable[dict[str, Any]]) -> None:
         """Does batch inserts of any kind of rows into table"""
 
-    def insert_rows_done(self, table: Table) -> None:
+    def insert_rows_done(self, table: sa.Table) -> None:
         """
         Only needed for certain implementations
        to signal when rows inserts are complete.
@@ -497,7 +513,7 @@ class AbstractWarehouse(ABC, Serializable):
         ).subquery()
         path_glob = "/".join([*path_list, glob_name])
         dirpath = path_glob[: -len(glob_name)]
-        relpath = func.substr(de.c(q, "path"), len(dirpath) + 1)
+        relpath = sa.func.substr(de.c(q, "path"), len(dirpath) + 1)
 
         return self.get_nodes(
             self.expand_query(de, q, dr)
@@ -584,13 +600,13 @@ class AbstractWarehouse(ABC, Serializable):
             default = getattr(
                 attrs.fields(Node), dr.without_object(column.name)
             ).default
-            return func.coalesce(column, default).label(column.name)
+            return sa.func.coalesce(column, default).label(column.name)
 
         return sa.select(
             q.c.sys__id,
-            case(
-                (de.c(q, "is_dir") == true(), DirType.DIR), else_=DirType.FILE
-            ),
+            sa.case(
+                (de.c(q, "is_dir") == true(), DirType.DIR), else_=DirType.FILE
+            ).label(dr.col_name("dir_type")),
             de.c(q, "path"),
             with_default(dr.c("etag")),
             de.c(q, "version"),
@@ -665,7 +681,7 @@ class AbstractWarehouse(ABC, Serializable):
             return de.c(inner_query, f)
 
         return self.db.execute(
-            select(*(field_to_expr(f) for f in fields)).order_by(
+            sa.select(*(field_to_expr(f) for f in fields)).order_by(
                 de.c(inner_query, "source"),
                 de.c(inner_query, "path"),
                 de.c(inner_query, "version"),
@@ -687,7 +703,7 @@ class AbstractWarehouse(ABC, Serializable):
             return dr.c(f)
 
         q = (
-            select(*(field_to_expr(f) for f in fields))
+            sa.select(*(field_to_expr(f) for f in fields))
             .where(
                 dr.c("path").like(f"{sql_escape_like(dirpath)}%"),
                 ~self.instr(pathfunc.name(dr.c("path")), "/"),
@@ -722,10 +738,10 @@ class AbstractWarehouse(ABC, Serializable):
         sub_glob = posixpath.join(path, "*")
         dr = dataset_rows
         selections: list[sa.ColumnElement] = [
-            func.sum(dr.c("size")),
+            sa.func.sum(dr.c("size")),
         ]
         if count_files:
-            selections.append(func.count())
+            selections.append(sa.func.count())
         results = next(
             self.db.execute(
                 dr.select(*selections).where(
@@ -842,7 +858,7 @@ class AbstractWarehouse(ABC, Serializable):
         self,
         columns: Sequence["sa.Column"] = (),
         name: Optional[str] = None,
-    ) -> Table:
+    ) -> sa.Table:
         """
         Create a temporary table for storing custom signals generated by a UDF.
         SQLite TEMPORARY tables cannot be directly used as they are process-specific,
@@ -860,8 +876,8 @@ class AbstractWarehouse(ABC, Serializable):
     @abstractmethod
     def copy_table(
         self,
-        table: Table,
-        query: "Select",
+        table: sa.Table,
+        query: sa.Select,
         progress_cb: Optional[Callable[[int], None]] = None,
     ) -> None:
         """
@@ -875,13 +891,13 @@ class AbstractWarehouse(ABC, Serializable):
         right: "_FromClauseArgument",
         onclause: "_OnClauseArgument",
         inner: bool = True,
-    ) -> "Select":
+    ) -> sa.Select:
         """
         Join two tables together.
         """
 
     @abstractmethod
-    def create_pre_udf_table(self, query: "Select") -> "Table":
+    def create_pre_udf_table(self, query: sa.Select) -> sa.Table:
         """
         Create a temporary table from a query for use in a UDF.
         """
@@ -906,12 +922,8 @@ class AbstractWarehouse(ABC, Serializable):
         are cleaned up as soon as they are no longer needed.
         """
         to_drop = set(names)
-        with tqdm(
-
-        ) as pbar:
-            for name in to_drop:
-                self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
-                pbar.update(1)
+        for name in to_drop:
+            self.db.drop_table(sa.Table(name, self.db.metadata), if_exists=True)
 
 
 def _random_string(length: int) -> str:
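A sketch of the two modes of the new `dataset_rows_select_from_ids` helper; the warehouse and query objects are assumed to exist, and the id values are illustrative:

import sqlalchemy as sa


def fetch_rows(warehouse, query: sa.Select):
    # Flat mode: ids is one iterable of sys__id values; rows come back
    # one at a time.
    for row in warehouse.dataset_rows_select_from_ids(
        query, ids=[1, 2, 3], is_batched=False
    ):
        print(row)

    # Batched mode: ids yields id-batches, and each yielded item is the
    # full list of rows for one batch, preserving batch boundaries.
    for batch in warehouse.dataset_rows_select_from_ids(
        query, ids=[[1, 2], [3, 4]], is_batched=True
    ):
        print(len(batch))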
datachain/lib/arrow.py
CHANGED
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any, Optional
 
 import orjson
 import pyarrow as pa
+from pyarrow._csv import ParseOptions
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm.auto import tqdm
 
@@ -26,6 +27,18 @@ if TYPE_CHECKING:
 DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
 
 
+def fix_pyarrow_format(format, parse_options=None):
+    # Re-init invalid row handler: https://issues.apache.org/jira/browse/ARROW-17641
+    if (
+        format
+        and isinstance(format, CsvFileFormat)
+        and parse_options
+        and isinstance(parse_options, ParseOptions)
+    ):
+        format.parse_options = parse_options
+    return format
+
+
 class ArrowGenerator(Generator):
     DEFAULT_BATCH_SIZE = 2**17  # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`
 
@@ -53,6 +66,7 @@ class ArrowGenerator(Generator):
         self.output_schema = output_schema
         self.source = source
         self.nrows = nrows
+        self.parse_options = kwargs.pop("parse_options", None)
         self.kwargs = kwargs
 
     def process(self, file: File):
@@ -64,7 +78,11 @@ class ArrowGenerator(Generator):
         else:
             fs, fs_path = file.get_fs(), file.get_path()
 
-        ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **self.kwargs)
+        kwargs = self.kwargs
+        if format := kwargs.get("format"):
+            kwargs["format"] = fix_pyarrow_format(format, self.parse_options)
+
+        ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **kwargs)
 
         hf_schema = _get_hf_schema(ds.schema)
         use_datachain_schema = (
@@ -137,6 +155,10 @@ class ArrowGenerator(Generator):
 
 
 def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
+    parse_options = kwargs.pop("parse_options", None)
+    if format := kwargs.get("format"):
+        kwargs["format"] = fix_pyarrow_format(format, parse_options)
+
     schemas = []
     for file in chain.collect("file"):
         ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]