datachain 0.7.11__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/catalog/catalog.py +56 -45
- datachain/cli.py +25 -3
- datachain/client/gcs.py +9 -0
- datachain/data_storage/sqlite.py +20 -6
- datachain/data_storage/warehouse.py +0 -1
- datachain/lib/arrow.py +82 -58
- datachain/lib/dc.py +167 -166
- datachain/lib/diff.py +197 -0
- datachain/lib/file.py +3 -1
- datachain/lib/listing.py +44 -0
- datachain/lib/meta_formats.py +38 -42
- datachain/lib/udf.py +0 -1
- datachain/query/batch.py +32 -6
- datachain/query/dataset.py +18 -17
- datachain/query/dispatch.py +125 -125
- datachain/query/session.py +8 -5
- datachain/query/udf.py +20 -0
- datachain/query/utils.py +42 -0
- datachain/remote/studio.py +53 -1
- datachain/studio.py +47 -2
- datachain/utils.py +1 -1
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/METADATA +4 -3
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/RECORD +27 -24
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/LICENSE +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/WHEEL +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1,7 +1,6 @@
 import io
 import json
 import logging
-import math
 import os
 import os.path
 import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from random import shuffle
 from threading import Thread
 from typing import (
     IO,
@@ -54,15 +52,12 @@ from datachain.error import (
     QueryScriptCancelError,
     QueryScriptRunError,
 )
+from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import (
-    DataChainDir,
-    batched,
-    datachain_paths_join,
-)
+from datachain.utils import DataChainDir, datachain_paths_join
 
 from .datasource import DataSource
 
@@ -90,7 +85,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
 
 # dataset pull
-PULL_DATASET_MAX_THREADS =
+PULL_DATASET_MAX_THREADS = 5
 PULL_DATASET_CHUNK_TIMEOUT = 3600
 PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be available
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
@@ -130,6 +125,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
+        progress_bar=None,
     ):
         super().__init__(max_threads)
         self._check_dependencies()
@@ -142,6 +138,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.schema = schema
         self.last_status_check: Optional[float] = None
         self.studio_client = StudioClient()
+        self.progress_bar = progress_bar
 
     def done_task(self, done):
         for task in done:
@@ -198,6 +195,20 @@ class DatasetRowsFetcher(NodesThreadPool):
         for c in [c for c, t in self.schema.items() if t == DateTime]:
             df[c] = pd.to_datetime(df[c], unit="s")
 
+        # id will be autogenerated in DB
+        return df.drop("sys__id", axis=1)
+
+    def get_parquet_content(self, url: str):
+        while True:
+            if self.should_check_for_status():
+                self.check_for_status()
+            r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+            if r.status_code == 404:
+                time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                continue
+            r.raise_for_status()
+            return r.content
+
     def do_task(self, urls):
         import lz4.frame
         import pandas as pd
@@ -207,31 +218,22 @@ class DatasetRowsFetcher(NodesThreadPool):
             local_ds = metastore.get_dataset(self.local_ds_name)
 
             urls = list(urls)
-            while urls:
-                for url in urls:
-                    if self.should_check_for_status():
-                        self.check_for_status()
 
-
-
-
-                    # moving to the next url
-                    continue
+            for url in urls:
+                if self.should_check_for_status():
+                    self.check_for_status()
 
-
-
-
-
-                    self.fix_columns(df)
-
-                    # id will be autogenerated in DB
-                    df = df.drop("sys__id", axis=1)
+                df = pd.read_parquet(
+                    io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
+                )
+                df = self.fix_columns(df)
 
-
-
-
-
-
+                inserted = warehouse.insert_dataset_rows(
+                    df, local_ds, self.local_ds_version
+                )
+                self.increase_counter(inserted)  # type: ignore [arg-type]
+                # sometimes progress bar doesn't get updated so manually updating it
+                self.update_progress_bar(self.progress_bar)
 
 
 @dataclass
@@ -598,7 +600,7 @@ class Catalog:
             source, session=self.session, update=update, object_name=object_name
         )
 
-        list_ds_name, list_uri, list_path, _ =
+        list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )
 
@@ -696,11 +698,9 @@ class Catalog:
         )
         indexed_sources = []
         for source in dataset_sources:
-            from datachain.lib.dc import DataChain
-
             client = self.get_client(source, **client_config)
             uri = client.uri
-            dataset_name, _, _, _ =
+            dataset_name, _, _, _ = get_listing(uri, self.session)
             listing = Listing(
                 self.metastore.clone(),
                 self.warehouse.clone(),
@@ -1291,13 +1291,13 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)
 
-    def pull_dataset(  # noqa: PLR0915
+    def pull_dataset(  # noqa: C901, PLR0915
        self,
        remote_ds_uri: str,
        output: Optional[str] = None,
        local_ds_name: Optional[str] = None,
        local_ds_version: Optional[int] = None,
-
+        cp: bool = False,
        force: bool = False,
        edatachain: bool = False,
        edatachain_file: Optional[str] = None,
@@ -1305,7 +1305,7 @@ class Catalog:
         client_config=None,
     ) -> None:
         def _instantiate(ds_uri: str) -> None:
-            if
+            if not cp:
                 return
             assert output
             self.cp(
@@ -1318,7 +1318,7 @@ class Catalog:
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
 
-        if
+        if cp and not output:
             raise ValueError("Please provide output directory for instantiation")
 
         studio_client = StudioClient()
@@ -1417,12 +1417,26 @@ class Catalog:
         signed_urls = export_response.data
 
         if signed_urls:
-            shuffle(signed_urls)
-
             with (
                 self.metastore.clone() as metastore,
                 self.warehouse.clone() as warehouse,
             ):
+
+                def batch(urls):
+                    """
+                    Batching urls in a way that fetching is most efficient as
+                    urls with lower id will be created first. Because that, we
+                    are making sure all threads are pulling most recent urls
+                    from beginning
+                    """
+                    res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
+                    current_worker = 0
+                    for url in signed_urls:
+                        res[current_worker].append(url)
+                        current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
+
+                    return res
+
                 rows_fetcher = DatasetRowsFetcher(
                     metastore,
                     warehouse,
@@ -1431,14 +1445,11 @@ class Catalog:
                     local_ds_name,
                     local_ds_version,
                     schema,
+                    progress_bar=dataset_save_progress_bar,
                 )
                 try:
                     rows_fetcher.run(
-
-                        signed_urls,
-                        math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                        ),
-                        dataset_save_progress_bar,
+                        iter(batch(signed_urls)), dataset_save_progress_bar
                     )
                 except:
                     self.remove_dataset(local_ds_name, local_ds_version)
datachain/cli.py
CHANGED
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         help="Python package requirement. Can be specified multiple times.",
     )
 
+    studio_cancel_help = "Cancel a job in Studio"
+    studio_cancel_description = "This command cancels a job in Studio."
+
+    studio_cancel_parser = studio_subparser.add_parser(
+        "cancel",
+        parents=[parent_parser],
+        description=studio_cancel_description,
+        help=studio_cancel_help,
+    )
+
+    studio_cancel_parser.add_argument(
+        "job_id",
+        action="store",
+        help="The job ID to cancel.",
+    )
+    studio_cancel_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to cancel a job for. By default, it will use team from config.",
+    )
+
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Copy directories recursively",
     )
     parse_pull.add_argument(
-        "--
+        "--cp",
         default=False,
         action="store_true",
-        help="
+        help="Copy actual files after pulling remote dataset into local DB",
     )
     parse_pull.add_argument(
         "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
                args.output,
                local_ds_name=args.local_name,
                local_ds_version=args.local_version,
-
+                cp=args.cp,
                force=bool(args.force),
                edatachain=args.edatachain,
                edatachain_file=args.edatachain_file,
datachain/client/gcs.py
CHANGED
@@ -32,6 +32,15 @@ class GCSClient(Client):
 
         return cast(GCSFileSystem, super().create_fs(**kwargs))
 
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        try:
+            return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+        except AttributeError as exc:
+            is_anon = self.fs.storage_options.get("token") == "anon"
+            if is_anon and "you need a private key to sign credentials" in str(exc):
+                return f"https://storage.googleapis.com/{self.name}/{path}"
+            raise
+
     @staticmethod
     def parse_timestamp(timestamp: str) -> datetime:
         """
datachain/data_storage/sqlite.py
CHANGED
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
 
     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None
+        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
+        if conn:
+            return conn.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)
 
     @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         return self.db.execute(sql, parameters)
 
     def insert_dataframe(self, table_name: str, df) -> int:
-        return df.to_sql(
+        return df.to_sql(
+            table_name,
+            self.db,
+            if_exists="append",
+            index=False,
+            method="multi",
+            chunksize=1000,
+        )
 
     def cursor(self, factory=None):
         if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
         rows = list(rows)
         if not rows:
             return
-
-
-
-
+
+        with self.db.transaction() as conn:
+            # transactions speeds up inserts significantly as there is no separate
+            # transaction created for each insert row
+            self.db.executemany(
+                table.insert().values({f: bindparam(f) for f in rows[0]}),
+                rows,
+                conn=conn,
+            )
 
     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
         dr = self.dataset_rows(dataset, version)
datachain/lib/arrow.py
CHANGED
@@ -1,9 +1,11 @@
 from collections.abc import Sequence
-from tempfile import NamedTemporaryFile
+from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional
 
+import fsspec.implementations.reference
 import orjson
 import pyarrow as pa
+from fsspec.core import split_protocol
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm import tqdm
 
@@ -25,7 +27,18 @@ if TYPE_CHECKING:
 DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
 
 
+class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
+    def _open(self, path, mode="rb", *args, **kwargs):
+        # overriding because `fsspec`'s `ReferenceFileSystem._open`
+        # reads the whole file in-memory.
+        (uri,) = self.references[path]
+        protocol, _ = split_protocol(uri)
+        return self.fss[protocol]._open(uri, mode, *args, **kwargs)
+
+
 class ArrowGenerator(Generator):
+    DEFAULT_BATCH_SIZE = 2**17  # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`
+
     def __init__(
         self,
         input_schema: Optional["pa.Schema"] = None,
@@ -55,57 +68,80 @@ class ArrowGenerator(Generator):
     def process(self, file: File):
         if file._caching_enabled:
             file.ensure_cached()
-
-
-
-            path = _nrows_file(file, self.nrows)
-            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+            cache_path = file.get_local_path()
+            fs_path = file.path
+            fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-
-
-
-
+            fs, fs_path = file.get_fs(), file.get_path()
+
+        ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **self.kwargs)
+
         hf_schema = _get_hf_schema(ds.schema)
         use_datachain_schema = (
             bool(ds.schema.metadata)
             and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        kw = {}
+        if self.nrows:
+            kw = {"batch_size": min(self.DEFAULT_BATCH_SIZE, self.nrows)}
+
+        def iter_records():
+            for record_batch in ds.to_batches(**kw):
+                yield from record_batch.to_pylist()
+
+        it = islice(iter_records(), self.nrows)
+        with tqdm(it, desc="Parsed by pyarrow", unit="rows", total=self.nrows) as pbar:
+            for index, record in enumerate(pbar):
+                yield self._process_record(
+                    record, file, index, hf_schema, use_datachain_schema
+                )
+
+    def _process_record(
+        self,
+        record: dict[str, Any],
+        file: File,
+        index: int,
+        hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
+        use_datachain_schema: bool,
+    ):
+        if use_datachain_schema and self.output_schema:
+            vals = [_nested_model_instantiate(record, self.output_schema)]
+        else:
+            vals = self._process_non_datachain_record(record, hf_schema)
+
+        if self.source:
+            kwargs: dict = self.kwargs
+            # Can't serialize CsvFileFormat; may lose formatting options.
+            if isinstance(kwargs.get("format"), CsvFileFormat):
+                kwargs["format"] = "csv"
+            arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
+            return [arrow_file, *vals]
+        return vals
+
+    def _process_non_datachain_record(
+        self,
+        record: dict[str, Any],
+        hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
+    ):
+        vals = list(record.values())
+        if not self.output_schema:
+            return vals
+
+        fields = self.output_schema.model_fields
+        vals_dict = {}
+        for i, ((field, field_info), val) in enumerate(zip(fields.items(), vals)):
+            anno = field_info.annotation
+            if hf_schema:
+                from datachain.lib.hf import convert_feature
+
+                feat = list(hf_schema[0].values())[i]
+                vals_dict[field] = convert_feature(val, feat, anno)
+            elif ModelStore.is_pydantic(anno):
+                vals_dict[field] = anno(**val)  # type: ignore[misc]
+            else:
+                vals_dict[field] = val
+        return [self.output_schema(**vals_dict)]
 
 
 def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
@@ -190,18 +226,6 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
    raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
 
 
-def _nrows_file(file: File, nrows: int) -> str:
-    tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
-    with file.open(mode="r") as reader:
-        with open(tf.name, "a") as writer:
-            for row, line in enumerate(reader):
-                if row >= nrows:
-                    break
-                writer.write(line)
-                writer.write("\n")
-    return tf.name
-
-
 def _get_hf_schema(
     schema: "pa.Schema",
 ) -> Optional[tuple["Features", dict[str, "DataType"]]]:
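
The reworked `ArrowGenerator.process` drops the temporary-file trick (`_nrows_file`) and instead streams record batches from the pyarrow dataset, capping `batch_size` when `nrows` is set and slicing the row iterator with `itertools.islice`. A small sketch of that pattern with plain pyarrow; the in-memory table and column names are made up:

from itertools import islice

import pyarrow as pa
import pyarrow.dataset as pa_ds

# Sketch of the nrows handling in the new ArrowGenerator.process: read record
# batches lazily and stop after `nrows` rows via islice, instead of writing a
# truncated copy of the input file to disk.
table = pa.table({"id": list(range(1000)), "value": [i * 2 for i in range(1000)]})
ds = pa_ds.dataset(table)

nrows = 10
batch_size = min(2**17, nrows)  # same capping idea as DEFAULT_BATCH_SIZE above

def iter_records():
    for record_batch in ds.to_batches(batch_size=batch_size):
        yield from record_batch.to_pylist()

for record in islice(iter_records(), nrows):
    print(record)  # {'id': 0, 'value': 0}, ... first 10 rows only
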