datachain 0.7.10__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +53 -41
- datachain/cli.py +25 -3
- datachain/client/__init__.py +1 -2
- datachain/data_storage/sqlite.py +20 -6
- datachain/lib/dc.py +160 -110
- datachain/lib/diff.py +197 -0
- datachain/lib/file.py +2 -1
- datachain/lib/meta_formats.py +40 -43
- datachain/lib/pytorch.py +1 -5
- datachain/lib/signal_schema.py +28 -6
- datachain/query/dataset.py +5 -1
- datachain/remote/studio.py +53 -1
- datachain/studio.py +47 -2
- datachain/toolkit/split.py +19 -6
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/METADATA +10 -10
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/RECORD +20 -19
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/LICENSE +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/WHEEL +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1,7 +1,6 @@
 import io
 import json
 import logging
-import math
 import os
 import os.path
 import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from random import shuffle
 from threading import Thread
 from typing import (
     IO,
@@ -58,11 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import (
-    DataChainDir,
-    batched,
-    datachain_paths_join,
-)
+from datachain.utils import DataChainDir, datachain_paths_join
 
 from .datasource import DataSource
 
@@ -90,7 +84,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
 
 # dataset pull
-PULL_DATASET_MAX_THREADS =
+PULL_DATASET_MAX_THREADS = 5
 PULL_DATASET_CHUNK_TIMEOUT = 3600
 PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be available
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
@@ -130,6 +124,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
+        progress_bar=None,
     ):
         super().__init__(max_threads)
         self._check_dependencies()
@@ -142,6 +137,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.schema = schema
         self.last_status_check: Optional[float] = None
         self.studio_client = StudioClient()
+        self.progress_bar = progress_bar
 
     def done_task(self, done):
         for task in done:
@@ -198,6 +194,20 @@ class DatasetRowsFetcher(NodesThreadPool):
         for c in [c for c, t in self.schema.items() if t == DateTime]:
             df[c] = pd.to_datetime(df[c], unit="s")
 
+        # id will be autogenerated in DB
+        return df.drop("sys__id", axis=1)
+
+    def get_parquet_content(self, url: str):
+        while True:
+            if self.should_check_for_status():
+                self.check_for_status()
+            r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+            if r.status_code == 404:
+                time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                continue
+            r.raise_for_status()
+            return r.content
+
     def do_task(self, urls):
         import lz4.frame
         import pandas as pd
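The `do_task` refactor below moves the download-and-retry logic into the new `get_parquet_content()` method shown above. A minimal standalone sketch of that polling pattern (the constants and URL are placeholders, not the library's objects):

```py
# Sketch of the retry-until-available download performed by get_parquet_content():
# a 404 means the exported chunk is not ready yet, so sleep briefly and retry.
import time

import requests

CHUNK_TIMEOUT = 3600   # placeholder for PULL_DATASET_CHUNK_TIMEOUT
SLEEP_INTERVAL = 0.1   # placeholder for PULL_DATASET_SLEEP_INTERVAL


def fetch_when_ready(url: str) -> bytes:
    while True:
        r = requests.get(url, timeout=CHUNK_TIMEOUT)
        if r.status_code == 404:
            time.sleep(SLEEP_INTERVAL)  # chunk not exported yet; poll again
            continue
        r.raise_for_status()            # fail on any other HTTP error
        return r.content
```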
@@ -207,31 +217,22 @@ class DatasetRowsFetcher(NodesThreadPool):
             local_ds = metastore.get_dataset(self.local_ds_name)
 
             urls = list(urls)
-            while urls:
-                for url in urls:
-                    if self.should_check_for_status():
-                        self.check_for_status()
-
-                    r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                    if r.status_code == 404:
-                        time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                        # moving to the next url
-                        continue
-
-                    r.raise_for_status()
-
-                    df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
 
-
+            for url in urls:
+                if self.should_check_for_status():
+                    self.check_for_status()
 
-
-
+                df = pd.read_parquet(
+                    io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
+                )
+                df = self.fix_columns(df)
 
-
-
-
-
-
+                inserted = warehouse.insert_dataset_rows(
+                    df, local_ds, self.local_ds_version
+                )
+                self.increase_counter(inserted)  # type: ignore [arg-type]
+                # sometimes progress bar doesn't get updated so manually updating it
+                self.update_progress_bar(self.progress_bar)
 
 
 @dataclass
@@ -1291,13 +1292,13 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)
 
-    def pull_dataset(  # noqa: PLR0915
+    def pull_dataset(  # noqa: C901, PLR0915
         self,
         remote_ds_uri: str,
         output: Optional[str] = None,
         local_ds_name: Optional[str] = None,
         local_ds_version: Optional[int] = None,
-
+        cp: bool = False,
         force: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
@@ -1305,7 +1306,7 @@ class Catalog:
         client_config=None,
     ) -> None:
         def _instantiate(ds_uri: str) -> None:
-            if
+            if not cp:
                 return
             assert output
             self.cp(
@@ -1318,7 +1319,7 @@ class Catalog:
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
 
-        if
+        if cp and not output:
            raise ValueError("Please provide output directory for instantiation")
 
        studio_client = StudioClient()
@@ -1417,12 +1418,26 @@ class Catalog:
        signed_urls = export_response.data
 
        if signed_urls:
-            shuffle(signed_urls)
-
            with (
                self.metastore.clone() as metastore,
                self.warehouse.clone() as warehouse,
            ):
+
+                def batch(urls):
+                    """
+                    Batching urls in a way that fetching is most efficient as
+                    urls with lower id will be created first. Because that, we
+                    are making sure all threads are pulling most recent urls
+                    from beginning
+                    """
+                    res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
+                    current_worker = 0
+                    for url in signed_urls:
+                        res[current_worker].append(url)
+                        current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
+
+                    return res
+
                rows_fetcher = DatasetRowsFetcher(
                    metastore,
                    warehouse,
@@ -1431,14 +1446,11 @@ class Catalog:
                    local_ds_name,
                    local_ds_version,
                    schema,
+                    progress_bar=dataset_save_progress_bar,
                )
                try:
                    rows_fetcher.run(
-
-                        signed_urls,
-                        math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                    ),
-                    dataset_save_progress_bar,
+                        iter(batch(signed_urls)), dataset_save_progress_bar
                    )
                except:
                    self.remove_dataset(local_ds_name, local_ds_version)
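For reference, the `batch()` helper introduced in `pull_dataset()` replaces the old shuffle-plus-`batched` split with a round-robin distribution of signed URLs across the worker threads. A standalone sketch of that distribution (the thread count and URL names are placeholders):

```py
# Round-robin distribution of signed URLs across PULL_DATASET_MAX_THREADS workers,
# mirroring the batch() helper added in pull_dataset(). Values are placeholders.
PULL_DATASET_MAX_THREADS = 5


def batch(signed_urls: list[str]) -> list[list[str]]:
    buckets: list[list[str]] = [[] for _ in range(PULL_DATASET_MAX_THREADS)]
    current_worker = 0
    for url in signed_urls:
        buckets[current_worker].append(url)
        current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
    return buckets


if __name__ == "__main__":
    urls = [f"chunk-{i:03d}.parquet.lz4" for i in range(12)]
    for worker, bucket in enumerate(batch(urls)):
        print(worker, bucket)  # the earliest chunks end up spread across all workers
```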
datachain/cli.py
CHANGED
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         help="Python package requirement. Can be specified multiple times.",
     )
 
+    studio_cancel_help = "Cancel a job in Studio"
+    studio_cancel_description = "This command cancels a job in Studio."
+
+    studio_cancel_parser = studio_subparser.add_parser(
+        "cancel",
+        parents=[parent_parser],
+        description=studio_cancel_description,
+        help=studio_cancel_help,
+    )
+
+    studio_cancel_parser.add_argument(
+        "job_id",
+        action="store",
+        help="The job ID to cancel.",
+    )
+    studio_cancel_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to cancel a job for. By default, it will use team from config.",
+    )
+
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Copy directories recursively",
     )
     parse_pull.add_argument(
-        "--
+        "--cp",
         default=False,
         action="store_true",
-        help="
+        help="Copy actual files after pulling remote dataset into local DB",
     )
     parse_pull.add_argument(
         "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             args.output,
             local_ds_name=args.local_name,
             local_ds_version=args.local_version,
-
+            cp=args.cp,
             force=bool(args.force),
             edatachain=args.edatachain,
             edatachain_file=args.edatachain_file,
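A self-contained sketch of the argument shape accepted by the new `studio cancel` subparser added above (a hypothetical parser wired the same way, not the real CLI; the job ID and team values are placeholders):

```py
# Hypothetical, minimal reproduction of the `studio cancel` argument parsing
# shown in the diff above; not the real CLI wiring.
from argparse import ArgumentParser

parser = ArgumentParser(prog="datachain studio")
subparsers = parser.add_subparsers(dest="command")

cancel = subparsers.add_parser("cancel", help="Cancel a job in Studio")
cancel.add_argument("job_id", action="store", help="The job ID to cancel.")
cancel.add_argument(
    "--team",
    action="store",
    default=None,
    help="The team to cancel a job for. By default, it will use team from config.",
)

args = parser.parse_args(["cancel", "JOB-123", "--team", "my-team"])
print(args.command, args.job_id, args.team)  # cancel JOB-123 my-team
```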
datachain/client/__init__.py
CHANGED
datachain/data_storage/sqlite.py
CHANGED
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
 
     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None
+        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
+        if conn:
+            return conn.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)
 
     @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         return self.db.execute(sql, parameters)
 
     def insert_dataframe(self, table_name: str, df) -> int:
-        return df.to_sql(
+        return df.to_sql(
+            table_name,
+            self.db,
+            if_exists="append",
+            index=False,
+            method="multi",
+            chunksize=1000,
+        )
 
     def cursor(self, factory=None):
         if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
         rows = list(rows)
         if not rows:
             return
-
-
-
-
+
+        with self.db.transaction() as conn:
+            # transactions speeds up inserts significantly as there is no separate
+            # transaction created for each insert row
+            self.db.executemany(
+                table.insert().values({f: bindparam(f) for f in rows[0]}),
+                rows,
+                conn=conn,
+            )
 
     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
         dr = self.dataset_rows(dataset, version)
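The warehouse change above wraps `executemany()` in a single transaction instead of letting each row insert commit on its own. A plain-sqlite3 sketch of the same idea (the table and data are placeholders, not the warehouse schema):

```py
# Plain-sqlite3 illustration of batching inserts in one transaction: one commit
# for the whole batch rather than one per row. Table and data are placeholders.
import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE rows (id INTEGER PRIMARY KEY, name TEXT)")

rows = [(i, f"name-{i}") for i in range(10_000)]

with db:  # the connection context manager opens and commits a single transaction
    db.executemany("INSERT INTO rows (id, name) VALUES (?, ?)", rows)

print(db.execute("SELECT count(*) FROM rows").fetchone()[0])  # 10000
```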
datachain/lib/dc.py
CHANGED
@@ -19,7 +19,6 @@ from typing import (
 )
 
 import orjson
-import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
@@ -42,7 +41,7 @@ from datachain.lib.listing import (
     parse_listing_uri,
 )
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -57,6 +56,7 @@ from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
 
 if TYPE_CHECKING:
+    import pandas as pd
     from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self
 
@@ -554,8 +554,7 @@ class DataChain:
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-
-        meta_type: Optional[str] = "json",
+        format: Optional[str] = "json",
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -564,12 +563,12 @@ class DataChain:
         Parameters:
             path : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "
+            type : read file as "binary", "text", or "image" data. Default is "text".
             spec : optional Data Model
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-
+            format: "json", "jsonl"
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays
 
@@ -594,75 +593,14 @@ class DataChain:
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name =
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                meta_type=meta_type,
-                spec=spec,
-                model_name=model_name,
-                print_schema=print_schema,
-                jmespath=jmespath,
-                nrows=nrows,
-            )
-        }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-    @classmethod
-    def from_jsonl(
-        cls,
-        path,
-        type: Literal["binary", "text", "image"] = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "jsonl",
-        nrows=None,
-        **kwargs,
-    ) -> "DataChain":
-        """Get data from JSON lines. It returns the chain itself.
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            print_schema : print auto-generated schema
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSONl schema from data, limit parsing to 1 row
-            ```py
-            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = path
-
-        def jmespath_to_name(s: str):
-            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-            return s[:name_end]
-
-        if (not object_name) and jmespath:
-            object_name = jmespath_to_name(jmespath)
-        if not object_name:
-            object_name = meta_type
+            object_name = format
         chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
-
+                format=format,
                 spec=spec,
                 model_name=model_name,
-                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
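The hunks above fold the separate `from_jsonl()` classmethod into a single JSON reader whose `meta_type` parameter is renamed to `format`. A hedged usage sketch (the method is the one whose body is shown above, presumably `DataChain.from_json`; the URI and row limit are placeholders):

```py
# Hedged sketch of reading JSON-lines metadata with the renamed `format=` parameter
# (replacing the old `meta_type=` / `from_jsonl()`). URI and nrows are placeholders.
from datachain.lib.dc import DataChain

chain = DataChain.from_json(
    "gs://datachain-demo/annotations/",  # placeholder storage URI
    format="jsonl",                      # "json" or "jsonl", per the docstring above
    nrows=10,                            # optional row limit for jsonl / JSON arrays
)
```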
@@ -793,47 +731,6 @@ class DataChain:
             **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )
 
-    def print_json_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-
-        Example:
-            print JSON schema and save to column "meta_from":
-            ```py
-            uri = "gs://datachain-demo/coco2017/annotations_captions/"
-            chain = DataChain.from_storage(uri)
-            chain = chain.print_json_schema()
-            chain.save()
-            ```
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="json", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
-    def print_jsonl_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="jsonl", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
@@ -1624,6 +1521,155 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
+    def compare(
+        self,
+        other: "DataChain",
+        on: Union[str, Sequence[str]],
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
+
+        Example:
+            ```py
+            diff = persons.diff(
+                new_persons,
+                on=["id"],
+                right_on=["other_id"],
+                compare=["name"],
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        from datachain.lib.diff import compare as chain_compare
+
+        return chain_compare(
+            self,
+            other,
+            on,
+            right_on=right_on,
+            compare=compare,
+            right_compare=right_compare,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        modified: bool = True,
+        deleted: bool = False,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,
@@ -1701,6 +1747,8 @@ class DataChain:
         Parameters:
             flatten : Whether to use a multiindex or flatten column names.
         """
+        import pandas as pd
+
         headers, max_length = self._effective_signals_schema.get_headers_with_length()
         if flatten or max_length < 2:
             columns = [".".join(filter(None, header)) for header in headers]
@@ -1724,6 +1772,8 @@ class DataChain:
            transpose : Whether to transpose rows and columns.
            truncate : Whether or not to truncate the contents of columns.
        """
+        import pandas as pd
+
        dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
        df = dc.to_pandas(flatten)
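A hedged usage sketch of the new `compare()` / `diff()` API added above (dataset and column names are placeholders; `from_dataset` is assumed here to be the usual way to load a previously saved chain):

```py
# Hedged example of the new comparison API; dataset and column names are placeholders.
from datachain.lib.dc import DataChain

persons = DataChain.from_dataset("persons")          # assumed existing datasets
new_persons = DataChain.from_dataset("new_persons")

changes = persons.compare(
    new_persons,
    on=["id"],            # match rows by id
    compare=["name"],     # a row counts as modified if name differs
    added=True,
    deleted=True,
    modified=True,
    same=False,
    status_col="diff",    # extra column with values A / D / M / U
)

# File-based chains can use the thinner wrapper that matches on file.source/path
# and compares file.version/etag:
# changes = images.diff(new_images, on="file", status_col="diff")
```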