datachain 0.7.11__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +56 -45
- datachain/cli.py +25 -3
- datachain/client/gcs.py +9 -0
- datachain/data_storage/sqlite.py +20 -6
- datachain/data_storage/warehouse.py +0 -1
- datachain/lib/arrow.py +82 -58
- datachain/lib/dc.py +167 -166
- datachain/lib/diff.py +197 -0
- datachain/lib/file.py +3 -1
- datachain/lib/listing.py +44 -0
- datachain/lib/meta_formats.py +38 -42
- datachain/lib/udf.py +0 -1
- datachain/query/batch.py +32 -6
- datachain/query/dataset.py +18 -17
- datachain/query/dispatch.py +125 -125
- datachain/query/session.py +8 -5
- datachain/query/udf.py +20 -0
- datachain/query/utils.py +42 -0
- datachain/remote/studio.py +53 -1
- datachain/studio.py +47 -2
- datachain/utils.py +1 -1
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/METADATA +4 -3
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/RECORD +27 -24
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/LICENSE +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/WHEEL +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
@@ -11,7 +11,6 @@ from typing import (
     BinaryIO,
     Callable,
     ClassVar,
-    Literal,
     Optional,
     TypeVar,
     Union,
@@ -24,8 +23,6 @@ from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
 
-from datachain.client import Client
-from datachain.client.local import FileClient
 from datachain.dataset import DatasetRecord
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -33,15 +30,11 @@ from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import ArrowRow, File, get_file_type
+from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import (
-    list_bucket,
-    ls,
-    parse_listing_uri,
-)
+from datachain.lib.listing import get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta, read_schema
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -403,53 +396,12 @@ class DataChain:
         self.signals_schema |= signals_schema
         return self
 
-    @classmethod
-    def parse_uri(
-        cls, uri: str, session: Session, update: bool = False
-    ) -> tuple[str, str, str, bool]:
-        """Returns correct listing dataset name that must be used for saving listing
-        operation. It takes into account existing listings and reusability of those.
-        It also returns boolean saying if returned dataset name is reused / already
-        exists or not, and it returns correct listing path that should be used to find
-        rows based on uri.
-        """
-        catalog = session.catalog
-        cache = catalog.cache
-        client_config = catalog.client_config
-
-        client = Client.get_client(uri, cache, **client_config)
-        ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
-        listing = None
-
-        listings = [
-            ls
-            for ls in catalog.listings()
-            if not ls.is_expired and ls.contains(ds_name)
-        ]
-
-        if listings:
-            if update:
-                # choosing the smallest possible one to minimize update time
-                listing = sorted(listings, key=lambda ls: len(ls.name))[0]
-            else:
-                # no need to update, choosing the most recent one
-                listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
-
-        if isinstance(client, FileClient) and listing and listing.name != ds_name:
-            # For local file system we need to fix listing path / prefix
-            # if we are reusing existing listing
-            list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
-
-        ds_name = listing.name if listing else ds_name
-
-        return ds_name, list_uri, list_path, bool(listing)
-
     @classmethod
     def from_storage(
         cls,
         uri,
         *,
-        type: Literal["binary", "text", "image"] = "binary",
+        type: FileType = "binary",
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
         in_memory: bool = False,
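`parse_uri` leaves `DataChain`: its listing-reuse logic now lives behind `get_listing` in `datachain/lib/listing.py` (the new call site is in the next hunk), and `type` is narrowed to the `FileType` alias introduced in `datachain/lib/file.py`. A minimal sketch of the updated entry point, with a placeholder bucket URI:

```py
from datachain.lib.dc import DataChain

# "binary" stays the default; "text" and "image" are the other FileType values
chain = DataChain.from_storage("gs://mybucket/docs/", type="text")
```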
@@ -482,7 +434,7 @@ class DataChain:
         cache = session.catalog.cache
         client_config = session.catalog.client_config
 
-        list_ds_name, list_uri, list_path, list_ds_exists = cls.parse_uri(
+        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
             uri, session, update=update
         )
 
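Judging from this call site, `get_listing` keeps the removed method's contract: it returns the listing dataset name, the listing URI, the path used to locate rows, and whether an existing listing dataset is being reused. A sketch of the call, with `uri` and `session` assumed to be in scope:

```py
from datachain.lib.listing import get_listing

# signature inferred from the hunk above; update=True would force a re-listing
list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
    uri, session, update=False
)
```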
@@ -548,14 +500,13 @@ class DataChain:
     def from_json(
         cls,
         path,
-        type: Literal["binary", "text", "image"] = "text",
+        type: FileType = "text",
         spec: Optional[DataType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "json",
+        format: Optional[str] = "json",
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -564,12 +515,12 @@ class DataChain:
         Parameters:
             path : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
+            type : read file as "binary", "text", or "image" data. Default is "text".
             spec : optional Data Model
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-            print_schema : print auto-generated schema
+            format: "json", "jsonl"
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays
 
@@ -594,80 +545,21 @@ class DataChain:
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name = meta_type
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                meta_type=meta_type,
-                spec=spec,
-                model_name=model_name,
-                print_schema=print_schema,
-                jmespath=jmespath,
-                nrows=nrows,
-            )
-        }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-    @classmethod
-    def from_jsonl(
-        cls,
-        path,
-        type: Literal["binary", "text", "image"] = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "jsonl",
-        nrows=None,
-        **kwargs,
-    ) -> "DataChain":
-        """Get data from JSON lines. It returns the chain itself.
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            print_schema : print auto-generated schema
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSONl schema from data, limit parsing to 1 row
-            ```py
-            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = path
-
-        def jmespath_to_name(s: str):
-            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-            return s[:name_end]
-
-        if (not object_name) and jmespath:
-            object_name = jmespath_to_name(jmespath)
-        if not object_name:
-            object_name = meta_type
+            object_name = format
         chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
+                format=format,
                 spec=spec,
                 model_name=model_name,
-                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
         }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]
 
     def explode(
         self,
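The net effect of the three hunks above: `from_jsonl` is folded into `from_json`, `meta_type` is renamed to `format`, `print_schema` is dropped, and prefetch is disabled whenever `nrows` caps parsing. A sketch adapted from the removed `from_jsonl` docstring example (placeholder URI):

```py
from datachain.lib.dc import DataChain

# infer the JSON-lines schema from data, limit parsing to 1 row
chain = DataChain.from_json("gs://myjsonl", format="jsonl", nrows=1)
```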
@@ -793,47 +685,6 @@ class DataChain:
             **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )
 
-    def print_json_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-
-        Example:
-            print JSON schema and save to column "meta_from":
-            ```py
-            uri = "gs://datachain-demo/coco2017/annotations_captions/"
-            chain = DataChain.from_storage(uri)
-            chain = chain.print_json_schema()
-            chain.save()
-            ```
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="json", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
-    def print_jsonl_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="jsonl", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
@@ -1624,6 +1475,155 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
+    def compare(
+        self,
+        other: "DataChain",
+        on: Union[str, Sequence[str]],
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
+
+        Example:
+            ```py
+            diff = persons.diff(
+                new_persons,
+                on=["id"],
+                right_on=["other_id"],
+                compare=["name"],
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        from datachain.lib.diff import compare as chain_compare
+
+        return chain_compare(
+            self,
+            other,
+            on,
+            right_on=right_on,
+            compare=compare,
+            right_compare=right_compare,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        modified: bool = True,
+        deleted: bool = False,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,
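A usage sketch complementing the docstring examples: call `.compare()` directly and filter on the status column. The status letters follow the implementation in `datachain/lib/diff.py` below ("A" added, "D" deleted, "M" modified, "S" same); the chain names are hypothetical, and the top-level `C` column helper is assumed:

```py
from datachain import C

delta = persons.compare(new_persons, on="id", status_col="diff")
added_rows = delta.filter(C("diff") == "A")
```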
@@ -1896,7 +1896,10 @@ class DataChain:
 
         if source:
             output = {"source": ArrowRow} | output  # type: ignore[assignment,operator]
-        return self.gen(  # type: ignore[arg-type]
+
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return self.settings(**settings).gen(  # type: ignore[arg-type]
             ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
         )
 
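The same prefetch guard as in `from_json`: prefetching whole files is wasted work when `nrows` stops parsing early. Written out explicitly, the guard amounts to this sketch:

```py
# what the hunk does internally when a row limit is requested
chain = chain.settings(prefetch=0)
```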
@@ -1978,8 +1981,6 @@ class DataChain:
             else:
                 msg = f"error parsing csv - incompatible output type {type(output)}"
                 raise DatasetPrepareError(chain.name, msg)
-        elif nrows:
-            nrows += 1
 
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
datachain/lib/diff.py
ADDED
@@ -0,0 +1,197 @@
+import random
+import string
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union
+
+import sqlalchemy as sa
+
+from datachain.lib.signal_schema import SignalSchema
+from datachain.query.schema import Column
+from datachain.sql.types import String
+
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+
+C = Column
+
+
+def compare(  # noqa: PLR0912, PLR0915, C901
+    left: "DataChain",
+    right: "DataChain",
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    added: bool = True,
+    deleted: bool = True,
+    modified: bool = True,
+    same: bool = True,
+    status_col: Optional[str] = None,
+) -> "DataChain":
+    """Comparing two chains by identifying rows that are added, deleted, modified
+    or same"""
+    dialect = left._query.dialect
+
+    rname = "right_"
+
+    def _rprefix(c: str, rc: str) -> str:
+        """Returns prefix of right of two companion left - right columns
+        from merge. If companion columns have the same name then prefix will
+        be present in right column name, otherwise it won't.
+        """
+        return rname if c == rc else ""
+
+    def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
+        return [obj] if isinstance(obj, str) else list(obj)
+
+    if on is None:
+        raise ValueError("'on' must be specified")
+
+    on = _to_list(on)
+    if right_on:
+        right_on = _to_list(right_on)
+        if len(on) != len(right_on):
+            raise ValueError("'on' and 'right_on' must be have the same length")
+
+    if compare:
+        compare = _to_list(compare)
+
+    if right_compare:
+        if not compare:
+            raise ValueError("'compare' must be defined if 'right_compare' is defined")
+
+        right_compare = _to_list(right_compare)
+        if len(compare) != len(right_compare):
+            raise ValueError(
+                "'compare' and 'right_compare' must be have the same length"
+            )
+
+    if not any([added, deleted, modified, same]):
+        raise ValueError(
+            "At least one of added, deleted, modified, same flags must be set"
+        )
+
+    # we still need status column for internal implementation even if not
+    # needed in output
+    need_status_col = bool(status_col)
+    status_col = status_col or "diff_" + "".join(
+        random.choice(string.ascii_letters)  # noqa: S311
+        for _ in range(10)
+    )
+
+    # calculate on and compare column names
+    right_on = right_on or on
+    cols = left.signals_schema.clone_without_sys_signals().db_signals()
+    right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+
+    on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*right_on).db_signals()  # type: ignore[assignment]
+    if compare:
+        right_compare = right_compare or compare
+        compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
+        right_compare = right.signals_schema.resolve(*right_compare).db_signals()  # type: ignore[assignment]
+    elif not compare and len(cols) != len(right_cols):
+        # here we will mark all rows that are not added or deleted as modified since
+        # there was no explicit list of compare columns provided (meaning we need
+        # to check all columns to determine if row is modified or same), but
+        # the number of columns on left and right is not the same (one of the chains
+        # have additional column)
+        compare = None
+        right_compare = None
+    else:
+        compare = [c for c in cols if c in right_cols]  # type: ignore[misc, assignment]
+        right_compare = compare
+
+    diff_cond = []
+
+    if added:
+        added_cond = sa.and_(
+            *[
+                C(c) == None  # noqa: E711
+                for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
+            ]
+        )
+        diff_cond.append((added_cond, "A"))
+    if modified and compare:
+        modified_cond = sa.or_(
+            *[
+                C(c) != C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((modified_cond, "M"))
+    if same and compare:
+        same_cond = sa.and_(
+            *[
+                C(c) == C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((same_cond, "S"))
+
+    diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
+    diff.type = String()
+
+    left_right_merge = left.merge(
+        right, on=on, right_on=right_on, inner=False, rname=rname
+    )
+    left_right_merge_select = left_right_merge._query.select(
+        *(
+            [C(c) for c in left_right_merge.signals_schema.db_signals("sys")]
+            + [C(c) for c in on]
+            + [C(c) for c in cols if c not in on]
+            + [diff]
+        )
+    )
+
+    diff_col = sa.literal("D").label(status_col)
+    diff_col.type = String()
+
+    right_left_merge = right.merge(
+        left, on=right_on, right_on=on, inner=False, rname=rname
+    ).filter(
+        sa.and_(
+            *[C(f"{_rprefix(c, rc)}{c}") == None for c, rc in zip(on, right_on)]  # noqa: E711
+        )
+    )
+
+    def _default_val(chain: "DataChain", col: str):
+        col_type = chain._query.column_types[col]  # type: ignore[index]
+        val = sa.literal(col_type.default_value(dialect)).label(col)
+        val.type = col_type()
+        return val
+
+    right_left_merge_select = right_left_merge._query.select(
+        *(
+            [C(c) for c in right_left_merge.signals_schema.db_signals("sys")]
+            + [
+                C(c) if c == rc else _default_val(left, c)
+                for c, rc in zip(on, right_on)
+            ]
+            + [
+                C(c) if c in right_cols else _default_val(left, c)  # type: ignore[arg-type]
+                for c in cols
+                if c not in on
+            ]
+            + [diff_col]
+        )
+    )
+
+    if not deleted:
+        res = left_right_merge_select
+    elif deleted and not any([added, modified, same]):
+        res = right_left_merge_select
+    else:
+        res = left_right_merge_select.union(right_left_merge_select)
+
+    res = res.filter(C(status_col) != None)  # noqa: E711
+
+    schema = left.signals_schema
+    if need_status_col:
+        res = res.select()
+        schema = SignalSchema({status_col: str}) | schema
+    else:
+        res = res.select_except(C(status_col))
+
+    return left._evolve(query=res, signal_schema=schema)
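At its core, `compare` is a pair of outer merges plus one SQL `CASE` expression that assigns the status letter. A self-contained sketch of that pattern in plain SQLAlchemy, with hypothetical two-column tables, to make the `A`/`M`/`S` logic concrete (deleted rows, `D`, come from the mirrored right-to-left merge that gets unioned in):

```py
import sqlalchemy as sa

left = sa.table("left_chain", sa.column("id"), sa.column("name"))
right = sa.table("right_chain", sa.column("id"), sa.column("name"))

# left outer join: rows with no right-side match get NULLs on the right
joined = sa.outerjoin(left, right, left.c.id == right.c.id)
status = sa.case(
    (right.c.id == None, "A"),  # noqa: E711 - no match -> added
    (left.c.name != right.c.name, "M"),  # matched, values differ -> modified
    (left.c.name == right.c.name, "S"),  # matched, values equal -> same
).label("diff")
query = sa.select(left.c.id, left.c.name, status).select_from(joined)
print(query)  # renders the CASE WHEN ... THEN 'A' ... expression
```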
datachain/lib/file.py
CHANGED
@@ -39,6 +39,8 @@ logger = logging.getLogger("datachain")
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 
+FileType = Literal["binary", "text", "image"]
+
 
 class VFileError(DataChainError):
     def __init__(self, file: "File", message: str, vtype: str = ""):
@@ -470,7 +472,7 @@ class ArrowRow(DataModel):
         return record_batch.to_pylist()[0]
 
 
-def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
+def get_file_type(type_: FileType = "binary") -> type[File]:
     file: type[File] = File
     if type_ == "text":
        file = TextFile
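With the alias exported from `datachain/lib/file.py`, `get_file_type` maps each `FileType` literal to a concrete `File` subclass. A usage sketch (only the `"text"` branch is visible in this hunk; the default returns plain `File`):

```py
from datachain.lib.file import get_file_type

file_cls = get_file_type("text")  # TextFile, per the branch above
```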