datachain 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/__init__.py +2 -0
- datachain/catalog/catalog.py +62 -228
- datachain/cli.py +136 -22
- datachain/client/fsspec.py +9 -0
- datachain/client/local.py +11 -32
- datachain/config.py +126 -51
- datachain/data_storage/schema.py +66 -33
- datachain/data_storage/sqlite.py +12 -4
- datachain/data_storage/warehouse.py +101 -129
- datachain/lib/convert/sql_to_python.py +8 -12
- datachain/lib/dc.py +275 -80
- datachain/lib/func/__init__.py +32 -0
- datachain/lib/func/aggregate.py +353 -0
- datachain/lib/func/func.py +152 -0
- datachain/lib/listing.py +6 -21
- datachain/lib/listing_info.py +4 -0
- datachain/lib/signal_schema.py +17 -8
- datachain/lib/udf.py +3 -3
- datachain/lib/utils.py +5 -0
- datachain/listing.py +22 -48
- datachain/query/__init__.py +1 -2
- datachain/query/batch.py +0 -1
- datachain/query/dataset.py +33 -46
- datachain/query/schema.py +1 -61
- datachain/query/session.py +33 -25
- datachain/remote/studio.py +63 -14
- datachain/sql/functions/__init__.py +1 -1
- datachain/sql/functions/aggregate.py +47 -0
- datachain/sql/functions/array.py +0 -8
- datachain/sql/sqlite/base.py +20 -2
- datachain/studio.py +129 -0
- datachain/utils.py +58 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/RECORD +38 -33
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
@@ -1,6 +1,8 @@
 import copy
 import os
+import os.path
 import re
+import sys
 from collections.abc import Iterator, Sequence
 from functools import wraps
 from typing import (
@@ -23,16 +25,17 @@ from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
 
+from datachain.client import Client
+from datachain.client.local import FileClient
+from datachain.dataset import DatasetRecord
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
+from datachain.lib.func import Func
 from datachain.lib.listing import (
-    is_listing_dataset,
-    is_listing_expired,
-    is_listing_subset,
     list_bucket,
     ls,
     parse_listing_uri,
@@ -42,24 +45,15 @@ from datachain.lib.meta_formats import read_meta, read_schema
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
-from datachain.lib.udf import (
-    Aggregator,
-    BatchMapper,
-    Generator,
-    Mapper,
-    UDFBase,
-)
+from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
 from datachain.lib.udf_signature import UdfSignature
-from datachain.lib.utils import DataChainParamsError
+from datachain.lib.utils import DataChainColumnError, DataChainParamsError
 from datachain.query import Session
-from datachain.query.dataset import (
-    DatasetQuery,
-    PartitionByType,
-)
-from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
+from datachain.query.dataset import DatasetQuery, PartitionByType
+from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
 from datachain.sql.functions import path as pathfunc
 from datachain.telemetry import telemetry
-from datachain.utils import batched_it, inside_notebook
+from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
 
 if TYPE_CHECKING:
     from pyarrow import DataType as ArrowDataType
@@ -149,11 +143,6 @@ class DatasetMergeError(DataChainParamsError):  # noqa: D101
         super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
 
 
-class DataChainColumnError(DataChainParamsError):  # noqa: D101
-    def __init__(self, col_name, msg):  # noqa: D107
-        super().__init__(f"Error for column {col_name}: {msg}")
-
-
 OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
 
 
@@ -300,6 +289,13 @@ class DataChain:
         """Version of the underlying dataset, if there is one."""
         return self._query.version
 
+    @property
+    def dataset(self) -> Optional[DatasetRecord]:
+        """Underlying dataset, if there is one."""
+        if not self.name:
+            return None
+        return self.session.catalog.get_dataset(self.name)
+
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
         return self.union(other)
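The new `dataset` property above exposes the underlying `DatasetRecord` when the chain is backed by a saved dataset. A minimal sketch (the dataset name is hypothetical):

```py
from datachain.lib.dc import DataChain

chain = DataChain.from_dataset("my-dataset")  # hypothetical saved dataset name
record = chain.dataset  # DatasetRecord, or None for an unsaved chain
if record is not None:
    print(record.name, record.latest_version)
```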
@@ -380,6 +376,47 @@ class DataChain:
         self.signals_schema |= signals_schema
         return self
 
+    @classmethod
+    def parse_uri(
+        cls, uri: str, session: Session, update: bool = False
+    ) -> tuple[str, str, str, bool]:
+        """Returns correct listing dataset name that must be used for saving listing
+        operation. It takes into account existing listings and reusability of those.
+        It also returns boolean saying if returned dataset name is reused / already
+        exists or not, and it returns correct listing path that should be used to find
+        rows based on uri.
+        """
+        catalog = session.catalog
+        cache = catalog.cache
+        client_config = catalog.client_config
+
+        client = Client.get_client(uri, cache, **client_config)
+        ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
+        listing = None
+
+        listings = [
+            ls
+            for ls in catalog.listings()
+            if not ls.is_expired and ls.contains(ds_name)
+        ]
+
+        if listings:
+            if update:
+                # choosing the smallest possible one to minimize update time
+                listing = sorted(listings, key=lambda ls: len(ls.name))[0]
+            else:
+                # no need to update, choosing the most recent one
+                listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
+
+        if isinstance(client, FileClient) and listing and listing.name != ds_name:
+            # For local file system we need to fix listing path / prefix
+            # if we are reusing existing listing
+            list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
+
+        ds_name = listing.name if listing else ds_name
+
+        return ds_name, list_uri, list_path, bool(listing)
+
     @classmethod
     def from_storage(
         cls,
@@ -414,25 +451,15 @@ class DataChain:
         file_type = get_file_type(type)
 
         client_config = {"anon": True} if anon else None
-
         session = Session.get(session, client_config=client_config, in_memory=in_memory)
+        cache = session.catalog.cache
+        client_config = session.catalog.client_config
 
-
-            uri, session
+        list_ds_name, list_uri, list_path, list_ds_exists = cls.parse_uri(
+            uri, session, update=update
         )
-        need_listing = True
 
-
-        if (
-            not is_listing_expired(ds.created_at)  # type: ignore[union-attr]
-            and is_listing_subset(ds.name, list_dataset_name)  # type: ignore[union-attr]
-            and not update
-        ):
-            need_listing = False
-            list_dataset_name = ds.name  # type: ignore[union-attr]
-
-        if need_listing:
-            # caching new listing to special listing dataset
+        if update or not list_ds_exists:
             (
                 cls.from_records(
                     DataChain.DEFAULT_FILE_RECORD,
@@ -441,17 +468,13 @@ class DataChain:
                     in_memory=in_memory,
                 )
                 .gen(
-                    list_bucket(
-                        list_uri,
-                        session.catalog.cache,
-                        client_config=session.catalog.client_config,
-                    ),
+                    list_bucket(list_uri, cache, client_config=client_config),
                     output={f"{object_name}": File},
                 )
-                .save(
+                .save(list_ds_name, listing=True)
             )
 
-        dc = cls.from_dataset(
+        dc = cls.from_dataset(list_ds_name, session=session, settings=settings)
         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
 
         return ls(dc, list_path, recursive=recursive, object_name=object_name)
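With `parse_uri` and the reworked `from_storage` above, repeated reads of a URI can reuse a previously saved listing dataset instead of re-listing the storage. A hedged usage sketch (the bucket URI is a placeholder):

```py
from datachain.lib.dc import DataChain

# First call lists the bucket and saves the listing as a dataset.
dogs = DataChain.from_storage("s3://example-bucket/dogs/")

# A later call over the same prefix reuses that listing while it has not expired;
# update=True would force a fresh listing instead.
dogs_again = DataChain.from_storage("s3://example-bucket/dogs/", update=False)
```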
@@ -678,19 +701,11 @@ class DataChain:
         session = Session.get(session, in_memory=in_memory)
         catalog = kwargs.get("catalog") or session.catalog
 
-        listings = [
-            ListingInfo.from_models(d, v, j)
-            for d, v, j in catalog.list_datasets_versions(
-                include_listing=True, **kwargs
-            )
-            if is_listing_dataset(d.name)
-        ]
-
         return cls.from_values(
             session=session,
             in_memory=in_memory,
             output={object_name: ListingInfo},
-            **{object_name: listings},  # type: ignore[arg-type]
+            **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )
 
     def print_json_schema(  # type: ignore[override]
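The simplified `listings()` above now just wraps `catalog.listings()`; it still yields one `ListingInfo` row per cached listing dataset, for example:

```py
from datachain.lib.dc import DataChain

DataChain.listings().show()  # one row per cached listing dataset
```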
@@ -982,10 +997,9 @@ class DataChain:
         row is left in the result set.
 
         Example:
-
-
-
-            ```
+            ```py
+            dc.distinct("file.parent", "file.name")
+            ```
         """
         return self._evolve(
             query=self._query.distinct(
@@ -1011,6 +1025,63 @@ class DataChain:
             query=self._query.select(*columns), signal_schema=new_schema
         )
 
+    def group_by(
+        self,
+        *,
+        partition_by: Union[str, Sequence[str]],
+        **kwargs: Func,
+    ) -> "Self":
+        """Group rows by specified set of signals and return new signals
+        with aggregated values.
+
+        The supported functions:
+            count(), sum(), avg(), min(), max(), any_value(), collect(), concat()
+
+        Example:
+            ```py
+            chain = chain.group_by(
+                cnt=func.count(),
+                partition_by=("file_source", "file_ext"),
+            )
+            ```
+        """
+        if isinstance(partition_by, str):
+            partition_by = [partition_by]
+        if not partition_by:
+            raise ValueError("At least one column should be provided for partition_by")
+
+        if not kwargs:
+            raise ValueError("At least one column should be provided for group_by")
+        for col_name, func in kwargs.items():
+            if not isinstance(func, Func):
+                raise DataChainColumnError(
+                    col_name,
+                    f"Column {col_name} has type {type(func)} but expected Func object",
+                )
+
+        partition_by_columns: list[Column] = []
+        signal_columns: list[Column] = []
+        schema_fields: dict[str, DataType] = {}
+
+        # validate partition_by columns and add them to the schema
+        for col_name in partition_by:
+            col_db_name = ColumnMeta.to_db_name(col_name)
+            col_type = self.signals_schema.get_column_type(col_db_name)
+            col = Column(col_db_name, python_to_sql(col_type))
+            partition_by_columns.append(col)
+            schema_fields[col_db_name] = col_type
+
+        # validate signal columns and add them to the schema
+        for col_name, func in kwargs.items():
+            col = func.get_column(self.signals_schema, label=col_name)
+            signal_columns.append(col)
+            schema_fields[col_name] = func.get_result_type(self.signals_schema)
+
+        return self._evolve(
+            query=self._query.group_by(signal_columns, partition_by_columns),
+            signal_schema=SignalSchema(schema_fields),
+        )
+
     def mutate(self, **kwargs) -> "Self":
         """Create new signals based on existing signals.
 
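Building on the docstring above, a hedged sketch of `group_by` combined with the `datachain.lib.func` helpers added in this release (the storage URI and signal names such as `file.parent` / `file.size` are illustrative):

```py
from datachain.lib import func
from datachain.lib.dc import DataChain

summary = DataChain.from_storage("s3://example-bucket/data/").group_by(
    cnt=func.count(),
    total_size=func.sum("file.size"),
    partition_by="file.parent",
)
summary.show()
```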
@@ -1029,13 +1100,22 @@ class DataChain:
             Filename: name(), parent(), file_stem(), file_ext()
             Array: length(), sip_hash_64(), euclidean_distance(),
                 cosine_distance()
+            Window: row_number(), rank(), dense_rank(), first()
 
         Example:
             ```py
             dc.mutate(
-
-
-
+                area=Column("image.height") * Column("image.width"),
+                extension=file_ext(Column("file.name")),
+                dist=cosine_distance(embedding_text, embedding_image)
+            )
+            ```
+
+        Window function example:
+            ```py
+            window = func.window(partition_by="file.parent", order_by="file.size")
+            dc.mutate(
+                row_number=func.row_number().over(window),
             )
             ```
 
@@ -1046,20 +1126,12 @@ class DataChain:
         Example:
             ```py
             dc.mutate(
-
+                newkey=Column("oldkey")
             )
             ```
         """
-        existing_columns = set(self.signals_schema.values.keys())
-        for col_name in kwargs:
-            if col_name in existing_columns:
-                raise DataChainColumnError(
-                    col_name,
-                    "Cannot modify existing column with mutate(). "
-                    "Use a different name for the new column.",
-                )
         for col_name, expr in kwargs.items():
-            if not isinstance(expr, Column) and isinstance(expr.type, NullType):
+            if not isinstance(expr, (Column, Func)) and isinstance(expr.type, NullType):
                 raise DataChainColumnError(
                     col_name, f"Cannot infer type with expression {expr}"
                 )
@@ -1071,6 +1143,9 @@ class DataChain:
                 # renaming existing column
                 for signal in schema.db_signals(name=value.name, as_columns=True):
                     mutated[signal.name.replace(value.name, name, 1)] = signal  # type: ignore[union-attr]
+            elif isinstance(value, Func):
+                # adding new signal
+                mutated[name] = value.get_column(schema)
             else:
                 # adding new signal
                 mutated[name] = value
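The new `elif isinstance(value, Func)` branch is what lets `mutate()` accept `Func` objects directly, including window functions. A sketch mirroring the docstring example, assuming an existing chain `dc` (signal names are illustrative):

```py
from datachain.lib import func

# rank files within each directory by size
window = func.window(partition_by="file.parent", order_by="file.size")
ranked = dc.mutate(row_number=func.row_number().over(window))
```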
@@ -1477,12 +1552,6 @@ class DataChain:
         fr_map = {col.lower(): df[col].tolist() for col in df.columns}
 
         for column in fr_map:
-            if column in DatasetRow.schema:
-                raise DatasetPrepareError(
-                    name,
-                    f"import from pandas error - column '{column}' conflicts with"
-                    " default schema",
-                )
             if not column.isidentifier():
                 raise DatasetPrepareError(
                     name,
|
|
|
1853
1922
|
path: Union[str, os.PathLike[str], BinaryIO],
|
|
1854
1923
|
partition_cols: Optional[Sequence[str]] = None,
|
|
1855
1924
|
chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
|
|
1925
|
+
fs_kwargs: Optional[dict[str, Any]] = None,
|
|
1856
1926
|
**kwargs,
|
|
1857
1927
|
) -> None:
|
|
1858
1928
|
"""Save chain to parquet file with SignalSchema metadata.
|
|
1859
1929
|
|
|
1860
1930
|
Parameters:
|
|
1861
|
-
path : Path or a file-like binary object to save the file.
|
|
1931
|
+
path : Path or a file-like binary object to save the file. This supports
|
|
1932
|
+
local paths as well as remote paths, such as s3:// or hf:// with fsspec.
|
|
1862
1933
|
partition_cols : Column names by which to partition the dataset.
|
|
1863
1934
|
chunk_size : The chunk size of results to read and convert to columnar
|
|
1864
1935
|
data, to avoid running out of memory.
|
|
1936
|
+
fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
|
|
1937
|
+
write, for fsspec-type URLs, such as s3:// or hf:// when
|
|
1938
|
+
provided as the destination path.
|
|
1865
1939
|
"""
|
|
1866
1940
|
import pyarrow as pa
|
|
1867
1941
|
import pyarrow.parquet as pq
|
|
1868
1942
|
|
|
1869
1943
|
from datachain.lib.arrow import DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY
|
|
1870
1944
|
|
|
1945
|
+
fsspec_fs = None
|
|
1946
|
+
|
|
1947
|
+
if isinstance(path, str) and "://" in path:
|
|
1948
|
+
from datachain.client.fsspec import Client
|
|
1949
|
+
|
|
1950
|
+
fs_kwargs = {
|
|
1951
|
+
**self._query.catalog.client_config,
|
|
1952
|
+
**(fs_kwargs or {}),
|
|
1953
|
+
}
|
|
1954
|
+
|
|
1955
|
+
client = Client.get_implementation(path)
|
|
1956
|
+
|
|
1957
|
+
if path.startswith("file://"):
|
|
1958
|
+
# pyarrow does not handle file:// uris, and needs a direct path instead.
|
|
1959
|
+
from urllib.parse import urlparse
|
|
1960
|
+
|
|
1961
|
+
path = urlparse(path).path
|
|
1962
|
+
if sys.platform == "win32":
|
|
1963
|
+
path = os.path.normpath(path.lstrip("/"))
|
|
1964
|
+
|
|
1965
|
+
fsspec_fs = client.create_fs(**fs_kwargs)
|
|
1966
|
+
|
|
1871
1967
|
_partition_cols = list(partition_cols) if partition_cols else None
|
|
1872
1968
|
signal_schema_metadata = orjson.dumps(
|
|
1873
1969
|
self._effective_signals_schema.serialize()
|
|
@@ -1902,12 +1998,15 @@ class DataChain:
                 table,
                 root_path=path,
                 partition_cols=_partition_cols,
+                filesystem=fsspec_fs,
                 **kwargs,
             )
         else:
             if first_chunk:
                 # Write to a single parquet file.
-                parquet_writer = pq.ParquetWriter(
+                parquet_writer = pq.ParquetWriter(
+                    path, parquet_schema, filesystem=fsspec_fs, **kwargs
+                )
                 first_chunk = False
 
             assert parquet_writer
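Because `to_parquet` now resolves an fsspec filesystem for URL-style destinations, filesystem options can be passed through `fs_kwargs`. A sketch, assuming an existing chain `dc` (the bucket and options are placeholders):

```py
dc.to_parquet(
    "s3://example-bucket/exports/files.parquet",
    fs_kwargs={"anon": False},  # placeholder fsspec/s3fs options
)
```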
@@ -1920,28 +2019,122 @@ class DataChain:
         self,
         path: Union[str, os.PathLike[str]],
         delimiter: str = ",",
+        fs_kwargs: Optional[dict[str, Any]] = None,
         **kwargs,
     ) -> None:
         """Save chain to a csv (comma-separated values) file.
 
         Parameters:
-            path : Path to save the file.
+            path : Path to save the file. This supports local paths as well as
+                remote paths, such as s3:// or hf:// with fsspec.
             delimiter : Delimiter to use for the resulting file.
+            fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
+                write, for fsspec-type URLs, such as s3:// or hf:// when
+                provided as the destination path.
         """
         import csv
 
+        opener = open
+
+        if isinstance(path, str) and "://" in path:
+            from datachain.client.fsspec import Client
+
+            fs_kwargs = {
+                **self._query.catalog.client_config,
+                **(fs_kwargs or {}),
+            }
+
+            client = Client.get_implementation(path)
+
+            fsspec_fs = client.create_fs(**fs_kwargs)
+
+            opener = fsspec_fs.open
+
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
         results_iter = self.collect_flatten()
 
-        with open(path, "w", newline="") as f:
+        with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
             writer.writerow(column_names)
 
             for row in results_iter:
                 writer.writerow(row)
 
+    def to_json(
+        self,
+        path: Union[str, os.PathLike[str]],
+        fs_kwargs: Optional[dict[str, Any]] = None,
+        include_outer_list: bool = True,
+    ) -> None:
+        """Save chain to a JSON file.
+
+        Parameters:
+            path : Path to save the file. This supports local paths as well as
+                remote paths, such as s3:// or hf:// with fsspec.
+            fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
+                write, for fsspec-type URLs, such as s3:// or hf:// when
+                provided as the destination path.
+            include_outer_list : Sets whether to include an outer list for all rows.
+                Setting this to True makes the file valid JSON, while False instead
+                writes in the JSON lines format.
+        """
+        opener = open
+
+        if isinstance(path, str) and "://" in path:
+            from datachain.client.fsspec import Client
+
+            fs_kwargs = {
+                **self._query.catalog.client_config,
+                **(fs_kwargs or {}),
+            }
+
+            client = Client.get_implementation(path)
+
+            fsspec_fs = client.create_fs(**fs_kwargs)
+
+            opener = fsspec_fs.open
+
+        headers, _ = self._effective_signals_schema.get_headers_with_length()
+        headers = [list(filter(None, header)) for header in headers]
+
+        is_first = True
+
+        with opener(path, "wb") as f:
+            if include_outer_list:
+                # This makes the file JSON instead of JSON lines.
+                f.write(b"[\n")
+            for row in self.collect_flatten():
+                if not is_first:
+                    if include_outer_list:
+                        # This makes the file JSON instead of JSON lines.
+                        f.write(b",\n")
+                    else:
+                        f.write(b"\n")
+                else:
+                    is_first = False
+                f.write(orjson.dumps(row_to_nested_dict(headers, row)))
+            if include_outer_list:
+                # This makes the file JSON instead of JSON lines.
+                f.write(b"\n]\n")
+
+    def to_jsonl(
+        self,
+        path: Union[str, os.PathLike[str]],
+        fs_kwargs: Optional[dict[str, Any]] = None,
+    ) -> None:
+        """Save chain to a JSON lines file.
+
+        Parameters:
+            path : Path to save the file. This supports local paths as well as
+                remote paths, such as s3:// or hf:// with fsspec.
+            fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
+                write, for fsspec-type URLs, such as s3:// or hf:// when
+                provided as the destination path.
+        """
+        self.to_json(path, fs_kwargs, include_outer_list=False)
+
     @classmethod
     def from_records(
         cls,
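The new `to_json` and `to_jsonl` writers follow the same pattern as `to_csv`, including fsspec destinations. A short sketch, assuming an existing chain `dc` (paths are placeholders):

```py
dc.to_csv("results.csv")                        # local CSV
dc.to_json("s3://example-bucket/results.json")  # valid JSON: rows wrapped in an outer list
dc.to_jsonl("results.jsonl")                    # JSON lines: one object per row
```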
@@ -1994,6 +2187,8 @@ class DataChain:
             ),
         )
 
+        session.add_dataset_version(dsr, dsr.latest_version)
+
         if isinstance(to_insert, dict):
             to_insert = [to_insert]
         elif not to_insert:
datachain/lib/func/__init__.py
ADDED
@@ -0,0 +1,32 @@
+from .aggregate import (
+    any_value,
+    avg,
+    collect,
+    concat,
+    count,
+    dense_rank,
+    first,
+    max,
+    min,
+    rank,
+    row_number,
+    sum,
+)
+from .func import Func, window
+
+__all__ = [
+    "Func",
+    "any_value",
+    "avg",
+    "collect",
+    "concat",
+    "count",
+    "dense_rank",
+    "first",
+    "max",
+    "min",
+    "rank",
+    "row_number",
+    "sum",
+    "window",
+]