datachain 0.20.2__py3-none-any.whl → 0.20.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Note: the registry flags this version of datachain as potentially problematic.
- datachain/__init__.py +2 -3
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +3 -3
- datachain/cli/commands/ls.py +2 -2
- datachain/client/fsspec.py +5 -3
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +22 -7
- datachain/data_storage/sqlite.py +1 -4
- datachain/dataset.py +4 -3
- datachain/delta.py +2 -2
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -4
- datachain/lib/dc/datachain.py +178 -89
- datachain/lib/dc/datasets.py +46 -42
- datachain/lib/dc/storage.py +24 -38
- datachain/lib/file.py +77 -23
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +16 -18
- datachain/lib/projects.py +26 -26
- datachain/lib/pytorch.py +1 -1
- datachain/lib/tar.py +1 -2
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/namespace.py +3 -3
- datachain/project.py +5 -5
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/METADATA +1 -1
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/RECORD +32 -32
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/WHEEL +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datachain.py
CHANGED
@@ -26,6 +26,7 @@ from tqdm import tqdm
 from datachain import semver
 from datachain.dataset import DatasetRecord, parse_dataset_name
 from datachain.delta import delta_disabled
+from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -37,7 +38,6 @@ from datachain.lib.file import (
     FileExporter,
 )
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
@@ -439,10 +439,10 @@ class DataChain:
 
         from datachain.lib.arrow import schema_to_output
 
-        json_values =
+        json_values = self.limit(schema_sample_size).to_list(col)
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for json_value in json_values
+            for (json_value,) in json_values
         ]
 
         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -524,8 +524,10 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        project = get_project(
-            self.project_name,
+        project = self.session.catalog.metastore.get_project(
+            self.project_name,
+            self.namespace_name,
+            create=True,
         )
         return self._evolve(
             query=self._query.save(project=project, feature_schema=schema)
@@ -581,7 +583,15 @@ class DataChain:
             or self.session.catalog.metastore.default_project_name
         )
 
-
+        try:
+            project = self.session.catalog.metastore.get_project(
+                project_name,
+                namespace_name,
+                create=self.session.catalog.metastore.project_allowed_to_create,
+            )
+        except ProjectNotFoundError as e:
+            # not being able to create it as creation is not allowed
+            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
 
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
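The two hunks above route saving through the metastore's `get_project`, which can create the project on the fly or refuse with `ProjectCreateNotAllowedError`. A minimal sketch of how this surfaces to a caller; the `read_values` helper and the fully qualified `namespace.project.name` form are assumptions for illustration, not part of this diff:

```py
import datachain as dc
from datachain.error import ProjectCreateNotAllowedError

chain = dc.read_values(num=[1, 2, 3])

try:
    # Saving under a fully qualified name resolves (and, where allowed,
    # creates) the namespace/project through the metastore.
    chain.save("dev.analytics.numbers")
except ProjectCreateNotAllowedError:
    # Raised when the metastore forbids implicit project creation.
    print("project creation is not allowed on this deployment")
```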
@@ -893,7 +903,7 @@ class DataChain:
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `
+        Using `order_by` directly before `limit`, `to_list` and similar methods
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
@@ -1098,32 +1108,32 @@ class DataChain:
 
     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like
+        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema
 
     @overload
-    def
+    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def
+    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def
+    def _leaf_values(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
     @overload
-    def
+    def _leaf_values(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...
 
-    def
+    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
@@ -1151,7 +1161,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.
+        results_iter = self._leaf_values()
 
         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1184,9 +1194,9 @@ class DataChain:
 
     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self.
+            return list(self._leaf_values(include_hidden=include_hidden))
         return list(
-            self.
+            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
         )
 
     def to_records(self) -> list[dict[str, Any]]:
@@ -1197,42 +1207,38 @@ class DataChain:
 
         return self.results(row_factory=to_dict)
 
-
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
         """Yields rows of values, optionally limited to the specified columns.
 
         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.
 
         Yields:
-            (DataType): Yields a
-            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
-                selected.
+            (tuple[DataType, ...]): Yields a tuple of items for each row.
 
         Example:
             Iterating over all rows:
             ```py
-            for row in
+            for row in ds.to_iter():
+                print(row)
+            ```
+
+            DataChain is iterable and can be used in a for loop directly which is
+            equivalent to `ds.to_iter()`:
+            ```py
+            for row in ds:
                 print(row)
             ```
 
            Iterating over all rows with selected columns:
            ```py
-            for name, size in
+            for name, size in ds.to_iter("file.path", "file.size"):
                print(name, size)
            ```
 
            Iterating over a single column:
            ```py
-            for file in
+            for (file,) in ds.to_iter("file.path"):
                print(file)
            ```
        """
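The renamed `to_iter` always yields tuples, even when a single column is selected, hence the `(file,)` unpacking in the docstring above. A short sketch; `read_values` and the column names are assumptions used only for illustration:

```py
import datachain as dc

chain = dc.read_values(path=["a.txt", "b.txt"], size=[10, 20])

# Multiple columns: each row is a tuple in the order requested.
for path, size in chain.to_iter("path", "size"):
    print(path, size)

# Single column: rows are still 1-tuples, hence the parentheses.
for (path,) in chain.to_iter("path"):
    print(path)
```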
@@ -1244,7 +1250,31 @@ class DataChain:
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield
+            yield tuple(ret)
+
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+        """
+        Deprecated. Use `to_iter` method instead.
+        """
+        warnings.warn(
+            "Method `collect` is deprecated. Use `to_iter` method instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if len(cols) == 1:
+            yield from [item[0] for item in self.to_iter(*cols)]
+        else:
+            yield from self.to_iter(*cols)
 
     def to_pytorch(
         self,
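The shim keeps `collect` working but emits a `DeprecationWarning`; note it still yields bare values for a single column, while `to_iter` yields 1-tuples. A sketch of the migration, with `read_values` assumed for illustration:

```py
import warnings

import datachain as dc

chain = dc.read_values(num=[1, 2, 3])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_style = list(chain.collect("num"))  # bare values, plus a warning

assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Equivalent going forward; unpack the 1-tuples explicitly.
new_style = [num for (num,) in chain.to_iter("num")]
assert old_style == new_style
```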
@@ -1479,7 +1509,7 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
-    def
+    def diff(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1492,41 +1522,33 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-
-        for all rows. Beside additional diff column, new chain has schema of the chain
-        on which method was called.
+        """Calculate differences between two chains.
+
+        This method identifies records that are added, deleted, modified, or unchanged
+        between two chains. It adds a status column with values: A=added, D=deleted,
+        M=modified, S=same.
 
         Parameters:
-            other: Chain to
-            on: Column
-
-
-
-
-
-
-
-
-
-
-
-
-
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Name of the new column that is created in resulting chain
-                representing diff status.
+            other: Chain to compare against.
+            on: Column(s) to match records between chains.
+            right_on: Column(s) in the other chain to match against. Defaults to `on`.
+            compare: Column(s) to check for changes.
+                If not specified,all columns are used.
+            right_compare: Column(s) in the other chain to compare against.
+                Defaults to values of `compare`.
+            added (bool): Include records that exist in this chain but not in the other.
+            deleted (bool): Include records that exist only in the other chain.
+            modified (bool): Include records that exist in both
+                but have different values.
+            same (bool): Include records that are identical in both chains.
+            status_col (str): Name for the status column showing differences.
+
+        Default behavior: By default, shows added, deleted, and modified records,
+        but excludes unchanged records (same=False). Status column is not created.
 
         Example:
             ```py
-            res = persons.
+            res = persons.diff(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
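A sketch of the renamed `diff` based on the docstring above; the `read_values` chains and column values are illustrative assumptions:

```py
import datachain as dc

old = dc.read_values(id=[1, 2, 3], name=["a", "b", "c"])
new = dc.read_values(id=[2, 3, 4], name=["b", "B", "d"])

# Added, deleted, and modified records (the defaults), plus a status column.
changes = new.diff(old, on="id", status_col="diff")
for id_, name, status in changes.to_iter("id", "name", "diff"):
    print(id_, name, status)  # status is one of A, D, M
```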
@@ -1555,7 +1577,7 @@ class DataChain:
             status_col=status_col,
         )
 
-    def
+    def file_diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1566,31 +1588,29 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+        """Calculate differences between two chains containing files.
+
+        This method is specifically designed for file chains. It uses file `source`
+        and `path` to match files, and file `version` and `etag` to detect changes.
 
         Parameters:
-            other: Chain to
-            on: File
-
-
-
-
-
-
-
-
-
-
-                resulting chain representing diff status.
+            other: Chain to compare against.
+            on: File column name in this chain. Default is "file".
+            right_on: File column name in the other chain. Defaults to `on`.
+            added (bool): Include files that exist in this chain but not in the other.
+            deleted (bool): Include files that exist only in the other chain.
+            modified (bool): Include files that exist in both but have different
+                versions/etags.
+            same (bool): Include files that are identical in both chains.
+            status_col (str): Name for the status column showing differences
+                (A=added, D=deleted, M=modified, S=same).
+
+        Default behavior: By default, includes only new files (added=True and
+        modified=True). This is useful for incremental processing.
 
         Example:
             ```py
-            diff = images.
+            diff = images.file_diff(
                 new_images,
                 on="file",
                 right_on="other_file",
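The defaults of `file_diff` (added and modified files only) make it a natural fit for incremental pipelines. A sketch, assuming `read_storage`, the dataset name, and the bucket path, none of which are part of this diff:

```py
import datachain as dc

processed = dc.read_dataset("images_processed")
incoming = dc.read_storage("s3://bucket/images/")

# Defaults keep only added and modified files, matched on file source/path
# and compared on file version/etag, per the docstring above.
to_process = incoming.file_diff(processed, on="file")
to_process.save("images_batch")
```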
@@ -1615,7 +1635,7 @@ class DataChain:
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)
 
-        return self.
+        return self.diff(
             other,
             on_cols,
             right_on=right_on_cols,
@@ -2027,7 +2047,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.
+        results_iter = self._leaf_values()
 
         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2079,7 +2099,7 @@ class DataChain:
             if include_outer_list:
                 # This makes the file JSON instead of JSON lines.
                 f.write(b"[\n")
-            for row in self.
+            for row in self._leaf_values():
                 if not is_first:
                     if include_outer_list:
                         # This makes the file JSON instead of JSON lines.
@@ -2244,7 +2264,7 @@ class DataChain:
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.
+        file_exporter.run(self.to_values(signal), progress_bar)
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
@@ -2378,3 +2398,72 @@ class DataChain:
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
+
+    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+        """Returns a list of rows of values, optionally limited to the specified
+        columns.
+
+        Args:
+            *cols: Limit to the specified columns. By default, all columns are selected.
+
+        Returns:
+            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+        Example:
+            Getting all rows as a list:
+            ```py
+            rows = dc.to_list()
+            print(rows)
+            ```
+
+            Getting all rows with selected columns as a list:
+            ```py
+            name_size_pairs = dc.to_list("file.path", "file.size")
+            print(name_size_pairs)
+            ```
+
+            Getting a single column as a list:
+            ```py
+            files = dc.to_list("file.path")
+            print(files)  # Returns list of 1-tuples
+            ```
+        """
+        return list(self.to_iter(*cols))
+
+    def to_values(self, col: str) -> list[DataValue]:
+        """Returns a flat list of values from a single column.
+
+        Args:
+            col: The name of the column to extract values from.
+
+        Returns:
+            list[DataValue]: Returns a flat list of values from the specified column.
+
+        Example:
+            Getting all values from a single column:
+            ```py
+            file_paths = dc.to_values("file.path")
+            print(file_paths)  # Returns list of strings
+            ```
+
+            Getting all file sizes:
+            ```py
+            sizes = dc.to_values("file.size")
+            print(sizes)  # Returns list of integers
+            ```
+        """
+        return [row[0] for row in self.to_list(col)]
+
+    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+        """Make DataChain objects iterable.
+
+        Yields:
+            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+        Example:
+            ```py
+            for row in chain:
+                print(row)
+            ```
+        """
+        return self.to_iter()
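The three additions round out the eager-consumption API: `to_list` materializes `to_iter`, `to_values` flattens a single column, and `__iter__` makes the chain directly iterable. A small sketch; `read_values` and the column names are assumptions:

```py
import datachain as dc

chain = dc.read_values(path=["a.txt", "b.txt"], size=[10, 20])

rows = chain.to_list("path", "size")  # [("a.txt", 10), ("b.txt", 20)]
one_col = chain.to_list("path")       # [("a.txt",), ("b.txt",)], 1-tuples
paths = chain.to_values("path")       # ["a.txt", "b.txt"], flattened
all_rows = list(chain)                # same as list(chain.to_iter())
```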
datachain/lib/dc/datasets.py
CHANGED
@@ -2,7 +2,11 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
 from datachain.dataset import parse_dataset_name
-from datachain.error import
+from datachain.error import (
+    DatasetNotFoundError,
+    DatasetVersionNotFoundError,
+    ProjectNotFoundError,
+)
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     File,
@@ -33,7 +37,11 @@ def read_dataset(
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
     delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] =
+    delta_on: Optional[Union[str, Sequence[str]]] = (
+        "file.path",
+        "file.etag",
+        "file.version",
+    ),
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
@@ -53,41 +61,25 @@ def read_dataset(
         settings : Settings to use for the chain.
         fallback_to_studio : Try to pull dataset from Studio if not found locally.
             Default is True.
-        delta: If
-
-
-            dataset
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        delta_result_on: A list of fields in the resulting dataset that correspond
-            to the `delta_on` fields from the source.
-            This is needed to identify rows that have changed in the source but are
-            already present in the current version of the resulting dataset, in order
-            to avoid including outdated versions of those rows in the new dataset.
-            We retain only the latest versions of rows to prevent duplication.
-            There is no need to define this if the `delta_on` fields are present in
-            the final dataset and have not been renamed.
-        delta_compare: A list of fields used to check if the same row has been modified
-            in the new version of the source.
-            If not defined, all fields except those defined in delta_on will be used.
-        delta_retry: Specifies retry behavior for delta processing. If a string,
-            it's the name of a field in the result dataset that indicates an error
-            when not None - records with errors will be reprocessed. If True,
-            records that exist in the source dataset but not in the result dataset
-            will be reprocessed.
+        delta: If True, only process new or changed files instead of reprocessing
+            everything. This saves time by skipping files that were already processed in
+            previous versions. The optimization is working when a new version of the
+            dataset is created.
+            Default is False.
+        delta_on: Field(s) that uniquely identify each record in the source data.
+            Used to detect which records are new or changed.
+            Default is ("file.path", "file.etag", "file.version").
+        delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+            Only needed if you rename the identifying fields during processing.
+            Default is None.
+        delta_compare: Field(s) used to detect if a record has changed.
+            If not specified, all fields except `delta_on` fields are used.
+            Default is None.
+        delta_retry: Controls retry behavior for failed records:
+            - String (field name): Reprocess records where this field is not empty
+              (error mode)
+            - True: Reprocess records missing from the result dataset (missing mode)
+            - None: No retry processing (default)
 
     Example:
         ```py
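A sketch of the rewritten delta parameters in use; the dataset names and the "error" signal are illustrative assumptions, and `delta_on` keeps its documented default:

```py
import datachain as dc

# Re-running this keeps previous results and only processes what changed.
chain = dc.read_dataset(
    "raw_files",
    delta=True,           # skip records already processed in earlier versions
    delta_retry="error",  # also reprocess rows whose "error" field is non-empty
)
chain.save("processed_files")
```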
@@ -148,9 +140,15 @@ def read_dataset(
         # all 2.* dataset versions). If dataset doesn't have any versions where
         # major part is equal to that input, exception is thrown.
         major = int(version)
-
-
-
+        try:
+            ds_project = get_project(project_name, namespace_name, session=session)
+        except ProjectNotFoundError:
+            raise DatasetNotFoundError(
+                f"Dataset {name} not found in namespace {namespace_name} and",
+                f" project {project_name}",
+            ) from None
+
+        dataset = session.catalog.get_dataset(name, ds_project)
         latest_major = dataset.latest_major_version(major)
         if not latest_major:
             raise DatasetVersionNotFoundError(
@@ -228,7 +226,7 @@ def datasets(
         import datachain as dc
 
         chain = dc.datasets(column="dataset")
-        for ds in chain.
+        for ds in chain.to_iter("dataset"):
             print(f"{ds.name}@v{ds.version}")
         ```
     """
@@ -333,7 +331,13 @@ def delete_dataset(
             None, name, namespace_name, project_name, version=version, force=force
         )
 
-
+    try:
+        ds_project = get_project(project_name, namespace_name, session=session)
+    except ProjectNotFoundError:
+        raise DatasetNotFoundError(
+            f"Dataset {name} not found in namespace {namespace_name} and project",
+            f" {project_name}",
+        ) from None
 
     if not force:
         version = version or catalog.get_dataset(name, ds_project).latest_version
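With this change a missing namespace or project surfaces as `DatasetNotFoundError` instead of leaking `ProjectNotFoundError`. A sketch; the keyword names `namespace` and `project` on `delete_dataset` are an assumption for illustration:

```py
import datachain as dc
from datachain.error import DatasetNotFoundError

try:
    dc.delete_dataset("images", namespace="missing_ns", project="missing_proj")
except DatasetNotFoundError:
    # Also raised when the enclosing namespace/project does not exist.
    print("dataset not found")
```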
|