datachain 0.18.4__py3-none-any.whl → 0.18.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of datachain might be problematic; see the release's advisory page for more details.

@@ -911,11 +911,7 @@ class Catalog:
911
911
  values["num_objects"] = None
912
912
  values["size"] = None
913
913
  values["preview"] = None
914
- self.metastore.update_dataset_version(
915
- dataset,
916
- version,
917
- **values,
918
- )
914
+ self.metastore.update_dataset_version(dataset, version, **values)
919
915
  return
920
916
 
921
917
  if not dataset_version.num_objects:
@@ -935,11 +931,7 @@ class Catalog:
935
931
  if not values:
936
932
  return
937
933
 
938
- self.metastore.update_dataset_version(
939
- dataset,
940
- version,
941
- **values,
942
- )
934
+ self.metastore.update_dataset_version(dataset, version, **values)
943
935
 
944
936
  def update_dataset(
945
937
  self, dataset: DatasetRecord, conn=None, **kwargs
datachain/client/azure.py CHANGED
@@ -65,7 +65,7 @@ class AzureClient(Client):
65
65
  if entries:
66
66
  await result_queue.put(entries)
67
67
  pbar.update(len(entries))
68
- if not found:
68
+ if not found and prefix:
69
69
  raise FileNotFoundError(
70
70
  f"Unable to resolve remote path: {prefix}"
71
71
  )
datachain/client/gcs.py CHANGED
@@ -74,7 +74,7 @@ class GCSClient(Client):
74
74
  try:
75
75
  await self._get_pages(prefix, page_queue)
76
76
  found = await consumer
77
- if not found:
77
+ if not found and prefix:
78
78
  raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")
79
79
  finally:
80
80
  consumer.cancel() # In case _get_pages() raised
datachain/client/s3.py CHANGED
@@ -80,7 +80,7 @@ class ClientS3(Client):
80
80
  finally:
81
81
  await page_queue.put(None)
82
82
 
83
- async def process_pages(page_queue, result_queue):
83
+ async def process_pages(page_queue, result_queue, prefix):
84
84
  found = False
85
85
  with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
86
86
  while (res := await page_queue.get()) is not None:
@@ -94,7 +94,7 @@ class ClientS3(Client):
94
94
  if entries:
95
95
  await result_queue.put(entries)
96
96
  pbar.update(len(entries))
97
- if not found:
97
+ if not found and prefix:
98
98
  raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")
99
99
 
100
100
  try:
@@ -118,7 +118,9 @@ class ClientS3(Client):
118
118
  Delimiter="",
119
119
  )
120
120
  page_queue: asyncio.Queue[list] = asyncio.Queue(2)
121
- consumer = asyncio.create_task(process_pages(page_queue, result_queue))
121
+ consumer = asyncio.create_task(
122
+ process_pages(page_queue, result_queue, prefix)
123
+ )
122
124
  try:
123
125
  await get_pages(it, page_queue)
124
126
  await consumer
@@ -36,6 +36,7 @@ from datachain.dataset import (
36
36
  )
37
37
  from datachain.error import (
38
38
  DatasetNotFoundError,
39
+ DatasetVersionNotFoundError,
39
40
  TableMissingError,
40
41
  )
41
42
  from datachain.job import Job
@@ -273,7 +274,6 @@ class AbstractMetastore(ABC, Serializable):
273
274
  self,
274
275
  job_id: str,
275
276
  status: Optional[JobStatus] = None,
276
- exit_code: Optional[int] = None,
277
277
  error_message: Optional[str] = None,
278
278
  error_stack: Optional[str] = None,
279
279
  finished_at: Optional[datetime] = None,
@@ -620,22 +620,36 @@ class AbstractDBMetastore(AbstractMetastore):
620
620
  self, dataset: DatasetRecord, conn=None, **kwargs
621
621
  ) -> DatasetRecord:
622
622
  """Updates dataset fields."""
623
- values = {}
624
- dataset_values = {}
623
+ values: dict[str, Any] = {}
624
+ dataset_values: dict[str, Any] = {}
625
625
  for field, value in kwargs.items():
626
- if field in self._dataset_fields[1:]:
627
- if field in ["attrs", "schema"]:
628
- values[field] = json.dumps(value) if value else None
626
+ if field in ("id", "created_at") or field not in self._dataset_fields:
627
+ continue # these fields are read-only or not applicable
628
+
629
+ if value is None and field in ("name", "status", "sources", "query_script"):
630
+ raise ValueError(f"Field {field} cannot be None")
631
+ if field == "name" and not value:
632
+ raise ValueError("name cannot be empty")
633
+
634
+ if field == "attrs":
635
+ if value is None:
636
+ values[field] = None
629
637
  else:
630
- values[field] = value
631
- if field == "schema":
632
- dataset_values[field] = DatasetRecord.parse_schema(value)
638
+ values[field] = json.dumps(value)
639
+ dataset_values[field] = value
640
+ elif field == "schema":
641
+ if value is None:
642
+ values[field] = None
643
+ dataset_values[field] = None
633
644
  else:
634
- dataset_values[field] = value
645
+ values[field] = json.dumps(value)
646
+ dataset_values[field] = DatasetRecord.parse_schema(value)
647
+ else:
648
+ values[field] = value
649
+ dataset_values[field] = value
635
650
 
636
651
  if not values:
637
- # Nothing to update
638
- return dataset
652
+ return dataset # nothing to update
639
653
 
640
654
  d = self._datasets
641
655
  self.db.execute(
@@ -651,36 +665,70 @@ class AbstractDBMetastore(AbstractMetastore):
651
665
  self, dataset: DatasetRecord, version: str, conn=None, **kwargs
652
666
  ) -> DatasetVersion:
653
667
  """Updates dataset fields."""
654
- dataset_version = dataset.get_version(version)
655
-
656
- values = {}
657
- version_values: dict = {}
668
+ values: dict[str, Any] = {}
669
+ version_values: dict[str, Any] = {}
658
670
  for field, value in kwargs.items():
659
- if field in self._dataset_version_fields[1:]:
660
- if field == "schema":
661
- values[field] = json.dumps(value) if value else None
662
- version_values[field] = DatasetRecord.parse_schema(value)
663
- elif field == "feature_schema":
664
- values[field] = json.dumps(value) if value else None
665
- version_values[field] = value
666
- elif field == "preview" and isinstance(value, list):
667
- values[field] = json.dumps(value, cls=JSONSerialize)
668
- version_values[field] = value
671
+ if (
672
+ field in ("id", "created_at")
673
+ or field not in self._dataset_version_fields
674
+ ):
675
+ continue # these fields are read-only or not applicable
676
+
677
+ if value is None and field in (
678
+ "status",
679
+ "sources",
680
+ "query_script",
681
+ "error_message",
682
+ "error_stack",
683
+ "script_output",
684
+ "uuid",
685
+ ):
686
+ raise ValueError(f"Field {field} cannot be None")
687
+
688
+ if field == "schema":
689
+ values[field] = json.dumps(value) if value else None
690
+ version_values[field] = (
691
+ DatasetRecord.parse_schema(value) if value else None
692
+ )
693
+ elif field == "feature_schema":
694
+ if value is None:
695
+ values[field] = None
696
+ else:
697
+ values[field] = json.dumps(value)
698
+ version_values[field] = value
699
+ elif field == "preview":
700
+ if value is None:
701
+ values[field] = None
702
+ elif not isinstance(value, list):
703
+ raise ValueError(
704
+ f"Field '{field}' must be a list, got {type(value).__name__}"
705
+ )
669
706
  else:
670
- values[field] = value
671
- version_values[field] = value
707
+ values[field] = json.dumps(value, cls=JSONSerialize)
708
+ version_values["_preview_data"] = value
709
+ else:
710
+ values[field] = value
711
+ version_values[field] = value
672
712
 
673
- if values:
674
- dv = self._datasets_versions
675
- self.db.execute(
676
- self._datasets_versions_update()
677
- .where(dv.c.dataset_id == dataset.id, dv.c.version == version)
678
- .values(values),
679
- conn=conn,
680
- ) # type: ignore [attr-defined]
681
- dataset_version.update(**version_values)
713
+ if not values:
714
+ return dataset.get_version(version)
715
+
716
+ dv = self._datasets_versions
717
+ self.db.execute(
718
+ self._datasets_versions_update()
719
+ .where(dv.c.dataset_id == dataset.id, dv.c.version == version)
720
+ .values(values),
721
+ conn=conn,
722
+ ) # type: ignore [attr-defined]
723
+
724
+ for v in dataset.versions:
725
+ if v.version == version:
726
+ v.update(**version_values)
727
+ return v
682
728
 
683
- return dataset_version
729
+ raise DatasetVersionNotFoundError(
730
+ f"Dataset {dataset.name} does not have version {version}"
731
+ )
684
732
 
685
733
  def _parse_dataset(self, rows) -> Optional[DatasetRecord]:
686
734
  versions = [self.dataset_class.parse(*r) for r in rows]
@@ -812,7 +860,7 @@ class AbstractDBMetastore(AbstractMetastore):
812
860
  update_data["error_message"] = error_message
813
861
  update_data["error_stack"] = error_stack
814
862
 
815
- self.update_dataset(dataset, conn=conn, **update_data)
863
+ dataset = self.update_dataset(dataset, conn=conn, **update_data)
816
864
 
817
865
  if version:
818
866
  self.update_dataset_version(dataset, version, conn=conn, **update_data)
@@ -1064,7 +1112,6 @@ class AbstractDBMetastore(AbstractMetastore):
1064
1112
  self,
1065
1113
  job_id: str,
1066
1114
  status: Optional[JobStatus] = None,
1067
- exit_code: Optional[int] = None,
1068
1115
  error_message: Optional[str] = None,
1069
1116
  error_stack: Optional[str] = None,
1070
1117
  finished_at: Optional[datetime] = None,
@@ -1075,8 +1122,6 @@ class AbstractDBMetastore(AbstractMetastore):
1075
1122
  values: dict = {}
1076
1123
  if status is not None:
1077
1124
  values["status"] = status
1078
- if exit_code is not None:
1079
- values["exit_code"] = exit_code
1080
1125
  if error_message is not None:
1081
1126
  values["error_message"] = error_message
1082
1127
  if error_stack is not None:
@@ -1,78 +1,89 @@
1
- from typing import Optional
1
+ from typing import Optional, Union
2
2
 
3
3
  from sqlalchemy import func as sa_func
4
4
 
5
+ from datachain.query.schema import Column
5
6
  from datachain.sql.functions import aggregate
6
7
 
7
8
  from .func import Func
8
9
 
9
10
 
10
- def count(col: Optional[str] = None) -> Func:
11
+ def count(col: Optional[Union[str, Column]] = None) -> Func:
11
12
  """
12
- Returns the COUNT aggregate SQL function for the given column name.
13
+ Returns a COUNT aggregate SQL function for the specified column.
13
14
 
14
- The COUNT function returns the number of rows in a table.
15
+ The COUNT function returns the number of rows, optionally filtered
16
+ by a specific column.
15
17
 
16
18
  Args:
17
- col (str, optional): The name of the column for which to count rows.
18
- If not provided, it defaults to counting all rows.
19
+ col (str | Column, optional): The column to count.
20
+ If omitted, counts all rows.
21
+ The column can be specified as a string or a `Column` object.
19
22
 
20
23
  Returns:
21
- Func: A Func object that represents the COUNT aggregate function.
24
+ Func: A `Func` object representing the COUNT aggregate function.
22
25
 
23
26
  Example:
24
27
  ```py
25
28
  dc.group_by(
26
- count=func.count(),
29
+ count1=func.count(),
30
+ count2=func.count("signal.id"),
31
+ count3=func.count(dc.C("signal.category")),
27
32
  partition_by="signal.category",
28
33
  )
29
34
  ```
30
35
 
31
36
  Notes:
32
- - Result column will always be of type int.
37
+ - The result column will always have an integer type.
33
38
  """
34
39
  return Func(
35
- "count", inner=sa_func.count, cols=[col] if col else None, result_type=int
40
+ "count",
41
+ inner=sa_func.count,
42
+ cols=[col] if col is not None else None,
43
+ result_type=int,
36
44
  )
37
45
 
38
46
 
39
- def sum(col: str) -> Func:
47
+ def sum(col: Union[str, Column]) -> Func:
40
48
  """
41
- Returns the SUM aggregate SQL function for the given column name.
49
+ Returns the SUM aggregate SQL function for the specified column.
42
50
 
43
51
  The SUM function returns the total sum of a numeric column in a table.
44
52
  It sums up all the values for the specified column.
45
53
 
46
54
  Args:
47
- col (str): The name of the column for which to calculate the sum.
55
+ col (str | Column): The name of the column for which to calculate the sum.
56
+ The column can be specified as a string or a `Column` object.
48
57
 
49
58
  Returns:
50
- Func: A Func object that represents the SUM aggregate function.
59
+ Func: A `Func` object that represents the SUM aggregate function.
51
60
 
52
61
  Example:
53
62
  ```py
54
63
  dc.group_by(
55
64
  files_size=func.sum("file.size"),
65
+ total_size=func.sum(dc.C("size")),
56
66
  partition_by="signal.category",
57
67
  )
58
68
  ```
59
69
 
60
70
  Notes:
61
71
  - The `sum` function should be used on numeric columns.
62
- - Result column type will be the same as the input column type.
72
+ - The result column type will be the same as the input column type.
63
73
  """
64
74
  return Func("sum", inner=sa_func.sum, cols=[col])
65
75
 
66
76
 
67
- def avg(col: str) -> Func:
77
+ def avg(col: Union[str, Column]) -> Func:
68
78
  """
69
- Returns the AVG aggregate SQL function for the given column name.
79
+ Returns the AVG aggregate SQL function for the specified column.
70
80
 
71
81
  The AVG function returns the average of a numeric column in a table.
72
82
  It calculates the mean of all values in the specified column.
73
83
 
74
84
  Args:
75
- col (str): The name of the column for which to calculate the average.
85
+ col (str | Column): The name of the column for which to calculate the average.
86
+ Column can be specified as a string or a `Column` object.
76
87
 
77
88
  Returns:
78
89
  Func: A Func object that represents the AVG aggregate function.
@@ -81,26 +92,28 @@ def avg(col: str) -> Func:
81
92
  ```py
82
93
  dc.group_by(
83
94
  avg_file_size=func.avg("file.size"),
95
+ avg_signal_value=func.avg(dc.C("signal.value")),
84
96
  partition_by="signal.category",
85
97
  )
86
98
  ```
87
99
 
88
100
  Notes:
89
101
  - The `avg` function should be used on numeric columns.
90
- - Result column will always be of type float.
102
+ - The result column will always be of type float.
91
103
  """
92
104
  return Func("avg", inner=aggregate.avg, cols=[col], result_type=float)
93
105
 
94
106
 
95
- def min(col: str) -> Func:
107
+ def min(col: Union[str, Column]) -> Func:
96
108
  """
97
- Returns the MIN aggregate SQL function for the given column name.
109
+ Returns the MIN aggregate SQL function for the specified column.
98
110
 
99
111
  The MIN function returns the smallest value in the specified column.
100
112
  It can be used on both numeric and non-numeric columns to find the minimum value.
101
113
 
102
114
  Args:
103
- col (str): The name of the column for which to find the minimum value.
115
+ col (str | Column): The name of the column for which to find the minimum value.
116
+ Column can be specified as a string or a `Column` object.
104
117
 
105
118
  Returns:
106
119
  Func: A Func object that represents the MIN aggregate function.
@@ -109,18 +122,19 @@ def min(col: str) -> Func:
109
122
  ```py
110
123
  dc.group_by(
111
124
  smallest_file=func.min("file.size"),
125
+ min_signal=func.min(dc.C("signal")),
112
126
  partition_by="signal.category",
113
127
  )
114
128
  ```
115
129
 
116
130
  Notes:
117
131
  - The `min` function can be used with numeric, date, and string columns.
118
- - Result column will have the same type as the input column.
132
+ - The result column will have the same type as the input column.
119
133
  """
120
134
  return Func("min", inner=sa_func.min, cols=[col])
121
135
 
122
136
 
123
- def max(col: str) -> Func:
137
+ def max(col: Union[str, Column]) -> Func:
124
138
  """
125
139
  Returns the MAX aggregate SQL function for the given column name.
126
140
 
@@ -128,7 +142,8 @@ def max(col: str) -> Func:
128
142
  It can be used on both numeric and non-numeric columns to find the maximum value.
129
143
 
130
144
  Args:
131
- col (str): The name of the column for which to find the maximum value.
145
+ col (str | Column): The name of the column for which to find the maximum value.
146
+ Column can be specified as a string or a `Column` object.
132
147
 
133
148
  Returns:
134
149
  Func: A Func object that represents the MAX aggregate function.
@@ -137,18 +152,19 @@ def max(col: str) -> Func:
137
152
  ```py
138
153
  dc.group_by(
139
154
  largest_file=func.max("file.size"),
155
+ max_signal=func.max(dc.C("signal")),
140
156
  partition_by="signal.category",
141
157
  )
142
158
  ```
143
159
 
144
160
  Notes:
145
161
  - The `max` function can be used with numeric, date, and string columns.
146
- - Result column will have the same type as the input column.
162
+ - The result column will have the same type as the input column.
147
163
  """
148
164
  return Func("max", inner=sa_func.max, cols=[col])
149
165
 
150
166
 
151
- def any_value(col: str) -> Func:
167
+ def any_value(col: Union[str, Column]) -> Func:
152
168
  """
153
169
  Returns the ANY_VALUE aggregate SQL function for the given column name.
154
170
 
@@ -157,7 +173,9 @@ def any_value(col: str) -> Func:
157
173
  as long as it comes from one of the rows in the group.
158
174
 
159
175
  Args:
160
- col (str): The name of the column from which to return an arbitrary value.
176
+ col (str | Column): The name of the column from which to return
177
+ an arbitrary value.
178
+ Column can be specified as a string or a `Column` object.
161
179
 
162
180
  Returns:
163
181
  Func: A Func object that represents the ANY_VALUE aggregate function.
@@ -166,20 +184,21 @@ def any_value(col: str) -> Func:
166
184
  ```py
167
185
  dc.group_by(
168
186
  file_example=func.any_value("file.path"),
187
+ signal_example=func.any_value(dc.C("signal.value")),
169
188
  partition_by="signal.category",
170
189
  )
171
190
  ```
172
191
 
173
192
  Notes:
174
193
  - The `any_value` function can be used with any type of column.
175
- - Result column will have the same type as the input column.
194
+ - The result column will have the same type as the input column.
176
195
  - The result of `any_value` is non-deterministic,
177
196
  meaning it may return different values for different executions.
178
197
  """
179
198
  return Func("any_value", inner=aggregate.any_value, cols=[col])
180
199
 
181
200
 
182
- def collect(col: str) -> Func:
201
+ def collect(col: Union[str, Column]) -> Func:
183
202
  """
184
203
  Returns the COLLECT aggregate SQL function for the given column name.
185
204
 
@@ -188,7 +207,8 @@ def collect(col: str) -> Func:
188
207
  into a collection, often for further processing or aggregation.
189
208
 
190
209
  Args:
191
- col (str): The name of the column from which to collect values.
210
+ col (str | Column): The name of the column from which to collect values.
211
+ Column can be specified as a string or a `Column` object.
192
212
 
193
213
  Returns:
194
214
  Func: A Func object that represents the COLLECT aggregate function.
@@ -197,18 +217,19 @@ def collect(col: str) -> Func:
197
217
  ```py
198
218
  dc.group_by(
199
219
  signals=func.collect("signal"),
220
+ file_paths=func.collect(dc.C("file.path")),
200
221
  partition_by="signal.category",
201
222
  )
202
223
  ```
203
224
 
204
225
  Notes:
205
226
  - The `collect` function can be used with numeric and string columns.
206
- - Result column will have an array type.
227
+ - The result column will have an array type.
207
228
  """
208
229
  return Func("collect", inner=aggregate.collect, cols=[col], is_array=True)
209
230
 
210
231
 
211
- def concat(col: str, separator="") -> Func:
232
+ def concat(col: Union[str, Column], separator="") -> Func:
212
233
  """
213
234
  Returns the CONCAT aggregate SQL function for the given column name.
214
235
 
@@ -217,9 +238,10 @@ def concat(col: str, separator="") -> Func:
217
238
  into a single combined value.
218
239
 
219
240
  Args:
220
- col (str): The name of the column from which to concatenate values.
241
+ col (str | Column): The name of the column from which to concatenate values.
242
+ Column can be specified as a string or a `Column` object.
221
243
  separator (str, optional): The separator to use between concatenated values.
222
- Defaults to an empty string.
244
+ Defaults to an empty string.
223
245
 
224
246
  Returns:
225
247
  Func: A Func object that represents the CONCAT aggregate function.
@@ -228,13 +250,14 @@ def concat(col: str, separator="") -> Func:
228
250
  ```py
229
251
  dc.group_by(
230
252
  files=func.concat("file.path", separator=", "),
253
+ signals=func.concat(dc.C("signal.name"), separator=" | "),
231
254
  partition_by="signal.category",
232
255
  )
233
256
  ```
234
257
 
235
258
  Notes:
236
259
  - The `concat` function can be used with string columns.
237
- - Result column will have a string type.
260
+ - The result column will have a string type.
238
261
  """
239
262
 
240
263
  def inner(arg):
@@ -325,7 +348,7 @@ def dense_rank() -> Func:
325
348
  return Func("dense_rank", inner=sa_func.dense_rank, result_type=int, is_window=True)
326
349
 
327
350
 
328
- def first(col: str) -> Func:
351
+ def first(col: Union[str, Column]) -> Func:
329
352
  """
330
353
  Returns the FIRST_VALUE window function for SQL queries.
331
354
 
@@ -334,7 +357,9 @@ def first(col: str) -> Func:
334
357
  and can be useful for retrieving the leading value in a group of rows.
335
358
 
336
359
  Args:
337
- col (str): The name of the column from which to retrieve the first value.
360
+ col (str | Column): The name of the column from which to retrieve
361
+ the first value.
362
+ Column can be specified as a string or a `Column` object.
338
363
 
339
364
  Returns:
340
365
  Func: A Func object that represents the FIRST_VALUE window function.
@@ -344,6 +369,7 @@ def first(col: str) -> Func:
344
369
  window = func.window(partition_by="signal.category", order_by="created_at")
345
370
  dc.mutate(
346
371
  first_file=func.first("file.path").over(window),
372
+ first_signal=func.first(dc.C("signal.value")).over(window),
347
373
  )
348
374
  ```
349
375