PyPI - datachain - Versions diffs - 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl - Mend

datachain 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (30) hide show

datachain/catalog/catalog.py +61 -219
datachain/cli.py +136 -22
datachain/client/fsspec.py +9 -0
datachain/client/local.py +11 -32
datachain/config.py +126 -51
datachain/data_storage/schema.py +66 -33
datachain/data_storage/sqlite.py +4 -4
datachain/data_storage/warehouse.py +101 -125
datachain/lib/arrow.py +2 -15
datachain/lib/data_model.py +10 -2
datachain/lib/dc.py +211 -52
datachain/lib/func/__init__.py +20 -2
datachain/lib/func/aggregate.py +319 -8
datachain/lib/func/func.py +97 -9
datachain/lib/listing.py +6 -21
datachain/lib/listing_info.py +4 -0
datachain/lib/signal_schema.py +8 -5
datachain/lib/udf.py +3 -3
datachain/lib/utils.py +30 -0
datachain/listing.py +22 -48
datachain/query/dataset.py +11 -3
datachain/remote/studio.py +63 -14
datachain/studio.py +129 -0
datachain/utils.py +58 -0
{datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/METADATA +7 -6
{datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/RECORD +30 -29
{datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/WHEEL +1 -1
{datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/LICENSE +0 -0
{datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/entry_points.txt +0 -0
{datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/top_level.txt +0 -0

datachain/lib/func/aggregate.py CHANGED Viewed

@@ -8,35 +8,346 @@ from .func import Func
 def count(col: Optional[str] = None) -> Func:
-    return Func(inner=sa_func.count, col=col, result_type=int)
+    """
+    Returns the COUNT aggregate SQL function for the given column name.
+    The COUNT function returns the number of rows in a table.
+    Args:
+        col (str, optional): The name of the column for which to count rows.
+                             If not provided, it defaults to counting all rows.
+    Returns:
+        Func: A Func object that represents the COUNT aggregate function.
+    Example:
+        ```py
+        dc.group_by(
+            count=func.count(),
+            partition_by="signal.category",
+        )
+        ```
+    Notes:
+        - Result column will always be of type int.
+    """
+    return Func("count", inner=sa_func.count, col=col, result_type=int)
 def sum(col: str) -> Func:
-    return Func(inner=sa_func.sum, col=col)
+    """
+    Returns the SUM aggregate SQL function for the given column name.
+    The SUM function returns the total sum of a numeric column in a table.
+    It sums up all the values for the specified column.
+    Args:
+        col (str): The name of the column for which to calculate the sum.
+    Returns:
+        Func: A Func object that represents the SUM aggregate function.
+    Example:
+        ```py
+        dc.group_by(
+            files_size=func.sum("file.size"),
+            partition_by="signal.category",
+        )
+        ```
+    Notes:
+        - The `sum` function should be used on numeric columns.
+        - Result column type will be the same as the input column type.
+    """
+    return Func("sum", inner=sa_func.sum, col=col)
 def avg(col: str) -> Func:
-    return Func(inner=dc_func.aggregate.avg, col=col)
+    """
+    Returns the AVG aggregate SQL function for the given column name.
+    The AVG function returns the average of a numeric column in a table.
+    It calculates the mean of all values in the specified column.
+    Args:
+        col (str): The name of the column for which to calculate the average.
+    Returns:
+        Func: A Func object that represents the AVG aggregate function.
+    Example:
+        ```py
+        dc.group_by(
+            avg_file_size=func.avg("file.size"),
+            partition_by="signal.category",
+        )
+        ```
+    Notes:
+        - The `avg` function should be used on numeric columns.
+        - Result column will always be of type float.
+    """
+    return Func("avg", inner=dc_func.aggregate.avg, col=col, result_type=float)
 def min(col: str) -> Func:
-    return Func(inner=sa_func.min, col=col)
+    """
+    Returns the MIN aggregate SQL function for the given column name.
+    The MIN function returns the smallest value in the specified column.
+    It can be used on both numeric and non-numeric columns to find the minimum value.
+    Args:
+        col (str): The name of the column for which to find the minimum value.
+    Returns:
+        Func: A Func object that represents the MIN aggregate function.
+    Example:
+        ```py
+        dc.group_by(
+            smallest_file=func.min("file.size"),
+            partition_by="signal.category",
+        )
+        ```
+    Notes:
+        - The `min` function can be used with numeric, date, and string columns.
+        - Result column will have the same type as the input column.
+    """
+    return Func("min", inner=sa_func.min, col=col)
 def max(col: str) -> Func:
-    return Func(inner=sa_func.max, col=col)
+    """
+    Returns the MAX aggregate SQL function for the given column name.
+    The MAX function returns the largest value in the specified column.
+    It can be used on both numeric and non-numeric columns to find the maximum value.
+    Args:
+        col (str): The name of the column for which to find the maximum value.
+    Returns:
+        Func: A Func object that represents the MAX aggregate function.
+    Example:
+        ```py
+        dc.group_by(
+            largest_file=func.max("file.size"),
+            partition_by="signal.category",
+        )
+        ```
+    Notes:
+        - The `max` function can be used with numeric, date, and string columns.
+        - Result column will have the same type as the input column.
+    """
+    return Func("max", inner=sa_func.max, col=col)
 def any_value(col: str) -> Func:
-    return Func(inner=dc_func.aggregate.any_value, col=col)
+    """
+    Returns the ANY_VALUE aggregate SQL function for the given column name.
+    The ANY_VALUE function returns an arbitrary value from the specified column.
+    It is useful when you do not care which particular value is returned,
+    as long as it comes from one of the rows in the group.
+    Args:
+        col (str): The name of the column from which to return an arbitrary value.
+    Returns:
+        Func: A Func object that represents the ANY_VALUE aggregate function.
+    Example:
+        ```py
+        dc.group_by(
+            file_example=func.any_value("file.name"),
+            partition_by="signal.category",
+        )
+        ```
+    Notes:
+        - The `any_value` function can be used with any type of column.
+        - Result column will have the same type as the input column.
+        - The result of `any_value` is non-deterministic,
+          meaning it may return different values for different executions.
+    """
+    return Func("any_value", inner=dc_func.aggregate.any_value, col=col)
 def collect(col: str) -> Func:
-    return Func(inner=dc_func.aggregate.collect, col=col, is_array=True)
+    """
+    Returns the COLLECT aggregate SQL function for the given column name.
+    The COLLECT function gathers all values from the specified column
+    into an array or similar structure. It is useful for combining values from a column
+    into a collection, often for further processing or aggregation.
+    Args:
+        col (str): The name of the column from which to collect values.
+    Returns:
+        Func: A Func object that represents the COLLECT aggregate function.
+    Example:
+        ```py
+        dc.group_by(
+            signals=func.collect("signal"),
+            partition_by="signal.category",
+        )
+        ```
+    Notes:
+        - The `collect` function can be used with numeric and string columns.
+        - Result column will have an array type.
+    """
+    return Func("collect", inner=dc_func.aggregate.collect, col=col, is_array=True)
 def concat(col: str, separator="") -> Func:
+    """
+    Returns the CONCAT aggregate SQL function for the given column name.
+    The CONCAT function concatenates values from the specified column
+    into a single string. It is useful for merging text values from multiple rows
+    into a single combined value.
+    Args:
+        col (str): The name of the column from which to concatenate values.
+        separator (str, optional): The separator to use between concatenated values.
+                                   Defaults to an empty string.
+    Returns:
+        Func: A Func object that represents the CONCAT aggregate function.
+    Example:
+        ```py
+        dc.group_by(
+            files=func.concat("file.name", separator=", "),
+            partition_by="signal.category",
+        )
+        ```
+    Notes:
+        - The `concat` function can be used with string columns.
+        - Result column will have a string type.
+    """
     def inner(arg):
         return dc_func.aggregate.group_concat(arg, separator)
-    return Func(inner=inner, col=col, result_type=str)
+    return Func("concat", inner=inner, col=col, result_type=str)
+def row_number() -> Func:
+    """
+    Returns the ROW_NUMBER window function for SQL queries.
+    The ROW_NUMBER function assigns a unique sequential integer to rows
+    within a partition of a result set, starting from 1 for the first row
+    in each partition. It is commonly used to generate row numbers within
+    partitions or ordered results.
+    Returns:
+        Func: A Func object that represents the ROW_NUMBER window function.
+    Example:
+        ```py
+        window = func.window(partition_by="signal.category", order_by="created_at")
+        dc.mutate(
+            row_number=func.row_number().over(window),
+        )
+        ```
+    Note:
+        - The result column will always be of type int.
+    """
+    return Func("row_number", inner=sa_func.row_number, result_type=int, is_window=True)
+def rank() -> Func:
+    """
+    Returns the RANK window function for SQL queries.
+    The RANK function assigns a rank to each row within a partition of a result set,
+    with gaps in the ranking for ties. Rows with equal values receive the same rank,
+    and the next rank is skipped (i.e., if two rows are ranked 1,
+    the next row is ranked 3).
+    Returns:
+        Func: A Func object that represents the RANK window function.
+    Example:
+        ```py
+        window = func.window(partition_by="signal.category", order_by="created_at")
+        dc.mutate(
+            rank=func.rank().over(window),
+        )
+        ```
+    Notes:
+        - The result column will always be of type int.
+        - The RANK function differs from ROW_NUMBER in that rows with the same value
+          in the ordering column(s) receive the same rank.
+    """
+    return Func("rank", inner=sa_func.rank, result_type=int, is_window=True)
+def dense_rank() -> Func:
+    """
+    Returns the DENSE_RANK window function for SQL queries.
+    The DENSE_RANK function assigns a rank to each row within a partition
+    of a result set, without gaps in the ranking for ties. Rows with equal values
+    receive the same rank, but the next rank is assigned consecutively
+    (i.e., if two rows are ranked 1, the next row will be ranked 2).
+    Returns:
+        Func: A Func object that represents the DENSE_RANK window function.
+    Example:
+        ```py
+        window = func.window(partition_by="signal.category", order_by="created_at")
+        dc.mutate(
+            dense_rank=func.dense_rank().over(window),
+        )
+        ```
+    Notes:
+        - The result column will always be of type int.
+        - The DENSE_RANK function differs from RANK in that it does not leave gaps
+          in the ranking for tied values.
+    """
+    return Func("dense_rank", inner=sa_func.dense_rank, result_type=int, is_window=True)
+def first(col: str) -> Func:
+    """
+    Returns the FIRST_VALUE window function for SQL queries.
+    The FIRST_VALUE function returns the first value in an ordered set of values
+    within a partition. The first value is determined by the specified order
+    and can be useful for retrieving the leading value in a group of rows.
+    Args:
+        col (str): The name of the column from which to retrieve the first value.
+    Returns:
+        Func: A Func object that represents the FIRST_VALUE window function.
+    Example:
+        ```py
+        window = func.window(partition_by="signal.category", order_by="created_at")
+        dc.mutate(
+            first_file=func.first("file.name").over(window),
+        )
+        ```
+    Notes:
+        - The result of `first_value` will always reflect the value of the first row
+          in the specified order.
+        - The result column will have the same type as the input column.
+    """
+    return Func("first", inner=sa_func.first_value, col=col, is_window=True)

datachain/lib/func/func.py CHANGED Viewed

@@ -1,7 +1,10 @@
+from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable, Optional
+from sqlalchemy import desc
 from datachain.lib.convert.python_to_sql import python_to_sql
-from datachain.lib.utils import DataChainColumnError
+from datachain.lib.utils import DataChainColumnError, DataChainParamsError
 from datachain.query.schema import Column, ColumnMeta
 if TYPE_CHECKING:
@@ -9,18 +12,89 @@ if TYPE_CHECKING:
     from datachain.lib.signal_schema import SignalSchema
+@dataclass
+class Window:
+    """Represents a window specification for SQL window functions."""
+    partition_by: str
+    order_by: str
+    desc: bool = False
+def window(partition_by: str, order_by: str, desc: bool = False) -> Window:
+    """
+    Defines a window specification for SQL window functions.
+    The `window` function specifies how to partition and order the result set
+    for the associated window function. It is used to define the scope of the rows
+    that the window function will operate on.
+    Args:
+        partition_by (str): The column name by which to partition the result set.
+                            Rows with the same value in the partition column
+                            will be grouped together for the window function.
+        order_by (str): The column name by which to order the rows
+                        within each partition. This determines the sequence in which
+                        the window function is applied.
+        desc (bool, optional): If True, the rows will be ordered in descending order.
+                               Defaults to False, which orders the rows
+                               in ascending order.
+    Returns:
+        Window: A Window object representing the window specification.
+    Example:
+        ```py
+        window = func.window(partition_by="signal.category", order_by="created_at")
+        dc.mutate(
+            row_number=func.row_number().over(window),
+        )
+        ```
+    """
+    return Window(
+        ColumnMeta.to_db_name(partition_by),
+        ColumnMeta.to_db_name(order_by),
+        desc,
+    )
 class Func:
+    """Represents a function to be applied to a column in a SQL query."""
     def __init__(
         self,
+        name: str,
         inner: Callable,
         col: Optional[str] = None,
         result_type: Optional["DataType"] = None,
         is_array: bool = False,
+        is_window: bool = False,
+        window: Optional[Window] = None,
     ) -> None:
+        self.name = name
         self.inner = inner
         self.col = col
         self.result_type = result_type
         self.is_array = is_array
+        self.is_window = is_window
+        self.window = window
+    def __str__(self) -> str:
+        return self.name + "()"
+    def over(self, window: Window) -> "Func":
+        if not self.is_window:
+            raise DataChainParamsError(f"{self} doesn't support window (over())")
+        return Func(
+            "over",
+            self.inner,
+            self.col,
+            self.result_type,
+            self.is_array,
+            self.is_window,
+            window,
+        )
     @property
     def db_col(self) -> Optional[str]:
@@ -33,31 +107,45 @@ class Func:
         return list[col_type] if self.is_array else col_type  # type: ignore[valid-type]
     def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
-        col_type = self.db_col_type(signals_schema)
         if self.result_type:
             return self.result_type
-        if col_type:
+        if col_type := self.db_col_type(signals_schema):
             return col_type
         raise DataChainColumnError(
-            str(self.inner),
+            str(self),
             "Column name is required to infer result type",
         )
     def get_column(
         self, signals_schema: "SignalSchema", label: Optional[str] = None
     ) -> Column:
+        col_type = self.get_result_type(signals_schema)
+        sql_type = python_to_sql(col_type)
         if self.col:
-            if label == "collect":
-                print(label)
-            col_type = self.get_result_type(signals_schema)
-            col = Column(self.db_col, python_to_sql(col_type))
+            col = Column(self.db_col, sql_type)
             func_col = self.inner(col)
         else:
             func_col = self.inner()
+        if self.is_window:
+            if not self.window:
+                raise DataChainParamsError(
+                    f"Window function {self} requires over() clause with a window spec",
+                )
+            func_col = func_col.over(
+                partition_by=self.window.partition_by,
+                order_by=(
+                    desc(self.window.order_by)
+                    if self.window.desc
+                    else self.window.order_by
+                ),
+            )
+        func_col.type = sql_type
         if label:
             func_col = func_col.label(label)

datachain/lib/listing.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import posixpath
 from collections.abc import Iterator
-from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar
 from fsspec.asyn import get_loop
@@ -85,12 +84,13 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     storage_uri, path = Client.parse_url(uri)
     telemetry.log_param("client", client.PREFIX)
-    # clean path without globs
-    lst_uri_path = (
-        posixpath.dirname(path) if uses_glob(path) or client.fs.isfile(uri) else path
-    )
+    if uses_glob(path) or client.fs.isfile(uri):
+        lst_uri_path = posixpath.dirname(path)
+    else:
+        storage_uri, path = Client.parse_url(f'{uri.rstrip("/")}/')
+        lst_uri_path = path
-    lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
+    lst_uri = f'{storage_uri}/{lst_uri_path.lstrip("/")}'
     ds_name = (
         f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )
@@ -108,18 +108,3 @@ def listing_uri_from_name(dataset_name: str) -> str:
     if not is_listing_dataset(dataset_name):
         raise ValueError(f"Dataset {dataset_name} is not a listing")
     return dataset_name.removeprefix(LISTING_PREFIX)
-def is_listing_expired(created_at: datetime) -> bool:
-    """Checks if listing has expired based on it's creation date"""
-    return datetime.now(timezone.utc) > created_at + timedelta(seconds=LISTING_TTL)
-def is_listing_subset(ds1_name: str, ds2_name: str) -> bool:
-    """
-    Checks if one listing contains another one by comparing corresponding dataset names
-    """
-    assert ds1_name.endswith("/")
-    assert ds2_name.endswith("/")
-    return ds2_name.startswith(ds1_name)

datachain/lib/listing_info.py CHANGED Viewed

@@ -30,3 +30,7 @@ class ListingInfo(DatasetInfo):
     def last_inserted_at(self):
         # TODO we need to add updated_at to dataset version or explicit last_inserted_at
         raise NotImplementedError
+    def contains(self, other_name: str) -> bool:
+        """Checks if this listing contains another one"""
+        return other_name.startswith(self.name)

datachain/lib/signal_schema.py CHANGED Viewed

@@ -20,6 +20,7 @@ from typing import (  # noqa: UP035
 )
 from pydantic import BaseModel, create_model
+from sqlalchemy import ColumnElement
 from typing_extensions import Literal as LiteralEx
 from datachain.lib.convert.python_to_sql import python_to_sql
@@ -27,6 +28,7 @@ from datachain.lib.convert.sql_to_python import sql_to_python
 from datachain.lib.convert.unflatten import unflatten_to_json_pos
 from datachain.lib.data_model import DataModel, DataType, DataValue
 from datachain.lib.file import File
+from datachain.lib.func import Func
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
 from datachain.query.schema import DEFAULT_DELIMITER, Column
@@ -490,13 +492,14 @@ class SignalSchema:
                 # renaming existing signal
                 del new_values[value.name]
                 new_values[name] = self.values[value.name]
-            elif name in self.values:
-                # changing the type of existing signal, e.g File -> ImageFile
-                del new_values[name]
-                new_values[name] = args_map[name]
-            else:
+            elif isinstance(value, Func):
+                # adding new signal with function
+                new_values[name] = value.get_result_type(self)
+            elif isinstance(value, ColumnElement):
                 # adding new signal
                 new_values[name] = sql_to_python(value)
+            else:
+                new_values[name] = value
         return SignalSchema(new_values)

datachain/lib/udf.py CHANGED Viewed

@@ -11,7 +11,6 @@ from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.data_model import DataValue
 from datachain.lib.file import File
-from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
 from datachain.query.batch import (
     Batch,
@@ -25,6 +24,7 @@ if TYPE_CHECKING:
     from typing_extensions import Self
     from datachain.catalog import Catalog
+    from datachain.lib.signal_schema import SignalSchema
     from datachain.lib.udf_signature import UdfSignature
     from datachain.query.batch import RowsOutput
@@ -172,7 +172,7 @@ class UDFBase(AbstractUDF):
     def _init(
         self,
         sign: "UdfSignature",
-        params: SignalSchema,
+        params: "SignalSchema",
         func: Optional[Callable],
     ):
         self.params = params
@@ -183,7 +183,7 @@ class UDFBase(AbstractUDF):
     def _create(
         cls,
         sign: "UdfSignature",
-        params: SignalSchema,
+        params: "SignalSchema",
     ) -> "Self":
         if isinstance(sign.func, AbstractUDF):
             if not isinstance(sign.func, cls):  # type: ignore[unreachable]

datachain/lib/utils.py CHANGED Viewed

@@ -1,4 +1,6 @@
+import re
 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 class AbstractUDF(ABC):
@@ -28,3 +30,31 @@ class DataChainParamsError(DataChainError):
 class DataChainColumnError(DataChainParamsError):
     def __init__(self, col_name, msg):
         super().__init__(f"Error for column {col_name}: {msg}")
+def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
+    gen_col_counter = 0
+    new_col_names = {}
+    org_col_names = set(col_names)
+    for org_column in col_names:
+        new_column = org_column.lower()
+        new_column = re.sub("[^0-9a-z]+", "_", new_column)
+        new_column = new_column.strip("_")
+        generated_column = new_column
+        while (
+            not generated_column.isidentifier()
+            or generated_column in new_col_names
+            or (generated_column != org_column and generated_column in org_col_names)
+        ):
+            if new_column:
+                generated_column = f"c{gen_col_counter}_{new_column}"
+            else:
+                generated_column = f"c{gen_col_counter}"
+            gen_col_counter += 1
+        new_col_names[generated_column] = org_column
+    return new_col_names

datachain 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

Potentially problematic release.

datachain 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl