PyPI - datachain - Versions diffs - 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl - Mend

datachain 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (46)

datachain/__init__.py +0 -2
datachain/catalog/catalog.py +12 -9
datachain/cli.py +109 -9
datachain/client/fsspec.py +9 -9
datachain/data_storage/metastore.py +63 -11
datachain/data_storage/schema.py +2 -2
datachain/data_storage/sqlite.py +5 -4
datachain/data_storage/warehouse.py +18 -18
datachain/dataset.py +142 -14
datachain/func/__init__.py +49 -0
datachain/{lib/func → func}/aggregate.py +13 -11
datachain/func/array.py +176 -0
datachain/func/base.py +23 -0
datachain/func/conditional.py +81 -0
datachain/func/func.py +384 -0
datachain/func/path.py +110 -0
datachain/func/random.py +23 -0
datachain/func/string.py +154 -0
datachain/func/window.py +49 -0
datachain/lib/arrow.py +24 -12
datachain/lib/data_model.py +25 -9
datachain/lib/dataset_info.py +9 -5
datachain/lib/dc.py +94 -56
datachain/lib/hf.py +1 -1
datachain/lib/signal_schema.py +1 -1
datachain/lib/utils.py +1 -0
datachain/lib/webdataset_laion.py +5 -5
datachain/model/bbox.py +2 -2
datachain/model/pose.py +5 -5
datachain/model/segment.py +2 -2
datachain/nodes_fetcher.py +2 -2
datachain/query/dataset.py +57 -34
datachain/remote/studio.py +40 -8
datachain/sql/__init__.py +0 -2
datachain/sql/functions/__init__.py +0 -26
datachain/sql/selectable.py +11 -5
datachain/sql/sqlite/base.py +11 -2
datachain/studio.py +29 -0
{datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/METADATA +2 -2
{datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/RECORD +44 -37
datachain/lib/func/__init__.py +0 -32
datachain/lib/func/func.py +0 -152
{datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/LICENSE +0 -0
{datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/WHEEL +0 -0
{datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/entry_points.txt +0 -0
{datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/top_level.txt +0 -0

datachain/dataset.py CHANGED Viewed

@@ -15,7 +15,9 @@ from datachain.error import DatasetVersionNotFoundError
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 T = TypeVar("T", bound="DatasetRecord")
+LT = TypeVar("LT", bound="DatasetListRecord")
 V = TypeVar("V", bound="DatasetVersion")
+LV = TypeVar("LV", bound="DatasetListVersion")
 DD = TypeVar("DD", bound="DatasetDependency")
 DATASET_PREFIX = "ds://"
@@ -264,6 +266,59 @@ class DatasetVersion:
         return cls(**kwargs)
+@dataclass
+class DatasetListVersion:
+    id: int
+    uuid: str
+    dataset_id: int
+    version: int
+    status: int
+    created_at: datetime
+    finished_at: Optional[datetime]
+    error_message: str
+    error_stack: str
+    num_objects: Optional[int]
+    size: Optional[int]
+    query_script: str = ""
+    job_id: Optional[str] = None
+    @classmethod
+    def parse(
+        cls: type[LV],
+        id: int,
+        uuid: str,
+        dataset_id: int,
+        version: int,
+        status: int,
+        created_at: datetime,
+        finished_at: Optional[datetime],
+        error_message: str,
+        error_stack: str,
+        num_objects: Optional[int],
+        size: Optional[int],
+        query_script: str = "",
+        job_id: Optional[str] = None,
+    ):
+        return cls(
+            id,
+            uuid,
+            dataset_id,
+            version,
+            status,
+            created_at,
+            finished_at,
+            error_message,
+            error_stack,
+            num_objects,
+            size,
+            query_script,
+            job_id,
+        )
+    def __hash__(self):
+        return hash(f"{self.dataset_id}_{self.version}")
 @dataclass
 class DatasetRecord:
     id: int
@@ -447,20 +502,6 @@ class DatasetRecord:
         identifier = self.identifier(version)
         return f"{DATASET_PREFIX}{identifier}"
-    @property
-    def is_bucket_listing(self) -> bool:
-        """
-        For bucket listing we implicitly create underlying dataset to hold data. This
-        method is checking if this is one of those datasets.
-        """
-        from datachain.client import Client
-        # TODO refactor and maybe remove method in
-        # https://github.com/iterative/datachain/issues/318
-        return Client.is_data_source_uri(self.name) or self.name.startswith(
-            LISTING_PREFIX
-        )
     @property
     def versions_values(self) -> list[int]:
         """
@@ -499,5 +540,92 @@ class DatasetRecord:
         return cls(**kwargs, versions=versions)
+@dataclass
+class DatasetListRecord:
+    id: int
+    name: str
+    description: Optional[str]
+    labels: list[str]
+    versions: list[DatasetListVersion]
+    created_at: Optional[datetime] = None
+    @classmethod
+    def parse(  # noqa: PLR0913
+        cls: type[LT],
+        id: int,
+        name: str,
+        description: Optional[str],
+        labels: str,
+        created_at: datetime,
+        version_id: int,
+        version_uuid: str,
+        version_dataset_id: int,
+        version: int,
+        version_status: int,
+        version_created_at: datetime,
+        version_finished_at: Optional[datetime],
+        version_error_message: str,
+        version_error_stack: str,
+        version_num_objects: Optional[int],
+        version_size: Optional[int],
+        version_query_script: Optional[str],
+        version_job_id: Optional[str] = None,
+    ) -> "DatasetListRecord":
+        labels_lst: list[str] = json.loads(labels) if labels else []
+        dataset_version = DatasetListVersion.parse(
+            version_id,
+            version_uuid,
+            version_dataset_id,
+            version,
+            version_status,
+            version_created_at,
+            version_finished_at,
+            version_error_message,
+            version_error_stack,
+            version_num_objects,
+            version_size,
+            version_query_script,  # type: ignore[arg-type]
+            version_job_id,
+        )
+        return cls(
+            id,
+            name,
+            description,
+            labels_lst,
+            [dataset_version],
+            created_at,
+        )
+    def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
+        """Merge versions from another dataset"""
+        if other.id != self.id:
+            raise RuntimeError("Cannot merge versions of datasets with different ids")
+        if not other.versions:
+            # nothing to merge
+            return self
+        if not self.versions:
+            self.versions = []
+        self.versions = list(set(self.versions + other.versions))
+        self.versions.sort(key=lambda v: v.version)
+        return self
+    @property
+    def is_bucket_listing(self) -> bool:
+        """
+        For bucket listing we implicitly create underlying dataset to hold data. This
+        method is checking if this is one of those datasets.
+        """
+        from datachain.client import Client
+        # TODO refactor and maybe remove method in
+        # https://github.com/iterative/datachain/issues/318
+        return Client.is_data_source_uri(self.name) or self.name.startswith(
+            LISTING_PREFIX
+        )
 class RowDict(dict):
     pass

datachain/func/__init__.py ADDED Viewed

@@ -0,0 +1,49 @@
+from sqlalchemy import literal
+from . import array, path, random, string
+from .aggregate import (
+    any_value,
+    avg,
+    collect,
+    concat,
+    count,
+    dense_rank,
+    first,
+    max,
+    min,
+    rank,
+    row_number,
+    sum,
+)
+from .array import cosine_distance, euclidean_distance, length, sip_hash_64
+from .conditional import greatest, least
+from .random import rand
+from .window import window
+__all__ = [
+    "any_value",
+    "array",
+    "avg",
+    "collect",
+    "concat",
+    "cosine_distance",
+    "count",
+    "dense_rank",
+    "euclidean_distance",
+    "first",
+    "greatest",
+    "least",
+    "length",
+    "literal",
+    "max",
+    "min",
+    "path",
+    "rand",
+    "random",
+    "rank",
+    "row_number",
+    "sip_hash_64",
+    "string",
+    "sum",
+    "window",
+]

datachain/{lib/func → func}/aggregate.py RENAMED Viewed

@@ -2,7 +2,7 @@ from typing import Optional
 from sqlalchemy import func as sa_func
-from datachain.sql import functions as dc_func
+from datachain.sql.functions import aggregate
 from .func import Func
@@ -31,7 +31,9 @@ def count(col: Optional[str] = None) -> Func:
     Notes:
         - Result column will always be of type int.
     """
-    return Func("count", inner=sa_func.count, col=col, result_type=int)
+    return Func(
+        "count", inner=sa_func.count, cols=[col] if col else None, result_type=int
+    )
 def sum(col: str) -> Func:
@@ -59,7 +61,7 @@ def sum(col: str) -> Func:
         - The `sum` function should be used on numeric columns.
         - Result column type will be the same as the input column type.
     """
-    return Func("sum", inner=sa_func.sum, col=col)
+    return Func("sum", inner=sa_func.sum, cols=[col])
 def avg(col: str) -> Func:
@@ -87,7 +89,7 @@ def avg(col: str) -> Func:
         - The `avg` function should be used on numeric columns.
         - Result column will always be of type float.
     """
-    return Func("avg", inner=dc_func.aggregate.avg, col=col, result_type=float)
+    return Func("avg", inner=aggregate.avg, cols=[col], result_type=float)
 def min(col: str) -> Func:
@@ -115,7 +117,7 @@ def min(col: str) -> Func:
         - The `min` function can be used with numeric, date, and string columns.
         - Result column will have the same type as the input column.
     """
-    return Func("min", inner=sa_func.min, col=col)
+    return Func("min", inner=sa_func.min, cols=[col])
 def max(col: str) -> Func:
@@ -143,7 +145,7 @@ def max(col: str) -> Func:
         - The `max` function can be used with numeric, date, and string columns.
         - Result column will have the same type as the input column.
     """
-    return Func("max", inner=sa_func.max, col=col)
+    return Func("max", inner=sa_func.max, cols=[col])
 def any_value(col: str) -> Func:
@@ -174,7 +176,7 @@ def any_value(col: str) -> Func:
         - The result of `any_value` is non-deterministic,
           meaning it may return different values for different executions.
     """
-    return Func("any_value", inner=dc_func.aggregate.any_value, col=col)
+    return Func("any_value", inner=aggregate.any_value, cols=[col])
 def collect(col: str) -> Func:
@@ -203,7 +205,7 @@ def collect(col: str) -> Func:
         - The `collect` function can be used with numeric and string columns.
         - Result column will have an array type.
     """
-    return Func("collect", inner=dc_func.aggregate.collect, col=col, is_array=True)
+    return Func("collect", inner=aggregate.collect, cols=[col], is_array=True)
 def concat(col: str, separator="") -> Func:
@@ -236,9 +238,9 @@ def concat(col: str, separator="") -> Func:
     """
     def inner(arg):
-        return dc_func.aggregate.group_concat(arg, separator)
+        return aggregate.group_concat(arg, separator)
-    return Func("concat", inner=inner, col=col, result_type=str)
+    return Func("concat", inner=inner, cols=[col], result_type=str)
 def row_number() -> Func:
@@ -350,4 +352,4 @@ def first(col: str) -> Func:
           in the specified order.
         - The result column will have the same type as the input column.
     """
-    return Func("first", inner=sa_func.first_value, col=col, is_window=True)
+    return Func("first", inner=sa_func.first_value, cols=[col], is_window=True)

datachain/func/array.py ADDED Viewed

@@ -0,0 +1,176 @@
+from collections.abc import Sequence
+from typing import Union
+from datachain.sql.functions import array
+from .func import Func
+def cosine_distance(*args: Union[str, Sequence]) -> Func:
+    """
+    Computes the cosine distance between two vectors.
+    The cosine distance is derived from the cosine similarity, which measures the angle
+    between two vectors. This function returns the dissimilarity between the vectors,
+    where 0 indicates identical vectors and values closer to 1
+    indicate higher dissimilarity.
+    Args:
+        args (str | Sequence): Two vectors to compute the cosine distance between.
+            If a string is provided, it is assumed to be the name of the column vector.
+            If a sequence is provided, it is assumed to be a vector of values.
+    Returns:
+        Func: A Func object that represents the cosine_distance function.
+    Example:
+        ```py
+        target_embedding = [0.1, 0.2, 0.3]
+        dc.mutate(
+            cos_dist1=func.cosine_distance("embedding", target_embedding),
+            cos_dist2=func.cosine_distance(target_embedding, [0.4, 0.5, 0.6]),
+        )
+        ```
+    Notes:
+        - Ensure both vectors have the same number of elements.
+        - Result column will always be of type float.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, str):
+            cols.append(arg)
+        else:
+            func_args.append(list(arg))
+    if len(cols) + len(func_args) != 2:
+        raise ValueError("cosine_distance() requires exactly two arguments")
+    if not cols and len(func_args[0]) != len(func_args[1]):
+        raise ValueError("cosine_distance() requires vectors of the same length")
+    return Func(
+        "cosine_distance",
+        inner=array.cosine_distance,
+        cols=cols,
+        args=func_args,
+        result_type=float,
+    )
+def euclidean_distance(*args: Union[str, Sequence]) -> Func:
+    """
+    Computes the Euclidean distance between two vectors.
+    The Euclidean distance is the straight-line distance between two points
+    in Euclidean space. This function returns the distance between the two vectors.
+    Args:
+        args (str | Sequence): Two vectors to compute the Euclidean distance between.
+            If a string is provided, it is assumed to be the name of the column vector.
+            If a sequence is provided, it is assumed to be a vector of values.
+    Returns:
+        Func: A Func object that represents the euclidean_distance function.
+    Example:
+        ```py
+        target_embedding = [0.1, 0.2, 0.3]
+        dc.mutate(
+            eu_dist1=func.euclidean_distance("embedding", target_embedding),
+            eu_dist2=func.euclidean_distance(target_embedding, [0.4, 0.5, 0.6]),
+        )
+        ```
+    Notes:
+        - Ensure both vectors have the same number of elements.
+        - Result column will always be of type float.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, str):
+            cols.append(arg)
+        else:
+            func_args.append(list(arg))
+    if len(cols) + len(func_args) != 2:
+        raise ValueError("euclidean_distance() requires exactly two arguments")
+    if not cols and len(func_args[0]) != len(func_args[1]):
+        raise ValueError("euclidean_distance() requires vectors of the same length")
+    return Func(
+        "euclidean_distance",
+        inner=array.euclidean_distance,
+        cols=cols,
+        args=func_args,
+        result_type=float,
+    )
+def length(arg: Union[str, Sequence, Func]) -> Func:
+    """
+    Returns the length of the array.
+    Args:
+        arg (str | Sequence | Func): Array to compute the length of.
+            If a string is provided, it is assumed to be the name of the array column.
+            If a sequence is provided, it is assumed to be an array of values.
+            If a Func is provided, it is assumed to be a function returning an array.
+    Returns:
+        Func: A Func object that represents the array length function.
+    Example:
+        ```py
+        dc.mutate(
+            len1=func.array.length("signal.values"),
+            len2=func.array.length([1, 2, 3, 4, 5]),
+        )
+        ```
+    Note:
+        - Result column will always be of type int.
+    """
+    if isinstance(arg, (str, Func)):
+        cols = [arg]
+        args = None
+    else:
+        cols = None
+        args = [arg]
+    return Func("length", inner=array.length, cols=cols, args=args, result_type=int)
+def sip_hash_64(arg: Union[str, Sequence]) -> Func:
+    """
+    Computes the SipHash-64 hash of the array.
+    Args:
+        arg (str | Sequence): Array to compute the SipHash-64 hash of.
+            If a string is provided, it is assumed to be the name of the array column.
+            If a sequence is provided, it is assumed to be an array of values.
+    Returns:
+        Func: A Func object that represents the sip_hash_64 function.
+    Example:
+        ```py
+        dc.mutate(
+            hash1=func.sip_hash_64("signal.values"),
+            hash2=func.sip_hash_64([1, 2, 3, 4, 5]),
+        )
+        ```
+    Note:
+        - This function is only available for the ClickHouse warehouse.
+        - Result column will always be of type int.
+    """
+    if isinstance(arg, str):
+        cols = [arg]
+        args = None
+    else:
+        cols = None
+        args = [arg]
+    return Func(
+        "sip_hash_64", inner=array.sip_hash_64, cols=cols, args=args, result_type=int
+    )

datachain/func/base.py ADDED Viewed

@@ -0,0 +1,23 @@
+from abc import ABCMeta, abstractmethod
+from typing import TYPE_CHECKING, Optional
+if TYPE_CHECKING:
+    from sqlalchemy import TableClause
+    from datachain.lib.signal_schema import SignalSchema
+    from datachain.query.schema import Column
+class Function:
+    __metaclass__ = ABCMeta
+    name: str
+    @abstractmethod
+    def get_column(
+        self,
+        signals_schema: Optional["SignalSchema"] = None,
+        label: Optional[str] = None,
+        table: Optional["TableClause"] = None,
+    ) -> "Column":
+        pass

datachain/func/conditional.py ADDED Viewed

@@ -0,0 +1,81 @@
+from typing import Union
+from datachain.sql.functions import conditional
+from .func import ColT, Func
+def greatest(*args: Union[ColT, float]) -> Func:
+    """
+    Returns the greatest (largest) value from the given input values.
+    Args:
+        args (ColT | str | int | float | Sequence): The values to compare.
+            If a string is provided, it is assumed to be the name of the column.
+            If a Func is provided, it is assumed to be a function returning a value.
+            If an int, float, or Sequence is provided, it is assumed to be a literal.
+    Returns:
+        Func: A Func object that represents the greatest function.
+    Example:
+        ```py
+        dc.mutate(
+            greatest=func.greatest("signal.value", 0),
+        )
+        ```
+    Note:
+        - Result column will always be of the same type as the input columns.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, (str, Func)):
+            cols.append(arg)
+        else:
+            func_args.append(arg)
+    return Func(
+        "greatest",
+        inner=conditional.greatest,
+        cols=cols,
+        args=func_args,
+        result_type=int,
+    )
+def least(*args: Union[ColT, float]) -> Func:
+    """
+    Returns the least (smallest) value from the given input values.
+    Args:
+        args (ColT | str | int | float | Sequence): The values to compare.
+            If a string is provided, it is assumed to be the name of the column.
+            If a Func is provided, it is assumed to be a function returning a value.
+            If an int, float, or Sequence is provided, it is assumed to be a literal.
+    Returns:
+        Func: A Func object that represents the least function.
+    Example:
+        ```py
+        dc.mutate(
+            least=func.least("signal.value", 0),
+        )
+        ```
+    Note:
+        - Result column will always be of the same type as the input columns.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, (str, Func)):
+            cols.append(arg)
+        else:
+            func_args.append(arg)
+    return Func(
+        "least", inner=conditional.least, cols=cols, args=func_args, result_type=int
+    )

datachain 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl

Potentially problematic release.

datachain 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl