PyPI - datachain - Versions diffs - 0.5.1__tar.gz → 0.6.1__tar.gz - Mend

datachain 0.5.1__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (257) hide show

{datachain-0.5.1 → datachain-0.6.1}/.pre-commit-config.yaml RENAMED Viewed

@@ -4,7 +4,7 @@ ci:
   skip: [mypy]
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: check-added-large-files
         exclude: '^tests/examples/data/'
@@ -24,7 +24,7 @@ repos:
       - id: trailing-whitespace
         exclude: '^LICENSES/'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.6.8'
+    rev: 'v0.6.9'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

{datachain-0.5.1/src/datachain.egg-info → datachain-0.6.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.5.1
+Version: 0.6.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -81,7 +81,7 @@ Requires-Dist: requests-mock; extra == "tests"
 Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.11.2; extra == "dev"
+Requires-Dist: mypy==1.12.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"

{datachain-0.5.1 → datachain-0.6.1}/pyproject.toml RENAMED Viewed

@@ -93,7 +93,7 @@ tests = [
 ]
 dev = [
   "datachain[docs,tests]",
-  "mypy==1.11.2",
+  "mypy==1.12.0",
   "types-python-dateutil",
   "types-pytz",
   "types-PyYAML",

{datachain-0.5.1 → datachain-0.6.1}/src/datachain/__init__.py RENAMED Viewed

@@ -1,3 +1,4 @@
+from datachain.lib import func
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
 from datachain.lib.dc import C, Column, DataChain, Sys
 from datachain.lib.file import (
@@ -34,6 +35,7 @@ __all__ = [
     "Sys",
     "TarVFile",
     "TextFile",
+    "func",
     "is_chain_type",
     "metrics",
     "param",

{datachain-0.5.1 → datachain-0.6.1}/src/datachain/catalog/catalog.py RENAMED Viewed

@@ -989,13 +989,6 @@ class Catalog:
             c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
         }
-        job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
-        if not job_id:
-            from datachain.query.session import Session
-            session = Session.get(catalog=self)
-            job_id = session.job_id
         dataset = self.metastore.create_dataset_version(
             dataset,
             version,
@@ -1218,6 +1211,7 @@ class Catalog:
             preview=dataset_version.preview,
             job_id=dataset_version.job_id,
         )
         # to avoid re-creating rows table, we are just renaming it for a new version
         # of target dataset
         self.warehouse.rename_dataset_table(
@@ -1325,8 +1319,6 @@ class Catalog:
         if offset:
             q = q.offset(offset)
-        q = q.order_by("sys__id")
         return q.to_db_records()
     def signed_url(self, source: str, path: str, client_config=None) -> str:

{datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/sqlite.py RENAMED Viewed

@@ -763,6 +763,14 @@ class SQLiteWarehouse(AbstractWarehouse):
         query: Select,
         progress_cb: Optional[Callable[[int], None]] = None,
     ) -> None:
+        if len(query._group_by_clause) > 0:
+            select_q = query.with_only_columns(
+                *[c for c in query.selected_columns if c.name != "sys__id"]
+            )
+            q = table.insert().from_select(list(select_q.selected_columns), select_q)
+            self.db.execute(q)
+            return
         if "sys__id" in query.selected_columns:
             col_id = query.selected_columns.sys__id
         else:

{datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/warehouse.py RENAMED Viewed

@@ -215,10 +215,6 @@ class AbstractWarehouse(ABC, Serializable):
         limit = query._limit
         paginated_query = query.limit(page_size)
-        if not paginated_query._order_by_clauses:
-            # default order by is order by `sys__id`
-            paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
         results = None
         offset = 0
         num_yielded = 0

datachain-0.6.1/src/datachain/lib/convert/sql_to_python.py ADDED Viewed

@@ -0,0 +1,14 @@
+from decimal import Decimal
+from typing import Any
+from sqlalchemy import ColumnElement
+def sql_to_python(sql_exp: ColumnElement) -> Any:
+    try:
+        type_ = sql_exp.type.python_type
+        if type_ == Decimal:
+            type_ = float
+    except NotImplementedError:
+        type_ = str
+    return type_

{datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/convert/values_to_tuples.py RENAMED Viewed

@@ -4,7 +4,7 @@ from typing import Any, Union
 from datachain.lib.data_model import (
     DataType,
     DataTypeNames,
-    DataValuesType,
+    DataValue,
     is_chain_type,
 )
 from datachain.lib.utils import DataChainParamsError
@@ -20,7 +20,7 @@ class ValuesToTupleError(DataChainParamsError):
 def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map: Sequence[DataValuesType],
+    **fr_map: Sequence[DataValue],
 ) -> tuple[Any, Any, Any]:
     if output:
         if not isinstance(output, (Sequence, str, dict)):

{datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/data_model.py RENAMED Viewed

@@ -18,7 +18,7 @@ StandardType = Union[
 ]
 DataType = Union[type[BaseModel], StandardType]
 DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
-DataValuesType = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
+DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
 class DataModel(BaseModel):

{datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/dc.py RENAMED Viewed

@@ -29,6 +29,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
+from datachain.lib.func import Func
 from datachain.lib.listing import (
     is_listing_dataset,
     is_listing_expired,
@@ -42,26 +43,18 @@ from datachain.lib.meta_formats import read_meta, read_schema
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
-from datachain.lib.udf import (
-    Aggregator,
-    BatchMapper,
-    Generator,
-    Mapper,
-    UDFBase,
-)
+from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
 from datachain.lib.udf_signature import UdfSignature
-from datachain.lib.utils import DataChainParamsError
+from datachain.lib.utils import DataChainColumnError, DataChainParamsError
 from datachain.query import Session
-from datachain.query.dataset import (
-    DatasetQuery,
-    PartitionByType,
-)
-from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
+from datachain.query.dataset import DatasetQuery, PartitionByType
+from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
 from datachain.sql.functions import path as pathfunc
 from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook
 if TYPE_CHECKING:
+    from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self
     from datachain.lib.hf import HFDatasetType
@@ -148,11 +141,6 @@ class DatasetMergeError(DataChainParamsError):  # noqa: D101
         super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
-class DataChainColumnError(DataChainParamsError):  # noqa: D101
-    def __init__(self, col_name, msg):  # noqa: D107
-        super().__init__(f"Error for column {col_name}: {msg}")
 OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
@@ -981,10 +969,9 @@ class DataChain:
         row is left in the result set.
         Example:
-        ```py
-         dc.distinct("file.parent", "file.name")
-        )
-        ```
+            ```py
+            dc.distinct("file.parent", "file.name")
+            ```
         """
         return self._evolve(
             query=self._query.distinct(
@@ -1010,6 +997,60 @@ class DataChain:
             query=self._query.select(*columns), signal_schema=new_schema
         )
+    def group_by(
+        self,
+        *,
+        partition_by: Union[str, Sequence[str]],
+        **kwargs: Func,
+    ) -> "Self":
+        """Group rows by specified set of signals and return new signals
+        with aggregated values.
+        Example:
+            ```py
+            chain = chain.group_by(
+                cnt=func.count(),
+                partition_by=("file_source", "file_ext"),
+            )
+            ```
+        """
+        if isinstance(partition_by, str):
+            partition_by = [partition_by]
+        if not partition_by:
+            raise ValueError("At least one column should be provided for partition_by")
+        if not kwargs:
+            raise ValueError("At least one column should be provided for group_by")
+        for col_name, func in kwargs.items():
+            if not isinstance(func, Func):
+                raise DataChainColumnError(
+                    col_name,
+                    f"Column {col_name} has type {type(func)} but expected Func object",
+                )
+        partition_by_columns: list[Column] = []
+        signal_columns: list[Column] = []
+        schema_fields: dict[str, DataType] = {}
+        # validate partition_by columns and add them to the schema
+        for col_name in partition_by:
+            col_db_name = ColumnMeta.to_db_name(col_name)
+            col_type = self.signals_schema.get_column_type(col_db_name)
+            col = Column(col_db_name, python_to_sql(col_type))
+            partition_by_columns.append(col)
+            schema_fields[col_db_name] = col_type
+        # validate signal columns and add them to the schema
+        for col_name, func in kwargs.items():
+            col = func.get_column(self.signals_schema, label=col_name)
+            signal_columns.append(col)
+            schema_fields[col_name] = func.get_result_type(self.signals_schema)
+        return self._evolve(
+            query=self._query.group_by(signal_columns, partition_by_columns),
+            signal_schema=SignalSchema(schema_fields),
+        )
     def mutate(self, **kwargs) -> "Self":
         """Create new signals based on existing signals.
@@ -1024,7 +1065,7 @@ class DataChain:
         The supported functions:
            Numerical:   +, -, *, /, rand(), avg(), count(), func(),
                         greatest(), least(), max(), min(), sum()
-           String:      length(), split()
+           String:      length(), split(), replace(), regexp_replace()
            Filename:    name(), parent(), file_stem(), file_ext()
            Array:       length(), sip_hash_64(), euclidean_distance(),
                         cosine_distance()
@@ -1476,12 +1517,6 @@ class DataChain:
         fr_map = {col.lower(): df[col].tolist() for col in df.columns}
         for column in fr_map:
-            if column in DatasetRow.schema:
-                raise DatasetPrepareError(
-                    name,
-                    f"import from pandas error - column '{column}' conflicts with"
-                    " default schema",
-                )
             if not column.isidentifier():
                 raise DatasetPrepareError(
                     name,
@@ -1709,6 +1744,7 @@ class DataChain:
         nrows=None,
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
+        column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1727,6 +1763,9 @@ class DataChain:
             nrows : Optional row limit.
             session : Session to use for the chain.
             settings : Settings to use for the chain.
+            column_types : Dictionary of column names and their corresponding types.
+                It is passed to CSV reader and for each column specified type auto
+                inference is disabled.
         Example:
             Reading a csv file:
@@ -1742,6 +1781,15 @@ class DataChain:
         from pandas.io.parsers.readers import STR_NA_VALUES
         from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat
+        from pyarrow.lib import type_for_alias
+        if column_types:
+            column_types = {
+                name: type_for_alias(typ) if isinstance(typ, str) else typ
+                for name, typ in column_types.items()
+            }
+        else:
+            column_types = {}
         chain = DataChain.from_storage(
             path, session=session, settings=settings, **kwargs
@@ -1767,7 +1815,9 @@ class DataChain:
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
         convert_options = ConvertOptions(
-            strings_can_be_null=True, null_values=STR_NA_VALUES
+            strings_can_be_null=True,
+            null_values=STR_NA_VALUES,
+            column_types=column_types,
         )
         format = CsvFileFormat(
             parse_options=parse_options,
@@ -1978,6 +2028,8 @@ class DataChain:
             ),
         )
+        session.add_dataset_version(dsr, dsr.latest_version)
         if isinstance(to_insert, dict):
             to_insert = [to_insert]
         elif not to_insert:

datachain-0.6.1/src/datachain/lib/func/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from .aggregate import any_value, avg, collect, concat, count, max, min, sum
+from .func import Func
+__all__ = [
+    "Func",
+    "any_value",
+    "avg",
+    "collect",
+    "concat",
+    "count",
+    "max",
+    "min",
+    "sum",
+]

datachain-0.6.1/src/datachain/lib/func/aggregate.py ADDED Viewed

@@ -0,0 +1,42 @@
+from typing import Optional
+from sqlalchemy import func as sa_func
+from datachain.sql import functions as dc_func
+from .func import Func
+def count(col: Optional[str] = None) -> Func:
+    return Func(inner=sa_func.count, col=col, result_type=int)
+def sum(col: str) -> Func:
+    return Func(inner=sa_func.sum, col=col)
+def avg(col: str) -> Func:
+    return Func(inner=dc_func.aggregate.avg, col=col)
+def min(col: str) -> Func:
+    return Func(inner=sa_func.min, col=col)
+def max(col: str) -> Func:
+    return Func(inner=sa_func.max, col=col)
+def any_value(col: str) -> Func:
+    return Func(inner=dc_func.aggregate.any_value, col=col)
+def collect(col: str) -> Func:
+    return Func(inner=dc_func.aggregate.collect, col=col, is_array=True)
+def concat(col: str, separator="") -> Func:
+    def inner(arg):
+        return dc_func.aggregate.group_concat(arg, separator)
+    return Func(inner=inner, col=col, result_type=str)

datachain-0.6.1/src/datachain/lib/func/func.py ADDED Viewed

@@ -0,0 +1,64 @@
+from typing import TYPE_CHECKING, Callable, Optional
+from datachain.lib.convert.python_to_sql import python_to_sql
+from datachain.lib.utils import DataChainColumnError
+from datachain.query.schema import Column, ColumnMeta
+if TYPE_CHECKING:
+    from datachain import DataType
+    from datachain.lib.signal_schema import SignalSchema
+class Func:
+    def __init__(
+        self,
+        inner: Callable,
+        col: Optional[str] = None,
+        result_type: Optional["DataType"] = None,
+        is_array: bool = False,
+    ) -> None:
+        self.inner = inner
+        self.col = col
+        self.result_type = result_type
+        self.is_array = is_array
+    @property
+    def db_col(self) -> Optional[str]:
+        return ColumnMeta.to_db_name(self.col) if self.col else None
+    def db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
+        if not self.db_col:
+            return None
+        col_type: type = signals_schema.get_column_type(self.db_col)
+        return list[col_type] if self.is_array else col_type  # type: ignore[valid-type]
+    def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
+        col_type = self.db_col_type(signals_schema)
+        if self.result_type:
+            return self.result_type
+        if col_type:
+            return col_type
+        raise DataChainColumnError(
+            str(self.inner),
+            "Column name is required to infer result type",
+        )
+    def get_column(
+        self, signals_schema: "SignalSchema", label: Optional[str] = None
+    ) -> Column:
+        if self.col:
+            if label == "collect":
+                print(label)
+            col_type = self.get_result_type(signals_schema)
+            col = Column(self.db_col, python_to_sql(col_type))
+            func_col = self.inner(col)
+        else:
+            func_col = self.inner()
+        if label:
+            func_col = func_col.label(label)
+        return func_col

{datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/signal_schema.py RENAMED Viewed

@@ -25,7 +25,7 @@ from typing_extensions import Literal as LiteralEx
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.sql_to_python import sql_to_python
 from datachain.lib.convert.unflatten import unflatten_to_json_pos
-from datachain.lib.data_model import DataModel, DataType
+from datachain.lib.data_model import DataModel, DataType, DataValue
 from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
@@ -110,7 +110,7 @@ class SignalSchema:
     values: dict[str, DataType]
     tree: dict[str, Any]
     setup_func: dict[str, Callable]
-    setup_values: Optional[dict[str, Callable]]
+    setup_values: Optional[dict[str, Any]]
     def __init__(
         self,
@@ -333,21 +333,21 @@ class SignalSchema:
                 res[db_name] = python_to_sql(type_)
         return res
-    def row_to_objs(self, row: Sequence[Any]) -> list[DataType]:
+    def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]:
         self._init_setup_values()
-        objs = []
+        objs: list[DataValue] = []
         pos = 0
         for name, fr_type in self.values.items():
             if self.setup_values and (val := self.setup_values.get(name, None)):
                 objs.append(val)
             elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
                 j, pos = unflatten_to_json_pos(fr, row, pos)
-                objs.append(fr(**j))  # type: ignore[arg-type]
+                objs.append(fr(**j))
             else:
                 objs.append(row[pos])
                 pos += 1
-        return objs  # type: ignore[return-value]
+        return objs
     def contains_file(self) -> bool:
         for type_ in self.values.values():
@@ -400,6 +400,12 @@ class SignalSchema:
             if ModelStore.is_pydantic(finfo.annotation):
                 SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
+    def get_column_type(self, col_name: str) -> DataType:
+        for path, _type, has_subtree, _ in self.get_flat_tree():
+            if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name:
+                return _type
+        raise SignalResolvingError([col_name], "is not found")
     def db_signals(
         self, name: Optional[str] = None, as_columns=False
     ) -> Union[list[str], list[Column]]:
@@ -490,7 +496,7 @@ class SignalSchema:
                 new_values[name] = args_map[name]
             else:
                 # adding new signal
-                new_values.update(sql_to_python({name: value}))
+                new_values[name] = sql_to_python(value)
         return SignalSchema(new_values)
@@ -534,12 +540,12 @@ class SignalSchema:
             for name, val in values.items()
         }
-    def get_flat_tree(self) -> Iterator[tuple[list[str], type, bool, int]]:
+    def get_flat_tree(self) -> Iterator[tuple[list[str], DataType, bool, int]]:
         yield from self._get_flat_tree(self.tree, [], 0)
     def _get_flat_tree(
         self, tree: dict, prefix: list[str], depth: int
-    ) -> Iterator[tuple[list[str], type, bool, int]]:
+    ) -> Iterator[tuple[list[str], DataType, bool, int]]:
         for name, (type_, substree) in tree.items():
             suffix = name.split(".")
             new_prefix = prefix + suffix

datachain 0.5.1__tar.gz → 0.6.1__tar.gz

Potentially problematic release.

datachain 0.5.1__tar.gz → 0.6.1__tar.gz