PyPI - datachain - Versions diffs - 0.26.0__tar.gz → 0.26.2__tar.gz - Mend

datachain 0.26.0__tar.gz → 0.26.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (404) [hide / show]

{datachain-0.26.0 → datachain-0.26.2}/.pre-commit-config.yaml RENAMED Viewed

@@ -24,7 +24,7 @@ repos:
       - id: trailing-whitespace
         exclude: '^LICENSES/'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.12.2'
+    rev: 'v0.12.3'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

{datachain-0.26.0 → datachain-0.26.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.26.0
+Version: 0.26.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.16.1; extra == "dev"
+Requires-Dist: mypy==1.17.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"

{datachain-0.26.0 → datachain-0.26.2}/pyproject.toml RENAMED Viewed

@@ -114,7 +114,7 @@ tests = [
 ]
 dev = [
   "datachain[docs,tests]",
-  "mypy==1.16.1",
+  "mypy==1.17.0",
   "types-python-dateutil",
   "types-pytz",
   "types-PyYAML",

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/catalog/loader.py RENAMED Viewed

@@ -18,6 +18,7 @@ WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
 WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
 DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
 DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
+DISTRIBUTED_DISABLED = "DATACHAIN_DISTRIBUTED_DISABLED"
 IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
@@ -103,6 +104,9 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
 def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
+    if os.environ.get(DISTRIBUTED_DISABLED) == "True":
+        return None
     if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
         return None

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/__init__.py RENAMED Viewed

@@ -16,7 +16,7 @@ from .aggregate import (
     sum,
 )
 from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import and_, case, greatest, ifelse, isnone, least, or_
+from .conditional import and_, case, greatest, ifelse, isnone, least, not_, or_
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .path import file_ext, file_stem, name, parent
 from .random import rand
@@ -54,6 +54,7 @@ __all__ = [
     "max",
     "min",
     "name",
+    "not_",
     "or_",
     "parent",
     "path",

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/conditional.py RENAMED Viewed

@@ -3,6 +3,7 @@ from typing import Optional, Union
 from sqlalchemy import ColumnElement
 from sqlalchemy import and_ as sql_and
 from sqlalchemy import case as sql_case
+from sqlalchemy import not_ as sql_not
 from sqlalchemy import or_ as sql_or
 from datachain.lib.utils import DataChainParamsError
@@ -288,3 +289,36 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
             func_args.append(arg)
     return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
+def not_(arg: Union[ColumnElement, Func]) -> Func:
+    """
+    Returns the function that produces NOT of the given expressions.
+    Args:
+        arg (ColumnElement | Func): The expression for NOT statement.
+            If a string is provided, it is assumed to be the name of the column.
+            If a Column is provided, it is assumed to be a column in the dataset.
+            If a Func is provided, it is assumed to be a function returning a value.
+    Returns:
+        Func: A `Func` object that represents the NOT function.
+    Example:
+        ```py
+        dc.mutate(
+            test=not_(C("value") == 5)
+        )
+        ```
+    Notes:
+        - The result column will always be of type bool.
+    """
+    cols, func_args = [], []
+    if isinstance(arg, (str, Func)):
+        cols.append(arg)
+    else:
+        func_args.append(arg)
+    return Func("not", inner=sql_not, cols=cols, args=func_args, result_type=bool)

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/arrow.py RENAMED Viewed

@@ -262,7 +262,7 @@ def _get_hf_schema(
         from datachain.lib.hf import get_output_schema, schema_from_arrow
         features = schema_from_arrow(schema)
-        return features, get_output_schema(features)
+        return features, get_output_schema(features)[0]
     return None

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/data_model.py RENAMED Viewed

@@ -3,6 +3,7 @@ from datetime import datetime
 from typing import ClassVar, Optional, Union, get_args, get_origin
 from pydantic import AliasChoices, BaseModel, Field, create_model
+from pydantic.fields import FieldInfo
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import normalize_col_names
@@ -89,7 +90,16 @@ def dict_to_data_model(
     }
     class _DataModelStrict(BaseModel, extra="forbid"):
-        pass
+        @classmethod
+        def _model_fields_by_aliases(cls) -> dict[str, tuple[str, FieldInfo]]:
+            """Returns a map of aliases to original field names and info."""
+            field_info = {}
+            for _name, field in cls.model_fields.items():
+                assert isinstance(field.validation_alias, AliasChoices)
+                # Add mapping for all aliases (both normalized and original names)
+                for alias in field.validation_alias.choices:
+                    field_info[str(alias)] = (_name, field)
+            return field_info
     return create_model(
         name,

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/datachain.py RENAMED Viewed

@@ -33,7 +33,13 @@ from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
 from datachain.lib.convert.python_to_sql import python_to_sql
-from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
+from datachain.lib.data_model import (
+    DataModel,
+    DataType,
+    DataValue,
+    StandardType,
+    dict_to_data_model,
+)
 from datachain.lib.file import (
     EXPORT_FILES_MAX_THREADS,
     ArrowRow,
@@ -360,14 +366,6 @@ class DataChain:
         self._settings = settings if settings else Settings()
         return self
-    def reset_schema(self, signals_schema: SignalSchema) -> "Self":
-        self.signals_schema = signals_schema
-        return self
-    def add_schema(self, signals_schema: SignalSchema) -> "Self":
-        self.signals_schema |= signals_schema
-        return self
     @classmethod
     def from_storage(
         cls,
@@ -958,7 +956,7 @@ class DataChain:
         query_func = getattr(self._query, method_name)
         new_schema = self.signals_schema.resolve(*args)
-        columns = [C(col) for col in new_schema.db_signals()]
+        columns = new_schema.db_signals(as_columns=True)
         return query_func(*columns, **kwargs)
     @resolve_columns
@@ -1445,10 +1443,6 @@ class DataChain:
             remove_prefetched=remove_prefetched,
         )
-    def remove_file_signals(self) -> "Self":
-        schema = self.signals_schema.clone_without_file_signals()
-        return self.select(*schema.values.keys())
     @delta_disabled
     def merge(
         self,
@@ -1803,12 +1797,19 @@ class DataChain:
         )
         return read_pandas(*args, **kwargs)
-    def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
+    def to_pandas(
+        self,
+        flatten: bool = False,
+        include_hidden: bool = True,
+    ) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.
         Parameters:
-            flatten : Whether to use a multiindex or flatten column names.
-            include_hidden : Whether to include hidden columns.
+            flatten: Whether to use a multiindex or flatten column names.
+            include_hidden: Whether to include hidden columns.
+        Returns:
+            pd.DataFrame: A pandas DataFrame representation of the chain.
         """
         import pandas as pd
@@ -1826,19 +1827,19 @@ class DataChain:
     def show(
         self,
         limit: int = 20,
-        flatten=False,
-        transpose=False,
-        truncate=True,
-        include_hidden=False,
+        flatten: bool = False,
+        transpose: bool = False,
+        truncate: bool = True,
+        include_hidden: bool = False,
     ) -> None:
         """Show a preview of the chain results.
         Parameters:
-            limit : How many rows to show.
-            flatten : Whether to use a multiindex or flatten column names.
-            transpose : Whether to transpose rows and columns.
-            truncate : Whether or not to truncate the contents of columns.
-            include_hidden : Whether to include hidden columns.
+            limit: How many rows to show.
+            flatten: Whether to use a multiindex or flatten column names.
+            transpose: Whether to transpose rows and columns.
+            truncate: Whether or not to truncate the contents of columns.
+            include_hidden: Whether to include hidden columns.
         """
         import pandas as pd
@@ -2268,21 +2269,73 @@ class DataChain:
         )
         return read_records(*args, **kwargs)
-    def sum(self, fr: DataType):  # type: ignore[override]
-        """Compute the sum of a column."""
-        return self._extend_to_data_model("sum", fr)
+    def sum(self, col: str) -> StandardType:  # type: ignore[override]
+        """Compute the sum of a column.
+        Parameters:
+            col: The column to compute the sum for.
+        Returns:
+            The sum of the column values.
+        Example:
+            ```py
+            total_size = chain.sum("file.size")
+            print(f"Total size: {total_size}")
+            ```
+        """
+        return self._extend_to_data_model("sum", col)
+    def avg(self, col: str) -> StandardType:  # type: ignore[override]
+        """Compute the average of a column.
+        Parameters:
+            col: The column to compute the average for.
+        Returns:
+            The average of the column values.
+        Example:
+            ```py
+            average_size = chain.avg("file.size")
+            print(f"Average size: {average_size}")
+            ```
+        """
+        return self._extend_to_data_model("avg", col)
+    def min(self, col: str) -> StandardType:  # type: ignore[override]
+        """Compute the minimum of a column.
+        Parameters:
+            col: The column to compute the minimum for.
+        Returns:
+            The minimum value in the column.
+        Example:
+            ```py
+            min_size = chain.min("file.size")
+            print(f"Minimum size: {min_size}")
+            ```
+        """
+        return self._extend_to_data_model("min", col)
+    def max(self, col: str) -> StandardType:  # type: ignore[override]
+        """Compute the maximum of a column.
-    def avg(self, fr: DataType):  # type: ignore[override]
-        """Compute the average of a column."""
-        return self._extend_to_data_model("avg", fr)
+        Parameters:
+            col: The column to compute the maximum for.
-    def min(self, fr: DataType):  # type: ignore[override]
-        """Compute the minimum of a column."""
-        return self._extend_to_data_model("min", fr)
+        Returns:
+            The maximum value in the column.
-    def max(self, fr: DataType):  # type: ignore[override]
-        """Compute the maximum of a column."""
-        return self._extend_to_data_model("max", fr)
+        Example:
+            ```py
+            max_size = chain.max("file.size")
+            print(f"Maximum size: {max_size}")
+            ```
+        """
+        return self._extend_to_data_model("max", col)
     def setup(self, **kwargs) -> "Self":
         """Setup variables to pass to UDF functions.
@@ -2393,14 +2446,15 @@ class DataChain:
         """Shuffle the rows of the chain deterministically."""
         return self.order_by("sys.rand")
-    def sample(self, n) -> "Self":
+    def sample(self, n: int) -> "Self":
         """Return a random sample from the chain.
         Parameters:
-            n (int): Number of samples to draw.
+            n: Number of samples to draw.
-        NOTE: Samples are not deterministic, and streamed/paginated queries or
-        multiple workers will draw samples with replacement.
+        Note:
+            Samples are not deterministic, and streamed/paginated queries or
+            multiple workers will draw samples with replacement.
         """
         return self._evolve(query=self._query.sample(n))
@@ -2507,6 +2561,10 @@ class DataChain:
     def chunk(self, index: int, total: int) -> "Self":
         """Split a chain into smaller chunks for e.g. parallelization.
+        Parameters:
+            index: The index of the chunk (0-indexed).
+            total: The total number of chunks.
         Example:
             ```py
             import datachain as dc
@@ -2526,7 +2584,7 @@ class DataChain:
         """Returns a list of rows of values, optionally limited to the specified
         columns.
-        Args:
+        Parameters:
             *cols: Limit to the specified columns. By default, all columns are selected.
         Returns:
@@ -2556,7 +2614,7 @@ class DataChain:
     def to_values(self, col: str) -> list[DataValue]:
         """Returns a flat list of values from a single column.
-        Args:
+        Parameters:
             col: The name of the column to extract values from.
         Returns:

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/hf.py RENAMED Viewed

@@ -32,6 +32,7 @@ def read_hf(
     Parameters:
         dataset : Path or name of the dataset to read from Hugging Face Hub,
             or an instance of `datasets.Dataset`-like object.
+        args : Additional positional arguments to pass to datasets.load_dataset.
         session : Session to use for the chain.
         settings : Settings to use for the chain.
         column : Generated object column name.
@@ -64,8 +65,9 @@ def read_hf(
     model_name = model_name or column or ""
     hf_features = next(iter(ds_dict.values())).features
-    output = output | get_output_schema(hf_features)
-    model = dict_to_data_model(model_name, output)
+    hf_output, normalized_names = get_output_schema(hf_features, list(output.keys()))
+    output = output | hf_output
+    model = dict_to_data_model(model_name, output, list(normalized_names.values()))
     if column:
         output = {column: model}

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/hf.py RENAMED Viewed

@@ -26,7 +26,7 @@ except ImportError as exc:
     ) from exc
 from io import BytesIO
-from typing import TYPE_CHECKING, Any, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 import PIL
 from tqdm.auto import tqdm
@@ -34,6 +34,7 @@ from tqdm.auto import tqdm
 from datachain.lib.arrow import arrow_type_mapper
 from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.udf import Generator
+from datachain.lib.utils import normalize_col_names
 if TYPE_CHECKING:
     import pyarrow as pa
@@ -94,14 +95,18 @@ class HFGenerator(Generator):
         ds = self.ds_dict[split]
         if split:
             desc += f" split '{split}'"
+        model_fields = self.output_schema._model_fields_by_aliases()  # type: ignore[attr-defined]
         with tqdm(desc=desc, unit=" rows", leave=False) as pbar:
             for row in ds:
                 output_dict = {}
                 if split and "split" in self.output_schema.model_fields:
                     output_dict["split"] = split
                 for name, feat in ds.features.items():
-                    anno = self.output_schema.model_fields[name].annotation
-                    output_dict[name] = convert_feature(row[name], feat, anno)
+                    normalized_name, info = model_fields[name]
+                    anno = info.annotation
+                    output_dict[normalized_name] = convert_feature(
+                        row[name], feat, anno
+                    )
                 yield self.output_schema(**output_dict)
                 pbar.update(1)
@@ -122,10 +127,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
         return HFClassLabel(string=feat.names[val], integer=val)
     if isinstance(feat, dict):
         sdict = {}
+        model_fields = anno._model_fields_by_aliases()  # type: ignore[attr-defined]
         for sname in val:
             sfeat = feat[sname]
-            sanno = anno.model_fields[sname].annotation
-            sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
+            norm_name, info = model_fields[sname]
+            sanno = info.annotation
+            sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
         return anno(**sdict)
     if isinstance(feat, Image):
         if isinstance(val, dict):
@@ -135,12 +142,26 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
         return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
-def get_output_schema(features: Features) -> dict[str, DataType]:
-    """Generate UDF output schema from huggingface datasets features."""
+def get_output_schema(
+    features: Features, existing_column_names: Optional[list[str]] = None
+) -> tuple[dict[str, DataType], dict[str, str]]:
+    """
+    Generate UDF output schema from Hugging Face datasets features. It normalizes the
+    column names and returns a mapping of normalized names to original names along with
+    the data types. `existing_column_names` is the list of column names that already
+    exist in the dataset (to avoid name collisions due to normalization).
+    """
+    existing_column_names = existing_column_names or []
     fields_dict = {}
-    for name, val in features.items():
-        fields_dict[name] = _feature_to_chain_type(name, val)
-    return fields_dict
+    normalized_names = normalize_col_names(
+        existing_column_names + list(features.keys())
+    )
+    # List of tuple(str, str) for HF dataset feature names, (normalized, original)
+    new_feature_names = list(normalized_names.items())[len(existing_column_names) :]
+    for idx, feat in enumerate(features.items()):
+        name, val = feat
+        fields_dict[new_feature_names[idx][0]] = _feature_to_chain_type(name, val)
+    return fields_dict, normalized_names
 def _feature_to_chain_type(name: str, val: Any) -> DataType:  # noqa: PLR0911

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/pytorch.py RENAMED Viewed

@@ -125,7 +125,10 @@ class PytorchDataset(IterableDataset):
         ds = read_dataset(
             name=self.name, version=self.version, session=session
         ).settings(cache=self.cache, prefetch=self.prefetch)
-        ds = ds.remove_file_signals()
+        # remove file signals from dataset
+        schema = ds.signals_schema.clone_without_file_signals()
+        ds = ds.select(*schema.values.keys())
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/signal_schema.py RENAMED Viewed

@@ -610,20 +610,25 @@ class SignalSchema:
         return SignalSchema(schema)
     def _find_in_tree(self, path: list[str]) -> DataType:
+        if val := self.tree.get(".".join(path)):
+            # If the path is a single string, we can directly access it
+            # without traversing the tree.
+            return val[0]
         curr_tree = self.tree
         curr_type = None
         i = 0
         while curr_tree is not None and i < len(path):
             if val := curr_tree.get(path[i]):
                 curr_type, curr_tree = val
-            elif i == 0 and len(path) > 1 and (val := curr_tree.get(".".join(path))):
-                curr_type, curr_tree = val
-                break
             else:
                 curr_type = None
+                break
             i += 1
-        if curr_type is None:
+        if curr_type is None or i < len(path):
+            # If we reached the end of the path and didn't find a type,
+            # or if we didn't traverse the entire path, raise an error.
             raise SignalResolvingError(path, "is not found")
         return curr_type

{datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/dataset.py RENAMED Viewed

@@ -559,7 +559,13 @@ class UDFStep(Step, ABC):
         """
         Create temporary table with group by partitions.
         """
+        # Check if partition_by is set, we need it to create partitions.
         assert self.partition_by is not None
+        # Check if sys__id is in the query, we need it to be able to join
+        # the partition table with the udf table later.
+        assert any(c.name == "sys__id" for c in query.selected_columns), (
+            "Query must have sys__id column to use partitioning."
+        )
         if isinstance(self.partition_by, (list, tuple, GeneratorType)):
             list_partition_by = list(self.partition_by)
@@ -606,6 +612,22 @@ class UDFStep(Step, ABC):
         # Apply partitioning if needed.
         if self.partition_by is not None:
+            if not any(c.name == "sys__id" for c in query.selected_columns):
+                # If sys__id is not in the query, we need to create a temp table
+                # to hold the query results, so we can join it with the
+                # partition table later.
+                columns = [
+                    c if isinstance(c, Column) else Column(c.name, c.type)
+                    for c in query.subquery().columns
+                ]
+                temp_table = self.catalog.warehouse.create_dataset_rows_table(
+                    self.catalog.warehouse.temp_table_name(),
+                    columns=columns,
+                )
+                temp_tables.append(temp_table.name)
+                self.catalog.warehouse.copy_table(temp_table, query)
+                _query = query = temp_table.select()
             partition_tbl = self.create_partitions_table(query)
             temp_tables.append(partition_tbl.name)
             query = query.outerjoin(

{datachain-0.26.0 → datachain-0.26.2}/src/datachain.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.26.0
+Version: 0.26.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.16.1; extra == "dev"
+Requires-Dist: mypy==1.17.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"

{datachain-0.26.0 → datachain-0.26.2}/src/datachain.egg-info/requires.txt RENAMED Viewed

@@ -41,7 +41,7 @@ soundfile
 [dev]
 datachain[docs,tests]
-mypy==1.16.1
+mypy==1.17.0
 types-python-dateutil
 types-pytz
 types-PyYAML

{datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/test_conditional.py RENAMED Viewed

@@ -5,7 +5,7 @@ from datachain import func
 from tests.utils import skip_if_not_sqlite
-def test_conditional_and_or(test_session):
+def test_conditional_and_or_not(test_session):
     class Data(dc.DataModel):
         i: int
         f: float
@@ -25,11 +25,12 @@ def test_conditional_and_or(test_session):
             t2=func.and_(dc.C("data.i") > 15, dc.C("data.f") > 2.5),
             t3=func.or_(dc.C("data.i") > 15, dc.C("data.f") > 1.5),
             t4=func.or_(dc.C("data.i") > 15, dc.C("data.f") > 2.5),
+            t5=func.not_(dc.C("data.i") > 15),
         )
         .order_by("id")
-    ).to_list("t1", "t2", "t3", "t4")
+    ).to_list("t1", "t2", "t3", "t4", "t5")
-    assert ds == [(0, 0, 0, 0), (1, 0, 1, 1), (1, 1, 1, 1)]
+    assert ds == [(0, 0, 0, 0, 1), (1, 0, 1, 1, 0), (1, 1, 1, 1, 0)]
 def test_conditional_case(test_session):

{datachain-0.26.0 → datachain-0.26.2}/tests/func/test_dataset_query.py RENAMED Viewed

@@ -227,7 +227,7 @@ def test_select_missing_column(cloud_test_catalog, animal_dataset):
     ds1 = ds.select(C.missing_column_name)
     ds2 = ds.select("missing_column_name")
     # The exception type varies by database backend
-    exc1 = pytest.raises(Exception, ds1.db_results)
+    exc1 = pytest.raises(Exception, ds1.db_results)  # noqa: B017
     assert "missing_column_name" in str(exc1.value)
     exc2 = pytest.raises(KeyError, ds2.db_results)
     assert "missing_column_name" in str(exc2.value)

{datachain-0.26.0 → datachain-0.26.2}/tests/func/test_hf.py RENAMED Viewed

@@ -34,10 +34,11 @@ def test_hf_image(tmp_path):
     img.save(train_dir / "img1.png")
     ds = load_dataset("imagefolder", data_dir=tmp_path)
-    schema = {"split": str} | get_output_schema(ds["train"].features)
+    hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
+    schema = {"split": str} | hf_schema
     assert schema["image"] is HFImage
-    gen = HFGenerator(ds, dict_to_data_model("", schema))
+    gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
     gen.setup()
     row = next(iter(gen.process("train")))
     assert row.image.img == image_to_bytes(img)
@@ -56,9 +57,10 @@ def test_hf_audio(tmp_path):
     write(train_dir / "example.wav", samplerate, data.astype(np.int16))
     ds = load_dataset("audiofolder", data_dir=tmp_path)
-    schema = {"split": str} | get_output_schema(ds["train"].features)
+    hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
+    schema = {"split": str} | hf_schema
-    gen = HFGenerator(ds, dict_to_data_model("", schema))
+    gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
     gen.setup()
     row = next(iter(gen.process("train")))
     assert np.allclose(row.audio.array, data / amplitude, atol=1e-4)

datachain 0.26.0__tar.gz → 0.26.2__tar.gz

Potentially problematic release.

datachain 0.26.0__tar.gz → 0.26.2__tar.gz