PyPI - datachain - Versions diffs - 0.6.9__tar.gz → 0.6.11__tar.gz - Mend

datachain 0.6.9.tar.gz → 0.6.11.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (265) hide show

{datachain-0.6.9/src/datachain.egg-info → datachain-0.6.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.9
+Version: 0.6.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -71,7 +71,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
+Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"

{datachain-0.6.9 → datachain-0.6.11}/mkdocs.yml RENAMED Viewed

@@ -15,7 +15,7 @@ validation:
 theme:
   name: material
-  custom_dir: overrides
+  custom_dir: docs/overrides
   logo: assets/datachain-white.svg
   favicon: assets/datachain.svg
   icon:

{datachain-0.6.9 → datachain-0.6.11}/pyproject.toml RENAMED Viewed

@@ -82,7 +82,7 @@ tests = [
   "pytest-sugar>=0.9.6",
   "pytest-cov>=4.1.0",
   "pytest-mock>=3.12.0",
-  "pytest-servers[all]>=0.5.7",
+  "pytest-servers[all]>=0.5.8",
   "pytest-benchmark[histogram]",
   "pytest-xdist>=3.3.1",
   "virtualenv",

{datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/catalog.py RENAMED Viewed

@@ -603,9 +603,10 @@ class Catalog:
         )
         lst = Listing(
+            self.metastore.clone(),
             self.warehouse.clone(),
             Client.get_client(list_uri, self.cache, **self.client_config),
-            self.get_dataset(list_ds_name),
+            dataset_name=list_ds_name,
             object_name=object_name,
         )
@@ -698,9 +699,13 @@ class Catalog:
                     client = self.get_client(source, **client_config)
                     uri = client.uri
-                    st = self.warehouse.clone()
                     dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
-                    listing = Listing(st, client, self.get_dataset(dataset_name))
+                    listing = Listing(
+                        self.metastore.clone(),
+                        self.warehouse.clone(),
+                        client,
+                        dataset_name=dataset_name,
+                    )
                     rows = DatasetQuery(
                         name=dataset.name, version=ds_version, catalog=self
                     ).to_db_records()
@@ -1354,6 +1359,13 @@ class Catalog:
             # we will create new one if it doesn't exist
             pass
+        if dataset and version and dataset.has_version(version):
+            """No need to communicate with Studio at all"""
+            dataset_uri = create_dataset_uri(remote_dataset_name, version)
+            print(f"Local copy of dataset {dataset_uri} already present")
+            _instantiate_dataset()
+            return
         remote_dataset = self.get_remote_dataset(remote_dataset_name)
         # if version is not specified in uri, take the latest one
         if not version:

{datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/sqlite.py RENAMED Viewed

@@ -747,8 +747,12 @@ class SQLiteWarehouse(AbstractWarehouse):
         ids = self.db.execute(select_ids).fetchall()
-        select_q = query.with_only_columns(
-            *[c for c in query.selected_columns if c.name != "sys__id"]
+        select_q = (
+            query.with_only_columns(
+                *[c for c in query.selected_columns if c.name != "sys__id"]
+            )
+            .offset(None)
+            .limit(None)
         )
         for batch in batched_it(ids, 10_000):

{datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/dc.py RENAMED Viewed

@@ -642,6 +642,59 @@ class DataChain:
         }
         return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+    def explode(
+        self,
+        col: str,
+        model_name: Optional[str] = None,
+        object_name: Optional[str] = None,
+    ) -> "DataChain":
+        """Explodes a column containing JSON objects (dict or str DataChain type) into
+           individual columns based on the schema of the JSON. Schema is inferred from
+           the first row of the column.
+        Args:
+            col: the name of the column containing JSON to be exploded.
+            model_name: optional generated model name.  By default generates the name
+                automatically.
+            object_name: optional generated object column name. By default generates the
+                name automatically.
+        Returns:
+            DataChain: A new DataChain instance with the new set of columns.
+        """
+        import json
+        import pyarrow as pa
+        from datachain.lib.arrow import schema_to_output
+        json_value = next(self.limit(1).collect(col))
+        json_dict = (
+            json.loads(json_value) if isinstance(json_value, str) else json_value
+        )
+        if not isinstance(json_dict, dict):
+            raise TypeError(f"Column {col} should be a string or dict type with JSON")
+        schema = pa.Table.from_pylist([json_dict]).schema
+        output = schema_to_output(schema, None)
+        if not model_name:
+            model_name = f"{col.title()}ExplodedModel"
+        model = dict_to_data_model(model_name, output)
+        def json_to_model(json_value: Union[str, dict]):
+            json_dict = (
+                json.loads(json_value) if isinstance(json_value, str) else json_value
+            )
+            return model.model_validate(json_dict)
+        if not object_name:
+            object_name = f"{col}_expl"
+        return self.map(json_to_model, params=col, output={object_name: model})
     @classmethod
     def datasets(
         cls,

{datachain-0.6.9 → datachain-0.6.11}/src/datachain/listing.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import glob
 import os
 from collections.abc import Iterable, Iterator
+from functools import cached_property
 from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
@@ -15,28 +16,34 @@ from datachain.utils import suffix_to_number
 if TYPE_CHECKING:
     from datachain.catalog.datasource import DataSource
     from datachain.client import Client
-    from datachain.data_storage import AbstractWarehouse
+    from datachain.data_storage import AbstractMetastore, AbstractWarehouse
     from datachain.dataset import DatasetRecord
 class Listing:
     def __init__(
         self,
+        metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
         client: "Client",
-        dataset: Optional["DatasetRecord"],
+        dataset_name: Optional["str"] = None,
+        dataset_version: Optional[int] = None,
         object_name: str = "file",
     ):
+        self.metastore = metastore
         self.warehouse = warehouse
         self.client = client
-        self.dataset = dataset  # dataset representing bucket listing
+        self.dataset_name = dataset_name  # dataset representing bucket listing
+        self.dataset_version = dataset_version  # dataset representing bucket listing
         self.object_name = object_name
     def clone(self) -> "Listing":
         return self.__class__(
+            self.metastore.clone(),
             self.warehouse.clone(),
             self.client,
-            self.dataset,
+            self.dataset_name,
+            self.dataset_version,
             self.object_name,
         )
@@ -53,12 +60,22 @@ class Listing:
     def uri(self):
         from datachain.lib.listing import listing_uri_from_name
-        return listing_uri_from_name(self.dataset.name)
+        assert self.dataset_name
-    @property
+        return listing_uri_from_name(self.dataset_name)
+    @cached_property
+    def dataset(self) -> "DatasetRecord":
+        assert self.dataset_name
+        return self.metastore.get_dataset(self.dataset_name)
+    @cached_property
     def dataset_rows(self):
+        dataset = self.dataset
         return self.warehouse.dataset_rows(
-            self.dataset, self.dataset.latest_version, object_name=self.object_name
+            dataset,
+            self.dataset_version or dataset.latest_version,
+            object_name=self.object_name,
         )
     def expand_path(self, path, use_glob=True) -> list[Node]:

{datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/types.py RENAMED Viewed

@@ -36,7 +36,14 @@ def convert_array(arr):
 def adapt_np_array(arr):
-    return orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")
+    def _json_serialize(obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return obj
+    return orjson.dumps(
+        arr, option=orjson.OPT_SERIALIZE_NUMPY, default=_json_serialize
+    ).decode("utf-8")
 def adapt_np_generic(val):

datachain-0.6.11/src/datachain/toolkit/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .split import train_test_split
+__all__ = ["train_test_split"]

datachain-0.6.11/src/datachain/toolkit/split.py ADDED Viewed

@@ -0,0 +1,67 @@
+from datachain import C, DataChain
+def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
+    """
+    Splits a DataChain into multiple subsets based on the provided weights.
+    This function partitions the rows or items of a DataChain into disjoint subsets,
+    ensuring that the relative sizes of the subsets correspond to the given weights.
+    It is particularly useful for creating training, validation, and test datasets.
+    Args:
+        dc (DataChain):
+            The DataChain instance to split.
+        weights (list[float]):
+            A list of weights indicating the relative proportions of the splits.
+            The weights do not need to sum to 1; they will be normalized internally.
+            For example:
+            - `[0.7, 0.3]` corresponds to a 70/30 split;
+            - `[2, 1, 1]` corresponds to a 50/25/25 split.
+    Returns:
+        list[DataChain]:
+            A list of DataChain instances, one for each weight in the weights list.
+    Examples:
+        Train-test split:
+        ```python
+        from datachain import DataChain
+        from datachain.toolkit import train_test_split
+        # Load a DataChain from a storage source (e.g., S3 bucket)
+        dc = DataChain.from_storage("s3://bucket/dir/")
+        # Perform a 70/30 train-test split
+        train, test = train_test_split(dc, [0.7, 0.3])
+        # Save the resulting splits
+        train.save("dataset_train")
+        test.save("dataset_test")
+        ```
+        Train-test-validation split:
+        ```python
+        train, test, val = train_test_split(dc, [0.7, 0.2, 0.1])
+        train.save("dataset_train")
+        test.save("dataset_test")
+        val.save("dataset_val")
+        ```
+    Note:
+        The splits are random but deterministic, based on Dataset `sys__rand` field.
+    """
+    if len(weights) < 2:
+        raise ValueError("Weights should have at least two elements")
+    if any(weight < 0 for weight in weights):
+        raise ValueError("Weights should be non-negative")
+    weights_normalized = [weight / sum(weights) for weight in weights]
+    return [
+        dc.filter(
+            C("sys__rand") % 1000 >= round(sum(weights_normalized[:index]) * 1000),
+            C("sys__rand") % 1000 < round(sum(weights_normalized[: index + 1]) * 1000),
+        )
+        for index, _ in enumerate(weights_normalized)
+    ]

{datachain-0.6.9 → datachain-0.6.11/src/datachain.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.9
+Version: 0.6.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -71,7 +71,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
+Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"

{datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/SOURCES.txt RENAMED Viewed

@@ -23,6 +23,7 @@ docs/index.md
 docs/assets/captioned_cartoons.png
 docs/assets/datachain-white.svg
 docs/assets/datachain.svg
+docs/overrides/main.html
 docs/references/datachain.md
 docs/references/datatype.md
 docs/references/file.md
@@ -48,7 +49,6 @@ examples/multimodal/hf_pipeline.py
 examples/multimodal/openai_image_desc_lib.py
 examples/multimodal/wds.py
 examples/multimodal/wds_filtered.py
-overrides/main.html
 src/datachain/__init__.py
 src/datachain/__main__.py
 src/datachain/asyn.py
@@ -160,6 +160,8 @@ src/datachain/sql/sqlite/__init__.py
 src/datachain/sql/sqlite/base.py
 src/datachain/sql/sqlite/types.py
 src/datachain/sql/sqlite/vector.py
+src/datachain/toolkit/__init__.py
+src/datachain/toolkit/split.py
 src/datachain/torch/__init__.py
 tests/__init__.py
 tests/conftest.py
@@ -197,6 +199,7 @@ tests/func/test_metrics.py
 tests/func/test_pull.py
 tests/func/test_pytorch.py
 tests/func/test_query.py
+tests/func/test_toolkit.py
 tests/scripts/feature_class.py
 tests/scripts/feature_class_exception.py
 tests/scripts/feature_class_parallel.py
@@ -256,4 +259,5 @@ tests/unit/sql/test_random.py
 tests/unit/sql/test_selectable.py
 tests/unit/sql/test_string.py
 tests/unit/sql/sqlite/__init__.py
+tests/unit/sql/sqlite/test_types.py
 tests/unit/sql/sqlite/test_utils.py

{datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/requires.txt RENAMED Viewed

@@ -70,7 +70,7 @@ pytest<9,>=8
 pytest-sugar>=0.9.6
 pytest-cov>=4.1.0
 pytest-mock>=3.12.0
-pytest-servers[all]>=0.5.7
+pytest-servers[all]>=0.5.8
 pytest-benchmark[histogram]
 pytest-xdist>=3.3.1
 virtualenv

{datachain-0.6.9 → datachain-0.6.11}/tests/conftest.py RENAMED Viewed

@@ -22,7 +22,7 @@ from datachain.data_storage.sqlite import (
     SQLiteWarehouse,
 )
 from datachain.dataset import DatasetRecord
-from datachain.lib.dc import DataChain
+from datachain.lib.dc import DataChain, Sys
 from datachain.query.session import Session
 from datachain.utils import (
     ENV_DATACHAIN_GLOBAL_CONFIG_DIR,
@@ -701,3 +701,43 @@ def studio_datasets(requests_mock):
     ]
     requests_mock.post(f"{STUDIO_URL}/api/datachain/ls-datasets", json=datasets)
+@pytest.fixture
+def not_random_ds(test_session):
+    return DataChain.from_records(
+        [
+            {"sys__id": 1, "sys__rand": 50, "fib": 0},
+            {"sys__id": 2, "sys__rand": 150, "fib": 1},
+            {"sys__id": 3, "sys__rand": 250, "fib": 1},
+            {"sys__id": 4, "sys__rand": 350, "fib": 2},
+            {"sys__id": 5, "sys__rand": 450, "fib": 3},
+            {"sys__id": 6, "sys__rand": 550, "fib": 5},
+            {"sys__id": 7, "sys__rand": 650, "fib": 8},
+            {"sys__id": 8, "sys__rand": 750, "fib": 13},
+            {"sys__id": 9, "sys__rand": 850, "fib": 21},
+            {"sys__id": 10, "sys__rand": 950, "fib": 34},
+        ],
+        session=test_session,
+        schema={"sys": Sys, "fib": int},
+    )
+@pytest.fixture
+def pseudo_random_ds(test_session):
+    return DataChain.from_records(
+        [
+            {"sys__id": 1, "sys__rand": 1344339883, "fib": 0},
+            {"sys__id": 2, "sys__rand": 3901153096, "fib": 1},
+            {"sys__id": 3, "sys__rand": 4255991360, "fib": 1},
+            {"sys__id": 4, "sys__rand": 2526403609, "fib": 2},
+            {"sys__id": 5, "sys__rand": 1871733386, "fib": 3},
+            {"sys__id": 6, "sys__rand": 9380910850, "fib": 5},
+            {"sys__id": 7, "sys__rand": 2770679740, "fib": 8},
+            {"sys__id": 8, "sys__rand": 2538886575, "fib": 13},
+            {"sys__id": 9, "sys__rand": 3969542617, "fib": 21},
+            {"sys__id": 10, "sys__rand": 7541790992, "fib": 34},
+        ],
+        session=test_session,
+        schema={"sys": Sys, "fib": int},
+    )

{datachain-0.6.9 → datachain-0.6.11}/tests/func/test_dataset_query.py RENAMED Viewed

@@ -459,6 +459,72 @@ def test_order_by_limit(cloud_test_catalog, save, animal_dataset):
     ]
+@pytest.mark.parametrize("save", [True, False])
+def test_limit(cloud_test_catalog, save, animal_dataset):
+    catalog = cloud_test_catalog.catalog
+    q = (
+        DatasetQuery(animal_dataset.name, catalog=catalog)
+        .order_by(C("file.path"))
+        .limit(2)
+    )
+    if save:
+        ds_name = "animals_cats"
+        q.save(ds_name)
+        result = DatasetQuery(name=ds_name, catalog=catalog).db_results()
+        dataset_record = catalog.get_dataset(ds_name)
+        assert dataset_record.status == DatasetStatus.COMPLETE
+    else:
+        result = q.db_results()
+    assert len(result) == 2
+    assert [posixpath.basename(r[3]) for r in result] == ["cat1", "cat2"]
+@pytest.mark.parametrize("save", [True, False])
+def test_offset_limit(cloud_test_catalog, save, animal_dataset):
+    catalog = cloud_test_catalog.catalog
+    q = (
+        DatasetQuery(animal_dataset.name, catalog=catalog)
+        .order_by(C("file.path"))
+        .offset(3)
+        .limit(2)
+    )
+    if save:
+        ds_name = "animals_cats"
+        q.save(ds_name)
+        result = DatasetQuery(name=ds_name, catalog=catalog).db_results()
+        dataset_record = catalog.get_dataset(ds_name)
+        assert dataset_record.status == DatasetStatus.COMPLETE
+    else:
+        result = q.db_results()
+    assert len(result) == 2
+    assert [posixpath.basename(r[3]) for r in result] == ["dog1", "dog2"]
+@pytest.mark.parametrize("save", [True, False])
+def test_mutate_offset_limit(cloud_test_catalog, save, animal_dataset):
+    catalog = cloud_test_catalog.catalog
+    q = (
+        DatasetQuery(animal_dataset.name, catalog=catalog)
+        .order_by(C("file.path"))
+        .mutate(size10x=C("file.size") * 10)
+        .offset(3)
+        .limit(2)
+    )
+    if save:
+        ds_name = "animals_cats"
+        q.save(ds_name)
+        result = DatasetQuery(name=ds_name, catalog=catalog).db_results()
+        dataset_record = catalog.get_dataset(ds_name)
+        assert dataset_record.status == DatasetStatus.COMPLETE
+    else:
+        result = q.db_results()
+    assert len(result) == 2
+    assert [posixpath.basename(r[3]) for r in result] == ["dog1", "dog2"]
 @pytest.mark.parametrize(
     "cloud_type,version_aware",
     [("s3", True)],

{datachain-0.6.9 → datachain-0.6.11}/tests/func/test_pull.py RENAMED Viewed

@@ -6,12 +6,13 @@ import lz4.frame
 import pandas as pd
 import pytest
+from datachain.client.fsspec import Client
 from datachain.config import Config, ConfigLevel
 from datachain.dataset import DatasetStatus
 from datachain.error import DataChainError
 from datachain.utils import STUDIO_URL, JSONSerialize
 from tests.data import ENTRIES
-from tests.utils import assert_row_names, skip_if_not_sqlite
+from tests.utils import assert_row_names, skip_if_not_sqlite, tree_from_path
 DATASET_UUID = "20f5a2f1-fc9a-4e36-8b91-5a530f289451"
@@ -40,10 +41,11 @@ def dog_entries():
 @pytest.fixture
-def dog_entries_parquet_lz4(dog_entries) -> bytes:
+def dog_entries_parquet_lz4(dog_entries, cloud_test_catalog) -> bytes:
     """
     Returns dogs entries in lz4 compressed parquet format
     """
+    src_uri = cloud_test_catalog.src_uri
     def _adapt_row(row):
         """
@@ -61,7 +63,7 @@ def dog_entries_parquet_lz4(dog_entries) -> bytes:
         adapted["sys__id"] = 1
         adapted["sys__rand"] = 1
         adapted["file__location"] = ""
-        adapted["file__source"] = "s3://dogs"
+        adapted["file__source"] = src_uri
         return adapted
     dog_entries = [_adapt_row(e) for e in dog_entries]
@@ -141,6 +143,7 @@ def remote_dataset(remote_dataset_version, schema):
 @pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
 @pytest.mark.parametrize("dataset_uri", ["ds://dogs@v1", "ds://dogs"])
+@pytest.mark.parametrize("instantiate", [True, False])
 @skip_if_not_sqlite
 def test_pull_dataset_success(
     requests_mock,
@@ -148,7 +151,10 @@ def test_pull_dataset_success(
     remote_dataset,
     dog_entries_parquet_lz4,
     dataset_uri,
+    instantiate,
 ):
+    src_uri = cloud_test_catalog.src_uri
+    working_dir = cloud_test_catalog.working_dir
     data_url = (
         "https://studio-blobvault.s3.amazonaws.com/datachain_ds_export_1_0.parquet.lz4"
     )
@@ -165,9 +171,16 @@ def test_pull_dataset_success(
     requests_mock.get(data_url, content=dog_entries_parquet_lz4)
     catalog = cloud_test_catalog.catalog
-    catalog.pull_dataset(dataset_uri, no_cp=True)
-    # trying to pull multiple times as it should work
-    catalog.pull_dataset(dataset_uri, no_cp=True)
+    dest = None
+    if instantiate:
+        dest = working_dir / "data"
+        dest.mkdir()
+        catalog.pull_dataset(dataset_uri, output=str(dest), no_cp=False)
+    else:
+        # trying to pull multiple times since that should work as well
+        catalog.pull_dataset(dataset_uri, no_cp=True)
+        catalog.pull_dataset(dataset_uri, no_cp=True)
     dataset = catalog.get_dataset("dogs")
     assert dataset.versions_values == [1]
@@ -196,6 +209,20 @@ def test_pull_dataset_success(
         },
     )
+    client = Client.get_client(src_uri, None)
+    if instantiate:
+        assert tree_from_path(dest) == {
+            f"{client.name}": {
+                "dogs": {
+                    "dog1": "woof",
+                    "dog2": "arf",
+                    "dog3": "bark",
+                    "others": {"dog4": "ruff"},
+                }
+            }
+        }
 @pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
 @skip_if_not_sqlite

datachain-0.6.11/tests/func/test_toolkit.py ADDED Viewed

@@ -0,0 +1,42 @@
+import pytest
+from datachain.toolkit import train_test_split
+@pytest.mark.parametrize(
+    "weights,expected",
+    [
+        [[1, 1], [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]],
+        [[4, 1], [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10]]],
+        [[0.7, 0.2, 0.1], [[1, 2, 3, 4, 5, 6, 7], [8, 9], [10]]],
+    ],
+)
+def test_train_test_split_not_random(not_random_ds, weights, expected):
+    res = train_test_split(not_random_ds, weights)
+    assert len(res) == len(expected)
+    for i, dc in enumerate(res):
+        assert list(dc.collect("sys.id")) == expected[i]
+@pytest.mark.parametrize(
+    "weights,expected",
+    [
+        [[1, 1], [[2, 3, 5], [1, 4, 6, 7, 8, 9, 10]]],
+        [[4, 1], [[2, 3, 4, 5, 7, 8, 9], [1, 6, 10]]],
+        [[0.7, 0.2, 0.1], [[2, 3, 4, 5, 8, 9], [1, 6, 7], [10]]],
+    ],
+)
+def test_train_test_split_random(pseudo_random_ds, weights, expected):
+    res = train_test_split(pseudo_random_ds, weights)
+    assert len(res) == len(expected)
+    for i, dc in enumerate(res):
+        assert list(dc.collect("sys.id")) == expected[i]
+def test_train_test_split_errors(not_random_ds):
+    with pytest.raises(ValueError, match="Weights should have at least two elements"):
+        train_test_split(not_random_ds, [0.5])
+    with pytest.raises(ValueError, match="Weights should be non-negative"):
+        train_test_split(not_random_ds, [-1, 1])

datachain 0.6.9__tar.gz → 0.6.11__tar.gz

Potentially problematic release.

datachain 0.6.9.tar.gz → 0.6.11.tar.gz