PyPI - datachain - Versions diffs - 0.37.6__py3-none-any.whl → 0.37.8__py3-none-any.whl - Mend

datachain 0.37.6__py3-none-any.whl → 0.37.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (9) hide show

datachain/lib/dc/datachain.py CHANGED Viewed

@@ -52,7 +52,11 @@ from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import DataChainColumnError, DataChainParamsError
 from datachain.project import Project
 from datachain.query import Session
-from datachain.query.dataset import DatasetQuery, PartitionByType
+from datachain.query.dataset import (
+    DatasetQuery,
+    PartitionByType,
+    RegenerateSystemColumns,
+)
 from datachain.query.schema import DEFAULT_DELIMITER, Column
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import batched_it, env2bool, inside_notebook, row_to_nested_dict
@@ -577,7 +581,8 @@ class DataChain:
             create=True,
         )
         return self._evolve(
-            query=self._query.save(project=project, feature_schema=schema)
+            query=self._query.save(project=project, feature_schema=schema),
+            signal_schema=self.signals_schema | SignalSchema({"sys": Sys}),
         )
     def _calculate_job_hash(self, job_id: str) -> str:
@@ -2739,8 +2744,20 @@ class DataChain:
         )
     def shuffle(self) -> "Self":
-        """Shuffle the rows of the chain deterministically."""
-        return self.order_by("sys.rand")
+        """Shuffle rows with a best-effort deterministic ordering.
+        This produces repeatable shuffles. Merge and union operations can
+        lead to non-deterministic results. Use order by or save a dataset
+        afterward to guarantee the same result.
+        """
+        query = self._query.clone(new_table=False)
+        query.steps.append(RegenerateSystemColumns(self._query.catalog))
+        chain = self._evolve(
+            query=query,
+            signal_schema=SignalSchema({"sys": Sys}) | self.signals_schema,
+        )
+        return chain.order_by("sys.rand")
     def sample(self, n: int) -> "Self":
         """Return a random sample from the chain.

datachain/query/dataset.py CHANGED Viewed

@@ -786,10 +786,31 @@ class SQLClause(Step, ABC):
         return tuple(c.get_column() if isinstance(c, Function) else c for c in cols)
     @abstractmethod
-    def apply_sql_clause(self, query):
+    def apply_sql_clause(self, query: Any) -> Any:
         pass
+@frozen
+class RegenerateSystemColumns(Step):
+    catalog: "Catalog"
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(b"regenerate_system_columns").hexdigest()
+    def apply(
+        self, query_generator: QueryGenerator, temp_tables: list[str]
+    ) -> StepResult:
+        query = query_generator.select()
+        new_query = self.catalog.warehouse._regenerate_system_columns(
+            query, keep_existing_columns=True
+        )
+        def q(*columns):
+            return new_query.with_only_columns(*columns)
+        return step_result(q, new_query.selected_columns)
 @frozen
 class SQLSelect(SQLClause):
     args: tuple[Function | ColumnElement, ...]
@@ -1488,10 +1509,6 @@ class DatasetQuery:
         finally:
             self.cleanup()
-    def shuffle(self) -> "Self":
-        # ToDo: implement shaffle based on seed and/or generating random column
-        return self.order_by(C.sys__rand)
     def sample(self, n) -> "Self":
         """
         Return a random sample from the dataset.

datachain/toolkit/split.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import random
 from datachain import C, DataChain
+from datachain.lib.signal_schema import SignalResolvingError
 RESOLUTION = 2**31 - 1  # Maximum positive value for a 32-bit signed integer.
@@ -59,7 +60,10 @@ def train_test_split(
         ```
     Note:
-        The splits are random but deterministic, based on Dataset `sys__rand` field.
+        Splits reuse the same best-effort shuffle used by `DataChain.shuffle`. Results
+        are typically repeatable, but earlier operations such as `merge`, `union`, or
+        custom SQL that reshuffle rows can change the outcome between runs. Add order by
+        stable keys first when you need strict reproducibility.
     """
     if len(weights) < 2:
         raise ValueError("Weights should have at least two elements")
@@ -68,16 +72,34 @@ def train_test_split(
     weights_normalized = [weight / sum(weights) for weight in weights]
+    try:
+        dc.signals_schema.resolve("sys.rand")
+    except SignalResolvingError:
+        dc = dc.persist()
     rand_col = C("sys.rand")
     if seed is not None:
         uniform_seed = random.Random(seed).randrange(1, RESOLUTION)  # noqa: S311
         rand_col = (rand_col % RESOLUTION) * uniform_seed  # type: ignore[assignment]
     rand_col = rand_col % RESOLUTION  # type: ignore[assignment]
-    return [
-        dc.filter(
-            rand_col >= round(sum(weights_normalized[:index]) * (RESOLUTION - 1)),
-            rand_col < round(sum(weights_normalized[: index + 1]) * (RESOLUTION - 1)),
-        )
-        for index, _ in enumerate(weights_normalized)
-    ]
+    boundaries: list[int] = [0]
+    cumulative = 0.0
+    for weight in weights_normalized[:-1]:
+        cumulative += weight
+        boundary = round(cumulative * RESOLUTION)
+        boundaries.append(min(boundary, RESOLUTION))
+    boundaries.append(RESOLUTION)
+    splits: list[DataChain] = []
+    last_index = len(weights_normalized) - 1
+    for index in range(len(weights_normalized)):
+        lower = boundaries[index]
+        if index == last_index:
+            condition = rand_col >= lower
+        else:
+            upper = boundaries[index + 1]
+            condition = (rand_col >= lower) & (rand_col < upper)
+        splits.append(dc.filter(condition))
+    return splits

{datachain-0.37.6.dist-info → datachain-0.37.8.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.37.6
+Version: 0.37.8
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -55,9 +55,9 @@ Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
 Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
-Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
 Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
 Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
+Requires-Dist: mkdocs-section-index>=0.3.10; extra == "docs"
 Requires-Dist: eval-type-backport; extra == "docs"
 Provides-Extra: torch
 Requires-Dist: torch>=2.1.0; extra == "torch"

{datachain-0.37.6.dist-info → datachain-0.37.8.dist-info}/RECORD RENAMED Viewed

@@ -109,7 +109,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=Sxj0ojeMSpAwM_NNoXa1dMR_2L_cQ6X
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=fIfj5-2Ix4z5D5yZueagd5WUWw86pusJ9JJKD-U3KGg,4407
 datachain/lib/dc/database.py,sha256=Wqob3dQc9Mol_0vagzVEXzteCKS9M0E3U5130KVmQKg,14629
-datachain/lib/dc/datachain.py,sha256=RYhinLQ6CMU3tudLpiJGh-vfCL24KDKbKM3Q1EsWoAE,104072
+datachain/lib/dc/datachain.py,sha256=XHr3gbdpLwzHhhIzPQXL5uZJQMFZ1AypCENdRlWWxoM,104671
 datachain/lib/dc/datasets.py,sha256=oY1t8QBAaZdhjwR439zZT74hMOspewVCrgdwy6juXng,15321
 datachain/lib/dc/hf.py,sha256=FeruEO176L2qQ1Mnx0QmK4kV0GuQ4xtj717N8fGJrBI,2849
 datachain/lib/dc/json.py,sha256=iJ6G0jwTKz8xtfh1eICShnWk_bAMWjF5bFnOXLHaTlw,2683
@@ -132,7 +132,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=ugTlSFqh_kxMcG6vJ5XrEzG9jBXRdb7KRAEEsFWiPew,4190
-datachain/query/dataset.py,sha256=kfNh6B6pYSz3batUpwW_6vJ7XRLwLfC08hKOZUMjf3o,67126
+datachain/query/dataset.py,sha256=9Ky0LZ7wMpfJbIZyXjnensrDQJvGg1pysZs96AYZqIY,67576
 datachain/query/dispatch.py,sha256=Tg73zB6vDnYYYAvtlS9l7BI3sI1EfRCbDjiasvNxz2s,16385
 datachain/query/metrics.py,sha256=qOMHiYPTMtVs2zI-mUSy8OPAVwrg4oJtVF85B9tdQyM,810
 datachain/query/params.py,sha256=JkVz6IKUIpF58JZRkUXFT8DAHX2yfaULbhVaGmHKFLc,826
@@ -163,11 +163,11 @@ datachain/sql/sqlite/base.py,sha256=T4G46GggBRMZaDCRnfBWDv_-P2aLisqJ947xMnkB3Pk,
 datachain/sql/sqlite/types.py,sha256=DCK7q-Zdc_m1o1T33xrKjYX1zRg1231gw3o3ACO_qho,1815
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
-datachain/toolkit/split.py,sha256=xQzzmvQRKsPteDKbpgOxd4r971BnFaK33mcOl0FuGeI,2883
+datachain/toolkit/split.py,sha256=9HHZl0fGs5Zj8b9l2L3IKf0AiiVNL9SnWbc2rfDiXRA,3710
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.37.6.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.37.6.dist-info/METADATA,sha256=zBPCt_CUJzcP3rNzpykwH9v9A388r273Huo6Hp_f0Jk,13762
-datachain-0.37.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.37.6.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.37.6.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.37.6.dist-info/RECORD,,
+datachain-0.37.8.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.37.8.dist-info/METADATA,sha256=6MLsgOSmSsxKXzbiOqTs9yQXaPhFu1QwgSqN_OmuQQM,13763
+datachain-0.37.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.37.8.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.37.8.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.37.8.dist-info/RECORD,,

{datachain-0.37.6.dist-info → datachain-0.37.8.dist-info}/WHEEL RENAMED Viewed

File without changes

{datachain-0.37.6.dist-info → datachain-0.37.8.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datachain-0.37.6.dist-info → datachain-0.37.8.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{datachain-0.37.6.dist-info → datachain-0.37.8.dist-info}/top_level.txt RENAMED Viewed

File without changes

datachain 0.37.6__py3-none-any.whl → 0.37.8__py3-none-any.whl

Potentially problematic release.

datachain 0.37.6__py3-none-any.whl → 0.37.8__py3-none-any.whl