datachain 0.7.9__py3-none-any.whl → 0.7.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as potentially problematic.
- datachain/client/__init__.py +1 -2
- datachain/client/fsspec.py +4 -2
- datachain/client/local.py +9 -4
- datachain/func/__init__.py +4 -1
- datachain/func/numeric.py +46 -0
- datachain/func/string.py +46 -0
- datachain/lib/convert/flatten.py +7 -5
- datachain/lib/convert/unflatten.py +2 -2
- datachain/lib/convert/values_to_tuples.py +1 -1
- datachain/lib/dc.py +5 -1
- datachain/lib/file.py +2 -1
- datachain/lib/meta_formats.py +2 -1
- datachain/lib/pytorch.py +1 -5
- datachain/lib/signal_schema.py +28 -6
- datachain/lib/utils.py +1 -1
- datachain/query/dataset.py +5 -2
- datachain/sql/functions/numeric.py +12 -0
- datachain/sql/functions/string.py +12 -0
- datachain/sql/sqlite/base.py +40 -0
- datachain/toolkit/split.py +19 -6
- datachain-0.7.11.dist-info/METADATA +206 -0
- {datachain-0.7.9.dist-info → datachain-0.7.11.dist-info}/RECORD +26 -26
- datachain-0.7.9.dist-info/METADATA +0 -488
- {datachain-0.7.9.dist-info → datachain-0.7.11.dist-info}/LICENSE +0 -0
- {datachain-0.7.9.dist-info → datachain-0.7.11.dist-info}/WHEEL +0 -0
- {datachain-0.7.9.dist-info → datachain-0.7.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.9.dist-info → datachain-0.7.11.dist-info}/top_level.txt +0 -0
datachain/client/__init__.py
CHANGED
datachain/client/fsspec.py
CHANGED
```diff
@@ -172,7 +172,7 @@ class Client(ABC):
         return url == cls.PREFIX
 
     @classmethod
-    def get_uri(cls, name) -> "StorageURI":
+    def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI
 
         return StorageURI(f"{cls.PREFIX}{name}")
@@ -278,7 +278,9 @@ class Client(ABC):
     ) -> None:
         await self._fetch_nested(start_prefix, result_queue)
 
-    async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
+    async def _fetch_dir(
+        self, prefix: str, pbar, result_queue: ResultQueue
+    ) -> set[str]:
         path = f"{self.name}/{prefix}"
         infos = await self.ls_dir(path)
         files = []
```
datachain/client/local.py
CHANGED
```diff
@@ -12,6 +12,7 @@ from datachain.lib.file import File
 from .fsspec import Client
 
 if TYPE_CHECKING:
+    from datachain.cache import DataChainCache
     from datachain.dataset import StorageURI
 
 
@@ -21,7 +22,11 @@ class FileClient(Client):
     protocol = "file"
 
     def __init__(
-        self, name: str, fs_kwargs: dict[str, Any], cache, use_symlinks: bool = False
+        self,
+        name: str,
+        fs_kwargs: dict[str, Any],
+        cache: "DataChainCache",
+        use_symlinks: bool = False,
     ) -> None:
         super().__init__(name, fs_kwargs, cache)
         self.use_symlinks = use_symlinks
@@ -30,7 +35,7 @@ class FileClient(Client):
         raise TypeError("Signed urls are not implemented for local file system")
 
     @classmethod
-    def get_uri(cls, name) -> "StorageURI":
+    def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI
 
         return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')
@@ -77,7 +82,7 @@ class FileClient(Client):
         return bucket, path
 
     @classmethod
-    def from_name(cls, name: str, cache, kwargs) -> "FileClient":
+    def from_name(cls, name: str, cache: "DataChainCache", kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)
 
@@ -85,7 +90,7 @@ class FileClient(Client):
     def from_source(
         cls,
         uri: str,
-        cache,
+        cache: "DataChainCache",
         use_symlinks: bool = False,
         **kwargs,
     ) -> "FileClient":
```
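Both client modules rely on the same typing pattern: `DataChainCache` is imported only under `if TYPE_CHECKING:` and referenced as the string annotation `"DataChainCache"`, so type checkers see the real class while `datachain.cache` is never imported at runtime. A minimal sketch of the pattern, with an illustrative class body that is not part of the diff:

```py
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by type checkers only; no runtime import (and no import cycle).
    from datachain.cache import DataChainCache


class FileClient:
    def __init__(self, name: str, cache: "DataChainCache") -> None:
        # The quoted annotation is resolved lazily, so this module loads
        # even though datachain.cache was never imported at runtime.
        self.name = name
        self.cache = cache
```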
datachain/func/__init__.py
CHANGED
```diff
@@ -17,8 +17,9 @@ from .aggregate import (
 )
 from .array import cosine_distance, euclidean_distance, length, sip_hash_64
 from .conditional import greatest, least
-from .numeric import bit_and, bit_or, bit_xor, int_hash_64
+from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .random import rand
+from .string import byte_hamming_distance
 from .window import window
 
 __all__ = [
@@ -26,8 +27,10 @@ __all__ = [
     "array",
     "avg",
     "bit_and",
+    "bit_hamming_distance",
     "bit_or",
     "bit_xor",
+    "byte_hamming_distance",
     "case",
     "collect",
     "concat",
```
datachain/func/numeric.py
CHANGED
```diff
@@ -160,3 +160,49 @@ def int_hash_64(col: Union[ColT, int]) -> Func:
     return Func(
         "int_hash_64", inner=numeric.int_hash_64, cols=cols, args=args, result_type=int
     )
+
+
+def bit_hamming_distance(*args: Union[ColT, int]) -> Func:
+    """
+    Computes the Hamming distance between the bit representations of two integer values.
+
+    The Hamming distance is the number of positions at which the corresponding bits
+    are different. This function returns the dissimilarity between the integers,
+    where 0 indicates identical integers and values closer to the number of bits
+    in the integer indicate higher dissimilarity.
+
+    Args:
+        args (str | int): Two integers to compute the Hamming distance between.
+            If a str is provided, it is assumed to be the name of the column.
+            If an int is provided, it is assumed to be an integer literal.
+
+    Returns:
+        Func: A Func object that represents the Hamming distance function.
+
+    Example:
+        ```py
+        dc.mutate(
+            ham_dist=func.bit_hamming_distance("embed1", 123456),
+        )
+        ```
+
+    Notes:
+        - Result column will always be of type int.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, int):
+            func_args.append(arg)
+        else:
+            cols.append(arg)
+
+    if len(cols) + len(func_args) != 2:
+        raise ValueError("bit_hamming_distance() requires exactly two arguments")
+
+    return Func(
+        "bit_hamming_distance",
+        inner=numeric.bit_hamming_distance,
+        cols=cols,
+        args=func_args,
+        result_type=int,
+    )
```
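Paired with `int_hash_64` or a perceptual hash stored as an integer column, this enables near-duplicate filtering directly in a query. A hedged usage sketch; the dataset name `images`, the `phash` column, and the threshold are assumptions for illustration:

```py
from datachain import C, DataChain, func

# Hypothetical dataset with an integer perceptual-hash signal "phash".
near_dupes = (
    DataChain.from_dataset("images")
    .mutate(ham_dist=func.bit_hamming_distance("phash", 123456))
    .filter(C("ham_dist") < 8)  # small distance = near-identical hashes
)
```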
datachain/func/string.py
CHANGED
```diff
@@ -152,3 +152,49 @@ def regexp_replace(col: Union[str, Func], regex: str, replacement: str) -> Func:
         args = None
 
     return Func("regexp_replace", inner=inner, cols=cols, args=args, result_type=str)
+
+
+def byte_hamming_distance(*args: Union[str, Func]) -> Func:
+    """
+    Computes the Hamming distance between two strings.
+
+    The Hamming distance is the number of positions at which the corresponding
+    characters are different. This function returns the dissimilarity between
+    the strings, where 0 indicates identical strings and values closer to the length
+    of the strings indicate higher dissimilarity.
+
+    Args:
+        args (str | literal): Two strings to compute the Hamming distance between.
+            If a str is provided, it is assumed to be the name of the column.
+            If a Literal is provided, it is assumed to be a string literal.
+
+    Returns:
+        Func: A Func object that represents the Hamming distance function.
+
+    Example:
+        ```py
+        dc.mutate(
+            ham_dist=func.byte_hamming_distance("file.phash", literal("hello")),
+        )
+        ```
+
+    Notes:
+        - Result column will always be of type int.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if get_origin(arg) is literal:
+            func_args.append(arg)
+        else:
+            cols.append(arg)
+
+    if len(cols) + len(func_args) != 2:
+        raise ValueError("byte_hamming_distance() requires exactly two arguments")
+
+    return Func(
+        "byte_hamming_distance",
+        inner=string.byte_hamming_distance,
+        cols=cols,
+        args=func_args,
+        result_type=int,
+    )
```
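The string variant follows the same shape, with `literal()` marking string constants so they are not mistaken for column names. A hedged sketch along the lines of the docstring example, assuming `literal` is importable from `datachain.func` as that example implies; dataset and column names are illustrative:

```py
from datachain import DataChain, func
from datachain.func import literal

# Hypothetical dataset with a hex-digest string signal "file.phash".
chain = DataChain.from_dataset("images").mutate(
    ham_dist=func.byte_hamming_distance("file.phash", literal("cafef00d")),
)
```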
datachain/lib/convert/flatten.py
CHANGED
```diff
@@ -1,19 +1,21 @@
+from collections.abc import Generator
+
 from pydantic import BaseModel
 
 from datachain.lib.model_store import ModelStore
 
 
-def flatten(obj: BaseModel):
+def flatten(obj: BaseModel) -> tuple:
     return tuple(_flatten_fields_values(obj.model_fields, obj))
 
 
-def flatten_list(obj_list):
+def flatten_list(obj_list: list[BaseModel]) -> tuple:
     return tuple(
         val for obj in obj_list for val in _flatten_fields_values(obj.model_fields, obj)
     )
 
 
-def _flatten_list_field(value: list):
+def _flatten_list_field(value: list) -> list:
     assert isinstance(value, list)
     if value and ModelStore.is_pydantic(type(value[0])):
         return [val.model_dump() for val in value]
@@ -22,7 +24,7 @@ def _flatten_list_field(value: list):
     return value
 
 
-def _flatten_fields_values(fields, obj: BaseModel):
+def _flatten_fields_values(fields: dict, obj: BaseModel) -> Generator:
     for name, f_info in fields.items():
         anno = f_info.annotation
         # Optimization: Access attributes directly to skip the model_dump() call.
@@ -40,5 +42,5 @@ def _flatten_fields_values(fields, obj: BaseModel):
     yield value
 
 
-def _flatten(obj):
+def _flatten(obj: BaseModel) -> tuple:
     return tuple(_flatten_fields_values(obj.model_fields, obj))
```

datachain/lib/convert/unflatten.py
CHANGED

```diff
@@ -9,12 +9,12 @@ from pydantic import BaseModel
 from datachain.query.schema import DEFAULT_DELIMITER
 
 
-def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos=0) -> dict:
+def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos: int = 0) -> dict:
     return unflatten_to_json_pos(model, row, pos)[0]
 
 
 def unflatten_to_json_pos(
-    model: type[BaseModel], row: Sequence[Any], pos=0
+    model: type[BaseModel], row: Sequence[Any], pos: int = 0
 ) -> tuple[dict, int]:
     res = {}
     for name, f_info in model.model_fields.items():
```

datachain/lib/convert/values_to_tuples.py
CHANGED

```diff
@@ -11,7 +11,7 @@ from datachain.lib.utils import DataChainParamsError
 
 
 class ValuesToTupleError(DataChainParamsError):
-    def __init__(self, ds_name, msg):
+    def __init__(self, ds_name: str, msg: str):
         if ds_name:
             ds_name = f"' {ds_name}'"
         super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")
```
datachain/lib/dc.py
CHANGED
```diff
@@ -19,7 +19,6 @@ from typing import (
 )
 
 import orjson
-import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
@@ -57,6 +56,7 @@ from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
 
 if TYPE_CHECKING:
+    import pandas as pd
     from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self
 
@@ -1701,6 +1701,8 @@ class DataChain:
         Parameters:
             flatten : Whether to use a multiindex or flatten column names.
         """
+        import pandas as pd
+
         headers, max_length = self._effective_signals_schema.get_headers_with_length()
         if flatten or max_length < 2:
             columns = [".".join(filter(None, header)) for header in headers]
@@ -1724,6 +1726,8 @@ class DataChain:
             transpose : Whether to transpose rows and columns.
            truncate : Whether or not to truncate the contents of columns.
         """
+        import pandas as pd
+
         dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
         df = dc.to_pandas(flatten)
 
```
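The `pandas` move is one instance of a pattern applied across this release (`pyarrow.dataset` in `file.py`, `datamodel_code_generator` in `meta_formats.py`, and two `datachain.catalog` names in `query/dataset.py`): heavy modules are imported inside the functions that need them, with a `TYPE_CHECKING` import keeping annotations intact, so `import datachain` itself gets cheaper. The pattern in isolation, on a hypothetical module:

```py
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import pandas as pd  # annotation-only import; free at runtime


def to_pandas(rows: list[dict]) -> "pd.DataFrame":
    # Deferred import: pandas loads on first call, not when this module loads.
    import pandas as pd

    return pd.DataFrame(rows)
```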
datachain/lib/file.py
CHANGED
```diff
@@ -17,7 +17,6 @@ from urllib.request import url2pathname
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from PIL import Image
-from pyarrow.dataset import dataset
 from pydantic import Field, field_validator
 
 from datachain.client.fileslice import FileSlice
@@ -452,6 +451,8 @@ class ArrowRow(DataModel):
     @contextmanager
     def open(self):
         """Stream row contents from indexed file."""
+        from pyarrow.dataset import dataset
+
         if self.file._caching_enabled:
             self.file.ensure_cached()
             path = self.file.get_local_path()
```
datachain/lib/meta_formats.py
CHANGED
```diff
@@ -6,7 +6,6 @@ from collections.abc import Iterator
 from pathlib import Path
 from typing import Callable
 
-import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 
@@ -67,6 +66,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
         data_type = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)
 
+    import datamodel_code_generator
+
     input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
     input_file_type = input_file_types[data_type]
     with tempfile.TemporaryDirectory() as tmpdir:
```
datachain/lib/pytorch.py
CHANGED
```diff
@@ -7,7 +7,6 @@ from torch import float32
 from torch.distributed import get_rank, get_world_size
 from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
-from tqdm import tqdm
 
 from datachain import Session
 from datachain.asyn import AsyncMapper
@@ -112,10 +111,7 @@ class PytorchDataset(IterableDataset):
         from datachain.lib.udf import _prefetch_input
 
         rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
-
-        desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
-        with tqdm(rows, desc=desc, unit=" rows", position=total_rank) as rows_it:
-            yield from map(self._process_row, rows_it)
+        yield from map(self._process_row, rows)
 
     def _process_row(self, row_features):
         row = []
```
datachain/lib/signal_schema.py
CHANGED
```diff
@@ -402,9 +402,20 @@ class SignalSchema:
         if ModelStore.is_pydantic(finfo.annotation):
             SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
 
-    def get_column_type(self, col_name: str) -> DataType:
+    def get_column_type(self, col_name: str, with_subtree: bool = False) -> DataType:
+        """
+        Returns column type by column name.
+
+        If `with_subtree` is True, then it will return the type of the column
+        even if it has a subtree (e.g. model with nested fields), otherwise it will
+        return the type of the column (standard type field, not the model).
+
+        If column is not found, raises `SignalResolvingError`.
+        """
         for path, _type, has_subtree, _ in self.get_flat_tree():
-            if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name:
+            if (with_subtree or not has_subtree) and DEFAULT_DELIMITER.join(
+                path
+            ) == col_name:
                 return _type
         raise SignalResolvingError([col_name], "is not found")
 
@@ -492,14 +503,25 @@ class SignalSchema:
                 # renaming existing signal
                 del new_values[value.name]
                 new_values[name] = self.values[value.name]
-            elif isinstance(value, Func):
+                continue
+            if isinstance(value, Column):
+                # adding new signal from existing signal field
+                try:
+                    new_values[name] = self.get_column_type(
+                        value.name, with_subtree=True
+                    )
+                    continue
+                except SignalResolvingError:
+                    pass
+            if isinstance(value, Func):
                 # adding new signal with function
                 new_values[name] = value.get_result_type(self)
-            elif isinstance(value, ColumnElement):
+                continue
+            if isinstance(value, ColumnElement):
                 # adding new signal
                 new_values[name] = sql_to_python(value)
-            else:
-                new_values[name] = value
+                continue
+            new_values[name] = value
 
         return SignalSchema(new_values)
 
```
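The `if`/`elif`/`else` chain becomes a series of `if` blocks ending in `continue`, which is what lets the new `Column` branch fall through to the later branches when `get_column_type` raises; with `with_subtree=True`, a new signal can now be derived from an existing field, including a whole nested model. A hedged sketch of what each branch handles at the API level; dataset and column names are assumptions:

```py
from datachain import C, DataChain, func

chain = DataChain.from_dataset("images").mutate(
    fname=C("file.path"),           # Column: type copied from the existing signal
    rnd=func.rand(),                # Func: type from value.get_result_type(schema)
    size_kb=C("file.size") / 1024,  # ColumnElement: type via sql_to_python(value)
)
```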
datachain/lib/utils.py
CHANGED
datachain/query/dataset.py
CHANGED
```diff
@@ -35,7 +35,6 @@ from sqlalchemy.sql.schema import TableClause
 from sqlalchemy.sql.selectable import Select
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -215,7 +214,7 @@ class DatasetDiffOperation(Step):
     Should return select query that calculates desired diff between dataset queries
     """
 
-    def apply(self, query_generator, temp_tables: list[str]):
+    def apply(self, query_generator, temp_tables: list[str]) -> "StepResult":
         source_query = query_generator.exclude(("sys__id",))
         target_query = self.dq.apply_steps().select()
         temp_tables.extend(self.dq.temp_table_names)
@@ -394,6 +393,8 @@ class UDFStep(Step, ABC):
     """
 
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
+        from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
+
         use_partitioning = self.partition_by is not None
         batching = self.udf.get_batching(use_partitioning)
         workers = self.workers
@@ -1087,6 +1088,8 @@ class DatasetQuery:
     def delete(
         name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
     ) -> None:
+        from datachain.catalog import get_catalog
+
         catalog = catalog or get_catalog()
         version = version or catalog.get_dataset(name).latest_version
         catalog.remove_dataset(name, version)
```

datachain/sql/functions/numeric.py
CHANGED

```diff
@@ -35,9 +35,21 @@ class int_hash_64(GenericFunction):  # noqa: N801
     inherit_cache = True
 
 
+class bit_hamming_distance(GenericFunction):  # noqa: N801
+    """
+    Returns the Hamming distance between two integers.
+    """
+
+    type = Int64()
+    package = "numeric"
+    name = "hamming_distance"
+    inherit_cache = True
+
+
 compiler_not_implemented(bit_and)
 compiler_not_implemented(bit_or)
 compiler_not_implemented(bit_xor)
 compiler_not_implemented(bit_rshift)
 compiler_not_implemented(bit_lshift)
 compiler_not_implemented(int_hash_64)
+compiler_not_implemented(bit_hamming_distance)
```

datachain/sql/functions/string.py
CHANGED

```diff
@@ -48,7 +48,19 @@ class replace(GenericFunction):  # noqa: N801
     inherit_cache = True
 
 
+class byte_hamming_distance(GenericFunction):  # noqa: N801
+    """
+    Returns the Hamming distance between two strings.
+    """
+
+    type = Int64()
+    package = "string"
+    name = "hamming_distance"
+    inherit_cache = True
+
+
 compiler_not_implemented(length)
 compiler_not_implemented(split)
 compiler_not_implemented(regexp_replace)
 compiler_not_implemented(replace)
+compiler_not_implemented(byte_hamming_distance)
```
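Both new SQL functions are declared as SQLAlchemy `GenericFunction` subclasses with `compiler_not_implemented` as the default, so each dialect must explicitly opt in (SQLite does, below). A standalone sketch of the mechanism using plain SQLAlchemy; the `package`/`name` attributes mirror the diff, while the `Integer` type and the compile-hook body are simplified assumptions rather than datachain's exact code:

```py
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.functions import GenericFunction
from sqlalchemy.types import Integer


class bit_hamming_distance(GenericFunction):  # noqa: N801
    type = Integer()            # result type reported to SQLAlchemy
    package = "numeric"         # registry namespace, not the rendered SQL name
    name = "hamming_distance"
    inherit_cache = True


@compiles(bit_hamming_distance, "sqlite")
def _compile_sqlite(element, compiler, **kwargs):
    # Re-emit the node as a call to the UDF registered on the connection.
    return compiler.process(
        sa.func.bit_hamming_distance(*element.clauses.clauses), **kwargs
    )


expr = bit_hamming_distance(sa.column("a"), sa.column("b"))
print(expr.compile(dialect=sqlite.dialect()))  # bit_hamming_distance(a, b)
```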
datachain/sql/sqlite/base.py
CHANGED
```diff
@@ -90,6 +90,7 @@ def setup():
     compiles(string.split, "sqlite")(compile_string_split)
     compiles(string.regexp_replace, "sqlite")(compile_string_regexp_replace)
     compiles(string.replace, "sqlite")(compile_string_replace)
+    compiles(string.byte_hamming_distance, "sqlite")(compile_byte_hamming_distance)
     compiles(conditional.greatest, "sqlite")(compile_greatest)
     compiles(conditional.least, "sqlite")(compile_least)
     compiles(Values, "sqlite")(compile_values)
@@ -104,6 +105,7 @@ def setup():
     compiles(numeric.bit_rshift, "sqlite")(compile_bitwise_rshift)
     compiles(numeric.bit_lshift, "sqlite")(compile_bitwise_lshift)
     compiles(numeric.int_hash_64, "sqlite")(compile_int_hash_64)
+    compiles(numeric.bit_hamming_distance, "sqlite")(compile_bit_hamming_distance)
 
     if load_usearch_extension(sqlite3.connect(":memory:")):
         compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
@@ -191,6 +193,26 @@ def sqlite_int_hash_64(x: int) -> int:
     return x if x < 1 << 63 else (x & MAX_INT64) - (1 << 64)
 
 
+def sqlite_bit_hamming_distance(a: int, b: int) -> int:
+    """Calculate the Hamming distance between two integers."""
+    diff = (a & MAX_INT64) ^ (b & MAX_INT64)
+    if hasattr(diff, "bit_count"):
+        return diff.bit_count()
+    return bin(diff).count("1")
+
+
+def sqlite_byte_hamming_distance(a: str, b: str) -> int:
+    """Calculate the Hamming distance between two strings."""
+    diff = 0
+    if len(a) < len(b):
+        diff = len(b) - len(a)
+        b = b[: len(a)]
+    elif len(b) < len(a):
+        diff = len(a) - len(b)
+        a = a[: len(b)]
+    return diff + sum(c1 != c2 for c1, c2 in zip(a, b))
+
+
 def register_user_defined_sql_functions() -> None:
     # Register optional functions if we have the necessary dependencies
     # and otherwise register functions that will raise an exception with
@@ -225,6 +247,9 @@ def register_user_defined_sql_functions() -> None:
         "bitwise_lshift", 2, lambda a, b: a << b, deterministic=True
     )
     conn.create_function("int_hash_64", 1, sqlite_int_hash_64, deterministic=True)
+    conn.create_function(
+        "bit_hamming_distance", 2, sqlite_bit_hamming_distance, deterministic=True
+    )
 
     _registered_function_creators["numeric_functions"] = create_numeric_functions
 
@@ -237,6 +262,9 @@ def register_user_defined_sql_functions() -> None:
         conn.create_function(
             "regexp_replace", 3, sqlite_regexp_replace, deterministic=True
         )
+        conn.create_function(
+            "byte_hamming_distance", 2, sqlite_byte_hamming_distance, deterministic=True
+        )
 
     _registered_function_creators["string_functions"] = create_string_functions
 
@@ -383,6 +411,18 @@ def compile_int_hash_64(element, compiler, **kwargs):
     return compiler.process(func.int_hash_64(*element.clauses.clauses), **kwargs)
 
 
+def compile_bit_hamming_distance(element, compiler, **kwargs):
+    return compiler.process(
+        func.bit_hamming_distance(*element.clauses.clauses), **kwargs
+    )
+
+
+def compile_byte_hamming_distance(element, compiler, **kwargs):
+    return compiler.process(
+        func.byte_hamming_distance(*element.clauses.clauses), **kwargs
+    )
+
+
 def py_json_array_length(arr):
     return len(orjson.loads(arr))
```
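Both UDFs are pure Python and easy to sanity-check outside SQLite. A standalone replica of their logic; `MAX_INT64` is assumed to be `2**64 - 1`, matching how `sqlite_int_hash_64` masks with it:

```py
MAX_INT64 = 2**64 - 1  # assumption: mirrors the module-level constant


def bit_hamming(a: int, b: int) -> int:
    # XOR keeps exactly the differing bits; count them (popcount).
    diff = (a & MAX_INT64) ^ (b & MAX_INT64)
    return bin(diff).count("1")  # int.bit_count() where available


def byte_hamming(a: str, b: str) -> int:
    # Extra length counts as distance; the overlap is compared pairwise.
    overlap = min(len(a), len(b))
    return abs(len(a) - len(b)) + sum(
        c1 != c2 for c1, c2 in zip(a[:overlap], b[:overlap])
    )


assert bit_hamming(0b1011, 0b0010) == 2
assert byte_hamming("karolin", "kathrin") == 3
assert byte_hamming("abc", "abcdef") == 3
```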
datachain/toolkit/split.py
CHANGED
```diff
@@ -1,7 +1,16 @@
+import random
+from typing import Optional
+
 from datachain import C, DataChain
 
+RESOLUTION = 2**31 - 1  # Maximum positive value for a 32-bit signed integer.
+
 
-def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
+def train_test_split(
+    dc: DataChain,
+    weights: list[float],
+    seed: Optional[int] = None,
+) -> list[DataChain]:
     """
     Splits a DataChain into multiple subsets based on the provided weights.
 
@@ -18,6 +27,8 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
         For example:
         - `[0.7, 0.3]` corresponds to a 70/30 split;
         - `[2, 1, 1]` corresponds to a 50/25/25 split.
+        seed (int, optional):
+            The seed for the random number generator. Defaults to None.
 
     Returns:
         list[DataChain]:
@@ -58,14 +69,16 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
 
     weights_normalized = [weight / sum(weights) for weight in weights]
 
-    resolution = 2**31 - 1  # Maximum positive value for a 32-bit signed integer.
+    rand_col = C("sys.rand")
+    if seed is not None:
+        uniform_seed = random.Random(seed).randrange(1, RESOLUTION)  # noqa: S311
+        rand_col = (rand_col % RESOLUTION) * uniform_seed  # type: ignore[assignment]
+    rand_col = rand_col % RESOLUTION  # type: ignore[assignment]
 
     return [
         dc.filter(
-            C("sys__rand") % resolution
-            >= round(sum(weights_normalized[:index]) * resolution),
-            C("sys__rand") % resolution
-            < round(sum(weights_normalized[: index + 1]) * resolution),
+            rand_col >= round(sum(weights_normalized[:index]) * (RESOLUTION - 1)),
+            rand_col < round(sum(weights_normalized[: index + 1]) * (RESOLUTION - 1)),
         )
        for index, _ in enumerate(weights_normalized)
     ]
```