PyPI - datachain - Versions diffs - 0.7.10__py3-none-any.whl → 0.7.11__py3-none-any.whl - Mend

datachain 0.7.10__py3-none-any.whl → 0.7.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (14) hide show

datachain/client/__init__.py +1 -2
datachain/lib/dc.py +5 -1
datachain/lib/file.py +2 -1
datachain/lib/meta_formats.py +2 -1
datachain/lib/pytorch.py +1 -5
datachain/lib/signal_schema.py +28 -6
datachain/query/dataset.py +4 -1
datachain/toolkit/split.py +19 -6
{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/METADATA +9 -10
{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/RECORD +14 -14
{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/LICENSE +0 -0
{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/WHEEL +0 -0
{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/entry_points.txt +0 -0
{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/top_level.txt +0 -0

datachain/client/__init__.py CHANGED Viewed

@@ -1,4 +1,3 @@
 from .fsspec import Client
-from .s3 import ClientS3
-__all__ = ["Client", "ClientS3"]
+__all__ = ["Client"]

datachain/lib/dc.py CHANGED Viewed

@@ -19,7 +19,6 @@ from typing import (
 )
 import orjson
-import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
@@ -57,6 +56,7 @@ from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
 if TYPE_CHECKING:
+    import pandas as pd
     from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self
@@ -1701,6 +1701,8 @@ class DataChain:
         Parameters:
             flatten : Whether to use a multiindex or flatten column names.
         """
+        import pandas as pd
         headers, max_length = self._effective_signals_schema.get_headers_with_length()
         if flatten or max_length < 2:
             columns = [".".join(filter(None, header)) for header in headers]
@@ -1724,6 +1726,8 @@ class DataChain:
             transpose : Whether to transpose rows and columns.
             truncate : Whether or not to truncate the contents of columns.
         """
+        import pandas as pd
         dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
         df = dc.to_pandas(flatten)

datachain/lib/file.py CHANGED Viewed

@@ -17,7 +17,6 @@ from urllib.request import url2pathname
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from PIL import Image
-from pyarrow.dataset import dataset
 from pydantic import Field, field_validator
 from datachain.client.fileslice import FileSlice
@@ -452,6 +451,8 @@ class ArrowRow(DataModel):
     @contextmanager
     def open(self):
         """Stream row contents from indexed file."""
+        from pyarrow.dataset import dataset
         if self.file._caching_enabled:
             self.file.ensure_cached()
             path = self.file.get_local_path()

datachain/lib/meta_formats.py CHANGED Viewed

@@ -6,7 +6,6 @@ from collections.abc import Iterator
 from pathlib import Path
 from typing import Callable
-import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
@@ -67,6 +66,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
             data_type = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)
+    import datamodel_code_generator
     input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
     input_file_type = input_file_types[data_type]
     with tempfile.TemporaryDirectory() as tmpdir:

datachain/lib/pytorch.py CHANGED Viewed

@@ -7,7 +7,6 @@ from torch import float32
 from torch.distributed import get_rank, get_world_size
 from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
-from tqdm import tqdm
 from datachain import Session
 from datachain.asyn import AsyncMapper
@@ -112,10 +111,7 @@ class PytorchDataset(IterableDataset):
             from datachain.lib.udf import _prefetch_input
             rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
-        desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
-        with tqdm(rows, desc=desc, unit=" rows", position=total_rank) as rows_it:
-            yield from map(self._process_row, rows_it)
+        yield from map(self._process_row, rows)
     def _process_row(self, row_features):
         row = []

datachain/lib/signal_schema.py CHANGED Viewed

@@ -402,9 +402,20 @@ class SignalSchema:
             if ModelStore.is_pydantic(finfo.annotation):
                 SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
-    def get_column_type(self, col_name: str) -> DataType:
+    def get_column_type(self, col_name: str, with_subtree: bool = False) -> DataType:
+        """
+        Returns column type by column name.
+        If `with_subtree` is True, then it will return the type of the column
+        even if it has a subtree (e.g. model with nested fields), otherwise it will
+        return the type of the column (standard type field, not the model).
+        If column is not found, raises `SignalResolvingError`.
+        """
         for path, _type, has_subtree, _ in self.get_flat_tree():
-            if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name:
+            if (with_subtree or not has_subtree) and DEFAULT_DELIMITER.join(
+                path
+            ) == col_name:
                 return _type
         raise SignalResolvingError([col_name], "is not found")
@@ -492,14 +503,25 @@ class SignalSchema:
                 # renaming existing signal
                 del new_values[value.name]
                 new_values[name] = self.values[value.name]
-            elif isinstance(value, Func):
+                continue
+            if isinstance(value, Column):
+                # adding new signal from existing signal field
+                try:
+                    new_values[name] = self.get_column_type(
+                        value.name, with_subtree=True
+                    )
+                    continue
+                except SignalResolvingError:
+                    pass
+            if isinstance(value, Func):
                 # adding new signal with function
                 new_values[name] = value.get_result_type(self)
-            elif isinstance(value, ColumnElement):
+                continue
+            if isinstance(value, ColumnElement):
                 # adding new signal
                 new_values[name] = sql_to_python(value)
-            else:
-                new_values[name] = value
+                continue
+            new_values[name] = value
         return SignalSchema(new_values)

datachain/query/dataset.py CHANGED Viewed

@@ -35,7 +35,6 @@ from sqlalchemy.sql.schema import TableClause
 from sqlalchemy.sql.selectable import Select
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -394,6 +393,8 @@ class UDFStep(Step, ABC):
         """
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
+        from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
         use_partitioning = self.partition_by is not None
         batching = self.udf.get_batching(use_partitioning)
         workers = self.workers
@@ -1087,6 +1088,8 @@ class DatasetQuery:
     def delete(
         name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
     ) -> None:
+        from datachain.catalog import get_catalog
         catalog = catalog or get_catalog()
         version = version or catalog.get_dataset(name).latest_version
         catalog.remove_dataset(name, version)

datachain/toolkit/split.py CHANGED Viewed

@@ -1,7 +1,16 @@
+import random
+from typing import Optional
 from datachain import C, DataChain
+RESOLUTION = 2**31 - 1  # Maximum positive value for a 32-bit signed integer.
-def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
+def train_test_split(
+    dc: DataChain,
+    weights: list[float],
+    seed: Optional[int] = None,
+) -> list[DataChain]:
     """
     Splits a DataChain into multiple subsets based on the provided weights.
@@ -18,6 +27,8 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
             For example:
             - `[0.7, 0.3]` corresponds to a 70/30 split;
             - `[2, 1, 1]` corresponds to a 50/25/25 split.
+        seed (int, optional):
+            The seed for the random number generator. Defaults to None.
     Returns:
         list[DataChain]:
@@ -58,14 +69,16 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
     weights_normalized = [weight / sum(weights) for weight in weights]
-    resolution = 2**31 - 1  # Maximum positive value for a 32-bit signed integer.
+    rand_col = C("sys.rand")
+    if seed is not None:
+        uniform_seed = random.Random(seed).randrange(1, RESOLUTION)  # noqa: S311
+        rand_col = (rand_col % RESOLUTION) * uniform_seed  # type: ignore[assignment]
+    rand_col = rand_col % RESOLUTION  # type: ignore[assignment]
     return [
         dc.filter(
-            C("sys__rand") % resolution
-            >= round(sum(weights_normalized[:index]) * resolution),
-            C("sys__rand") % resolution
-            < round(sum(weights_normalized[: index + 1]) * resolution),
+            rand_col >= round(sum(weights_normalized[:index]) * (RESOLUTION - 1)),
+            rand_col < round(sum(weights_normalized[: index + 1]) * (RESOLUTION - 1)),
         )
         for index, _ in enumerate(weights_normalized)
     ]

{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.10
+Version: 0.7.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -91,14 +91,14 @@ Requires-Dist: types-requests; extra == "dev"
 Requires-Dist: types-tabulate; extra == "dev"
 Provides-Extra: examples
 Requires-Dist: datachain[tests]; extra == "examples"
-Requires-Dist: numpy<2,>=1; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
-Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
+Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
+Requires-Dist: unstructured[pdf]; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.37; extra == "examples"
+Requires-Dist: ultralytics==8.3.48; extra == "examples"
 ================
 |logo| DataChain
@@ -138,6 +138,11 @@ Use Cases
 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
    Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
+Getting Started
+===============
+Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
+to get started with `DataChain` and learn more.
 Key Features
 ============
@@ -161,12 +166,6 @@ Key Features
    - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
-Getting Started
-===============
-Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
 Contributing
 ============

{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/RECORD RENAMED Viewed

@@ -21,7 +21,7 @@ datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4
 datachain/catalog/catalog.py,sha256=s4fat0jjP3JPq0RGQ9zfzRkX1JavxxCrcB1tJKMgsks,57686
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
-datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
+datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
 datachain/client/fsspec.py,sha256=kf1blSGNcEXJ0tra3y5i35jc1aAy-67wMHXkqjlRMXg,12736
@@ -53,17 +53,17 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=xqLR4IH_mbuet0FsxBHDsRUg-zR6tO8UZdLQQTLG8EE,89533
-datachain/lib/file.py,sha256=-XMkL6ED1sE7TMhWoMRTEuOXswZJw8X6AEmJDONFP74,15019
+datachain/lib/dc.py,sha256=qMhpVPdWeuXBDhmKKoq3fkq12Cx_ZPxDdsl_juu482o,89595
+datachain/lib/file.py,sha256=4dDWXVCHHP2uELDPHP_LheyTyyr01jwp5wp3HaOIeFI,15028
 datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
-datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU,6680
+datachain/lib/meta_formats.py,sha256=6_gB23fWlvd-edOO3UvDHvj6dBXVL61T7x8RX51FW84,6685
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
-datachain/lib/pytorch.py,sha256=QMJO_OGEMvBi2x71vGcG25agLzNwyLmF4Qx5iILlwaM,6350
+datachain/lib/pytorch.py,sha256=dA3r1JY0wqV_907a1D0lFaEN-7v3fMRpc1ePFE9CnvA,6168
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
-datachain/lib/signal_schema.py,sha256=_uh19nCKhiD9ua8oIN1Q8R9iYv1BZAuqTJCLYVmyW8k,24557
+datachain/lib/signal_schema.py,sha256=ziRTctom0-wAqURZfkfG6dc_3P2FcYxKjYsKC49NQ1Q,25415
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=-j0krjNAELTqRI0dB1N65AmawtcIY5vN---AuUcW8Us,13637
@@ -88,7 +88,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
 datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
-datachain/query/dataset.py,sha256=eXr9fJz2grX2evmkmsiH0Xeqajd8gFnujmt_USMxy0c,54563
+datachain/query/dataset.py,sha256=JrImhguXj2ZDwJpfuyhcgxSIlqSPy5NmLDLc3muFQJs,54610
 datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -116,11 +116,11 @@ datachain/sql/sqlite/base.py,sha256=E2PK3hoGlHey1eEjcReXRrI-c_ASr3AmAXaNYKDY_o8,
 datachain/sql/sqlite/types.py,sha256=lPXS1XbkmUtlkkiRxy_A_UzsgpPv2VSkXYOD4zIHM4w,1734
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
-datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
+datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.7.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.7.10.dist-info/METADATA,sha256=qtw_rToRdmR9-CO6MFCAGv6NWJJ87C95iQaDEnDE4H8,8371
-datachain-0.7.10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-datachain-0.7.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.7.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.7.10.dist-info/RECORD,,
+datachain-0.7.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.7.11.dist-info/METADATA,sha256=ADTTf0_eJImM-tIPR-jQydM3N9Iis-ECRxWgkwLM8lU,8412
+datachain-0.7.11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.7.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.7.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.7.11.dist-info/RECORD,,

{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/LICENSE RENAMED Viewed

File without changes

{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/WHEEL RENAMED Viewed

File without changes

{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datachain-0.7.10.dist-info → datachain-0.7.11.dist-info}/top_level.txt RENAMED Viewed

File without changes

datachain 0.7.10__py3-none-any.whl → 0.7.11__py3-none-any.whl

Potentially problematic release.

datachain 0.7.10__py3-none-any.whl → 0.7.11__py3-none-any.whl