PyPI - datachain - Versions diffs - 0.3.6__tar.gz → 0.3.8__tar.gz - Mend

datachain 0.3.6__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (244) hide show

{datachain-0.3.6 → datachain-0.3.8}/.github/workflows/tests.yml RENAMED Viewed

@@ -50,7 +50,7 @@ jobs:
         run: nox -s lint
   datachain:
-    timeout-minutes: 25
+    timeout-minutes: 30
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false

{datachain-0.3.6/src/datachain.egg-info → datachain-0.3.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.6
+Version: 0.3.8
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -42,6 +42,7 @@ Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<11,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
+Requires-Dist: psutil
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -58,8 +59,11 @@ Requires-Dist: lz4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
 Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
+Provides-Extra: hf
+Requires-Dist: numba>=0.60.0; extra == "hf"
+Requires-Dist: datasets[audio,vision]; extra == "hf"
 Provides-Extra: tests
-Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
+Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -74,9 +78,10 @@ Requires-Dist: hypothesis; extra == "tests"
 Requires-Dist: open_clip_torch; extra == "tests"
 Requires-Dist: aiotools>=1.7.0; extra == "tests"
 Requires-Dist: requests-mock; extra == "tests"
+Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.11.1; extra == "dev"
+Requires-Dist: mypy==1.11.2; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"

{datachain-0.3.6 → datachain-0.3.8}/pyproject.toml RENAMED Viewed

@@ -44,7 +44,8 @@ dependencies = [
   "jmespath>=1.0",
   "datamodel-code-generator>=0.25",
   "Pillow>=10.0.0,<11",
-  "msgpack>=1.0.4,<2"
+  "msgpack>=1.0.4,<2",
+  "psutil"
 ]
 [project.optional-dependencies]
@@ -68,8 +69,12 @@ remote = [
 vector = [
   "usearch"
 ]
+hf = [
+  "numba>=0.60.0",
+  "datasets[audio,vision]"
+]
 tests = [
-  "datachain[torch,remote,vector]",
+  "datachain[torch,remote,vector,hf]",
   "pytest>=8,<9",
   "pytest-sugar>=0.9.6",
   "pytest-cov>=4.1.0",
@@ -83,11 +88,12 @@ tests = [
   "hypothesis",
   "open_clip_torch",
   "aiotools>=1.7.0",
-  "requests-mock"
+  "requests-mock",
+  "scipy"
 ]
 dev = [
   "datachain[docs,tests]",
-  "mypy==1.11.1",
+  "mypy==1.11.2",
   "types-python-dateutil",
   "types-pytz",
   "types-PyYAML",

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/catalog/catalog.py RENAMED Viewed

@@ -1540,87 +1540,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)
-    def merge_datasets(
-        self,
-        src: DatasetRecord,
-        dst: DatasetRecord,
-        src_version: int,
-        dst_version: Optional[int] = None,
-    ) -> DatasetRecord:
-        """
-        Merges records from source to destination dataset.
-        It will create new version
-        of a dataset with records merged from old version and the source, unless
-        existing version is specified for destination in which case it must
-        be in non final status as datasets are immutable
-        """
-        if (
-            dst_version
-            and not dst.is_valid_next_version(dst_version)
-            and dst.get_version(dst_version).is_final_status()
-        ):
-            raise DatasetInvalidVersionError(
-                f"Version {dst_version} must be higher than the current latest one"
-            )
-        src_dep = self.get_dataset_dependencies(src.name, src_version)
-        dst_dep = self.get_dataset_dependencies(
-            dst.name,
-            dst.latest_version,  # type: ignore[arg-type]
-        )
-        if dst.has_version(dst_version):  # type: ignore[arg-type]
-            # case where we don't create new version, but append to the existing one
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version=dst_version,  # type: ignore[arg-type]
-            )
-            merged_schema = src.serialized_schema | dst.serialized_schema
-            self.update_dataset(dst, schema=merged_schema)
-            self.update_dataset_version_with_warehouse_info(
-                dst,
-                dst_version,  # type: ignore[arg-type]
-                schema=merged_schema,
-            )
-            for dep in src_dep:
-                if dep and dep not in dst_dep:
-                    self.metastore.add_dependency(
-                        dep,
-                        dst.name,
-                        dst_version,  # type: ignore[arg-type]
-                    )
-        else:
-            # case where we create new version of merged results
-            src_dr = self.warehouse.dataset_rows(src, src_version)
-            dst_dr = self.warehouse.dataset_rows(dst)
-            merge_result_columns = list(
-                {
-                    c.name: c for c in list(src_dr.table.c) + list(dst_dr.table.c)
-                }.values()
-            )
-            dst_version = dst_version or dst.next_version
-            dst = self.create_new_dataset_version(
-                dst,
-                dst_version,
-                columns=merge_result_columns,
-            )
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version,
-            )
-            self.update_dataset_version_with_warehouse_info(dst, dst_version)
-            for dep in set(src_dep + dst_dep):
-                if dep:
-                    self.metastore.add_dependency(dep, dst.name, dst_version)
-        return dst
     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
     ) -> Optional[dict]:

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/cli.py RENAMED Viewed

@@ -336,36 +336,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Display size using powers of 1000 not 1024",
     )
-    parse_merge_datasets = subp.add_parser(
-        "merge-datasets", parents=[parent_parser], description="Merges datasets"
-    )
-    parse_merge_datasets.add_argument(
-        "--src",
-        action="store",
-        default=None,
-        help="Source dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst",
-        action="store",
-        default=None,
-        help="Destination dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--src-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Source dataset version",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Destination dataset version",
-    )
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents"
     )
@@ -996,13 +966,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
                 new_name=args.new_name,
                 labels=args.labels,
             )
-        elif args.command == "merge-datasets":
-            catalog.merge_datasets(
-                catalog.get_dataset(args.src),
-                catalog.get_dataset(args.dst),
-                args.src_version,
-                dst_version=args.dst_version,
-            )
         elif args.command == "ls":
             ls(
                 args.sources,

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/schema.py RENAMED Viewed

@@ -50,7 +50,7 @@ def convert_rows_custom_column_types(
     columns: "ColumnCollection[str, ColumnElement[Any]]",
     rows: Iterator[tuple[Any, ...]],
     dialect: "Dialect",
-):
+) -> Iterator[tuple[Any, ...]]:
     """
     This function converts values of rows columns based on their types which are
     defined in columns. We are only converting column values for which types are

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/sqlite.py RENAMED Viewed

@@ -27,10 +27,7 @@ import datachain.sql.sqlite
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.id_generator import AbstractDBIDGenerator
-from datachain.data_storage.schema import (
-    DefaultSchema,
-    convert_rows_custom_column_types,
-)
+from datachain.data_storage.schema import DefaultSchema
 from datachain.dataset import DatasetRecord
 from datachain.error import DataChainError
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
@@ -651,12 +648,6 @@ class SQLiteWarehouse(AbstractWarehouse):
         self.db.create_table(table, if_not_exists=if_not_exists)
         return table
-    def dataset_rows_select(self, select_query: Select, **kwargs):
-        rows = self.db.execute(select_query, **kwargs)
-        yield from convert_rows_custom_column_types(
-            select_query.selected_columns, rows, sqlite_dialect
-        )
     def get_dataset_sources(
         self, dataset: DatasetRecord, version: int
     ) -> list[StorageURI]:

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/warehouse.py RENAMED Viewed

@@ -17,6 +17,7 @@ from sqlalchemy.sql.expression import true
 from tqdm import tqdm
 from datachain.client import Client
+from datachain.data_storage.schema import convert_rows_custom_column_types
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import DatasetRecord
 from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
@@ -226,7 +227,7 @@ class AbstractWarehouse(ABC, Serializable):
                     if limit < page_size:
                         paginated_query = paginated_query.limit(None).limit(limit)
-                results = self.db.execute(paginated_query.offset(offset))
+                results = self.dataset_rows_select(paginated_query.offset(offset))
                 processed = False
                 for row in results:
@@ -309,12 +310,18 @@ class AbstractWarehouse(ABC, Serializable):
         Merge results should not contain duplicates.
         """
-    @abstractmethod
-    def dataset_rows_select(self, select_query: sa.sql.selectable.Select, **kwargs):
+    def dataset_rows_select(
+        self,
+        query: sa.sql.selectable.Select,
+        **kwargs,
+    ) -> Iterator[tuple[Any, ...]]:
         """
-        Method for fetching dataset rows from database. This is abstract since
-        in some DBs we need to use special settings
+        Fetch dataset rows from database.
         """
+        rows = self.db.execute(query, **kwargs)
+        yield from convert_rows_custom_column_types(
+            query.selected_columns, rows, self.db.dialect
+        )
     @abstractmethod
     def get_dataset_sources(

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/arrow.py RENAMED Viewed

@@ -95,7 +95,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
         if field.nullable:
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
@@ -103,7 +103,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
     return output
-def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
@@ -122,11 +122,11 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
         return str
     if pa.types.is_list(col_type):
-        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
+        return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
     if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
-        return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/clip.py RENAMED Viewed

@@ -1,5 +1,5 @@
 import inspect
-from typing import TYPE_CHECKING, Any, Callable, Literal, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 import torch
 from transformers.modeling_utils import PreTrainedModel
@@ -39,6 +39,7 @@ def clip_similarity_scores(
     tokenizer: Callable,
     prob: bool = False,
     image_to_text: bool = True,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> list[list[float]]:
     """
     Calculate CLIP similarity scores between one or more images and/or text.
@@ -52,6 +53,7 @@ def clip_similarity_scores(
         prob : Compute softmax probabilities.
         image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
             if only one of images or text provided.
+        device : Device to use. Defaults is None - use model's device.
     Example:
@@ -130,17 +132,26 @@ def clip_similarity_scores(
         ```
     """
+    if device is None:
+        if hasattr(model, "device"):
+            device = model.device
+        else:
+            device = next(model.parameters()).device
+    else:
+        model = model.to(device)
     with torch.no_grad():
         if images is not None:
             encoder = _get_encoder(model, "image")
             image_features = convert_images(
-                images, transform=preprocess, encoder=encoder
+                images, transform=preprocess, encoder=encoder, device=device
             )
             image_features /= image_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]
         if text is not None:
             encoder = _get_encoder(model, "text")
-            text_features = convert_text(text, tokenizer, encoder=encoder)
+            text_features = convert_text(
+                text, tokenizer, encoder=encoder, device=device
+            )
             text_features /= text_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]
         if images is not None and text is not None:

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/convert/python_to_sql.py RENAMED Viewed

@@ -73,6 +73,9 @@ def python_to_sql(typ):  # noqa: PLR0911
         if len(args) == 2 and (type(None) in args):
             return python_to_sql(args[0])
+        if _is_union_str_literal(orig, args):
+            return String
         if _is_json_inside_union(orig, args):
             return JSON
@@ -94,3 +97,9 @@ def _is_json_inside_union(orig, args) -> bool:
         if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
             return True
     return False
+def _is_union_str_literal(orig, args) -> bool:
+    if orig != Union:
+        return False
+    return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)

{datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/data_model.py RENAMED Viewed

@@ -2,7 +2,7 @@ from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Union, get_args, get_origin
-from pydantic import BaseModel
+from pydantic import BaseModel, create_model
 from datachain.lib.model_store import ModelStore
@@ -57,3 +57,12 @@ def is_chain_type(t: type) -> bool:
         return is_chain_type(args[0])
     return False
+def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
+    fields = {name: (anno, ...) for name, anno in data_dict.items()}
+    return create_model(
+        name,
+        __base__=(DataModel,),  # type: ignore[call-overload]
+        **fields,
+    )  # type: ignore[call-overload]

datachain 0.3.6__tar.gz → 0.3.8__tar.gz

Potentially problematic release.

datachain 0.3.6__tar.gz → 0.3.8__tar.gz