datachain 0.8.8__py3-none-any.whl → 0.8.10__py3-none-any.whl

This diff compares the contents of two publicly released package versions exactly as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

datachain/lib/udf.py CHANGED
@@ -16,6 +16,7 @@ from datachain.lib.convert.flatten import flatten
 from datachain.lib.data_model import DataValue
 from datachain.lib.file import File
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
+from datachain.progress import CombinedDownloadCallback
 from datachain.query.batch import (
     Batch,
     BatchingStrategy,
@@ -301,20 +302,42 @@ async def _prefetch_input(
     return row


+def _remove_prefetched(row: T) -> None:
+    for obj in row:
+        if isinstance(obj, File):
+            catalog = obj._catalog
+            assert catalog is not None
+            try:
+                catalog.cache.remove(obj)
+            except Exception as e:  # noqa: BLE001
+                print(f"Failed to remove prefetched item {obj.name!r}: {e!s}")
+
+
 def _prefetch_inputs(
     prepared_inputs: "Iterable[T]",
     prefetch: int = 0,
     download_cb: Optional["Callback"] = None,
-    after_prefetch: "Callable[[], None]" = noop,
+    after_prefetch: Optional[Callable[[], None]] = None,
+    remove_prefetched: bool = False,
 ) -> "abc.Generator[T, None, None]":
-    if prefetch > 0:
-        f = partial(
-            _prefetch_input,
-            download_cb=download_cb,
-            after_prefetch=after_prefetch,
-        )
-        prepared_inputs = AsyncMapper(f, prepared_inputs, workers=prefetch).iterate()  # type: ignore[assignment]
-    yield from prepared_inputs
+    if not prefetch:
+        yield from prepared_inputs
+        return
+
+    if after_prefetch is None:
+        after_prefetch = noop
+        if isinstance(download_cb, CombinedDownloadCallback):
+            after_prefetch = download_cb.increment_file_count
+
+    f = partial(_prefetch_input, download_cb=download_cb, after_prefetch=after_prefetch)
+    mapper = AsyncMapper(f, prepared_inputs, workers=prefetch)
+    with closing(mapper.iterate()) as row_iter:
+        for row in row_iter:
+            try:
+                yield row  # type: ignore[misc]
+            finally:
+                if remove_prefetched:
+                    _remove_prefetched(row)


 def _get_cache(
@@ -351,7 +374,13 @@ class Mapper(UDFBase):
                    )

        prepared_inputs = _prepare_rows(udf_inputs)
-        prepared_inputs = _prefetch_inputs(prepared_inputs, self.prefetch)
+        prepared_inputs = _prefetch_inputs(
+            prepared_inputs,
+            self.prefetch,
+            download_cb=download_cb,
+            remove_prefetched=bool(self.prefetch) and not cache,
+        )
+
        with closing(prepared_inputs):
            for id_, *udf_args in prepared_inputs:
                result_objs = self.process_safe(udf_args)
@@ -391,9 +420,9 @@ class BatchMapper(UDFBase):
            )
            result_objs = list(self.process_safe(udf_args))
            n_objs = len(result_objs)
-            assert (
-                n_objs == n_rows
-            ), f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
+            assert n_objs == n_rows, (
+                f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
+            )
            udf_outputs = (self._flatten_row(row) for row in result_objs)
            output = [
                {"sys__id": row_id} | dict(zip(self.signal_names, signals))
@@ -429,15 +458,22 @@ class Generator(UDFBase):
                    row, udf_fields, catalog, cache, download_cb
                )

+        def _process_row(row):
+            with safe_closing(self.process_safe(row)) as result_objs:
+                for result_obj in result_objs:
+                    udf_output = self._flatten_row(result_obj)
+                    yield dict(zip(self.signal_names, udf_output))
+
        prepared_inputs = _prepare_rows(udf_inputs)
-        prepared_inputs = _prefetch_inputs(prepared_inputs, self.prefetch)
+        prepared_inputs = _prefetch_inputs(
+            prepared_inputs,
+            self.prefetch,
+            download_cb=download_cb,
+            remove_prefetched=bool(self.prefetch) and not cache,
+        )
        with closing(prepared_inputs):
-            for row in prepared_inputs:
-                result_objs = self.process_safe(row)
-                udf_outputs = (self._flatten_row(row) for row in result_objs)
-                output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
-                processed_cb.relative_update(1)
-                yield output
+            for row in processed_cb.wrap(prepared_inputs):
+                yield _process_row(row)

        self.teardown()

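The reworked `_prefetch_inputs` above yields each prefetched row and, when `remove_prefetched=True`, evicts that row's files from the local cache once the consumer is done with it; `Mapper` and `Generator` enable this only when prefetching is on and caching is off (`bool(self.prefetch) and not cache`). Below is a minimal, library-independent sketch of the same bounded prefetch-then-cleanup pattern; the helper names and the thread-pool approach are illustrative, not datachain's AsyncMapper:

from collections import deque
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar("T")

def prefetch_rows(
    rows: Iterable[T],
    fetch: Callable[[T], T],       # e.g. download the row's file into a cache
    cleanup: Callable[[T], None],  # e.g. evict that file from the cache afterwards
    prefetch: int = 0,
) -> Iterator[T]:
    # No prefetching requested: pass rows straight through.
    if not prefetch:
        yield from rows
        return
    with ThreadPoolExecutor(max_workers=prefetch) as pool:
        pending: deque = deque()
        for row in rows:
            pending.append(pool.submit(fetch, row))
            # Keep at most `prefetch` downloads in flight.
            if len(pending) >= prefetch:
                ready = pending.popleft().result()
                try:
                    yield ready
                finally:
                    cleanup(ready)
        while pending:
            ready = pending.popleft().result()
            try:
                yield ready
            finally:
                cleanup(ready)
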
datachain/model/bbox.py CHANGED
@@ -22,9 +22,9 @@ class BBox(DataModel):
     @staticmethod
     def from_list(coords: list[float], title: str = "") -> "BBox":
         assert len(coords) == 4, "Bounding box must be a list of 4 coordinates."
-        assert all(
-            isinstance(value, (int, float)) for value in coords
-        ), "Bounding box coordinates must be floats or integers."
+        assert all(isinstance(value, (int, float)) for value in coords), (
+            "Bounding box coordinates must be floats or integers."
+        )
         return BBox(
             title=title,
             coords=[round(c) for c in coords],
@@ -64,12 +64,12 @@ class OBBox(DataModel):

     @staticmethod
     def from_list(coords: list[float], title: str = "") -> "OBBox":
-        assert (
-            len(coords) == 8
-        ), "Oriented bounding box must be a list of 8 coordinates."
-        assert all(
-            isinstance(value, (int, float)) for value in coords
-        ), "Oriented bounding box coordinates must be floats or integers."
+        assert len(coords) == 8, (
+            "Oriented bounding box must be a list of 8 coordinates."
+        )
+        assert all(isinstance(value, (int, float)) for value in coords), (
+            "Oriented bounding box coordinates must be floats or integers."
+        )
         return OBBox(
             title=title,
             coords=[round(c) for c in coords],
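The assertion changes above are purely stylistic (the message moves into a parenthesized second operand); validation behavior is unchanged. For reference, a small usage sketch of the guarded constructor, with made-up values:

# BBox.from_list checks length and numeric types, then rounds to integers.
from datachain.model.bbox import BBox  # path as shown in this diff

box = BBox.from_list([10.2, 20.7, 110.0, 220.5], title="cat")
print(box.coords)  # expected: [10, 21, 110, 220]
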
datachain/model/pose.py CHANGED
@@ -22,9 +22,9 @@ class Pose(DataModel):
     def from_list(points: list[list[float]]) -> "Pose":
         assert len(points) == 2, "Pose must be a list of 2 lists: x and y coordinates."
         points_x, points_y = points
-        assert (
-            len(points_x) == len(points_y) == 17
-        ), "Pose x and y coordinates must have the same length of 17."
+        assert len(points_x) == len(points_y) == 17, (
+            "Pose x and y coordinates must have the same length of 17."
+        )
         assert all(
             isinstance(value, (int, float)) for value in [*points_x, *points_y]
         ), "Pose coordinates must be floats or integers."
@@ -61,13 +61,13 @@ class Pose3D(DataModel):

     @staticmethod
     def from_list(points: list[list[float]]) -> "Pose3D":
-        assert (
-            len(points) == 3
-        ), "Pose3D must be a list of 3 lists: x, y coordinates and visible."
+        assert len(points) == 3, (
+            "Pose3D must be a list of 3 lists: x, y coordinates and visible."
+        )
         points_x, points_y, points_v = points
-        assert (
-            len(points_x) == len(points_y) == len(points_v) == 17
-        ), "Pose3D x, y coordinates and visible must have the same length of 17."
+        assert len(points_x) == len(points_y) == len(points_v) == 17, (
+            "Pose3D x, y coordinates and visible must have the same length of 17."
+        )
         assert all(
             isinstance(value, (int, float))
             for value in [*points_x, *points_y, *points_v]
@@ -22,13 +22,13 @@ class Segment(DataModel):

     @staticmethod
     def from_list(points: list[list[float]], title: str = "") -> "Segment":
-        assert (
-            len(points) == 2
-        ), "Segment must be a list of 2 lists: x and y coordinates."
+        assert len(points) == 2, (
+            "Segment must be a list of 2 lists: x and y coordinates."
+        )
         points_x, points_y = points
-        assert len(points_x) == len(
-            points_y
-        ), "Segment x and y coordinates must have the same length."
+        assert len(points_x) == len(points_y), (
+            "Segment x and y coordinates must have the same length."
+        )
         assert all(
             isinstance(value, (int, float)) for value in [*points_x, *points_y]
         ), "Segment coordinates must be floats or integers."
datachain/progress.py CHANGED
@@ -1,14 +1,5 @@
-"""Manages progress bars."""
-
-import logging
-from threading import RLock
-
 from fsspec import Callback
 from fsspec.callbacks import TqdmCallback
-from tqdm.auto import tqdm
-
-logger = logging.getLogger(__name__)
-tqdm.set_lock(RLock())


 class CombinedDownloadCallback(Callback):
@@ -24,10 +15,6 @@ class CombinedDownloadCallback(Callback):
 class TqdmCombinedDownloadCallback(CombinedDownloadCallback, TqdmCallback):
     def __init__(self, tqdm_kwargs=None, *args, **kwargs):
         self.files_count = 0
-        tqdm_kwargs = tqdm_kwargs or {}
-        tqdm_kwargs.setdefault("postfix", {}).setdefault("files", self.files_count)
-        kwargs = kwargs or {}
-        kwargs["tqdm_cls"] = tqdm
         super().__init__(tqdm_kwargs, *args, **kwargs)

     def increment_file_count(self, n: int = 1) -> None:
@@ -336,15 +336,16 @@ def process_udf_outputs(
     for udf_output in udf_results:
         if not udf_output:
             continue
-        for row in udf_output:
-            cb.relative_update()
-            rows.append(adjust_outputs(warehouse, row, udf_col_types))
-            if len(rows) >= batch_size or (
-                len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
-            ):
-                for row_chunk in batched(rows, batch_size):
-                    warehouse.insert_rows(udf_table, row_chunk)
-                rows.clear()
+        with safe_closing(udf_output):
+            for row in udf_output:
+                cb.relative_update()
+                rows.append(adjust_outputs(warehouse, row, udf_col_types))
+                if len(rows) >= batch_size or (
+                    len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
+                ):
+                    for row_chunk in batched(rows, batch_size):
+                        warehouse.insert_rows(udf_table, row_chunk)
+                    rows.clear()

     if rows:
         for row_chunk in batched(rows, batch_size):
@@ -355,7 +356,7 @@

 def get_download_callback(suffix: str = "", **kwargs) -> CombinedDownloadCallback:
     return TqdmCombinedDownloadCallback(
-        {
+        tqdm_kwargs={
             "desc": "Download" + suffix,
             "unit": "B",
             "unit_scale": True,
@@ -363,6 +364,7 @@ def get_download_callback(suffix: str = "", **kwargs) -> CombinedDownloadCallbac
             "leave": False,
             **kwargs,
         },
+        tqdm_cls=tqdm,
     )


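`get_download_callback` now passes its arguments by keyword, handing `tqdm_cls` straight through to fsspec's `TqdmCallback` instead of injecting it inside the subclass (which, per the progress.py hunks above, no longer sets up a module-level tqdm lock or postfix). A rough sketch of driving such a callback directly, with illustrative values:

from fsspec.callbacks import TqdmCallback
from tqdm.auto import tqdm

# Keyword-only construction, mirroring the style of get_download_callback above.
cb = TqdmCallback(
    tqdm_kwargs={"desc": "Download", "unit": "B", "unit_scale": True, "leave": False},
    tqdm_cls=tqdm,
)
cb.set_size(1_000_000)       # total bytes expected
cb.relative_update(250_000)  # report progress as chunks arrive
cb.close()
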
@@ -873,6 +875,7 @@ class SQLJoin(Step):
     query2: "DatasetQuery"
     predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
     inner: bool
+    full: bool
     rname: str

     def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
@@ -975,14 +978,14 @@ class SQLJoin(Step):
         self.validate_expression(join_expression, q1, q2)

         def q(*columns):
-            join_query = self.catalog.warehouse.join(
+            return self.catalog.warehouse.join(
                 q1,
                 q2,
                 join_expression,
                 inner=self.inner,
+                full=self.full,
+                columns=columns,
             )
-            return sqlalchemy.select(*columns).select_from(join_query)
-            # return sqlalchemy.select(*subquery.c).select_from(subquery)

         return step_result(
             q,
@@ -1487,6 +1490,7 @@ class DatasetQuery:
         dataset_query: "DatasetQuery",
         predicates: Union[JoinPredicateType, Sequence[JoinPredicateType]],
         inner=False,
+        full=False,
         rname="{name}_right",
     ) -> "Self":
         left = self.clone(new_table=False)
@@ -1502,7 +1506,9 @@ class DatasetQuery:
             if isinstance(predicates, (str, ColumnClause, ColumnElement))
             else tuple(predicates)
         )
-        new_query.steps = [SQLJoin(self.catalog, left, right, predicates, inner, rname)]
+        new_query.steps = [
+            SQLJoin(self.catalog, left, right, predicates, inner, full, rname)
+        ]
        return new_query

    @detach
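`SQLJoin` and `DatasetQuery.join` now carry a `full` flag down to `warehouse.join`, which also receives the output columns directly instead of having the step wrap the join in a separate select. A small SQLAlchemy-only sketch of the SQL shape a full (outer) join produces; table and column names are invented for illustration, and the exact statement datachain builds may differ:

import sqlalchemy as sa

left = sa.table("left_t", sa.column("id"), sa.column("x"))
right = sa.table("right_t", sa.column("id"), sa.column("y"))

# full=True renders a FULL OUTER JOIN, keeping rows from both sides.
stmt = sa.select(left.c.id, left.c.x, right.c.y).select_from(
    left.join(right, left.c.id == right.c.id, full=True)
)
print(stmt)
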
@@ -75,7 +75,7 @@ class StudioClient:

         if not token:
             raise DataChainError(
-                "Studio token is not set. Use `datachain studio login` "
+                "Studio token is not set. Use `datachain auth login` "
                 "or environment variable `DVC_STUDIO_TOKEN` to set it."
             )

@@ -105,7 +105,7 @@ class StudioClient:
         if not team:
             raise DataChainError(
                 "Studio team is not set. "
-                "Use `datachain studio team <team_name>` "
+                "Use `datachain auth team <team_name>` "
                 "or environment variable `DVC_STUDIO_TEAM` to set it."
                 "You can also set it in the config file as team under studio."
             )
@@ -4,6 +4,7 @@ import sqlite3
 import warnings
 from collections.abc import Iterable
 from datetime import MAXYEAR, MINYEAR, datetime, timezone
+from functools import cache
 from types import MappingProxyType
 from typing import Callable, Optional

@@ -526,24 +527,44 @@ def compile_collect(element, compiler, **kwargs):
     return compiler.process(func.json_group_array(*element.clauses.clauses), **kwargs)


-def load_usearch_extension(conn: sqlite3.Connection) -> bool:
+@cache
+def usearch_sqlite_path() -> Optional[str]:
     try:
-        # usearch is part of the vector optional dependencies
-        # we use the extension's cosine and euclidean distance functions
-        from usearch import sqlite_path
+        import usearch
+    except ImportError:
+        return None

-        conn.enable_load_extension(True)
+    with warnings.catch_warnings():
+        # usearch binary is not available for Windows, see: https://github.com/unum-cloud/usearch/issues/427.
+        # and, sometimes fail to download the binary in other platforms
+        # triggering UserWarning.

-        with warnings.catch_warnings():
-            # usearch binary is not available for Windows, see: https://github.com/unum-cloud/usearch/issues/427.
-            # and, sometimes fail to download the binary in other platforms
-            # triggering UserWarning.
+        warnings.filterwarnings("ignore", category=UserWarning, module="usearch")

-            warnings.filterwarnings("ignore", category=UserWarning, module="usearch")
-            conn.load_extension(sqlite_path())
+        try:
+            return usearch.sqlite_path()
+        except FileNotFoundError:
+            return None

-        conn.enable_load_extension(False)
-        return True

-    except Exception:  # noqa: BLE001
+def load_usearch_extension(conn: sqlite3.Connection) -> bool:
+    # usearch is part of the vector optional dependencies
+    # we use the extension's cosine and euclidean distance functions
+    ext_path = usearch_sqlite_path()
+    if ext_path is None:
+        return False
+
+    try:
+        conn.enable_load_extension(True)
+    except AttributeError:
+        # sqlite3 module is not built with loadable extension support by default.
+        return False
+
+    try:
+        conn.load_extension(ext_path)
+    except sqlite3.OperationalError:
         return False
+    else:
+        return True
+    finally:
+        conn.enable_load_extension(False)
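`load_usearch_extension` is split so that locating the usearch SQLite binary is a separate, `functools.cache`-memoized step, and the broad `except Exception` gives way to targeted handling (ImportError, FileNotFoundError, AttributeError, sqlite3.OperationalError) that still returns `False` when loading is impossible. A quick usage sketch; the import path is an assumption, as it is not shown in this diff:

import sqlite3

from datachain.sql.sqlite.base import load_usearch_extension  # assumed module path

conn = sqlite3.connect(":memory:")
# Returns False rather than raising when usearch or extension loading is unavailable.
print("usearch extension loaded:", load_usearch_extension(conn))
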
datachain/studio.py CHANGED
@@ -1,9 +1,8 @@
 import asyncio
 import os
+import sys
 from typing import TYPE_CHECKING, Optional

-from tabulate import tabulate
-
 from datachain.catalog.catalog import raise_remote_error
 from datachain.config import Config, ConfigLevel
 from datachain.dataset import QUERY_DATASET_PREFIX
@@ -21,6 +20,13 @@ POST_LOGIN_MESSAGE = (


 def process_jobs_args(args: "Namespace"):
+    if args.cmd is None:
+        print(
+            f"Use 'datachain {args.command} --help' to see available options",
+            file=sys.stderr,
+        )
+        return 1
+
     if args.cmd == "run":
         return create_job(
             args.query_file,
@@ -41,20 +47,20 @@ def process_jobs_args(args: "Namespace"):
     raise DataChainError(f"Unknown command '{args.cmd}'.")


-def process_studio_cli_args(args: "Namespace"):
+def process_auth_cli_args(args: "Namespace"):
+    if args.cmd is None:
+        print(
+            f"Use 'datachain {args.command} --help' to see available options",
+            file=sys.stderr,
+        )
+        return 1
+
     if args.cmd == "login":
         return login(args)
     if args.cmd == "logout":
         return logout()
     if args.cmd == "token":
         return token()
-    if args.cmd == "dataset":
-        rows = [
-            {"Name": name, "Version": version}
-            for name, version in list_datasets(args.team)
-        ]
-        print(tabulate(rows, headers="keys"))
-        return 0

     if args.cmd == "team":
         return set_team(args)
@@ -89,7 +95,7 @@ def login(args: "Namespace"):
         raise DataChainError(
             "Token already exists. "
             "To login with a different token, "
-            "logout using `datachain studio logout`."
+            "logout using `datachain auth logout`."
         )

     open_browser = not args.no_open
@@ -115,12 +121,12 @@ def logout():
     token = conf.get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain studio login'."
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )

     del conf["studio"]["token"]

-    print("Logged out from Studio. (you can log back in with 'datachain studio login')")
+    print("Logged out from Studio. (you can log back in with 'datachain auth login')")


 def token():
@@ -128,7 +134,7 @@ def token():
     token = config.get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain studio login'."
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )

     print(token)
@@ -293,7 +299,7 @@ def cancel_job(job_id: str, team_name: Optional[str]):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain studio login'."
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )

     client = StudioClient(team=team_name)
@@ -308,7 +314,7 @@ def show_job_logs(job_id: str, team_name: Optional[str]):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain studio login'."
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )

     client = StudioClient(team=team_name)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: datachain
-Version: 0.8.8
+Version: 0.8.10
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -99,7 +99,7 @@ Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
 Requires-Dist: pdfplumber==0.11.5; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.58; extra == "examples"
+Requires-Dist: ultralytics==8.3.61; extra == "examples"

 ================
 |logo| DataChain
@@ -189,13 +189,14 @@ Python code:

 .. code:: py

+    import os
     from mistralai import Mistral
     from datachain import File, DataChain, Column

     PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."

     def eval_dialogue(file: File) -> bool:
-        client = Mistral()
+        client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
         response = client.chat.complete(
             model="open-mixtral-8x22b",
             messages=[{"role": "system", "content": PROMPT},