datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/query/queue.py
CHANGED
@@ -1,13 +1,14 @@
 import datetime
 from collections.abc import Iterable, Iterator
-from queue import Empty, Full
+from queue import Empty, Full
 from struct import pack, unpack
 from time import sleep
 from typing import Any
 
 import msgpack
+from multiprocess.queues import Queue
 
-from datachain.query.batch import RowsOutput
+from datachain.query.batch import RowsOutput
 
 DEFAULT_BATCH_SIZE = 10000
 STOP_SIGNAL = "STOP"
@@ -56,7 +57,6 @@ def put_into_queue(queue: Queue, item: Any) -> None:
 
 
 MSGPACK_EXT_TYPE_DATETIME = 42
-MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH = 43
 
 
 def _msgpack_pack_extended_types(obj: Any) -> msgpack.ExtType:
@@ -70,12 +70,6 @@ def _msgpack_pack_extended_types(obj: Any) -> msgpack.ExtType:
         data = (obj.timestamp(),)  # type: ignore # noqa: PGH003
         return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!d", *data))
 
-    if isinstance(obj, RowsOutputBatch):
-        return msgpack.ExtType(
-            MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH,
-            msgpack_pack(obj.rows),
-        )
-
     raise TypeError(f"Unknown type: {obj}")
 
 
@@ -100,9 +94,6 @@ def _msgpack_unpack_extended_types(code: int, data: bytes) -> Any:
         tz_info = datetime.timezone(datetime.timedelta(seconds=timezone_offset))
         return datetime.datetime.fromtimestamp(timestamp, tz=tz_info)
 
-    if code == MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH:
-        return RowsOutputBatch(msgpack_unpack(data))
-
     return msgpack.ExtType(code, data)
 
 
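The change drops the custom msgpack extension for RowsOutputBatch (ext code 43), leaving only the datetime extension (ext code 42). For context, here is a minimal sketch of how such an extension type round-trips through msgpack — simplified to UTC-only timestamps and using hypothetical helper names, not the package's exact implementation (which also encodes the timezone offset):

import datetime
from struct import pack, unpack

import msgpack

MSGPACK_EXT_TYPE_DATETIME = 42


def _default(obj):
    # Serialize timezone-aware datetimes as an 8-byte big-endian double.
    if isinstance(obj, datetime.datetime):
        return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!d", obj.timestamp()))
    raise TypeError(f"Unknown type: {obj}")


def _ext_hook(code, data):
    # Restore ext code 42 to a datetime; leave unknown codes as raw ExtType.
    if code == MSGPACK_EXT_TYPE_DATETIME:
        (timestamp,) = unpack("!d", data)
        return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
    return msgpack.ExtType(code, data)


now = datetime.datetime.now(datetime.timezone.utc)
packed = msgpack.packb(now, default=_default)
print(msgpack.unpackb(packed, ext_hook=_ext_hook))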
datachain/query/schema.py
CHANGED
@@ -1,7 +1,8 @@
 import functools
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from fnmatch import fnmatch
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any
 
 import attrs
 import sqlalchemy as sa
@@ -36,9 +37,13 @@ class ColumnMeta(type):
     def __getattr__(cls, name: str):
         return cls(ColumnMeta.to_db_name(name))
 
+    @staticmethod
+    def is_nested(name: str) -> bool:
+        return DEFAULT_DELIMITER in name
+
 
 class Column(sa.ColumnClause, metaclass=ColumnMeta):
-    inherit_cache:
+    inherit_cache: bool | None = True
 
     def __init__(self, text, type_=None, is_literal=False, _selectable=None):
         """Dataset column."""
@@ -173,7 +178,7 @@ class LocalFilename(UDFParameter):
     otherwise None will be returned.
     """
 
-    glob:
+    glob: str | None = None
 
     def get_value(
         self,
@@ -182,7 +187,7 @@ class LocalFilename(UDFParameter):
         *,
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
-    ) ->
+    ) -> str | None:
         if self.glob and not fnmatch(row["name"], self.glob):  # type: ignore[type-var]
             # If the glob pattern is specified and the row filename
             # does not match it, then return None
@@ -201,7 +206,7 @@ class LocalFilename(UDFParameter):
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
-    ) ->
+    ) -> str | None:
         if self.glob and not fnmatch(row["name"], self.glob):  # type: ignore[type-var]
             # If the glob pattern is specified and the row filename
             # does not match it, then return None
@@ -212,7 +217,7 @@ class LocalFilename(UDFParameter):
         return client.cache.get_path(file)
 
 
-UDFParamSpec =
+UDFParamSpec = str | Column | UDFParameter
 
 
 def normalize_param(param: UDFParamSpec) -> UDFParameter:
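Besides migrating annotations to PEP 604 unions (str | None in place of the truncated older forms), the change adds ColumnMeta.is_nested(). A hedged, standalone sketch of the metaclass pattern — Column here is a plain str subclass and DEFAULT_DELIMITER is assumed to be "__", both for illustration only:

DEFAULT_DELIMITER = "__"  # assumed value, for illustration


class ColumnMeta(type):
    @staticmethod
    def to_db_name(name: str) -> str:
        # Flatten a dotted attribute path into a single DB column name.
        return name.replace(".", DEFAULT_DELIMITER)

    @staticmethod
    def is_nested(name: str) -> bool:
        return DEFAULT_DELIMITER in name

    def __getattr__(cls, name: str):
        # Unknown class attributes become column instances:
        # Column.file -> Column("file")
        return cls(ColumnMeta.to_db_name(name))


class Column(str, metaclass=ColumnMeta):
    """Stand-in for the real sa.ColumnClause-based Column."""


print(Column.file)                     # "file"
print(Column.is_nested("file__path"))  # True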
datachain/query/session.py
CHANGED
@@ -1,21 +1,37 @@
 import atexit
-import gc
 import logging
+import os
 import re
 import sys
-
+import traceback
+from collections.abc import Callable
+from typing import TYPE_CHECKING, ClassVar
 from uuid import uuid4
+from weakref import WeakSet
 
 from datachain.catalog import get_catalog
-from datachain.
+from datachain.data_storage import JobQueryType, JobStatus
+from datachain.error import JobNotFoundError, TableMissingError
 
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
-    from datachain.
+    from datachain.job import Job
 
 logger = logging.getLogger("datachain")
 
 
+def is_script_run() -> bool:
+    """
+    Returns True if this was run as a Python script, e.g. python my_script.py.
+    Otherwise (if interactive or module run) returns False.
+    """
+    try:
+        argv0 = sys.argv[0]
+    except (IndexError, AttributeError):
+        return False
+    return bool(argv0) and argv0 not in ("-c", "-m", "ipython")
+
+
 class Session:
     """
     Session is a context that keeps track of temporary DataChain datasets for a proper
@@ -39,10 +55,18 @@ class Session:
         catalog (Catalog): Catalog object.
     """
 
-    GLOBAL_SESSION_CTX:
+    GLOBAL_SESSION_CTX: "Session | None" = None
     SESSION_CONTEXTS: ClassVar[list["Session"]] = []
+    _ALL_SESSIONS: ClassVar[WeakSet["Session"]] = WeakSet()
     ORIGINAL_EXCEPT_HOOK = None
 
+    # Job management - class-level to ensure one job per process
+    _CURRENT_JOB: ClassVar["Job | None"] = None
+    _JOB_STATUS: ClassVar[JobStatus | None] = None
+    _OWNS_JOB: ClassVar[bool | None] = None
+    _JOB_HOOKS_REGISTERED: ClassVar[bool] = False
+    _JOB_FINALIZE_HOOK: ClassVar[Callable[[], None] | None] = None
+
     DATASET_PREFIX = "session_"
     GLOBAL_SESSION_NAME = "global"
     SESSION_UUID_LEN = 6
@@ -51,8 +75,8 @@ class Session:
     def __init__(
         self,
        name="",
-        catalog:
-        client_config:
+        catalog: "Catalog | None" = None,
+        client_config: dict | None = None,
         in_memory: bool = False,
     ):
         if re.match(r"^[0-9a-zA-Z]*$", name) is None:
@@ -69,7 +93,7 @@ class Session:
         self.catalog = catalog or get_catalog(
             client_config=client_config, in_memory=in_memory
         )
-        self
+        Session._ALL_SESSIONS.add(self)
 
     def __enter__(self):
         # Push the current context onto the stack
@@ -78,9 +102,8 @@ class Session:
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-
-
-
+        # Don't cleanup created versions on exception
+        # Datasets should persist even if the session fails
         self._cleanup_temp_datasets()
         if self.is_new_catalog:
             self.catalog.metastore.close_on_exit()
@@ -88,11 +111,116 @@ class Session:
 
         if Session.SESSION_CONTEXTS:
             Session.SESSION_CONTEXTS.pop()
+        Session._ALL_SESSIONS.discard(self)
 
-    def
-
-
+    def get_or_create_job(self) -> "Job":
+        """
+        Get or create a Job for this process.
+
+        Returns:
+            Job: The active Job instance.
+
+        Behavior:
+        - If a job already exists, it is returned.
+        - If ``DATACHAIN_JOB_ID`` is set, the corresponding job is fetched.
+        - Otherwise, a new job is created:
+            * Name = absolute path to the Python script.
+            * Query = empty string.
+            * Parent = last job with the same name, if available.
+            * Status = "running".
+        Exit hooks are registered to finalize the job.
+
+        Note:
+            Job is shared across all Session instances to ensure one job per process.
+        """
+        if Session._CURRENT_JOB:
+            return Session._CURRENT_JOB
+
+        if env_job_id := os.getenv("DATACHAIN_JOB_ID"):
+            # SaaS run: just fetch existing job
+            Session._CURRENT_JOB = self.catalog.metastore.get_job(env_job_id)
+            if not Session._CURRENT_JOB:
+                raise JobNotFoundError(
+                    f"Job {env_job_id} from DATACHAIN_JOB_ID env not found"
+                )
+            Session._OWNS_JOB = False
+        else:
+            # Local run: create new job
+            if is_script_run():
+                script = os.path.abspath(sys.argv[0])
+            else:
+                # Interactive session or module run - use unique name to avoid
+                # linking unrelated sessions
+                script = str(uuid4())
+            python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+
+            # try to find the parent job
+            parent = self.catalog.metastore.get_last_job_by_name(script)
+
+            job_id = self.catalog.metastore.create_job(
+                name=script,
+                query="",
+                query_type=JobQueryType.PYTHON,
+                status=JobStatus.RUNNING,
+                python_version=python_version,
+                parent_job_id=parent.id if parent else None,
+            )
+            Session._CURRENT_JOB = self.catalog.metastore.get_job(job_id)
+            Session._OWNS_JOB = True
+        Session._JOB_STATUS = JobStatus.RUNNING
+
+        # register cleanup hooks only once
+        if not Session._JOB_HOOKS_REGISTERED:
+
+            def _finalize_success_hook() -> None:
+                self._finalize_job_success()
+
+            Session._JOB_FINALIZE_HOOK = _finalize_success_hook
+            atexit.register(Session._JOB_FINALIZE_HOOK)
+            Session._JOB_HOOKS_REGISTERED = True
+
+        assert Session._CURRENT_JOB is not None
+        return Session._CURRENT_JOB
+
+    def _finalize_job_success(self):
+        """Mark the current job as completed."""
+        if (
+            Session._CURRENT_JOB
+            and Session._OWNS_JOB
+            and Session._JOB_STATUS == JobStatus.RUNNING
+        ):
+            self.catalog.metastore.set_job_status(
+                Session._CURRENT_JOB.id, JobStatus.COMPLETE
+            )
+            Session._JOB_STATUS = JobStatus.COMPLETE
+
+    def _finalize_job_as_canceled(self):
+        """Mark the current job as canceled."""
+        if (
+            Session._CURRENT_JOB
+            and Session._OWNS_JOB
+            and Session._JOB_STATUS == JobStatus.RUNNING
+        ):
+            self.catalog.metastore.set_job_status(
+                Session._CURRENT_JOB.id, JobStatus.CANCELED
+            )
+            Session._JOB_STATUS = JobStatus.CANCELED
+
+    def _finalize_job_as_failed(self, exc_type, exc_value, tb):
+        """Mark the current job as failed with error details."""
+        if (
+            Session._CURRENT_JOB
+            and Session._OWNS_JOB
+            and Session._JOB_STATUS == JobStatus.RUNNING
+        ):
+            error_stack = "".join(traceback.format_exception(exc_type, exc_value, tb))
+            self.catalog.metastore.set_job_status(
+                Session._CURRENT_JOB.id,
+                JobStatus.FAILED,
+                error_message=str(exc_value),
+                error_stack=error_stack,
+            )
+            Session._JOB_STATUS = JobStatus.FAILED
 
     def generate_temp_dataset_name(self) -> str:
         return self.get_temp_prefix() + uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
@@ -100,31 +228,25 @@ class Session:
     def get_temp_prefix(self) -> str:
         return f"{self.DATASET_PREFIX}{self.name}_"
 
+    @classmethod
+    def is_temp_dataset(cls, name) -> bool:
+        return name.startswith(cls.DATASET_PREFIX)
+
    def _cleanup_temp_datasets(self) -> None:
         prefix = self.get_temp_prefix()
         try:
             for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
-                self.catalog.remove_dataset(dataset.name, force=True)
+                self.catalog.remove_dataset(dataset.name, dataset.project, force=True)
         # suppress error when metastore has been reset during testing
         except TableMissingError:
             pass
 
-    def _cleanup_created_versions(self) -> None:
-        if not self.dataset_versions:
-            return
-
-        for dataset, version, listing in self.dataset_versions:
-            if not listing:
-                self.catalog.remove_dataset_version(dataset, version)
-
-        self.dataset_versions.clear()
-
     @classmethod
     def get(
         cls,
-        session:
-        catalog:
-        client_config:
+        session: "Session | None" = None,
+        catalog: "Catalog | None" = None,
+        client_config: dict | None = None,
         in_memory: bool = False,
     ) -> "Session":
         """Creates a Session() object from a catalog.
@@ -169,27 +291,72 @@ class Session:
 
     @staticmethod
     def except_hook(exc_type, exc_value, exc_traceback):
-        Session.GLOBAL_SESSION_CTX
+        if Session.GLOBAL_SESSION_CTX:
+            # Handle KeyboardInterrupt specially - mark as canceled and exit with
+            # signal code
+            if exc_type is KeyboardInterrupt:
+                Session.GLOBAL_SESSION_CTX._finalize_job_as_canceled()
+            else:
+                Session.GLOBAL_SESSION_CTX._finalize_job_as_failed(
+                    exc_type, exc_value, exc_traceback
+                )
+            Session.GLOBAL_SESSION_CTX.__exit__(exc_type, exc_value, exc_traceback)
+
         Session._global_cleanup()
 
+        # Always delegate to original hook if it exists
         if Session.ORIGINAL_EXCEPT_HOOK:
             Session.ORIGINAL_EXCEPT_HOOK(exc_type, exc_value, exc_traceback)
 
+        if exc_type is KeyboardInterrupt:
+            # Exit with SIGINT signal code (128 + 2 = 130, or -2 in subprocess terms)
+            sys.exit(130)
+
     @classmethod
     def cleanup_for_tests(cls):
+        cls._close_all_contexts()
         if cls.GLOBAL_SESSION_CTX is not None:
             cls.GLOBAL_SESSION_CTX.__exit__(None, None, None)
             cls.GLOBAL_SESSION_CTX = None
             atexit.unregister(cls._global_cleanup)
 
+        # Reset job-related class variables
+        if cls._JOB_FINALIZE_HOOK:
+            try:
+                atexit.unregister(cls._JOB_FINALIZE_HOOK)
+            except ValueError:
+                pass  # Hook was not registered
+        cls._CURRENT_JOB = None
+        cls._JOB_STATUS = None
+        cls._OWNS_JOB = None
+        cls._JOB_HOOKS_REGISTERED = False
+        cls._JOB_FINALIZE_HOOK = None
+
         if cls.ORIGINAL_EXCEPT_HOOK:
             sys.excepthook = cls.ORIGINAL_EXCEPT_HOOK
 
     @staticmethod
     def _global_cleanup():
+        Session._close_all_contexts()
         if Session.GLOBAL_SESSION_CTX is not None:
             Session.GLOBAL_SESSION_CTX.__exit__(None, None, None)
 
-        for
-
-
+        for session in list(Session._ALL_SESSIONS):
+            try:
+                session.__exit__(None, None, None)
+            except ReferenceError:
+                continue  # Object has been finalized already
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Exception while cleaning up session: {e}")  # noqa: G004
+
+    @classmethod
+    def _close_all_contexts(cls) -> None:
+        while cls.SESSION_CONTEXTS:
+            session = cls.SESSION_CONTEXTS.pop()
+            try:
+                session.__exit__(None, None, None)
+            except Exception as exc:  # noqa: BLE001
+                logger.error(
+                    "Exception while closing session context during cleanup: %s",
+                    exc,
+                )
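The bulk of the new session code gives each process a single tracked job: created lazily (or fetched from DATACHAIN_JOB_ID under SaaS), marked complete via an atexit hook, and marked canceled or failed via the exception hook. A simplified, self-contained sketch of that finalization pattern — plain module globals stand in for the metastore-backed job record:

import atexit
import sys
import traceback

_job_status = "running"  # hypothetical stand-in for the persisted job status


def _finalize_success() -> None:
    # atexit hook: a job still "running" at normal interpreter exit completed.
    global _job_status
    if _job_status == "running":
        _job_status = "complete"


def _except_hook(exc_type, exc_value, exc_tb):
    # Uncaught exception: mark the job canceled (Ctrl-C) or failed, keep the
    # formatted traceback, then exit with the SIGINT convention (128 + 2).
    global _job_status
    if _job_status == "running":
        _job_status = "canceled" if exc_type is KeyboardInterrupt else "failed"
        sys.stderr.write("".join(traceback.format_exception(exc_type, exc_value, exc_tb)))
    if exc_type is KeyboardInterrupt:
        sys.exit(130)


atexit.register(_finalize_success)
sys.excepthook = _except_hook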
datachain/query/udf.py
CHANGED
@@ -1,8 +1,11 @@
-from
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, TypedDict
 
 if TYPE_CHECKING:
     from sqlalchemy import Select, Table
 
+    from datachain.catalog import Catalog
     from datachain.query.batch import BatchingStrategy
 
 
@@ -15,6 +18,35 @@ class UdfInfo(TypedDict):
     query: "Select"
     udf_fields: list[str]
     batching: "BatchingStrategy"
-    processes:
+    processes: int | None
     is_generator: bool
     cache: bool
+    rows_total: int
+    batch_size: int
+
+
+class AbstractUDFDistributor(ABC):
+    @abstractmethod
+    def __init__(
+        self,
+        catalog: "Catalog",
+        table: "Table",
+        query: "Select",
+        udf_data: bytes,
+        batching: "BatchingStrategy",
+        workers: bool | int,
+        processes: bool | int,
+        udf_fields: list[str],
+        rows_total: int,
+        use_cache: bool,
+        is_generator: bool = False,
+        min_task_size: str | int | None = None,
+        batch_size: int | None = None,
+    ) -> None: ...
+
+    @abstractmethod
+    def __call__(self) -> None: ...
+
+    @staticmethod
+    @abstractmethod
+    def run_udf() -> int: ...
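The new AbstractUDFDistributor defines the contract a distributed executor must satisfy. A hypothetical no-op subclass, purely to illustrate the three required members (the real abstract signature is in the diff above):

from datachain.query.udf import AbstractUDFDistributor


class NoopUDFDistributor(AbstractUDFDistributor):
    def __init__(
        self,
        catalog,
        table,
        query,
        udf_data,
        batching,
        workers,
        processes,
        udf_fields,
        rows_total,
        use_cache,
        is_generator=False,
        min_task_size=None,
        batch_size=None,
    ) -> None:
        # A real distributor would store what it needs to fan UDF batches
        # out to workers/processes according to the batching strategy.
        self.rows_total = rows_total

    def __call__(self) -> None:
        # Dispatch the distributed run and wait for completion; no-op here.
        pass

    @staticmethod
    def run_udf() -> int:
        # Worker entry point; returns the worker's exit code.
        return 0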
|