datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/cli/parser/utils.py
CHANGED
@@ -1,5 +1,4 @@
 from argparse import Action, ArgumentParser, ArgumentTypeError, HelpFormatter
-from typing import Union

 from datachain.cli.utils import CommaSeparatedArgs

@@ -44,7 +43,7 @@ def parse_find_column(column: str) -> str:
     )


-def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Action:
+def add_sources_arg(parser: ArgumentParser, nargs: str | int = "+") -> Action:
     return parser.add_argument(
         "sources",
         type=str,
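Most of the per-file changes below follow this same mechanical pattern: `Optional[X]` and `Union[A, B]` annotations are rewritten to PEP 604 `X | None` / `A | B` syntax, and the now-unused `typing` imports are dropped. A minimal sketch of the equivalence (illustrative, not from the package; requires Python 3.10+):

```python
from typing import Optional, Union

# PEP 604 unions compare equal to the typing constructs they replace,
# so annotations like `nargs: str | int` are drop-in replacements.
assert (str | None) == Optional[str]
assert (str | int) == Union[str, int]
```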
datachain/cli/utils.py
CHANGED
@@ -1,6 +1,5 @@
 import logging
-from argparse import SUPPRESS, Action, ArgumentError, Namespace, _AppendAction
-from typing import Optional
+from argparse import SUPPRESS, Action, Namespace, _AppendAction

 from datachain.error import DataChainError

@@ -64,18 +63,6 @@ class CommaSeparatedArgs(_AppendAction):  # pylint: disable=protected-access
         setattr(namespace, self.dest, list(dict.fromkeys(items)))


-class KeyValueArgs(_AppendAction):  # pylint: disable=protected-access
-    def __call__(self, parser, namespace, values, option_string=None):
-        items = getattr(namespace, self.dest) or {}
-        for raw_value in filter(bool, values):
-            key, sep, value = raw_value.partition("=")
-            if not key or not sep or value == "":
-                raise ArgumentError(self, f"expected 'key=value', got {raw_value!r}")
-            items[key.strip()] = value
-
-        setattr(namespace, self.dest, items)
-
-
 def get_logging_level(args: Namespace) -> int:
     if args.quiet:
         return logging.CRITICAL
@@ -84,7 +71,7 @@ def get_logging_level(args: Namespace) -> int:
     return logging.INFO


-def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
+def determine_flavors(studio: bool, local: bool, all: bool, token: str | None):
    if studio and not token:
        raise DataChainError(
            "Not logged in to Studio. Log in with 'datachain auth login'."
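The removed `KeyValueArgs` action parsed repeated `key=value` options with `str.partition`. For reference, a standalone sketch of the same pattern (the `--param` option name is hypothetical):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--param", nargs="*", default=[])
args = parser.parse_args(["--param", "epochs=5", "lr=0.1"])

params: dict[str, str] = {}
for raw in args.param:
    key, sep, value = raw.partition("=")  # "epochs=5" -> ("epochs", "=", "5")
    if not key or not sep or value == "":
        parser.error(f"expected 'key=value', got {raw!r}")
    params[key.strip()] = value

print(params)  # {'epochs': '5', 'lr': '0.1'}
```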
datachain/client/azure.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Optional
+from typing import Any
 from urllib.parse import parse_qs, urlsplit, urlunsplit

 from adlfs import AzureBlobFileSystem
@@ -73,7 +73,7 @@ class AzureClient(Client):
         result_queue.put_nowait(None)

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         parts = list(urlsplit(path))
         query = parse_qs(parts[3])
         if "versionid" in query:
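`version_path` pins an Azure blob version by rewriting the `versionid` query parameter. A sketch of that URL manipulation with the same `urllib.parse` helpers (the `with_versionid` function is illustrative, not the method body):

```python
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

def with_versionid(path: str, version_id: str | None) -> str:
    parts = list(urlsplit(path))  # [scheme, netloc, path, query, fragment]
    query = parse_qs(parts[3])
    if version_id:
        query["versionid"] = [version_id]
    parts[3] = urlencode(query, doseq=True)
    return urlunsplit(parts)

print(with_versionid("az://container/blob.bin", "2024-01-01T00:00:00.0000000Z"))
# az://container/blob.bin?versionid=2024-01-01T00%3A00%3A00.0000000Z
```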
datachain/client/fsspec.py
CHANGED
@@ -10,15 +10,7 @@ from abc import ABC, abstractmethod
 from collections.abc import AsyncIterator, Iterator, Sequence
 from datetime import datetime
 from shutil import copy2
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    BinaryIO,
-    ClassVar,
-    NamedTuple,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, NamedTuple
 from urllib.parse import urlparse

 from dvc_objects.fs.system import reflink
@@ -44,11 +36,12 @@ FETCH_WORKERS = 100
 DELIMITER = "/"  # Path delimiter.

 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
+CLOUD_STORAGE_PROTOCOLS = {"s3", "gs", "az", "hf"}

-ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
+ResultQueue = asyncio.Queue[Sequence["File"] | None]


-def _is_win_local_path(uri: str) -> bool:
+def is_win_local_path(uri: str) -> bool:
     if sys.platform == "win32":
         if len(uri) >= 1 and uri[0] == "\\":
             return True
@@ -62,10 +55,20 @@ def _is_win_local_path(uri: str) -> bool:
     return False


+def is_cloud_uri(uri: str) -> bool:
+    protocol = urlparse(uri).scheme
+    return protocol in CLOUD_STORAGE_PROTOCOLS
+
+
+def get_cloud_schemes() -> list[str]:
+    """Get list of cloud storage scheme prefixes."""
+    return [f"{p}://" for p in CLOUD_STORAGE_PROTOCOLS]
+
+
 class Bucket(NamedTuple):
     name: str
     uri: "StorageURI"
-    created: Optional[datetime]
+    created: datetime | None


 class Client(ABC):
@@ -77,21 +80,22 @@ class Client(ABC):
     def __init__(self, name: str, fs_kwargs: dict[str, Any], cache: Cache) -> None:
         self.name = name
         self.fs_kwargs = fs_kwargs
-        self._fs: Optional[AbstractFileSystem] = None
+        self._fs: AbstractFileSystem | None = None
         self.cache = cache
         self.uri = self.get_uri(self.name)

     @staticmethod
-    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
+    def get_implementation(url: str | os.PathLike[str]) -> type["Client"]:  # noqa: PLR0911
         from .azure import AzureClient
         from .gcs import GCSClient
         from .hf import HfClient
+        from .http import HTTPClient, HTTPSClient
         from .local import FileClient
         from .s3 import ClientS3

         protocol = urlparse(os.fspath(url)).scheme

-        if not protocol or _is_win_local_path(os.fspath(url)):
+        if not protocol or is_win_local_path(os.fspath(url)):
             return FileClient
         if protocol == ClientS3.protocol:
             return ClientS3
@@ -103,9 +107,18 @@ class Client(ABC):
             return FileClient
         if protocol == HfClient.protocol:
             return HfClient
+        if protocol == HTTPClient.protocol:
+            return HTTPClient
+        if protocol == HTTPSClient.protocol:
+            return HTTPSClient

         raise NotImplementedError(f"Unsupported protocol: {protocol}")

+    @classmethod
+    def path_to_uri(cls, path: str) -> str:
+        """Convert a path-like object to a URI. Default: identity."""
+        return path
+
     @staticmethod
     def is_data_source_uri(name: str) -> bool:
         # Returns True if name is one of supported data sources URIs, e.g s3 bucket
@@ -118,9 +131,7 @@ class Client(ABC):
         return cls.get_uri(storage_name), rel_path

     @staticmethod
-    def get_client(
-        source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
-    ) -> "Client":
+    def get_client(source: str | os.PathLike[str], cache: Cache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
         storage_url, _ = cls.split_url(os.fspath(source))
         if os.name == "nt":
@@ -136,7 +147,7 @@ class Client(ABC):
         return fs

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         return path

     @classmethod
@@ -216,16 +227,16 @@ class Client(ABC):
         )
         return self.info_to_file(info, file_path).etag

-    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+    def get_file_info(self, path: str, version_id: str | None = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
         return self.info_to_file(info, path)

-    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+    async def get_size(self, path: str, version_id: str | None = None) -> int:
         return await self.fs._size(
             self.version_path(path, version_id), version_id=version_id
         )

-    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
         return await self.fs._get_file(
             self.version_path(lpath, version_id),
             rpath,
@@ -339,7 +350,7 @@ class Client(ABC):
     def rel_path(self, path: str) -> str:
         return self.fs.split_path(path)[1]

-    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+    def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
         return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)

     @abstractmethod
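Based on the hunks above, the new module-level helpers classify URIs by scheme; note that http(s) is deliberately not in `CLOUD_STORAGE_PROTOCOLS`, since it is routed to its own client (next section). A usage sketch:

```python
from datachain.client.fsspec import get_cloud_schemes, is_cloud_uri

print(is_cloud_uri("s3://bucket/key"))        # True
print(is_cloud_uri("https://example.com/x"))  # False: handled by the new HTTP client
print(sorted(get_cloud_schemes()))            # ['az://', 'gs://', 'hf://', 's3://']
```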
datachain/client/gcs.py
CHANGED
@@ -3,7 +3,7 @@ import json
 import os
 from collections.abc import Iterable
 from datetime import datetime
-from typing import Any, Optional, cast
+from typing import Any, cast

 from dateutil.parser import isoparse
 from gcsfs import GCSFileSystem
@@ -15,7 +15,7 @@ from .fsspec import DELIMITER, Client, ResultQueue

 # Patch gcsfs for consistency with s3fs
 GCSFileSystem.set_session = GCSFileSystem._set_session
-PageQueue = asyncio.Queue[Optional[Iterable[dict[str, Any]]]]
+PageQueue = asyncio.Queue[Iterable[dict[str, Any]] | None]


 class GCSClient(Client):
@@ -141,5 +141,5 @@ class GCSClient(Client):
         )

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         return f"{path}#{version_id}" if version_id else path
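Unlike S3 and Azure, which carry the version in a query parameter, GCS pins an object generation with a URL fragment:

```python
from datachain.client.gcs import GCSClient

print(GCSClient.version_path("gs://bucket/obj.csv", "1712345678901234"))
# gs://bucket/obj.csv#1712345678901234
print(GCSClient.version_path("gs://bucket/obj.csv", None))
# gs://bucket/obj.csv
```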
datachain/client/http.py
ADDED
@@ -0,0 +1,157 @@
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, ClassVar, cast
+from urllib.parse import urlparse
+
+from fsspec.implementations.http import HTTPFileSystem
+
+from datachain.dataset import StorageURI
+from datachain.lib.file import File
+
+from .fsspec import Client
+
+if TYPE_CHECKING:
+    from datachain.cache import Cache
+
+
+class HTTPClient(Client):
+    FS_CLASS = HTTPFileSystem
+    PREFIX: ClassVar[str] = "http://"
+    protocol: ClassVar[str] = "http"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HTTPFileSystem:
+        # Configure HTTPFileSystem options
+        kwargs.setdefault("simple_links", True)
+        kwargs.setdefault("same_scheme", True)
+        kwargs.setdefault("cache_type", "bytes")
+
+        kwargs.pop("version_aware", None)
+
+        fs = cls.FS_CLASS(**kwargs)
+        fs.invalidate_cache()
+        return cast("HTTPFileSystem", fs)
+
+    @classmethod
+    def from_name(
+        cls,
+        name: str,
+        cache: "Cache",
+        kwargs: dict[str, Any],
+    ) -> "HTTPClient":
+        parsed = urlparse(name)
+
+        if parsed.scheme:
+            name = parsed.netloc + parsed.path
+
+        return cls(name, kwargs, cache)
+
+    @classmethod
+    def split_url(cls, url: str) -> tuple[str, str]:
+        """Split HTTP/HTTPS URL into domain (bucket equivalent) and path."""
+        parsed = urlparse(url)
+        domain = parsed.netloc
+        path = parsed.path.lstrip("/")
+
+        if parsed.query:
+            path += f"?{parsed.query}"
+        if parsed.fragment:
+            path += f"#{parsed.fragment}"
+
+        return domain, path
+
+    @classmethod
+    def get_uri(cls, name: str) -> "StorageURI":
+        if not name.startswith(("http://", "https://")):
+            return StorageURI(f"{cls.PREFIX}{name}")
+        return StorageURI(name)
+
+    @classmethod
+    def is_root_url(cls, url: str) -> bool:
+        parsed = urlparse(url)
+        return parsed.path in ("", "/") and not parsed.query and not parsed.fragment
+
+    def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
+        if self.name.startswith(("http://", "https://")):
+            base_url = self.name
+        else:
+            if rel_path and "/" in rel_path:
+                first_part = rel_path.split("/")[0]
+                if "." in first_part and not first_part.startswith("."):
+                    return f"{self.protocol}://{rel_path}"
+
+            base_url = f"{self.protocol}://{self.name}"
+
+        if rel_path:
+            if not base_url.endswith("/") and not rel_path.startswith("/"):
+                base_url += "/"
+            full_url = base_url + rel_path
+        else:
+            full_url = base_url
+
+        return full_url
+
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate URL for the given path.
+        Note: HTTP URLs don't support signed/expiring URLs.
+        """
+        return self.get_full_path(path, kwargs.pop("version_id", None))
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        etag = v.get("ETag", "").strip('"')
+        last_modified = v.get("last_modified")
+        if last_modified:
+            if isinstance(last_modified, str):
+                try:
+                    from email.utils import parsedate_to_datetime
+
+                    last_modified = parsedate_to_datetime(last_modified)
+                except (ValueError, TypeError):
+                    last_modified = datetime.now(timezone.utc)
+            elif isinstance(last_modified, (int, float)):
+                last_modified = datetime.fromtimestamp(last_modified, timezone.utc)
+        else:
+            last_modified = datetime.now(timezone.utc)
+
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", 0),
+            etag=etag,
+            version="",
+            is_latest=True,
+            last_modified=last_modified,
+        )
+
+    def upload(self, data: bytes, path: str) -> "File":
+        raise NotImplementedError(
+            "HTTP/HTTPS client is read-only. Upload operations are not supported."
+        )
+
+    def get_file_info(self, path: str, version_id: str | None = None) -> "File":
+        info = self.fs.info(self.get_full_path(path))
+        return self.info_to_file(info, path)
+
+    def open_object(self, file: "File", use_cache: bool = True, cb=None):
+        from datachain.client.fileslice import FileWrapper
+
+        if use_cache and (cache_path := self.cache.get_path(file)):
+            return open(cache_path, mode="rb")
+
+        assert not file.location
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.get_path_normalized())),
+            cb or (lambda x: None),
+        )
+
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
+        return await self.fs._get_file(lpath, rpath, callback=callback)
+
+    async def _fetch_dir(self, prefix: str, pbar, result_queue) -> set[str]:
+        full_url = self.get_full_path(prefix)
+        raise NotImplementedError(f"Cannot download file from {full_url}")
+
+
+class HTTPSClient(HTTPClient):
+    protocol = "https"
+    PREFIX = "https://"
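With this file in place, `Client.get_implementation` (see the fsspec.py hunks above) routes `http://` and `https://` URLs to these read-only clients. A quick sketch of the routing and of the host-as-bucket split:

```python
from datachain.client.fsspec import Client
from datachain.client.http import HTTPSClient

assert Client.get_implementation("https://example.com/data/file.csv") is HTTPSClient

# The host plays the role of the bucket; query and fragment stay in the path part.
print(HTTPSClient.split_url("https://example.com/data/file.csv?v=2"))
# ('example.com', 'data/file.csv?v=2')
```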
datachain/client/local.py
CHANGED
@@ -2,14 +2,14 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

 from datachain.lib.file import File

-from .fsspec import Client
+from .fsspec import Client, is_win_local_path

 if TYPE_CHECKING:
     from datachain.cache import Cache
@@ -57,9 +57,13 @@ class FileClient(Client):
         /home/user/animals/ -> file:///home/user/animals/
         C:\\windows\animals -> file:///C:/windows/animals
         """
+        parsed = urlparse(path)
+        if parsed.scheme and not is_win_local_path(path):
+            return path
+
         uri = Path(path).expanduser().absolute().resolve().as_uri()
-        if path[-1] == os.sep:
-            # Keep trailing slash
+        if path and path[-1] in (os.sep, "/"):
+            # keep trailing separator so directory URIs stay rooted
             uri += "/"  # in uri (file:///...) all separators are / regardless of os

         return uri
@@ -102,10 +106,10 @@ class FileClient(Client):
         info = self.fs.info(self.get_full_path(file.get_path_normalized()))
         return self.info_to_file(info, "").etag

-    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+    async def get_size(self, path: str, version_id: str | None = None) -> int:
         return self.fs.size(path)

-    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
         return self.fs.get_file(lpath, rpath, callback=callback)

     async def ls_dir(self, path):
@@ -114,7 +118,7 @@ class FileClient(Client):
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)

-    def get_full_path(self, rel_path, version_id: Optional[str] = None):
+    def get_full_path(self, rel_path, version_id: str | None = None):
         full_path = Path(self.name, rel_path).as_posix()
         if rel_path.endswith("/") or not rel_path:
             full_path += "/"
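The new early return makes `path_to_uri` idempotent: already-qualified URIs pass through, while plain paths are resolved and converted (assuming the method is exposed as a classmethod, as on the base `Client`):

```python
from datachain.client.local import FileClient

print(FileClient.path_to_uri("file:///home/user/animals/"))  # unchanged
print(FileClient.path_to_uri("/home/user/animals/"))         # file:///home/user/animals/
```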
datachain/client/s3.py
CHANGED
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import Any, Optional, cast
+from typing import Any, cast
 from urllib.parse import parse_qs, urlsplit, urlunsplit

 from botocore.exceptions import NoCredentialsError
@@ -148,7 +148,7 @@ class ClientS3(Client):
         )

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         parts = list(urlsplit(path))
         query = parse_qs(parts[3])
         if "versionId" in query:
@@ -187,7 +187,7 @@ class ClientS3(Client):
         return subdirs

     @staticmethod
-    def clean_s3_version(ver: Optional[str]) -> str:
+    def clean_s3_version(ver: str | None) -> str:
         return ver if (ver is not None and ver != "null") else ""

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
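`clean_s3_version` normalizes the `"null"` version ID that S3 reports for objects written before bucket versioning was enabled:

```python
from datachain.client.s3 import ClientS3

assert ClientS3.clean_s3_version("abc123") == "abc123"
assert ClientS3.clean_s3_version("null") == ""   # unversioned-era objects
assert ClientS3.clean_s3_version(None) == ""     # no version reported
```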
datachain/config.py
CHANGED
@@ -1,7 +1,6 @@
 from collections.abc import Mapping
 from contextlib import contextmanager
 from enum import Enum
-from typing import Optional, Union

 from tomlkit import TOMLDocument, dump, load

@@ -22,16 +21,13 @@ class Config:
     # In the order of precedence
     LEVELS = SYSTEM_LEVELS + LOCAL_LEVELS

-    def __init__(
-        self,
-        level: Optional[ConfigLevel] = None,
-    ):
+    def __init__(self, level: ConfigLevel | None = None):
         self.level = level

         self.init()

     @classmethod
-    def get_dir(cls, level: Optional[ConfigLevel]) -> str:
+    def get_dir(cls, level: ConfigLevel | None) -> str:
         if level == ConfigLevel.SYSTEM:
             return system_config_dir()
         if level == ConfigLevel.GLOBAL:
@@ -43,7 +39,7 @@ class Config:
         d = DataChainDir(self.get_dir(self.level))
         d.init()

-    def load_one(self, level: Optional[ConfigLevel] = None) -> TOMLDocument:
+    def load_one(self, level: ConfigLevel | None = None) -> TOMLDocument:
         config_path = DataChainDir(self.get_dir(level)).config

         try:
@@ -128,7 +124,7 @@ class Config:
         return remote_conf


-def merge(into: Union[TOMLDocument, dict], update: Union[TOMLDocument, dict]):
+def merge(into: TOMLDocument | dict, update: TOMLDocument | dict):
     """Merges second dict into first recursively"""
     for key, val in update.items():
         if isinstance(into.get(key), dict) and isinstance(val, dict):
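`merge` combines two config mappings: nested tables merge, scalar values are overwritten. A usage sketch (the keys are hypothetical, and this assumes the usual recursive behavior implied by the `isinstance` check above):

```python
from datachain.config import merge

into = {"studio": {"url": "https://studio.example.com", "team": "a"}}
update = {"studio": {"team": "b"}, "cache": {"dir": "/tmp/dc"}}
merge(into, update)
print(into)
# {'studio': {'url': 'https://studio.example.com', 'team': 'b'}, 'cache': {'dir': '/tmp/dc'}}
```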
datachain/data_storage/db_engine.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar

 import sqlalchemy as sa
 from sqlalchemy.sql import FROM_LINTING
@@ -58,7 +58,7 @@ class DatabaseEngine(ABC, Serializable):
     @classmethod
     def compile_to_args(
         cls, statement: "ClauseElement", **kwargs
-    ) -> Union[tuple[str], tuple[str, dict[str, Any]]]:
+    ) -> tuple[str] | tuple[str, dict[str, Any]]:
         """
         Compile a sqlalchemy query or ddl object to an args tuple.

@@ -75,8 +75,8 @@ class DatabaseEngine(ABC, Serializable):
     def execute(
         self,
         query,
-        cursor: Optional[Any] = None,
-        conn: Optional[Any] = None,
+        cursor: Any | None = None,
+        conn: Any | None = None,
     ) -> Iterator[tuple[Any, ...]]: ...

     def get_table(self, name: str) -> "Table":
@@ -90,7 +90,7 @@ class DatabaseEngine(ABC, Serializable):

     @abstractmethod
     def executemany(
-        self, query, params, cursor: Optional[Any] = None
+        self, query, params, cursor: Any | None = None
     ) -> Iterator[tuple[Any, ...]]: ...

     @abstractmethod
@@ -112,7 +112,13 @@ class DatabaseEngine(ABC, Serializable):
         return sa.inspect(self.engine).has_table(name)

     @abstractmethod
-    def create_table(self, table: "Table", if_not_exists: bool = True) -> None: ...
+    def create_table(
+        self,
+        table: "Table",
+        if_not_exists: bool = True,
+        *,
+        kind: str | None = None,
+    ) -> None: ...

     @abstractmethod
     def drop_table(self, table: "Table", if_exists: bool = False) -> None: ...
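Implementations must now accept a keyword-only `kind` hint on `create_table`; its accepted values are backend-specific and not visible in this diff. A minimal conforming sketch (the `MyEngine` class is illustrative and ignores `kind`; assumes SQLAlchemy 2.x):

```python
import sqlalchemy as sa

class MyEngine:
    def __init__(self, engine: sa.Engine) -> None:
        self.engine = engine

    def create_table(
        self,
        table: sa.Table,
        if_not_exists: bool = True,
        *,
        kind: str | None = None,  # new keyword-only hint; ignored in this sketch
    ) -> None:
        table.create(self.engine, checkfirst=if_not_exists)

engine = sa.create_engine("sqlite:///:memory:")
meta = sa.MetaData()
t = sa.Table("example", meta, sa.Column("id", sa.Integer, primary_key=True))
MyEngine(engine).create_table(t, kind=None)
```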
datachain/data_storage/job.py
CHANGED
@@ -4,6 +4,7 @@ from enum import Enum
 class JobStatus(int, Enum):
     CREATED = 1
     SCHEDULED = 10
+    PROVISIONING = 12
     QUEUED = 2
     INIT = 3
     RUNNING = 4
@@ -13,6 +14,7 @@ class JobStatus(int, Enum):
     CANCELED = 8
     CANCELING_SCHEDULED = 9
     TASK = 11
+    PENDING = 13

     @classmethod
     def finished(cls) -> tuple[int, ...]: