datachain 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -42,6 +42,7 @@ from datachain.dataset import (
    DatasetStats,
    DatasetStatus,
    RowDict,
+    StorageURI,
    create_dataset_uri,
    parse_dataset_uri,
)
@@ -58,7 +59,6 @@ from datachain.node import DirType, Node, NodeWithPath
from datachain.nodes_thread_pool import NodesThreadPool
from datachain.remote.studio import StudioClient
from datachain.sql.types import DateTime, SQLType, String
-from datachain.storage import StorageURI
from datachain.utils import (
    DataChainDir,
    batched,
@@ -1702,31 +1702,9 @@ class Catalog:
        *,
        client_config=None,
    ) -> None:
-        root_sources = [
-            src for src in sources if Client.get_implementation(src).is_root_url(src)
-        ]
-        non_root_sources = [
-            src
-            for src in sources
-            if not Client.get_implementation(src).is_root_url(src)
-        ]
-
-        client_config = client_config or self.client_config
-
-        # for root sources (e.g s3://) we are just getting all buckets and
-        # saving them as storages, without further indexing in each bucket
-        for source in root_sources:
-            for bucket in Client.get_implementation(source).ls_buckets(**client_config):
-                client = self.get_client(bucket.uri, **client_config)
-                print(f"Registering storage {client.uri}")
-                self.metastore.create_storage_if_not_registered(client.uri)
-
        self.enlist_sources(
-            non_root_sources,
+            sources,
            update,
-            client_config=client_config,
+            client_config=client_config or self.client_config,
            only_index=True,
        )
-
-    def find_stale_storages(self) -> None:
-        self.metastore.find_stale_storages()
datachain/cli.py CHANGED
@@ -568,12 +568,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
    )
    add_sources_arg(parse_index)

-    subp.add_parser(
-        "find-stale-storages",
-        parents=[parent_parser],
-        description="Finds and marks stale storages",
-    )
-
    show_parser = subp.add_parser(
        "show",
        parents=[parent_parser],
@@ -1100,8 +1094,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
        )
    elif args.command == "completion":
        print(completion(args.shell))
-    elif args.command == "find-stale-storages":
-        catalog.find_stale_storages()
    elif args.command == "query":
        query(
            catalog,
@@ -31,11 +31,12 @@ from datachain.error import ClientError as DataChainClientError
from datachain.lib.file import File
from datachain.nodes_fetcher import NodesFetcher
from datachain.nodes_thread_pool import NodeChunk
-from datachain.storage import StorageURI

if TYPE_CHECKING:
    from fsspec.spec import AbstractFileSystem

+    from datachain.dataset import StorageURI
+

logger = logging.getLogger("datachain")

@@ -63,7 +64,7 @@ def _is_win_local_path(uri: str) -> bool:

class Bucket(NamedTuple):
    name: str
-    uri: StorageURI
+    uri: "StorageURI"
    created: Optional[datetime]


@@ -115,7 +116,7 @@ class Client(ABC):
        return DATA_SOURCE_URI_PATTERN.match(name) is not None

    @staticmethod
-    def parse_url(source: str) -> tuple[StorageURI, str]:
+    def parse_url(source: str) -> tuple["StorageURI", str]:
        cls = Client.get_implementation(source)
        storage_name, rel_path = cls.split_url(source)
        return cls.get_uri(storage_name), rel_path
@@ -148,7 +149,7 @@ class Client(ABC):
    @classmethod
    def from_source(
        cls,
-        uri: StorageURI,
+        uri: "StorageURI",
        cache: DataChainCache,
        **kwargs,
    ) -> "Client":
@@ -156,6 +157,8 @@ class Client(ABC):

    @classmethod
    def ls_buckets(cls, **kwargs) -> Iterator[Bucket]:
+        from datachain.dataset import StorageURI
+
        for entry in cls.create_fs(**kwargs).ls(cls.PREFIX, detail=True):
            name = entry["name"].rstrip("/")
            yield Bucket(
@@ -169,7 +172,9 @@ class Client(ABC):
        return url == cls.PREFIX

    @classmethod
-    def get_uri(cls, name) -> StorageURI:
+    def get_uri(cls, name) -> "StorageURI":
+        from datachain.dataset import StorageURI
+
        return StorageURI(f"{cls.PREFIX}{name}")

    @classmethod
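Note: the client hunks above replace the old module-level `from datachain.storage import StorageURI` with a `TYPE_CHECKING`-only import (for annotations) plus local imports inside the methods that construct a `StorageURI` at runtime, avoiding a module-load-time cycle with `datachain.dataset` (which, in the dataset.py hunks further below, now imports `Client` lazily for the same reason). A condensed sketch of the pattern as it appears in these hunks (the class body is trimmed down for illustration):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by type checkers; never executed at runtime,
        # so it cannot trigger an import cycle.
        from datachain.dataset import StorageURI


    class ExampleClient:
        PREFIX = "s3://"

        @classmethod
        def get_uri(cls, name: str) -> "StorageURI":
            # Deferred runtime import: by the time this runs, both modules
            # have finished initializing.
            from datachain.dataset import StorageURI

            return StorageURI(f"{cls.PREFIX}{name}")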
datachain/client/hf.py CHANGED
@@ -23,6 +23,7 @@ class HfClient(Client):

    def info_to_file(self, v: dict[str, Any], path: str) -> File:
        return File(
+            source=self.uri,
            path=path,
            size=v["size"],
            version=v["last_commit"].oid,
datachain/client/local.py CHANGED
@@ -2,16 +2,18 @@ import os
import posixpath
from datetime import datetime, timezone
from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse

from fsspec.implementations.local import LocalFileSystem

from datachain.lib.file import File
-from datachain.storage import StorageURI

from .fsspec import Client

+if TYPE_CHECKING:
+    from datachain.dataset import StorageURI
+

class FileClient(Client):
    FS_CLASS = LocalFileSystem
@@ -28,7 +30,9 @@ class FileClient(Client):
        raise TypeError("Signed urls are not implemented for local file system")

    @classmethod
-    def get_uri(cls, name) -> StorageURI:
+    def get_uri(cls, name) -> "StorageURI":
+        from datachain.dataset import StorageURI
+
        return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')

    @classmethod
@@ -1,9 +1,7 @@
import copy
-import hashlib
import json
import logging
import os
-import posixpath
from abc import ABC, abstractmethod
from collections.abc import Iterator
from datetime import datetime, timezone
@@ -24,7 +22,6 @@ from sqlalchemy import (
    UniqueConstraint,
    select,
)
-from sqlalchemy.sql import func

from datachain.data_storage import JobQueryType, JobStatus
from datachain.data_storage.serializer import Serializable
@@ -33,15 +30,14 @@ from datachain.dataset import (
    DatasetRecord,
    DatasetStatus,
    DatasetVersion,
+    StorageURI,
)
from datachain.error import (
    DatasetNotFoundError,
-    StorageNotFoundError,
    TableMissingError,
)
from datachain.job import Job
-from datachain.storage import Storage, StorageStatus, StorageURI
-from datachain.utils import JSONSerialize, is_expired
+from datachain.utils import JSONSerialize

if TYPE_CHECKING:
    from sqlalchemy import Delete, Insert, Select, Update
@@ -60,21 +56,17 @@ class AbstractMetastore(ABC, Serializable):
    """

    uri: StorageURI
-    partial_id: Optional[int]

    schema: "schema.Schema"
-    storage_class: type[Storage] = Storage
    dataset_class: type[DatasetRecord] = DatasetRecord
    dependency_class: type[DatasetDependency] = DatasetDependency
    job_class: type[Job] = Job

    def __init__(
        self,
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
    ):
-        self.uri = uri
-        self.partial_id: Optional[int] = partial_id
+        self.uri = uri or StorageURI("")

    def __enter__(self) -> "AbstractMetastore":
        """Returns self upon entering context manager."""
@@ -86,8 +78,7 @@ class AbstractMetastore(ABC, Serializable):
    @abstractmethod
    def clone(
        self,
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
        use_new_connection: bool = False,
    ) -> "AbstractMetastore":
        """Clones AbstractMetastore implementation for some Storage input.
@@ -95,10 +86,6 @@ class AbstractMetastore(ABC, Serializable):
        New connections should only be used if needed due to errors with
        closed connections."""

-    @abstractmethod
-    def init(self, uri: StorageURI) -> None:
-        """Initialize partials table for given storage uri."""
-
    def close(self) -> None:
        """Closes any active database or HTTP connections."""

@@ -114,96 +101,6 @@ class AbstractMetastore(ABC, Serializable):
    def cleanup_for_tests(self) -> None:
        """Cleanup for tests."""

-    #
-    # Storages
-    #
-
-    @abstractmethod
-    def create_storage_if_not_registered(self, uri: StorageURI) -> None:
-        """Saves new storage if it doesn't exist in database."""
-
-    @abstractmethod
-    def register_storage_for_indexing(
-        self,
-        uri: StorageURI,
-        force_update: bool = True,
-        prefix: str = "",
-    ) -> tuple[Storage, bool, bool, Optional[int], Optional[str]]:
-        """
-        Prepares storage for indexing operation.
-        This method should be called before index operation is started
-        It returns:
-            - storage, prepared for indexing
-            - boolean saying if indexing is needed
-            - boolean saying if indexing is currently pending (running)
-            - partial id
-            - partial path
-        """
-
-    @abstractmethod
-    def find_stale_storages(self) -> None:
-        """
-        Finds all pending storages for which the last inserted node has happened
-        before STALE_MINUTES_LIMIT minutes, and marks it as STALE.
-        """
-
-    @abstractmethod
-    def mark_storage_indexed(
-        self,
-        uri: StorageURI,
-        status: int,
-        ttl: int,
-        end_time: Optional[datetime] = None,
-        prefix: str = "",
-        partial_id: int = 0,
-        error_message: str = "",
-        error_stack: str = "",
-        dataset: Optional[DatasetRecord] = None,
-    ) -> None:
-        """
-        Marks storage as indexed.
-        This method should be called when index operation is finished.
-        """
-
-    @abstractmethod
-    def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
-        """Updates last inserted datetime in bucket with current time."""
-
-    @abstractmethod
-    def get_storage(self, uri: StorageURI) -> Storage:
-        """
-        Gets storage representation from database.
-        E.g. if s3 is used as storage this would be s3 bucket data.
-        """
-
-    @abstractmethod
-    def mark_storage_pending(self, storage: Storage) -> Storage:
-        """Marks storage as pending."""
-
-    #
-    # Partial Indexes
-    #
-
-    @abstractmethod
-    def init_partial_id(self, uri: StorageURI) -> None:
-        """Initializes partial id for given storage."""
-
-    @abstractmethod
-    def get_next_partial_id(self, uri: StorageURI) -> int:
-        """Returns next partial id for given storage."""
-
-    @abstractmethod
-    def get_valid_partial_id(
-        self, uri: StorageURI, prefix: str, raise_exc: bool = True
-    ) -> tuple[Optional[int], Optional[str]]:
-        """
-        Returns valid partial id and it's path, if they exist, for a given storage.
-        """
-
-    @abstractmethod
-    def get_last_partial_path(self, uri: StorageURI) -> Optional[str]:
-        """Returns last partial path for given storage."""
-
    #
    # Datasets
    #
@@ -397,8 +294,6 @@ class AbstractDBMetastore(AbstractMetastore):
    and has shared logic for all database systems currently in use.
    """

-    PARTIALS_TABLE_NAME_PREFIX = "prt_"
-    STORAGE_TABLE = "buckets"
    DATASET_TABLE = "datasets"
    DATASET_VERSION_TABLE = "datasets_versions"
    DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
@@ -410,15 +305,11 @@ class AbstractDBMetastore(AbstractMetastore):
    def __init__(
        self,
        id_generator: "AbstractIDGenerator",
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
    ):
+        uri = uri or StorageURI("")
        self.id_generator = id_generator
-        super().__init__(uri, partial_id)
-
-    @abstractmethod
-    def init(self, uri: StorageURI) -> None:
-        """Initialize partials table for given storage uri."""
+        super().__init__(uri)

    def close(self) -> None:
        """Closes any active database connections."""
@@ -428,21 +319,6 @@ class AbstractDBMetastore(AbstractMetastore):
        """Cleanup temp tables."""
        self.id_generator.delete_uris(temp_table_names)

-    @classmethod
-    def _buckets_columns(cls) -> list["SchemaItem"]:
-        """Buckets (storages) table columns."""
-        return [
-            Column("id", Integer, primary_key=True, nullable=False),
-            Column("uri", Text, nullable=False),
-            Column("timestamp", DateTime(timezone=True)),
-            Column("expires", DateTime(timezone=True)),
-            Column("started_inserting_at", DateTime(timezone=True)),
-            Column("last_inserted_at", DateTime(timezone=True)),
-            Column("status", Integer, nullable=False),
-            Column("error_message", Text, nullable=False, default=""),
-            Column("error_stack", Text, nullable=False, default=""),
-        ]
-
    @classmethod
    def _datasets_columns(cls) -> list["SchemaItem"]:
        """Datasets table columns."""
@@ -543,58 +419,11 @@ class AbstractDBMetastore(AbstractMetastore):
                ForeignKey(f"{cls.DATASET_VERSION_TABLE}.id"),
                nullable=True,
            ),
-            # TODO remove when https://github.com/iterative/dvcx/issues/1121 is done
-            # If we unify datasets and bucket listing then both bucket fields won't
-            # be needed
-            Column(
-                "bucket_id",
-                Integer,
-                ForeignKey(f"{cls.STORAGE_TABLE}.id"),
-                nullable=True,
-            ),
-            Column("bucket_version", Text, nullable=True),
-        ]
-
-    @classmethod
-    def _storage_partial_columns(cls) -> list["SchemaItem"]:
-        """Storage partial table columns."""
-        return [
-            Column("path_str", Text, nullable=False),
-            # This is generated before insert and is not the SQLite rowid,
-            # so it is not the primary key.
-            Column("partial_id", Integer, nullable=False, index=True),
-            Column("timestamp", DateTime(timezone=True)),
-            Column("expires", DateTime(timezone=True)),
        ]

-    def _get_storage_partial_table(self, name: str) -> Table:
-        table = self.db.metadata.tables.get(name)
-        if table is None:
-            table = Table(
-                name,
-                self.db.metadata,
-                *self._storage_partial_columns(),
-            )
-        return table
-
    #
    # Query Tables
    #
-
-    def _partials_table(self, uri: StorageURI) -> Table:
-        return self._get_storage_partial_table(self._partials_table_name(uri))
-
-    @cached_property
-    def _storages(self) -> Table:
-        return Table(self.STORAGE_TABLE, self.db.metadata, *self._buckets_columns())
-
-    @cached_property
-    def _partials(self) -> Table:
-        assert (
-            self._current_partials_table_name
-        ), "Partials can only be used if uri/current_partials_table_name is set"
-        return self._get_storage_partial_table(self._current_partials_table_name)
-
    @cached_property
    def _datasets(self) -> Table:
        return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
@@ -618,32 +447,6 @@ class AbstractDBMetastore(AbstractMetastore):
    #
    # Query Starters (These can be overridden by subclasses)
    #
-
-    @abstractmethod
-    def _storages_insert(self) -> "Insert": ...
-
-    def _storages_select(self, *columns) -> "Select":
-        if not columns:
-            return self._storages.select()
-        return select(*columns)
-
-    def _storages_update(self) -> "Update":
-        return self._storages.update()
-
-    def _storages_delete(self) -> "Delete":
-        return self._storages.delete()
-
-    @abstractmethod
-    def _partials_insert(self) -> "Insert": ...
-
-    def _partials_select(self, *columns) -> "Select":
-        if not columns:
-            return self._partials.select()
-        return select(*columns)
-
-    def _partials_update(self) -> "Update":
-        return self._partials.update()
-
    @abstractmethod
    def _datasets_insert(self) -> "Insert": ...

@@ -686,275 +489,6 @@ class AbstractDBMetastore(AbstractMetastore):
    def _datasets_dependencies_delete(self) -> "Delete":
        return self._datasets_dependencies.delete()

-    #
-    # Table Name Internal Functions
-    #
-
-    def _partials_table_name(self, uri: StorageURI) -> str:
-        sha = hashlib.sha256(uri.encode("utf-8")).hexdigest()[:12]
-        return f"{self.PARTIALS_TABLE_NAME_PREFIX}_{sha}"
-
-    @property
-    def _current_partials_table_name(self) -> Optional[str]:
-        if not self.uri:
-            return None
-        return self._partials_table_name(self.uri)
-
-    #
-    # Storages
-    #
-
-    def create_storage_if_not_registered(self, uri: StorageURI, conn=None) -> None:
-        """Saves new storage if it doesn't exist in database."""
-        query = self._storages_insert().values(
-            uri=uri,
-            status=StorageStatus.CREATED,
-            error_message="",
-            error_stack="",
-        )
-        if hasattr(query, "on_conflict_do_nothing"):
-            # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
-            # but generic SQL does not
-            query = query.on_conflict_do_nothing()
-        self.db.execute(query, conn=conn)
-
-    def register_storage_for_indexing(
-        self,
-        uri: StorageURI,
-        force_update: bool = True,
-        prefix: str = "",
-    ) -> tuple[Storage, bool, bool, Optional[int], Optional[str]]:
-        """
-        Prepares storage for indexing operation.
-        This method should be called before index operation is started
-        It returns:
-            - storage, prepared for indexing
-            - boolean saying if indexing is needed
-            - boolean saying if indexing is currently pending (running)
-            - partial id
-            - partial path
-        """
-        # This ensures that all calls to the DB are in a single transaction
-        # and commit is automatically called once this function returns
-        with self.db.transaction() as conn:
-            # Create storage if it doesn't exist
-            self.create_storage_if_not_registered(uri, conn=conn)
-            storage = self.get_storage(uri, conn=conn)
-
-            if storage.status == StorageStatus.PENDING:
-                return storage, False, True, None, None
-
-            if storage.is_expired or storage.status == StorageStatus.STALE:
-                storage = self.mark_storage_pending(storage, conn=conn)
-                return storage, True, False, None, None
-
-            if (
-                storage.status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE)
-                and not force_update
-            ):
-                partial_id, partial_path = self.get_valid_partial_id(
-                    uri, prefix, raise_exc=False
-                )
-                if partial_id is not None:
-                    return storage, False, False, partial_id, partial_path
-                return storage, True, False, None, None
-
-            storage = self.mark_storage_pending(storage, conn=conn)
-            return storage, True, False, None, None
-
-    def find_stale_storages(self) -> None:
-        """
-        Finds all pending storages for which the last inserted node has happened
-        before STALE_MINUTES_LIMIT minutes, and marks it as STALE.
-        """
-        s = self._storages
-        with self.db.transaction() as conn:
-            pending_storages = map(
-                self.storage_class._make,
-                self.db.execute(
-                    self._storages_select().where(s.c.status == StorageStatus.PENDING),
-                    conn=conn,
-                ),
-            )
-            for storage in pending_storages:
-                if storage.is_stale:
-                    print(f"Marking storage {storage.uri} as stale")
-                    self._mark_storage_stale(storage.id, conn=conn)
-
-    def mark_storage_indexed(
-        self,
-        uri: StorageURI,
-        status: int,
-        ttl: int,
-        end_time: Optional[datetime] = None,
-        prefix: str = "",
-        partial_id: int = 0,
-        error_message: str = "",
-        error_stack: str = "",
-        dataset: Optional[DatasetRecord] = None,
-    ) -> None:
-        """
-        Marks storage as indexed.
-        This method should be called when index operation is finished.
-        """
-        if status == StorageStatus.PARTIAL and not prefix:
-            raise AssertionError("Partial indexing requires a prefix")
-
-        if end_time is None:
-            end_time = datetime.now(timezone.utc)
-        expires = Storage.get_expiration_time(end_time, ttl)
-
-        s = self._storages
-        with self.db.transaction() as conn:
-            self.db.execute(
-                self._storages_update()
-                .where(s.c.uri == uri)
-                .values(  # type: ignore [attr-defined]
-                    timestamp=end_time,
-                    expires=expires,
-                    status=status,
-                    last_inserted_at=end_time,
-                    error_message=error_message,
-                    error_stack=error_stack,
-                ),
-                conn=conn,
-            )
-
-            if not self._current_partials_table_name:
-                # This only occurs in tests
-                return
-
-            if status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE):
-                dir_prefix = posixpath.join(prefix, "")
-                self.db.execute(
-                    self._partials_insert().values(
-                        path_str=dir_prefix,
-                        timestamp=end_time,
-                        expires=expires,
-                        partial_id=partial_id,
-                    ),
-                    conn=conn,
-                )
-
-            # update underlying dataset status as well
-            if status == StorageStatus.FAILED and dataset:
-                self.update_dataset_status(
-                    dataset,
-                    DatasetStatus.FAILED,
-                    dataset.latest_version,
-                    error_message=error_message,
-                    error_stack=error_stack,
-                    conn=conn,
-                )
-
-            if status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE) and dataset:
-                self.update_dataset_status(
-                    dataset, DatasetStatus.COMPLETE, dataset.latest_version, conn=conn
-                )
-
-    def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
-        """Updates last inserted datetime in bucket with current time"""
-        uri = uri or self.uri
-        updates = {"last_inserted_at": datetime.now(timezone.utc)}
-        s = self._storages
-        self.db.execute(
-            self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
-        )
-
-    def get_storage(self, uri: StorageURI, conn=None) -> Storage:
-        """
-        Gets storage representation from database.
-        E.g. if s3 is used as storage this would be s3 bucket data
-        """
-        s = self._storages
-        result = next(
-            self.db.execute(self._storages_select().where(s.c.uri == uri), conn=conn),
-            None,
-        )
-        if not result:
-            raise StorageNotFoundError(f"Storage {uri} not found.")
-
-        return self.storage_class._make(result)
-
-    def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
-        # Update status to pending and dates
-        updates = {
-            "status": StorageStatus.PENDING,
-            "timestamp": None,
-            "expires": None,
-            "last_inserted_at": None,
-            "started_inserting_at": datetime.now(timezone.utc),
-        }
-        storage = storage._replace(**updates)  # type: ignore [arg-type]
-        s = self._storages
-        self.db.execute(
-            self._storages_update().where(s.c.uri == storage.uri).values(**updates),  # type: ignore [attr-defined]
-            conn=conn,
-        )
-        return storage
-
-    def _mark_storage_stale(self, storage_id: int, conn=None) -> None:
-        # Update status to pending and dates
-        updates = {"status": StorageStatus.STALE, "timestamp": None, "expires": None}
-        s = self._storages
-        self.db.execute(
-            self._storages.update().where(s.c.id == storage_id).values(**updates),  # type: ignore [attr-defined]
-            conn=conn,
-        )
-
-    #
-    # Partial Indexes
-    #
-
-    def init_partial_id(self, uri: StorageURI) -> None:
-        """Initializes partial id for given storage."""
-        if not uri:
-            raise ValueError("uri for get_next_partial_id() cannot be empty")
-        self.id_generator.init_id(f"partials:{uri}")
-
-    def get_next_partial_id(self, uri: StorageURI) -> int:
-        """Returns next partial id for given storage."""
-        if not uri:
-            raise ValueError("uri for get_next_partial_id() cannot be empty")
-        return self.id_generator.get_next_id(f"partials:{uri}")
-
-    def get_valid_partial_id(
-        self, uri: StorageURI, prefix: str, raise_exc: bool = True
-    ) -> tuple[Optional[int], Optional[str]]:
-        """
-        Returns valid partial id and it's path, if they exist, for a given storage.
-        """
-        # This SQL statement finds all entries that are
-        # prefixes of the given prefix, matching this or parent directories
-        # that are indexed.
-        dir_prefix = posixpath.join(prefix, "")
-        p = self._partials_table(uri)
-        expire_values = self.db.execute(
-            select(p.c.expires, p.c.partial_id, p.c.path_str)
-            .where(
-                p.c.path_str == func.substr(dir_prefix, 1, func.length(p.c.path_str))
-            )
-            .order_by(p.c.expires.desc())
-        )
-        for expires, partial_id, path_str in expire_values:
-            if not is_expired(expires):
-                return partial_id, path_str
-        if raise_exc:
-            raise RuntimeError(f"Unable to get valid partial_id: {uri=}, {prefix=}")
-        return None, None
-
-    def get_last_partial_path(self, uri: StorageURI) -> Optional[str]:
-        """Returns last partial path for given storage."""
-        p = self._partials_table(uri)
-        if not self.db.has_table(p.name):
-            raise StorageNotFoundError(f"Storage {uri} partials are not found.")
-        last_partial = self.db.execute(
-            select(p.c.path_str).order_by(p.c.timestamp.desc()).limit(1)
-        )
-        for (path_str,) in last_partial:
-            return path_str
-        return None
-
    #
    # Datasets
    #
@@ -1298,7 +832,6 @@ class AbstractDBMetastore(AbstractMetastore):
        d = self._datasets
        dd = self._datasets_dependencies
        dv = self._datasets_versions
-        s = self._storages

        dataset_version = dataset.get_version(version)

@@ -1307,9 +840,9 @@ class AbstractDBMetastore(AbstractMetastore):
        query = (
            self._datasets_dependencies_select(*select_cols)
            .select_from(
-                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
-                .join(s, dd.c.bucket_id == s.c.id, isouter=True)
-                .join(dv, dd.c.dataset_version_id == dv.c.id, isouter=True)
+                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True).join(
+                    dv, dd.c.dataset_version_id == dv.c.id, isouter=True
+                )
            )
            .where(
                (dd.c.source_dataset_id == dataset.id)
@@ -29,12 +29,11 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
from datachain.data_storage.db_engine import DatabaseEngine
from datachain.data_storage.id_generator import AbstractDBIDGenerator
from datachain.data_storage.schema import DefaultSchema
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, StorageURI
from datachain.error import DataChainError
from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
from datachain.sql.sqlite.base import load_usearch_extension
from datachain.sql.types import SQLType
-from datachain.storage import StorageURI
from datachain.utils import DataChainDir, batched_it

if TYPE_CHECKING:
@@ -392,14 +391,14 @@ class SQLiteMetastore(AbstractDBMetastore):
    def __init__(
        self,
        id_generator: "SQLiteIDGenerator",
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
        db: Optional["SQLiteDatabaseEngine"] = None,
        db_file: Optional[str] = None,
        in_memory: bool = False,
    ):
+        uri = uri or StorageURI("")
        self.schema: DefaultSchema = DefaultSchema()
-        super().__init__(id_generator, uri, partial_id)
+        super().__init__(id_generator, uri)

        # needed for dropping tables in correct order for tests because of
        # foreign keys
@@ -417,21 +416,16 @@ class SQLiteMetastore(AbstractDBMetastore):

    def clone(
        self,
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
        use_new_connection: bool = False,
    ) -> "SQLiteMetastore":
-        if not uri:
-            if partial_id is not None:
-                raise ValueError("if partial_id is used, uri cannot be empty")
-            if self.uri:
-                uri = self.uri
-            if self.partial_id:
-                partial_id = self.partial_id
+        uri = uri or StorageURI("")
+        if not uri and self.uri:
+            uri = self.uri
+
        return SQLiteMetastore(
            self.id_generator.clone(),
            uri=uri,
-            partial_id=partial_id,
            db=self.db.clone(),
        )

@@ -446,7 +440,6 @@ class SQLiteMetastore(AbstractDBMetastore):
            {
                "id_generator_clone_params": self.id_generator.clone_params(),
                "uri": self.uri,
-                "partial_id": self.partial_id,
                "db_clone_params": self.db.clone_params(),
            },
        )
@@ -457,7 +450,6 @@ class SQLiteMetastore(AbstractDBMetastore):
        *,
        id_generator_clone_params: tuple[Callable, list, dict[str, Any]],
        uri: StorageURI,
-        partial_id: Optional[int],
        db_clone_params: tuple[Callable, list, dict[str, Any]],
    ) -> "SQLiteMetastore":
        (
@@ -469,14 +461,11 @@ class SQLiteMetastore(AbstractDBMetastore):
        return cls(
            id_generator=id_generator_class(*id_generator_args, **id_generator_kwargs),
            uri=uri,
-            partial_id=partial_id,
            db=db_class(*db_args, **db_kwargs),
        )

    def _init_tables(self) -> None:
        """Initialize tables."""
-        self.db.create_table(self._storages, if_not_exists=True)
-        self.default_table_names.append(self._storages.name)
        self.db.create_table(self._datasets, if_not_exists=True)
        self.default_table_names.append(self._datasets.name)
        self.db.create_table(self._datasets_versions, if_not_exists=True)
@@ -486,28 +475,11 @@ class SQLiteMetastore(AbstractDBMetastore):
        self.db.create_table(self._jobs, if_not_exists=True)
        self.default_table_names.append(self._jobs.name)

-    def init(self, uri: StorageURI) -> None:
-        if not uri:
-            raise ValueError("uri for init() cannot be empty")
-        partials_table = self._partials_table(uri)
-        self.db.create_table(partials_table, if_not_exists=True)
-
-    @classmethod
-    def _buckets_columns(cls) -> list["SchemaItem"]:
-        """Buckets (storages) table columns."""
-        return [*super()._buckets_columns(), UniqueConstraint("uri")]
-
    @classmethod
    def _datasets_columns(cls) -> list["SchemaItem"]:
        """Datasets table columns."""
        return [*super()._datasets_columns(), UniqueConstraint("name")]

-    def _storages_insert(self) -> "Insert":
-        return sqlite.insert(self._storages)
-
-    def _partials_insert(self) -> "Insert":
-        return sqlite.insert(self._partials)
-
    def _datasets_insert(self) -> "Insert":
        return sqlite.insert(self._datasets)

@@ -526,13 +498,9 @@ class SQLiteMetastore(AbstractDBMetastore):
            self._datasets_dependencies.c.id,
            self._datasets_dependencies.c.dataset_id,
            self._datasets_dependencies.c.dataset_version_id,
-            self._datasets_dependencies.c.bucket_id,
-            self._datasets_dependencies.c.bucket_version,
            self._datasets.c.name,
-            self._datasets.c.created_at,
            self._datasets_versions.c.version,
            self._datasets_versions.c.created_at,
-            self._storages.c.uri,
        ]

    #
@@ -19,11 +19,10 @@ from tqdm import tqdm
from datachain.client import Client
from datachain.data_storage.schema import convert_rows_custom_column_types
from datachain.data_storage.serializer import Serializable
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, StorageURI
from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
from datachain.sql.functions import path as pathfunc
from datachain.sql.types import Int, SQLType
-from datachain.storage import StorageURI
from datachain.utils import sql_escape_like

if TYPE_CHECKING:
datachain/dataset.py CHANGED
@@ -3,21 +3,17 @@ import json
from dataclasses import dataclass, fields
from datetime import datetime
from typing import (
-    TYPE_CHECKING,
    Any,
+    NewType,
    Optional,
    TypeVar,
    Union,
)
from urllib.parse import urlparse

-from datachain.client import Client
from datachain.error import DatasetVersionNotFoundError
from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

-if TYPE_CHECKING:
-    from datachain.storage import StorageURI
-
T = TypeVar("T", bound="DatasetRecord")
V = TypeVar("V", bound="DatasetVersion")
DD = TypeVar("DD", bound="DatasetDependency")
@@ -27,6 +23,13 @@ QUERY_DATASET_PREFIX = "ds_query_"
LISTING_PREFIX = "lst__"


+# StorageURI represents a normalised URI to a valid storage location (full bucket or
+# absolute local path).
+# Valid examples: s3://foo, file:///var/data
+# Invalid examples: s3://foo/, s3://foo/bar, file://~
+StorageURI = NewType("StorageURI", str)
+
+
def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
    """
    Parse dataser uri to extract name and version out of it (if version is defined)
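Because `StorageURI` is a `typing.NewType` over `str`, it is a plain string at runtime while letting static type checkers tell normalised storage roots apart from arbitrary strings. A small illustrative sketch (not datachain code):

    from typing import NewType

    StorageURI = NewType("StorageURI", str)


    def bucket_name(uri: StorageURI) -> str:
        # At runtime StorageURI("s3://foo") is just the str "s3://foo".
        return uri.split("://", 1)[1]


    bucket_name(StorageURI("s3://foo"))  # accepted by a type checker
    bucket_name("s3://foo")              # flagged: plain str is not StorageURI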
@@ -94,14 +97,11 @@ class DatasetDependency:
        id: int,
        dataset_id: Optional[int],
        dataset_version_id: Optional[int],
-        bucket_id: Optional[int],
-        bucket_version: Optional[str],
        dataset_name: Optional[str],
-        dataset_created_at: Optional[datetime],
        dataset_version: Optional[int],
        dataset_version_created_at: Optional[datetime],
-        bucket_uri: Optional["StorageURI"],
    ) -> Optional["DatasetDependency"]:
+        from datachain.client import Client
        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name

        if not dataset_id:
@@ -124,7 +124,7 @@ class DatasetDependency:
                if dataset_version
                else None
            ),
-            dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
+            dataset_version_created_at,  # type: ignore[arg-type]
            [],
        )

@@ -448,6 +448,8 @@ class DatasetRecord:
        For bucket listing we implicitly create underlying dataset to hold data. This
        method is checking if this is one of those datasets.
        """
+        from datachain.client import Client
+
        # TODO refactor and maybe remove method in
        # https://github.com/iterative/datachain/issues/318
        return Client.is_data_source_uri(self.name) or self.name.startswith(
datachain/error.py CHANGED
@@ -18,10 +18,6 @@ class DatasetInvalidVersionError(Exception):
    pass


-class StorageNotFoundError(NotFoundError):
-    pass
-
-
class PendingIndexingError(Exception):
    """An indexing operation is already in progress."""

datachain/lib/arrow.py CHANGED
@@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
        return dict
    if isinstance(col_type, pa.lib.DictionaryType):
        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
-    raise TypeError(f"{col_type!r} datatypes not supported")
+    raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")


def _nrows_file(file: File, nrows: int) -> str:
datachain/node.py CHANGED
@@ -3,8 +3,8 @@ from typing import TYPE_CHECKING, Any, Optional

import attrs

+from datachain.dataset import StorageURI
from datachain.lib.file import File
-from datachain.storage import StorageURI
from datachain.utils import TIME_ZERO, time_to_str

if TYPE_CHECKING:
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: datachain
-Version: 0.6.3
+Version: 0.6.5
Summary: Wrangle unstructured AI data at scale
Author-email: Dmitry Petrov <support@dvc.org>
License: Apache-2.0
@@ -2,45 +2,44 @@ datachain/__init__.py,sha256=OGzc8xZWtwqxiiutjU4AxCRPY0lrX_csgERiTrq4G0o,908
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=EM6jlc9zunOJQi7-GwCyVtlumHmLM8NwN9Y6jqVGzyY,33769
+datachain/cli.py,sha256=Wl-xMpTRgrkg4drX5I_QxAB1IATyULHCXOdx_wfoLVg,33529
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=w7qqJP7xYrm9CmBSmSezSxUQHZDsHKkwviF8AYUob7o,14671
-datachain/error.py,sha256=vbIbamnFMIojh1UpmxWoA6Omup7WFAFNJnf8xAkGWwI,1146
+datachain/dataset.py,sha256=lLUbUbJP1TYL9Obkc0f2IDziGcDylZge9ORQjK-WtXs,14717
+datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
datachain/listing.py,sha256=AV23WZq-k6e2zeeNBhVQP1-2PrwNCYidO0HBDKzpVaA,7152
-datachain/node.py,sha256=ThE6Ue4BqpaBvrkFFJW_ljLxchixUX2aWz3l_nbwY54,5195
+datachain/node.py,sha256=i7_jC8VcW6W5VYkDszAOu0H-rNBuqXB4UnLEh4wFzjc,5195
datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
datachain/studio.py,sha256=d-jUsYpfI1LEv3g8KU-lLchVgb9L0TXvlHakieFud_E,3788
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=PvJ-BRoSuI_FRCrXJ6tjMhYZD6L8Beq-ynrdPYRrwiw,58270
+datachain/catalog/catalog.py,sha256=qFlRrR01_9h1MjK6DEgVSgIwbtZEGV_SdG_E5qUsHmM,57352
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=sB98CO7covhmFZg36hsnyv9UwUI8J94AD1QWgGdcBlY,12595
+datachain/client/fsspec.py,sha256=C6C5AO6ndkgcoUxCRN9_8fUzqX2cRWJWG6FL6oD9X_Q,12708
datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
-datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
-datachain/client/local.py,sha256=Uaf_y_UGspOgprDysUTI9wDo334MLjGPUudqVtvef0c,4367
+datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
+datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=HfCxk4lmDUg2Q4WsFNQGMWxllP0mToA00fxkFTwdNIE,52919
+datachain/data_storage/metastore.py,sha256=-TJCqG70VofSVOh2yEez4dwjHS3eQL8p7d9uO3WTVwM,35878
datachain/data_storage/schema.py,sha256=CiRXrDYp5ZZopSyUgZ7MT2ml_6YvqSTYXdybatcbX9M,9849
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=jopfVftng157TVcBKMB_QPlbkE6fTatiY4GYSSLNkig,28737
-datachain/data_storage/warehouse.py,sha256=iIjFOutYxhLev3CcUhUTwMJOkHeAEBwXZ2y3wmjrF1s,30756
+datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
+datachain/data_storage/warehouse.py,sha256=xwMaR4jBpR13vjG3zrhphH4z2_CFLNj0KPF0LJCXCJ8,30727
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=M6SM4u2LeHgylzkPZBWckFeZt3CH3ehpBod3nGl6OYY,9138
+datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
@@ -101,9 +100,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.6.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.6.3.dist-info/METADATA,sha256=BnPIINjkfA0P2Sj9mRziNuKm8SWyINrf8qqCic7NUAo,17188
-datachain-0.6.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-datachain-0.6.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.6.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.6.3.dist-info/RECORD,,
+datachain-0.6.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.6.5.dist-info/METADATA,sha256=eSh62q8OKalsO_IHYb0M2lT4y0x5z84uX1WVt7_dZlM,17188
+datachain-0.6.5.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+datachain-0.6.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.6.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.6.5.dist-info/RECORD,,
datachain/storage.py DELETED
@@ -1,136 +0,0 @@
-import posixpath
-from abc import ABC, abstractmethod
-from datetime import datetime, timedelta, timezone
-from functools import cached_property
-from typing import NamedTuple, NewType, Optional, Union
-from urllib.parse import urlparse
-
-from datachain.utils import is_expired, time_to_local_str, time_to_str
-
-STALE_MINUTES_LIMIT = 15
-
-# StorageURI represents a normalised URI to a valid storage location (full bucket or
-# absolute local path).
-# Valid examples: s3://foo, file:///var/data
-# Invalid examples: s3://foo/, s3://foo/bar, file://~
-StorageURI = NewType("StorageURI", str)
-
-
-class StorageStatus:
-    CREATED = 1
-    PENDING = 2
-    FAILED = 3
-    COMPLETE = 4
-    PARTIAL = 5
-    STALE = 6
-    INDEXING_SCHEDULED = 7
-    DELETE_SCHEDULED = 8
-
-
-class AbstractStorage(ABC):
-    @property
-    @abstractmethod
-    def uri(self) -> StorageURI: ...
-
-    @property
-    @abstractmethod
-    def timestamp(self) -> Optional[Union[datetime, str]]: ...
-
-    @property
-    @abstractmethod
-    def expires(self) -> Optional[Union[datetime, str]]: ...
-
-    @property
-    @abstractmethod
-    def status(self) -> int: ...
-
-    @property
-    def type(self):
-        return self._parsed_uri.scheme
-
-    @property
-    def name(self):
-        return self._parsed_uri.netloc
-
-    @cached_property
-    def _parsed_uri(self):
-        return urlparse(self.uri)
-
-
-class StorageRecord(NamedTuple):
-    id: int
-    uri: StorageURI
-    timestamp: Optional[Union[datetime, str]] = None
-    expires: Optional[Union[datetime, str]] = None
-    started_inserting_at: Optional[Union[datetime, str]] = None
-    last_inserted_at: Optional[Union[datetime, str]] = None
-    status: int = StorageStatus.CREATED
-    error_message: str = ""
-    error_stack: str = ""
-
-
-class Storage(StorageRecord, AbstractStorage):
-    @property
-    def is_indexed(self) -> bool:
-        return self.status == StorageStatus.COMPLETE
-
-    @property
-    def is_expired(self) -> bool:
-        return is_expired(self.expires)
-
-    @property
-    def is_pending(self) -> bool:
-        return self.status == StorageStatus.PENDING
-
-    @property
-    def is_stale(self) -> bool:
-        limit = datetime.now(timezone.utc) - timedelta(minutes=STALE_MINUTES_LIMIT)
-        date_to_check = self.last_inserted_at or self.started_inserting_at
-
-        return self.is_pending and date_to_check < limit  # type: ignore [operator]
-
-    @property
-    def need_indexing(self) -> bool:
-        return self.is_expired or not self.is_indexed
-
-    @property
-    def timestamp_str(self) -> Optional[str]:
-        if not self.timestamp:
-            return None
-        return time_to_str(self.timestamp)
-
-    @property
-    def timestamp_to_local(self) -> Optional[str]:
-        if not self.timestamp:
-            return None
-        return time_to_local_str(self.timestamp)
-
-    @property
-    def expires_to_local(self) -> Optional[str]:
-        if not self.expires:
-            return None
-        return time_to_local_str(self.expires)
-
-    @staticmethod
-    def get_expiration_time(timestamp: datetime, ttl: int):
-        if ttl >= 0:
-            try:
-                return timestamp + timedelta(seconds=ttl)
-            except OverflowError:
-                return datetime.max
-        else:
-            return datetime.max
-
-    @staticmethod
-    def dataset_name(uri: str, partial_path: str) -> str:
-        return f"{uri}/{partial_path}"
-
-    def to_dict(self, file_path=""):
-        uri = self.uri
-        if file_path:
-            uri = posixpath.join(uri, *file_path.rstrip("/").split("/"))
-        return {
-            "uri": uri,
-            "timestamp": time_to_str(self.timestamp) if self.timestamp else None,
-            "expires": time_to_str(self.expires) if self.expires else None,
-        }