datachain 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -42,6 +42,7 @@ from datachain.dataset import (
    DatasetStats,
    DatasetStatus,
    RowDict,
+    StorageURI,
    create_dataset_uri,
    parse_dataset_uri,
)
@@ -58,7 +59,6 @@ from datachain.node import DirType, Node, NodeWithPath
from datachain.nodes_thread_pool import NodesThreadPool
from datachain.remote.studio import StudioClient
from datachain.sql.types import DateTime, SQLType, String
-from datachain.storage import StorageURI
from datachain.utils import (
    DataChainDir,
    batched,
@@ -1702,31 +1702,9 @@ class Catalog:
        *,
        client_config=None,
    ) -> None:
-        root_sources = [
-            src for src in sources if Client.get_implementation(src).is_root_url(src)
-        ]
-        non_root_sources = [
-            src
-            for src in sources
-            if not Client.get_implementation(src).is_root_url(src)
-        ]
-
-        client_config = client_config or self.client_config
-
-        # for root sources (e.g s3://) we are just getting all buckets and
-        # saving them as storages, without further indexing in each bucket
-        for source in root_sources:
-            for bucket in Client.get_implementation(source).ls_buckets(**client_config):
-                client = self.get_client(bucket.uri, **client_config)
-                print(f"Registering storage {client.uri}")
-                self.metastore.create_storage_if_not_registered(client.uri)
-
        self.enlist_sources(
-            non_root_sources,
+            sources,
            update,
-            client_config=client_config,
+            client_config=client_config or self.client_config,
            only_index=True,
        )
-
-    def find_stale_storages(self) -> None:
-        self.metastore.find_stale_storages()
datachain/cli.py CHANGED
@@ -568,12 +568,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
    )
    add_sources_arg(parse_index)

-    subp.add_parser(
-        "find-stale-storages",
-        parents=[parent_parser],
-        description="Finds and marks stale storages",
-    )
-
    show_parser = subp.add_parser(
        "show",
        parents=[parent_parser],
@@ -1100,8 +1094,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
        )
    elif args.command == "completion":
        print(completion(args.shell))
-    elif args.command == "find-stale-storages":
-        catalog.find_stale_storages()
    elif args.command == "query":
        query(
            catalog,
@@ -31,11 +31,12 @@ from datachain.error import ClientError as DataChainClientError
from datachain.lib.file import File
from datachain.nodes_fetcher import NodesFetcher
from datachain.nodes_thread_pool import NodeChunk
-from datachain.storage import StorageURI

if TYPE_CHECKING:
    from fsspec.spec import AbstractFileSystem

+    from datachain.dataset import StorageURI
+

logger = logging.getLogger("datachain")

@@ -63,7 +64,7 @@ def _is_win_local_path(uri: str) -> bool:

class Bucket(NamedTuple):
    name: str
-    uri: StorageURI
+    uri: "StorageURI"
    created: Optional[datetime]


@@ -115,7 +116,7 @@ class Client(ABC):
        return DATA_SOURCE_URI_PATTERN.match(name) is not None

    @staticmethod
-    def parse_url(source: str) -> tuple[StorageURI, str]:
+    def parse_url(source: str) -> tuple["StorageURI", str]:
        cls = Client.get_implementation(source)
        storage_name, rel_path = cls.split_url(source)
        return cls.get_uri(storage_name), rel_path
@@ -148,7 +149,7 @@ class Client(ABC):
    @classmethod
    def from_source(
        cls,
-        uri: StorageURI,
+        uri: "StorageURI",
        cache: DataChainCache,
        **kwargs,
    ) -> "Client":
@@ -156,6 +157,8 @@ class Client(ABC):

    @classmethod
    def ls_buckets(cls, **kwargs) -> Iterator[Bucket]:
+        from datachain.dataset import StorageURI
+
        for entry in cls.create_fs(**kwargs).ls(cls.PREFIX, detail=True):
            name = entry["name"].rstrip("/")
            yield Bucket(
@@ -169,7 +172,9 @@ class Client(ABC):
        return url == cls.PREFIX

    @classmethod
-    def get_uri(cls, name) -> StorageURI:
+    def get_uri(cls, name) -> "StorageURI":
+        from datachain.dataset import StorageURI
+
        return StorageURI(f"{cls.PREFIX}{name}")

    @classmethod
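Note: the client hunks above replace the old module-level `from datachain.storage import StorageURI` with a `TYPE_CHECKING`-only import (for annotations) plus local imports inside the methods that construct a `StorageURI` at runtime, avoiding a module-load-time cycle with `datachain.dataset` (which, in the dataset.py hunks further below, now imports `Client` lazily for the same reason). A condensed sketch of the pattern as it appears in these hunks (the class body is trimmed down for illustration):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by type checkers; never executed at runtime,
        # so it cannot trigger an import cycle.
        from datachain.dataset import StorageURI


    class ExampleClient:
        PREFIX = "s3://"

        @classmethod
        def get_uri(cls, name: str) -> "StorageURI":
            # Deferred runtime import: by the time this runs, both modules
            # have finished initializing.
            from datachain.dataset import StorageURI

            return StorageURI(f"{cls.PREFIX}{name}")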
datachain/client/hf.py CHANGED
@@ -23,6 +23,7 @@ class HfClient(Client):

    def info_to_file(self, v: dict[str, Any], path: str) -> File:
        return File(
+            source=self.uri,
            path=path,
            size=v["size"],
            version=v["last_commit"].oid,
datachain/client/local.py CHANGED
@@ -2,16 +2,18 @@ import os
import posixpath
from datetime import datetime, timezone
from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse

from fsspec.implementations.local import LocalFileSystem

from datachain.lib.file import File
-from datachain.storage import StorageURI

from .fsspec import Client

+if TYPE_CHECKING:
+    from datachain.dataset import StorageURI
+

class FileClient(Client):
    FS_CLASS = LocalFileSystem
@@ -28,7 +30,9 @@ class FileClient(Client):
        raise TypeError("Signed urls are not implemented for local file system")

    @classmethod
-    def get_uri(cls, name) -> StorageURI:
+    def get_uri(cls, name) -> "StorageURI":
+        from datachain.dataset import StorageURI
+
        return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')

    @classmethod
@@ -1,9 +1,7 @@
import copy
-import hashlib
import json
import logging
import os
-import posixpath
from abc import ABC, abstractmethod
from collections.abc import Iterator
from datetime import datetime, timezone
@@ -24,7 +22,6 @@ from sqlalchemy import (
    UniqueConstraint,
    select,
)
-from sqlalchemy.sql import func

from datachain.data_storage import JobQueryType, JobStatus
from datachain.data_storage.serializer import Serializable
@@ -33,15 +30,14 @@ from datachain.dataset import (
    DatasetRecord,
    DatasetStatus,
    DatasetVersion,
+    StorageURI,
)
from datachain.error import (
    DatasetNotFoundError,
-    StorageNotFoundError,
    TableMissingError,
)
from datachain.job import Job
-from datachain.storage import Storage, StorageStatus, StorageURI
-from datachain.utils import JSONSerialize, is_expired
+from datachain.utils import JSONSerialize

if TYPE_CHECKING:
    from sqlalchemy import Delete, Insert, Select, Update
@@ -60,21 +56,17 @@ class AbstractMetastore(ABC, Serializable):
    """

    uri: StorageURI
-    partial_id: Optional[int]

    schema: "schema.Schema"
-    storage_class: type[Storage] = Storage
    dataset_class: type[DatasetRecord] = DatasetRecord
    dependency_class: type[DatasetDependency] = DatasetDependency
    job_class: type[Job] = Job

    def __init__(
        self,
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
    ):
-        self.uri = uri
-        self.partial_id: Optional[int] = partial_id
+        self.uri = uri or StorageURI("")

    def __enter__(self) -> "AbstractMetastore":
        """Returns self upon entering context manager."""
@@ -86,8 +78,7 @@ class AbstractMetastore(ABC, Serializable):
    @abstractmethod
    def clone(
        self,
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
        use_new_connection: bool = False,
    ) -> "AbstractMetastore":
        """Clones AbstractMetastore implementation for some Storage input.
@@ -95,10 +86,6 @@ class AbstractMetastore(ABC, Serializable):
        New connections should only be used if needed due to errors with
        closed connections."""

-    @abstractmethod
-    def init(self, uri: StorageURI) -> None:
-        """Initialize partials table for given storage uri."""
-
    def close(self) -> None:
        """Closes any active database or HTTP connections."""

@@ -114,96 +101,6 @@ class AbstractMetastore(ABC, Serializable):
    def cleanup_for_tests(self) -> None:
        """Cleanup for tests."""

-    #
-    # Storages
-    #
-
-    @abstractmethod
-    def create_storage_if_not_registered(self, uri: StorageURI) -> None:
-        """Saves new storage if it doesn't exist in database."""
-
-    @abstractmethod
-    def register_storage_for_indexing(
-        self,
-        uri: StorageURI,
-        force_update: bool = True,
-        prefix: str = "",
-    ) -> tuple[Storage, bool, bool, Optional[int], Optional[str]]:
-        """
-        Prepares storage for indexing operation.
-        This method should be called before index operation is started
-        It returns:
-            - storage, prepared for indexing
-            - boolean saying if indexing is needed
-            - boolean saying if indexing is currently pending (running)
-            - partial id
-            - partial path
-        """
-
-    @abstractmethod
-    def find_stale_storages(self) -> None:
-        """
-        Finds all pending storages for which the last inserted node has happened
-        before STALE_MINUTES_LIMIT minutes, and marks it as STALE.
-        """
-
-    @abstractmethod
-    def mark_storage_indexed(
-        self,
-        uri: StorageURI,
-        status: int,
-        ttl: int,
-        end_time: Optional[datetime] = None,
-        prefix: str = "",
-        partial_id: int = 0,
-        error_message: str = "",
-        error_stack: str = "",
-        dataset: Optional[DatasetRecord] = None,
-    ) -> None:
-        """
-        Marks storage as indexed.
-        This method should be called when index operation is finished.
-        """
-
-    @abstractmethod
-    def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
-        """Updates last inserted datetime in bucket with current time."""
-
-    @abstractmethod
-    def get_storage(self, uri: StorageURI) -> Storage:
-        """
-        Gets storage representation from database.
-        E.g. if s3 is used as storage this would be s3 bucket data.
-        """
-
-    @abstractmethod
-    def mark_storage_pending(self, storage: Storage) -> Storage:
-        """Marks storage as pending."""
-
-    #
-    # Partial Indexes
-    #
-
-    @abstractmethod
-    def init_partial_id(self, uri: StorageURI) -> None:
-        """Initializes partial id for given storage."""
-
-    @abstractmethod
-    def get_next_partial_id(self, uri: StorageURI) -> int:
-        """Returns next partial id for given storage."""
-
-    @abstractmethod
-    def get_valid_partial_id(
-        self, uri: StorageURI, prefix: str, raise_exc: bool = True
-    ) -> tuple[Optional[int], Optional[str]]:
-        """
-        Returns valid partial id and it's path, if they exist, for a given storage.
-        """
-
-    @abstractmethod
-    def get_last_partial_path(self, uri: StorageURI) -> Optional[str]:
-        """Returns last partial path for given storage."""
-
    #
    # Datasets
    #
@@ -397,8 +294,6 @@ class AbstractDBMetastore(AbstractMetastore):
    and has shared logic for all database systems currently in use.
    """

-    PARTIALS_TABLE_NAME_PREFIX = "prt_"
-    STORAGE_TABLE = "buckets"
    DATASET_TABLE = "datasets"
    DATASET_VERSION_TABLE = "datasets_versions"
    DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
@@ -410,15 +305,11 @@ class AbstractDBMetastore(AbstractMetastore):
    def __init__(
        self,
        id_generator: "AbstractIDGenerator",
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
    ):
+        uri = uri or StorageURI("")
        self.id_generator = id_generator
-        super().__init__(uri, partial_id)
-
-    @abstractmethod
-    def init(self, uri: StorageURI) -> None:
-        """Initialize partials table for given storage uri."""
+        super().__init__(uri)

    def close(self) -> None:
        """Closes any active database connections."""
@@ -428,21 +319,6 @@ class AbstractDBMetastore(AbstractMetastore):
        """Cleanup temp tables."""
        self.id_generator.delete_uris(temp_table_names)

-    @classmethod
-    def _buckets_columns(cls) -> list["SchemaItem"]:
-        """Buckets (storages) table columns."""
-        return [
-            Column("id", Integer, primary_key=True, nullable=False),
-            Column("uri", Text, nullable=False),
-            Column("timestamp", DateTime(timezone=True)),
-            Column("expires", DateTime(timezone=True)),
-            Column("started_inserting_at", DateTime(timezone=True)),
-            Column("last_inserted_at", DateTime(timezone=True)),
-            Column("status", Integer, nullable=False),
-            Column("error_message", Text, nullable=False, default=""),
-            Column("error_stack", Text, nullable=False, default=""),
-        ]
-
    @classmethod
    def _datasets_columns(cls) -> list["SchemaItem"]:
        """Datasets table columns."""
@@ -543,58 +419,11 @@ class AbstractDBMetastore(AbstractMetastore):
                ForeignKey(f"{cls.DATASET_VERSION_TABLE}.id"),
                nullable=True,
            ),
-            # TODO remove when https://github.com/iterative/dvcx/issues/1121 is done
-            # If we unify datasets and bucket listing then both bucket fields won't
-            # be needed
-            Column(
-                "bucket_id",
-                Integer,
-                ForeignKey(f"{cls.STORAGE_TABLE}.id"),
-                nullable=True,
-            ),
-            Column("bucket_version", Text, nullable=True),
-        ]
-
-    @classmethod
-    def _storage_partial_columns(cls) -> list["SchemaItem"]:
-        """Storage partial table columns."""
-        return [
-            Column("path_str", Text, nullable=False),
-            # This is generated before insert and is not the SQLite rowid,
-            # so it is not the primary key.
-            Column("partial_id", Integer, nullable=False, index=True),
-            Column("timestamp", DateTime(timezone=True)),
-            Column("expires", DateTime(timezone=True)),
        ]

-    def _get_storage_partial_table(self, name: str) -> Table:
-        table = self.db.metadata.tables.get(name)
-        if table is None:
-            table = Table(
-                name,
-                self.db.metadata,
-                *self._storage_partial_columns(),
-            )
-        return table
-
    #
    # Query Tables
    #
-
-    def _partials_table(self, uri: StorageURI) -> Table:
-        return self._get_storage_partial_table(self._partials_table_name(uri))
-
-    @cached_property
-    def _storages(self) -> Table:
-        return Table(self.STORAGE_TABLE, self.db.metadata, *self._buckets_columns())
-
-    @cached_property
-    def _partials(self) -> Table:
-        assert (
-            self._current_partials_table_name
-        ), "Partials can only be used if uri/current_partials_table_name is set"
-        return self._get_storage_partial_table(self._current_partials_table_name)
-
    @cached_property
    def _datasets(self) -> Table:
        return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
@@ -618,32 +447,6 @@ class AbstractDBMetastore(AbstractMetastore):
    #
    # Query Starters (These can be overridden by subclasses)
    #
-
-    @abstractmethod
-    def _storages_insert(self) -> "Insert": ...
-
-    def _storages_select(self, *columns) -> "Select":
-        if not columns:
-            return self._storages.select()
-        return select(*columns)
-
-    def _storages_update(self) -> "Update":
-        return self._storages.update()
-
-    def _storages_delete(self) -> "Delete":
-        return self._storages.delete()
-
-    @abstractmethod
-    def _partials_insert(self) -> "Insert": ...
-
-    def _partials_select(self, *columns) -> "Select":
-        if not columns:
-            return self._partials.select()
-        return select(*columns)
-
-    def _partials_update(self) -> "Update":
-        return self._partials.update()
-
    @abstractmethod
    def _datasets_insert(self) -> "Insert": ...

@@ -686,275 +489,6 @@ class AbstractDBMetastore(AbstractMetastore):
    def _datasets_dependencies_delete(self) -> "Delete":
        return self._datasets_dependencies.delete()

-    #
-    # Table Name Internal Functions
-    #
-
-    def _partials_table_name(self, uri: StorageURI) -> str:
-        sha = hashlib.sha256(uri.encode("utf-8")).hexdigest()[:12]
-        return f"{self.PARTIALS_TABLE_NAME_PREFIX}_{sha}"
-
-    @property
-    def _current_partials_table_name(self) -> Optional[str]:
-        if not self.uri:
-            return None
-        return self._partials_table_name(self.uri)
-
-    #
-    # Storages
-    #
-
-    def create_storage_if_not_registered(self, uri: StorageURI, conn=None) -> None:
-        """Saves new storage if it doesn't exist in database."""
-        query = self._storages_insert().values(
-            uri=uri,
-            status=StorageStatus.CREATED,
-            error_message="",
-            error_stack="",
-        )
-        if hasattr(query, "on_conflict_do_nothing"):
-            # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
-            # but generic SQL does not
-            query = query.on_conflict_do_nothing()
-        self.db.execute(query, conn=conn)
-
-    def register_storage_for_indexing(
-        self,
-        uri: StorageURI,
-        force_update: bool = True,
-        prefix: str = "",
-    ) -> tuple[Storage, bool, bool, Optional[int], Optional[str]]:
-        """
-        Prepares storage for indexing operation.
-        This method should be called before index operation is started
-        It returns:
-            - storage, prepared for indexing
-            - boolean saying if indexing is needed
-            - boolean saying if indexing is currently pending (running)
-            - partial id
-            - partial path
-        """
-        # This ensures that all calls to the DB are in a single transaction
-        # and commit is automatically called once this function returns
-        with self.db.transaction() as conn:
-            # Create storage if it doesn't exist
-            self.create_storage_if_not_registered(uri, conn=conn)
-            storage = self.get_storage(uri, conn=conn)
-
-            if storage.status == StorageStatus.PENDING:
-                return storage, False, True, None, None
-
-            if storage.is_expired or storage.status == StorageStatus.STALE:
-                storage = self.mark_storage_pending(storage, conn=conn)
-                return storage, True, False, None, None
-
-            if (
-                storage.status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE)
-                and not force_update
-            ):
-                partial_id, partial_path = self.get_valid_partial_id(
-                    uri, prefix, raise_exc=False
-                )
-                if partial_id is not None:
-                    return storage, False, False, partial_id, partial_path
-                return storage, True, False, None, None
-
-            storage = self.mark_storage_pending(storage, conn=conn)
-            return storage, True, False, None, None
-
-    def find_stale_storages(self) -> None:
-        """
-        Finds all pending storages for which the last inserted node has happened
-        before STALE_MINUTES_LIMIT minutes, and marks it as STALE.
-        """
-        s = self._storages
-        with self.db.transaction() as conn:
-            pending_storages = map(
-                self.storage_class._make,
-                self.db.execute(
-                    self._storages_select().where(s.c.status == StorageStatus.PENDING),
-                    conn=conn,
-                ),
-            )
-            for storage in pending_storages:
-                if storage.is_stale:
-                    print(f"Marking storage {storage.uri} as stale")
-                    self._mark_storage_stale(storage.id, conn=conn)
-
-    def mark_storage_indexed(
-        self,
-        uri: StorageURI,
-        status: int,
-        ttl: int,
-        end_time: Optional[datetime] = None,
-        prefix: str = "",
-        partial_id: int = 0,
-        error_message: str = "",
-        error_stack: str = "",
-        dataset: Optional[DatasetRecord] = None,
-    ) -> None:
-        """
-        Marks storage as indexed.
-        This method should be called when index operation is finished.
-        """
-        if status == StorageStatus.PARTIAL and not prefix:
-            raise AssertionError("Partial indexing requires a prefix")
-
-        if end_time is None:
-            end_time = datetime.now(timezone.utc)
-        expires = Storage.get_expiration_time(end_time, ttl)
-
-        s = self._storages
-        with self.db.transaction() as conn:
-            self.db.execute(
-                self._storages_update()
-                .where(s.c.uri == uri)
-                .values(  # type: ignore [attr-defined]
-                    timestamp=end_time,
-                    expires=expires,
-                    status=status,
-                    last_inserted_at=end_time,
-                    error_message=error_message,
-                    error_stack=error_stack,
-                ),
-                conn=conn,
-            )
-
-            if not self._current_partials_table_name:
-                # This only occurs in tests
-                return
-
-            if status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE):
-                dir_prefix = posixpath.join(prefix, "")
-                self.db.execute(
-                    self._partials_insert().values(
-                        path_str=dir_prefix,
-                        timestamp=end_time,
-                        expires=expires,
-                        partial_id=partial_id,
-                    ),
-                    conn=conn,
-                )
-
-            # update underlying dataset status as well
-            if status == StorageStatus.FAILED and dataset:
-                self.update_dataset_status(
-                    dataset,
-                    DatasetStatus.FAILED,
-                    dataset.latest_version,
-                    error_message=error_message,
-                    error_stack=error_stack,
-                    conn=conn,
-                )
-
-            if status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE) and dataset:
-                self.update_dataset_status(
-                    dataset, DatasetStatus.COMPLETE, dataset.latest_version, conn=conn
-                )
-
-    def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
-        """Updates last inserted datetime in bucket with current time"""
-        uri = uri or self.uri
-        updates = {"last_inserted_at": datetime.now(timezone.utc)}
-        s = self._storages
-        self.db.execute(
-            self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
-        )
-
-    def get_storage(self, uri: StorageURI, conn=None) -> Storage:
-        """
-        Gets storage representation from database.
-        E.g. if s3 is used as storage this would be s3 bucket data
-        """
-        s = self._storages
-        result = next(
-            self.db.execute(self._storages_select().where(s.c.uri == uri), conn=conn),
-            None,
-        )
-        if not result:
-            raise StorageNotFoundError(f"Storage {uri} not found.")
-
-        return self.storage_class._make(result)
-
-    def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
-        # Update status to pending and dates
-        updates = {
-            "status": StorageStatus.PENDING,
-            "timestamp": None,
-            "expires": None,
-            "last_inserted_at": None,
-            "started_inserting_at": datetime.now(timezone.utc),
-        }
-        storage = storage._replace(**updates)  # type: ignore [arg-type]
-        s = self._storages
-        self.db.execute(
-            self._storages_update().where(s.c.uri == storage.uri).values(**updates),  # type: ignore [attr-defined]
-            conn=conn,
-        )
-        return storage
-
-    def _mark_storage_stale(self, storage_id: int, conn=None) -> None:
-        # Update status to pending and dates
-        updates = {"status": StorageStatus.STALE, "timestamp": None, "expires": None}
-        s = self._storages
-        self.db.execute(
-            self._storages.update().where(s.c.id == storage_id).values(**updates),  # type: ignore [attr-defined]
-            conn=conn,
-        )
-
-    #
-    # Partial Indexes
-    #
-
-    def init_partial_id(self, uri: StorageURI) -> None:
-        """Initializes partial id for given storage."""
-        if not uri:
-            raise ValueError("uri for get_next_partial_id() cannot be empty")
-        self.id_generator.init_id(f"partials:{uri}")
-
-    def get_next_partial_id(self, uri: StorageURI) -> int:
-        """Returns next partial id for given storage."""
-        if not uri:
-            raise ValueError("uri for get_next_partial_id() cannot be empty")
-        return self.id_generator.get_next_id(f"partials:{uri}")
-
-    def get_valid_partial_id(
-        self, uri: StorageURI, prefix: str, raise_exc: bool = True
-    ) -> tuple[Optional[int], Optional[str]]:
-        """
-        Returns valid partial id and it's path, if they exist, for a given storage.
-        """
-        # This SQL statement finds all entries that are
-        # prefixes of the given prefix, matching this or parent directories
-        # that are indexed.
-        dir_prefix = posixpath.join(prefix, "")
-        p = self._partials_table(uri)
-        expire_values = self.db.execute(
-            select(p.c.expires, p.c.partial_id, p.c.path_str)
-            .where(
-                p.c.path_str == func.substr(dir_prefix, 1, func.length(p.c.path_str))
-            )
-            .order_by(p.c.expires.desc())
-        )
-        for expires, partial_id, path_str in expire_values:
-            if not is_expired(expires):
-                return partial_id, path_str
-        if raise_exc:
-            raise RuntimeError(f"Unable to get valid partial_id: {uri=}, {prefix=}")
-        return None, None
-
-    def get_last_partial_path(self, uri: StorageURI) -> Optional[str]:
-        """Returns last partial path for given storage."""
-        p = self._partials_table(uri)
-        if not self.db.has_table(p.name):
-            raise StorageNotFoundError(f"Storage {uri} partials are not found.")
-        last_partial = self.db.execute(
-            select(p.c.path_str).order_by(p.c.timestamp.desc()).limit(1)
-        )
-        for (path_str,) in last_partial:
-            return path_str
-        return None
-
    #
    # Datasets
    #
@@ -1298,7 +832,6 @@ class AbstractDBMetastore(AbstractMetastore):
        d = self._datasets
        dd = self._datasets_dependencies
        dv = self._datasets_versions
-        s = self._storages

        dataset_version = dataset.get_version(version)

@@ -1307,9 +840,9 @@ class AbstractDBMetastore(AbstractMetastore):
        query = (
            self._datasets_dependencies_select(*select_cols)
            .select_from(
-                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
-                .join(s, dd.c.bucket_id == s.c.id, isouter=True)
-                .join(dv, dd.c.dataset_version_id == dv.c.id, isouter=True)
+                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True).join(
+                    dv, dd.c.dataset_version_id == dv.c.id, isouter=True
+                )
            )
            .where(
                (dd.c.source_dataset_id == dataset.id)
@@ -29,12 +29,11 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
from datachain.data_storage.db_engine import DatabaseEngine
from datachain.data_storage.id_generator import AbstractDBIDGenerator
from datachain.data_storage.schema import DefaultSchema
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, StorageURI
from datachain.error import DataChainError
from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
from datachain.sql.sqlite.base import load_usearch_extension
from datachain.sql.types import SQLType
-from datachain.storage import StorageURI
from datachain.utils import DataChainDir, batched_it

if TYPE_CHECKING:
@@ -392,14 +391,14 @@ class SQLiteMetastore(AbstractDBMetastore):
    def __init__(
        self,
        id_generator: "SQLiteIDGenerator",
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
        db: Optional["SQLiteDatabaseEngine"] = None,
        db_file: Optional[str] = None,
        in_memory: bool = False,
    ):
+        uri = uri or StorageURI("")
        self.schema: DefaultSchema = DefaultSchema()
-        super().__init__(id_generator, uri, partial_id)
+        super().__init__(id_generator, uri)

        # needed for dropping tables in correct order for tests because of
        # foreign keys
@@ -417,21 +416,16 @@ class SQLiteMetastore(AbstractDBMetastore):

    def clone(
        self,
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
        use_new_connection: bool = False,
    ) -> "SQLiteMetastore":
-        if not uri:
-            if partial_id is not None:
-                raise ValueError("if partial_id is used, uri cannot be empty")
-            if self.uri:
-                uri = self.uri
-            if self.partial_id:
-                partial_id = self.partial_id
+        uri = uri or StorageURI("")
+        if not uri and self.uri:
+            uri = self.uri
+
        return SQLiteMetastore(
            self.id_generator.clone(),
            uri=uri,
-            partial_id=partial_id,
            db=self.db.clone(),
        )

@@ -446,7 +440,6 @@ class SQLiteMetastore(AbstractDBMetastore):
            {
                "id_generator_clone_params": self.id_generator.clone_params(),
                "uri": self.uri,
-                "partial_id": self.partial_id,
                "db_clone_params": self.db.clone_params(),
            },
        )
@@ -457,7 +450,6 @@ class SQLiteMetastore(AbstractDBMetastore):
        *,
        id_generator_clone_params: tuple[Callable, list, dict[str, Any]],
        uri: StorageURI,
-        partial_id: Optional[int],
        db_clone_params: tuple[Callable, list, dict[str, Any]],
    ) -> "SQLiteMetastore":
        (
@@ -469,14 +461,11 @@ class SQLiteMetastore(AbstractDBMetastore):
        return cls(
            id_generator=id_generator_class(*id_generator_args, **id_generator_kwargs),
            uri=uri,
-            partial_id=partial_id,
            db=db_class(*db_args, **db_kwargs),
        )

    def _init_tables(self) -> None:
        """Initialize tables."""
-        self.db.create_table(self._storages, if_not_exists=True)
-        self.default_table_names.append(self._storages.name)
        self.db.create_table(self._datasets, if_not_exists=True)
        self.default_table_names.append(self._datasets.name)
        self.db.create_table(self._datasets_versions, if_not_exists=True)
@@ -486,28 +475,11 @@ class SQLiteMetastore(AbstractDBMetastore):
        self.db.create_table(self._jobs, if_not_exists=True)
        self.default_table_names.append(self._jobs.name)

-    def init(self, uri: StorageURI) -> None:
-        if not uri:
-            raise ValueError("uri for init() cannot be empty")
-        partials_table = self._partials_table(uri)
-        self.db.create_table(partials_table, if_not_exists=True)
-
-    @classmethod
-    def _buckets_columns(cls) -> list["SchemaItem"]:
-        """Buckets (storages) table columns."""
-        return [*super()._buckets_columns(), UniqueConstraint("uri")]
-
    @classmethod
    def _datasets_columns(cls) -> list["SchemaItem"]:
        """Datasets table columns."""
        return [*super()._datasets_columns(), UniqueConstraint("name")]

-    def _storages_insert(self) -> "Insert":
-        return sqlite.insert(self._storages)
-
-    def _partials_insert(self) -> "Insert":
-        return sqlite.insert(self._partials)
-
    def _datasets_insert(self) -> "Insert":
        return sqlite.insert(self._datasets)

@@ -526,13 +498,9 @@ class SQLiteMetastore(AbstractDBMetastore):
            self._datasets_dependencies.c.id,
            self._datasets_dependencies.c.dataset_id,
            self._datasets_dependencies.c.dataset_version_id,
-            self._datasets_dependencies.c.bucket_id,
-            self._datasets_dependencies.c.bucket_version,
            self._datasets.c.name,
-            self._datasets.c.created_at,
            self._datasets_versions.c.version,
            self._datasets_versions.c.created_at,
-            self._storages.c.uri,
        ]

    #
@@ -19,11 +19,10 @@ from tqdm import tqdm
from datachain.client import Client
from datachain.data_storage.schema import convert_rows_custom_column_types
from datachain.data_storage.serializer import Serializable
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, StorageURI
from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
from datachain.sql.functions import path as pathfunc
from datachain.sql.types import Int, SQLType
-from datachain.storage import StorageURI
from datachain.utils import sql_escape_like

if TYPE_CHECKING:
datachain/dataset.py CHANGED
@@ -3,21 +3,17 @@ import json
from dataclasses import dataclass, fields
from datetime import datetime
from typing import (
-    TYPE_CHECKING,
    Any,
+    NewType,
    Optional,
    TypeVar,
    Union,
)
from urllib.parse import urlparse

-from datachain.client import Client
from datachain.error import DatasetVersionNotFoundError
from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

-if TYPE_CHECKING:
-    from datachain.storage import StorageURI
-
T = TypeVar("T", bound="DatasetRecord")
V = TypeVar("V", bound="DatasetVersion")
DD = TypeVar("DD", bound="DatasetDependency")
@@ -27,6 +23,13 @@ QUERY_DATASET_PREFIX = "ds_query_"
LISTING_PREFIX = "lst__"


+# StorageURI represents a normalised URI to a valid storage location (full bucket or
+# absolute local path).
+# Valid examples: s3://foo, file:///var/data
+# Invalid examples: s3://foo/, s3://foo/bar, file://~
+StorageURI = NewType("StorageURI", str)
+
+
def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
    """
    Parse dataser uri to extract name and version out of it (if version is defined)
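Because `StorageURI` is a `typing.NewType` over `str`, it is a plain string at runtime while letting static type checkers tell normalised storage roots apart from arbitrary strings. A small illustrative sketch (not datachain code):

    from typing import NewType

    StorageURI = NewType("StorageURI", str)


    def bucket_name(uri: StorageURI) -> str:
        # At runtime StorageURI("s3://foo") is just the str "s3://foo".
        return uri.split("://", 1)[1]


    bucket_name(StorageURI("s3://foo"))  # accepted by a type checker
    bucket_name("s3://foo")              # flagged: plain str is not StorageURI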
@@ -94,14 +97,11 @@ class DatasetDependency:
        id: int,
        dataset_id: Optional[int],
        dataset_version_id: Optional[int],
-        bucket_id: Optional[int],
-        bucket_version: Optional[str],
        dataset_name: Optional[str],
-        dataset_created_at: Optional[datetime],
        dataset_version: Optional[int],
        dataset_version_created_at: Optional[datetime],
-        bucket_uri: Optional["StorageURI"],
    ) -> Optional["DatasetDependency"]:
+        from datachain.client import Client
        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name

        if not dataset_id:
@@ -124,7 +124,7 @@ class DatasetDependency:
                if dataset_version
                else None
            ),
-            dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
+            dataset_version_created_at,  # type: ignore[arg-type]
            [],
        )

@@ -448,6 +448,8 @@ class DatasetRecord:
        For bucket listing we implicitly create underlying dataset to hold data. This
        method is checking if this is one of those datasets.
        """
+        from datachain.client import Client
+
        # TODO refactor and maybe remove method in
        # https://github.com/iterative/datachain/issues/318
        return Client.is_data_source_uri(self.name) or self.name.startswith(
datachain/error.py CHANGED
@@ -18,10 +18,6 @@ class DatasetInvalidVersionError(Exception):
    pass


-class StorageNotFoundError(NotFoundError):
-    pass
-
-
class PendingIndexingError(Exception):
    """An indexing operation is already in progress."""

datachain/lib/arrow.py CHANGED
@@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
        return dict
    if isinstance(col_type, pa.lib.DictionaryType):
        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
-    raise TypeError(f"{col_type!r} datatypes not supported")
+    raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")


def _nrows_file(file: File, nrows: int) -> str:
datachain/node.py CHANGED
@@ -3,8 +3,8 @@ from typing import TYPE_CHECKING, Any, Optional

import attrs

+from datachain.dataset import StorageURI
from datachain.lib.file import File
-from datachain.storage import StorageURI
from datachain.utils import TIME_ZERO, time_to_str

if TYPE_CHECKING:
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: datachain
-Version: 0.6.3
+Version: 0.6.5
Summary: Wrangle unstructured AI data at scale
Author-email: Dmitry Petrov <support@dvc.org>
License: Apache-2.0
@@ -2,45 +2,44 @@ datachain/__init__.py,sha256=OGzc8xZWtwqxiiutjU4AxCRPY0lrX_csgERiTrq4G0o,908
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=EM6jlc9zunOJQi7-GwCyVtlumHmLM8NwN9Y6jqVGzyY,33769
+datachain/cli.py,sha256=Wl-xMpTRgrkg4drX5I_QxAB1IATyULHCXOdx_wfoLVg,33529
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=w7qqJP7xYrm9CmBSmSezSxUQHZDsHKkwviF8AYUob7o,14671
-datachain/error.py,sha256=vbIbamnFMIojh1UpmxWoA6Omup7WFAFNJnf8xAkGWwI,1146
+datachain/dataset.py,sha256=lLUbUbJP1TYL9Obkc0f2IDziGcDylZge9ORQjK-WtXs,14717
+datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
datachain/listing.py,sha256=AV23WZq-k6e2zeeNBhVQP1-2PrwNCYidO0HBDKzpVaA,7152
-datachain/node.py,sha256=ThE6Ue4BqpaBvrkFFJW_ljLxchixUX2aWz3l_nbwY54,5195
+datachain/node.py,sha256=i7_jC8VcW6W5VYkDszAOu0H-rNBuqXB4UnLEh4wFzjc,5195
datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
datachain/studio.py,sha256=d-jUsYpfI1LEv3g8KU-lLchVgb9L0TXvlHakieFud_E,3788
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=PvJ-BRoSuI_FRCrXJ6tjMhYZD6L8Beq-ynrdPYRrwiw,58270
+datachain/catalog/catalog.py,sha256=qFlRrR01_9h1MjK6DEgVSgIwbtZEGV_SdG_E5qUsHmM,57352
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=sB98CO7covhmFZg36hsnyv9UwUI8J94AD1QWgGdcBlY,12595
+datachain/client/fsspec.py,sha256=C6C5AO6ndkgcoUxCRN9_8fUzqX2cRWJWG6FL6oD9X_Q,12708
datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
-datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
-datachain/client/local.py,sha256=Uaf_y_UGspOgprDysUTI9wDo334MLjGPUudqVtvef0c,4367
+datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
+datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=HfCxk4lmDUg2Q4WsFNQGMWxllP0mToA00fxkFTwdNIE,52919
+datachain/data_storage/metastore.py,sha256=-TJCqG70VofSVOh2yEez4dwjHS3eQL8p7d9uO3WTVwM,35878
datachain/data_storage/schema.py,sha256=CiRXrDYp5ZZopSyUgZ7MT2ml_6YvqSTYXdybatcbX9M,9849
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=jopfVftng157TVcBKMB_QPlbkE6fTatiY4GYSSLNkig,28737
-datachain/data_storage/warehouse.py,sha256=iIjFOutYxhLev3CcUhUTwMJOkHeAEBwXZ2y3wmjrF1s,30756
+datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
+datachain/data_storage/warehouse.py,sha256=xwMaR4jBpR13vjG3zrhphH4z2_CFLNj0KPF0LJCXCJ8,30727
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=M6SM4u2LeHgylzkPZBWckFeZt3CH3ehpBod3nGl6OYY,9138
+datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
@@ -101,9 +100,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.6.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.6.3.dist-info/METADATA,sha256=BnPIINjkfA0P2Sj9mRziNuKm8SWyINrf8qqCic7NUAo,17188
-datachain-0.6.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-datachain-0.6.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.6.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.6.3.dist-info/RECORD,,
+datachain-0.6.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.6.5.dist-info/METADATA,sha256=eSh62q8OKalsO_IHYb0M2lT4y0x5z84uX1WVt7_dZlM,17188
+datachain-0.6.5.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+datachain-0.6.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.6.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.6.5.dist-info/RECORD,,
datachain/storage.py DELETED
@@ -1,136 +0,0 @@
-import posixpath
-from abc import ABC, abstractmethod
-from datetime import datetime, timedelta, timezone
-from functools import cached_property
-from typing import NamedTuple, NewType, Optional, Union
-from urllib.parse import urlparse
-
-from datachain.utils import is_expired, time_to_local_str, time_to_str
-
-STALE_MINUTES_LIMIT = 15
-
-# StorageURI represents a normalised URI to a valid storage location (full bucket or
-# absolute local path).
-# Valid examples: s3://foo, file:///var/data
-# Invalid examples: s3://foo/, s3://foo/bar, file://~
-StorageURI = NewType("StorageURI", str)
-
-
-class StorageStatus:
-    CREATED = 1
-    PENDING = 2
-    FAILED = 3
-    COMPLETE = 4
-    PARTIAL = 5
-    STALE = 6
-    INDEXING_SCHEDULED = 7
-    DELETE_SCHEDULED = 8
-
-
-class AbstractStorage(ABC):
-    @property
-    @abstractmethod
-    def uri(self) -> StorageURI: ...
-
-    @property
-    @abstractmethod
-    def timestamp(self) -> Optional[Union[datetime, str]]: ...
-
-    @property
-    @abstractmethod
-    def expires(self) -> Optional[Union[datetime, str]]: ...
-
-    @property
-    @abstractmethod
-    def status(self) -> int: ...
-
-    @property
-    def type(self):
-        return self._parsed_uri.scheme
-
-    @property
-    def name(self):
-        return self._parsed_uri.netloc
-
-    @cached_property
-    def _parsed_uri(self):
-        return urlparse(self.uri)
-
-
-class StorageRecord(NamedTuple):
-    id: int
-    uri: StorageURI
-    timestamp: Optional[Union[datetime, str]] = None
-    expires: Optional[Union[datetime, str]] = None
-    started_inserting_at: Optional[Union[datetime, str]] = None
-    last_inserted_at: Optional[Union[datetime, str]] = None
-    status: int = StorageStatus.CREATED
-    error_message: str = ""
-    error_stack: str = ""
-
-
-class Storage(StorageRecord, AbstractStorage):
-    @property
-    def is_indexed(self) -> bool:
-        return self.status == StorageStatus.COMPLETE
-
-    @property
-    def is_expired(self) -> bool:
-        return is_expired(self.expires)
-
-    @property
-    def is_pending(self) -> bool:
-        return self.status == StorageStatus.PENDING
-
-    @property
-    def is_stale(self) -> bool:
-        limit = datetime.now(timezone.utc) - timedelta(minutes=STALE_MINUTES_LIMIT)
-        date_to_check = self.last_inserted_at or self.started_inserting_at
-
-        return self.is_pending and date_to_check < limit  # type: ignore [operator]
-
-    @property
-    def need_indexing(self) -> bool:
-        return self.is_expired or not self.is_indexed
-
-    @property
-    def timestamp_str(self) -> Optional[str]:
-        if not self.timestamp:
-            return None
-        return time_to_str(self.timestamp)
-
-    @property
-    def timestamp_to_local(self) -> Optional[str]:
-        if not self.timestamp:
-            return None
-        return time_to_local_str(self.timestamp)
-
-    @property
-    def expires_to_local(self) -> Optional[str]:
-        if not self.expires:
-            return None
-        return time_to_local_str(self.expires)
-
-    @staticmethod
-    def get_expiration_time(timestamp: datetime, ttl: int):
-        if ttl >= 0:
-            try:
-                return timestamp + timedelta(seconds=ttl)
-            except OverflowError:
-                return datetime.max
-        else:
-            return datetime.max
-
-    @staticmethod
-    def dataset_name(uri: str, partial_path: str) -> str:
-        return f"{uri}/{partial_path}"
-
-    def to_dict(self, file_path=""):
-        uri = self.uri
-        if file_path:
-            uri = posixpath.join(uri, *file_path.rstrip("/").split("/"))
-        return {
-            "uri": uri,
-            "timestamp": time_to_str(self.timestamp) if self.timestamp else None,
-            "expires": time_to_str(self.expires) if self.expires else None,
-        }