datachain 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -1,9 +1,7 @@
 import copy
-import hashlib
 import json
 import logging
 import os
-import posixpath
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
 from datetime import datetime, timezone
@@ -24,7 +22,6 @@ from sqlalchemy import (
     UniqueConstraint,
     select,
 )
-from sqlalchemy.sql import func
 
 from datachain.data_storage import JobQueryType, JobStatus
 from datachain.data_storage.serializer import Serializable
@@ -33,15 +30,14 @@ from datachain.dataset import (
     DatasetRecord,
     DatasetStatus,
     DatasetVersion,
+    StorageURI,
 )
 from datachain.error import (
     DatasetNotFoundError,
-    StorageNotFoundError,
     TableMissingError,
 )
 from datachain.job import Job
-from datachain.storage import Storage, StorageStatus, StorageURI
-from datachain.utils import JSONSerialize, is_expired
+from datachain.utils import JSONSerialize
 
 if TYPE_CHECKING:
     from sqlalchemy import Delete, Insert, Select, Update
@@ -60,21 +56,17 @@ class AbstractMetastore(ABC, Serializable):
     """
 
     uri: StorageURI
-    partial_id: Optional[int]
 
     schema: "schema.Schema"
-    storage_class: type[Storage] = Storage
     dataset_class: type[DatasetRecord] = DatasetRecord
     dependency_class: type[DatasetDependency] = DatasetDependency
     job_class: type[Job] = Job
 
     def __init__(
         self,
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
     ):
-        self.uri = uri
-        self.partial_id: Optional[int] = partial_id
+        self.uri = uri or StorageURI("")
 
     def __enter__(self) -> "AbstractMetastore":
         """Returns self upon entering context manager."""
@@ -86,8 +78,7 @@ class AbstractMetastore(ABC, Serializable):
     @abstractmethod
     def clone(
         self,
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
         use_new_connection: bool = False,
     ) -> "AbstractMetastore":
         """Clones AbstractMetastore implementation for some Storage input.
@@ -95,10 +86,6 @@ class AbstractMetastore(ABC, Serializable):
         New connections should only be used if needed due to errors with
         closed connections."""
 
-    @abstractmethod
-    def init(self, uri: StorageURI) -> None:
-        """Initialize partials table for given storage uri."""
-
     def close(self) -> None:
         """Closes any active database or HTTP connections."""
 
@@ -114,96 +101,6 @@ class AbstractMetastore(ABC, Serializable):
     def cleanup_for_tests(self) -> None:
         """Cleanup for tests."""
 
-    #
-    # Storages
-    #
-
-    @abstractmethod
-    def create_storage_if_not_registered(self, uri: StorageURI) -> None:
-        """Saves new storage if it doesn't exist in database."""
-
-    @abstractmethod
-    def register_storage_for_indexing(
-        self,
-        uri: StorageURI,
-        force_update: bool = True,
-        prefix: str = "",
-    ) -> tuple[Storage, bool, bool, Optional[int], Optional[str]]:
-        """
-        Prepares storage for indexing operation.
-        This method should be called before index operation is started
-        It returns:
-            - storage, prepared for indexing
-            - boolean saying if indexing is needed
-            - boolean saying if indexing is currently pending (running)
-            - partial id
-            - partial path
-        """
-
-    @abstractmethod
-    def find_stale_storages(self) -> None:
-        """
-        Finds all pending storages for which the last inserted node has happened
-        before STALE_MINUTES_LIMIT minutes, and marks it as STALE.
-        """
-
-    @abstractmethod
-    def mark_storage_indexed(
-        self,
-        uri: StorageURI,
-        status: int,
-        ttl: int,
-        end_time: Optional[datetime] = None,
-        prefix: str = "",
-        partial_id: int = 0,
-        error_message: str = "",
-        error_stack: str = "",
-        dataset: Optional[DatasetRecord] = None,
-    ) -> None:
-        """
-        Marks storage as indexed.
-        This method should be called when index operation is finished.
-        """
-
-    @abstractmethod
-    def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
-        """Updates last inserted datetime in bucket with current time."""
-
-    @abstractmethod
-    def get_storage(self, uri: StorageURI) -> Storage:
-        """
-        Gets storage representation from database.
-        E.g. if s3 is used as storage this would be s3 bucket data.
-        """
-
-    @abstractmethod
-    def mark_storage_pending(self, storage: Storage) -> Storage:
-        """Marks storage as pending."""
-
-    #
-    # Partial Indexes
-    #
-
-    @abstractmethod
-    def init_partial_id(self, uri: StorageURI) -> None:
-        """Initializes partial id for given storage."""
-
-    @abstractmethod
-    def get_next_partial_id(self, uri: StorageURI) -> int:
-        """Returns next partial id for given storage."""
-
-    @abstractmethod
-    def get_valid_partial_id(
-        self, uri: StorageURI, prefix: str, raise_exc: bool = True
-    ) -> tuple[Optional[int], Optional[str]]:
-        """
-        Returns valid partial id and it's path, if they exist, for a given storage.
-        """
-
-    @abstractmethod
-    def get_last_partial_path(self, uri: StorageURI) -> Optional[str]:
-        """Returns last partial path for given storage."""
-
     #
     # Datasets
     #
@@ -397,8 +294,6 @@ class AbstractDBMetastore(AbstractMetastore):
     and has shared logic for all database systems currently in use.
     """
 
-    PARTIALS_TABLE_NAME_PREFIX = "prt_"
-    STORAGE_TABLE = "buckets"
     DATASET_TABLE = "datasets"
     DATASET_VERSION_TABLE = "datasets_versions"
     DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
@@ -410,15 +305,11 @@ class AbstractDBMetastore(AbstractMetastore):
     def __init__(
         self,
         id_generator: "AbstractIDGenerator",
-        uri: StorageURI = StorageURI(""),
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
     ):
+        uri = uri or StorageURI("")
         self.id_generator = id_generator
-        super().__init__(uri, partial_id)
-
-    @abstractmethod
-    def init(self, uri: StorageURI) -> None:
-        """Initialize partials table for given storage uri."""
+        super().__init__(uri)
 
     def close(self) -> None:
         """Closes any active database connections."""
@@ -428,21 +319,6 @@ class AbstractDBMetastore(AbstractMetastore):
         """Cleanup temp tables."""
         self.id_generator.delete_uris(temp_table_names)
 
-    @classmethod
-    def _buckets_columns(cls) -> list["SchemaItem"]:
-        """Buckets (storages) table columns."""
-        return [
-            Column("id", Integer, primary_key=True, nullable=False),
-            Column("uri", Text, nullable=False),
-            Column("timestamp", DateTime(timezone=True)),
-            Column("expires", DateTime(timezone=True)),
-            Column("started_inserting_at", DateTime(timezone=True)),
-            Column("last_inserted_at", DateTime(timezone=True)),
-            Column("status", Integer, nullable=False),
-            Column("error_message", Text, nullable=False, default=""),
-            Column("error_stack", Text, nullable=False, default=""),
-        ]
-
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
@@ -543,58 +419,11 @@ class AbstractDBMetastore(AbstractMetastore):
                 ForeignKey(f"{cls.DATASET_VERSION_TABLE}.id"),
                 nullable=True,
             ),
-            # TODO remove when https://github.com/iterative/dvcx/issues/1121 is done
-            # If we unify datasets and bucket listing then both bucket fields won't
-            # be needed
-            Column(
-                "bucket_id",
-                Integer,
-                ForeignKey(f"{cls.STORAGE_TABLE}.id"),
-                nullable=True,
-            ),
-            Column("bucket_version", Text, nullable=True),
-        ]
-
-    @classmethod
-    def _storage_partial_columns(cls) -> list["SchemaItem"]:
-        """Storage partial table columns."""
-        return [
-            Column("path_str", Text, nullable=False),
-            # This is generated before insert and is not the SQLite rowid,
-            # so it is not the primary key.
-            Column("partial_id", Integer, nullable=False, index=True),
-            Column("timestamp", DateTime(timezone=True)),
-            Column("expires", DateTime(timezone=True)),
         ]
 
-    def _get_storage_partial_table(self, name: str) -> Table:
-        table = self.db.metadata.tables.get(name)
-        if table is None:
-            table = Table(
-                name,
-                self.db.metadata,
-                *self._storage_partial_columns(),
-            )
-        return table
-
     #
     # Query Tables
     #
-
-    def _partials_table(self, uri: StorageURI) -> Table:
-        return self._get_storage_partial_table(self._partials_table_name(uri))
-
-    @cached_property
-    def _storages(self) -> Table:
-        return Table(self.STORAGE_TABLE, self.db.metadata, *self._buckets_columns())
-
-    @cached_property
-    def _partials(self) -> Table:
-        assert (
-            self._current_partials_table_name
-        ), "Partials can only be used if uri/current_partials_table_name is set"
-        return self._get_storage_partial_table(self._current_partials_table_name)
-
     @cached_property
     def _datasets(self) -> Table:
         return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
@@ -618,32 +447,6 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Query Starters (These can be overridden by subclasses)
     #
-
-    @abstractmethod
-    def _storages_insert(self) -> "Insert": ...
-
-    def _storages_select(self, *columns) -> "Select":
-        if not columns:
-            return self._storages.select()
-        return select(*columns)
-
-    def _storages_update(self) -> "Update":
-        return self._storages.update()
-
-    def _storages_delete(self) -> "Delete":
-        return self._storages.delete()
-
-    @abstractmethod
-    def _partials_insert(self) -> "Insert": ...
-
-    def _partials_select(self, *columns) -> "Select":
-        if not columns:
-            return self._partials.select()
-        return select(*columns)
-
-    def _partials_update(self) -> "Update":
-        return self._partials.update()
-
     @abstractmethod
     def _datasets_insert(self) -> "Insert": ...
 
@@ -686,275 +489,6 @@ class AbstractDBMetastore(AbstractMetastore):
     def _datasets_dependencies_delete(self) -> "Delete":
         return self._datasets_dependencies.delete()
 
-    #
-    # Table Name Internal Functions
-    #
-
-    def _partials_table_name(self, uri: StorageURI) -> str:
-        sha = hashlib.sha256(uri.encode("utf-8")).hexdigest()[:12]
-        return f"{self.PARTIALS_TABLE_NAME_PREFIX}_{sha}"
-
-    @property
-    def _current_partials_table_name(self) -> Optional[str]:
-        if not self.uri:
-            return None
-        return self._partials_table_name(self.uri)
-
-    #
-    # Storages
-    #
-
-    def create_storage_if_not_registered(self, uri: StorageURI, conn=None) -> None:
-        """Saves new storage if it doesn't exist in database."""
-        query = self._storages_insert().values(
-            uri=uri,
-            status=StorageStatus.CREATED,
-            error_message="",
-            error_stack="",
-        )
-        if hasattr(query, "on_conflict_do_nothing"):
-            # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
-            # but generic SQL does not
-            query = query.on_conflict_do_nothing()
-        self.db.execute(query, conn=conn)
-
-    def register_storage_for_indexing(
-        self,
-        uri: StorageURI,
-        force_update: bool = True,
-        prefix: str = "",
-    ) -> tuple[Storage, bool, bool, Optional[int], Optional[str]]:
-        """
-        Prepares storage for indexing operation.
-        This method should be called before index operation is started
-        It returns:
-            - storage, prepared for indexing
-            - boolean saying if indexing is needed
-            - boolean saying if indexing is currently pending (running)
-            - partial id
-            - partial path
-        """
-        # This ensures that all calls to the DB are in a single transaction
-        # and commit is automatically called once this function returns
-        with self.db.transaction() as conn:
-            # Create storage if it doesn't exist
-            self.create_storage_if_not_registered(uri, conn=conn)
-            storage = self.get_storage(uri, conn=conn)
-
-            if storage.status == StorageStatus.PENDING:
-                return storage, False, True, None, None
-
-            if storage.is_expired or storage.status == StorageStatus.STALE:
-                storage = self.mark_storage_pending(storage, conn=conn)
-                return storage, True, False, None, None
-
-            if (
-                storage.status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE)
-                and not force_update
-            ):
-                partial_id, partial_path = self.get_valid_partial_id(
-                    uri, prefix, raise_exc=False
-                )
-                if partial_id is not None:
-                    return storage, False, False, partial_id, partial_path
-                return storage, True, False, None, None
-
-            storage = self.mark_storage_pending(storage, conn=conn)
-            return storage, True, False, None, None
-
-    def find_stale_storages(self) -> None:
-        """
-        Finds all pending storages for which the last inserted node has happened
-        before STALE_MINUTES_LIMIT minutes, and marks it as STALE.
-        """
-        s = self._storages
-        with self.db.transaction() as conn:
-            pending_storages = map(
-                self.storage_class._make,
-                self.db.execute(
-                    self._storages_select().where(s.c.status == StorageStatus.PENDING),
-                    conn=conn,
-                ),
-            )
-            for storage in pending_storages:
-                if storage.is_stale:
-                    print(f"Marking storage {storage.uri} as stale")
-                    self._mark_storage_stale(storage.id, conn=conn)
-
-    def mark_storage_indexed(
-        self,
-        uri: StorageURI,
-        status: int,
-        ttl: int,
-        end_time: Optional[datetime] = None,
-        prefix: str = "",
-        partial_id: int = 0,
-        error_message: str = "",
-        error_stack: str = "",
-        dataset: Optional[DatasetRecord] = None,
-    ) -> None:
-        """
-        Marks storage as indexed.
-        This method should be called when index operation is finished.
-        """
-        if status == StorageStatus.PARTIAL and not prefix:
-            raise AssertionError("Partial indexing requires a prefix")
-
-        if end_time is None:
-            end_time = datetime.now(timezone.utc)
-        expires = Storage.get_expiration_time(end_time, ttl)
-
-        s = self._storages
-        with self.db.transaction() as conn:
-            self.db.execute(
-                self._storages_update()
-                .where(s.c.uri == uri)
-                .values(  # type: ignore [attr-defined]
-                    timestamp=end_time,
-                    expires=expires,
-                    status=status,
-                    last_inserted_at=end_time,
-                    error_message=error_message,
-                    error_stack=error_stack,
-                ),
-                conn=conn,
-            )
-
-            if not self._current_partials_table_name:
-                # This only occurs in tests
-                return
-
-            if status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE):
-                dir_prefix = posixpath.join(prefix, "")
-                self.db.execute(
-                    self._partials_insert().values(
-                        path_str=dir_prefix,
-                        timestamp=end_time,
-                        expires=expires,
-                        partial_id=partial_id,
-                    ),
-                    conn=conn,
-                )
-
-            # update underlying dataset status as well
-            if status == StorageStatus.FAILED and dataset:
-                self.update_dataset_status(
-                    dataset,
-                    DatasetStatus.FAILED,
-                    dataset.latest_version,
-                    error_message=error_message,
-                    error_stack=error_stack,
-                    conn=conn,
-                )
-
-            if status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE) and dataset:
-                self.update_dataset_status(
-                    dataset, DatasetStatus.COMPLETE, dataset.latest_version, conn=conn
-                )
-
-    def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
-        """Updates last inserted datetime in bucket with current time"""
-        uri = uri or self.uri
-        updates = {"last_inserted_at": datetime.now(timezone.utc)}
-        s = self._storages
-        self.db.execute(
-            self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
-        )
-
-    def get_storage(self, uri: StorageURI, conn=None) -> Storage:
-        """
-        Gets storage representation from database.
-        E.g. if s3 is used as storage this would be s3 bucket data
-        """
-        s = self._storages
-        result = next(
-            self.db.execute(self._storages_select().where(s.c.uri == uri), conn=conn),
-            None,
-        )
-        if not result:
-            raise StorageNotFoundError(f"Storage {uri} not found.")
-
-        return self.storage_class._make(result)
-
-    def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
-        # Update status to pending and dates
-        updates = {
-            "status": StorageStatus.PENDING,
-            "timestamp": None,
-            "expires": None,
-            "last_inserted_at": None,
-            "started_inserting_at": datetime.now(timezone.utc),
-        }
-        storage = storage._replace(**updates)  # type: ignore [arg-type]
-        s = self._storages
-        self.db.execute(
-            self._storages_update().where(s.c.uri == storage.uri).values(**updates),  # type: ignore [attr-defined]
-            conn=conn,
-        )
-        return storage
-
-    def _mark_storage_stale(self, storage_id: int, conn=None) -> None:
-        # Update status to pending and dates
-        updates = {"status": StorageStatus.STALE, "timestamp": None, "expires": None}
-        s = self._storages
-        self.db.execute(
-            self._storages.update().where(s.c.id == storage_id).values(**updates),  # type: ignore [attr-defined]
-            conn=conn,
-        )
-
-    #
-    # Partial Indexes
-    #
-
-    def init_partial_id(self, uri: StorageURI) -> None:
-        """Initializes partial id for given storage."""
-        if not uri:
-            raise ValueError("uri for get_next_partial_id() cannot be empty")
-        self.id_generator.init_id(f"partials:{uri}")
-
-    def get_next_partial_id(self, uri: StorageURI) -> int:
-        """Returns next partial id for given storage."""
-        if not uri:
-            raise ValueError("uri for get_next_partial_id() cannot be empty")
-        return self.id_generator.get_next_id(f"partials:{uri}")
-
-    def get_valid_partial_id(
-        self, uri: StorageURI, prefix: str, raise_exc: bool = True
-    ) -> tuple[Optional[int], Optional[str]]:
-        """
-        Returns valid partial id and it's path, if they exist, for a given storage.
-        """
-        # This SQL statement finds all entries that are
-        # prefixes of the given prefix, matching this or parent directories
-        # that are indexed.
-        dir_prefix = posixpath.join(prefix, "")
-        p = self._partials_table(uri)
-        expire_values = self.db.execute(
-            select(p.c.expires, p.c.partial_id, p.c.path_str)
-            .where(
-                p.c.path_str == func.substr(dir_prefix, 1, func.length(p.c.path_str))
-            )
-            .order_by(p.c.expires.desc())
-        )
-        for expires, partial_id, path_str in expire_values:
-            if not is_expired(expires):
-                return partial_id, path_str
-        if raise_exc:
-            raise RuntimeError(f"Unable to get valid partial_id: {uri=}, {prefix=}")
-        return None, None
-
-    def get_last_partial_path(self, uri: StorageURI) -> Optional[str]:
-        """Returns last partial path for given storage."""
-        p = self._partials_table(uri)
-        if not self.db.has_table(p.name):
-            raise StorageNotFoundError(f"Storage {uri} partials are not found.")
-        last_partial = self.db.execute(
-            select(p.c.path_str).order_by(p.c.timestamp.desc()).limit(1)
-        )
-        for (path_str,) in last_partial:
-            return path_str
-        return None
-
     #
     # Datasets
     #
@@ -1298,7 +832,6 @@ class AbstractDBMetastore(AbstractMetastore):
         d = self._datasets
         dd = self._datasets_dependencies
         dv = self._datasets_versions
-        s = self._storages
 
         dataset_version = dataset.get_version(version)
 
@@ -1307,9 +840,9 @@ class AbstractDBMetastore(AbstractMetastore):
         query = (
             self._datasets_dependencies_select(*select_cols)
             .select_from(
-                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
-                .join(s, dd.c.bucket_id == s.c.id, isouter=True)
-                .join(dv, dd.c.dataset_version_id == dv.c.id, isouter=True)
+                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True).join(
+                    dv, dd.c.dataset_version_id == dv.c.id, isouter=True
+                )
             )
             .where(
                 (dd.c.source_dataset_id == dataset.id)
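
Taken together, these hunks narrow the metastore interface: the storage-registration and partial-index methods disappear, and both the constructor and clone() accept a single optional uri instead of a uri plus partial_id. The following is a minimal, self-contained sketch of the constructor change only; ToyMetastore and the StorageURI stand-in are illustrative and are not code from datachain.

# Sketch only: mirrors the signature change visible in the diff, where
# `uri: Optional[StorageURI] = None` replaces
# `uri: StorageURI = StorageURI(""), partial_id: Optional[int] = None`
# and the empty default moves into the body as `uri or StorageURI("")`.
from typing import NewType, Optional

StorageURI = NewType("StorageURI", str)  # stand-in for datachain.dataset.StorageURI


class ToyMetastore:
    def __init__(self, uri: Optional[StorageURI] = None):
        # 0.6.4 behaviour: a missing uri is normalized to an empty StorageURI
        self.uri = uri or StorageURI("")


print(ToyMetastore().uri)                        # -> "" (default)
print(ToyMetastore(StorageURI("s3://bkt")).uri)  # -> "s3://bkt"

Callers that previously passed partial_id to a metastore constructor or to clone() would need to drop that argument when moving from 0.6.2 to 0.6.4, since neither signature accepts it any longer.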