datachain 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as possibly problematic by the registry.
- datachain/catalog/catalog.py +3 -25
- datachain/cli.py +0 -8
- datachain/client/fsspec.py +10 -5
- datachain/client/local.py +7 -3
- datachain/data_storage/metastore.py +11 -478
- datachain/data_storage/sqlite.py +9 -41
- datachain/data_storage/warehouse.py +1 -2
- datachain/dataset.py +12 -10
- datachain/error.py +0 -4
- datachain/lib/arrow.py +2 -15
- datachain/lib/data_model.py +10 -2
- datachain/lib/utils.py +30 -0
- datachain/node.py +1 -1
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/METADATA +2 -2
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/RECORD +19 -20
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/WHEEL +1 -1
- datachain/storage.py +0 -136
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/LICENSE +0 -0
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/top_level.txt +0 -0
datachain/data_storage/metastore.py +11 -478

@@ -1,9 +1,7 @@
 import copy
-import hashlib
 import json
 import logging
 import os
-import posixpath
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
 from datetime import datetime, timezone
@@ -24,7 +22,6 @@ from sqlalchemy import (
     UniqueConstraint,
     select,
 )
-from sqlalchemy.sql import func
 
 from datachain.data_storage import JobQueryType, JobStatus
 from datachain.data_storage.serializer import Serializable
@@ -33,15 +30,14 @@ from datachain.dataset import (
     DatasetRecord,
     DatasetStatus,
     DatasetVersion,
+    StorageURI,
 )
 from datachain.error import (
     DatasetNotFoundError,
-    StorageNotFoundError,
     TableMissingError,
 )
 from datachain.job import Job
-from datachain.
-from datachain.utils import JSONSerialize, is_expired
+from datachain.utils import JSONSerialize
 
 if TYPE_CHECKING:
     from sqlalchemy import Delete, Insert, Select, Update
@@ -60,21 +56,17 @@ class AbstractMetastore(ABC, Serializable):
     """
 
     uri: StorageURI
-    partial_id: Optional[int]
 
     schema: "schema.Schema"
-    storage_class: type[Storage] = Storage
     dataset_class: type[DatasetRecord] = DatasetRecord
     dependency_class: type[DatasetDependency] = DatasetDependency
     job_class: type[Job] = Job
 
     def __init__(
         self,
-        uri: StorageURI =
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
     ):
-        self.uri = uri
-        self.partial_id: Optional[int] = partial_id
+        self.uri = uri or StorageURI("")
 
     def __enter__(self) -> "AbstractMetastore":
         """Returns self upon entering context manager."""
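The hunk above is the most visible API change in this file: `AbstractMetastore.__init__` no longer accepts `partial_id`, and `uri` becomes optional, falling back to an empty `StorageURI`. A minimal standalone sketch of that new contract (hypothetical class name, not the real datachain class; `StorageURI` is assumed to behave like a `str` NewType, matching its import from `datachain.dataset`):

```python
from typing import NewType, Optional

# Assumption: StorageURI behaves like a str NewType, as imported from datachain.dataset.
StorageURI = NewType("StorageURI", str)


class MetastoreSketch:
    """Sketch of the 0.6.4 constructor: no partial_id, uri is optional."""

    uri: StorageURI

    def __init__(self, uri: Optional[StorageURI] = None):
        # A missing uri falls back to the empty URI instead of being required.
        self.uri = uri or StorageURI("")


assert MetastoreSketch().uri == ""
assert MetastoreSketch(StorageURI("s3://bucket")).uri == "s3://bucket"
```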
@@ -86,8 +78,7 @@ class AbstractMetastore(ABC, Serializable):
     @abstractmethod
     def clone(
         self,
-        uri: StorageURI =
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
         use_new_connection: bool = False,
     ) -> "AbstractMetastore":
         """Clones AbstractMetastore implementation for some Storage input.
@@ -95,10 +86,6 @@ class AbstractMetastore(ABC, Serializable):
         New connections should only be used if needed due to errors with
         closed connections."""
 
-    @abstractmethod
-    def init(self, uri: StorageURI) -> None:
-        """Initialize partials table for given storage uri."""
-
     def close(self) -> None:
         """Closes any active database or HTTP connections."""
 
@@ -114,96 +101,6 @@ class AbstractMetastore(ABC, Serializable):
     def cleanup_for_tests(self) -> None:
         """Cleanup for tests."""
 
-    #
-    # Storages
-    #
-
-    @abstractmethod
-    def create_storage_if_not_registered(self, uri: StorageURI) -> None:
-        """Saves new storage if it doesn't exist in database."""
-
-    @abstractmethod
-    def register_storage_for_indexing(
-        self,
-        uri: StorageURI,
-        force_update: bool = True,
-        prefix: str = "",
-    ) -> tuple[Storage, bool, bool, Optional[int], Optional[str]]:
-        """
-        Prepares storage for indexing operation.
-        This method should be called before index operation is started
-        It returns:
-            - storage, prepared for indexing
-            - boolean saying if indexing is needed
-            - boolean saying if indexing is currently pending (running)
-            - partial id
-            - partial path
-        """
-
-    @abstractmethod
-    def find_stale_storages(self) -> None:
-        """
-        Finds all pending storages for which the last inserted node has happened
-        before STALE_MINUTES_LIMIT minutes, and marks it as STALE.
-        """
-
-    @abstractmethod
-    def mark_storage_indexed(
-        self,
-        uri: StorageURI,
-        status: int,
-        ttl: int,
-        end_time: Optional[datetime] = None,
-        prefix: str = "",
-        partial_id: int = 0,
-        error_message: str = "",
-        error_stack: str = "",
-        dataset: Optional[DatasetRecord] = None,
-    ) -> None:
-        """
-        Marks storage as indexed.
-        This method should be called when index operation is finished.
-        """
-
-    @abstractmethod
-    def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
-        """Updates last inserted datetime in bucket with current time."""
-
-    @abstractmethod
-    def get_storage(self, uri: StorageURI) -> Storage:
-        """
-        Gets storage representation from database.
-        E.g. if s3 is used as storage this would be s3 bucket data.
-        """
-
-    @abstractmethod
-    def mark_storage_pending(self, storage: Storage) -> Storage:
-        """Marks storage as pending."""
-
-    #
-    # Partial Indexes
-    #
-
-    @abstractmethod
-    def init_partial_id(self, uri: StorageURI) -> None:
-        """Initializes partial id for given storage."""
-
-    @abstractmethod
-    def get_next_partial_id(self, uri: StorageURI) -> int:
-        """Returns next partial id for given storage."""
-
-    @abstractmethod
-    def get_valid_partial_id(
-        self, uri: StorageURI, prefix: str, raise_exc: bool = True
-    ) -> tuple[Optional[int], Optional[str]]:
-        """
-        Returns valid partial id and it's path, if they exist, for a given storage.
-        """
-
-    @abstractmethod
-    def get_last_partial_path(self, uri: StorageURI) -> Optional[str]:
-        """Returns last partial path for given storage."""
-
     #
     # Datasets
     #
@@ -397,8 +294,6 @@ class AbstractDBMetastore(AbstractMetastore):
     and has shared logic for all database systems currently in use.
     """
 
-    PARTIALS_TABLE_NAME_PREFIX = "prt_"
-    STORAGE_TABLE = "buckets"
     DATASET_TABLE = "datasets"
     DATASET_VERSION_TABLE = "datasets_versions"
     DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
@@ -410,15 +305,11 @@ class AbstractDBMetastore(AbstractMetastore):
     def __init__(
         self,
         id_generator: "AbstractIDGenerator",
-        uri: StorageURI =
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
     ):
+        uri = uri or StorageURI("")
         self.id_generator = id_generator
-        super().__init__(uri
-
-    @abstractmethod
-    def init(self, uri: StorageURI) -> None:
-        """Initialize partials table for given storage uri."""
+        super().__init__(uri)
 
     def close(self) -> None:
         """Closes any active database connections."""
@@ -428,21 +319,6 @@ class AbstractDBMetastore(AbstractMetastore):
         """Cleanup temp tables."""
         self.id_generator.delete_uris(temp_table_names)
 
-    @classmethod
-    def _buckets_columns(cls) -> list["SchemaItem"]:
-        """Buckets (storages) table columns."""
-        return [
-            Column("id", Integer, primary_key=True, nullable=False),
-            Column("uri", Text, nullable=False),
-            Column("timestamp", DateTime(timezone=True)),
-            Column("expires", DateTime(timezone=True)),
-            Column("started_inserting_at", DateTime(timezone=True)),
-            Column("last_inserted_at", DateTime(timezone=True)),
-            Column("status", Integer, nullable=False),
-            Column("error_message", Text, nullable=False, default=""),
-            Column("error_stack", Text, nullable=False, default=""),
-        ]
-
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
@@ -543,58 +419,11 @@ class AbstractDBMetastore(AbstractMetastore):
                 ForeignKey(f"{cls.DATASET_VERSION_TABLE}.id"),
                 nullable=True,
             ),
-            # TODO remove when https://github.com/iterative/dvcx/issues/1121 is done
-            # If we unify datasets and bucket listing then both bucket fields won't
-            # be needed
-            Column(
-                "bucket_id",
-                Integer,
-                ForeignKey(f"{cls.STORAGE_TABLE}.id"),
-                nullable=True,
-            ),
-            Column("bucket_version", Text, nullable=True),
-        ]
-
-    @classmethod
-    def _storage_partial_columns(cls) -> list["SchemaItem"]:
-        """Storage partial table columns."""
-        return [
-            Column("path_str", Text, nullable=False),
-            # This is generated before insert and is not the SQLite rowid,
-            # so it is not the primary key.
-            Column("partial_id", Integer, nullable=False, index=True),
-            Column("timestamp", DateTime(timezone=True)),
-            Column("expires", DateTime(timezone=True)),
         ]
 
-    def _get_storage_partial_table(self, name: str) -> Table:
-        table = self.db.metadata.tables.get(name)
-        if table is None:
-            table = Table(
-                name,
-                self.db.metadata,
-                *self._storage_partial_columns(),
-            )
-        return table
-
     #
     # Query Tables
     #
-
-    def _partials_table(self, uri: StorageURI) -> Table:
-        return self._get_storage_partial_table(self._partials_table_name(uri))
-
-    @cached_property
-    def _storages(self) -> Table:
-        return Table(self.STORAGE_TABLE, self.db.metadata, *self._buckets_columns())
-
-    @cached_property
-    def _partials(self) -> Table:
-        assert (
-            self._current_partials_table_name
-        ), "Partials can only be used if uri/current_partials_table_name is set"
-        return self._get_storage_partial_table(self._current_partials_table_name)
-
     @cached_property
     def _datasets(self) -> Table:
         return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
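The removed `_get_storage_partial_table` helper above followed a get-or-create pattern over the shared SQLAlchemy `MetaData`, the same idea the surviving dataset tables use via `cached_property`. A generic, self-contained sketch of that pattern (hypothetical table and column names, not datachain's own API):

```python
import sqlalchemy as sa

metadata = sa.MetaData()


def get_or_create_table(name: str, *columns: sa.Column) -> sa.Table:
    # Reuse the Table if it was already defined on this MetaData,
    # otherwise define it once with the given columns.
    table = metadata.tables.get(name)
    if table is None:
        table = sa.Table(name, metadata, *columns)
    return table


t1 = get_or_create_table("prt_example", sa.Column("path_str", sa.Text, nullable=False))
t2 = get_or_create_table("prt_example")
assert t1 is t2  # the second lookup returns the cached definition
```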
@@ -618,32 +447,6 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Query Starters (These can be overridden by subclasses)
     #
-
-    @abstractmethod
-    def _storages_insert(self) -> "Insert": ...
-
-    def _storages_select(self, *columns) -> "Select":
-        if not columns:
-            return self._storages.select()
-        return select(*columns)
-
-    def _storages_update(self) -> "Update":
-        return self._storages.update()
-
-    def _storages_delete(self) -> "Delete":
-        return self._storages.delete()
-
-    @abstractmethod
-    def _partials_insert(self) -> "Insert": ...
-
-    def _partials_select(self, *columns) -> "Select":
-        if not columns:
-            return self._partials.select()
-        return select(*columns)
-
-    def _partials_update(self) -> "Update":
-        return self._partials.update()
-
     @abstractmethod
     def _datasets_insert(self) -> "Insert": ...
 
@@ -686,275 +489,6 @@ class AbstractDBMetastore(AbstractMetastore):
     def _datasets_dependencies_delete(self) -> "Delete":
         return self._datasets_dependencies.delete()
 
-    #
-    # Table Name Internal Functions
-    #
-
-    def _partials_table_name(self, uri: StorageURI) -> str:
-        sha = hashlib.sha256(uri.encode("utf-8")).hexdigest()[:12]
-        return f"{self.PARTIALS_TABLE_NAME_PREFIX}_{sha}"
-
-    @property
-    def _current_partials_table_name(self) -> Optional[str]:
-        if not self.uri:
-            return None
-        return self._partials_table_name(self.uri)
-
-    #
-    # Storages
-    #
-
-    def create_storage_if_not_registered(self, uri: StorageURI, conn=None) -> None:
-        """Saves new storage if it doesn't exist in database."""
-        query = self._storages_insert().values(
-            uri=uri,
-            status=StorageStatus.CREATED,
-            error_message="",
-            error_stack="",
-        )
-        if hasattr(query, "on_conflict_do_nothing"):
-            # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
-            # but generic SQL does not
-            query = query.on_conflict_do_nothing()
-        self.db.execute(query, conn=conn)
-
-    def register_storage_for_indexing(
-        self,
-        uri: StorageURI,
-        force_update: bool = True,
-        prefix: str = "",
-    ) -> tuple[Storage, bool, bool, Optional[int], Optional[str]]:
-        """
-        Prepares storage for indexing operation.
-        This method should be called before index operation is started
-        It returns:
-            - storage, prepared for indexing
-            - boolean saying if indexing is needed
-            - boolean saying if indexing is currently pending (running)
-            - partial id
-            - partial path
-        """
-        # This ensures that all calls to the DB are in a single transaction
-        # and commit is automatically called once this function returns
-        with self.db.transaction() as conn:
-            # Create storage if it doesn't exist
-            self.create_storage_if_not_registered(uri, conn=conn)
-            storage = self.get_storage(uri, conn=conn)
-
-            if storage.status == StorageStatus.PENDING:
-                return storage, False, True, None, None
-
-            if storage.is_expired or storage.status == StorageStatus.STALE:
-                storage = self.mark_storage_pending(storage, conn=conn)
-                return storage, True, False, None, None
-
-            if (
-                storage.status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE)
-                and not force_update
-            ):
-                partial_id, partial_path = self.get_valid_partial_id(
-                    uri, prefix, raise_exc=False
-                )
-                if partial_id is not None:
-                    return storage, False, False, partial_id, partial_path
-                return storage, True, False, None, None
-
-            storage = self.mark_storage_pending(storage, conn=conn)
-            return storage, True, False, None, None
-
-    def find_stale_storages(self) -> None:
-        """
-        Finds all pending storages for which the last inserted node has happened
-        before STALE_MINUTES_LIMIT minutes, and marks it as STALE.
-        """
-        s = self._storages
-        with self.db.transaction() as conn:
-            pending_storages = map(
-                self.storage_class._make,
-                self.db.execute(
-                    self._storages_select().where(s.c.status == StorageStatus.PENDING),
-                    conn=conn,
-                ),
-            )
-            for storage in pending_storages:
-                if storage.is_stale:
-                    print(f"Marking storage {storage.uri} as stale")
-                    self._mark_storage_stale(storage.id, conn=conn)
-
-    def mark_storage_indexed(
-        self,
-        uri: StorageURI,
-        status: int,
-        ttl: int,
-        end_time: Optional[datetime] = None,
-        prefix: str = "",
-        partial_id: int = 0,
-        error_message: str = "",
-        error_stack: str = "",
-        dataset: Optional[DatasetRecord] = None,
-    ) -> None:
-        """
-        Marks storage as indexed.
-        This method should be called when index operation is finished.
-        """
-        if status == StorageStatus.PARTIAL and not prefix:
-            raise AssertionError("Partial indexing requires a prefix")
-
-        if end_time is None:
-            end_time = datetime.now(timezone.utc)
-        expires = Storage.get_expiration_time(end_time, ttl)
-
-        s = self._storages
-        with self.db.transaction() as conn:
-            self.db.execute(
-                self._storages_update()
-                .where(s.c.uri == uri)
-                .values(  # type: ignore [attr-defined]
-                    timestamp=end_time,
-                    expires=expires,
-                    status=status,
-                    last_inserted_at=end_time,
-                    error_message=error_message,
-                    error_stack=error_stack,
-                ),
-                conn=conn,
-            )
-
-            if not self._current_partials_table_name:
-                # This only occurs in tests
-                return
-
-            if status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE):
-                dir_prefix = posixpath.join(prefix, "")
-                self.db.execute(
-                    self._partials_insert().values(
-                        path_str=dir_prefix,
-                        timestamp=end_time,
-                        expires=expires,
-                        partial_id=partial_id,
-                    ),
-                    conn=conn,
-                )
-
-            # update underlying dataset status as well
-            if status == StorageStatus.FAILED and dataset:
-                self.update_dataset_status(
-                    dataset,
-                    DatasetStatus.FAILED,
-                    dataset.latest_version,
-                    error_message=error_message,
-                    error_stack=error_stack,
-                    conn=conn,
-                )
-
-            if status in (StorageStatus.PARTIAL, StorageStatus.COMPLETE) and dataset:
-                self.update_dataset_status(
-                    dataset, DatasetStatus.COMPLETE, dataset.latest_version, conn=conn
-                )
-
-    def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
-        """Updates last inserted datetime in bucket with current time"""
-        uri = uri or self.uri
-        updates = {"last_inserted_at": datetime.now(timezone.utc)}
-        s = self._storages
-        self.db.execute(
-            self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
-        )
-
-    def get_storage(self, uri: StorageURI, conn=None) -> Storage:
-        """
-        Gets storage representation from database.
-        E.g. if s3 is used as storage this would be s3 bucket data
-        """
-        s = self._storages
-        result = next(
-            self.db.execute(self._storages_select().where(s.c.uri == uri), conn=conn),
-            None,
-        )
-        if not result:
-            raise StorageNotFoundError(f"Storage {uri} not found.")
-
-        return self.storage_class._make(result)
-
-    def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
-        # Update status to pending and dates
-        updates = {
-            "status": StorageStatus.PENDING,
-            "timestamp": None,
-            "expires": None,
-            "last_inserted_at": None,
-            "started_inserting_at": datetime.now(timezone.utc),
-        }
-        storage = storage._replace(**updates)  # type: ignore [arg-type]
-        s = self._storages
-        self.db.execute(
-            self._storages_update().where(s.c.uri == storage.uri).values(**updates),  # type: ignore [attr-defined]
-            conn=conn,
-        )
-        return storage
-
-    def _mark_storage_stale(self, storage_id: int, conn=None) -> None:
-        # Update status to pending and dates
-        updates = {"status": StorageStatus.STALE, "timestamp": None, "expires": None}
-        s = self._storages
-        self.db.execute(
-            self._storages.update().where(s.c.id == storage_id).values(**updates),  # type: ignore [attr-defined]
-            conn=conn,
-        )
-
-    #
-    # Partial Indexes
-    #
-
-    def init_partial_id(self, uri: StorageURI) -> None:
-        """Initializes partial id for given storage."""
-        if not uri:
-            raise ValueError("uri for get_next_partial_id() cannot be empty")
-        self.id_generator.init_id(f"partials:{uri}")
-
-    def get_next_partial_id(self, uri: StorageURI) -> int:
-        """Returns next partial id for given storage."""
-        if not uri:
-            raise ValueError("uri for get_next_partial_id() cannot be empty")
-        return self.id_generator.get_next_id(f"partials:{uri}")
-
-    def get_valid_partial_id(
-        self, uri: StorageURI, prefix: str, raise_exc: bool = True
-    ) -> tuple[Optional[int], Optional[str]]:
-        """
-        Returns valid partial id and it's path, if they exist, for a given storage.
-        """
-        # This SQL statement finds all entries that are
-        # prefixes of the given prefix, matching this or parent directories
-        # that are indexed.
-        dir_prefix = posixpath.join(prefix, "")
-        p = self._partials_table(uri)
-        expire_values = self.db.execute(
-            select(p.c.expires, p.c.partial_id, p.c.path_str)
-            .where(
-                p.c.path_str == func.substr(dir_prefix, 1, func.length(p.c.path_str))
-            )
-            .order_by(p.c.expires.desc())
-        )
-        for expires, partial_id, path_str in expire_values:
-            if not is_expired(expires):
-                return partial_id, path_str
-        if raise_exc:
-            raise RuntimeError(f"Unable to get valid partial_id: {uri=}, {prefix=}")
-        return None, None
-
-    def get_last_partial_path(self, uri: StorageURI) -> Optional[str]:
-        """Returns last partial path for given storage."""
-        p = self._partials_table(uri)
-        if not self.db.has_table(p.name):
-            raise StorageNotFoundError(f"Storage {uri} partials are not found.")
-        last_partial = self.db.execute(
-            select(p.c.path_str).order_by(p.c.timestamp.desc()).limit(1)
-        )
-        for (path_str,) in last_partial:
-            return path_str
-        return None
-
     #
     # Datasets
     #
@@ -1298,7 +832,6 @@ class AbstractDBMetastore(AbstractMetastore):
         d = self._datasets
         dd = self._datasets_dependencies
         dv = self._datasets_versions
-        s = self._storages
 
         dataset_version = dataset.get_version(version)
 
@@ -1307,9 +840,9 @@ class AbstractDBMetastore(AbstractMetastore):
         query = (
             self._datasets_dependencies_select(*select_cols)
             .select_from(
-                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
-
-
+                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True).join(
+                    dv, dd.c.dataset_version_id == dv.c.id, isouter=True
+                )
             )
             .where(
                 (dd.c.source_dataset_id == dataset.id)