datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +213 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +35 -26
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +433 -37
- datachain/data_storage/sqlite.py +140 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +128 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +253 -91
- datachain/lib/dc/datasets.py +103 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +2 -1
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/signal_schema.py +8 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +23 -6
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
|
@@ -37,9 +37,13 @@ from datachain.dataset import (
|
|
|
37
37
|
from datachain.error import (
|
|
38
38
|
DatasetNotFoundError,
|
|
39
39
|
DatasetVersionNotFoundError,
|
|
40
|
+
NamespaceNotFoundError,
|
|
41
|
+
ProjectNotFoundError,
|
|
40
42
|
TableMissingError,
|
|
41
43
|
)
|
|
42
44
|
from datachain.job import Job
|
|
45
|
+
from datachain.namespace import Namespace
|
|
46
|
+
from datachain.project import Project
|
|
43
47
|
from datachain.utils import JSONSerialize
|
|
44
48
|
|
|
45
49
|
if TYPE_CHECKING:
|
|
@@ -61,6 +65,8 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
61
65
|
uri: StorageURI
|
|
62
66
|
|
|
63
67
|
schema: "schema.Schema"
|
|
68
|
+
namespace_class: type[Namespace] = Namespace
|
|
69
|
+
project_class: type[Project] = Project
|
|
64
70
|
dataset_class: type[DatasetRecord] = DatasetRecord
|
|
65
71
|
dataset_list_class: type[DatasetListRecord] = DatasetListRecord
|
|
66
72
|
dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
|
|
@@ -107,13 +113,116 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
107
113
|
"""Cleanup for tests."""
|
|
108
114
|
|
|
109
115
|
#
|
|
110
|
-
#
|
|
116
|
+
# Namespaces
|
|
117
|
+
#
|
|
118
|
+
|
|
119
|
+
@property
|
|
120
|
+
@abstractmethod
|
|
121
|
+
def default_namespace_name(self):
|
|
122
|
+
"""Gets default namespace name"""
|
|
123
|
+
|
|
124
|
+
@property
|
|
125
|
+
def system_namespace_name(self):
|
|
126
|
+
return Namespace.system()
|
|
127
|
+
|
|
128
|
+
@abstractmethod
|
|
129
|
+
def create_namespace(
|
|
130
|
+
self,
|
|
131
|
+
name: str,
|
|
132
|
+
description: Optional[str] = None,
|
|
133
|
+
uuid: Optional[str] = None,
|
|
134
|
+
ignore_if_exists: bool = True,
|
|
135
|
+
validate: bool = True,
|
|
136
|
+
**kwargs,
|
|
137
|
+
) -> Namespace:
|
|
138
|
+
"""Creates new namespace"""
|
|
139
|
+
|
|
140
|
+
@abstractmethod
|
|
141
|
+
def get_namespace(self, name: str, conn=None) -> Namespace:
|
|
142
|
+
"""Gets a single namespace by name"""
|
|
143
|
+
|
|
144
|
+
@abstractmethod
|
|
145
|
+
def list_namespaces(self, conn=None) -> list[Namespace]:
|
|
146
|
+
"""Gets a list of all namespaces"""
|
|
147
|
+
|
|
148
|
+
@property
|
|
149
|
+
@abstractmethod
|
|
150
|
+
def is_studio(self) -> bool:
|
|
151
|
+
"""Returns True if this code is ran in Studio"""
|
|
152
|
+
|
|
153
|
+
def is_local_dataset(self, dataset_namespace: str) -> bool:
|
|
154
|
+
"""
|
|
155
|
+
Returns True if this is a local dataset, i.e. not pulled from Studio but
|
|
156
|
+
created locally. This is False if we run code in CLI mode but use dataset
|
|
157
|
+
names that are present in Studio.
|
|
158
|
+
"""
|
|
159
|
+
return self.is_studio or dataset_namespace == Namespace.default()
|
|
160
|
+
|
|
161
|
+
@property
|
|
162
|
+
def namespace_allowed_to_create(self):
|
|
163
|
+
return self.is_studio
|
|
164
|
+
|
|
165
|
+
#
|
|
166
|
+
# Projects
|
|
111
167
|
#
|
|
112
168
|
|
|
169
|
+
@property
|
|
170
|
+
@abstractmethod
|
|
171
|
+
def default_project_name(self):
|
|
172
|
+
"""Gets default project name"""
|
|
173
|
+
|
|
174
|
+
@property
|
|
175
|
+
def listing_project_name(self):
|
|
176
|
+
return Project.listing()
|
|
177
|
+
|
|
178
|
+
@cached_property
|
|
179
|
+
def default_project(self) -> Project:
|
|
180
|
+
return self.get_project(
|
|
181
|
+
self.default_project_name, self.default_namespace_name, create=True
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
@cached_property
|
|
185
|
+
def listing_project(self) -> Project:
|
|
186
|
+
return self.get_project(self.listing_project_name, self.system_namespace_name)
|
|
187
|
+
|
|
188
|
+
@abstractmethod
|
|
189
|
+
def create_project(
|
|
190
|
+
self,
|
|
191
|
+
namespace_name: str,
|
|
192
|
+
name: str,
|
|
193
|
+
description: Optional[str] = None,
|
|
194
|
+
uuid: Optional[str] = None,
|
|
195
|
+
ignore_if_exists: bool = True,
|
|
196
|
+
validate: bool = True,
|
|
197
|
+
**kwargs,
|
|
198
|
+
) -> Project:
|
|
199
|
+
"""Creates new project in specific namespace"""
|
|
200
|
+
|
|
201
|
+
@abstractmethod
|
|
202
|
+
def get_project(
|
|
203
|
+
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
204
|
+
) -> Project:
|
|
205
|
+
"""
|
|
206
|
+
Gets a single project inside some namespace by name.
|
|
207
|
+
It also creates project if not found and create flag is set to True.
|
|
208
|
+
"""
|
|
209
|
+
|
|
210
|
+
@abstractmethod
|
|
211
|
+
def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
|
|
212
|
+
"""Gets list of projects in some namespace or in general (in all namespaces)"""
|
|
213
|
+
|
|
214
|
+
@property
|
|
215
|
+
def project_allowed_to_create(self):
|
|
216
|
+
return self.is_studio
|
|
217
|
+
|
|
218
|
+
#
|
|
219
|
+
# Datasets
|
|
220
|
+
#
|
|
113
221
|
@abstractmethod
|
|
114
222
|
def create_dataset(
|
|
115
223
|
self,
|
|
116
224
|
name: str,
|
|
225
|
+
project_id: Optional[int] = None,
|
|
117
226
|
status: int = DatasetStatus.CREATED,
|
|
118
227
|
sources: Optional[list[str]] = None,
|
|
119
228
|
feature_schema: Optional[dict] = None,
|
|
@@ -173,15 +282,22 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
173
282
|
"""
|
|
174
283
|
|
|
175
284
|
@abstractmethod
|
|
176
|
-
def list_datasets(
|
|
177
|
-
|
|
285
|
+
def list_datasets(
|
|
286
|
+
self, project_id: Optional[int] = None
|
|
287
|
+
) -> Iterator[DatasetListRecord]:
|
|
288
|
+
"""Lists all datasets in some project or in all projects."""
|
|
178
289
|
|
|
179
290
|
@abstractmethod
|
|
180
|
-
def list_datasets_by_prefix(
|
|
181
|
-
|
|
291
|
+
def list_datasets_by_prefix(
|
|
292
|
+
self, prefix: str, project_id: Optional[int] = None
|
|
293
|
+
) -> Iterator["DatasetListRecord"]:
|
|
294
|
+
"""
|
|
295
|
+
Lists all datasets whose names start with prefix in some project or in all
|
|
296
|
+
projects.
|
|
297
|
+
"""
|
|
182
298
|
|
|
183
299
|
@abstractmethod
|
|
184
|
-
def get_dataset(self, name: str) -> DatasetRecord:
|
|
300
|
+
def get_dataset(self, name: str, project_id: Optional[int] = None) -> DatasetRecord:
|
|
185
301
|
"""Gets a single dataset by name."""
|
|
186
302
|
|
|
187
303
|
@abstractmethod
|
|
@@ -202,10 +318,10 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
202
318
|
@abstractmethod
|
|
203
319
|
def add_dataset_dependency(
|
|
204
320
|
self,
|
|
205
|
-
|
|
321
|
+
source_dataset: "DatasetRecord",
|
|
206
322
|
source_dataset_version: str,
|
|
207
|
-
|
|
208
|
-
|
|
323
|
+
dep_dataset: "DatasetRecord",
|
|
324
|
+
dep_dataset_version: str,
|
|
209
325
|
) -> None:
|
|
210
326
|
"""Adds dataset dependency to dataset."""
|
|
211
327
|
|
|
@@ -304,6 +420,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
304
420
|
and has shared logic for all database systems currently in use.
|
|
305
421
|
"""
|
|
306
422
|
|
|
423
|
+
NAMESPACE_TABLE = "namespaces"
|
|
424
|
+
PROJECT_TABLE = "projects"
|
|
307
425
|
DATASET_TABLE = "datasets"
|
|
308
426
|
DATASET_VERSION_TABLE = "datasets_versions"
|
|
309
427
|
DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
|
|
@@ -322,11 +440,62 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
322
440
|
def cleanup_tables(self, temp_table_names: list[str]) -> None:
|
|
323
441
|
"""Cleanup temp tables."""
|
|
324
442
|
|
|
443
|
+
@classmethod
|
|
444
|
+
def _namespaces_columns(cls) -> list["SchemaItem"]:
|
|
445
|
+
"""Namespace table columns."""
|
|
446
|
+
return [
|
|
447
|
+
Column("id", Integer, primary_key=True),
|
|
448
|
+
Column("uuid", Text, nullable=False, default=uuid4()),
|
|
449
|
+
Column("name", Text, nullable=False),
|
|
450
|
+
Column("description", Text),
|
|
451
|
+
Column("created_at", DateTime(timezone=True)),
|
|
452
|
+
]
|
|
453
|
+
|
|
454
|
+
@cached_property
|
|
455
|
+
def _namespaces_fields(self) -> list[str]:
|
|
456
|
+
return [
|
|
457
|
+
c.name # type: ignore [attr-defined]
|
|
458
|
+
for c in self._namespaces_columns()
|
|
459
|
+
if c.name # type: ignore [attr-defined]
|
|
460
|
+
]
|
|
461
|
+
|
|
462
|
+
@classmethod
|
|
463
|
+
def _projects_columns(cls) -> list["SchemaItem"]:
|
|
464
|
+
"""Project table columns."""
|
|
465
|
+
return [
|
|
466
|
+
Column("id", Integer, primary_key=True),
|
|
467
|
+
Column("uuid", Text, nullable=False, default=uuid4()),
|
|
468
|
+
Column("name", Text, nullable=False),
|
|
469
|
+
Column("description", Text),
|
|
470
|
+
Column("created_at", DateTime(timezone=True)),
|
|
471
|
+
Column(
|
|
472
|
+
"namespace_id",
|
|
473
|
+
Integer,
|
|
474
|
+
ForeignKey(f"{cls.NAMESPACE_TABLE}.id", ondelete="CASCADE"),
|
|
475
|
+
nullable=False,
|
|
476
|
+
),
|
|
477
|
+
UniqueConstraint("namespace_id", "name"),
|
|
478
|
+
]
|
|
479
|
+
|
|
480
|
+
@cached_property
|
|
481
|
+
def _projects_fields(self) -> list[str]:
|
|
482
|
+
return [
|
|
483
|
+
c.name # type: ignore [attr-defined]
|
|
484
|
+
for c in self._projects_columns()
|
|
485
|
+
if c.name # type: ignore [attr-defined]
|
|
486
|
+
]
|
|
487
|
+
|
|
325
488
|
@classmethod
|
|
326
489
|
def _datasets_columns(cls) -> list["SchemaItem"]:
|
|
327
490
|
"""Datasets table columns."""
|
|
328
491
|
return [
|
|
329
492
|
Column("id", Integer, primary_key=True),
|
|
493
|
+
Column(
|
|
494
|
+
"project_id",
|
|
495
|
+
Integer,
|
|
496
|
+
ForeignKey(f"{cls.PROJECT_TABLE}.id", ondelete="CASCADE"),
|
|
497
|
+
nullable=False,
|
|
498
|
+
),
|
|
330
499
|
Column("name", Text, nullable=False),
|
|
331
500
|
Column("description", Text),
|
|
332
501
|
Column("attrs", JSON, nullable=True),
|
|
@@ -445,6 +614,16 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
445
614
|
#
|
|
446
615
|
# Query Tables
|
|
447
616
|
#
|
|
617
|
+
@cached_property
|
|
618
|
+
def _namespaces(self) -> Table:
|
|
619
|
+
return Table(
|
|
620
|
+
self.NAMESPACE_TABLE, self.db.metadata, *self._namespaces_columns()
|
|
621
|
+
)
|
|
622
|
+
|
|
623
|
+
@cached_property
|
|
624
|
+
def _projects(self) -> Table:
|
|
625
|
+
return Table(self.PROJECT_TABLE, self.db.metadata, *self._projects_columns())
|
|
626
|
+
|
|
448
627
|
@cached_property
|
|
449
628
|
def _datasets(self) -> Table:
|
|
450
629
|
return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
|
|
@@ -468,6 +647,34 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
468
647
|
#
|
|
469
648
|
# Query Starters (These can be overridden by subclasses)
|
|
470
649
|
#
|
|
650
|
+
@abstractmethod
|
|
651
|
+
def _namespaces_insert(self) -> "Insert": ...
|
|
652
|
+
|
|
653
|
+
def _namespaces_select(self, *columns) -> "Select":
|
|
654
|
+
if not columns:
|
|
655
|
+
return self._namespaces.select()
|
|
656
|
+
return select(*columns)
|
|
657
|
+
|
|
658
|
+
def _namespaces_update(self) -> "Update":
|
|
659
|
+
return self._namespaces.update()
|
|
660
|
+
|
|
661
|
+
def _namespaces_delete(self) -> "Delete":
|
|
662
|
+
return self._namespaces.delete()
|
|
663
|
+
|
|
664
|
+
@abstractmethod
|
|
665
|
+
def _projects_insert(self) -> "Insert": ...
|
|
666
|
+
|
|
667
|
+
def _projects_select(self, *columns) -> "Select":
|
|
668
|
+
if not columns:
|
|
669
|
+
return self._projects.select()
|
|
670
|
+
return select(*columns)
|
|
671
|
+
|
|
672
|
+
def _projects_update(self) -> "Update":
|
|
673
|
+
return self._projects.update()
|
|
674
|
+
|
|
675
|
+
def _projects_delete(self) -> "Delete":
|
|
676
|
+
return self._projects.delete()
|
|
677
|
+
|
|
471
678
|
@abstractmethod
|
|
472
679
|
def _datasets_insert(self) -> "Insert": ...
|
|
473
680
|
|
|
@@ -510,6 +717,160 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
510
717
|
def _datasets_dependencies_delete(self) -> "Delete":
|
|
511
718
|
return self._datasets_dependencies.delete()
|
|
512
719
|
|
|
720
|
+
#
|
|
721
|
+
# Namespaces
|
|
722
|
+
#
|
|
723
|
+
|
|
724
|
+
def create_namespace(
|
|
725
|
+
self,
|
|
726
|
+
name: str,
|
|
727
|
+
description: Optional[str] = None,
|
|
728
|
+
uuid: Optional[str] = None,
|
|
729
|
+
ignore_if_exists: bool = True,
|
|
730
|
+
validate: bool = True,
|
|
731
|
+
**kwargs,
|
|
732
|
+
) -> Namespace:
|
|
733
|
+
if validate:
|
|
734
|
+
Namespace.validate_name(name)
|
|
735
|
+
query = self._namespaces_insert().values(
|
|
736
|
+
name=name,
|
|
737
|
+
uuid=uuid or str(uuid4()),
|
|
738
|
+
created_at=datetime.now(timezone.utc),
|
|
739
|
+
description=description,
|
|
740
|
+
)
|
|
741
|
+
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
742
|
+
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
743
|
+
# but generic SQL does not
|
|
744
|
+
query = query.on_conflict_do_nothing(index_elements=["name"])
|
|
745
|
+
self.db.execute(query)
|
|
746
|
+
|
|
747
|
+
return self.get_namespace(name)
|
|
748
|
+
|
|
749
|
+
def get_namespace(self, name: str, conn=None) -> Namespace:
|
|
750
|
+
"""Gets a single namespace by name"""
|
|
751
|
+
n = self._namespaces
|
|
752
|
+
|
|
753
|
+
query = self._namespaces_select(
|
|
754
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
755
|
+
).where(n.c.name == name)
|
|
756
|
+
rows = list(self.db.execute(query, conn=conn))
|
|
757
|
+
if not rows:
|
|
758
|
+
raise NamespaceNotFoundError(f"Namespace {name} not found.")
|
|
759
|
+
return self.namespace_class.parse(*rows[0])
|
|
760
|
+
|
|
761
|
+
def list_namespaces(self, conn=None) -> list[Namespace]:
|
|
762
|
+
"""Gets a list of all namespaces"""
|
|
763
|
+
n = self._namespaces
|
|
764
|
+
|
|
765
|
+
query = self._namespaces_select(
|
|
766
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
767
|
+
)
|
|
768
|
+
rows = list(self.db.execute(query, conn=conn))
|
|
769
|
+
|
|
770
|
+
return [self.namespace_class.parse(*r) for r in rows]
|
|
771
|
+
|
|
772
|
+
#
|
|
773
|
+
# Projects
|
|
774
|
+
#
|
|
775
|
+
|
|
776
|
+
def create_project(
|
|
777
|
+
self,
|
|
778
|
+
namespace_name: str,
|
|
779
|
+
name: str,
|
|
780
|
+
description: Optional[str] = None,
|
|
781
|
+
uuid: Optional[str] = None,
|
|
782
|
+
ignore_if_exists: bool = True,
|
|
783
|
+
validate: bool = True,
|
|
784
|
+
**kwargs,
|
|
785
|
+
) -> Project:
|
|
786
|
+
if validate:
|
|
787
|
+
Project.validate_name(name)
|
|
788
|
+
try:
|
|
789
|
+
namespace = self.get_namespace(namespace_name)
|
|
790
|
+
except NamespaceNotFoundError:
|
|
791
|
+
namespace = self.create_namespace(namespace_name, validate=validate)
|
|
792
|
+
|
|
793
|
+
query = self._projects_insert().values(
|
|
794
|
+
namespace_id=namespace.id,
|
|
795
|
+
uuid=uuid or str(uuid4()),
|
|
796
|
+
name=name,
|
|
797
|
+
created_at=datetime.now(timezone.utc),
|
|
798
|
+
description=description,
|
|
799
|
+
)
|
|
800
|
+
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
801
|
+
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
802
|
+
# but generic SQL does not
|
|
803
|
+
query = query.on_conflict_do_nothing(
|
|
804
|
+
index_elements=["namespace_id", "name"]
|
|
805
|
+
)
|
|
806
|
+
self.db.execute(query)
|
|
807
|
+
|
|
808
|
+
return self.get_project(name, namespace.name)
|
|
809
|
+
|
|
810
|
+
def _is_listing_project(self, project_name: str, namespace_name: str) -> bool:
|
|
811
|
+
return (
|
|
812
|
+
project_name == self.listing_project_name
|
|
813
|
+
and namespace_name == self.system_namespace_name
|
|
814
|
+
)
|
|
815
|
+
|
|
816
|
+
def _is_default_project(self, project_name: str, namespace_name: str) -> bool:
|
|
817
|
+
return (
|
|
818
|
+
project_name == self.default_project_name
|
|
819
|
+
and namespace_name == self.default_namespace_name
|
|
820
|
+
)
|
|
821
|
+
|
|
822
|
+
def get_project(
|
|
823
|
+
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
824
|
+
) -> Project:
|
|
825
|
+
"""Gets a single project inside some namespace by name"""
|
|
826
|
+
n = self._namespaces
|
|
827
|
+
p = self._projects
|
|
828
|
+
validate = True
|
|
829
|
+
|
|
830
|
+
if self._is_listing_project(name, namespace_name) or self._is_default_project(
|
|
831
|
+
name, namespace_name
|
|
832
|
+
):
|
|
833
|
+
# we are always creating default and listing projects if they don't exist
|
|
834
|
+
create = True
|
|
835
|
+
validate = False
|
|
836
|
+
|
|
837
|
+
query = self._projects_select(
|
|
838
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
839
|
+
*(getattr(p.c, f) for f in self._projects_fields),
|
|
840
|
+
)
|
|
841
|
+
query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
|
|
842
|
+
p.c.name == name, n.c.name == namespace_name
|
|
843
|
+
)
|
|
844
|
+
|
|
845
|
+
rows = list(self.db.execute(query, conn=conn))
|
|
846
|
+
if not rows:
|
|
847
|
+
if create:
|
|
848
|
+
return self.create_project(namespace_name, name, validate=validate)
|
|
849
|
+
raise ProjectNotFoundError(
|
|
850
|
+
f"Project {name} in namespace {namespace_name} not found."
|
|
851
|
+
)
|
|
852
|
+
return self.project_class.parse(*rows[0])
|
|
853
|
+
|
|
854
|
+
def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
|
|
855
|
+
"""
|
|
856
|
+
Gets a list of projects inside some namespace, or in all namespaces
|
|
857
|
+
"""
|
|
858
|
+
n = self._namespaces
|
|
859
|
+
p = self._projects
|
|
860
|
+
|
|
861
|
+
query = self._projects_select(
|
|
862
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
863
|
+
*(getattr(p.c, f) for f in self._projects_fields),
|
|
864
|
+
)
|
|
865
|
+
query = query.select_from(n.join(p, n.c.id == p.c.namespace_id))
|
|
866
|
+
|
|
867
|
+
if namespace_id:
|
|
868
|
+
query = query.where(n.c.id == namespace_id)
|
|
869
|
+
|
|
870
|
+
rows = list(self.db.execute(query, conn=conn))
|
|
871
|
+
|
|
872
|
+
return [self.project_class.parse(*r) for r in rows]
|
|
873
|
+
|
|
513
874
|
#
|
|
514
875
|
# Datasets
|
|
515
876
|
#
|
|
@@ -517,6 +878,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
517
878
|
def create_dataset(
|
|
518
879
|
self,
|
|
519
880
|
name: str,
|
|
881
|
+
project_id: Optional[int] = None,
|
|
520
882
|
status: int = DatasetStatus.CREATED,
|
|
521
883
|
sources: Optional[list[str]] = None,
|
|
522
884
|
feature_schema: Optional[dict] = None,
|
|
@@ -528,9 +890,11 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
528
890
|
**kwargs, # TODO registered = True / False
|
|
529
891
|
) -> DatasetRecord:
|
|
530
892
|
"""Creates new dataset."""
|
|
531
|
-
|
|
893
|
+
project_id = project_id or self.default_project.id
|
|
894
|
+
|
|
532
895
|
query = self._datasets_insert().values(
|
|
533
896
|
name=name,
|
|
897
|
+
project_id=project_id,
|
|
534
898
|
status=status,
|
|
535
899
|
feature_schema=json.dumps(feature_schema or {}),
|
|
536
900
|
created_at=datetime.now(timezone.utc),
|
|
@@ -546,10 +910,10 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
546
910
|
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
547
911
|
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
548
912
|
# but generic SQL does not
|
|
549
|
-
query = query.on_conflict_do_nothing(index_elements=["name"])
|
|
913
|
+
query = query.on_conflict_do_nothing(index_elements=["project_id", "name"])
|
|
550
914
|
self.db.execute(query)
|
|
551
915
|
|
|
552
|
-
return self.get_dataset(name)
|
|
916
|
+
return self.get_dataset(name, project_id)
|
|
553
917
|
|
|
554
918
|
def create_dataset_version( # noqa: PLR0913
|
|
555
919
|
self,
|
|
@@ -606,7 +970,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
606
970
|
)
|
|
607
971
|
self.db.execute(query, conn=conn)
|
|
608
972
|
|
|
609
|
-
return self.get_dataset(dataset.name, conn=conn)
|
|
973
|
+
return self.get_dataset(dataset.name, dataset.project.id, conn=conn)
|
|
610
974
|
|
|
611
975
|
def remove_dataset(self, dataset: DatasetRecord) -> None:
|
|
612
976
|
"""Removes dataset."""
|
|
@@ -744,13 +1108,15 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
744
1108
|
|
|
745
1109
|
def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
|
|
746
1110
|
# grouping rows by dataset id
|
|
747
|
-
for _, g in groupby(rows, lambda r: r[
|
|
1111
|
+
for _, g in groupby(rows, lambda r: r[11]):
|
|
748
1112
|
dataset = self._parse_list_dataset(list(g))
|
|
749
1113
|
if dataset:
|
|
750
1114
|
yield dataset
|
|
751
1115
|
|
|
752
1116
|
def _get_dataset_query(
|
|
753
1117
|
self,
|
|
1118
|
+
namespace_fields: list[str],
|
|
1119
|
+
project_fields: list[str],
|
|
754
1120
|
dataset_fields: list[str],
|
|
755
1121
|
dataset_version_fields: list[str],
|
|
756
1122
|
isouter: bool = True,
|
|
@@ -761,48 +1127,81 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
761
1127
|
):
|
|
762
1128
|
raise TableMissingError
|
|
763
1129
|
|
|
1130
|
+
n = self._namespaces
|
|
1131
|
+
p = self._projects
|
|
764
1132
|
d = self._datasets
|
|
765
1133
|
dv = self._datasets_versions
|
|
766
1134
|
|
|
767
1135
|
query = self._datasets_select(
|
|
1136
|
+
*(getattr(n.c, f) for f in namespace_fields),
|
|
1137
|
+
*(getattr(p.c, f) for f in project_fields),
|
|
768
1138
|
*(getattr(d.c, f) for f in dataset_fields),
|
|
769
1139
|
*(getattr(dv.c, f) for f in dataset_version_fields),
|
|
770
1140
|
)
|
|
771
|
-
j =
|
|
1141
|
+
j = (
|
|
1142
|
+
n.join(p, n.c.id == p.c.namespace_id)
|
|
1143
|
+
.join(d, p.c.id == d.c.project_id)
|
|
1144
|
+
.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
|
|
1145
|
+
)
|
|
772
1146
|
return query.select_from(j)
|
|
773
1147
|
|
|
774
1148
|
def _base_dataset_query(self) -> "Select":
|
|
775
1149
|
return self._get_dataset_query(
|
|
776
|
-
self.
|
|
1150
|
+
self._namespaces_fields,
|
|
1151
|
+
self._projects_fields,
|
|
1152
|
+
self._dataset_fields,
|
|
1153
|
+
self._dataset_version_fields,
|
|
777
1154
|
)
|
|
778
1155
|
|
|
779
1156
|
def _base_list_datasets_query(self) -> "Select":
|
|
780
1157
|
return self._get_dataset_query(
|
|
781
|
-
self.
|
|
1158
|
+
self._namespaces_fields,
|
|
1159
|
+
self._projects_fields,
|
|
1160
|
+
self._dataset_list_fields,
|
|
1161
|
+
self._dataset_list_version_fields,
|
|
1162
|
+
isouter=False,
|
|
782
1163
|
)
|
|
783
1164
|
|
|
784
|
-
def list_datasets(
|
|
1165
|
+
def list_datasets(
|
|
1166
|
+
self, project_id: Optional[int] = None
|
|
1167
|
+
) -> Iterator["DatasetListRecord"]:
|
|
785
1168
|
"""Lists all datasets."""
|
|
1169
|
+
d = self._datasets
|
|
786
1170
|
query = self._base_list_datasets_query().order_by(
|
|
787
1171
|
self._datasets.c.name, self._datasets_versions.c.version
|
|
788
1172
|
)
|
|
1173
|
+
if project_id:
|
|
1174
|
+
query = query.where(d.c.project_id == project_id)
|
|
789
1175
|
yield from self._parse_dataset_list(self.db.execute(query))
|
|
790
1176
|
|
|
791
1177
|
def list_datasets_by_prefix(
|
|
792
|
-
self, prefix: str, conn=None
|
|
1178
|
+
self, prefix: str, project_id: Optional[int] = None, conn=None
|
|
793
1179
|
) -> Iterator["DatasetListRecord"]:
|
|
1180
|
+
d = self._datasets
|
|
794
1181
|
query = self._base_list_datasets_query()
|
|
1182
|
+
if project_id:
|
|
1183
|
+
query = query.where(d.c.project_id == project_id)
|
|
795
1184
|
query = query.where(self._datasets.c.name.startswith(prefix))
|
|
796
1185
|
yield from self._parse_dataset_list(self.db.execute(query))
|
|
797
1186
|
|
|
798
|
-
def get_dataset(
|
|
799
|
-
|
|
1187
|
+
def get_dataset(
|
|
1188
|
+
self,
|
|
1189
|
+
name: str, # normal, not full dataset name
|
|
1190
|
+
project_id: Optional[int] = None,
|
|
1191
|
+
conn=None,
|
|
1192
|
+
) -> DatasetRecord:
|
|
1193
|
+
"""
|
|
1194
|
+
Gets a single dataset in project by dataset name.
|
|
1195
|
+
"""
|
|
1196
|
+
project_id = project_id or self.default_project.id
|
|
800
1197
|
d = self._datasets
|
|
801
1198
|
query = self._base_dataset_query()
|
|
802
|
-
query = query.where(d.c.name == name) # type: ignore [attr-defined]
|
|
1199
|
+
query = query.where(d.c.name == name, d.c.project_id == project_id) # type: ignore [attr-defined]
|
|
803
1200
|
ds = self._parse_dataset(self.db.execute(query, conn=conn))
|
|
804
1201
|
if not ds:
|
|
805
|
-
raise DatasetNotFoundError(
|
|
1202
|
+
raise DatasetNotFoundError(
|
|
1203
|
+
f"Dataset {name} not found in project {project_id}"
|
|
1204
|
+
)
|
|
806
1205
|
return ds
|
|
807
1206
|
|
|
808
1207
|
def remove_dataset_version(
|
|
@@ -872,23 +1271,20 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
872
1271
|
#
|
|
873
1272
|
def add_dataset_dependency(
|
|
874
1273
|
self,
|
|
875
|
-
|
|
1274
|
+
source_dataset: "DatasetRecord",
|
|
876
1275
|
source_dataset_version: str,
|
|
877
|
-
|
|
878
|
-
|
|
1276
|
+
dep_dataset: "DatasetRecord",
|
|
1277
|
+
dep_dataset_version: str,
|
|
879
1278
|
) -> None:
|
|
880
1279
|
"""Adds dataset dependency to dataset."""
|
|
881
|
-
source_dataset = self.get_dataset(source_dataset_name)
|
|
882
|
-
dataset = self.get_dataset(dataset_name)
|
|
883
|
-
|
|
884
1280
|
self.db.execute(
|
|
885
1281
|
self._datasets_dependencies_insert().values(
|
|
886
1282
|
source_dataset_id=source_dataset.id,
|
|
887
1283
|
source_dataset_version_id=(
|
|
888
1284
|
source_dataset.get_version(source_dataset_version).id
|
|
889
1285
|
),
|
|
890
|
-
dataset_id=
|
|
891
|
-
dataset_version_id=
|
|
1286
|
+
dataset_id=dep_dataset.id,
|
|
1287
|
+
dataset_version_id=dep_dataset.get_version(dep_dataset_version).id,
|
|
892
1288
|
)
|
|
893
1289
|
)
|
|
894
1290
|
|
|
@@ -930,6 +1326,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
930
1326
|
def get_direct_dataset_dependencies(
|
|
931
1327
|
self, dataset: DatasetRecord, version: str
|
|
932
1328
|
) -> list[Optional[DatasetDependency]]:
|
|
1329
|
+
n = self._namespaces
|
|
1330
|
+
p = self._projects
|
|
933
1331
|
d = self._datasets
|
|
934
1332
|
dd = self._datasets_dependencies
|
|
935
1333
|
dv = self._datasets_versions
|
|
@@ -941,18 +1339,16 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
941
1339
|
query = (
|
|
942
1340
|
self._datasets_dependencies_select(*select_cols)
|
|
943
1341
|
.select_from(
|
|
944
|
-
dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
|
|
945
|
-
|
|
946
|
-
)
|
|
1342
|
+
dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
|
|
1343
|
+
.join(dv, dd.c.dataset_version_id == dv.c.id, isouter=True)
|
|
1344
|
+
.join(p, d.c.project_id == p.c.id, isouter=True)
|
|
1345
|
+
.join(n, p.c.namespace_id == n.c.id, isouter=True)
|
|
947
1346
|
)
|
|
948
1347
|
.where(
|
|
949
1348
|
(dd.c.source_dataset_id == dataset.id)
|
|
950
1349
|
& (dd.c.source_dataset_version_id == dataset_version.id)
|
|
951
1350
|
)
|
|
952
1351
|
)
|
|
953
|
-
if version:
|
|
954
|
-
dataset_version = dataset.get_version(version)
|
|
955
|
-
query = query.where(dd.c.source_dataset_version_id == dataset_version.id)
|
|
956
1352
|
|
|
957
1353
|
return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
|
|
958
1354
|
|