datachain 0.21.1__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +422 -37
- datachain/data_storage/sqlite.py +136 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +126 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +260 -92
- datachain/lib/dc/datasets.py +104 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +23 -6
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
|
@@ -37,9 +37,13 @@ from datachain.dataset import (
|
|
|
37
37
|
from datachain.error import (
|
|
38
38
|
DatasetNotFoundError,
|
|
39
39
|
DatasetVersionNotFoundError,
|
|
40
|
+
NamespaceNotFoundError,
|
|
41
|
+
ProjectNotFoundError,
|
|
40
42
|
TableMissingError,
|
|
41
43
|
)
|
|
42
44
|
from datachain.job import Job
|
|
45
|
+
from datachain.namespace import Namespace
|
|
46
|
+
from datachain.project import Project
|
|
43
47
|
from datachain.utils import JSONSerialize
|
|
44
48
|
|
|
45
49
|
if TYPE_CHECKING:
|
|
@@ -61,6 +65,8 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
61
65
|
uri: StorageURI
|
|
62
66
|
|
|
63
67
|
schema: "schema.Schema"
|
|
68
|
+
namespace_class: type[Namespace] = Namespace
|
|
69
|
+
project_class: type[Project] = Project
|
|
64
70
|
dataset_class: type[DatasetRecord] = DatasetRecord
|
|
65
71
|
dataset_list_class: type[DatasetListRecord] = DatasetListRecord
|
|
66
72
|
dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
|
|
@@ -107,13 +113,114 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
107
113
|
"""Cleanup for tests."""
|
|
108
114
|
|
|
109
115
|
#
|
|
110
|
-
#
|
|
116
|
+
# Namespaces
|
|
117
|
+
#
|
|
118
|
+
|
|
119
|
+
@property
|
|
120
|
+
@abstractmethod
|
|
121
|
+
def default_namespace_name(self):
|
|
122
|
+
"""Gets default namespace name"""
|
|
123
|
+
|
|
124
|
+
@property
|
|
125
|
+
def system_namespace_name(self):
|
|
126
|
+
return Namespace.system()
|
|
127
|
+
|
|
128
|
+
@abstractmethod
|
|
129
|
+
def create_namespace(
|
|
130
|
+
self,
|
|
131
|
+
name: str,
|
|
132
|
+
description: Optional[str] = None,
|
|
133
|
+
uuid: Optional[str] = None,
|
|
134
|
+
ignore_if_exists: bool = True,
|
|
135
|
+
**kwargs,
|
|
136
|
+
) -> Namespace:
|
|
137
|
+
"""Creates new namespace"""
|
|
138
|
+
|
|
139
|
+
@abstractmethod
|
|
140
|
+
def get_namespace(self, name: str, conn=None) -> Namespace:
|
|
141
|
+
"""Gets a single namespace by name"""
|
|
142
|
+
|
|
143
|
+
@abstractmethod
|
|
144
|
+
def list_namespaces(self, conn=None) -> list[Namespace]:
|
|
145
|
+
"""Gets a list of all namespaces"""
|
|
146
|
+
|
|
147
|
+
@property
|
|
148
|
+
@abstractmethod
|
|
149
|
+
def is_studio(self) -> bool:
|
|
150
|
+
"""Returns True if this code is ran in Studio"""
|
|
151
|
+
|
|
152
|
+
def is_local_dataset(self, dataset_namespace: str) -> bool:
|
|
153
|
+
"""
|
|
154
|
+
Returns True if this is local dataset i.e. not pulled from Studio but
|
|
155
|
+
created locally. This is False if we ran code in CLI mode but using dataset
|
|
156
|
+
names that are present in Studio.
|
|
157
|
+
"""
|
|
158
|
+
return self.is_studio or dataset_namespace == Namespace.default()
|
|
159
|
+
|
|
160
|
+
@property
|
|
161
|
+
def namespace_allowed_to_create(self):
|
|
162
|
+
return self.is_studio
|
|
163
|
+
|
|
164
|
+
#
|
|
165
|
+
# Projects
|
|
111
166
|
#
|
|
112
167
|
|
|
168
|
+
@property
|
|
169
|
+
@abstractmethod
|
|
170
|
+
def default_project_name(self):
|
|
171
|
+
"""Gets default project name"""
|
|
172
|
+
|
|
173
|
+
@property
|
|
174
|
+
def listing_project_name(self):
|
|
175
|
+
return Project.listing()
|
|
176
|
+
|
|
177
|
+
@cached_property
|
|
178
|
+
def default_project(self) -> Project:
|
|
179
|
+
return self.get_project(
|
|
180
|
+
self.default_project_name, self.default_namespace_name, create=True
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
@cached_property
|
|
184
|
+
def listing_project(self) -> Project:
|
|
185
|
+
return self.get_project(self.listing_project_name, self.system_namespace_name)
|
|
186
|
+
|
|
187
|
+
@abstractmethod
|
|
188
|
+
def create_project(
|
|
189
|
+
self,
|
|
190
|
+
namespace_name: str,
|
|
191
|
+
name: str,
|
|
192
|
+
description: Optional[str] = None,
|
|
193
|
+
uuid: Optional[str] = None,
|
|
194
|
+
ignore_if_exists: bool = True,
|
|
195
|
+
**kwargs,
|
|
196
|
+
) -> Project:
|
|
197
|
+
"""Creates new project in specific namespace"""
|
|
198
|
+
|
|
199
|
+
@abstractmethod
|
|
200
|
+
def get_project(
|
|
201
|
+
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
202
|
+
) -> Project:
|
|
203
|
+
"""
|
|
204
|
+
Gets a single project inside some namespace by name.
|
|
205
|
+
It also creates project if not found and create flag is set to True.
|
|
206
|
+
"""
|
|
207
|
+
|
|
208
|
+
@abstractmethod
|
|
209
|
+
def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
|
|
210
|
+
"""Gets list of projects in some namespace or in general (in all namespaces)"""
|
|
211
|
+
|
|
212
|
+
@property
|
|
213
|
+
def project_allowed_to_create(self):
|
|
214
|
+
return self.is_studio
|
|
215
|
+
|
|
216
|
+
#
|
|
217
|
+
# Datasets
|
|
218
|
+
#
|
|
113
219
|
@abstractmethod
|
|
114
220
|
def create_dataset(
|
|
115
221
|
self,
|
|
116
222
|
name: str,
|
|
223
|
+
project_id: Optional[int] = None,
|
|
117
224
|
status: int = DatasetStatus.CREATED,
|
|
118
225
|
sources: Optional[list[str]] = None,
|
|
119
226
|
feature_schema: Optional[dict] = None,
|
|
@@ -173,15 +280,22 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
173
280
|
"""
|
|
174
281
|
|
|
175
282
|
@abstractmethod
|
|
176
|
-
def list_datasets(
|
|
177
|
-
|
|
283
|
+
def list_datasets(
|
|
284
|
+
self, project_id: Optional[int] = None
|
|
285
|
+
) -> Iterator[DatasetListRecord]:
|
|
286
|
+
"""Lists all datasets in some project or in all projects."""
|
|
178
287
|
|
|
179
288
|
@abstractmethod
|
|
180
|
-
def list_datasets_by_prefix(
|
|
181
|
-
|
|
289
|
+
def list_datasets_by_prefix(
|
|
290
|
+
self, prefix: str, project_id: Optional[int] = None
|
|
291
|
+
) -> Iterator["DatasetListRecord"]:
|
|
292
|
+
"""
|
|
293
|
+
Lists all datasets which names start with prefix in some project or in all
|
|
294
|
+
projects.
|
|
295
|
+
"""
|
|
182
296
|
|
|
183
297
|
@abstractmethod
|
|
184
|
-
def get_dataset(self, name: str) -> DatasetRecord:
|
|
298
|
+
def get_dataset(self, name: str, project_id: Optional[int] = None) -> DatasetRecord:
|
|
185
299
|
"""Gets a single dataset by name."""
|
|
186
300
|
|
|
187
301
|
@abstractmethod
|
|
@@ -202,10 +316,10 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
202
316
|
@abstractmethod
|
|
203
317
|
def add_dataset_dependency(
|
|
204
318
|
self,
|
|
205
|
-
|
|
319
|
+
source_dataset: "DatasetRecord",
|
|
206
320
|
source_dataset_version: str,
|
|
207
|
-
|
|
208
|
-
|
|
321
|
+
dep_dataset: "DatasetRecord",
|
|
322
|
+
dep_dataset_version: str,
|
|
209
323
|
) -> None:
|
|
210
324
|
"""Adds dataset dependency to dataset."""
|
|
211
325
|
|
|
@@ -304,6 +418,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
304
418
|
and has shared logic for all database systems currently in use.
|
|
305
419
|
"""
|
|
306
420
|
|
|
421
|
+
NAMESPACE_TABLE = "namespaces"
|
|
422
|
+
PROJECT_TABLE = "projects"
|
|
307
423
|
DATASET_TABLE = "datasets"
|
|
308
424
|
DATASET_VERSION_TABLE = "datasets_versions"
|
|
309
425
|
DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
|
|
@@ -322,11 +438,62 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
322
438
|
def cleanup_tables(self, temp_table_names: list[str]) -> None:
|
|
323
439
|
"""Cleanup temp tables."""
|
|
324
440
|
|
|
441
|
+
@classmethod
|
|
442
|
+
def _namespaces_columns(cls) -> list["SchemaItem"]:
|
|
443
|
+
"""Namespace table columns."""
|
|
444
|
+
return [
|
|
445
|
+
Column("id", Integer, primary_key=True),
|
|
446
|
+
Column("uuid", Text, nullable=False, default=uuid4()),
|
|
447
|
+
Column("name", Text, nullable=False),
|
|
448
|
+
Column("description", Text),
|
|
449
|
+
Column("created_at", DateTime(timezone=True)),
|
|
450
|
+
]
|
|
451
|
+
|
|
452
|
+
@cached_property
|
|
453
|
+
def _namespaces_fields(self) -> list[str]:
|
|
454
|
+
return [
|
|
455
|
+
c.name # type: ignore [attr-defined]
|
|
456
|
+
for c in self._namespaces_columns()
|
|
457
|
+
if c.name # type: ignore [attr-defined]
|
|
458
|
+
]
|
|
459
|
+
|
|
460
|
+
@classmethod
|
|
461
|
+
def _projects_columns(cls) -> list["SchemaItem"]:
|
|
462
|
+
"""Project table columns."""
|
|
463
|
+
return [
|
|
464
|
+
Column("id", Integer, primary_key=True),
|
|
465
|
+
Column("uuid", Text, nullable=False, default=uuid4()),
|
|
466
|
+
Column("name", Text, nullable=False),
|
|
467
|
+
Column("description", Text),
|
|
468
|
+
Column("created_at", DateTime(timezone=True)),
|
|
469
|
+
Column(
|
|
470
|
+
"namespace_id",
|
|
471
|
+
Integer,
|
|
472
|
+
ForeignKey(f"{cls.NAMESPACE_TABLE}.id", ondelete="CASCADE"),
|
|
473
|
+
nullable=False,
|
|
474
|
+
),
|
|
475
|
+
UniqueConstraint("namespace_id", "name"),
|
|
476
|
+
]
|
|
477
|
+
|
|
478
|
+
@cached_property
|
|
479
|
+
def _projects_fields(self) -> list[str]:
|
|
480
|
+
return [
|
|
481
|
+
c.name # type: ignore [attr-defined]
|
|
482
|
+
for c in self._projects_columns()
|
|
483
|
+
if c.name # type: ignore [attr-defined]
|
|
484
|
+
]
|
|
485
|
+
|
|
325
486
|
@classmethod
|
|
326
487
|
def _datasets_columns(cls) -> list["SchemaItem"]:
|
|
327
488
|
"""Datasets table columns."""
|
|
328
489
|
return [
|
|
329
490
|
Column("id", Integer, primary_key=True),
|
|
491
|
+
Column(
|
|
492
|
+
"project_id",
|
|
493
|
+
Integer,
|
|
494
|
+
ForeignKey(f"{cls.PROJECT_TABLE}.id", ondelete="CASCADE"),
|
|
495
|
+
nullable=False,
|
|
496
|
+
),
|
|
330
497
|
Column("name", Text, nullable=False),
|
|
331
498
|
Column("description", Text),
|
|
332
499
|
Column("attrs", JSON, nullable=True),
|
|
@@ -445,6 +612,16 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
445
612
|
#
|
|
446
613
|
# Query Tables
|
|
447
614
|
#
|
|
615
|
+
@cached_property
|
|
616
|
+
def _namespaces(self) -> Table:
|
|
617
|
+
return Table(
|
|
618
|
+
self.NAMESPACE_TABLE, self.db.metadata, *self._namespaces_columns()
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
@cached_property
|
|
622
|
+
def _projects(self) -> Table:
|
|
623
|
+
return Table(self.PROJECT_TABLE, self.db.metadata, *self._projects_columns())
|
|
624
|
+
|
|
448
625
|
@cached_property
|
|
449
626
|
def _datasets(self) -> Table:
|
|
450
627
|
return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
|
|
@@ -468,6 +645,34 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
468
645
|
#
|
|
469
646
|
# Query Starters (These can be overridden by subclasses)
|
|
470
647
|
#
|
|
648
|
+
@abstractmethod
|
|
649
|
+
def _namespaces_insert(self) -> "Insert": ...
|
|
650
|
+
|
|
651
|
+
def _namespaces_select(self, *columns) -> "Select":
|
|
652
|
+
if not columns:
|
|
653
|
+
return self._namespaces.select()
|
|
654
|
+
return select(*columns)
|
|
655
|
+
|
|
656
|
+
def _namespaces_update(self) -> "Update":
|
|
657
|
+
return self._namespaces.update()
|
|
658
|
+
|
|
659
|
+
def _namespaces_delete(self) -> "Delete":
|
|
660
|
+
return self._namespaces.delete()
|
|
661
|
+
|
|
662
|
+
@abstractmethod
|
|
663
|
+
def _projects_insert(self) -> "Insert": ...
|
|
664
|
+
|
|
665
|
+
def _projects_select(self, *columns) -> "Select":
|
|
666
|
+
if not columns:
|
|
667
|
+
return self._projects.select()
|
|
668
|
+
return select(*columns)
|
|
669
|
+
|
|
670
|
+
def _projects_update(self) -> "Update":
|
|
671
|
+
return self._projects.update()
|
|
672
|
+
|
|
673
|
+
def _projects_delete(self) -> "Delete":
|
|
674
|
+
return self._projects.delete()
|
|
675
|
+
|
|
471
676
|
@abstractmethod
|
|
472
677
|
def _datasets_insert(self) -> "Insert": ...
|
|
473
678
|
|
|
@@ -510,6 +715,151 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
510
715
|
def _datasets_dependencies_delete(self) -> "Delete":
|
|
511
716
|
return self._datasets_dependencies.delete()
|
|
512
717
|
|
|
718
|
+
#
|
|
719
|
+
# Namespaces
|
|
720
|
+
#
|
|
721
|
+
|
|
722
|
+
def create_namespace(
|
|
723
|
+
self,
|
|
724
|
+
name: str,
|
|
725
|
+
description: Optional[str] = None,
|
|
726
|
+
uuid: Optional[str] = None,
|
|
727
|
+
ignore_if_exists: bool = True,
|
|
728
|
+
**kwargs,
|
|
729
|
+
) -> Namespace:
|
|
730
|
+
query = self._namespaces_insert().values(
|
|
731
|
+
name=name,
|
|
732
|
+
uuid=uuid or str(uuid4()),
|
|
733
|
+
created_at=datetime.now(timezone.utc),
|
|
734
|
+
description=description,
|
|
735
|
+
)
|
|
736
|
+
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
737
|
+
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
738
|
+
# but generic SQL does not
|
|
739
|
+
query = query.on_conflict_do_nothing(index_elements=["name"])
|
|
740
|
+
self.db.execute(query)
|
|
741
|
+
|
|
742
|
+
return self.get_namespace(name)
|
|
743
|
+
|
|
744
|
+
def get_namespace(self, name: str, conn=None) -> Namespace:
|
|
745
|
+
"""Gets a single namespace by name"""
|
|
746
|
+
n = self._namespaces
|
|
747
|
+
|
|
748
|
+
query = self._namespaces_select(
|
|
749
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
750
|
+
).where(n.c.name == name)
|
|
751
|
+
rows = list(self.db.execute(query, conn=conn))
|
|
752
|
+
if not rows:
|
|
753
|
+
raise NamespaceNotFoundError(f"Namespace {name} not found.")
|
|
754
|
+
return self.namespace_class.parse(*rows[0])
|
|
755
|
+
|
|
756
|
+
def list_namespaces(self, conn=None) -> list[Namespace]:
|
|
757
|
+
"""Gets a list of all namespaces"""
|
|
758
|
+
n = self._namespaces
|
|
759
|
+
|
|
760
|
+
query = self._namespaces_select(
|
|
761
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
762
|
+
)
|
|
763
|
+
rows = list(self.db.execute(query, conn=conn))
|
|
764
|
+
|
|
765
|
+
return [self.namespace_class.parse(*r) for r in rows]
|
|
766
|
+
|
|
767
|
+
#
|
|
768
|
+
# Projects
|
|
769
|
+
#
|
|
770
|
+
|
|
771
|
+
def create_project(
|
|
772
|
+
self,
|
|
773
|
+
namespace_name: str,
|
|
774
|
+
name: str,
|
|
775
|
+
description: Optional[str] = None,
|
|
776
|
+
uuid: Optional[str] = None,
|
|
777
|
+
ignore_if_exists: bool = True,
|
|
778
|
+
**kwargs,
|
|
779
|
+
) -> Project:
|
|
780
|
+
try:
|
|
781
|
+
namespace = self.get_namespace(namespace_name)
|
|
782
|
+
except NamespaceNotFoundError:
|
|
783
|
+
namespace = self.create_namespace(namespace_name)
|
|
784
|
+
|
|
785
|
+
query = self._projects_insert().values(
|
|
786
|
+
namespace_id=namespace.id,
|
|
787
|
+
uuid=uuid or str(uuid4()),
|
|
788
|
+
name=name,
|
|
789
|
+
created_at=datetime.now(timezone.utc),
|
|
790
|
+
description=description,
|
|
791
|
+
)
|
|
792
|
+
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
793
|
+
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
794
|
+
# but generic SQL does not
|
|
795
|
+
query = query.on_conflict_do_nothing(
|
|
796
|
+
index_elements=["namespace_id", "name"]
|
|
797
|
+
)
|
|
798
|
+
self.db.execute(query)
|
|
799
|
+
|
|
800
|
+
return self.get_project(name, namespace.name)
|
|
801
|
+
|
|
802
|
+
def _is_listing_project(self, project_name: str, namespace_name: str) -> bool:
|
|
803
|
+
return (
|
|
804
|
+
project_name == self.listing_project_name
|
|
805
|
+
and namespace_name == self.system_namespace_name
|
|
806
|
+
)
|
|
807
|
+
|
|
808
|
+
def _is_default_project(self, project_name: str, namespace_name: str) -> bool:
|
|
809
|
+
return (
|
|
810
|
+
project_name == self.default_project_name
|
|
811
|
+
and namespace_name == self.default_namespace_name
|
|
812
|
+
)
|
|
813
|
+
|
|
814
|
+
def get_project(
|
|
815
|
+
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
816
|
+
) -> Project:
|
|
817
|
+
"""Gets a single project inside some namespace by name"""
|
|
818
|
+
n = self._namespaces
|
|
819
|
+
p = self._projects
|
|
820
|
+
if self._is_listing_project(name, namespace_name) or self._is_default_project(
|
|
821
|
+
name, namespace_name
|
|
822
|
+
):
|
|
823
|
+
# we are always creating default and listing projects if they don't exist
|
|
824
|
+
create = True
|
|
825
|
+
|
|
826
|
+
query = self._projects_select(
|
|
827
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
828
|
+
*(getattr(p.c, f) for f in self._projects_fields),
|
|
829
|
+
)
|
|
830
|
+
query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
|
|
831
|
+
p.c.name == name, n.c.name == namespace_name
|
|
832
|
+
)
|
|
833
|
+
|
|
834
|
+
rows = list(self.db.execute(query, conn=conn))
|
|
835
|
+
if not rows:
|
|
836
|
+
if create:
|
|
837
|
+
return self.create_project(namespace_name, name)
|
|
838
|
+
raise ProjectNotFoundError(
|
|
839
|
+
f"Project {name} in namespace {namespace_name} not found."
|
|
840
|
+
)
|
|
841
|
+
return self.project_class.parse(*rows[0])
|
|
842
|
+
|
|
843
|
+
def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
|
|
844
|
+
"""
|
|
845
|
+
Gets a list of projects inside some namespace, or in all namespaces
|
|
846
|
+
"""
|
|
847
|
+
n = self._namespaces
|
|
848
|
+
p = self._projects
|
|
849
|
+
|
|
850
|
+
query = self._projects_select(
|
|
851
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
852
|
+
*(getattr(p.c, f) for f in self._projects_fields),
|
|
853
|
+
)
|
|
854
|
+
query = query.select_from(n.join(p, n.c.id == p.c.namespace_id))
|
|
855
|
+
|
|
856
|
+
if namespace_id:
|
|
857
|
+
query = query.where(n.c.id == namespace_id)
|
|
858
|
+
|
|
859
|
+
rows = list(self.db.execute(query, conn=conn))
|
|
860
|
+
|
|
861
|
+
return [self.project_class.parse(*r) for r in rows]
|
|
862
|
+
|
|
513
863
|
#
|
|
514
864
|
# Datasets
|
|
515
865
|
#
|
|
@@ -517,6 +867,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
517
867
|
def create_dataset(
|
|
518
868
|
self,
|
|
519
869
|
name: str,
|
|
870
|
+
project_id: Optional[int] = None,
|
|
520
871
|
status: int = DatasetStatus.CREATED,
|
|
521
872
|
sources: Optional[list[str]] = None,
|
|
522
873
|
feature_schema: Optional[dict] = None,
|
|
@@ -528,9 +879,11 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
528
879
|
**kwargs, # TODO registered = True / False
|
|
529
880
|
) -> DatasetRecord:
|
|
530
881
|
"""Creates new dataset."""
|
|
531
|
-
|
|
882
|
+
project_id = project_id or self.default_project.id
|
|
883
|
+
|
|
532
884
|
query = self._datasets_insert().values(
|
|
533
885
|
name=name,
|
|
886
|
+
project_id=project_id,
|
|
534
887
|
status=status,
|
|
535
888
|
feature_schema=json.dumps(feature_schema or {}),
|
|
536
889
|
created_at=datetime.now(timezone.utc),
|
|
@@ -546,10 +899,10 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
546
899
|
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
547
900
|
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
548
901
|
# but generic SQL does not
|
|
549
|
-
query = query.on_conflict_do_nothing(index_elements=["name"])
|
|
902
|
+
query = query.on_conflict_do_nothing(index_elements=["project_id", "name"])
|
|
550
903
|
self.db.execute(query)
|
|
551
904
|
|
|
552
|
-
return self.get_dataset(name)
|
|
905
|
+
return self.get_dataset(name, project_id)
|
|
553
906
|
|
|
554
907
|
def create_dataset_version( # noqa: PLR0913
|
|
555
908
|
self,
|
|
@@ -606,7 +959,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
606
959
|
)
|
|
607
960
|
self.db.execute(query, conn=conn)
|
|
608
961
|
|
|
609
|
-
return self.get_dataset(dataset.name, conn=conn)
|
|
962
|
+
return self.get_dataset(dataset.name, dataset.project.id, conn=conn)
|
|
610
963
|
|
|
611
964
|
def remove_dataset(self, dataset: DatasetRecord) -> None:
|
|
612
965
|
"""Removes dataset."""
|
|
@@ -744,13 +1097,15 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
744
1097
|
|
|
745
1098
|
def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
|
|
746
1099
|
# grouping rows by dataset id
|
|
747
|
-
for _, g in groupby(rows, lambda r: r[
|
|
1100
|
+
for _, g in groupby(rows, lambda r: r[11]):
|
|
748
1101
|
dataset = self._parse_list_dataset(list(g))
|
|
749
1102
|
if dataset:
|
|
750
1103
|
yield dataset
|
|
751
1104
|
|
|
752
1105
|
def _get_dataset_query(
|
|
753
1106
|
self,
|
|
1107
|
+
namespace_fields: list[str],
|
|
1108
|
+
project_fields: list[str],
|
|
754
1109
|
dataset_fields: list[str],
|
|
755
1110
|
dataset_version_fields: list[str],
|
|
756
1111
|
isouter: bool = True,
|
|
@@ -761,48 +1116,81 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
761
1116
|
):
|
|
762
1117
|
raise TableMissingError
|
|
763
1118
|
|
|
1119
|
+
n = self._namespaces
|
|
1120
|
+
p = self._projects
|
|
764
1121
|
d = self._datasets
|
|
765
1122
|
dv = self._datasets_versions
|
|
766
1123
|
|
|
767
1124
|
query = self._datasets_select(
|
|
1125
|
+
*(getattr(n.c, f) for f in namespace_fields),
|
|
1126
|
+
*(getattr(p.c, f) for f in project_fields),
|
|
768
1127
|
*(getattr(d.c, f) for f in dataset_fields),
|
|
769
1128
|
*(getattr(dv.c, f) for f in dataset_version_fields),
|
|
770
1129
|
)
|
|
771
|
-
j =
|
|
1130
|
+
j = (
|
|
1131
|
+
n.join(p, n.c.id == p.c.namespace_id)
|
|
1132
|
+
.join(d, p.c.id == d.c.project_id)
|
|
1133
|
+
.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
|
|
1134
|
+
)
|
|
772
1135
|
return query.select_from(j)
|
|
773
1136
|
|
|
774
1137
|
def _base_dataset_query(self) -> "Select":
|
|
775
1138
|
return self._get_dataset_query(
|
|
776
|
-
self.
|
|
1139
|
+
self._namespaces_fields,
|
|
1140
|
+
self._projects_fields,
|
|
1141
|
+
self._dataset_fields,
|
|
1142
|
+
self._dataset_version_fields,
|
|
777
1143
|
)
|
|
778
1144
|
|
|
779
1145
|
def _base_list_datasets_query(self) -> "Select":
|
|
780
1146
|
return self._get_dataset_query(
|
|
781
|
-
self.
|
|
1147
|
+
self._namespaces_fields,
|
|
1148
|
+
self._projects_fields,
|
|
1149
|
+
self._dataset_list_fields,
|
|
1150
|
+
self._dataset_list_version_fields,
|
|
1151
|
+
isouter=False,
|
|
782
1152
|
)
|
|
783
1153
|
|
|
784
|
-
def list_datasets(
|
|
1154
|
+
def list_datasets(
|
|
1155
|
+
self, project_id: Optional[int] = None
|
|
1156
|
+
) -> Iterator["DatasetListRecord"]:
|
|
785
1157
|
"""Lists all datasets."""
|
|
1158
|
+
d = self._datasets
|
|
786
1159
|
query = self._base_list_datasets_query().order_by(
|
|
787
1160
|
self._datasets.c.name, self._datasets_versions.c.version
|
|
788
1161
|
)
|
|
1162
|
+
if project_id:
|
|
1163
|
+
query = query.where(d.c.project_id == project_id)
|
|
789
1164
|
yield from self._parse_dataset_list(self.db.execute(query))
|
|
790
1165
|
|
|
791
1166
|
def list_datasets_by_prefix(
|
|
792
|
-
self, prefix: str, conn=None
|
|
1167
|
+
self, prefix: str, project_id: Optional[int] = None, conn=None
|
|
793
1168
|
) -> Iterator["DatasetListRecord"]:
|
|
1169
|
+
d = self._datasets
|
|
794
1170
|
query = self._base_list_datasets_query()
|
|
1171
|
+
if project_id:
|
|
1172
|
+
query = query.where(d.c.project_id == project_id)
|
|
795
1173
|
query = query.where(self._datasets.c.name.startswith(prefix))
|
|
796
1174
|
yield from self._parse_dataset_list(self.db.execute(query))
|
|
797
1175
|
|
|
798
|
-
def get_dataset(
|
|
799
|
-
|
|
1176
|
+
def get_dataset(
|
|
1177
|
+
self,
|
|
1178
|
+
name: str, # normal, not full dataset name
|
|
1179
|
+
project_id: Optional[int] = None,
|
|
1180
|
+
conn=None,
|
|
1181
|
+
) -> DatasetRecord:
|
|
1182
|
+
"""
|
|
1183
|
+
Gets a single dataset in project by dataset name.
|
|
1184
|
+
"""
|
|
1185
|
+
project_id = project_id or self.default_project.id
|
|
800
1186
|
d = self._datasets
|
|
801
1187
|
query = self._base_dataset_query()
|
|
802
|
-
query = query.where(d.c.name == name) # type: ignore [attr-defined]
|
|
1188
|
+
query = query.where(d.c.name == name, d.c.project_id == project_id) # type: ignore [attr-defined]
|
|
803
1189
|
ds = self._parse_dataset(self.db.execute(query, conn=conn))
|
|
804
1190
|
if not ds:
|
|
805
|
-
raise DatasetNotFoundError(
|
|
1191
|
+
raise DatasetNotFoundError(
|
|
1192
|
+
f"Dataset {name} not found in project {project_id}"
|
|
1193
|
+
)
|
|
806
1194
|
return ds
|
|
807
1195
|
|
|
808
1196
|
def remove_dataset_version(
|
|
@@ -872,23 +1260,20 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
872
1260
|
#
|
|
873
1261
|
def add_dataset_dependency(
|
|
874
1262
|
self,
|
|
875
|
-
|
|
1263
|
+
source_dataset: "DatasetRecord",
|
|
876
1264
|
source_dataset_version: str,
|
|
877
|
-
|
|
878
|
-
|
|
1265
|
+
dep_dataset: "DatasetRecord",
|
|
1266
|
+
dep_dataset_version: str,
|
|
879
1267
|
) -> None:
|
|
880
1268
|
"""Adds dataset dependency to dataset."""
|
|
881
|
-
source_dataset = self.get_dataset(source_dataset_name)
|
|
882
|
-
dataset = self.get_dataset(dataset_name)
|
|
883
|
-
|
|
884
1269
|
self.db.execute(
|
|
885
1270
|
self._datasets_dependencies_insert().values(
|
|
886
1271
|
source_dataset_id=source_dataset.id,
|
|
887
1272
|
source_dataset_version_id=(
|
|
888
1273
|
source_dataset.get_version(source_dataset_version).id
|
|
889
1274
|
),
|
|
890
|
-
dataset_id=
|
|
891
|
-
dataset_version_id=
|
|
1275
|
+
dataset_id=dep_dataset.id,
|
|
1276
|
+
dataset_version_id=dep_dataset.get_version(dep_dataset_version).id,
|
|
892
1277
|
)
|
|
893
1278
|
)
|
|
894
1279
|
|
|
@@ -930,6 +1315,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
930
1315
|
def get_direct_dataset_dependencies(
|
|
931
1316
|
self, dataset: DatasetRecord, version: str
|
|
932
1317
|
) -> list[Optional[DatasetDependency]]:
|
|
1318
|
+
n = self._namespaces
|
|
1319
|
+
p = self._projects
|
|
933
1320
|
d = self._datasets
|
|
934
1321
|
dd = self._datasets_dependencies
|
|
935
1322
|
dv = self._datasets_versions
|
|
@@ -941,18 +1328,16 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
941
1328
|
query = (
|
|
942
1329
|
self._datasets_dependencies_select(*select_cols)
|
|
943
1330
|
.select_from(
|
|
944
|
-
dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
|
|
945
|
-
|
|
946
|
-
)
|
|
1331
|
+
dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
|
|
1332
|
+
.join(dv, dd.c.dataset_version_id == dv.c.id, isouter=True)
|
|
1333
|
+
.join(p, d.c.project_id == p.c.id, isouter=True)
|
|
1334
|
+
.join(n, p.c.namespace_id == n.c.id, isouter=True)
|
|
947
1335
|
)
|
|
948
1336
|
.where(
|
|
949
1337
|
(dd.c.source_dataset_id == dataset.id)
|
|
950
1338
|
& (dd.c.source_dataset_version_id == dataset_version.id)
|
|
951
1339
|
)
|
|
952
1340
|
)
|
|
953
|
-
if version:
|
|
954
|
-
dataset_version = dataset.get_version(version)
|
|
955
|
-
query = query.where(dd.c.source_dataset_version_id == dataset_version.id)
|
|
956
1341
|
|
|
957
1342
|
return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
|
|
958
1343
|
|