datachain 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +3 -0
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/parser/__init__.py +1 -35
- datachain/cli/parser/job.py +25 -0
- datachain/cli/parser/studio.py +11 -4
- datachain/data_storage/metastore.py +390 -37
- datachain/data_storage/schema.py +23 -1
- datachain/data_storage/sqlite.py +139 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +125 -12
- datachain/delta.py +9 -5
- datachain/error.py +36 -0
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +86 -7
- datachain/lib/dc/datasets.py +62 -12
- datachain/lib/dc/listings.py +111 -0
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +14 -2
- datachain/lib/listing.py +3 -1
- datachain/lib/namespaces.py +73 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/settings.py +10 -0
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +67 -26
- datachain/studio.py +68 -8
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/METADATA +2 -2
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/RECORD +37 -33
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/WHEEL +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/top_level.txt +0 -0

datachain/data_storage/metastore.py CHANGED

@@ -37,9 +37,13 @@ from datachain.dataset import (
 from datachain.error import (
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
+    NamespaceNotFoundError,
+    ProjectNotFoundError,
     TableMissingError,
 )
 from datachain.job import Job
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.utils import JSONSerialize

 if TYPE_CHECKING:
@@ -61,6 +65,8 @@ class AbstractMetastore(ABC, Serializable):
     uri: StorageURI

     schema: "schema.Schema"
+    namespace_class: type[Namespace] = Namespace
+    project_class: type[Project] = Project
     dataset_class: type[DatasetRecord] = DatasetRecord
     dataset_list_class: type[DatasetListRecord] = DatasetListRecord
     dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
@@ -107,13 +113,107 @@ class AbstractMetastore(ABC, Serializable):
         """Cleanup for tests."""

     #
-    # Datasets
+    # Namespaces
+    #
+
+    @property
+    @abstractmethod
+    def default_namespace_name(self):
+        """Gets default namespace name"""
+
+    @property
+    def system_namespace_name(self):
+        return Namespace.system()
+
+    @abstractmethod
+    def create_namespace(
+        self,
+        name: str,
+        description: Optional[str] = None,
+        uuid: Optional[str] = None,
+        ignore_if_exists: bool = True,
+        **kwargs,
+    ) -> Namespace:
+        """Creates new namespace"""
+
+    @abstractmethod
+    def get_namespace(self, name: str, conn=None) -> Namespace:
+        """Gets a single namespace by name"""
+
+    @abstractmethod
+    def list_namespaces(self, conn=None) -> list[Namespace]:
+        """Gets a list of all namespaces"""
+
+    @property
+    @abstractmethod
+    def is_studio(self) -> bool:
+        """Returns True if this code is ran in Studio"""
+
+    def is_local_dataset(self, dataset_namespace: str) -> bool:
+        """
+        Returns True if this is local dataset i.e. not pulled from Studio but
+        created locally. This is False if we ran code in CLI mode but using dataset
+        names that are present in Studio.
+        """
+        return self.is_studio or dataset_namespace == Namespace.default()
+
+    @property
+    def namespace_allowed_to_create(self):
+        return self.is_studio
+
+    #
+    # Projects
     #

+    @property
+    @abstractmethod
+    def default_project_name(self):
+        """Gets default project name"""
+
+    @property
+    def listing_project_name(self):
+        return Project.listing()
+
+    @cached_property
+    def default_project(self) -> Project:
+        return self.get_project(self.default_project_name, self.default_namespace_name)
+
+    @cached_property
+    def listing_project(self) -> Project:
+        return self.get_project(self.listing_project_name, self.system_namespace_name)
+
+    @abstractmethod
+    def create_project(
+        self,
+        name: str,
+        namespace_name: str,
+        description: Optional[str] = None,
+        uuid: Optional[str] = None,
+        ignore_if_exists: bool = True,
+        **kwargs,
+    ) -> Project:
+        """Creates new project in specific namespace"""
+
+    @abstractmethod
+    def get_project(self, name: str, namespace_name: str, conn=None) -> Project:
+        """Gets a single project inside some namespace by name"""
+
+    @abstractmethod
+    def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
+        """Gets list of projects in some namespace or in general (in all namespaces)"""
+
+    @property
+    def project_allowed_to_create(self):
+        return self.is_studio
+
+    #
+    # Datasets
+    #
     @abstractmethod
     def create_dataset(
         self,
         name: str,
+        project_id: Optional[int] = None,
         status: int = DatasetStatus.CREATED,
         sources: Optional[list[str]] = None,
         feature_schema: Optional[dict] = None,
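
Taken together, the new abstract surface arranges data into a two-level hierarchy above datasets: a namespace holds projects, and a project holds datasets. A minimal usage sketch against some concrete metastore instance (the helper and the literal names here are illustrative, not part of the release):

    def bootstrap_hierarchy(metastore):
        # "dev", "clothes" and "zalando" are hypothetical names.
        ns = metastore.create_namespace("dev", description="dev namespace")
        project = metastore.create_project("clothes", ns.name)

        # Datasets are now addressed by (name, project_id) rather than by
        # name alone; omitting project_id falls back to the default project.
        return metastore.get_dataset("zalando", project.id)
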
@@ -173,15 +273,22 @@ class AbstractMetastore(ABC, Serializable):
         """

     @abstractmethod
-    def list_datasets(self) -> Iterator[DatasetListRecord]:
-        """Lists all datasets."""
+    def list_datasets(
+        self, project_id: Optional[int] = None
+    ) -> Iterator[DatasetListRecord]:
+        """Lists all datasets in some project or in all projects."""

     @abstractmethod
-    def list_datasets_by_prefix(self, prefix: str) -> Iterator["DatasetListRecord"]:
-        """Lists all datasets which names start with prefix."""
+    def list_datasets_by_prefix(
+        self, prefix: str, project_id: Optional[int] = None
+    ) -> Iterator["DatasetListRecord"]:
+        """
+        Lists all datasets which names start with prefix in some project or in all
+        projects.
+        """

     @abstractmethod
-    def get_dataset(self, name: str) -> DatasetRecord:
+    def get_dataset(self, name: str, project_id: Optional[int] = None) -> DatasetRecord:
         """Gets a single dataset by name."""

     @abstractmethod
@@ -202,10 +309,10 @@ class AbstractMetastore(ABC, Serializable):
     @abstractmethod
     def add_dataset_dependency(
         self,
-        source_dataset_name: str,
+        source_dataset: "DatasetRecord",
         source_dataset_version: str,
-        dataset_name: str,
-        dataset_version: str,
+        dep_dataset: "DatasetRecord",
+        dep_dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""

@@ -304,6 +411,8 @@ class AbstractDBMetastore(AbstractMetastore):
     and has shared logic for all database systems currently in use.
     """

+    NAMESPACE_TABLE = "namespaces"
+    PROJECT_TABLE = "projects"
     DATASET_TABLE = "datasets"
     DATASET_VERSION_TABLE = "datasets_versions"
     DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
@@ -322,11 +431,62 @@ class AbstractDBMetastore(AbstractMetastore):
     def cleanup_tables(self, temp_table_names: list[str]) -> None:
         """Cleanup temp tables."""

+    @classmethod
+    def _namespaces_columns(cls) -> list["SchemaItem"]:
+        """Namespace table columns."""
+        return [
+            Column("id", Integer, primary_key=True),
+            Column("uuid", Text, nullable=False, default=uuid4()),
+            Column("name", Text, nullable=False),
+            Column("description", Text),
+            Column("created_at", DateTime(timezone=True)),
+        ]
+
+    @cached_property
+    def _namespaces_fields(self) -> list[str]:
+        return [
+            c.name  # type: ignore [attr-defined]
+            for c in self._namespaces_columns()
+            if c.name  # type: ignore [attr-defined]
+        ]
+
+    @classmethod
+    def _projects_columns(cls) -> list["SchemaItem"]:
+        """Project table columns."""
+        return [
+            Column("id", Integer, primary_key=True),
+            Column("uuid", Text, nullable=False, default=uuid4()),
+            Column("name", Text, nullable=False),
+            Column("description", Text),
+            Column("created_at", DateTime(timezone=True)),
+            Column(
+                "namespace_id",
+                Integer,
+                ForeignKey(f"{cls.NAMESPACE_TABLE}.id", ondelete="CASCADE"),
+                nullable=False,
+            ),
+            UniqueConstraint("namespace_id", "name"),
+        ]
+
+    @cached_property
+    def _projects_fields(self) -> list[str]:
+        return [
+            c.name  # type: ignore [attr-defined]
+            for c in self._projects_columns()
+            if c.name  # type: ignore [attr-defined]
+        ]
+
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
         return [
             Column("id", Integer, primary_key=True),
+            Column(
+                "project_id",
+                Integer,
+                ForeignKey(f"{cls.PROJECT_TABLE}.id", ondelete="CASCADE"),
+                nullable=False,
+            ),
             Column("name", Text, nullable=False),
             Column("description", Text),
             Column("attrs", JSON, nullable=True),
@@ -445,6 +605,16 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Query Tables
     #
+    @cached_property
+    def _namespaces(self) -> Table:
+        return Table(
+            self.NAMESPACE_TABLE, self.db.metadata, *self._namespaces_columns()
+        )
+
+    @cached_property
+    def _projects(self) -> Table:
+        return Table(self.PROJECT_TABLE, self.db.metadata, *self._projects_columns())
+
     @cached_property
     def _datasets(self) -> Table:
         return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
@@ -468,6 +638,34 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Query Starters (These can be overridden by subclasses)
     #
+    @abstractmethod
+    def _namespaces_insert(self) -> "Insert": ...
+
+    def _namespaces_select(self, *columns) -> "Select":
+        if not columns:
+            return self._namespaces.select()
+        return select(*columns)
+
+    def _namespaces_update(self) -> "Update":
+        return self._namespaces.update()
+
+    def _namespaces_delete(self) -> "Delete":
+        return self._namespaces.delete()
+
+    @abstractmethod
+    def _projects_insert(self) -> "Insert": ...
+
+    def _projects_select(self, *columns) -> "Select":
+        if not columns:
+            return self._projects.select()
+        return select(*columns)
+
+    def _projects_update(self) -> "Update":
+        return self._projects.update()
+
+    def _projects_delete(self) -> "Delete":
+        return self._projects.delete()
+
     @abstractmethod
     def _datasets_insert(self) -> "Insert": ...

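
The `*_insert` starters stay abstract because `on_conflict_do_nothing` exists only on dialect-specific insert constructs, which is also why the implementations below probe for it with `hasattr`. A sketch of how a SQLite-backed subclass might satisfy the two new abstract methods, assuming the SQLAlchemy sqlite dialect (the real subclass lives in datachain/data_storage/sqlite.py and may differ):

    from sqlalchemy.dialects.sqlite import Insert, insert

    from datachain.data_storage.metastore import AbstractDBMetastore


    class SQLiteMetastoreSketch(AbstractDBMetastore):
        # Illustrative subset; the other abstract members are omitted here.

        def _namespaces_insert(self) -> Insert:
            # The dialect-specific construct carries on_conflict_do_nothing(),
            # which the generic sqlalchemy.insert() does not.
            return insert(self._namespaces)

        def _projects_insert(self) -> Insert:
            return insert(self._projects)
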
@@ -510,6 +708,126 @@ class AbstractDBMetastore(AbstractMetastore):
     def _datasets_dependencies_delete(self) -> "Delete":
         return self._datasets_dependencies.delete()

+    #
+    # Namespaces
+    #
+
+    def create_namespace(
+        self,
+        name: str,
+        description: Optional[str] = None,
+        uuid: Optional[str] = None,
+        ignore_if_exists: bool = True,
+        **kwargs,
+    ) -> Namespace:
+        query = self._namespaces_insert().values(
+            name=name,
+            uuid=uuid or str(uuid4()),
+            created_at=datetime.now(timezone.utc),
+            description=description,
+        )
+        if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
+            # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
+            # but generic SQL does not
+            query = query.on_conflict_do_nothing(index_elements=["name"])
+        self.db.execute(query)
+
+        return self.get_namespace(name)
+
+    def get_namespace(self, name: str, conn=None) -> Namespace:
+        """Gets a single namespace by name"""
+        n = self._namespaces
+
+        query = self._namespaces_select(
+            *(getattr(n.c, f) for f in self._namespaces_fields),
+        ).where(n.c.name == name)
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            raise NamespaceNotFoundError(f"Namespace {name} not found.")
+        return self.namespace_class.parse(*rows[0])
+
+    def list_namespaces(self, conn=None) -> list[Namespace]:
+        """Gets a list of all namespaces"""
+        n = self._namespaces
+
+        query = self._namespaces_select(
+            *(getattr(n.c, f) for f in self._namespaces_fields),
+        )
+        rows = list(self.db.execute(query, conn=conn))
+
+        return [self.namespace_class.parse(*r) for r in rows]
+
+    #
+    # Projects
+    #
+
+    def create_project(
+        self,
+        name: str,
+        namespace_name: str,
+        description: Optional[str] = None,
+        uuid: Optional[str] = None,
+        ignore_if_exists: bool = True,
+        **kwargs,
+    ) -> Project:
+        namespace = self.get_namespace(namespace_name)
+        query = self._projects_insert().values(
+            namespace_id=namespace.id,
+            uuid=uuid or str(uuid4()),
+            name=name,
+            created_at=datetime.now(timezone.utc),
+            description=description,
+        )
+        if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
+            # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
+            # but generic SQL does not
+            query = query.on_conflict_do_nothing(
+                index_elements=["namespace_id", "name"]
+            )
+        self.db.execute(query)
+
+        return self.get_project(name, namespace.name)
+
+    def get_project(self, name: str, namespace_name: str, conn=None) -> Project:
+        """Gets a single project inside some namespace by name"""
+        n = self._namespaces
+        p = self._projects
+
+        query = self._projects_select(
+            *(getattr(n.c, f) for f in self._namespaces_fields),
+            *(getattr(p.c, f) for f in self._projects_fields),
+        )
+        query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
+            p.c.name == name, n.c.name == namespace_name
+        )
+
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            raise ProjectNotFoundError(
+                f"Project {name} in namespace {namespace_name} not found."
+            )
+        return self.project_class.parse(*rows[0])
+
+    def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
+        """
+        Gets a list of projects inside some namespace, or in all namespaces
+        """
+        n = self._namespaces
+        p = self._projects
+
+        query = self._projects_select(
+            *(getattr(n.c, f) for f in self._namespaces_fields),
+            *(getattr(p.c, f) for f in self._projects_fields),
+        )
+        query = query.select_from(n.join(p, n.c.id == p.c.namespace_id))
+
+        if namespace_id:
+            query = query.where(n.c.id == namespace_id)
+
+        rows = list(self.db.execute(query, conn=conn))
+
+        return [self.project_class.parse(*r) for r in rows]
+
     #
     # Datasets
     #
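
Because a conflicting insert is silently skipped and the method then re-reads the row by name, `create_namespace` and `create_project` behave as get-or-create operations when `ignore_if_exists=True`. A sketch of relying on that (the helper is hypothetical):

    def get_or_create_project(metastore, namespace_name, project_name):
        # Both calls are safe to repeat: on conflict the INSERT is a no-op and
        # the trailing get_namespace()/get_project() returns the existing row.
        metastore.create_namespace(namespace_name)
        return metastore.create_project(project_name, namespace_name)
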
@@ -517,6 +835,7 @@ class AbstractDBMetastore(AbstractMetastore):
     def create_dataset(
         self,
         name: str,
+        project_id: Optional[int] = None,
         status: int = DatasetStatus.CREATED,
         sources: Optional[list[str]] = None,
         feature_schema: Optional[dict] = None,
@@ -528,9 +847,11 @@ class AbstractDBMetastore(AbstractMetastore):
         **kwargs,  # TODO registered = True / False
     ) -> DatasetRecord:
         """Creates new dataset."""
-
+        project_id = project_id or self.default_project.id
+
         query = self._datasets_insert().values(
             name=name,
+            project_id=project_id,
             status=status,
             feature_schema=json.dumps(feature_schema or {}),
             created_at=datetime.now(timezone.utc),
@@ -546,10 +867,10 @@ class AbstractDBMetastore(AbstractMetastore):
         if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
             # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
             # but generic SQL does not
-            query = query.on_conflict_do_nothing(index_elements=["name"])
+            query = query.on_conflict_do_nothing(index_elements=["project_id", "name"])
         self.db.execute(query)

-        return self.get_dataset(name)
+        return self.get_dataset(name, project_id)

     def create_dataset_version(  # noqa: PLR0913
         self,
@@ -606,7 +927,7 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         self.db.execute(query, conn=conn)

-        return self.get_dataset(dataset.name, conn=conn)
+        return self.get_dataset(dataset.name, dataset.project.id, conn=conn)

     def remove_dataset(self, dataset: DatasetRecord) -> None:
         """Removes dataset."""
@@ -744,13 +1065,15 @@ class AbstractDBMetastore(AbstractMetastore):

     def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
         # grouping rows by dataset id
-        for _, g in groupby(rows, lambda r: r[0]):
+        for _, g in groupby(rows, lambda r: r[11]):
             dataset = self._parse_list_dataset(list(g))
             if dataset:
                 yield dataset

     def _get_dataset_query(
         self,
+        namespace_fields: list[str],
+        project_fields: list[str],
         dataset_fields: list[str],
         dataset_version_fields: list[str],
         isouter: bool = True,
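
The grouping key moves from the first column to index 11 because every result row is now prefixed by the joined namespace and project columns: five namespace fields (`id`, `uuid`, `name`, `description`, `created_at`) plus six project fields (the same five plus `namespace_id`; the unnamed `UniqueConstraint` is dropped by the `if c.name` filter in `_projects_fields`). A quick check of that arithmetic:

    namespace_fields = ["id", "uuid", "name", "description", "created_at"]
    project_fields = namespace_fields + ["namespace_id"]

    # Dataset columns start right after the namespace and project columns,
    # so the dataset "id" used as the groupby key sits at index 11.
    assert len(namespace_fields) + len(project_fields) == 11
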
@@ -761,48 +1084,81 @@ class AbstractDBMetastore(AbstractMetastore):
         ):
             raise TableMissingError

+        n = self._namespaces
+        p = self._projects
         d = self._datasets
         dv = self._datasets_versions

         query = self._datasets_select(
+            *(getattr(n.c, f) for f in namespace_fields),
+            *(getattr(p.c, f) for f in project_fields),
             *(getattr(d.c, f) for f in dataset_fields),
             *(getattr(dv.c, f) for f in dataset_version_fields),
         )
-        j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
+        j = (
+            n.join(p, n.c.id == p.c.namespace_id)
+            .join(d, p.c.id == d.c.project_id)
+            .join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
+        )
         return query.select_from(j)

     def _base_dataset_query(self) -> "Select":
         return self._get_dataset_query(
-            self._dataset_fields, self._dataset_version_fields
+            self._namespaces_fields,
+            self._projects_fields,
+            self._dataset_fields,
+            self._dataset_version_fields,
         )

     def _base_list_datasets_query(self) -> "Select":
         return self._get_dataset_query(
-            self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
+            self._namespaces_fields,
+            self._projects_fields,
+            self._dataset_list_fields,
+            self._dataset_list_version_fields,
+            isouter=False,
         )

-    def list_datasets(self) -> Iterator["DatasetListRecord"]:
+    def list_datasets(
+        self, project_id: Optional[int] = None
+    ) -> Iterator["DatasetListRecord"]:
         """Lists all datasets."""
+        d = self._datasets
         query = self._base_list_datasets_query().order_by(
             self._datasets.c.name, self._datasets_versions.c.version
         )
+        if project_id:
+            query = query.where(d.c.project_id == project_id)
         yield from self._parse_dataset_list(self.db.execute(query))

     def list_datasets_by_prefix(
-        self, prefix: str, conn=None
+        self, prefix: str, project_id: Optional[int] = None, conn=None
     ) -> Iterator["DatasetListRecord"]:
+        d = self._datasets
         query = self._base_list_datasets_query()
+        if project_id:
+            query = query.where(d.c.project_id == project_id)
         query = query.where(self._datasets.c.name.startswith(prefix))
         yield from self._parse_dataset_list(self.db.execute(query))

-    def get_dataset(self, name: str, conn=None) -> DatasetRecord:
-        """Gets a single dataset by name."""
+    def get_dataset(
+        self,
+        name: str,  # normal, not full dataset name
+        project_id: Optional[int] = None,
+        conn=None,
+    ) -> DatasetRecord:
+        """
+        Gets a single dataset in project by dataset name.
+        """
+        project_id = project_id or self.default_project.id
         d = self._datasets
         query = self._base_dataset_query()
-        query = query.where(d.c.name == name)  # type: ignore [attr-defined]
+        query = query.where(d.c.name == name, d.c.project_id == project_id)  # type: ignore [attr-defined]
         ds = self._parse_dataset(self.db.execute(query, conn=conn))
         if not ds:
-            raise DatasetNotFoundError(f"Dataset {name} not found.")
+            raise DatasetNotFoundError(
+                f"Dataset {name} not found in project {project_id}"
+            )
         return ds

     def remove_dataset_version(
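
Every dataset query now walks the full namespaces → projects → datasets chain before the (still optional) versions join. A standalone sketch of the same join shape, with stand-in tables trimmed to their join keys:

    import sqlalchemy as sa

    md = sa.MetaData()
    n = sa.Table("namespaces", md, sa.Column("id", sa.Integer, primary_key=True))
    p = sa.Table(
        "projects", md,
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("namespace_id", sa.Integer),
    )
    d = sa.Table(
        "datasets", md,
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("project_id", sa.Integer),
    )
    dv = sa.Table(
        "datasets_versions", md,
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("dataset_id", sa.Integer),
    )

    j = (
        n.join(p, n.c.id == p.c.namespace_id)
        .join(d, p.c.id == d.c.project_id)
        .join(dv, d.c.id == dv.c.dataset_id, isouter=True)
    )
    # Renders as: namespaces JOIN projects JOIN datasets
    #             LEFT OUTER JOIN datasets_versions
    print(sa.select(d.c.id).select_from(j))
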
@@ -872,23 +1228,20 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     def add_dataset_dependency(
         self,
-        source_dataset_name: str,
+        source_dataset: "DatasetRecord",
         source_dataset_version: str,
-        dataset_name: str,
-        dataset_version: str,
+        dep_dataset: "DatasetRecord",
+        dep_dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""
-        source_dataset = self.get_dataset(source_dataset_name)
-        dataset = self.get_dataset(dataset_name)
-
         self.db.execute(
             self._datasets_dependencies_insert().values(
                 source_dataset_id=source_dataset.id,
                 source_dataset_version_id=(
                     source_dataset.get_version(source_dataset_version).id
                 ),
-                dataset_id=dataset.id,
-                dataset_version_id=dataset.get_version(dataset_version).id,
+                dataset_id=dep_dataset.id,
+                dataset_version_id=dep_dataset.get_version(dep_dataset_version).id,
             )
         )

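
Call sites migrate from passing dataset names (which forced extra lookups inside the method) to passing already-resolved records. A hedged before/after sketch (the names and the `project` variable are illustrative):

    def rewire_dependency(metastore, project):
        # 0.19.1 took names and versions and resolved them internally:
        #   metastore.add_dataset_dependency("src", "1.0.0", "dep", "1.0.0")
        # 0.20.0 takes resolved records, so the (name, project) lookup
        # happens once, at the call site:
        source = metastore.get_dataset("src", project.id)
        dep = metastore.get_dataset("dep", project.id)
        metastore.add_dataset_dependency(source, "1.0.0", dep, "1.0.0")
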
@@ -930,6 +1283,8 @@ class AbstractDBMetastore(AbstractMetastore):
     def get_direct_dataset_dependencies(
         self, dataset: DatasetRecord, version: str
     ) -> list[Optional[DatasetDependency]]:
+        n = self._namespaces
+        p = self._projects
         d = self._datasets
         dd = self._datasets_dependencies
         dv = self._datasets_versions
@@ -941,18 +1296,16 @@ class AbstractDBMetastore(AbstractMetastore):
         query = (
             self._datasets_dependencies_select(*select_cols)
             .select_from(
-                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True).join(
-                    dv, dd.c.dataset_version_id == dv.c.id, isouter=True
-                )
+                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
+                .join(dv, dd.c.dataset_version_id == dv.c.id, isouter=True)
+                .join(p, d.c.project_id == p.c.id, isouter=True)
+                .join(n, p.c.namespace_id == n.c.id, isouter=True)
             )
             .where(
                 (dd.c.source_dataset_id == dataset.id)
                 & (dd.c.source_dataset_version_id == dataset_version.id)
            )
         )
-        if version:
-            dataset_version = dataset.get_version(version)
-            query = query.where(dd.c.source_dataset_version_id == dataset_version.id)

         return [self.dependency_class.parse(*r) for r in self.db.execute(query)]

datachain/data_storage/schema.py CHANGED

@@ -13,7 +13,16 @@ from sqlalchemy.sql import func as f
 from sqlalchemy.sql.expression import false, null, true

 from datachain.sql.functions import path as pathfunc
-from datachain.sql.types import Int, SQLType, UInt64
+from datachain.sql.types import (
+    JSON,
+    Boolean,
+    DateTime,
+    Int,
+    Int64,
+    SQLType,
+    String,
+    UInt64,
+)

 if TYPE_CHECKING:
     from sqlalchemy.engine.interfaces import Dialect
@@ -272,6 +281,19 @@ class DataTable:
         ),
     ]

+    @classmethod
+    def listing_columns(cls):
+        return [
+            sa.Column("file__source", String()),
+            sa.Column("file__path", String()),
+            sa.Column("file__size", Int64()),
+            sa.Column("file__version", String()),
+            sa.Column("file__etag", String()),
+            sa.Column("file__is_latest", Boolean()),
+            sa.Column("file__last_modified", DateTime()),
+            sa.Column("file__location", JSON()),
+        ]
+
     def dir_expansion(self):
         return DirExpansion(self.column)

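
The new `listing_columns` enumerate the flattened `file` object fields a listing table carries. A small sketch of assembling such a table from them (the table name is hypothetical):

    import sqlalchemy as sa

    from datachain.data_storage.schema import DataTable

    listing = sa.Table(
        "listing_example",  # hypothetical name
        sa.MetaData(),
        *DataTable.listing_columns(),
    )
    print([c.name for c in listing.columns])
    # ['file__source', 'file__path', 'file__size', 'file__version',
    #  'file__etag', 'file__is_latest', 'file__last_modified', 'file__location']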