datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


Files changed (49)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +213 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +35 -26
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +433 -37
  12. datachain/data_storage/sqlite.py +140 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +128 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +253 -91
  21. datachain/lib/dc/datasets.py +103 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +2 -1
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/signal_schema.py +8 -0
  33. datachain/lib/tar.py +1 -2
  34. datachain/lib/udf.py +1 -1
  35. datachain/lib/udf_signature.py +1 -1
  36. datachain/lib/webdataset.py +30 -20
  37. datachain/listing.py +3 -1
  38. datachain/namespace.py +65 -0
  39. datachain/project.py +78 -0
  40. datachain/query/dataset.py +71 -46
  41. datachain/query/session.py +1 -1
  42. datachain/remote/studio.py +61 -26
  43. datachain/studio.py +23 -6
  44. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
  45. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
  46. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
  49. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
@@ -37,9 +37,13 @@ from datachain.dataset import (
 from datachain.error import (
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
+    NamespaceNotFoundError,
+    ProjectNotFoundError,
     TableMissingError,
 )
 from datachain.job import Job
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.utils import JSONSerialize

 if TYPE_CHECKING:
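
The two new exception classes back the namespace and project lookups added in this release (datachain/error.py grows by 36 lines). A minimal sketch of handling them when resolving a project; the resolve_project helper and its metastore argument are illustrative, while the exception classes and metastore methods come from the diff:

    from datachain.error import NamespaceNotFoundError, ProjectNotFoundError

    def resolve_project(metastore, namespace_name: str, project_name: str):
        # Look the project up; fall back to creating it only where this
        # metastore allows it (see project_allowed_to_create below).
        try:
            return metastore.get_project(project_name, namespace_name)
        except NamespaceNotFoundError:
            # The namespace itself is missing, so the project cannot exist.
            raise
        except ProjectNotFoundError:
            if metastore.project_allowed_to_create:
                return metastore.create_project(namespace_name, project_name)
            raise
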
@@ -61,6 +65,8 @@ class AbstractMetastore(ABC, Serializable):
     uri: StorageURI

     schema: "schema.Schema"
+    namespace_class: type[Namespace] = Namespace
+    project_class: type[Project] = Project
     dataset_class: type[DatasetRecord] = DatasetRecord
     dataset_list_class: type[DatasetListRecord] = DatasetListRecord
     dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
@@ -107,13 +113,116 @@ class AbstractMetastore(ABC, Serializable):
         """Cleanup for tests."""

     #
-    # Datasets
+    # Namespaces
+    #
+
+    @property
+    @abstractmethod
+    def default_namespace_name(self):
+        """Gets default namespace name"""
+
+    @property
+    def system_namespace_name(self):
+        return Namespace.system()
+
+    @abstractmethod
+    def create_namespace(
+        self,
+        name: str,
+        description: Optional[str] = None,
+        uuid: Optional[str] = None,
+        ignore_if_exists: bool = True,
+        validate: bool = True,
+        **kwargs,
+    ) -> Namespace:
+        """Creates new namespace"""
+
+    @abstractmethod
+    def get_namespace(self, name: str, conn=None) -> Namespace:
+        """Gets a single namespace by name"""
+
+    @abstractmethod
+    def list_namespaces(self, conn=None) -> list[Namespace]:
+        """Gets a list of all namespaces"""
+
+    @property
+    @abstractmethod
+    def is_studio(self) -> bool:
+        """Returns True if this code is ran in Studio"""
+
+    def is_local_dataset(self, dataset_namespace: str) -> bool:
+        """
+        Returns True if this is local dataset i.e. not pulled from Studio but
+        created locally. This is False if we ran code in CLI mode but using dataset
+        names that are present in Studio.
+        """
+        return self.is_studio or dataset_namespace == Namespace.default()
+
+    @property
+    def namespace_allowed_to_create(self):
+        return self.is_studio
+
+    #
+    # Projects
     #

+    @property
+    @abstractmethod
+    def default_project_name(self):
+        """Gets default project name"""
+
+    @property
+    def listing_project_name(self):
+        return Project.listing()
+
+    @cached_property
+    def default_project(self) -> Project:
+        return self.get_project(
+            self.default_project_name, self.default_namespace_name, create=True
+        )
+
+    @cached_property
+    def listing_project(self) -> Project:
+        return self.get_project(self.listing_project_name, self.system_namespace_name)
+
+    @abstractmethod
+    def create_project(
+        self,
+        namespace_name: str,
+        name: str,
+        description: Optional[str] = None,
+        uuid: Optional[str] = None,
+        ignore_if_exists: bool = True,
+        validate: bool = True,
+        **kwargs,
+    ) -> Project:
+        """Creates new project in specific namespace"""
+
+    @abstractmethod
+    def get_project(
+        self, name: str, namespace_name: str, create: bool = False, conn=None
+    ) -> Project:
+        """
+        Gets a single project inside some namespace by name.
+        It also creates project if not found and create flag is set to True.
+        """
+
+    @abstractmethod
+    def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
+        """Gets list of projects in some namespace or in general (in all namespaces)"""
+
+    @property
+    def project_allowed_to_create(self):
+        return self.is_studio
+
+    #
+    # Datasets
+    #
     @abstractmethod
     def create_dataset(
         self,
         name: str,
+        project_id: Optional[int] = None,
         status: int = DatasetStatus.CREATED,
         sources: Optional[list[str]] = None,
         feature_schema: Optional[dict] = None,
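
Together, the new abstract methods give datasets a two-level home: a namespace contains projects, and every dataset now belongs to a project via project_id. A hedged sketch of the intended call sequence; the helper and its metastore argument are assumptions, the methods themselves are declared above:

    def create_in_default_project(metastore, name: str):
        # default_project is a cached_property that resolves (and, with
        # create=True, lazily creates) the default namespace/project pair.
        project = metastore.default_project
        dataset = metastore.create_dataset(name, project_id=project.id)
        # Listing can be scoped to one project or span all projects.
        return dataset, list(metastore.list_datasets(project_id=project.id))
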
@@ -173,15 +282,22 @@ class AbstractMetastore(ABC, Serializable):
         """

     @abstractmethod
-    def list_datasets(self) -> Iterator[DatasetListRecord]:
-        """Lists all datasets."""
+    def list_datasets(
+        self, project_id: Optional[int] = None
+    ) -> Iterator[DatasetListRecord]:
+        """Lists all datasets in some project or in all projects."""

     @abstractmethod
-    def list_datasets_by_prefix(self, prefix: str) -> Iterator["DatasetListRecord"]:
-        """Lists all datasets which names start with prefix."""
+    def list_datasets_by_prefix(
+        self, prefix: str, project_id: Optional[int] = None
+    ) -> Iterator["DatasetListRecord"]:
+        """
+        Lists all datasets which names start with prefix in some project or in all
+        projects.
+        """

     @abstractmethod
-    def get_dataset(self, name: str) -> DatasetRecord:
+    def get_dataset(self, name: str, project_id: Optional[int] = None) -> DatasetRecord:
         """Gets a single dataset by name."""

     @abstractmethod
@@ -202,10 +318,10 @@ class AbstractMetastore(ABC, Serializable):
     @abstractmethod
     def add_dataset_dependency(
         self,
-        source_dataset_name: str,
+        source_dataset: "DatasetRecord",
         source_dataset_version: str,
-        dataset_name: str,
-        dataset_version: str,
+        dep_dataset: "DatasetRecord",
+        dep_dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""

@@ -304,6 +420,8 @@ class AbstractDBMetastore(AbstractMetastore):
     and has shared logic for all database systems currently in use.
     """

+    NAMESPACE_TABLE = "namespaces"
+    PROJECT_TABLE = "projects"
     DATASET_TABLE = "datasets"
     DATASET_VERSION_TABLE = "datasets_versions"
     DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
@@ -322,11 +440,62 @@ class AbstractDBMetastore(AbstractMetastore):
     def cleanup_tables(self, temp_table_names: list[str]) -> None:
         """Cleanup temp tables."""

+    @classmethod
+    def _namespaces_columns(cls) -> list["SchemaItem"]:
+        """Namespace table columns."""
+        return [
+            Column("id", Integer, primary_key=True),
+            Column("uuid", Text, nullable=False, default=uuid4()),
+            Column("name", Text, nullable=False),
+            Column("description", Text),
+            Column("created_at", DateTime(timezone=True)),
+        ]
+
+    @cached_property
+    def _namespaces_fields(self) -> list[str]:
+        return [
+            c.name  # type: ignore [attr-defined]
+            for c in self._namespaces_columns()
+            if c.name  # type: ignore [attr-defined]
+        ]
+
+    @classmethod
+    def _projects_columns(cls) -> list["SchemaItem"]:
+        """Project table columns."""
+        return [
+            Column("id", Integer, primary_key=True),
+            Column("uuid", Text, nullable=False, default=uuid4()),
+            Column("name", Text, nullable=False),
+            Column("description", Text),
+            Column("created_at", DateTime(timezone=True)),
+            Column(
+                "namespace_id",
+                Integer,
+                ForeignKey(f"{cls.NAMESPACE_TABLE}.id", ondelete="CASCADE"),
+                nullable=False,
+            ),
+            UniqueConstraint("namespace_id", "name"),
+        ]
+
+    @cached_property
+    def _projects_fields(self) -> list[str]:
+        return [
+            c.name  # type: ignore [attr-defined]
+            for c in self._projects_columns()
+            if c.name  # type: ignore [attr-defined]
+        ]
+
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
         return [
             Column("id", Integer, primary_key=True),
+            Column(
+                "project_id",
+                Integer,
+                ForeignKey(f"{cls.PROJECT_TABLE}.id", ondelete="CASCADE"),
+                nullable=False,
+            ),
             Column("name", Text, nullable=False),
             Column("description", Text),
             Column("attrs", JSON, nullable=True),
@@ -445,6 +614,16 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Query Tables
     #
+    @cached_property
+    def _namespaces(self) -> Table:
+        return Table(
+            self.NAMESPACE_TABLE, self.db.metadata, *self._namespaces_columns()
+        )
+
+    @cached_property
+    def _projects(self) -> Table:
+        return Table(self.PROJECT_TABLE, self.db.metadata, *self._projects_columns())
+
     @cached_property
     def _datasets(self) -> Table:
         return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
@@ -468,6 +647,34 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Query Starters (These can be overridden by subclasses)
     #
+    @abstractmethod
+    def _namespaces_insert(self) -> "Insert": ...
+
+    def _namespaces_select(self, *columns) -> "Select":
+        if not columns:
+            return self._namespaces.select()
+        return select(*columns)
+
+    def _namespaces_update(self) -> "Update":
+        return self._namespaces.update()
+
+    def _namespaces_delete(self) -> "Delete":
+        return self._namespaces.delete()
+
+    @abstractmethod
+    def _projects_insert(self) -> "Insert": ...
+
+    def _projects_select(self, *columns) -> "Select":
+        if not columns:
+            return self._projects.select()
+        return select(*columns)
+
+    def _projects_update(self) -> "Update":
+        return self._projects.update()
+
+    def _projects_delete(self) -> "Delete":
+        return self._projects.delete()
+
     @abstractmethod
     def _datasets_insert(self) -> "Insert": ...

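
Only the *_insert starters are abstract: select/update/delete are dialect-neutral, while conflict handling is not, so each backend supplies its own Insert construct (this release also changes datachain/data_storage/sqlite.py). A sketch of the dialect difference that the hasattr(query, "on_conflict_do_nothing") checks below rely on; the throwaway table is illustrative:

    from sqlalchemy import Column, Integer, MetaData, Table, Text
    from sqlalchemy.dialects import sqlite

    namespaces = Table(
        "namespaces", MetaData(),
        Column("id", Integer, primary_key=True),
        Column("name", Text, nullable=False, unique=True),
    )

    # A generic insert() has no on_conflict_do_nothing(); the SQLite (and
    # PostgreSQL) dialect Inserts do, which is exactly what the hasattr()
    # checks in create_namespace()/create_project() detect.
    query = sqlite.insert(namespaces).values(name="dev")
    query = query.on_conflict_do_nothing(index_elements=["name"])
    print(query.compile(dialect=sqlite.dialect()))
    # INSERT INTO namespaces (name) VALUES (?) ON CONFLICT (name) DO NOTHING
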
@@ -510,6 +717,160 @@ class AbstractDBMetastore(AbstractMetastore):
     def _datasets_dependencies_delete(self) -> "Delete":
         return self._datasets_dependencies.delete()

+    #
+    # Namespaces
+    #
+
+    def create_namespace(
+        self,
+        name: str,
+        description: Optional[str] = None,
+        uuid: Optional[str] = None,
+        ignore_if_exists: bool = True,
+        validate: bool = True,
+        **kwargs,
+    ) -> Namespace:
+        if validate:
+            Namespace.validate_name(name)
+        query = self._namespaces_insert().values(
+            name=name,
+            uuid=uuid or str(uuid4()),
+            created_at=datetime.now(timezone.utc),
+            description=description,
+        )
+        if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
+            # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
+            # but generic SQL does not
+            query = query.on_conflict_do_nothing(index_elements=["name"])
+        self.db.execute(query)
+
+        return self.get_namespace(name)
+
+    def get_namespace(self, name: str, conn=None) -> Namespace:
+        """Gets a single namespace by name"""
+        n = self._namespaces
+
+        query = self._namespaces_select(
+            *(getattr(n.c, f) for f in self._namespaces_fields),
+        ).where(n.c.name == name)
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            raise NamespaceNotFoundError(f"Namespace {name} not found.")
+        return self.namespace_class.parse(*rows[0])
+
+    def list_namespaces(self, conn=None) -> list[Namespace]:
+        """Gets a list of all namespaces"""
+        n = self._namespaces
+
+        query = self._namespaces_select(
+            *(getattr(n.c, f) for f in self._namespaces_fields),
+        )
+        rows = list(self.db.execute(query, conn=conn))
+
+        return [self.namespace_class.parse(*r) for r in rows]
+
+    #
+    # Projects
+    #
+
+    def create_project(
+        self,
+        namespace_name: str,
+        name: str,
+        description: Optional[str] = None,
+        uuid: Optional[str] = None,
+        ignore_if_exists: bool = True,
+        validate: bool = True,
+        **kwargs,
+    ) -> Project:
+        if validate:
+            Project.validate_name(name)
+        try:
+            namespace = self.get_namespace(namespace_name)
+        except NamespaceNotFoundError:
+            namespace = self.create_namespace(namespace_name, validate=validate)
+
+        query = self._projects_insert().values(
+            namespace_id=namespace.id,
+            uuid=uuid or str(uuid4()),
+            name=name,
+            created_at=datetime.now(timezone.utc),
+            description=description,
+        )
+        if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
+            # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
+            # but generic SQL does not
+            query = query.on_conflict_do_nothing(
+                index_elements=["namespace_id", "name"]
+            )
+        self.db.execute(query)
+
+        return self.get_project(name, namespace.name)
+
+    def _is_listing_project(self, project_name: str, namespace_name: str) -> bool:
+        return (
+            project_name == self.listing_project_name
+            and namespace_name == self.system_namespace_name
+        )
+
+    def _is_default_project(self, project_name: str, namespace_name: str) -> bool:
+        return (
+            project_name == self.default_project_name
+            and namespace_name == self.default_namespace_name
+        )
+
+    def get_project(
+        self, name: str, namespace_name: str, create: bool = False, conn=None
+    ) -> Project:
+        """Gets a single project inside some namespace by name"""
+        n = self._namespaces
+        p = self._projects
+        validate = True
+
+        if self._is_listing_project(name, namespace_name) or self._is_default_project(
+            name, namespace_name
+        ):
+            # we are always creating default and listing projects if they don't exist
+            create = True
+            validate = False
+
+        query = self._projects_select(
+            *(getattr(n.c, f) for f in self._namespaces_fields),
+            *(getattr(p.c, f) for f in self._projects_fields),
+        )
+        query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
+            p.c.name == name, n.c.name == namespace_name
+        )
+
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            if create:
+                return self.create_project(namespace_name, name, validate=validate)
+            raise ProjectNotFoundError(
+                f"Project {name} in namespace {namespace_name} not found."
+            )
+        return self.project_class.parse(*rows[0])
+
+    def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
+        """
+        Gets a list of projects inside some namespace, or in all namespaces
+        """
+        n = self._namespaces
+        p = self._projects
+
+        query = self._projects_select(
+            *(getattr(n.c, f) for f in self._namespaces_fields),
+            *(getattr(p.c, f) for f in self._projects_fields),
+        )
+        query = query.select_from(n.join(p, n.c.id == p.c.namespace_id))
+
+        if namespace_id:
+            query = query.where(n.c.id == namespace_id)
+
+        rows = list(self.db.execute(query, conn=conn))
+
+        return [self.project_class.parse(*r) for r in rows]
+
     #
     # Datasets
     #
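
Three behaviors of these implementations are easy to miss: creation is idempotent (an insert that does nothing on conflict, followed by a re-read), create_project auto-creates a missing namespace, and get_project force-creates the built-in default and listing projects regardless of the create flag. A sketch, with the metastore argument and the namespace/project names assumed:

    def demo(metastore):
        # Idempotent: the second call's INSERT is a no-op on conflict and
        # the existing row is re-read afterwards.
        first = metastore.create_namespace("team-a")
        again = metastore.create_namespace("team-a")
        assert first.id == again.id

        # A missing namespace is created on the fly by create_project().
        metastore.create_project("team-b", "experiments")

        # The built-in default project is created even without create=True.
        return metastore.get_project(
            metastore.default_project_name, metastore.default_namespace_name
        )
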
@@ -517,6 +878,7 @@ class AbstractDBMetastore(AbstractMetastore):
     def create_dataset(
         self,
         name: str,
+        project_id: Optional[int] = None,
         status: int = DatasetStatus.CREATED,
         sources: Optional[list[str]] = None,
         feature_schema: Optional[dict] = None,
@@ -528,9 +890,11 @@ class AbstractDBMetastore(AbstractMetastore):
         **kwargs,  # TODO registered = True / False
     ) -> DatasetRecord:
         """Creates new dataset."""
-        # TODO abstract this method and add registered = True based on kwargs
+        project_id = project_id or self.default_project.id
+
         query = self._datasets_insert().values(
             name=name,
+            project_id=project_id,
             status=status,
             feature_schema=json.dumps(feature_schema or {}),
             created_at=datetime.now(timezone.utc),
@@ -546,10 +910,10 @@ class AbstractDBMetastore(AbstractMetastore):
         if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
             # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
             # but generic SQL does not
-            query = query.on_conflict_do_nothing(index_elements=["name"])
+            query = query.on_conflict_do_nothing(index_elements=["project_id", "name"])
         self.db.execute(query)

-        return self.get_dataset(name)
+        return self.get_dataset(name, project_id)

     def create_dataset_version(  # noqa: PLR0913
         self,
@@ -606,7 +970,7 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         self.db.execute(query, conn=conn)

-        return self.get_dataset(dataset.name, conn=conn)
+        return self.get_dataset(dataset.name, dataset.project.id, conn=conn)

     def remove_dataset(self, dataset: DatasetRecord) -> None:
         """Removes dataset."""
@@ -744,13 +1108,15 @@ class AbstractDBMetastore(AbstractMetastore):

     def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
         # grouping rows by dataset id
-        for _, g in groupby(rows, lambda r: r[0]):
+        for _, g in groupby(rows, lambda r: r[11]):
             dataset = self._parse_list_dataset(list(g))
             if dataset:
                 yield dataset

     def _get_dataset_query(
         self,
+        namespace_fields: list[str],
+        project_fields: list[str],
         dataset_fields: list[str],
         dataset_version_fields: list[str],
         isouter: bool = True,
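
The groupby key moving from r[0] to r[11] follows from the widened SELECT list built by _get_dataset_query below: each row now leads with the five namespace fields (id, uuid, name, description, created_at) and the six project fields (id, uuid, name, description, created_at, namespace_id), so the dataset's own id sits at index 5 + 6 = 11. A toy illustration with invented values:

    from itertools import groupby

    # 5 namespace columns, 6 project columns, then the dataset columns.
    row = (1, "ns-uuid", "local", None, None,
           1, "prj-uuid", "default", None, None, 1,
           42, "my-dataset")
    assert row[11] == 42  # the dataset id

    for dataset_id, group in groupby([row, row], key=lambda r: r[11]):
        print(dataset_id, len(list(group)))  # one group per dataset id
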
@@ -761,48 +1127,81 @@ class AbstractDBMetastore(AbstractMetastore):
         ):
             raise TableMissingError

+        n = self._namespaces
+        p = self._projects
         d = self._datasets
         dv = self._datasets_versions

         query = self._datasets_select(
+            *(getattr(n.c, f) for f in namespace_fields),
+            *(getattr(p.c, f) for f in project_fields),
             *(getattr(d.c, f) for f in dataset_fields),
             *(getattr(dv.c, f) for f in dataset_version_fields),
         )
-        j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
+        j = (
+            n.join(p, n.c.id == p.c.namespace_id)
+            .join(d, p.c.id == d.c.project_id)
+            .join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
+        )
         return query.select_from(j)

     def _base_dataset_query(self) -> "Select":
         return self._get_dataset_query(
-            self._dataset_fields, self._dataset_version_fields
+            self._namespaces_fields,
+            self._projects_fields,
+            self._dataset_fields,
+            self._dataset_version_fields,
         )

     def _base_list_datasets_query(self) -> "Select":
         return self._get_dataset_query(
-            self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
+            self._namespaces_fields,
+            self._projects_fields,
+            self._dataset_list_fields,
+            self._dataset_list_version_fields,
+            isouter=False,
         )

-    def list_datasets(self) -> Iterator["DatasetListRecord"]:
+    def list_datasets(
+        self, project_id: Optional[int] = None
+    ) -> Iterator["DatasetListRecord"]:
         """Lists all datasets."""
+        d = self._datasets
         query = self._base_list_datasets_query().order_by(
             self._datasets.c.name, self._datasets_versions.c.version
         )
+        if project_id:
+            query = query.where(d.c.project_id == project_id)
         yield from self._parse_dataset_list(self.db.execute(query))

     def list_datasets_by_prefix(
-        self, prefix: str, conn=None
+        self, prefix: str, project_id: Optional[int] = None, conn=None
     ) -> Iterator["DatasetListRecord"]:
+        d = self._datasets
         query = self._base_list_datasets_query()
+        if project_id:
+            query = query.where(d.c.project_id == project_id)
         query = query.where(self._datasets.c.name.startswith(prefix))
         yield from self._parse_dataset_list(self.db.execute(query))

-    def get_dataset(self, name: str, conn=None) -> DatasetRecord:
-        """Gets a single dataset by name"""
+    def get_dataset(
+        self,
+        name: str,  # normal, not full dataset name
+        project_id: Optional[int] = None,
+        conn=None,
+    ) -> DatasetRecord:
+        """
+        Gets a single dataset in project by dataset name.
+        """
+        project_id = project_id or self.default_project.id
         d = self._datasets
         query = self._base_dataset_query()
-        query = query.where(d.c.name == name)  # type: ignore [attr-defined]
+        query = query.where(d.c.name == name, d.c.project_id == project_id)  # type: ignore [attr-defined]
         ds = self._parse_dataset(self.db.execute(query, conn=conn))
         if not ds:
-            raise DatasetNotFoundError(f"Dataset {name} not found.")
+            raise DatasetNotFoundError(
+                f"Dataset {name} not found in project {project_id}"
+            )
         return ds

     def remove_dataset_version(
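
With the added project_id filter, a bare dataset name is no longer globally unique: lookups resolve within one project and fall back to the default project when project_id is omitted. A small sketch; the helper and its metastore argument are assumptions:

    from datachain.error import DatasetNotFoundError

    def find_dataset(metastore, name: str, project_id=None):
        # Names are unique per project now; omitting project_id resolves
        # against the default project.
        try:
            return metastore.get_dataset(name, project_id)
        except DatasetNotFoundError:
            return None
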
@@ -872,23 +1271,20 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     def add_dataset_dependency(
         self,
-        source_dataset_name: str,
+        source_dataset: "DatasetRecord",
         source_dataset_version: str,
-        dataset_name: str,
-        dataset_version: str,
+        dep_dataset: "DatasetRecord",
+        dep_dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""
-        source_dataset = self.get_dataset(source_dataset_name)
-        dataset = self.get_dataset(dataset_name)
-
         self.db.execute(
             self._datasets_dependencies_insert().values(
                 source_dataset_id=source_dataset.id,
                 source_dataset_version_id=(
                     source_dataset.get_version(source_dataset_version).id
                 ),
-                dataset_id=dataset.id,
-                dataset_version_id=dataset.get_version(dataset_version).id,
+                dataset_id=dep_dataset.id,
+                dataset_version_id=dep_dataset.get_version(dep_dataset_version).id,
             )
         )

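
add_dataset_dependency now takes resolved DatasetRecord objects instead of names, dropping the two internal get_dataset calls, which would otherwise also need project ids to stay unambiguous. A sketch with hypothetical dataset names; the helper is illustrative:

    def link_versions(metastore, project_id: int):
        # "features" depends on "raw-files": resolve both records first,
        # then record the edge between concrete versions.
        downstream = metastore.get_dataset("features", project_id=project_id)
        upstream = metastore.get_dataset("raw-files", project_id=project_id)
        metastore.add_dataset_dependency(downstream, "1.0.0", upstream, "2.0.0")
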
@@ -930,6 +1326,8 @@ class AbstractDBMetastore(AbstractMetastore):
     def get_direct_dataset_dependencies(
         self, dataset: DatasetRecord, version: str
     ) -> list[Optional[DatasetDependency]]:
+        n = self._namespaces
+        p = self._projects
         d = self._datasets
         dd = self._datasets_dependencies
         dv = self._datasets_versions
@@ -941,18 +1339,16 @@ class AbstractDBMetastore(AbstractMetastore):
         query = (
             self._datasets_dependencies_select(*select_cols)
             .select_from(
-                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True).join(
-                    dv, dd.c.dataset_version_id == dv.c.id, isouter=True
-                )
+                dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
+                .join(dv, dd.c.dataset_version_id == dv.c.id, isouter=True)
+                .join(p, d.c.project_id == p.c.id, isouter=True)
+                .join(n, p.c.namespace_id == n.c.id, isouter=True)
             )
             .where(
                 (dd.c.source_dataset_id == dataset.id)
                 & (dd.c.source_dataset_version_id == dataset_version.id)
             )
         )
-        if version:
-            dataset_version = dataset.get_version(version)
-            query = query.where(dd.c.source_dataset_version_id == dataset_version.id)

         return [self.dependency_class.parse(*r) for r in self.db.execute(query)]