datachain-0.21.0-py3-none-any.whl → datachain-0.22.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (48)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +180 -65
  4. datachain/cli/__init__.py +4 -9
  5. datachain/cli/commands/datasets.py +43 -28
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +422 -37
  12. datachain/data_storage/sqlite.py +136 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +126 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +260 -92
  21. datachain/lib/dc/datasets.py +104 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +1 -0
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/tar.py +1 -2
  33. datachain/lib/udf.py +1 -1
  34. datachain/lib/udf_signature.py +1 -1
  35. datachain/lib/webdataset.py +30 -20
  36. datachain/listing.py +3 -1
  37. datachain/namespace.py +65 -0
  38. datachain/project.py +78 -0
  39. datachain/query/dataset.py +71 -46
  40. datachain/query/session.py +1 -1
  41. datachain/remote/studio.py +61 -26
  42. datachain/studio.py +36 -10
  43. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
  44. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
  45. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
  46. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
  47. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
  48. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
@@ -37,9 +37,13 @@ from datachain.dataset import (
37
37
  from datachain.error import (
38
38
  DatasetNotFoundError,
39
39
  DatasetVersionNotFoundError,
40
+ NamespaceNotFoundError,
41
+ ProjectNotFoundError,
40
42
  TableMissingError,
41
43
  )
42
44
  from datachain.job import Job
45
+ from datachain.namespace import Namespace
46
+ from datachain.project import Project
43
47
  from datachain.utils import JSONSerialize
44
48
 
45
49
  if TYPE_CHECKING:
@@ -61,6 +65,8 @@ class AbstractMetastore(ABC, Serializable):
61
65
  uri: StorageURI
62
66
 
63
67
  schema: "schema.Schema"
68
+ namespace_class: type[Namespace] = Namespace
69
+ project_class: type[Project] = Project
64
70
  dataset_class: type[DatasetRecord] = DatasetRecord
65
71
  dataset_list_class: type[DatasetListRecord] = DatasetListRecord
66
72
  dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
@@ -107,13 +113,114 @@ class AbstractMetastore(ABC, Serializable):
107
113
  """Cleanup for tests."""
108
114
 
109
115
  #
110
- # Datasets
116
+ # Namespaces
117
+ #
118
+
119
+ @property
120
+ @abstractmethod
121
+ def default_namespace_name(self):
122
+ """Gets default namespace name"""
123
+
124
+ @property
125
+ def system_namespace_name(self):
126
+ return Namespace.system()
127
+
128
+ @abstractmethod
129
+ def create_namespace(
130
+ self,
131
+ name: str,
132
+ description: Optional[str] = None,
133
+ uuid: Optional[str] = None,
134
+ ignore_if_exists: bool = True,
135
+ **kwargs,
136
+ ) -> Namespace:
137
+ """Creates new namespace"""
138
+
139
+ @abstractmethod
140
+ def get_namespace(self, name: str, conn=None) -> Namespace:
141
+ """Gets a single namespace by name"""
142
+
143
+ @abstractmethod
144
+ def list_namespaces(self, conn=None) -> list[Namespace]:
145
+ """Gets a list of all namespaces"""
146
+
147
+ @property
148
+ @abstractmethod
149
+ def is_studio(self) -> bool:
150
+ """Returns True if this code is ran in Studio"""
151
+
152
+ def is_local_dataset(self, dataset_namespace: str) -> bool:
153
+ """
154
+ Returns True if this is local dataset i.e. not pulled from Studio but
155
+ created locally. This is False if we ran code in CLI mode but using dataset
156
+ names that are present in Studio.
157
+ """
158
+ return self.is_studio or dataset_namespace == Namespace.default()
159
+
160
+ @property
161
+ def namespace_allowed_to_create(self):
162
+ return self.is_studio
163
+
164
+ #
165
+ # Projects
111
166
  #
112
167
 
168
+ @property
169
+ @abstractmethod
170
+ def default_project_name(self):
171
+ """Gets default project name"""
172
+
173
+ @property
174
+ def listing_project_name(self):
175
+ return Project.listing()
176
+
177
+ @cached_property
178
+ def default_project(self) -> Project:
179
+ return self.get_project(
180
+ self.default_project_name, self.default_namespace_name, create=True
181
+ )
182
+
183
+ @cached_property
184
+ def listing_project(self) -> Project:
185
+ return self.get_project(self.listing_project_name, self.system_namespace_name)
186
+
187
+ @abstractmethod
188
+ def create_project(
189
+ self,
190
+ namespace_name: str,
191
+ name: str,
192
+ description: Optional[str] = None,
193
+ uuid: Optional[str] = None,
194
+ ignore_if_exists: bool = True,
195
+ **kwargs,
196
+ ) -> Project:
197
+ """Creates new project in specific namespace"""
198
+
199
+ @abstractmethod
200
+ def get_project(
201
+ self, name: str, namespace_name: str, create: bool = False, conn=None
202
+ ) -> Project:
203
+ """
204
+ Gets a single project inside some namespace by name.
205
+ It also creates project if not found and create flag is set to True.
206
+ """
207
+
208
+ @abstractmethod
209
+ def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
210
+ """Gets list of projects in some namespace or in general (in all namespaces)"""
211
+
212
+ @property
213
+ def project_allowed_to_create(self):
214
+ return self.is_studio
215
+
216
+ #
217
+ # Datasets
218
+ #
113
219
  @abstractmethod
114
220
  def create_dataset(
115
221
  self,
116
222
  name: str,
223
+ project_id: Optional[int] = None,
117
224
  status: int = DatasetStatus.CREATED,
118
225
  sources: Optional[list[str]] = None,
119
226
  feature_schema: Optional[dict] = None,
@@ -173,15 +280,22 @@ class AbstractMetastore(ABC, Serializable):
173
280
  """
174
281
 
175
282
  @abstractmethod
176
- def list_datasets(self) -> Iterator[DatasetListRecord]:
177
- """Lists all datasets."""
283
+ def list_datasets(
284
+ self, project_id: Optional[int] = None
285
+ ) -> Iterator[DatasetListRecord]:
286
+ """Lists all datasets in some project or in all projects."""
178
287
 
179
288
  @abstractmethod
180
- def list_datasets_by_prefix(self, prefix: str) -> Iterator["DatasetListRecord"]:
181
- """Lists all datasets which names start with prefix."""
289
+ def list_datasets_by_prefix(
290
+ self, prefix: str, project_id: Optional[int] = None
291
+ ) -> Iterator["DatasetListRecord"]:
292
+ """
293
+ Lists all datasets which names start with prefix in some project or in all
294
+ projects.
295
+ """
182
296
 
183
297
  @abstractmethod
184
- def get_dataset(self, name: str) -> DatasetRecord:
298
+ def get_dataset(self, name: str, project_id: Optional[int] = None) -> DatasetRecord:
185
299
  """Gets a single dataset by name."""
186
300
 
187
301
  @abstractmethod
@@ -202,10 +316,10 @@ class AbstractMetastore(ABC, Serializable):
202
316
  @abstractmethod
203
317
  def add_dataset_dependency(
204
318
  self,
205
- source_dataset_name: str,
319
+ source_dataset: "DatasetRecord",
206
320
  source_dataset_version: str,
207
- dataset_name: str,
208
- dataset_version: str,
321
+ dep_dataset: "DatasetRecord",
322
+ dep_dataset_version: str,
209
323
  ) -> None:
210
324
  """Adds dataset dependency to dataset."""
211
325
 
@@ -304,6 +418,8 @@ class AbstractDBMetastore(AbstractMetastore):
304
418
  and has shared logic for all database systems currently in use.
305
419
  """
306
420
 
421
+ NAMESPACE_TABLE = "namespaces"
422
+ PROJECT_TABLE = "projects"
307
423
  DATASET_TABLE = "datasets"
308
424
  DATASET_VERSION_TABLE = "datasets_versions"
309
425
  DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
@@ -322,11 +438,62 @@ class AbstractDBMetastore(AbstractMetastore):
322
438
  def cleanup_tables(self, temp_table_names: list[str]) -> None:
323
439
  """Cleanup temp tables."""
324
440
 
441
+ @classmethod
442
+ def _namespaces_columns(cls) -> list["SchemaItem"]:
443
+ """Namespace table columns."""
444
+ return [
445
+ Column("id", Integer, primary_key=True),
446
+ Column("uuid", Text, nullable=False, default=uuid4()),
447
+ Column("name", Text, nullable=False),
448
+ Column("description", Text),
449
+ Column("created_at", DateTime(timezone=True)),
450
+ ]
451
+
452
+ @cached_property
453
+ def _namespaces_fields(self) -> list[str]:
454
+ return [
455
+ c.name # type: ignore [attr-defined]
456
+ for c in self._namespaces_columns()
457
+ if c.name # type: ignore [attr-defined]
458
+ ]
459
+
460
+ @classmethod
461
+ def _projects_columns(cls) -> list["SchemaItem"]:
462
+ """Project table columns."""
463
+ return [
464
+ Column("id", Integer, primary_key=True),
465
+ Column("uuid", Text, nullable=False, default=uuid4()),
466
+ Column("name", Text, nullable=False),
467
+ Column("description", Text),
468
+ Column("created_at", DateTime(timezone=True)),
469
+ Column(
470
+ "namespace_id",
471
+ Integer,
472
+ ForeignKey(f"{cls.NAMESPACE_TABLE}.id", ondelete="CASCADE"),
473
+ nullable=False,
474
+ ),
475
+ UniqueConstraint("namespace_id", "name"),
476
+ ]
477
+
478
+ @cached_property
479
+ def _projects_fields(self) -> list[str]:
480
+ return [
481
+ c.name # type: ignore [attr-defined]
482
+ for c in self._projects_columns()
483
+ if c.name # type: ignore [attr-defined]
484
+ ]
485
+
325
486
  @classmethod
326
487
  def _datasets_columns(cls) -> list["SchemaItem"]:
327
488
  """Datasets table columns."""
328
489
  return [
329
490
  Column("id", Integer, primary_key=True),
491
+ Column(
492
+ "project_id",
493
+ Integer,
494
+ ForeignKey(f"{cls.PROJECT_TABLE}.id", ondelete="CASCADE"),
495
+ nullable=False,
496
+ ),
330
497
  Column("name", Text, nullable=False),
331
498
  Column("description", Text),
332
499
  Column("attrs", JSON, nullable=True),
@@ -445,6 +612,16 @@ class AbstractDBMetastore(AbstractMetastore):
445
612
  #
446
613
  # Query Tables
447
614
  #
615
+ @cached_property
616
+ def _namespaces(self) -> Table:
617
+ return Table(
618
+ self.NAMESPACE_TABLE, self.db.metadata, *self._namespaces_columns()
619
+ )
620
+
621
+ @cached_property
622
+ def _projects(self) -> Table:
623
+ return Table(self.PROJECT_TABLE, self.db.metadata, *self._projects_columns())
624
+
448
625
  @cached_property
449
626
  def _datasets(self) -> Table:
450
627
  return Table(self.DATASET_TABLE, self.db.metadata, *self._datasets_columns())
@@ -468,6 +645,34 @@ class AbstractDBMetastore(AbstractMetastore):
468
645
  #
469
646
  # Query Starters (These can be overridden by subclasses)
470
647
  #
648
+ @abstractmethod
649
+ def _namespaces_insert(self) -> "Insert": ...
650
+
651
+ def _namespaces_select(self, *columns) -> "Select":
652
+ if not columns:
653
+ return self._namespaces.select()
654
+ return select(*columns)
655
+
656
+ def _namespaces_update(self) -> "Update":
657
+ return self._namespaces.update()
658
+
659
+ def _namespaces_delete(self) -> "Delete":
660
+ return self._namespaces.delete()
661
+
662
+ @abstractmethod
663
+ def _projects_insert(self) -> "Insert": ...
664
+
665
+ def _projects_select(self, *columns) -> "Select":
666
+ if not columns:
667
+ return self._projects.select()
668
+ return select(*columns)
669
+
670
+ def _projects_update(self) -> "Update":
671
+ return self._projects.update()
672
+
673
+ def _projects_delete(self) -> "Delete":
674
+ return self._projects.delete()
675
+
471
676
  @abstractmethod
472
677
  def _datasets_insert(self) -> "Insert": ...
473
678
 
@@ -510,6 +715,151 @@ class AbstractDBMetastore(AbstractMetastore):
510
715
  def _datasets_dependencies_delete(self) -> "Delete":
511
716
  return self._datasets_dependencies.delete()
512
717
 
718
+ #
719
+ # Namespaces
720
+ #
721
+
722
+ def create_namespace(
723
+ self,
724
+ name: str,
725
+ description: Optional[str] = None,
726
+ uuid: Optional[str] = None,
727
+ ignore_if_exists: bool = True,
728
+ **kwargs,
729
+ ) -> Namespace:
730
+ query = self._namespaces_insert().values(
731
+ name=name,
732
+ uuid=uuid or str(uuid4()),
733
+ created_at=datetime.now(timezone.utc),
734
+ description=description,
735
+ )
736
+ if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
737
+ # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
738
+ # but generic SQL does not
739
+ query = query.on_conflict_do_nothing(index_elements=["name"])
740
+ self.db.execute(query)
741
+
742
+ return self.get_namespace(name)
743
+
744
+ def get_namespace(self, name: str, conn=None) -> Namespace:
745
+ """Gets a single namespace by name"""
746
+ n = self._namespaces
747
+
748
+ query = self._namespaces_select(
749
+ *(getattr(n.c, f) for f in self._namespaces_fields),
750
+ ).where(n.c.name == name)
751
+ rows = list(self.db.execute(query, conn=conn))
752
+ if not rows:
753
+ raise NamespaceNotFoundError(f"Namespace {name} not found.")
754
+ return self.namespace_class.parse(*rows[0])
755
+
756
+ def list_namespaces(self, conn=None) -> list[Namespace]:
757
+ """Gets a list of all namespaces"""
758
+ n = self._namespaces
759
+
760
+ query = self._namespaces_select(
761
+ *(getattr(n.c, f) for f in self._namespaces_fields),
762
+ )
763
+ rows = list(self.db.execute(query, conn=conn))
764
+
765
+ return [self.namespace_class.parse(*r) for r in rows]
766
+
767
+ #
768
+ # Projects
769
+ #
770
+
771
+ def create_project(
772
+ self,
773
+ namespace_name: str,
774
+ name: str,
775
+ description: Optional[str] = None,
776
+ uuid: Optional[str] = None,
777
+ ignore_if_exists: bool = True,
778
+ **kwargs,
779
+ ) -> Project:
780
+ try:
781
+ namespace = self.get_namespace(namespace_name)
782
+ except NamespaceNotFoundError:
783
+ namespace = self.create_namespace(namespace_name)
784
+
785
+ query = self._projects_insert().values(
786
+ namespace_id=namespace.id,
787
+ uuid=uuid or str(uuid4()),
788
+ name=name,
789
+ created_at=datetime.now(timezone.utc),
790
+ description=description,
791
+ )
792
+ if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
793
+ # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
794
+ # but generic SQL does not
795
+ query = query.on_conflict_do_nothing(
796
+ index_elements=["namespace_id", "name"]
797
+ )
798
+ self.db.execute(query)
799
+
800
+ return self.get_project(name, namespace.name)
801
+
802
+ def _is_listing_project(self, project_name: str, namespace_name: str) -> bool:
803
+ return (
804
+ project_name == self.listing_project_name
805
+ and namespace_name == self.system_namespace_name
806
+ )
807
+
808
+ def _is_default_project(self, project_name: str, namespace_name: str) -> bool:
809
+ return (
810
+ project_name == self.default_project_name
811
+ and namespace_name == self.default_namespace_name
812
+ )
813
+
814
+ def get_project(
815
+ self, name: str, namespace_name: str, create: bool = False, conn=None
816
+ ) -> Project:
817
+ """Gets a single project inside some namespace by name"""
818
+ n = self._namespaces
819
+ p = self._projects
820
+ if self._is_listing_project(name, namespace_name) or self._is_default_project(
821
+ name, namespace_name
822
+ ):
823
+ # we are always creating default and listing projects if they don't exist
824
+ create = True
825
+
826
+ query = self._projects_select(
827
+ *(getattr(n.c, f) for f in self._namespaces_fields),
828
+ *(getattr(p.c, f) for f in self._projects_fields),
829
+ )
830
+ query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
831
+ p.c.name == name, n.c.name == namespace_name
832
+ )
833
+
834
+ rows = list(self.db.execute(query, conn=conn))
835
+ if not rows:
836
+ if create:
837
+ return self.create_project(namespace_name, name)
838
+ raise ProjectNotFoundError(
839
+ f"Project {name} in namespace {namespace_name} not found."
840
+ )
841
+ return self.project_class.parse(*rows[0])
842
+
843
+ def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
844
+ """
845
+ Gets a list of projects inside some namespace, or in all namespaces
846
+ """
847
+ n = self._namespaces
848
+ p = self._projects
849
+
850
+ query = self._projects_select(
851
+ *(getattr(n.c, f) for f in self._namespaces_fields),
852
+ *(getattr(p.c, f) for f in self._projects_fields),
853
+ )
854
+ query = query.select_from(n.join(p, n.c.id == p.c.namespace_id))
855
+
856
+ if namespace_id:
857
+ query = query.where(n.c.id == namespace_id)
858
+
859
+ rows = list(self.db.execute(query, conn=conn))
860
+
861
+ return [self.project_class.parse(*r) for r in rows]
862
+
513
863
  #
514
864
  # Datasets
515
865
  #
@@ -517,6 +867,7 @@ class AbstractDBMetastore(AbstractMetastore):
517
867
  def create_dataset(
518
868
  self,
519
869
  name: str,
870
+ project_id: Optional[int] = None,
520
871
  status: int = DatasetStatus.CREATED,
521
872
  sources: Optional[list[str]] = None,
522
873
  feature_schema: Optional[dict] = None,
@@ -528,9 +879,11 @@ class AbstractDBMetastore(AbstractMetastore):
528
879
  **kwargs, # TODO registered = True / False
529
880
  ) -> DatasetRecord:
530
881
  """Creates new dataset."""
531
- # TODO abstract this method and add registered = True based on kwargs
882
+ project_id = project_id or self.default_project.id
883
+
532
884
  query = self._datasets_insert().values(
533
885
  name=name,
886
+ project_id=project_id,
534
887
  status=status,
535
888
  feature_schema=json.dumps(feature_schema or {}),
536
889
  created_at=datetime.now(timezone.utc),
@@ -546,10 +899,10 @@ class AbstractDBMetastore(AbstractMetastore):
546
899
  if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
547
900
  # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
548
901
  # but generic SQL does not
549
- query = query.on_conflict_do_nothing(index_elements=["name"])
902
+ query = query.on_conflict_do_nothing(index_elements=["project_id", "name"])
550
903
  self.db.execute(query)
551
904
 
552
- return self.get_dataset(name)
905
+ return self.get_dataset(name, project_id)
553
906
 
554
907
  def create_dataset_version( # noqa: PLR0913
555
908
  self,
@@ -606,7 +959,7 @@ class AbstractDBMetastore(AbstractMetastore):
606
959
  )
607
960
  self.db.execute(query, conn=conn)
608
961
 
609
- return self.get_dataset(dataset.name, conn=conn)
962
+ return self.get_dataset(dataset.name, dataset.project.id, conn=conn)
610
963
 
611
964
  def remove_dataset(self, dataset: DatasetRecord) -> None:
612
965
  """Removes dataset."""
@@ -744,13 +1097,15 @@ class AbstractDBMetastore(AbstractMetastore):
744
1097
 
745
1098
  def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
746
1099
  # grouping rows by dataset id
747
- for _, g in groupby(rows, lambda r: r[0]):
1100
+ for _, g in groupby(rows, lambda r: r[11]):
748
1101
  dataset = self._parse_list_dataset(list(g))
749
1102
  if dataset:
750
1103
  yield dataset
751
1104
 
752
1105
  def _get_dataset_query(
753
1106
  self,
1107
+ namespace_fields: list[str],
1108
+ project_fields: list[str],
754
1109
  dataset_fields: list[str],
755
1110
  dataset_version_fields: list[str],
756
1111
  isouter: bool = True,
@@ -761,48 +1116,81 @@ class AbstractDBMetastore(AbstractMetastore):
761
1116
  ):
762
1117
  raise TableMissingError
763
1118
 
1119
+ n = self._namespaces
1120
+ p = self._projects
764
1121
  d = self._datasets
765
1122
  dv = self._datasets_versions
766
1123
 
767
1124
  query = self._datasets_select(
1125
+ *(getattr(n.c, f) for f in namespace_fields),
1126
+ *(getattr(p.c, f) for f in project_fields),
768
1127
  *(getattr(d.c, f) for f in dataset_fields),
769
1128
  *(getattr(dv.c, f) for f in dataset_version_fields),
770
1129
  )
771
- j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
1130
+ j = (
1131
+ n.join(p, n.c.id == p.c.namespace_id)
1132
+ .join(d, p.c.id == d.c.project_id)
1133
+ .join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
1134
+ )
772
1135
  return query.select_from(j)
773
1136
 
774
1137
  def _base_dataset_query(self) -> "Select":
775
1138
  return self._get_dataset_query(
776
- self._dataset_fields, self._dataset_version_fields
1139
+ self._namespaces_fields,
1140
+ self._projects_fields,
1141
+ self._dataset_fields,
1142
+ self._dataset_version_fields,
777
1143
  )
778
1144
 
779
1145
  def _base_list_datasets_query(self) -> "Select":
780
1146
  return self._get_dataset_query(
781
- self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
1147
+ self._namespaces_fields,
1148
+ self._projects_fields,
1149
+ self._dataset_list_fields,
1150
+ self._dataset_list_version_fields,
1151
+ isouter=False,
782
1152
  )
783
1153
 
784
- def list_datasets(self) -> Iterator["DatasetListRecord"]:
1154
+ def list_datasets(
1155
+ self, project_id: Optional[int] = None
1156
+ ) -> Iterator["DatasetListRecord"]:
785
1157
  """Lists all datasets."""
1158
+ d = self._datasets
786
1159
  query = self._base_list_datasets_query().order_by(
787
1160
  self._datasets.c.name, self._datasets_versions.c.version
788
1161
  )
1162
+ if project_id:
1163
+ query = query.where(d.c.project_id == project_id)
789
1164
  yield from self._parse_dataset_list(self.db.execute(query))
790
1165
 
791
1166
  def list_datasets_by_prefix(
792
- self, prefix: str, conn=None
1167
+ self, prefix: str, project_id: Optional[int] = None, conn=None
793
1168
  ) -> Iterator["DatasetListRecord"]:
1169
+ d = self._datasets
794
1170
  query = self._base_list_datasets_query()
1171
+ if project_id:
1172
+ query = query.where(d.c.project_id == project_id)
795
1173
  query = query.where(self._datasets.c.name.startswith(prefix))
796
1174
  yield from self._parse_dataset_list(self.db.execute(query))
797
1175
 
798
- def get_dataset(self, name: str, conn=None) -> DatasetRecord:
799
- """Gets a single dataset by name"""
1176
+ def get_dataset(
1177
+ self,
1178
+ name: str, # normal, not full dataset name
1179
+ project_id: Optional[int] = None,
1180
+ conn=None,
1181
+ ) -> DatasetRecord:
1182
+ """
1183
+ Gets a single dataset in project by dataset name.
1184
+ """
1185
+ project_id = project_id or self.default_project.id
800
1186
  d = self._datasets
801
1187
  query = self._base_dataset_query()
802
- query = query.where(d.c.name == name) # type: ignore [attr-defined]
1188
+ query = query.where(d.c.name == name, d.c.project_id == project_id) # type: ignore [attr-defined]
803
1189
  ds = self._parse_dataset(self.db.execute(query, conn=conn))
804
1190
  if not ds:
805
- raise DatasetNotFoundError(f"Dataset {name} not found.")
1191
+ raise DatasetNotFoundError(
1192
+ f"Dataset {name} not found in project {project_id}"
1193
+ )
806
1194
  return ds
807
1195
 
808
1196
  def remove_dataset_version(
@@ -872,23 +1260,20 @@ class AbstractDBMetastore(AbstractMetastore):
872
1260
  #
873
1261
  def add_dataset_dependency(
874
1262
  self,
875
- source_dataset_name: str,
1263
+ source_dataset: "DatasetRecord",
876
1264
  source_dataset_version: str,
877
- dataset_name: str,
878
- dataset_version: str,
1265
+ dep_dataset: "DatasetRecord",
1266
+ dep_dataset_version: str,
879
1267
  ) -> None:
880
1268
  """Adds dataset dependency to dataset."""
881
- source_dataset = self.get_dataset(source_dataset_name)
882
- dataset = self.get_dataset(dataset_name)
883
-
884
1269
  self.db.execute(
885
1270
  self._datasets_dependencies_insert().values(
886
1271
  source_dataset_id=source_dataset.id,
887
1272
  source_dataset_version_id=(
888
1273
  source_dataset.get_version(source_dataset_version).id
889
1274
  ),
890
- dataset_id=dataset.id,
891
- dataset_version_id=dataset.get_version(dataset_version).id,
1275
+ dataset_id=dep_dataset.id,
1276
+ dataset_version_id=dep_dataset.get_version(dep_dataset_version).id,
892
1277
  )
893
1278
  )
894
1279
 
@@ -930,6 +1315,8 @@ class AbstractDBMetastore(AbstractMetastore):
930
1315
  def get_direct_dataset_dependencies(
931
1316
  self, dataset: DatasetRecord, version: str
932
1317
  ) -> list[Optional[DatasetDependency]]:
1318
+ n = self._namespaces
1319
+ p = self._projects
933
1320
  d = self._datasets
934
1321
  dd = self._datasets_dependencies
935
1322
  dv = self._datasets_versions
@@ -941,18 +1328,16 @@ class AbstractDBMetastore(AbstractMetastore):
941
1328
  query = (
942
1329
  self._datasets_dependencies_select(*select_cols)
943
1330
  .select_from(
944
- dd.join(d, dd.c.dataset_id == d.c.id, isouter=True).join(
945
- dv, dd.c.dataset_version_id == dv.c.id, isouter=True
946
- )
1331
+ dd.join(d, dd.c.dataset_id == d.c.id, isouter=True)
1332
+ .join(dv, dd.c.dataset_version_id == dv.c.id, isouter=True)
1333
+ .join(p, d.c.project_id == p.c.id, isouter=True)
1334
+ .join(n, p.c.namespace_id == n.c.id, isouter=True)
947
1335
  )
948
1336
  .where(
949
1337
  (dd.c.source_dataset_id == dataset.id)
950
1338
  & (dd.c.source_dataset_version_id == dataset_version.id)
951
1339
  )
952
1340
  )
953
- if version:
954
- dataset_version = dataset.get_version(version)
955
- query = query.where(dd.c.source_dataset_version_id == dataset_version.id)
956
1341
 
957
1342
  return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
958
1343