datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

Files changed (49)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +213 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +35 -26
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +433 -37
  12. datachain/data_storage/sqlite.py +140 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +128 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +253 -91
  21. datachain/lib/dc/datasets.py +103 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +2 -1
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/signal_schema.py +8 -0
  33. datachain/lib/tar.py +1 -2
  34. datachain/lib/udf.py +1 -1
  35. datachain/lib/udf_signature.py +1 -1
  36. datachain/lib/webdataset.py +30 -20
  37. datachain/listing.py +3 -1
  38. datachain/namespace.py +65 -0
  39. datachain/project.py +78 -0
  40. datachain/query/dataset.py +71 -46
  41. datachain/query/session.py +1 -1
  42. datachain/remote/studio.py +61 -26
  43. datachain/studio.py +23 -6
  44. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
  45. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
  46. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
  49. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py CHANGED
@@ -32,6 +32,7 @@ from datachain.lib.file import (
  VideoFrame,
  )
  from datachain.lib.model_store import ModelStore
+ from datachain.lib.projects import create as create_project
  from datachain.lib.udf import Aggregator, Generator, Mapper
  from datachain.lib.utils import AbstractUDF, DataChainError
  from datachain.query import metrics, param
@@ -62,6 +63,7 @@ __all__ = [
  "VideoFile",
  "VideoFragment",
  "VideoFrame",
+ "create_project",
  "datasets",
  "delete_dataset",
  "is_chain_type",
datachain/cache.py CHANGED
@@ -39,7 +39,7 @@ def temporary_cache(
  cache.destroy()


- class Cache:
+ class Cache: # noqa: PLW1641
  def __init__(self, cache_dir: str, tmp_dir: str):
  self.odb = LocalHashFileDB(
  LocalFileSystem(),
@@ -76,9 +76,9 @@ class Cache:
  async def download(
  self, file: "File", client: "Client", callback: Optional[Callback] = None
  ) -> None:
- from_path = f"{file.source}/{file.path}"
  from dvc_objects.fs.utils import tmp_fname

+ from_path = file.get_uri()
  odb_fs = self.odb.fs
  tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
  size = file.size
datachain/catalog/catalog.py CHANGED
@@ -41,6 +41,7 @@ from datachain.dataset import (
  DatasetStatus,
  StorageURI,
  create_dataset_uri,
+ parse_dataset_name,
  parse_dataset_uri,
  )
  from datachain.error import (
@@ -48,12 +49,14 @@ from datachain.error import (
  DatasetInvalidVersionError,
  DatasetNotFoundError,
  DatasetVersionNotFoundError,
+ ProjectNotFoundError,
  QueryScriptCancelError,
  QueryScriptRunError,
  )
  from datachain.lib.listing import get_listing
  from datachain.node import DirType, Node, NodeWithPath
  from datachain.nodes_thread_pool import NodesThreadPool
+ from datachain.project import Project
  from datachain.sql.types import DateTime, SQLType
  from datachain.utils import DataChainDir

@@ -155,9 +158,9 @@ class DatasetRowsFetcher(NodesThreadPool):
  self,
  metastore: "AbstractMetastore",
  warehouse: "AbstractWarehouse",
- remote_ds_name: str,
+ remote_ds: DatasetRecord,
  remote_ds_version: str,
- local_ds_name: str,
+ local_ds: DatasetRecord,
  local_ds_version: str,
  schema: dict[str, Union[SQLType, type[SQLType]]],
  max_threads: int = PULL_DATASET_MAX_THREADS,
@@ -169,9 +172,9 @@ class DatasetRowsFetcher(NodesThreadPool):
  self._check_dependencies()
  self.metastore = metastore
  self.warehouse = warehouse
- self.remote_ds_name = remote_ds_name
+ self.remote_ds = remote_ds
  self.remote_ds_version = remote_ds_version
- self.local_ds_name = local_ds_name
+ self.local_ds = local_ds
  self.local_ds_version = local_ds_version
  self.schema = schema
  self.last_status_check: Optional[float] = None
@@ -207,7 +210,7 @@ class DatasetRowsFetcher(NodesThreadPool):
  Checks are done every PULL_DATASET_CHECK_STATUS_INTERVAL seconds
  """
  export_status_response = self.studio_client.dataset_export_status(
- self.remote_ds_name, self.remote_ds_version
+ self.remote_ds, self.remote_ds_version
  )
  if not export_status_response.ok:
  raise DataChainError(export_status_response.message)
@@ -254,9 +257,7 @@ class DatasetRowsFetcher(NodesThreadPool):
  import pandas as pd

  # metastore and warehouse are not thread safe
- with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
- local_ds = metastore.get_dataset(self.local_ds_name)
-
+ with self.warehouse.clone() as warehouse:
  urls = list(urls)

  for url in urls:
@@ -269,7 +270,7 @@ class DatasetRowsFetcher(NodesThreadPool):
  df = self.fix_columns(df)

  inserted = warehouse.insert_dataset_rows(
- df, local_ds, self.local_ds_version
+ df, self.local_ds, self.local_ds_version
  )
  self.increase_counter(inserted) # type: ignore [arg-type]
  # sometimes progress bar doesn't get updated so manually updating it
@@ -675,7 +676,11 @@ class Catalog:
  listing: Optional[Listing]
  if src.startswith("ds://"):
  ds_name, ds_version = parse_dataset_uri(src)
- dataset = self.get_dataset(ds_name)
+ ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
+ assert ds_namespace
+ assert ds_project
+ project = self.metastore.get_project(ds_project, ds_namespace)
+ dataset = self.get_dataset(ds_name, project)
  if not ds_version:
  ds_version = dataset.latest_version
  dataset_sources = self.warehouse.get_dataset_sources(
@@ -695,7 +700,11 @@ class Catalog:
  dataset_name=dataset_name,
  )
  rows = DatasetQuery(
- name=dataset.name, version=ds_version, catalog=self
+ name=dataset.name,
+ namespace_name=dataset.project.namespace.name,
+ project_name=dataset.project.name,
+ version=ds_version,
+ catalog=self,
  ).to_db_records()
  indexed_sources.append(
  (
@@ -769,6 +778,7 @@ class Catalog:
  def create_dataset(
  self,
  name: str,
+ project: Optional[Project] = None,
  version: Optional[str] = None,
  *,
  columns: Sequence[Column],
@@ -788,6 +798,7 @@ class Catalog:
  If version is None, then next unused version is created.
  If version is given, then it must be an unused version.
  """
+ DatasetRecord.validate_name(name)
  assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
  if not listing and Client.is_data_source_uri(name):
  raise RuntimeError(
@@ -795,7 +806,7 @@ class Catalog:
  )
  default_version = DEFAULT_DATASET_VERSION
  try:
- dataset = self.get_dataset(name)
+ dataset = self.get_dataset(name, project)
  default_version = dataset.next_version_patch
  if update_version == "major":
  default_version = dataset.next_version_major
@@ -820,6 +831,7 @@ class Catalog:
  }
  dataset = self.metastore.create_dataset(
  name,
+ project.id if project else None,
  feature_schema=feature_schema,
  query_script=query_script,
  schema=schema,
@@ -892,7 +904,7 @@ class Catalog:
  )

  if create_rows_table:
- table_name = self.warehouse.dataset_table_name(dataset.name, version)
+ table_name = self.warehouse.dataset_table_name(dataset, version)
  self.warehouse.create_dataset_rows_table(table_name, columns=columns)
  self.update_dataset_version_with_warehouse_info(dataset, version)

@@ -923,7 +935,13 @@ class Catalog:

  if not dataset_version.preview:
  values["preview"] = (
- DatasetQuery(name=dataset.name, version=version, catalog=self)
+ DatasetQuery(
+ name=dataset.name,
+ namespace_name=dataset.project.namespace.name,
+ project_name=dataset.project.name,
+ version=version,
+ catalog=self,
+ )
  .limit(20)
  .to_db_records()
  )
@@ -949,6 +967,7 @@ class Catalog:
  # updating name must result in updating dataset table names as well
  for version in [v.version for v in dataset.versions]:
  self.warehouse.rename_dataset_table(
+ dataset,
  old_name,
  new_name,
  old_version=version,
@@ -986,6 +1005,7 @@ class Catalog:
  self,
  name: str,
  sources: list[str],
+ project: Optional[Project] = None,
  client_config=None,
  recursive=False,
  ) -> DatasetRecord:
@@ -994,6 +1014,8 @@ class Catalog:

  from datachain import read_dataset, read_storage

+ project = project or self.metastore.default_project
+
  chains = []
  for source in sources:
  if source.startswith(DATASET_PREFIX):
@@ -1006,10 +1028,11 @@ class Catalog:
  # create union of all dataset queries created from sources
  dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
  try:
+ dc = dc.settings(project=project.name, namespace=project.namespace.name)
  dc.save(name)
  except Exception as e: # noqa: BLE001
  try:
- ds = self.get_dataset(name)
+ ds = self.get_dataset(name, project)
  self.metastore.update_dataset_status(
  ds,
  DatasetStatus.FAILED,
@@ -1026,7 +1049,7 @@ class Catalog:
  except DatasetNotFoundError:
  raise e from None

- ds = self.get_dataset(name)
+ ds = self.get_dataset(name, project)

  self.update_dataset_version_with_warehouse_info(
  ds,
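
create_dataset_from_sources now pins the union of source chains to an explicit namespace and project through DataChain.settings. A hedged sketch of the same pattern from user code, with a placeholder bucket URI and names (settings() accepting namespace and project keys is taken from this hunk and the datachain/lib/settings.py change):

    from datachain import read_storage

    (
        read_storage("s3://example-bucket/images/")         # placeholder source
        .settings(namespace="dev", project="analytics")     # new settings keys in 0.23.0
        .save("raw-images")
    )
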
@@ -1034,49 +1057,100 @@ class Catalog:
  sources="\n".join(sources),
  )

- return self.get_dataset(name)
+ return self.get_dataset(name, project)

- def get_dataset(self, name: str) -> DatasetRecord:
- return self.metastore.get_dataset(name)
+ def get_full_dataset_name(
+ self,
+ name: str,
+ project_name: Optional[str] = None,
+ namespace_name: Optional[str] = None,
+ ) -> tuple[str, str, str]:
+ """
+ Returns dataset name together with separated namespace and project name.
+ It takes into account all the ways namespace and project can be added.
+ """
+ parsed_namespace_name, parsed_project_name, name = parse_dataset_name(name)
+
+ namespace_env = os.environ.get("DATACHAIN_NAMESPACE")
+ project_env = os.environ.get("DATACHAIN_PROJECT")
+ if project_env and len(project_env.split(".")) == 2:
+ # we allow setting both namespace and project in DATACHAIN_PROJECT
+ namespace_env, project_env = project_env.split(".")
+
+ namespace_name = (
+ parsed_namespace_name
+ or namespace_name
+ or namespace_env
+ or self.metastore.default_namespace_name
+ )
+ project_name = (
+ parsed_project_name
+ or project_name
+ or project_env
+ or self.metastore.default_project_name
+ )
+
+ return namespace_name, project_name, name
+
+ def get_dataset(
+ self, name: str, project: Optional[Project] = None
+ ) -> DatasetRecord:
+ from datachain.lib.listing import is_listing_dataset
+
+ if is_listing_dataset(name):
+ project = self.metastore.listing_project
+ return self.metastore.get_dataset(name, project.id if project else None)

  def get_dataset_with_remote_fallback(
- self, name: str, version: Optional[str] = None
+ self,
+ name: str,
+ namespace_name: str,
+ project_name: str,
+ version: Optional[str] = None,
  ) -> DatasetRecord:
  try:
- ds = self.get_dataset(name)
+ project = self.metastore.get_project(project_name, namespace_name)
+ ds = self.get_dataset(name, project)
  if version and not ds.has_version(version):
  raise DatasetVersionNotFoundError(
  f"Dataset {name} does not have version {version}"
  )
  return ds

- except (DatasetNotFoundError, DatasetVersionNotFoundError):
+ except (
+ ProjectNotFoundError,
+ DatasetNotFoundError,
+ DatasetVersionNotFoundError,
+ ):
  print("Dataset not found in local catalog, trying to get from studio")
-
- remote_ds_uri = f"{DATASET_PREFIX}{name}"
- if version:
- remote_ds_uri += f"@v{version}"
+ remote_ds_uri = create_dataset_uri(
+ name, namespace_name, project_name, version
+ )

  self.pull_dataset(
  remote_ds_uri=remote_ds_uri,
  local_ds_name=name,
  local_ds_version=version,
  )
- return self.get_dataset(name)
+ return self.get_dataset(
+ name, self.metastore.get_project(project_name, namespace_name)
+ )

  def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
  """Returns dataset that contains version with specific uuid"""
  for dataset in self.ls_datasets():
  if dataset.has_version_with_uuid(uuid):
- return self.get_dataset(dataset.name)
+ return self.get_dataset(dataset.name, dataset.project)
  raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")

- def get_remote_dataset(self, name: str) -> DatasetRecord:
+ def get_remote_dataset(
+ self, namespace: str, project: str, name: str
+ ) -> DatasetRecord:
  from datachain.remote.studio import StudioClient

  studio_client = StudioClient()

- info_response = studio_client.dataset_info(name)
+ info_response = studio_client.dataset_info(namespace, project, name)
  if not info_response.ok:
  raise DataChainError(info_response.message)

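
For readers tracking the new name resolution: get_full_dataset_name resolves namespace and project by precedence, taking parts parsed from a fully qualified name first, then explicit arguments, then the DATACHAIN_NAMESPACE / DATACHAIN_PROJECT environment variables (DATACHAIN_PROJECT may carry both parts as "namespace.project"), and finally the metastore defaults. An illustrative sketch of that precedence; the resolve helper below is not part of the package:

    import os

    def resolve(parsed, explicit, env_value, default):
        # Mirrors Catalog.get_full_dataset_name: parsed part > explicit
        # argument > environment variable > metastore default.
        return parsed or explicit or env_value or default

    namespace_env = os.environ.get("DATACHAIN_NAMESPACE")
    project_env = os.environ.get("DATACHAIN_PROJECT")
    if project_env and len(project_env.split(".")) == 2:
        # DATACHAIN_PROJECT can bundle both parts, e.g. "dev.analytics".
        namespace_env, project_env = project_env.split(".")
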
@@ -1085,9 +1159,9 @@ class Catalog:
  return DatasetRecord.from_dict(dataset_info)

  def get_dataset_dependencies(
- self, name: str, version: str, indirect=False
+ self, name: str, version: str, project: Optional[Project] = None, indirect=False
  ) -> list[Optional[DatasetDependency]]:
- dataset = self.get_dataset(name)
+ dataset = self.get_dataset(name, project)

  direct_dependencies = self.metastore.get_direct_dataset_dependencies(
  dataset, version
@@ -1101,9 +1175,10 @@ class Catalog:
  # dependency has been removed
  continue
  if d.is_dataset:
+ project = self.metastore.get_project(d.project, d.namespace)
  # only datasets can have dependencies
  d.dependencies = self.get_dataset_dependencies(
- d.name, d.version, indirect=indirect
+ d.name, d.version, project, indirect=indirect
  )

  return direct_dependencies
@@ -1113,9 +1188,12 @@ class Catalog:
  prefix: Optional[str] = None,
  include_listing: bool = False,
  studio: bool = False,
+ project: Optional[Project] = None,
  ) -> Iterator[DatasetListRecord]:
  from datachain.remote.studio import StudioClient

+ project_id = project.id if project else None
+
  if studio:
  client = StudioClient()
  response = client.ls_datasets(prefix=prefix)
@@ -1130,9 +1208,11 @@ class Catalog:
  if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
  )
  elif prefix:
- datasets = self.metastore.list_datasets_by_prefix(prefix)
+ datasets = self.metastore.list_datasets_by_prefix(
+ prefix, project_id=project_id
+ )
  else:
- datasets = self.metastore.list_datasets()
+ datasets = self.metastore.list_datasets(project_id=project_id)

  for d in datasets:
  if not d.is_bucket_listing or include_listing:
@@ -1144,11 +1224,15 @@ class Catalog:
  include_listing: bool = False,
  with_job: bool = True,
  studio: bool = False,
+ project: Optional[Project] = None,
  ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
  """Iterate over all dataset versions with related jobs."""
  datasets = list(
  self.ls_datasets(
- prefix=prefix, include_listing=include_listing, studio=studio
+ prefix=prefix,
+ include_listing=include_listing,
+ studio=studio,
+ project=project,
  )
  )

@@ -1184,6 +1268,7 @@ class Catalog:
  prefix=prefix,
  include_listing=True,
  with_job=False,
+ project=self.metastore.listing_project,
  )

  return [
@@ -1193,13 +1278,21 @@ class Catalog:
  ]

  def ls_dataset_rows(
- self, name: str, version: str, offset=None, limit=None
+ self,
+ dataset: DatasetRecord,
+ version: str,
+ offset=None,
+ limit=None,
  ) -> list[dict]:
  from datachain.query.dataset import DatasetQuery

- dataset = self.get_dataset(name)
-
- q = DatasetQuery(name=dataset.name, version=version, catalog=self)
+ q = DatasetQuery(
+ name=dataset.name,
+ namespace_name=dataset.project.namespace.name,
+ project_name=dataset.project.name,
+ version=version,
+ catalog=self,
+ )
  if limit:
  q = q.limit(limit)
  if offset:
@@ -1232,35 +1325,29 @@ class Catalog:
  bucket_uri: str,
  name: str,
  version: str,
+ project: Optional[Project] = None,
  client_config=None,
  ) -> list[str]:
- dataset = self.get_dataset(name)
+ dataset = self.get_dataset(name, project)

  return self.warehouse.export_dataset_table(
  bucket_uri, dataset, version, client_config
  )

- def dataset_table_export_file_names(self, name: str, version: str) -> list[str]:
- dataset = self.get_dataset(name)
+ def dataset_table_export_file_names(
+ self, name: str, version: str, project: Optional[Project] = None
+ ) -> list[str]:
+ dataset = self.get_dataset(name, project)
  return self.warehouse.dataset_table_export_file_names(dataset, version)

  def remove_dataset(
  self,
  name: str,
+ project: Optional[Project] = None,
  version: Optional[str] = None,
  force: Optional[bool] = False,
- studio: Optional[bool] = False,
  ):
- from datachain.remote.studio import StudioClient
-
- if studio:
- client = StudioClient()
- response = client.rm_dataset(name, version=version, force=force)
- if not response.ok:
- raise DataChainError(response.message)
- return
-
- dataset = self.get_dataset(name)
+ dataset = self.get_dataset(name, project)
  if not version and not force:
  raise ValueError(f"Missing dataset version from input for dataset {name}")
  if version and not dataset.has_version(version):
@@ -1282,19 +1369,21 @@ class Catalog:
  def edit_dataset(
  self,
  name: str,
+ project: Optional[Project] = None,
  new_name: Optional[str] = None,
  description: Optional[str] = None,
  attrs: Optional[list[str]] = None,
  ) -> DatasetRecord:
  update_data = {}
  if new_name:
+ DatasetRecord.validate_name(new_name)
  update_data["name"] = new_name
  if description is not None:
  update_data["description"] = description
  if attrs is not None:
  update_data["attrs"] = attrs # type: ignore[assignment]

- dataset = self.get_dataset(name)
+ dataset = self.get_dataset(name, project)
  return self.update_dataset(dataset, **update_data)

  def ls(
@@ -1351,7 +1440,29 @@ class Catalog:
  except Exception as e:
  raise DataChainError("Error when parsing dataset uri") from e

- remote_ds = self.get_remote_dataset(remote_ds_name)
+ remote_namespace, remote_project, remote_ds_name = parse_dataset_name(
+ remote_ds_name
+ )
+ if not remote_namespace or not remote_project:
+ raise DataChainError(
+ f"Invalid fully qualified dataset name {remote_ds_name}, namespace"
+ f" or project missing"
+ )
+
+ if local_ds_name:
+ local_namespace, local_project, local_ds_name = parse_dataset_name(
+ local_ds_name
+ )
+ if local_namespace and local_namespace != remote_namespace:
+ raise DataChainError(
+ "Local namespace must be the same to remote namespace"
+ )
+ if local_project and local_project != remote_project:
+ raise DataChainError("Local project must be the same to remote project")
+
+ remote_ds = self.get_remote_dataset(
+ remote_namespace, remote_project, remote_ds_name
+ )

  try:
  # if version is not specified in uri, take the latest one
@@ -1359,7 +1470,12 @@ class Catalog:
  version = remote_ds.latest_version
  print(f"Version not specified, pulling the latest one (v{version})")
  # updating dataset uri with latest version
- remote_ds_uri = create_dataset_uri(remote_ds_name, version)
+ remote_ds_uri = create_dataset_uri(
+ remote_ds.name,
+ remote_ds.project.namespace.name,
+ remote_ds.project.name,
+ version,
+ )
  remote_ds_version = remote_ds.get_version(version)
  except (DatasetVersionNotFoundError, StopIteration) as exc:
  raise DataChainError(
@@ -1368,7 +1484,13 @@ class Catalog:

  local_ds_name = local_ds_name or remote_ds.name
  local_ds_version = local_ds_version or remote_ds_version.version
- local_ds_uri = create_dataset_uri(local_ds_name, local_ds_version)
+
+ local_ds_uri = create_dataset_uri(
+ local_ds_name,
+ remote_ds.project.namespace.name,
+ remote_ds.project.name,
+ local_ds_version,
+ )

  try:
  # try to find existing dataset with the same uuid to avoid pulling again
@@ -1377,7 +1499,10 @@ class Catalog:
  remote_ds_version.uuid
  )
  existing_ds_uri = create_dataset_uri(
- existing_ds.name, existing_ds_version.version
+ existing_ds.name,
+ existing_ds.project.namespace.name,
+ existing_ds.project.name,
+ existing_ds_version.version,
  )
  if existing_ds_uri == remote_ds_uri:
  print(f"Local copy of dataset {remote_ds_uri} already present")
@@ -1391,8 +1516,26 @@ class Catalog:
  except DatasetNotFoundError:
  pass

+ # Create namespace and project if doesn't exist
+ print(
+ f"Creating namespace {remote_ds.project.namespace.name} and project"
+ f" {remote_ds.project.name}"
+ )
+
+ namespace = self.metastore.create_namespace(
+ remote_ds.project.namespace.name,
+ description=remote_ds.project.namespace.descr,
+ uuid=remote_ds.project.namespace.uuid,
+ )
+ project = self.metastore.create_project(
+ namespace.name,
+ remote_ds.project.name,
+ description=remote_ds.project.descr,
+ uuid=remote_ds.project.uuid,
+ )
+
  try:
- local_dataset = self.get_dataset(local_ds_name)
+ local_dataset = self.get_dataset(local_ds_name, project=project)
  if local_dataset and local_dataset.has_version(local_ds_version):
  raise DataChainError(
  f"Local dataset {local_ds_uri} already exists with different uuid,"
@@ -1414,6 +1557,7 @@ class Catalog:

  local_ds = self.create_dataset(
  local_ds_name,
+ project,
  local_ds_version,
  query_script=remote_ds_version.query_script,
  create_rows=True,
@@ -1426,7 +1570,7 @@ class Catalog:
  # asking remote to export dataset rows table to s3 and to return signed
  # urls of exported parts, which are in parquet format
  export_response = studio_client.export_dataset_table(
- remote_ds_name, remote_ds_version.version
+ remote_ds, remote_ds_version.version
  )
  if not export_response.ok:
  raise DataChainError(export_response.message)
@@ -1457,9 +1601,9 @@ class Catalog:
  rows_fetcher = DatasetRowsFetcher(
  metastore,
  warehouse,
- remote_ds_name,
+ remote_ds,
  remote_ds_version.version,
- local_ds_name,
+ local_ds,
  local_ds_version,
  schema,
  progress_bar=dataset_save_progress_bar,
@@ -1469,7 +1613,7 @@ class Catalog:
  iter(batch(signed_urls)), dataset_save_progress_bar
  )
  except:
- self.remove_dataset(local_ds_name, local_ds_version)
+ self.remove_dataset(local_ds_name, project, local_ds_version)
  raise

  local_ds = self.metastore.update_dataset_status(
@@ -1526,7 +1670,11 @@ class Catalog:
  )

  self.create_dataset_from_sources(
- output, sources, client_config=client_config, recursive=recursive
+ output,
+ sources,
+ self.metastore.default_project,
+ client_config=client_config,
+ recursive=recursive,
  )

  def query(
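
Pulling from Studio now requires a fully qualified remote dataset name, and any local namespace or project given must match the remote one. A hedged example of the call; the "ds://namespace.project.name@vX.Y.Z" URI layout is inferred from the parse_dataset_name/create_dataset_uri usage above, and catalog stands for an existing Catalog instance:

    # Hypothetical pull; the URI layout and version suffix are assumptions.
    catalog.pull_dataset(
        remote_ds_uri="ds://dev.analytics.clean-images@v1.2.0",
        local_ds_name="clean-images",  # must resolve to the same namespace/project
    )
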
datachain/cli/__init__.py CHANGED
@@ -154,9 +154,6 @@ def handle_dataset_command(args, catalog):
  new_name=args.new_name,
  description=args.description,
  attrs=args.attrs,
- studio=args.studio,
- local=args.local,
- all=args.all,
  team=args.team,
  ),
  "ls": lambda: list_datasets(
@@ -174,8 +171,6 @@ def handle_dataset_command(args, catalog):
  version=args.version,
  force=args.force,
  studio=args.studio,
- local=args.local,
- all=args.all,
  team=args.team,
  ),
  "remove": lambda: rm_dataset(
@@ -184,8 +179,6 @@ def handle_dataset_command(args, catalog):
  version=args.version,
  force=args.force,
  studio=args.studio,
- local=args.local,
- all=args.all,
  team=args.team,
  ),
  }