datachain 0.19.2__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


datachain/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from datachain.lib import namespaces, projects
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
 from datachain.lib.dc import (
     C,
@@ -67,7 +68,9 @@ __all__ = [
     "is_chain_type",
     "listings",
     "metrics",
+    "namespaces",
     "param",
+    "projects",
     "read_csv",
     "read_database",
     "read_dataset",
@@ -41,6 +41,7 @@ from datachain.dataset import (
     DatasetStatus,
     StorageURI,
     create_dataset_uri,
+    parse_dataset_name,
     parse_dataset_uri,
 )
 from datachain.error import (
@@ -48,12 +49,14 @@ from datachain.error import (
     DatasetInvalidVersionError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
+    ProjectNotFoundError,
     QueryScriptCancelError,
     QueryScriptRunError,
 )
 from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
+from datachain.project import Project
 from datachain.sql.types import DateTime, SQLType
 from datachain.utils import DataChainDir

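The catalog module now imports `parse_dataset_name`, `ProjectNotFoundError`, and the `Project` model alongside the existing dataset-URI helpers. A rough illustration of how these helpers appear to compose, inferred only from the call sites later in this diff; the dot-separated `namespace.project.name` form and the exact return shapes are assumptions, not documented behavior:

```python
# Inferred from call sites in this diff, not from documented API.
from datachain.dataset import create_dataset_uri, parse_dataset_name, parse_dataset_uri

# parse_dataset_name() appears to split a fully qualified name into its parts.
namespace, project, name = parse_dataset_name("dev.analytics.cats")  # format assumed

# create_dataset_uri() is now called as (name, namespace, project, version).
uri = create_dataset_uri(name, namespace, project, "1.0.0")

# parse_dataset_uri() still returns the (possibly qualified) name and the version.
ds_name, ds_version = parse_dataset_uri(uri)
```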
@@ -155,9 +158,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         self,
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
-        remote_ds_name: str,
+        remote_ds: DatasetRecord,
         remote_ds_version: str,
-        local_ds_name: str,
+        local_ds: DatasetRecord,
         local_ds_version: str,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
@@ -169,9 +172,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         self._check_dependencies()
         self.metastore = metastore
         self.warehouse = warehouse
-        self.remote_ds_name = remote_ds_name
+        self.remote_ds = remote_ds
         self.remote_ds_version = remote_ds_version
-        self.local_ds_name = local_ds_name
+        self.local_ds = local_ds
         self.local_ds_version = local_ds_version
         self.schema = schema
         self.last_status_check: Optional[float] = None
@@ -207,7 +210,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         Checks are done every PULL_DATASET_CHECK_STATUS_INTERVAL seconds
         """
         export_status_response = self.studio_client.dataset_export_status(
-            self.remote_ds_name, self.remote_ds_version
+            self.remote_ds, self.remote_ds_version
         )
         if not export_status_response.ok:
             raise DataChainError(export_status_response.message)
@@ -254,9 +257,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         import pandas as pd

         # metastore and warehouse are not thread safe
-        with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
-            local_ds = metastore.get_dataset(self.local_ds_name)
-
+        with self.warehouse.clone() as warehouse:
             urls = list(urls)

             for url in urls:
@@ -269,7 +270,7 @@ class DatasetRowsFetcher(NodesThreadPool):
                 df = self.fix_columns(df)

                 inserted = warehouse.insert_dataset_rows(
-                    df, local_ds, self.local_ds_version
+                    df, self.local_ds, self.local_ds_version
                 )
                 self.increase_counter(inserted)  # type: ignore [arg-type]
                 # sometimes progress bar doesn't get updated so manually updating it
@@ -675,7 +676,11 @@ class Catalog:
         listing: Optional[Listing]
         if src.startswith("ds://"):
             ds_name, ds_version = parse_dataset_uri(src)
-            dataset = self.get_dataset(ds_name)
+            ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
+            assert ds_namespace
+            assert ds_project
+            project = self.metastore.get_project(ds_project, ds_namespace)
+            dataset = self.get_dataset(ds_name, project)
             if not ds_version:
                 ds_version = dataset.latest_version
             dataset_sources = self.warehouse.get_dataset_sources(
@@ -695,7 +700,11 @@ class Catalog:
                 dataset_name=dataset_name,
             )
             rows = DatasetQuery(
-                name=dataset.name, version=ds_version, catalog=self
+                name=dataset.name,
+                namespace_name=dataset.project.namespace.name,
+                project_name=dataset.project.name,
+                version=ds_version,
+                catalog=self,
             ).to_db_records()
             indexed_sources.append(
                 (
@@ -769,6 +778,7 @@ class Catalog:
     def create_dataset(
         self,
         name: str,
+        project: Optional[Project] = None,
         version: Optional[str] = None,
         *,
         columns: Sequence[Column],
@@ -788,6 +798,7 @@ class Catalog:
         If version is None, then next unused version is created.
         If version is given, then it must be an unused version.
         """
+        DatasetRecord.validate_name(name)
         assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
         if not listing and Client.is_data_source_uri(name):
             raise RuntimeError(
@@ -795,7 +806,7 @@ class Catalog:
             )
         default_version = DEFAULT_DATASET_VERSION
         try:
-            dataset = self.get_dataset(name)
+            dataset = self.get_dataset(name, project)
             default_version = dataset.next_version_patch
             if update_version == "major":
                 default_version = dataset.next_version_major
@@ -820,6 +831,7 @@ class Catalog:
         }
         dataset = self.metastore.create_dataset(
             name,
+            project.id if project else None,
             feature_schema=feature_schema,
             query_script=query_script,
             schema=schema,
@@ -892,7 +904,7 @@ class Catalog:
         )

         if create_rows_table:
-            table_name = self.warehouse.dataset_table_name(dataset.name, version)
+            table_name = self.warehouse.dataset_table_name(dataset, version)
             self.warehouse.create_dataset_rows_table(table_name, columns=columns)
             self.update_dataset_version_with_warehouse_info(dataset, version)

@@ -923,7 +935,13 @@ class Catalog:

         if not dataset_version.preview:
             values["preview"] = (
-                DatasetQuery(name=dataset.name, version=version, catalog=self)
+                DatasetQuery(
+                    name=dataset.name,
+                    namespace_name=dataset.project.namespace.name,
+                    project_name=dataset.project.name,
+                    version=version,
+                    catalog=self,
+                )
                 .limit(20)
                 .to_db_records()
             )
@@ -949,6 +967,7 @@ class Catalog:
         # updating name must result in updating dataset table names as well
         for version in [v.version for v in dataset.versions]:
             self.warehouse.rename_dataset_table(
+                dataset,
                 old_name,
                 new_name,
                 old_version=version,
@@ -986,6 +1005,7 @@ class Catalog:
         self,
         name: str,
         sources: list[str],
+        project: Optional[Project] = None,
         client_config=None,
         recursive=False,
     ) -> DatasetRecord:
@@ -994,6 +1014,8 @@

         from datachain import read_dataset, read_storage

+        project = project or self.metastore.default_project
+
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
@@ -1006,10 +1028,11 @@
         # create union of all dataset queries created from sources
         dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
         try:
+            dc = dc.settings(project=project.name, namespace=project.namespace.name)
             dc.save(name)
         except Exception as e:  # noqa: BLE001
             try:
-                ds = self.get_dataset(name)
+                ds = self.get_dataset(name, project)
                 self.metastore.update_dataset_status(
                     ds,
                     DatasetStatus.FAILED,
@@ -1026,7 +1049,7 @@
             except DatasetNotFoundError:
                 raise e from None

-        ds = self.get_dataset(name)
+        ds = self.get_dataset(name, project)

         self.update_dataset_version_with_warehouse_info(
             ds,
@@ -1034,49 +1057,67 @@
             sources="\n".join(sources),
         )

-        return self.get_dataset(name)
+        return self.get_dataset(name, project)

-    def get_dataset(self, name: str) -> DatasetRecord:
-        return self.metastore.get_dataset(name)
+    def get_dataset(
+        self, name: str, project: Optional[Project] = None
+    ) -> DatasetRecord:
+        from datachain.lib.listing import is_listing_dataset
+
+        if is_listing_dataset(name):
+            project = self.metastore.listing_project
+        return self.metastore.get_dataset(name, project.id if project else None)

     def get_dataset_with_remote_fallback(
-        self, name: str, version: Optional[str] = None
+        self,
+        name: str,
+        namespace_name: str,
+        project_name: str,
+        version: Optional[str] = None,
     ) -> DatasetRecord:
         try:
-            ds = self.get_dataset(name)
+            project = self.metastore.get_project(project_name, namespace_name)
+            ds = self.get_dataset(name, project)
             if version and not ds.has_version(version):
                 raise DatasetVersionNotFoundError(
                     f"Dataset {name} does not have version {version}"
                 )
             return ds

-        except (DatasetNotFoundError, DatasetVersionNotFoundError):
+        except (
+            ProjectNotFoundError,
+            DatasetNotFoundError,
+            DatasetVersionNotFoundError,
+        ):
             print("Dataset not found in local catalog, trying to get from studio")
-
-            remote_ds_uri = f"{DATASET_PREFIX}{name}"
-            if version:
-                remote_ds_uri += f"@v{version}"
+            remote_ds_uri = create_dataset_uri(
+                name, namespace_name, project_name, version
+            )

             self.pull_dataset(
                 remote_ds_uri=remote_ds_uri,
                 local_ds_name=name,
                 local_ds_version=version,
             )
-            return self.get_dataset(name)
+            return self.get_dataset(
+                name, self.metastore.get_project(project_name, namespace_name)
+            )

     def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
         """Returns dataset that contains version with specific uuid"""
         for dataset in self.ls_datasets():
             if dataset.has_version_with_uuid(uuid):
-                return self.get_dataset(dataset.name)
+                return self.get_dataset(dataset.name, dataset.project)
         raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")

-    def get_remote_dataset(self, name: str) -> DatasetRecord:
+    def get_remote_dataset(
+        self, namespace: str, project: str, name: str
+    ) -> DatasetRecord:
         from datachain.remote.studio import StudioClient

         studio_client = StudioClient()

-        info_response = studio_client.dataset_info(name)
+        info_response = studio_client.dataset_info(namespace, project, name)
         if not info_response.ok:
             raise DataChainError(info_response.message)

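With this hunk, local lookups and the Studio fallback are addressed by namespace and project rather than by bare dataset name. A minimal sketch of the new call shape, assuming an existing `Catalog` instance `catalog`; the `dev`/`analytics`/`cats` names are placeholders, and the argument order simply follows the signature shown above:

```python
# Placeholder names; argument order follows the new signature in this hunk.
ds = catalog.get_dataset_with_remote_fallback(
    "cats",           # dataset name
    "dev",            # namespace_name
    "analytics",      # project_name
    version="1.0.0",  # optional; pulled from Studio when missing locally
)
```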
@@ -1085,9 +1126,9 @@ class Catalog:
         return DatasetRecord.from_dict(dataset_info)

     def get_dataset_dependencies(
-        self, name: str, version: str, indirect=False
+        self, name: str, version: str, project: Optional[Project] = None, indirect=False
     ) -> list[Optional[DatasetDependency]]:
-        dataset = self.get_dataset(name)
+        dataset = self.get_dataset(name, project)

         direct_dependencies = self.metastore.get_direct_dataset_dependencies(
             dataset, version
@@ -1101,9 +1142,10 @@ class Catalog:
                 # dependency has been removed
                 continue
             if d.is_dataset:
+                project = self.metastore.get_project(d.project, d.namespace)
                 # only datasets can have dependencies
                 d.dependencies = self.get_dataset_dependencies(
-                    d.name, d.version, indirect=indirect
+                    d.name, d.version, project, indirect=indirect
                 )

         return direct_dependencies
@@ -1113,9 +1155,12 @@ class Catalog:
         prefix: Optional[str] = None,
         include_listing: bool = False,
         studio: bool = False,
+        project: Optional[Project] = None,
     ) -> Iterator[DatasetListRecord]:
         from datachain.remote.studio import StudioClient

+        project_id = project.id if project else None
+
         if studio:
             client = StudioClient()
             response = client.ls_datasets(prefix=prefix)
@@ -1130,9 +1175,11 @@ class Catalog:
                 if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
             )
         elif prefix:
-            datasets = self.metastore.list_datasets_by_prefix(prefix)
+            datasets = self.metastore.list_datasets_by_prefix(
+                prefix, project_id=project_id
+            )
         else:
-            datasets = self.metastore.list_datasets()
+            datasets = self.metastore.list_datasets(project_id=project_id)

         for d in datasets:
             if not d.is_bucket_listing or include_listing:
@@ -1144,11 +1191,15 @@ class Catalog:
         include_listing: bool = False,
         with_job: bool = True,
         studio: bool = False,
+        project: Optional[Project] = None,
     ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
         datasets = list(
             self.ls_datasets(
-                prefix=prefix, include_listing=include_listing, studio=studio
+                prefix=prefix,
+                include_listing=include_listing,
+                studio=studio,
+                project=project,
             )
         )

@@ -1184,6 +1235,7 @@ class Catalog:
             prefix=prefix,
             include_listing=True,
             with_job=False,
+            project=self.metastore.listing_project,
         )

         return [
@@ -1193,13 +1245,21 @@ class Catalog:
         ]

     def ls_dataset_rows(
-        self, name: str, version: str, offset=None, limit=None
+        self,
+        dataset: DatasetRecord,
+        version: str,
+        offset=None,
+        limit=None,
     ) -> list[dict]:
         from datachain.query.dataset import DatasetQuery

-        dataset = self.get_dataset(name)
-
-        q = DatasetQuery(name=dataset.name, version=version, catalog=self)
+        q = DatasetQuery(
+            name=dataset.name,
+            namespace_name=dataset.project.namespace.name,
+            project_name=dataset.project.name,
+            version=version,
+            catalog=self,
+        )
         if limit:
             q = q.limit(limit)
         if offset:
@@ -1232,35 +1292,29 @@ class Catalog:
         bucket_uri: str,
         name: str,
         version: str,
+        project: Optional[Project] = None,
         client_config=None,
     ) -> list[str]:
-        dataset = self.get_dataset(name)
+        dataset = self.get_dataset(name, project)

         return self.warehouse.export_dataset_table(
             bucket_uri, dataset, version, client_config
         )

-    def dataset_table_export_file_names(self, name: str, version: str) -> list[str]:
-        dataset = self.get_dataset(name)
+    def dataset_table_export_file_names(
+        self, name: str, version: str, project: Optional[Project] = None
+    ) -> list[str]:
+        dataset = self.get_dataset(name, project)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

     def remove_dataset(
         self,
         name: str,
+        project: Optional[Project] = None,
         version: Optional[str] = None,
         force: Optional[bool] = False,
-        studio: Optional[bool] = False,
     ):
-        from datachain.remote.studio import StudioClient
-
-        if studio:
-            client = StudioClient()
-            response = client.rm_dataset(name, version=version, force=force)
-            if not response.ok:
-                raise DataChainError(response.message)
-            return
-
-        dataset = self.get_dataset(name)
+        dataset = self.get_dataset(name, project)
         if not version and not force:
             raise ValueError(f"Missing dataset version from input for dataset {name}")
         if version and not dataset.has_version(version):
@@ -1282,19 +1336,21 @@ class Catalog:
     def edit_dataset(
         self,
         name: str,
+        project: Optional[Project] = None,
         new_name: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
     ) -> DatasetRecord:
         update_data = {}
         if new_name:
+            DatasetRecord.validate_name(new_name)
             update_data["name"] = new_name
         if description is not None:
             update_data["description"] = description
         if attrs is not None:
             update_data["attrs"] = attrs  # type: ignore[assignment]

-        dataset = self.get_dataset(name)
+        dataset = self.get_dataset(name, project)
         return self.update_dataset(dataset, **update_data)

     def ls(
@@ -1351,7 +1407,29 @@ class Catalog:
         except Exception as e:
             raise DataChainError("Error when parsing dataset uri") from e

-        remote_ds = self.get_remote_dataset(remote_ds_name)
+        remote_namespace, remote_project, remote_ds_name = parse_dataset_name(
+            remote_ds_name
+        )
+        if not remote_namespace or not remote_project:
+            raise DataChainError(
+                f"Invalid fully qualified dataset name {remote_ds_name}, namespace"
+                f" or project missing"
+            )
+
+        if local_ds_name:
+            local_namespace, local_project, local_ds_name = parse_dataset_name(
+                local_ds_name
+            )
+            if local_namespace and local_namespace != remote_namespace:
+                raise DataChainError(
+                    "Local namespace must be the same to remote namespace"
+                )
+            if local_project and local_project != remote_project:
+                raise DataChainError("Local project must be the same to remote project")
+
+        remote_ds = self.get_remote_dataset(
+            remote_namespace, remote_project, remote_ds_name
+        )

         try:
             # if version is not specified in uri, take the latest one
@@ -1359,7 +1437,12 @@ class Catalog:
                 version = remote_ds.latest_version
                 print(f"Version not specified, pulling the latest one (v{version})")
                 # updating dataset uri with latest version
-                remote_ds_uri = create_dataset_uri(remote_ds_name, version)
+                remote_ds_uri = create_dataset_uri(
+                    remote_ds.name,
+                    remote_ds.project.namespace.name,
+                    remote_ds.project.name,
+                    version,
+                )
             remote_ds_version = remote_ds.get_version(version)
         except (DatasetVersionNotFoundError, StopIteration) as exc:
             raise DataChainError(
@@ -1368,7 +1451,13 @@ class Catalog:

         local_ds_name = local_ds_name or remote_ds.name
         local_ds_version = local_ds_version or remote_ds_version.version
-        local_ds_uri = create_dataset_uri(local_ds_name, local_ds_version)
+
+        local_ds_uri = create_dataset_uri(
+            local_ds_name,
+            remote_ds.project.namespace.name,
+            remote_ds.project.name,
+            local_ds_version,
+        )

         try:
             # try to find existing dataset with the same uuid to avoid pulling again
@@ -1377,7 +1466,10 @@ class Catalog:
                 remote_ds_version.uuid
             )
             existing_ds_uri = create_dataset_uri(
-                existing_ds.name, existing_ds_version.version
+                existing_ds.name,
+                existing_ds.project.namespace.name,
+                existing_ds.project.name,
+                existing_ds_version.version,
             )
             if existing_ds_uri == remote_ds_uri:
                 print(f"Local copy of dataset {remote_ds_uri} already present")
@@ -1391,8 +1483,26 @@ class Catalog:
         except DatasetNotFoundError:
             pass

+        # Create namespace and project if doesn't exist
+        print(
+            f"Creating namespace {remote_ds.project.namespace.name} and project"
+            f" {remote_ds.project.name}"
+        )
+
+        namespace = self.metastore.create_namespace(
+            remote_ds.project.namespace.name,
+            description=remote_ds.project.namespace.description,
+            uuid=remote_ds.project.namespace.uuid,
+        )
+        project = self.metastore.create_project(
+            remote_ds.project.name,
+            namespace.name,
+            description=remote_ds.project.description,
+            uuid=remote_ds.project.uuid,
+        )
+
         try:
-            local_dataset = self.get_dataset(local_ds_name)
+            local_dataset = self.get_dataset(local_ds_name, project=project)
             if local_dataset and local_dataset.has_version(local_ds_version):
                 raise DataChainError(
                     f"Local dataset {local_ds_uri} already exists with different uuid,"
@@ -1414,6 +1524,7 @@ class Catalog:

         local_ds = self.create_dataset(
             local_ds_name,
+            project,
             local_ds_version,
             query_script=remote_ds_version.query_script,
             create_rows=True,
@@ -1426,7 +1537,7 @@ class Catalog:
         # asking remote to export dataset rows table to s3 and to return signed
         # urls of exported parts, which are in parquet format
         export_response = studio_client.export_dataset_table(
-            remote_ds_name, remote_ds_version.version
+            remote_ds, remote_ds_version.version
         )
         if not export_response.ok:
             raise DataChainError(export_response.message)
@@ -1457,9 +1568,9 @@ class Catalog:
         rows_fetcher = DatasetRowsFetcher(
             metastore,
             warehouse,
-            remote_ds_name,
+            remote_ds,
             remote_ds_version.version,
-            local_ds_name,
+            local_ds,
             local_ds_version,
             schema,
             progress_bar=dataset_save_progress_bar,
@@ -1469,7 +1580,7 @@ class Catalog:
                 iter(batch(signed_urls)), dataset_save_progress_bar
             )
         except:
-            self.remove_dataset(local_ds_name, local_ds_version)
+            self.remove_dataset(local_ds_name, project, local_ds_version)
             raise

         local_ds = self.metastore.update_dataset_status(
@@ -1526,7 +1637,11 @@ class Catalog:
         )

         self.create_dataset_from_sources(
-            output, sources, client_config=client_config, recursive=recursive
+            output,
+            sources,
+            self.metastore.default_project,
+            client_config=client_config,
+            recursive=recursive,
         )

     def query(
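Taken together, the Catalog changes above thread an optional `Project` through dataset creation, lookup, listing, editing, and removal. A minimal sketch of the resulting call pattern, assuming an existing `Catalog` instance `catalog` and placeholder `dev`/`analytics`/`cats` names; this follows the signatures visible in this diff, not a documented public API:

```python
# Placeholder namespace/project names; signatures taken from this diff.
project = catalog.metastore.get_project("analytics", "dev")  # (project_name, namespace_name)

ds = catalog.get_dataset("cats", project)
deps = catalog.get_dataset_dependencies(ds.name, ds.latest_version, project)

# remove_dataset() now takes a project instead of the old `studio` flag.
catalog.remove_dataset(ds.name, project, version=ds.latest_version)
```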
datachain/cli/__init__.py CHANGED
@@ -152,9 +152,6 @@ def handle_dataset_command(args, catalog):
             new_name=args.new_name,
             description=args.description,
             attrs=args.attrs,
-            studio=args.studio,
-            local=args.local,
-            all=args.all,
             team=args.team,
         ),
         "ls": lambda: list_datasets(
@@ -172,8 +169,6 @@ def handle_dataset_command(args, catalog):
             version=args.version,
             force=args.force,
             studio=args.studio,
-            local=args.local,
-            all=args.all,
             team=args.team,
         ),
         "remove": lambda: rm_dataset(
@@ -182,8 +177,6 @@ def handle_dataset_command(args, catalog):
             version=args.version,
             force=args.force,
             studio=args.studio,
-            local=args.local,
-            all=args.all,
             team=args.team,
         ),
     }