datachain 0.20.3__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic.

Files changed (47)
  1. datachain/__init__.py +0 -2
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +65 -180
  4. datachain/cli/__init__.py +7 -0
  5. datachain/cli/commands/datasets.py +28 -43
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +35 -1
  8. datachain/client/fsspec.py +3 -5
  9. datachain/client/hf.py +0 -10
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +37 -403
  12. datachain/data_storage/sqlite.py +7 -139
  13. datachain/data_storage/warehouse.py +7 -26
  14. datachain/dataset.py +12 -126
  15. datachain/delta.py +7 -11
  16. datachain/error.py +0 -36
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +0 -4
  20. datachain/lib/dc/datachain.py +92 -259
  21. datachain/lib/dc/datasets.py +49 -87
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +0 -1
  24. datachain/lib/dc/storage.py +40 -38
  25. datachain/lib/file.py +23 -77
  26. datachain/lib/listing.py +1 -3
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/pytorch.py +1 -1
  29. datachain/lib/settings.py +0 -10
  30. datachain/lib/tar.py +2 -1
  31. datachain/lib/udf_signature.py +1 -1
  32. datachain/lib/webdataset.py +20 -30
  33. datachain/listing.py +1 -3
  34. datachain/query/dataset.py +46 -71
  35. datachain/query/session.py +1 -1
  36. datachain/remote/studio.py +26 -61
  37. datachain/studio.py +7 -23
  38. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
  39. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
  40. datachain/lib/namespaces.py +0 -71
  41. datachain/lib/projects.py +0 -86
  42. datachain/namespace.py +0 -65
  43. datachain/project.py +0 -78
  44. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
  45. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
  47. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py CHANGED
@@ -32,7 +32,6 @@ from datachain.lib.file import (
    VideoFrame,
)
from datachain.lib.model_store import ModelStore
-from datachain.lib.projects import create as create_project
from datachain.lib.udf import Aggregator, Generator, Mapper
from datachain.lib.utils import AbstractUDF, DataChainError
from datachain.query import metrics, param
@@ -63,7 +62,6 @@ __all__ = [
    "VideoFile",
    "VideoFragment",
    "VideoFrame",
-    "create_project",
    "datasets",
    "delete_dataset",
    "is_chain_type",
datachain/cache.py CHANGED
@@ -39,7 +39,7 @@ def temporary_cache(
        cache.destroy()


-class Cache:  # noqa: PLW1641
+class Cache:
    def __init__(self, cache_dir: str, tmp_dir: str):
        self.odb = LocalHashFileDB(
            LocalFileSystem(),
@@ -76,9 +76,9 @@ class Cache:  # noqa: PLW1641
    async def download(
        self, file: "File", client: "Client", callback: Optional[Callback] = None
    ) -> None:
+        from_path = f"{file.source}/{file.path}"
        from dvc_objects.fs.utils import tmp_fname

-        from_path = file.get_uri()
        odb_fs = self.odb.fs
        tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
        size = file.size
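Cache.download now assembles the source path from the file's source and relative path instead of delegating to File.get_uri(). A rough sketch of the new behaviour, assuming only the two File fields shown in the diff (the helper name is hypothetical):

    def build_from_path(file) -> str:
        # 0.20.3 used file.get_uri(); 0.21.0 joins the storage source URI
        # and the in-storage path directly.
        return f"{file.source}/{file.path}"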
datachain/catalog/catalog.py CHANGED
@@ -41,7 +41,6 @@ from datachain.dataset import (
    DatasetStatus,
    StorageURI,
    create_dataset_uri,
-    parse_dataset_name,
    parse_dataset_uri,
)
from datachain.error import (
@@ -49,14 +48,12 @@ from datachain.error import (
    DatasetInvalidVersionError,
    DatasetNotFoundError,
    DatasetVersionNotFoundError,
-    ProjectNotFoundError,
    QueryScriptCancelError,
    QueryScriptRunError,
)
from datachain.lib.listing import get_listing
from datachain.node import DirType, Node, NodeWithPath
from datachain.nodes_thread_pool import NodesThreadPool
-from datachain.project import Project
from datachain.sql.types import DateTime, SQLType
from datachain.utils import DataChainDir

@@ -158,9 +155,9 @@ class DatasetRowsFetcher(NodesThreadPool):
        self,
        metastore: "AbstractMetastore",
        warehouse: "AbstractWarehouse",
-        remote_ds: DatasetRecord,
+        remote_ds_name: str,
        remote_ds_version: str,
-        local_ds: DatasetRecord,
+        local_ds_name: str,
        local_ds_version: str,
        schema: dict[str, Union[SQLType, type[SQLType]]],
        max_threads: int = PULL_DATASET_MAX_THREADS,
@@ -172,9 +169,9 @@ class DatasetRowsFetcher(NodesThreadPool):
        self._check_dependencies()
        self.metastore = metastore
        self.warehouse = warehouse
-        self.remote_ds = remote_ds
+        self.remote_ds_name = remote_ds_name
        self.remote_ds_version = remote_ds_version
-        self.local_ds = local_ds
+        self.local_ds_name = local_ds_name
        self.local_ds_version = local_ds_version
        self.schema = schema
        self.last_status_check: Optional[float] = None
@@ -210,7 +207,7 @@
        Checks are done every PULL_DATASET_CHECK_STATUS_INTERVAL seconds
        """
        export_status_response = self.studio_client.dataset_export_status(
-            self.remote_ds, self.remote_ds_version
+            self.remote_ds_name, self.remote_ds_version
        )
        if not export_status_response.ok:
            raise DataChainError(export_status_response.message)
@@ -257,7 +254,9 @@
        import pandas as pd

        # metastore and warehouse are not thread safe
-        with self.warehouse.clone() as warehouse:
+        with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
+            local_ds = metastore.get_dataset(self.local_ds_name)
+
            urls = list(urls)

            for url in urls:
@@ -270,7 +269,7 @@
                df = self.fix_columns(df)

                inserted = warehouse.insert_dataset_rows(
-                    df, self.local_ds, self.local_ds_version
+                    df, local_ds, self.local_ds_version
                )
                self.increase_counter(inserted)  # type: ignore [arg-type]
                # sometimes progress bar doesn't get updated so manually updating it
@@ -676,11 +675,7 @@ class Catalog:
        listing: Optional[Listing]
        if src.startswith("ds://"):
            ds_name, ds_version = parse_dataset_uri(src)
-            ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
-            assert ds_namespace
-            assert ds_project
-            project = self.metastore.get_project(ds_project, ds_namespace)
-            dataset = self.get_dataset(ds_name, project)
+            dataset = self.get_dataset(ds_name)
            if not ds_version:
                ds_version = dataset.latest_version
            dataset_sources = self.warehouse.get_dataset_sources(
@@ -700,11 +695,7 @@ class Catalog:
                dataset_name=dataset_name,
            )
            rows = DatasetQuery(
-                name=dataset.name,
-                namespace_name=dataset.project.namespace.name,
-                project_name=dataset.project.name,
-                version=ds_version,
-                catalog=self,
+                name=dataset.name, version=ds_version, catalog=self
            ).to_db_records()
            indexed_sources.append(
                (
@@ -778,7 +769,6 @@ class Catalog:
    def create_dataset(
        self,
        name: str,
-        project: Optional[Project] = None,
        version: Optional[str] = None,
        *,
        columns: Sequence[Column],
@@ -798,7 +788,6 @@ class Catalog:
        If version is None, then next unused version is created.
        If version is given, then it must be an unused version.
        """
-        DatasetRecord.validate_name(name)
        assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
        if not listing and Client.is_data_source_uri(name):
            raise RuntimeError(
@@ -806,7 +795,7 @@ class Catalog:
            )
        default_version = DEFAULT_DATASET_VERSION
        try:
-            dataset = self.get_dataset(name, project)
+            dataset = self.get_dataset(name)
            default_version = dataset.next_version_patch
            if update_version == "major":
                default_version = dataset.next_version_major
@@ -831,7 +820,6 @@ class Catalog:
        }
        dataset = self.metastore.create_dataset(
            name,
-            project.id if project else None,
            feature_schema=feature_schema,
            query_script=query_script,
            schema=schema,
@@ -904,7 +892,7 @@ class Catalog:
        )

        if create_rows_table:
-            table_name = self.warehouse.dataset_table_name(dataset, version)
+            table_name = self.warehouse.dataset_table_name(dataset.name, version)
            self.warehouse.create_dataset_rows_table(table_name, columns=columns)
            self.update_dataset_version_with_warehouse_info(dataset, version)

@@ -935,13 +923,7 @@ class Catalog:

        if not dataset_version.preview:
            values["preview"] = (
-                DatasetQuery(
-                    name=dataset.name,
-                    namespace_name=dataset.project.namespace.name,
-                    project_name=dataset.project.name,
-                    version=version,
-                    catalog=self,
-                )
+                DatasetQuery(name=dataset.name, version=version, catalog=self)
                .limit(20)
                .to_db_records()
            )
@@ -967,7 +949,6 @@ class Catalog:
        # updating name must result in updating dataset table names as well
        for version in [v.version for v in dataset.versions]:
            self.warehouse.rename_dataset_table(
-                dataset,
                old_name,
                new_name,
                old_version=version,
@@ -1005,7 +986,6 @@ class Catalog:
        self,
        name: str,
        sources: list[str],
-        project: Optional[Project] = None,
        client_config=None,
        recursive=False,
    ) -> DatasetRecord:
@@ -1014,8 +994,6 @@ class Catalog:

        from datachain import read_dataset, read_storage

-        project = project or self.metastore.default_project
-
        chains = []
        for source in sources:
            if source.startswith(DATASET_PREFIX):
@@ -1028,11 +1006,10 @@ class Catalog:
        # create union of all dataset queries created from sources
        dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
        try:
-            dc = dc.settings(project=project.name, namespace=project.namespace.name)
            dc.save(name)
        except Exception as e:  # noqa: BLE001
            try:
-                ds = self.get_dataset(name, project)
+                ds = self.get_dataset(name)
                self.metastore.update_dataset_status(
                    ds,
                    DatasetStatus.FAILED,
@@ -1049,7 +1026,7 @@ class Catalog:
            except DatasetNotFoundError:
                raise e from None

-        ds = self.get_dataset(name, project)
+        ds = self.get_dataset(name)

        self.update_dataset_version_with_warehouse_info(
            ds,
@@ -1057,67 +1034,49 @@ class Catalog:
            sources="\n".join(sources),
        )

-        return self.get_dataset(name, project)
+        return self.get_dataset(name)

-    def get_dataset(
-        self, name: str, project: Optional[Project] = None
-    ) -> DatasetRecord:
-        from datachain.lib.listing import is_listing_dataset
-
-        if is_listing_dataset(name):
-            project = self.metastore.listing_project
-        return self.metastore.get_dataset(name, project.id if project else None)
+    def get_dataset(self, name: str) -> DatasetRecord:
+        return self.metastore.get_dataset(name)

    def get_dataset_with_remote_fallback(
-        self,
-        name: str,
-        namespace_name: str,
-        project_name: str,
-        version: Optional[str] = None,
+        self, name: str, version: Optional[str] = None
    ) -> DatasetRecord:
        try:
-            project = self.metastore.get_project(project_name, namespace_name)
-            ds = self.get_dataset(name, project)
+            ds = self.get_dataset(name)
            if version and not ds.has_version(version):
                raise DatasetVersionNotFoundError(
                    f"Dataset {name} does not have version {version}"
                )
            return ds

-        except (
-            ProjectNotFoundError,
-            DatasetNotFoundError,
-            DatasetVersionNotFoundError,
-        ):
+        except (DatasetNotFoundError, DatasetVersionNotFoundError):
            print("Dataset not found in local catalog, trying to get from studio")
-            remote_ds_uri = create_dataset_uri(
-                name, namespace_name, project_name, version
-            )
+
+            remote_ds_uri = f"{DATASET_PREFIX}{name}"
+            if version:
+                remote_ds_uri += f"@v{version}"

            self.pull_dataset(
                remote_ds_uri=remote_ds_uri,
                local_ds_name=name,
                local_ds_version=version,
            )
-            return self.get_dataset(
-                name, self.metastore.get_project(project_name, namespace_name)
-            )
+            return self.get_dataset(name)

    def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
        """Returns dataset that contains version with specific uuid"""
        for dataset in self.ls_datasets():
            if dataset.has_version_with_uuid(uuid):
-                return self.get_dataset(dataset.name, dataset.project)
+                return self.get_dataset(dataset.name)
        raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")

-    def get_remote_dataset(
-        self, namespace: str, project: str, name: str
-    ) -> DatasetRecord:
+    def get_remote_dataset(self, name: str) -> DatasetRecord:
        from datachain.remote.studio import StudioClient

        studio_client = StudioClient()

-        info_response = studio_client.dataset_info(namespace, project, name)
+        info_response = studio_client.dataset_info(name)
        if not info_response.ok:
            raise DataChainError(info_response.message)

@@ -1126,9 +1085,9 @@ class Catalog:
        return DatasetRecord.from_dict(dataset_info)

    def get_dataset_dependencies(
-        self, name: str, version: str, project: Optional[Project] = None, indirect=False
+        self, name: str, version: str, indirect=False
    ) -> list[Optional[DatasetDependency]]:
-        dataset = self.get_dataset(name, project)
+        dataset = self.get_dataset(name)

        direct_dependencies = self.metastore.get_direct_dataset_dependencies(
            dataset, version
@@ -1142,10 +1101,9 @@ class Catalog:
                # dependency has been removed
                continue
            if d.is_dataset:
-                project = self.metastore.get_project(d.project, d.namespace)
                # only datasets can have dependencies
                d.dependencies = self.get_dataset_dependencies(
-                    d.name, d.version, project, indirect=indirect
+                    d.name, d.version, indirect=indirect
                )

        return direct_dependencies
@@ -1155,12 +1113,9 @@ class Catalog:
        prefix: Optional[str] = None,
        include_listing: bool = False,
        studio: bool = False,
-        project: Optional[Project] = None,
    ) -> Iterator[DatasetListRecord]:
        from datachain.remote.studio import StudioClient

-        project_id = project.id if project else None
-
        if studio:
            client = StudioClient()
            response = client.ls_datasets(prefix=prefix)
@@ -1175,11 +1130,9 @@
                if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
            )
        elif prefix:
-            datasets = self.metastore.list_datasets_by_prefix(
-                prefix, project_id=project_id
-            )
+            datasets = self.metastore.list_datasets_by_prefix(prefix)
        else:
-            datasets = self.metastore.list_datasets(project_id=project_id)
+            datasets = self.metastore.list_datasets()

        for d in datasets:
            if not d.is_bucket_listing or include_listing:
@@ -1191,15 +1144,11 @@ class Catalog:
        include_listing: bool = False,
        with_job: bool = True,
        studio: bool = False,
-        project: Optional[Project] = None,
    ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
        """Iterate over all dataset versions with related jobs."""
        datasets = list(
            self.ls_datasets(
-                prefix=prefix,
-                include_listing=include_listing,
-                studio=studio,
-                project=project,
+                prefix=prefix, include_listing=include_listing, studio=studio
            )
        )

@@ -1235,7 +1184,6 @@ class Catalog:
            prefix=prefix,
            include_listing=True,
            with_job=False,
-            project=self.metastore.listing_project,
        )

        return [
@@ -1245,21 +1193,13 @@ class Catalog:
        ]

    def ls_dataset_rows(
-        self,
-        dataset: DatasetRecord,
-        version: str,
-        offset=None,
-        limit=None,
+        self, name: str, version: str, offset=None, limit=None
    ) -> list[dict]:
        from datachain.query.dataset import DatasetQuery

-        q = DatasetQuery(
-            name=dataset.name,
-            namespace_name=dataset.project.namespace.name,
-            project_name=dataset.project.name,
-            version=version,
-            catalog=self,
-        )
+        dataset = self.get_dataset(name)
+
+        q = DatasetQuery(name=dataset.name, version=version, catalog=self)
        if limit:
            q = q.limit(limit)
        if offset:
@@ -1292,29 +1232,35 @@ class Catalog:
        bucket_uri: str,
        name: str,
        version: str,
-        project: Optional[Project] = None,
        client_config=None,
    ) -> list[str]:
-        dataset = self.get_dataset(name, project)
+        dataset = self.get_dataset(name)

        return self.warehouse.export_dataset_table(
            bucket_uri, dataset, version, client_config
        )

-    def dataset_table_export_file_names(
-        self, name: str, version: str, project: Optional[Project] = None
-    ) -> list[str]:
-        dataset = self.get_dataset(name, project)
+    def dataset_table_export_file_names(self, name: str, version: str) -> list[str]:
+        dataset = self.get_dataset(name)
        return self.warehouse.dataset_table_export_file_names(dataset, version)

    def remove_dataset(
        self,
        name: str,
-        project: Optional[Project] = None,
        version: Optional[str] = None,
        force: Optional[bool] = False,
+        studio: Optional[bool] = False,
    ):
-        dataset = self.get_dataset(name, project)
+        from datachain.remote.studio import StudioClient
+
+        if studio:
+            client = StudioClient()
+            response = client.rm_dataset(name, version=version, force=force)
+            if not response.ok:
+                raise DataChainError(response.message)
+            return
+
+        dataset = self.get_dataset(name)
        if not version and not force:
            raise ValueError(f"Missing dataset version from input for dataset {name}")
        if version and not dataset.has_version(version):
@@ -1336,21 +1282,19 @@ class Catalog:
    def edit_dataset(
        self,
        name: str,
-        project: Optional[Project] = None,
        new_name: Optional[str] = None,
        description: Optional[str] = None,
        attrs: Optional[list[str]] = None,
    ) -> DatasetRecord:
        update_data = {}
        if new_name:
-            DatasetRecord.validate_name(new_name)
            update_data["name"] = new_name
        if description is not None:
            update_data["description"] = description
        if attrs is not None:
            update_data["attrs"] = attrs  # type: ignore[assignment]

-        dataset = self.get_dataset(name, project)
+        dataset = self.get_dataset(name)
        return self.update_dataset(dataset, **update_data)

    def ls(
@@ -1407,29 +1351,7 @@ class Catalog:
        except Exception as e:
            raise DataChainError("Error when parsing dataset uri") from e

-        remote_namespace, remote_project, remote_ds_name = parse_dataset_name(
-            remote_ds_name
-        )
-        if not remote_namespace or not remote_project:
-            raise DataChainError(
-                f"Invalid fully qualified dataset name {remote_ds_name}, namespace"
-                f" or project missing"
-            )
-
-        if local_ds_name:
-            local_namespace, local_project, local_ds_name = parse_dataset_name(
-                local_ds_name
-            )
-            if local_namespace and local_namespace != remote_namespace:
-                raise DataChainError(
-                    "Local namespace must be the same to remote namespace"
-                )
-            if local_project and local_project != remote_project:
-                raise DataChainError("Local project must be the same to remote project")
-
-        remote_ds = self.get_remote_dataset(
-            remote_namespace, remote_project, remote_ds_name
-        )
+        remote_ds = self.get_remote_dataset(remote_ds_name)

        try:
            # if version is not specified in uri, take the latest one
@@ -1437,12 +1359,7 @@ class Catalog:
                version = remote_ds.latest_version
                print(f"Version not specified, pulling the latest one (v{version})")
                # updating dataset uri with latest version
-                remote_ds_uri = create_dataset_uri(
-                    remote_ds.name,
-                    remote_ds.project.namespace.name,
-                    remote_ds.project.name,
-                    version,
-                )
+                remote_ds_uri = create_dataset_uri(remote_ds_name, version)
            remote_ds_version = remote_ds.get_version(version)
        except (DatasetVersionNotFoundError, StopIteration) as exc:
            raise DataChainError(
@@ -1451,13 +1368,7 @@ class Catalog:

        local_ds_name = local_ds_name or remote_ds.name
        local_ds_version = local_ds_version or remote_ds_version.version
-
-        local_ds_uri = create_dataset_uri(
-            local_ds_name,
-            remote_ds.project.namespace.name,
-            remote_ds.project.name,
-            local_ds_version,
-        )
+        local_ds_uri = create_dataset_uri(local_ds_name, local_ds_version)

        try:
            # try to find existing dataset with the same uuid to avoid pulling again
@@ -1466,10 +1377,7 @@ class Catalog:
                remote_ds_version.uuid
            )
            existing_ds_uri = create_dataset_uri(
-                existing_ds.name,
-                existing_ds.project.namespace.name,
-                existing_ds.project.name,
-                existing_ds_version.version,
+                existing_ds.name, existing_ds_version.version
            )
            if existing_ds_uri == remote_ds_uri:
                print(f"Local copy of dataset {remote_ds_uri} already present")
@@ -1483,26 +1391,8 @@ class Catalog:
        except DatasetNotFoundError:
            pass

-        # Create namespace and project if doesn't exist
-        print(
-            f"Creating namespace {remote_ds.project.namespace.name} and project"
-            f" {remote_ds.project.name}"
-        )
-
-        namespace = self.metastore.create_namespace(
-            remote_ds.project.namespace.name,
-            description=remote_ds.project.namespace.descr,
-            uuid=remote_ds.project.namespace.uuid,
-        )
-        project = self.metastore.create_project(
-            namespace.name,
-            remote_ds.project.name,
-            description=remote_ds.project.descr,
-            uuid=remote_ds.project.uuid,
-        )
-
        try:
-            local_dataset = self.get_dataset(local_ds_name, project=project)
+            local_dataset = self.get_dataset(local_ds_name)
            if local_dataset and local_dataset.has_version(local_ds_version):
                raise DataChainError(
                    f"Local dataset {local_ds_uri} already exists with different uuid,"
@@ -1524,7 +1414,6 @@ class Catalog:

        local_ds = self.create_dataset(
            local_ds_name,
-            project,
            local_ds_version,
            query_script=remote_ds_version.query_script,
            create_rows=True,
@@ -1537,7 +1426,7 @@ class Catalog:
        # asking remote to export dataset rows table to s3 and to return signed
        # urls of exported parts, which are in parquet format
        export_response = studio_client.export_dataset_table(
-            remote_ds, remote_ds_version.version
+            remote_ds_name, remote_ds_version.version
        )
        if not export_response.ok:
            raise DataChainError(export_response.message)
@@ -1568,9 +1457,9 @@ class Catalog:
            rows_fetcher = DatasetRowsFetcher(
                metastore,
                warehouse,
-                remote_ds,
+                remote_ds_name,
                remote_ds_version.version,
-                local_ds,
+                local_ds_name,
                local_ds_version,
                schema,
                progress_bar=dataset_save_progress_bar,
@@ -1580,7 +1469,7 @@ class Catalog:
                    iter(batch(signed_urls)), dataset_save_progress_bar
                )
            except:
-                self.remove_dataset(local_ds_name, project, local_ds_version)
+                self.remove_dataset(local_ds_name, local_ds_version)
                raise

        local_ds = self.metastore.update_dataset_status(
@@ -1637,11 +1526,7 @@ class Catalog:
        )

        self.create_dataset_from_sources(
-            output,
-            sources,
-            self.metastore.default_project,
-            client_config=client_config,
-            recursive=recursive,
+            output, sources, client_config=client_config, recursive=recursive
        )

    def query(
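Taken together, these hunks strip the Project/namespace indirection out of Catalog: get_dataset, remove_dataset, edit_dataset, ls_datasets and pull_dataset are keyed by dataset name again, and the Studio fallback builds plain ds:// URIs. A small sketch of the URI shape used in get_dataset_with_remote_fallback, assuming only the prefix and the @v suffix visible in the hunk (the helper itself is hypothetical):

    from typing import Optional

    DATASET_PREFIX = "ds://"

    def remote_dataset_uri(name: str, version: Optional[str] = None) -> str:
        # 0.21.0 falls back to a simple name-based URI such as "ds://animals@v1.2.3";
        # 0.20.3 required a fully qualified namespace/project dataset name instead.
        uri = f"{DATASET_PREFIX}{name}"
        if version:
            uri += f"@v{version}"
        return uri

remove_dataset also gains a studio flag that, when set, delegates the deletion to StudioClient.rm_dataset instead of touching the local metastore.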
datachain/cli/__init__.py CHANGED
@@ -152,6 +152,9 @@ def handle_dataset_command(args, catalog):
            new_name=args.new_name,
            description=args.description,
            attrs=args.attrs,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
            team=args.team,
        ),
        "ls": lambda: list_datasets(
@@ -169,6 +172,8 @@ def handle_dataset_command(args, catalog):
            version=args.version,
            force=args.force,
            studio=args.studio,
+            local=args.local,
+            all=args.all,
            team=args.team,
        ),
        "remove": lambda: rm_dataset(
@@ -177,6 +182,8 @@ def handle_dataset_command(args, catalog):
            version=args.version,
            force=args.force,
            studio=args.studio,
+            local=args.local,
+            all=args.all,
            team=args.team,
        ),
    }
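The dispatcher now forwards three extra flags to the dataset subcommands (the edit and rm/remove entries); their parser definitions presumably live in datachain/cli/parser/__init__.py, which grows by 35 lines in this release. A minimal sketch of the forwarding, with a hypothetical helper name and no assumptions beyond the argument names in the hunks above:

    from argparse import Namespace

    def dataset_scope_flags(args: Namespace) -> dict:
        # The three keyword arguments newly threaded through the edit and
        # rm/remove dataset commands in 0.21.0.
        return {"studio": args.studio, "local": args.local, "all": args.all}

    # e.g. dataset_scope_flags(Namespace(studio=False, local=True, all=False))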