datachain 0.20.4__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.
Files changed (47)
  1. datachain/__init__.py +0 -2
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +65 -180
  4. datachain/cli/__init__.py +7 -0
  5. datachain/cli/commands/datasets.py +28 -43
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +35 -1
  8. datachain/client/fsspec.py +3 -5
  9. datachain/client/hf.py +0 -10
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +37 -405
  12. datachain/data_storage/sqlite.py +7 -136
  13. datachain/data_storage/warehouse.py +7 -26
  14. datachain/dataset.py +12 -126
  15. datachain/delta.py +7 -11
  16. datachain/error.py +0 -36
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +0 -4
  20. datachain/lib/dc/datachain.py +92 -260
  21. datachain/lib/dc/datasets.py +50 -104
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +0 -1
  24. datachain/lib/dc/storage.py +40 -38
  25. datachain/lib/file.py +23 -77
  26. datachain/lib/listing.py +1 -3
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/pytorch.py +1 -1
  29. datachain/lib/settings.py +0 -10
  30. datachain/lib/tar.py +2 -1
  31. datachain/lib/udf_signature.py +1 -1
  32. datachain/lib/webdataset.py +20 -30
  33. datachain/listing.py +1 -3
  34. datachain/query/dataset.py +46 -71
  35. datachain/query/session.py +1 -1
  36. datachain/remote/studio.py +26 -61
  37. datachain/studio.py +7 -23
  38. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
  39. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
  40. datachain/lib/namespaces.py +0 -71
  41. datachain/lib/projects.py +0 -86
  42. datachain/namespace.py +0 -65
  43. datachain/project.py +0 -78
  44. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
  45. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
  47. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/lib/settings.py CHANGED
@@ -14,16 +14,12 @@ class Settings:
         workers=None,
         min_task_size=None,
         prefetch=None,
-        namespace=None,
-        project=None,
     ):
         self._cache = cache
         self.parallel = parallel
         self._workers = workers
         self.min_task_size = min_task_size
         self.prefetch = prefetch
-        self.namespace = namespace
-        self.project = project

         if not isinstance(cache, bool) and cache is not None:
             raise SettingsError(
@@ -71,10 +67,6 @@ class Settings:
             res["workers"] = self.workers
         if self.min_task_size is not None:
             res["min_task_size"] = self.min_task_size
-        if self.namespace is not None:
-            res["namespace"] = self.namespace
-        if self.project is not None:
-            res["project"] = self.project
         return res

     def add(self, settings: "Settings"):
@@ -82,7 +74,5 @@ class Settings:
         self.parallel = settings.parallel or self.parallel
         self._workers = settings._workers or self._workers
         self.min_task_size = settings.min_task_size or self.min_task_size
-        self.namespace = settings.namespace or self.namespace
-        self.project = settings.project or self.project
         if settings.prefetch is not None:
             self.prefetch = settings.prefetch
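For orientation, a small hedged sketch of what this change means for callers: constructor arguments and to_dict keys follow the diff above, while the exact set of remaining output keys is not shown here and is assumed.

    # Illustrative sketch only: Settings no longer accepts namespace/project.
    from datachain.lib.settings import Settings

    settings = Settings(cache=True, parallel=4, workers=2, min_task_size=16)
    print(settings.to_dict())  # no "namespace"/"project" keys are ever emitted now

    # Passing the removed keyword arguments now fails at call time:
    # Settings(namespace="dev", project="demo")  -> TypeError: unexpected keyword argument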
datachain/lib/tar.py CHANGED
@@ -6,11 +6,12 @@ from datachain.lib.file import File, TarVFile


 def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
+    new_parent = parent.get_full_name()
     etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
     etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
     return File(
         source=parent.source,
-        path=f"{parent.path}/{info.name}",
+        path=f"{new_parent}/{info.name}",
         version=parent.version,
         size=info.size,
         etag=etag,
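In effect, a tar member's synthetic path is now rooted at the parent archive's full name rather than its raw path. A minimal sketch of the derivation, assuming `parent` is a datachain `File` and that `get_full_name()` returns the archive's full path within its source:

    import hashlib
    import tarfile

    def member_path_and_etag(parent, info: tarfile.TarInfo) -> tuple[str, str]:
        # Mirrors build_tar_member above: md5 of "<parent etag>-<member name>-<mtime>".
        etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
        etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
        path = f"{parent.get_full_name()}/{info.name}"  # previously f"{parent.path}/{info.name}"
        return path, etag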
datachain/lib/udf_signature.py CHANGED
@@ -16,7 +16,7 @@ class UdfSignatureError(DataChainParamsError):


 @dataclass
-class UdfSignature:  # noqa: PLW1641
+class UdfSignature:
     func: Union[Callable, UDFBase]
     params: dict[str, Union[DataType, Any]]
     output_schema: SignalSchema
datachain/lib/webdataset.py CHANGED
@@ -34,29 +34,29 @@ warnings.filterwarnings(


 class WDSError(DataChainError):
-    def __init__(self, tar_name: str, message: str):
-        super().__init__(f"WebDataset error '{tar_name}': {message}")
+    def __init__(self, tar_stream, message: str):
+        super().__init__(f"WebDataset error '{tar_stream.get_full_name()}': {message}")


 class CoreFileDuplicationError(WDSError):
-    def __init__(self, tar_name: str, file1: str, file2: str):
+    def __init__(self, tar_stream, file1: str, file2: str):
         super().__init__(
-            tar_name, f"duplication of files with core extensions: {file1}, {file2}"
+            tar_stream, f"duplication of files with core extensions: {file1}, {file2}"
         )


 class CoreFileNotFoundError(WDSError):
-    def __init__(self, tar_name: str, extensions: Sequence[str], stem: str):
+    def __init__(self, tar_stream, extensions, stem):
         super().__init__(
-            tar_name,
+            tar_stream,
             f"no files with the extensions '{','.join(extensions)}'"
             f" were found for file stem {stem}",
         )


 class UnknownFileExtensionError(WDSError):
-    def __init__(self, tar_name, name: str, ext: str):
-        super().__init__(tar_name, f"unknown extension '{ext}' for file '{name}'")
+    def __init__(self, tar_stream, name, ext):
+        super().__init__(tar_stream, f"unknown extension '{ext}' for file '{name}'")


 class WDSBasic(DataModel):
@@ -113,10 +113,10 @@ class Builder:
     def __init__(
         self,
         tar_stream: File,
-        core_extensions: Sequence[str],
+        core_extensions: list[str],
         wds_class: type[WDSBasic],
-        tar: tarfile.TarFile,
-        encoding: str = "utf-8",
+        tar,
+        encoding="utf-8",
     ):
         self._core_extensions = core_extensions
         self._tar_stream = tar_stream
@@ -145,20 +145,18 @@ class Builder:
         if ext in self._core_extensions:
             if self.state.core_file is not None:
                 raise CoreFileDuplicationError(
-                    self._tar_stream.name, file.name, self.state.core_file.name
+                    self._tar_stream, file.name, self.state.core_file.name
                 )
             self.state.core_file = file
         elif ext in self.state.data:
             raise WDSError(
-                self._tar_stream.name,
+                self._tar_stream,
                 f"file with extension '.{ext}' already exists in the archive",
             )
         else:
             type_ = self._get_type(ext)
             if type_ is None:
-                raise UnknownFileExtensionError(
-                    self._tar_stream.name, fstream.name, ext
-                )
+                raise UnknownFileExtensionError(self._tar_stream, fstream.name, ext)

             if issubclass(type_, WDSReadableSubclass):
                 reader = type_._reader
@@ -167,7 +165,7 @@ class Builder:

             if reader is None:
                 raise WDSError(
-                    self._tar_stream.name,
+                    self._tar_stream,
                     f"unable to find a reader for type {type_}, extension .{ext}",
                 )
             self.state.data[ext] = reader(self, file)
@@ -175,7 +173,7 @@ class Builder:
     def produce(self):
         if self.state.core_file is None:
             raise CoreFileNotFoundError(
-                self._tar_stream.name, self._core_extensions, self.state.stem
+                self._tar_stream, self._core_extensions, self.state.stem
             )

         file = build_tar_member(self._tar_stream, self.state.core_file)
@@ -196,13 +194,7 @@ class Builder:
         return anno


-def get_tar_groups(
-    stream: File,
-    tar: tarfile.TarFile,
-    core_extensions: Sequence[str],
-    spec: type[WDSBasic],
-    encoding: str = "utf-8",
-) -> Iterator[WDSBasic]:
+def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
    builder = Builder(stream, core_extensions, spec, tar, encoding)

    for item in sorted(tar.getmembers(), key=lambda m: Path(m.name).stem):
@@ -218,11 +210,9 @@ def get_tar_groups(


 def process_webdataset(
-    core_extensions: Sequence[str] = ("jpg", "png"),
-    spec: type[WDSBasic] = WDSAllFile,
-    encoding: str = "utf-8",
-) -> Callable[[File], Iterator]:
-    def wds_func(file: File) -> Iterator[spec]:  # type: ignore[valid-type]
+    core_extensions: Sequence[str] = ("jpg", "png"), spec=WDSAllFile, encoding="utf-8"
+) -> Callable:
+    def wds_func(file: File) -> Iterator[spec]:
         with file.open() as fd:
             with tarfile.open(fileobj=fd) as tar:
                 yield from get_tar_groups(file, tar, core_extensions, spec, encoding)
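The entry point is unchanged; only the error classes now receive the tar `File` object (so messages can call `get_full_name()`) instead of a bare tar name. A hedged usage sketch, with the chain method names assumed rather than taken from this diff:

    # Rough sketch: expand WebDataset tar shards into per-sample records.
    import datachain as dc
    from datachain.lib.webdataset import process_webdataset

    chain = (
        dc.read_storage("s3://my-bucket/shards/")         # assumed bucket layout
        .filter(dc.C("file.path").glob("*.tar"))
        .gen(wds=process_webdataset(), params="file")     # default spec is WDSAllFile
    )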
datachain/listing.py CHANGED
@@ -66,9 +66,7 @@ class Listing:
     @cached_property
     def dataset(self) -> "DatasetRecord":
         assert self.dataset_name
-        return self.metastore.get_dataset(
-            self.dataset_name, self.metastore.listing_project.id
-        )
+        return self.metastore.get_dataset(self.dataset_name)

     @cached_property
     def dataset_rows(self):
datachain/query/dataset.py CHANGED
@@ -41,13 +41,12 @@ from datachain.data_storage.schema import (
     partition_col_names,
     partition_columns,
 )
-from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
+from datachain.dataset import DATASET_PREFIX, DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
-from datachain.project import Project
 from datachain.query.schema import C, UDFParamSpec, normalize_param
 from datachain.query.session import Session
 from datachain.query.udf import UdfInfo
@@ -84,7 +83,7 @@ PartitionByType = Union[
     Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
 ]
 JoinPredicateType = Union[str, ColumnClause, ColumnElement]
-DatasetDependencyType = tuple["DatasetRecord", str]
+DatasetDependencyType = tuple[str, str]

 logger = logging.getLogger("datachain")

@@ -170,17 +169,18 @@ class QueryStep:
     """A query that returns all rows from specific dataset version"""

     catalog: "Catalog"
-    dataset: "DatasetRecord"
+    dataset_name: str
     dataset_version: str

     def apply(self) -> "StepResult":
         def q(*columns):
             return sqlalchemy.select(*columns)

-        dr = self.catalog.warehouse.dataset_rows(self.dataset, self.dataset_version)
+        dataset = self.catalog.get_dataset(self.dataset_name)
+        dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)

         return step_result(
-            q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
+            q, dr.columns, dependencies=[(self.dataset_name, self.dataset_version)]
         )


@@ -1095,8 +1095,6 @@ class DatasetQuery:
         self,
         name: str,
         version: Optional[str] = None,
-        project_name: Optional[str] = None,
-        namespace_name: Optional[str] = None,
         catalog: Optional["Catalog"] = None,
         session: Optional[Session] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
@@ -1130,38 +1128,33 @@ class DatasetQuery:
         if version:
             self.version = version

-        namespace_name = namespace_name or self.catalog.metastore.default_namespace_name
-        project_name = project_name or self.catalog.metastore.default_project_name
-
-        if is_listing_dataset(name) and not version:
-            # not setting query step yet as listing dataset might not exist at
-            # this point
-            self.list_ds_name = name
+        if is_listing_dataset(name):
+            if version:
+                # this listing dataset should already be listed as we specify
+                # exact version
+                self._set_starting_step(self.catalog.get_dataset(name))
+            else:
+                # not setting query step yet as listing dataset might not exist at
+                # this point
+                self.list_ds_name = name
         elif fallback_to_studio and is_token_set():
             self._set_starting_step(
-                self.catalog.get_dataset_with_remote_fallback(
-                    name,
-                    namespace_name=namespace_name,
-                    project_name=project_name,
-                    version=version,
-                )
+                self.catalog.get_dataset_with_remote_fallback(name, version)
             )
         else:
-            project = self.catalog.metastore.get_project(project_name, namespace_name)
-            self._set_starting_step(self.catalog.get_dataset(name, project=project))
+            self._set_starting_step(self.catalog.get_dataset(name))

     def _set_starting_step(self, ds: "DatasetRecord") -> None:
         if not self.version:
             self.version = ds.latest_version

-        self.starting_step = QueryStep(self.catalog, ds, self.version)
+        self.starting_step = QueryStep(self.catalog, ds.name, self.version)

         # at this point we know our starting dataset so setting up schemas
         self.feature_schema = ds.get_version(self.version).feature_schema
         self.column_types = copy(ds.schema)
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
-        self.project = ds.project

     def __iter__(self):
         return iter(self.db_results())
@@ -1169,6 +1162,21 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)

+    def pull_dataset(self, name: str, version: Optional[str] = None) -> "DatasetRecord":
+        print("Dataset not found in local catalog, trying to get from studio")
+
+        remote_ds_uri = f"{DATASET_PREFIX}{name}"
+        if version:
+            remote_ds_uri += f"@v{version}"
+
+        self.catalog.pull_dataset(
+            remote_ds_uri=remote_ds_uri,
+            local_ds_name=name,
+            local_ds_version=version,
+        )
+
+        return self.catalog.get_dataset(name)
+
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(
@@ -1649,8 +1657,6 @@ class DatasetQuery:
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
         partition_by: Optional[PartitionByType] = None,
-        namespace: Optional[str] = None,
-        project: Optional[str] = None,
         cache: bool = False,
     ) -> "Self":
         query = self.clone()
@@ -1670,36 +1676,26 @@

     def _add_dependencies(self, dataset: "DatasetRecord", version: str):
         dependencies: set[DatasetDependencyType] = set()
-        for dep_dataset, dep_dataset_version in self.dependencies:
-            if Session.is_temp_dataset(dep_dataset.name):
+        for dep_name, dep_version in self.dependencies:
+            if Session.is_temp_dataset(dep_name):
                 # temp dataset are created for optimization and they will be removed
                 # afterwards. Therefore, we should not put them as dependencies, but
                 # their own direct dependencies
                 for dep in self.catalog.get_dataset_dependencies(
-                    dep_dataset.name,
-                    dep_dataset_version,
-                    dep_dataset.project,
-                    indirect=False,
+                    dep_name, dep_version, indirect=False
                 ):
                     if dep:
-                        dep_project = self.catalog.metastore.get_project(
-                            dep.project, dep.namespace
-                        )
-                        dependencies.add(
-                            (
-                                self.catalog.get_dataset(dep.name, dep_project),
-                                dep.version,
-                            )
-                        )
+                        dependencies.add((dep.name, dep.version))
             else:
-                dependencies.add((dep_dataset, dep_dataset_version))
+                dependencies.add((dep_name, dep_version))

-        for dep_dataset, dep_dataset_version in dependencies:
+        for dep_name, dep_version in dependencies:
+            # ds_dependency_name, ds_dependency_version = dependency
             self.catalog.metastore.add_dataset_dependency(
-                dataset,
+                dataset.name,
                 version,
-                dep_dataset,
-                dep_dataset_version,
+                dep_name,
+                dep_version,
             )

     def exec(self) -> "Self":
@@ -1715,7 +1711,6 @@
         self,
         name: Optional[str] = None,
         version: Optional[str] = None,
-        project: Optional[Project] = None,
         feature_schema: Optional[dict] = None,
         dependencies: Optional[list[DatasetDependency]] = None,
         description: Optional[str] = None,
@@ -1724,13 +1719,8 @@
         **kwargs,
     ) -> "Self":
         """Save the query as a dataset."""
-        project = project or self.catalog.metastore.default_project
         try:
-            if (
-                name
-                and version
-                and self.catalog.get_dataset(name, project).has_version(version)
-            ):
+            if name and version and self.catalog.get_dataset(name).has_version(version):
                 raise RuntimeError(f"Dataset {name} already has version {version}")
         except DatasetNotFoundError:
             pass
@@ -1755,7 +1745,6 @@

         dataset = self.catalog.create_dataset(
             name,
-            project,
             version=version,
             feature_schema=feature_schema,
             columns=columns,
@@ -1781,25 +1770,11 @@

             if dependencies:
                 # overriding dependencies
-                self.dependencies = set()
-                for dep in dependencies:
-                    dep_project = self.catalog.metastore.get_project(
-                        dep.project, dep.namespace
-                    )
-                    self.dependencies.add(
-                        (self.catalog.get_dataset(dep.name, dep_project), dep.version)
-                    )
-
+                self.dependencies = {(dep.name, dep.version) for dep in dependencies}
             self._add_dependencies(dataset, version)  # type: ignore [arg-type]
         finally:
             self.cleanup()
-        return self.__class__(
-            name=name,
-            namespace_name=project.namespace.name,
-            project_name=project.name,
-            version=version,
-            catalog=self.catalog,
-        )
+        return self.__class__(name=name, version=version, catalog=self.catalog)

     @property
     def is_ordered(self) -> bool:
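To make the Studio fallback concrete: the new `pull_dataset` builds the remote URI from the dataset prefix plus `name@vVERSION` and then re-reads the dataset locally. A hedged sketch of just the URI construction; the prefix value itself is an assumption, not shown in this diff:

    from typing import Optional

    from datachain.dataset import DATASET_PREFIX  # e.g. "ds://" (assumed value)

    def remote_dataset_uri(name: str, version: Optional[str] = None) -> str:
        # Mirrors the naming used by DatasetQuery.pull_dataset above.
        uri = f"{DATASET_PREFIX}{name}"
        if version:
            uri += f"@v{version}"
        return uri

    print(remote_dataset_uri("my-dataset", "1.0.0"))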
datachain/query/session.py CHANGED
@@ -108,7 +108,7 @@ class Session:
         prefix = self.get_temp_prefix()
         try:
             for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
-                self.catalog.remove_dataset(dataset.name, dataset.project, force=True)
+                self.catalog.remove_dataset(dataset.name, force=True)
         # suppress error when metastore has been reset during testing
         except TableMissingError:
             pass
datachain/remote/studio.py CHANGED
@@ -17,7 +17,6 @@ import websockets
 from requests.exceptions import HTTPError, Timeout

 from datachain.config import Config
-from datachain.dataset import DatasetRecord
 from datachain.error import DataChainError
 from datachain.utils import STUDIO_URL, retry_with_backoff

@@ -37,33 +36,13 @@ logger = logging.getLogger("datachain")
 DATASET_ROWS_CHUNK_SIZE = 8192


-def get_studio_env_variable(name: str) -> Any:
-    """
-    Get the value of a DataChain Studio environment variable.
-    It first checks for the variable prefixed with 'DATACHAIN_STUDIO_',
-    then checks for the deprecated 'DVC_STUDIO_' prefix.
-    If neither is set, it returns the provided default value.
-    """
-    if (value := os.environ.get(f"DATACHAIN_STUDIO_{name}")) is not None:
-        return value
-    if (value := os.environ.get(f"DVC_STUDIO_{name}")) is not None:  # deprecated
-        logger.warning(
-            "Environment variable 'DVC_STUDIO_%s' is deprecated, "
-            "use 'DATACHAIN_STUDIO_%s' instead.",
-            name,
-            name,
-        )
-        return value
-    return None
-
-
 def _is_server_error(status_code: int) -> bool:
     return str(status_code).startswith("5")


 def is_token_set() -> bool:
     return (
-        bool(get_studio_env_variable("TOKEN"))
+        bool(os.environ.get("DVC_STUDIO_TOKEN"))
         or Config().read().get("studio", {}).get("token") is not None
     )

@@ -99,12 +78,12 @@ class StudioClient:

     @property
     def token(self) -> str:
-        token = get_studio_env_variable("TOKEN") or self.config.get("token")
+        token = os.environ.get("DVC_STUDIO_TOKEN") or self.config.get("token")

         if not token:
             raise DataChainError(
                 "Studio token is not set. Use `datachain auth login` "
-                "or environment variable `DATACHAIN_STUDIO_TOKEN` to set it."
+                "or environment variable `DVC_STUDIO_TOKEN` to set it."
             )

         return token
@@ -112,8 +91,8 @@ class StudioClient:
     @property
     def url(self) -> str:
         return (
-            get_studio_env_variable("URL") or self.config.get("url") or STUDIO_URL
-        ).rstrip("/") + "/api"
+            os.environ.get("DVC_STUDIO_URL") or self.config.get("url") or STUDIO_URL
+        ) + "/api"

     @property
     def config(self) -> dict:
@@ -128,13 +107,13 @@ class StudioClient:
         return self._team

     def _get_team(self) -> str:
-        team = get_studio_env_variable("TEAM") or self.config.get("team")
+        team = os.environ.get("DVC_STUDIO_TEAM") or self.config.get("team")

         if not team:
             raise DataChainError(
                 "Studio team is not set. "
                 "Use `datachain auth team <team_name>` "
-                "or environment variable `DATACHAIN_STUDIO_TEAM` to set it. "
+                "or environment variable `DVC_STUDIO_TEAM` to set it. "
                 "You can also set `studio.team` in the config file."
             )

@@ -312,17 +291,13 @@ class StudioClient:
     def edit_dataset(
         self,
         name: str,
-        namespace: str,
-        project: str,
         new_name: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
     ) -> Response[DatasetInfoData]:
         body = {
             "new_name": new_name,
-            "name": name,
-            "namespace": namespace,
-            "project": project,
+            "dataset_name": name,
             "description": description,
             "attrs": attrs,
         }
@@ -335,44 +310,44 @@ class StudioClient:
     def rm_dataset(
         self,
         name: str,
-        namespace: str,
-        project: str,
         version: Optional[str] = None,
         force: Optional[bool] = False,
     ) -> Response[DatasetInfoData]:
         return self._send_request(
             "datachain/datasets",
             {
-                "name": name,
-                "namespace": namespace,
-                "project": project,
-                "version": version,
+                "dataset_name": name,
+                "dataset_version": version,
                 "force": force,
             },
             method="DELETE",
         )

-    def dataset_info(
-        self, namespace: str, project: str, name: str
-    ) -> Response[DatasetInfoData]:
+    def dataset_info(self, name: str) -> Response[DatasetInfoData]:
         def _parse_dataset_info(dataset_info):
             _parse_dates(dataset_info, ["created_at", "finished_at"])
             for version in dataset_info.get("versions"):
                 _parse_dates(version, ["created_at"])
-            _parse_dates(dataset_info.get("project"), ["created_at"])
-            _parse_dates(dataset_info.get("project").get("namespace"), ["created_at"])

             return dataset_info

         response = self._send_request(
-            "datachain/datasets/info",
-            {"namespace": namespace, "project": project, "name": name},
-            method="GET",
+            "datachain/datasets/info", {"dataset_name": name}, method="GET"
         )
         if response.ok:
             response.data = _parse_dataset_info(response.data)
         return response

+    def dataset_rows_chunk(
+        self, name: str, version: str, offset: int
+    ) -> Response[DatasetRowsData]:
+        req_data = {"dataset_name": name, "dataset_version": version}
+        return self._send_request_msgpack(
+            "datachain/datasets/rows",
+            {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
+            method="GET",
+        )
+
     def dataset_job_versions(self, job_id: str) -> Response[DatasetJobVersionsData]:
         return self._send_request(
             "datachain/datasets/dataset_job_versions",
@@ -381,30 +356,20 @@ class StudioClient:
         )

     def export_dataset_table(
-        self, dataset: DatasetRecord, version: str
+        self, name: str, version: str
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
             "datachain/datasets/export",
-            {
-                "namespace": dataset.project.namespace.name,
-                "project": dataset.project.name,
-                "name": dataset.name,
-                "version": version,
-            },
+            {"dataset_name": name, "dataset_version": version},
             method="GET",
         )

     def dataset_export_status(
-        self, dataset: DatasetRecord, version: str
+        self, name: str, version: str
     ) -> Response[DatasetExportStatus]:
         return self._send_request(
             "datachain/datasets/export-status",
-            {
-                "namespace": dataset.project.namespace.name,
-                "project": dataset.project.name,
-                "name": dataset.name,
-                "version": version,
-            },
+            {"dataset_name": name, "dataset_version": version},
             method="GET",
         )
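Configuration after this change, as a hedged sketch: the client consults the `DVC_STUDIO_*` environment variables (or the `studio.*` config keys) directly, and the `DATACHAIN_STUDIO_*` fallback no longer exists.

    # Illustrative only; the values below are placeholders.
    import os

    os.environ["DVC_STUDIO_TOKEN"] = "isat_example_token"
    os.environ["DVC_STUDIO_TEAM"] = "my-team"
    # os.environ["DVC_STUDIO_URL"] = "https://studio.example.com"  # optional override

    from datachain.remote.studio import is_token_set

    print(is_token_set())  # True: DATACHAIN_STUDIO_TOKEN would no longer be consulted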