datachain 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. datachain/__init__.py +3 -0
  2. datachain/catalog/catalog.py +180 -65
  3. datachain/cli/__init__.py +0 -7
  4. datachain/cli/commands/datasets.py +43 -28
  5. datachain/cli/parser/__init__.py +1 -35
  6. datachain/cli/parser/job.py +25 -0
  7. datachain/cli/parser/studio.py +11 -4
  8. datachain/data_storage/metastore.py +390 -37
  9. datachain/data_storage/schema.py +23 -1
  10. datachain/data_storage/sqlite.py +139 -7
  11. datachain/data_storage/warehouse.py +26 -7
  12. datachain/dataset.py +125 -12
  13. datachain/delta.py +9 -5
  14. datachain/error.py +36 -0
  15. datachain/lib/dataset_info.py +4 -0
  16. datachain/lib/dc/datachain.py +86 -7
  17. datachain/lib/dc/datasets.py +62 -12
  18. datachain/lib/dc/listings.py +111 -0
  19. datachain/lib/dc/records.py +1 -0
  20. datachain/lib/dc/storage.py +14 -2
  21. datachain/lib/listing.py +3 -1
  22. datachain/lib/namespaces.py +73 -0
  23. datachain/lib/projects.py +86 -0
  24. datachain/lib/settings.py +10 -0
  25. datachain/listing.py +3 -1
  26. datachain/namespace.py +65 -0
  27. datachain/project.py +78 -0
  28. datachain/query/dataset.py +71 -46
  29. datachain/query/session.py +1 -1
  30. datachain/remote/studio.py +67 -26
  31. datachain/studio.py +68 -8
  32. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/METADATA +2 -2
  33. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/RECORD +37 -33
  34. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/WHEEL +0 -0
  35. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/entry_points.txt +0 -0
  36. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/licenses/LICENSE +0 -0
  37. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/top_level.txt +0 -0
datachain/lib/dataset_info.py CHANGED
@@ -22,6 +22,8 @@ if TYPE_CHECKING:
 
 class DatasetInfo(DataModel):
     name: str
+    namespace_name: str
+    project_name: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
@@ -91,6 +93,8 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
+            namespace_name=dataset.project.namespace.name,
+            project_name=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
datachain/lib/dc/datachain.py CHANGED
@@ -24,7 +24,7 @@ from pydantic import BaseModel
 from tqdm import tqdm
 
 from datachain import semver
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, parse_dataset_name
 from datachain.delta import delta_disabled
 from datachain.func import literal
 from datachain.func.base import Function
@@ -37,6 +37,7 @@ from datachain.lib.file import (
     FileExporter,
 )
 from datachain.lib.file import ExportPlacement as FileExportPlacement
+from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
@@ -261,7 +262,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name)
+        return self.session.catalog.get_dataset(self.name, self._query.project)
 
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
@@ -312,6 +313,8 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -327,6 +330,8 @@ class DataChain:
             prefetch: number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
+            namespace: namespace name.
+            project: project name.
 
         Example:
             ```py
@@ -340,7 +345,11 @@ class DataChain:
         if sys is None:
            sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
+        settings.add(
+            Settings(
+                cache, parallel, workers, min_task_size, prefetch, namespace, project
+            )
+        )
         return self._evolve(settings=settings, _sys=sys)
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
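
Note: the new `namespace` and `project` settings are carried into `Settings` and picked up when the chain is saved. A minimal sketch of the intended usage; the `dev`/`animals` names are illustrative, not taken from the diff:

```py
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])
# Route saved datasets into a specific namespace/project via settings;
# without these, the metastore defaults are used.
chain = chain.settings(namespace="dev", project="animals")
chain.save("my_nums")  # resolved as dev.animals.my_nums
```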
@@ -490,6 +499,22 @@ class DataChain:
         )
         return listings(*args, **kwargs)
 
+    @property
+    def namespace_name(self) -> str:
+        """Current namespace name in which the chain is running"""
+        return (
+            self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+
+    @property
+    def project_name(self) -> str:
+        """Current project name in which the chain is running"""
+        return (
+            self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have
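
Note: these properties resolve in the same order the save path does — explicit settings first, then the metastore defaults. A quick illustrative check (the default value shown is an assumption based on the `local` namespace mentioned in the namespaces module below):

```py
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])
print(chain.namespace_name)  # metastore default, e.g. "local"

chain = chain.settings(namespace="dev", project="animals")
print(chain.namespace_name, chain.project_name)  # "dev" "animals"
```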
@@ -499,7 +524,12 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        return self._evolve(query=self._query.save(feature_schema=schema))
+        project = get_project(
+            self.project_name, self.namespace_name, session=self.session
+        )
+        return self._evolve(
+            query=self._query.save(project=project, feature_schema=schema)
+        )
 
     def save(  # type: ignore[override]
         self,
@@ -513,7 +543,10 @@ class DataChain:
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
-            name : dataset name.
+            name : dataset name. It can be a full name consisting of namespace and
+                project, or just a regular dataset name, in which case the namespace
+                and project are taken from the settings, if defined there, or from
+                the defaults otherwise.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
@@ -535,6 +568,21 @@ class DataChain:
                 " patch"
             )
 
+        namespace_name, project_name, name = parse_dataset_name(name)
+
+        namespace_name = (
+            namespace_name
+            or self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+        project_name = (
+            project_name
+            or self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+
+        project = get_project(project_name, namespace_name, session=self.session)
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
         # Handle retry and delta functionality
@@ -558,6 +606,7 @@ class DataChain:
             query=result_ds._query.save(
                 name=name,
                 version=version,
+                project=project,
                 feature_schema=schema,
                 dependencies=dependencies,
                 **kwargs,
@@ -577,6 +626,7 @@ class DataChain:
             query=self._query.save(
                 name=name,
                 version=version,
+                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
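
Note: a fully qualified name therefore overrides settings, which in turn override the defaults. A hedged sketch of both spellings (the namespace and project names are illustrative):

```py
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])

# Fully qualified name: namespace and project are parsed out of the string.
chain.save("dev.animals.my_nums")

# Plain name: namespace/project come from settings (or the defaults).
chain.settings(namespace="dev", project="animals").save("my_nums")
```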
@@ -2239,16 +2289,45 @@ class DataChain:
 
         Combining filters with "or"
         ```py
-        dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
+        dc.filter(
+            C("file.path").glob("cat*") |
+            C("file.path").glob("dog*")
+        )
+        ```
+
+        ```py
+        dc.filter(dc.func.or_(
+            C("file.path").glob("cat*"),
+            C("file.path").glob("dog*")
+        ))
         ```
 
         Combining filters with "and"
         ```py
         dc.filter(
-            C("file.path").glob("*.jpg) &
+            C("file.path").glob("*.jpg"),
+            string.length(C("file.path")) > 5
+        )
+        ```
+
+        ```py
+        dc.filter(
+            C("file.path").glob("*.jpg") &
             (string.length(C("file.path")) > 5)
         )
         ```
+
+        ```py
+        dc.filter(dc.func.and_(
+            C("file.path").glob("*.jpg"),
+            string.length(C("file.path")) > 5
+        ))
+        ```
+
+        Combining filters with "not"
+        ```py
+        dc.filter(~(C("file.path").glob("*.jpg")))
+        ```
         """
         return self._evolve(query=self._query.filter(*args))
 
datachain/lib/dc/datasets.py CHANGED
@@ -1,11 +1,13 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
+from datachain.dataset import parse_dataset_name
 from datachain.error import DatasetVersionNotFoundError
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     File,
 )
+from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
@@ -24,6 +26,8 @@ if TYPE_CHECKING:
 
 def read_dataset(
     name: str,
+    namespace: Optional[str] = None,
+    project: Optional[str] = None,
     version: Optional[Union[str, int]] = None,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -38,7 +42,12 @@ def read_dataset(
     If dataset or version is not found locally, it will try to pull it from Studio.
 
     Parameters:
-        name : dataset name
+        name: The dataset name, which can be a fully qualified name including the
+            namespace and project. Alternatively, it can be a regular name, in which
+            case the explicitly defined namespace and project will be used if they
+            are set; otherwise, default values will be applied.
+        namespace : optional name of the namespace containing the dataset to read
+        project : optional name of the project containing the dataset to read
         version : dataset version
         session : Session to use for the chain.
         settings : Settings to use for the chain.
@@ -86,6 +95,11 @@ def read_dataset(
         chain = dc.read_dataset("my_cats")
         ```
 
+        ```py
+        import datachain as dc
+        chain = dc.read_dataset("dev.animals.my_cats")
+        ```
+
         ```py
         chain = dc.read_dataset("my_cats", fallback_to_studio=False)
         ```
@@ -116,6 +130,15 @@ def read_dataset(
 
     from .datachain import DataChain
 
+    session = Session.get(session)
+    catalog = session.catalog
+
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = (
+        namespace_name or namespace or catalog.metastore.default_namespace_name
+    )
+    project_name = project_name or project or catalog.metastore.default_project_name
+
     if version is not None:
         try:
             # for backward compatibility we still allow users to put version as integer
@@ -125,7 +148,9 @@ def read_dataset(
             # all 2.* dataset versions). If dataset doesn't have any versions where
             # major part is equal to that input, exception is thrown.
             major = int(version)
-            dataset = Session.get(session).catalog.get_dataset(name)
+            dataset = session.catalog.get_dataset(
+                name, get_project(project_name, namespace_name, session=session)
+            )
             latest_major = dataset.latest_major_version(major)
             if not latest_major:
                 raise DatasetVersionNotFoundError(
@@ -136,19 +161,22 @@ def read_dataset(
             # version is in new semver string format, continuing as normal
             pass
 
+    if settings:
+        _settings = Settings(**settings)
+    else:
+        _settings = Settings()
+
     query = DatasetQuery(
         name=name,
+        project_name=project_name,
+        namespace_name=namespace_name,
         version=version,  # type: ignore[arg-type]
         session=session,
         indexing_column_types=File._datachain_column_types,
         fallback_to_studio=fallback_to_studio,
     )
-    telemetry.send_event_once("class", "datachain_init", name=name, version=version)
-    if settings:
-        _settings = Settings(**settings)
-    else:
-        _settings = Settings()
 
+    telemetry.send_event_once("class", "datachain_init", name=name, version=version)
     signals_schema = SignalSchema({"sys": Sys})
     if query.feature_schema:
         signals_schema |= SignalSchema.deserialize(query.feature_schema)
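
Note: for backward compatibility an integer `version` selects the latest version under that major release, while semver strings are matched exactly. A hedged sketch of both addressing styles (dataset names are illustrative):

```py
import datachain as dc

# Fully qualified name; version=2 resolves to the latest 2.x.x version.
chain = dc.read_dataset("dev.animals.my_cats", version=2)

# Equivalent addressing with explicit arguments and an exact semver string.
chain = dc.read_dataset(
    "my_cats", namespace="dev", project="animals", version="2.0.0"
)
```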
@@ -251,6 +279,8 @@ def datasets(
 
 def delete_dataset(
     name: str,
+    namespace: Optional[str] = None,
+    project: Optional[str] = None,
     version: Optional[str] = None,
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
@@ -261,11 +291,16 @@ def delete_dataset(
     a force flag.
 
     Args:
-        name : Dataset name
+        name: The dataset name, which can be a fully qualified name including the
+            namespace and project. Alternatively, it can be a regular name, in which
+            case the explicitly defined namespace and project will be used if they
+            are set; otherwise, default values will be applied.
+        namespace : optional name of the namespace containing the dataset to delete
+        project : optional name of the project containing the dataset to delete
         version : Optional dataset version
         force: If true, all datasets versions will be removed. Defaults to False.
-        studio: If True, removes dataset from Studio only,
-            otherwise remove from local. Defaults to False.
+        studio: If True, removes dataset from Studio only, otherwise removes local
+            dataset. Defaults to False.
         session: Optional session instance. If not provided, uses default session.
         in_memory: If True, creates an in-memory session. Defaults to False.
 
@@ -282,11 +317,26 @@ def delete_dataset(
         dc.delete_dataset("cats", version="1.0.0")
     ```
     """
+    from datachain.studio import remove_studio_dataset
 
     session = Session.get(session, in_memory=in_memory)
     catalog = session.catalog
+
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = (
+        namespace_name or namespace or catalog.metastore.default_namespace_name
+    )
+    project_name = project_name or project or catalog.metastore.default_project_name
+
+    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+        return remove_studio_dataset(
+            None, name, namespace_name, project_name, version=version, force=force
+        )
+
+    ds_project = get_project(project_name, namespace_name, session=session)
+
     if not force:
-        version = version or catalog.get_dataset(name).latest_version
+        version = version or catalog.get_dataset(name, ds_project).latest_version
     else:
         version = None
-    catalog.remove_dataset(name, version=version, force=force, studio=studio)
+    catalog.remove_dataset(name, ds_project, version=version, force=force)
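
Note: deletion resolves names the same way as reading. A hedged sketch (names illustrative):

```py
import datachain as dc

# A fully qualified name takes precedence over the keyword arguments.
dc.delete_dataset("dev.animals.cats", version="1.0.0")

# Equivalent, with the namespace and project passed explicitly.
dc.delete_dataset("cats", namespace="dev", project="animals", version="1.0.0")
```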
datachain/lib/dc/listings.py CHANGED
@@ -3,19 +3,58 @@ from typing import (
     Optional,
 )
 
+from datachain.lib.listing import LISTING_PREFIX, ls
 from datachain.lib.listing_info import ListingInfo
+from datachain.lib.settings import Settings
+from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
+from datachain.query.dataset import DatasetQuery, QueryStep, step_result
 
 from .values import read_values
 
 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
 
+    from datachain.dataset import DatasetVersion
+    from datachain.query.dataset import StepResult
+
     from .datachain import DataChain
 
 P = ParamSpec("P")
 
 
+class ReadOnlyQueryStep(QueryStep):
+    """
+    This step is used to read the dataset in read-only mode.
+    It is used to avoid the need to read the table metadata from the warehouse.
+    This is useful when we want to list the files in the dataset.
+    """
+
+    def apply(self) -> "StepResult":
+        import sqlalchemy as sa
+
+        def q(*columns):
+            return sa.select(*columns)
+
+        table_name = self.catalog.warehouse.dataset_table_name(
+            self.dataset, self.dataset_version
+        )
+        dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
+        table = dataset_row_cls.new_table(
+            table_name,
+            columns=(
+                [
+                    *dataset_row_cls.sys_columns(),
+                    *dataset_row_cls.listing_columns(),
+                ]
+            ),
+        )
+
+        return step_result(
+            q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
+        )
+
+
 def listings(
     session: Optional[Session] = None,
     in_memory: bool = False,
@@ -41,3 +80,75 @@ def listings(
         output={column: ListingInfo},
         **{column: catalog.listings()},  # type: ignore[arg-type]
     )
+
+
+def read_listing_dataset(
+    name: str,
+    version: Optional[str] = None,
+    path: str = "",
+    session: Optional["Session"] = None,
+    settings: Optional[dict] = None,
+) -> tuple["DataChain", "DatasetVersion"]:
+    """Read a listing dataset and return a DataChain and listing version.
+
+    Args:
+        name: Name of the dataset
+        version: Version of the dataset
+        path: Path within the listing to read. Path can have globs.
+        session: Optional Session object to use for reading
+        settings: Optional settings dictionary to use for reading
+
+    Returns:
+        tuple[DataChain, DatasetVersion]: A tuple containing:
+            - DataChain configured for listing files
+            - DatasetVersion object for the specified listing version
+
+    Example:
+        ```py
+        import datachain as dc
+        chain, listing_version = dc.read_listing_dataset(
+            "lst__s3://my-bucket/my-path", version="1.0.0", path="my-path"
+        )
+        chain.show()
+        ```
+    """
+    # Configure and return a DataChain for reading listing dataset files
+    # Uses ReadOnlyQueryStep to avoid warehouse metadata lookups
+    from datachain.lib.dc import Sys
+    from datachain.lib.file import File
+
+    from .datachain import DataChain
+
+    if not name.startswith(LISTING_PREFIX):
+        name = LISTING_PREFIX + name
+
+    session = Session.get(session)
+    dataset = session.catalog.get_dataset(name)
+    if version is None:
+        version = dataset.latest_version
+
+    query = DatasetQuery(
+        name=name,
+        session=session,
+        indexing_column_types=File._datachain_column_types,
+        fallback_to_studio=False,
+    )
+    if settings:
+        cfg = {**settings}
+        if "prefetch" not in cfg:
+            cfg["prefetch"] = 0
+        _settings = Settings(**cfg)
+    else:
+        _settings = Settings(prefetch=0)
+    signal_schema = SignalSchema({"sys": Sys, "file": File})
+
+    query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
+    query.version = version
+    # We already know that this is a listing dataset,
+    # so we can set the listing function to True
+    query.set_listing_fn(lambda: True)
+
+    chain = DataChain(query, _settings, signal_schema)
+    chain = ls(chain, path, recursive=True, column="file")
+
+    return chain, dataset.get_version(version)
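
Note: because `path` accepts globs, a narrower read can be expressed directly against an existing listing without re-listing the bucket. A hedged sketch (bucket and paths are illustrative):

```py
import datachain as dc

# The lst__ prefix is added automatically when missing.
chain, listing_version = dc.read_listing_dataset(
    "s3://my-bucket/my-path", path="my-path/*.jpg"
)
chain.show()
```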
datachain/lib/dc/records.py CHANGED
@@ -68,6 +68,7 @@ def read_records(
 
     dsr = catalog.create_dataset(
         name,
+        catalog.metastore.default_project,
         columns=columns,
         feature_schema=(
             signal_schema.clone_without_sys_signals().serialize()
datachain/lib/dc/storage.py CHANGED
@@ -144,6 +144,8 @@ def read_storage(
     catalog = session.catalog
     cache = catalog.cache
     client_config = session.catalog.client_config
+    listing_namespace_name = catalog.metastore.system_namespace_name
+    listing_project_name = catalog.metastore.listing_project_name
 
     uris = uri if isinstance(uri, (list, tuple)) else [uri]
 
@@ -167,7 +169,13 @@ def read_storage(
             )
             continue
 
-        dc = read_dataset(list_ds_name, session=session, settings=settings)
+        dc = read_dataset(
+            list_ds_name,
+            namespace=listing_namespace_name,
+            project=listing_project_name,
+            session=session,
+            settings=settings,
+        )
         dc._query.update = update
         dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})
 
@@ -182,7 +190,11 @@ def read_storage(
             settings=settings,
             in_memory=in_memory,
         )
-        .settings(prefetch=0)
+        .settings(
+            prefetch=0,
+            namespace=listing_namespace_name,
+            project=listing_project_name,
+        )
         .gen(
             list_bucket(lst_uri, cache, client_config=client_config),
             output={f"{column}": file_type},
datachain/lib/listing.py CHANGED
@@ -123,6 +123,9 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
         f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )
 
+    # we should remove dots from the name
+    ds_name = ds_name.replace(".", "_")
+
     return ds_name, lst_uri, path
 
 
@@ -195,5 +198,4 @@ def get_listing(
         list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"
 
     ds_name = listing.name if listing else ds_name
-
     return ds_name, list_uri, list_path, bool(listing)
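
Note: dots now delimit namespace and project in dataset names, which is why listing dataset names have them replaced. The effect of the substitution, shown on an illustrative name:

```py
ds_name = "lst__s3://my.bucket/data/"
ds_name = ds_name.replace(".", "_")
print(ds_name)  # lst__s3://my_bucket/data/
```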
datachain/lib/namespaces.py ADDED
@@ -0,0 +1,73 @@
+from typing import Optional
+
+from datachain.error import NamespaceCreateNotAllowedError
+from datachain.namespace import Namespace
+from datachain.query import Session
+
+
+def create(
+    name: str, description: Optional[str] = None, session: Optional[Session] = None
+) -> Namespace:
+    """
+    Creates a new custom namespace.
+    A Namespace is an object used to organize datasets. It has a name and a list of
+    Project objects underneath it. In turn, each Project can have multiple
+    datasets.
+    Note that creating namespaces is not allowed in the local environment, unlike
+    in Studio, where it is allowed.
+    In the local environment all datasets are created under the default `local`
+    namespace.
+
+    Parameters:
+        name : The name of the namespace.
+        description : A description of the namespace.
+        session : Session to use for creating the namespace.
+
+    Example:
+        ```py
+        import datachain as dc
+        namespace = dc.namespaces.create("dev", "Dev namespace")
+        ```
+    """
+    session = Session.get(session)
+
+    if not session.catalog.metastore.namespace_allowed_to_create:
+        raise NamespaceCreateNotAllowedError("Creating custom namespace is not allowed")
+
+    Namespace.validate_name(name)
+
+    return session.catalog.metastore.create_namespace(name, description)
+
+
+def get(name: str, session: Optional[Session]) -> Namespace:
+    """
+    Gets a namespace by name.
+    If the namespace is not found, a `NamespaceNotFoundError` is raised.
+
+    Parameters:
+        name : The name of the namespace.
+        session : Session to use for getting the namespace.
+
+    Example:
+        ```py
+        import datachain as dc
+        namespace = dc.get_namespace("local")
+        ```
+    """
+    session = Session.get(session)
+    return session.catalog.metastore.get_namespace(name)
+
+
+def ls(session: Optional[Session] = None) -> list[Namespace]:
+    """
+    Gets a list of all namespaces.
+
+    Parameters:
+        session : Session to use for getting namespaces.
+
+    Example:
+        ```py
+        import datachain as dc
+        namespaces = dc.namespaces.ls()
+        ```
+    """
+    return Session.get(session).catalog.metastore.list_namespaces()
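
Note: together with the new projects module (`datachain/lib/projects.py` in the file list above), the intended flow appears to be: create or fetch a namespace, then target it via settings or fully qualified dataset names. A hedged sketch, assuming a Studio-like environment where namespace creation is allowed and an `animals` project exists under `dev`:

```py
import datachain as dc

# Locally this raises NamespaceCreateNotAllowedError; in Studio it succeeds.
namespace = dc.namespaces.create("dev", "Dev namespace")

for ns in dc.namespaces.ls():
    print(ns.name)

# Datasets can then be saved under it by fully qualified name
# (assumes the "animals" project already exists under "dev").
dc.read_values(num=[1, 2, 3]).save("dev.animals.my_nums")
```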