datachain 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +3 -0
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/parser/__init__.py +1 -35
- datachain/cli/parser/job.py +25 -0
- datachain/cli/parser/studio.py +11 -4
- datachain/data_storage/metastore.py +390 -37
- datachain/data_storage/schema.py +23 -1
- datachain/data_storage/sqlite.py +139 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +125 -12
- datachain/delta.py +9 -5
- datachain/error.py +36 -0
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +86 -7
- datachain/lib/dc/datasets.py +62 -12
- datachain/lib/dc/listings.py +111 -0
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +14 -2
- datachain/lib/listing.py +3 -1
- datachain/lib/namespaces.py +73 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/settings.py +10 -0
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +67 -26
- datachain/studio.py +68 -8
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/METADATA +2 -2
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/RECORD +37 -33
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/WHEEL +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/top_level.txt +0 -0
datachain/lib/dataset_info.py
CHANGED
|
@@ -22,6 +22,8 @@ if TYPE_CHECKING:
|
|
|
22
22
|
|
|
23
23
|
class DatasetInfo(DataModel):
|
|
24
24
|
name: str
|
|
25
|
+
namespace_name: str
|
|
26
|
+
project_name: str
|
|
25
27
|
uuid: str = Field(default=str(uuid4()))
|
|
26
28
|
version: str = Field(default=DEFAULT_DATASET_VERSION)
|
|
27
29
|
status: int = Field(default=DatasetStatus.CREATED)
|
|
@@ -91,6 +93,8 @@ class DatasetInfo(DataModel):
|
|
|
91
93
|
return cls(
|
|
92
94
|
uuid=version.uuid,
|
|
93
95
|
name=dataset.name,
|
|
96
|
+
namespace_name=dataset.project.namespace.name,
|
|
97
|
+
project_name=dataset.project.name,
|
|
94
98
|
version=version.version,
|
|
95
99
|
status=version.status,
|
|
96
100
|
created_at=version.created_at,
|
datachain/lib/dc/datachain.py
CHANGED
|
@@ -24,7 +24,7 @@ from pydantic import BaseModel
|
|
|
24
24
|
from tqdm import tqdm
|
|
25
25
|
|
|
26
26
|
from datachain import semver
|
|
27
|
-
from datachain.dataset import DatasetRecord
|
|
27
|
+
from datachain.dataset import DatasetRecord, parse_dataset_name
|
|
28
28
|
from datachain.delta import delta_disabled
|
|
29
29
|
from datachain.func import literal
|
|
30
30
|
from datachain.func.base import Function
|
|
@@ -37,6 +37,7 @@ from datachain.lib.file import (
|
|
|
37
37
|
FileExporter,
|
|
38
38
|
)
|
|
39
39
|
from datachain.lib.file import ExportPlacement as FileExportPlacement
|
|
40
|
+
from datachain.lib.projects import get as get_project
|
|
40
41
|
from datachain.lib.settings import Settings
|
|
41
42
|
from datachain.lib.signal_schema import SignalSchema
|
|
42
43
|
from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
|
|
@@ -261,7 +262,7 @@ class DataChain:
|
|
|
261
262
|
"""Underlying dataset, if there is one."""
|
|
262
263
|
if not self.name:
|
|
263
264
|
return None
|
|
264
|
-
return self.session.catalog.get_dataset(self.name)
|
|
265
|
+
return self.session.catalog.get_dataset(self.name, self._query.project)
|
|
265
266
|
|
|
266
267
|
def __or__(self, other: "Self") -> "Self":
|
|
267
268
|
"""Return `self.union(other)`."""
|
|
@@ -312,6 +313,8 @@ class DataChain:
|
|
|
312
313
|
min_task_size=None,
|
|
313
314
|
prefetch: Optional[int] = None,
|
|
314
315
|
sys: Optional[bool] = None,
|
|
316
|
+
namespace: Optional[str] = None,
|
|
317
|
+
project: Optional[str] = None,
|
|
315
318
|
) -> "Self":
|
|
316
319
|
"""Change settings for chain.
|
|
317
320
|
|
|
@@ -327,6 +330,8 @@ class DataChain:
|
|
|
327
330
|
prefetch: number of workers to use for downloading files in advance.
|
|
328
331
|
This is enabled by default and uses 2 workers.
|
|
329
332
|
To disable prefetching, set it to 0.
|
|
333
|
+
namespace: namespace name.
|
|
334
|
+
project: project name.
|
|
330
335
|
|
|
331
336
|
Example:
|
|
332
337
|
```py
|
|
@@ -340,7 +345,11 @@ class DataChain:
|
|
|
340
345
|
if sys is None:
|
|
341
346
|
sys = self._sys
|
|
342
347
|
settings = copy.copy(self._settings)
|
|
343
|
-
settings.add(
|
|
348
|
+
settings.add(
|
|
349
|
+
Settings(
|
|
350
|
+
cache, parallel, workers, min_task_size, prefetch, namespace, project
|
|
351
|
+
)
|
|
352
|
+
)
|
|
344
353
|
return self._evolve(settings=settings, _sys=sys)
|
|
345
354
|
|
|
346
355
|
def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
|
|
@@ -490,6 +499,22 @@ class DataChain:
|
|
|
490
499
|
)
|
|
491
500
|
return listings(*args, **kwargs)
|
|
492
501
|
|
|
502
|
+
@property
|
|
503
|
+
def namespace_name(self) -> str:
|
|
504
|
+
"""Current namespace name in which the chain is running"""
|
|
505
|
+
return (
|
|
506
|
+
self._settings.namespace
|
|
507
|
+
or self.session.catalog.metastore.default_namespace_name
|
|
508
|
+
)
|
|
509
|
+
|
|
510
|
+
@property
|
|
511
|
+
def project_name(self) -> str:
|
|
512
|
+
"""Current project name in which the chain is running"""
|
|
513
|
+
return (
|
|
514
|
+
self._settings.project
|
|
515
|
+
or self.session.catalog.metastore.default_project_name
|
|
516
|
+
)
|
|
517
|
+
|
|
493
518
|
def persist(self) -> "Self":
|
|
494
519
|
"""Saves temporary chain that will be removed after the process ends.
|
|
495
520
|
Temporary datasets are useful for optimization, for example when we have
|
|
@@ -499,7 +524,12 @@ class DataChain:
|
|
|
499
524
|
It returns the chain itself.
|
|
500
525
|
"""
|
|
501
526
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
502
|
-
|
|
527
|
+
project = get_project(
|
|
528
|
+
self.project_name, self.namespace_name, session=self.session
|
|
529
|
+
)
|
|
530
|
+
return self._evolve(
|
|
531
|
+
query=self._query.save(project=project, feature_schema=schema)
|
|
532
|
+
)
|
|
503
533
|
|
|
504
534
|
def save( # type: ignore[override]
|
|
505
535
|
self,
|
|
@@ -513,7 +543,10 @@ class DataChain:
|
|
|
513
543
|
"""Save to a Dataset. It returns the chain itself.
|
|
514
544
|
|
|
515
545
|
Parameters:
|
|
516
|
-
name : dataset name.
|
|
546
|
+
name : dataset name. It can be a fully qualified name that includes the
|
|
547
|
+
namespace and project, or a plain dataset name, in which case the
|
|
548
|
+
namespace and project are taken from settings if they are defined there,
|
|
549
|
+
and from the defaults otherwise.
|
|
517
550
|
version : version of a dataset. If version is not specified and dataset
|
|
518
551
|
already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
|
|
519
552
|
description : description of a dataset.
|
|
@@ -535,6 +568,21 @@ class DataChain:
|
|
|
535
568
|
" patch"
|
|
536
569
|
)
|
|
537
570
|
|
|
571
|
+
namespace_name, project_name, name = parse_dataset_name(name)
|
|
572
|
+
|
|
573
|
+
namespace_name = (
|
|
574
|
+
namespace_name
|
|
575
|
+
or self._settings.namespace
|
|
576
|
+
or self.session.catalog.metastore.default_namespace_name
|
|
577
|
+
)
|
|
578
|
+
project_name = (
|
|
579
|
+
project_name
|
|
580
|
+
or self._settings.project
|
|
581
|
+
or self.session.catalog.metastore.default_project_name
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
project = get_project(project_name, namespace_name, session=self.session)
|
|
585
|
+
|
|
538
586
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
539
587
|
|
|
540
588
|
# Handle retry and delta functionality
|
|
@@ -558,6 +606,7 @@ class DataChain:
|
|
|
558
606
|
query=result_ds._query.save(
|
|
559
607
|
name=name,
|
|
560
608
|
version=version,
|
|
609
|
+
project=project,
|
|
561
610
|
feature_schema=schema,
|
|
562
611
|
dependencies=dependencies,
|
|
563
612
|
**kwargs,
|
|
@@ -577,6 +626,7 @@ class DataChain:
|
|
|
577
626
|
query=self._query.save(
|
|
578
627
|
name=name,
|
|
579
628
|
version=version,
|
|
629
|
+
project=project,
|
|
580
630
|
description=description,
|
|
581
631
|
attrs=attrs,
|
|
582
632
|
feature_schema=schema,
|
|
@@ -2239,16 +2289,45 @@ class DataChain:
|
|
|
2239
2289
|
|
|
2240
2290
|
Combining filters with "or"
|
|
2241
2291
|
```py
|
|
2242
|
-
dc.filter(
|
|
2292
|
+
dc.filter(
|
|
2293
|
+
C("file.path").glob("cat*") |
|
|
2294
|
+
C("file.path").glob("dog*")
|
|
2295
|
+
)
|
|
2296
|
+
```
|
|
2297
|
+
|
|
2298
|
+
```py
|
|
2299
|
+
dc.filter(dc.func.or_(
|
|
2300
|
+
C("file.path").glob("cat*"),
|
|
2301
|
+
C("file.path").glob("dog*")
|
|
2302
|
+
))
|
|
2243
2303
|
```
|
|
2244
2304
|
|
|
2245
2305
|
Combining filters with "and"
|
|
2246
2306
|
```py
|
|
2247
2307
|
dc.filter(
|
|
2248
|
-
C("file.path").glob("*.jpg)
|
|
2308
|
+
C("file.path").glob("*.jpg"),
|
|
2309
|
+
string.length(C("file.path")) > 5
|
|
2310
|
+
)
|
|
2311
|
+
```
|
|
2312
|
+
|
|
2313
|
+
```py
|
|
2314
|
+
dc.filter(
|
|
2315
|
+
C("file.path").glob("*.jpg") &
|
|
2249
2316
|
(string.length(C("file.path")) > 5)
|
|
2250
2317
|
)
|
|
2251
2318
|
```
|
|
2319
|
+
|
|
2320
|
+
```py
|
|
2321
|
+
dc.filter(dc.func.and_(
|
|
2322
|
+
C("file.path").glob("*.jpg"),
|
|
2323
|
+
string.length(C("file.path")) > 5
|
|
2324
|
+
))
|
|
2325
|
+
```
|
|
2326
|
+
|
|
2327
|
+
Combining filters with "not"
|
|
2328
|
+
```py
|
|
2329
|
+
dc.filter(~(C("file.path").glob("*.jpg")))
|
|
2330
|
+
```
|
|
2252
2331
|
"""
|
|
2253
2332
|
return self._evolve(query=self._query.filter(*args))
|
|
2254
2333
|
|
datachain/lib/dc/datasets.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from collections.abc import Sequence
|
|
2
2
|
from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
|
|
3
3
|
|
|
4
|
+
from datachain.dataset import parse_dataset_name
|
|
4
5
|
from datachain.error import DatasetVersionNotFoundError
|
|
5
6
|
from datachain.lib.dataset_info import DatasetInfo
|
|
6
7
|
from datachain.lib.file import (
|
|
7
8
|
File,
|
|
8
9
|
)
|
|
10
|
+
from datachain.lib.projects import get as get_project
|
|
9
11
|
from datachain.lib.settings import Settings
|
|
10
12
|
from datachain.lib.signal_schema import SignalSchema
|
|
11
13
|
from datachain.query import Session
|
|
@@ -24,6 +26,8 @@ if TYPE_CHECKING:
|
|
|
24
26
|
|
|
25
27
|
def read_dataset(
|
|
26
28
|
name: str,
|
|
29
|
+
namespace: Optional[str] = None,
|
|
30
|
+
project: Optional[str] = None,
|
|
27
31
|
version: Optional[Union[str, int]] = None,
|
|
28
32
|
session: Optional[Session] = None,
|
|
29
33
|
settings: Optional[dict] = None,
|
|
@@ -38,7 +42,12 @@ def read_dataset(
|
|
|
38
42
|
If dataset or version is not found locally, it will try to pull it from Studio.
|
|
39
43
|
|
|
40
44
|
Parameters:
|
|
41
|
-
name
|
|
45
|
+
name: The dataset name, which can be a fully qualified name including the
|
|
46
|
+
namespace and project. Alternatively, it can be a regular name, in which
|
|
47
|
+
case the explicitly defined namespace and project will be used if they are
|
|
48
|
+
set; otherwise, default values will be applied.
|
|
49
|
+
namespace : optional name of the namespace that contains the dataset to read
|
|
50
|
+
project : optional name of the project that contains the dataset to read
|
|
42
51
|
version : dataset version
|
|
43
52
|
session : Session to use for the chain.
|
|
44
53
|
settings : Settings to use for the chain.
|
|
@@ -86,6 +95,11 @@ def read_dataset(
|
|
|
86
95
|
chain = dc.read_dataset("my_cats")
|
|
87
96
|
```
|
|
88
97
|
|
|
98
|
+
```py
|
|
99
|
+
import datachain as dc
|
|
100
|
+
chain = dc.read_dataset("dev.animals.my_cats")
|
|
101
|
+
```
|
|
102
|
+
|
|
89
103
|
```py
|
|
90
104
|
chain = dc.read_dataset("my_cats", fallback_to_studio=False)
|
|
91
105
|
```
|
|
@@ -116,6 +130,15 @@ def read_dataset(
|
|
|
116
130
|
|
|
117
131
|
from .datachain import DataChain
|
|
118
132
|
|
|
133
|
+
session = Session.get(session)
|
|
134
|
+
catalog = session.catalog
|
|
135
|
+
|
|
136
|
+
namespace_name, project_name, name = parse_dataset_name(name)
|
|
137
|
+
namespace_name = (
|
|
138
|
+
namespace_name or namespace or catalog.metastore.default_namespace_name
|
|
139
|
+
)
|
|
140
|
+
project_name = project_name or project or catalog.metastore.default_project_name
|
|
141
|
+
|
|
119
142
|
if version is not None:
|
|
120
143
|
try:
|
|
121
144
|
# for backward compatibility we still allow users to put version as integer
|
|
@@ -125,7 +148,9 @@ def read_dataset(
|
|
|
125
148
|
# all 2.* dataset versions). If dataset doesn't have any versions where
|
|
126
149
|
# major part is equal to that input, exception is thrown.
|
|
127
150
|
major = int(version)
|
|
128
|
-
dataset =
|
|
151
|
+
dataset = session.catalog.get_dataset(
|
|
152
|
+
name, get_project(project_name, namespace_name, session=session)
|
|
153
|
+
)
|
|
129
154
|
latest_major = dataset.latest_major_version(major)
|
|
130
155
|
if not latest_major:
|
|
131
156
|
raise DatasetVersionNotFoundError(
|
|
@@ -136,19 +161,22 @@ def read_dataset(
|
|
|
136
161
|
# version is in new semver string format, continuing as normal
|
|
137
162
|
pass
|
|
138
163
|
|
|
164
|
+
if settings:
|
|
165
|
+
_settings = Settings(**settings)
|
|
166
|
+
else:
|
|
167
|
+
_settings = Settings()
|
|
168
|
+
|
|
139
169
|
query = DatasetQuery(
|
|
140
170
|
name=name,
|
|
171
|
+
project_name=project_name,
|
|
172
|
+
namespace_name=namespace_name,
|
|
141
173
|
version=version, # type: ignore[arg-type]
|
|
142
174
|
session=session,
|
|
143
175
|
indexing_column_types=File._datachain_column_types,
|
|
144
176
|
fallback_to_studio=fallback_to_studio,
|
|
145
177
|
)
|
|
146
|
-
telemetry.send_event_once("class", "datachain_init", name=name, version=version)
|
|
147
|
-
if settings:
|
|
148
|
-
_settings = Settings(**settings)
|
|
149
|
-
else:
|
|
150
|
-
_settings = Settings()
|
|
151
178
|
|
|
179
|
+
telemetry.send_event_once("class", "datachain_init", name=name, version=version)
|
|
152
180
|
signals_schema = SignalSchema({"sys": Sys})
|
|
153
181
|
if query.feature_schema:
|
|
154
182
|
signals_schema |= SignalSchema.deserialize(query.feature_schema)
|
|
@@ -251,6 +279,8 @@ def datasets(
|
|
|
251
279
|
|
|
252
280
|
def delete_dataset(
|
|
253
281
|
name: str,
|
|
282
|
+
namespace: Optional[str] = None,
|
|
283
|
+
project: Optional[str] = None,
|
|
254
284
|
version: Optional[str] = None,
|
|
255
285
|
force: Optional[bool] = False,
|
|
256
286
|
studio: Optional[bool] = False,
|
|
@@ -261,11 +291,16 @@ def delete_dataset(
|
|
|
261
291
|
a force flag.
|
|
262
292
|
|
|
263
293
|
Args:
|
|
264
|
-
name
|
|
294
|
+
name: The dataset name, which can be a fully qualified name including the
|
|
295
|
+
namespace and project. Alternatively, it can be a regular name, in which
|
|
296
|
+
case the explicitly defined namespace and project will be used if they are
|
|
297
|
+
set; otherwise, default values will be applied.
|
|
298
|
+
namespace : optional name of the namespace that contains the dataset to delete
|
|
299
|
+
project : optional name of the project that contains the dataset to delete
|
|
265
300
|
version : Optional dataset version
|
|
266
301
|
force: If true, all datasets versions will be removed. Defaults to False.
|
|
267
|
-
studio: If True, removes dataset from Studio only,
|
|
268
|
-
|
|
302
|
+
studio: If True, removes the dataset from Studio only; otherwise removes the
|
|
303
|
+
local dataset. Defaults to False.
|
|
269
304
|
session: Optional session instance. If not provided, uses default session.
|
|
270
305
|
in_memory: If True, creates an in-memory session. Defaults to False.
|
|
271
306
|
|
|
@@ -282,11 +317,26 @@ def delete_dataset(
|
|
|
282
317
|
dc.delete_dataset("cats", version="1.0.0")
|
|
283
318
|
```
|
|
284
319
|
"""
|
|
320
|
+
from datachain.studio import remove_studio_dataset
|
|
285
321
|
|
|
286
322
|
session = Session.get(session, in_memory=in_memory)
|
|
287
323
|
catalog = session.catalog
|
|
324
|
+
|
|
325
|
+
namespace_name, project_name, name = parse_dataset_name(name)
|
|
326
|
+
namespace_name = (
|
|
327
|
+
namespace_name or namespace or catalog.metastore.default_namespace_name
|
|
328
|
+
)
|
|
329
|
+
project_name = project_name or project or catalog.metastore.default_project_name
|
|
330
|
+
|
|
331
|
+
if not catalog.metastore.is_local_dataset(namespace_name) and studio:
|
|
332
|
+
return remove_studio_dataset(
|
|
333
|
+
None, name, namespace_name, project_name, version=version, force=force
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
ds_project = get_project(project_name, namespace_name, session=session)
|
|
337
|
+
|
|
288
338
|
if not force:
|
|
289
|
-
version = version or catalog.get_dataset(name).latest_version
|
|
339
|
+
version = version or catalog.get_dataset(name, ds_project).latest_version
|
|
290
340
|
else:
|
|
291
341
|
version = None
|
|
292
|
-
catalog.remove_dataset(name, version=version, force=force
|
|
342
|
+
catalog.remove_dataset(name, ds_project, version=version, force=force)
|
datachain/lib/dc/listings.py
CHANGED
|
@@ -3,19 +3,58 @@ from typing import (
|
|
|
3
3
|
Optional,
|
|
4
4
|
)
|
|
5
5
|
|
|
6
|
+
from datachain.lib.listing import LISTING_PREFIX, ls
|
|
6
7
|
from datachain.lib.listing_info import ListingInfo
|
|
8
|
+
from datachain.lib.settings import Settings
|
|
9
|
+
from datachain.lib.signal_schema import SignalSchema
|
|
7
10
|
from datachain.query import Session
|
|
11
|
+
from datachain.query.dataset import DatasetQuery, QueryStep, step_result
|
|
8
12
|
|
|
9
13
|
from .values import read_values
|
|
10
14
|
|
|
11
15
|
if TYPE_CHECKING:
|
|
12
16
|
from typing_extensions import ParamSpec
|
|
13
17
|
|
|
18
|
+
from datachain.dataset import DatasetVersion
|
|
19
|
+
from datachain.query.dataset import StepResult
|
|
20
|
+
|
|
14
21
|
from .datachain import DataChain
|
|
15
22
|
|
|
16
23
|
P = ParamSpec("P")
|
|
17
24
|
|
|
18
25
|
|
|
26
|
+
class ReadOnlyQueryStep(QueryStep):
|
|
27
|
+
"""
|
|
28
|
+
This step is used to read the dataset in read-only mode.
|
|
29
|
+
It is used to avoid the need to read the table metadata from the warehouse.
|
|
30
|
+
This is useful when we want to list the files in the dataset.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
def apply(self) -> "StepResult":
|
|
34
|
+
import sqlalchemy as sa
|
|
35
|
+
|
|
36
|
+
def q(*columns):
|
|
37
|
+
return sa.select(*columns)
|
|
38
|
+
|
|
39
|
+
table_name = self.catalog.warehouse.dataset_table_name(
|
|
40
|
+
self.dataset, self.dataset_version
|
|
41
|
+
)
|
|
42
|
+
dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
|
|
43
|
+
table = dataset_row_cls.new_table(
|
|
44
|
+
table_name,
|
|
45
|
+
columns=(
|
|
46
|
+
[
|
|
47
|
+
*dataset_row_cls.sys_columns(),
|
|
48
|
+
*dataset_row_cls.listing_columns(),
|
|
49
|
+
]
|
|
50
|
+
),
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
return step_result(
|
|
54
|
+
q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
|
|
19
58
|
def listings(
|
|
20
59
|
session: Optional[Session] = None,
|
|
21
60
|
in_memory: bool = False,
|
|
@@ -41,3 +80,75 @@ def listings(
|
|
|
41
80
|
output={column: ListingInfo},
|
|
42
81
|
**{column: catalog.listings()}, # type: ignore[arg-type]
|
|
43
82
|
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def read_listing_dataset(
|
|
86
|
+
name: str,
|
|
87
|
+
version: Optional[str] = None,
|
|
88
|
+
path: str = "",
|
|
89
|
+
session: Optional["Session"] = None,
|
|
90
|
+
settings: Optional[dict] = None,
|
|
91
|
+
) -> tuple["DataChain", "DatasetVersion"]:
|
|
92
|
+
"""Read a listing dataset and return a DataChain and listing version.
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
name: Name of the dataset
|
|
96
|
+
version: Version of the dataset
|
|
97
|
+
path: Path within the listing to read. Path can have globs.
|
|
98
|
+
session: Optional Session object to use for reading
|
|
99
|
+
settings: Optional settings dictionary to use for reading
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
tuple[DataChain, DatasetVersion]: A tuple containing:
|
|
103
|
+
- DataChain configured for listing files
|
|
104
|
+
- DatasetVersion object for the specified listing version
|
|
105
|
+
|
|
106
|
+
Example:
|
|
107
|
+
```py
|
|
108
|
+
import datachain as dc
|
|
109
|
+
chain, listing_version = dc.read_listing_dataset(
|
|
110
|
+
"lst__s3://my-bucket/my-path", version="1.0.0", path="my-path"
|
|
111
|
+
)
|
|
112
|
+
chain.show()
|
|
113
|
+
```
|
|
114
|
+
"""
|
|
115
|
+
# Configure and return a DataChain for reading listing dataset files
|
|
116
|
+
# Uses ReadOnlyQueryStep to avoid warehouse metadata lookups
|
|
117
|
+
from datachain.lib.dc import Sys
|
|
118
|
+
from datachain.lib.file import File
|
|
119
|
+
|
|
120
|
+
from .datachain import DataChain
|
|
121
|
+
|
|
122
|
+
if not name.startswith(LISTING_PREFIX):
|
|
123
|
+
name = LISTING_PREFIX + name
|
|
124
|
+
|
|
125
|
+
session = Session.get(session)
|
|
126
|
+
dataset = session.catalog.get_dataset(name)
|
|
127
|
+
if version is None:
|
|
128
|
+
version = dataset.latest_version
|
|
129
|
+
|
|
130
|
+
query = DatasetQuery(
|
|
131
|
+
name=name,
|
|
132
|
+
session=session,
|
|
133
|
+
indexing_column_types=File._datachain_column_types,
|
|
134
|
+
fallback_to_studio=False,
|
|
135
|
+
)
|
|
136
|
+
if settings:
|
|
137
|
+
cfg = {**settings}
|
|
138
|
+
if "prefetch" not in cfg:
|
|
139
|
+
cfg["prefetch"] = 0
|
|
140
|
+
_settings = Settings(**cfg)
|
|
141
|
+
else:
|
|
142
|
+
_settings = Settings(prefetch=0)
|
|
143
|
+
signal_schema = SignalSchema({"sys": Sys, "file": File})
|
|
144
|
+
|
|
145
|
+
query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
|
|
146
|
+
query.version = version
|
|
147
|
+
# We already know that this is a listing dataset,
|
|
148
|
+
# so we can set the listing function to True
|
|
149
|
+
query.set_listing_fn(lambda: True)
|
|
150
|
+
|
|
151
|
+
chain = DataChain(query, _settings, signal_schema)
|
|
152
|
+
chain = ls(chain, path, recursive=True, column="file")
|
|
153
|
+
|
|
154
|
+
return chain, dataset.get_version(version)
|
datachain/lib/dc/records.py
CHANGED
datachain/lib/dc/storage.py
CHANGED
|
@@ -144,6 +144,8 @@ def read_storage(
|
|
|
144
144
|
catalog = session.catalog
|
|
145
145
|
cache = catalog.cache
|
|
146
146
|
client_config = session.catalog.client_config
|
|
147
|
+
listing_namespace_name = catalog.metastore.system_namespace_name
|
|
148
|
+
listing_project_name = catalog.metastore.listing_project_name
|
|
147
149
|
|
|
148
150
|
uris = uri if isinstance(uri, (list, tuple)) else [uri]
|
|
149
151
|
|
|
@@ -167,7 +169,13 @@ def read_storage(
|
|
|
167
169
|
)
|
|
168
170
|
continue
|
|
169
171
|
|
|
170
|
-
dc = read_dataset(
|
|
172
|
+
dc = read_dataset(
|
|
173
|
+
list_ds_name,
|
|
174
|
+
namespace=listing_namespace_name,
|
|
175
|
+
project=listing_project_name,
|
|
176
|
+
session=session,
|
|
177
|
+
settings=settings,
|
|
178
|
+
)
|
|
171
179
|
dc._query.update = update
|
|
172
180
|
dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})
|
|
173
181
|
|
|
@@ -182,7 +190,11 @@ def read_storage(
|
|
|
182
190
|
settings=settings,
|
|
183
191
|
in_memory=in_memory,
|
|
184
192
|
)
|
|
185
|
-
.settings(
|
|
193
|
+
.settings(
|
|
194
|
+
prefetch=0,
|
|
195
|
+
namespace=listing_namespace_name,
|
|
196
|
+
project=listing_project_name,
|
|
197
|
+
)
|
|
186
198
|
.gen(
|
|
187
199
|
list_bucket(lst_uri, cache, client_config=client_config),
|
|
188
200
|
output={f"{column}": file_type},
|
datachain/lib/listing.py
CHANGED
|
@@ -123,6 +123,9 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
|
|
|
123
123
|
f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
|
|
124
124
|
)
|
|
125
125
|
|
|
126
|
+
# dots separate namespace, project and dataset name, so replace them in the name
|
|
127
|
+
ds_name = ds_name.replace(".", "_")
|
|
128
|
+
|
|
126
129
|
return ds_name, lst_uri, path
|
|
127
130
|
|
|
128
131
|
|
|
@@ -195,5 +198,4 @@ def get_listing(
|
|
|
195
198
|
list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"
|
|
196
199
|
|
|
197
200
|
ds_name = listing.name if listing else ds_name
|
|
198
|
-
|
|
199
201
|
return ds_name, list_uri, list_path, bool(listing)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from datachain.error import NamespaceCreateNotAllowedError
|
|
4
|
+
from datachain.namespace import Namespace
|
|
5
|
+
from datachain.query import Session
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create(
|
|
9
|
+
name: str, description: Optional[str] = None, session: Optional[Session] = None
|
|
10
|
+
) -> Namespace:
|
|
11
|
+
"""
|
|
12
|
+
Creates a new custom namespace.
|
|
13
|
+
A Namespace is an object used to organize datasets. It has a name and a list of
|
|
14
|
+
Project objects underneath it. On the other hand, each Project can have multiple
|
|
15
|
+
datasets.
|
|
16
|
+
Note that creating namespaces is not allowed in the local environment, unlike
|
|
17
|
+
in Studio, where it is allowed.
|
|
18
|
+
In the local environment, all datasets are created under the default `local` namespace.
|
|
19
|
+
|
|
20
|
+
Parameters:
|
|
21
|
+
name : The name of the namespace.
|
|
22
|
+
description : A description of the namespace.
|
|
23
|
+
session : Session to use for creating namespace.
|
|
24
|
+
|
|
25
|
+
Example:
|
|
26
|
+
```py
|
|
27
|
+
import datachain as dc
|
|
28
|
+
namespace = dc.namespaces.create("dev", "Dev namespace")
|
|
29
|
+
```
|
|
30
|
+
"""
|
|
31
|
+
session = Session.get(session)
|
|
32
|
+
|
|
33
|
+
if not session.catalog.metastore.namespace_allowed_to_create:
|
|
34
|
+
raise NamespaceCreateNotAllowedError("Creating custom namespace is not allowed")
|
|
35
|
+
|
|
36
|
+
Namespace.validate_name(name)
|
|
37
|
+
|
|
38
|
+
return session.catalog.metastore.create_namespace(name, description)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get(name: str, session: Optional[Session]) -> Namespace:
|
|
42
|
+
"""
|
|
43
|
+
Gets a namespace by name.
|
|
44
|
+
If the namespace is not found, a `NamespaceNotFoundError` is raised.
|
|
45
|
+
|
|
46
|
+
Parameters:
|
|
47
|
+
name : The name of the namespace.
|
|
48
|
+
session : Session to use for getting namespace.
|
|
49
|
+
|
|
50
|
+
Example:
|
|
51
|
+
```py
|
|
52
|
+
import datachain as dc
|
|
53
|
+
namespace = dc.get_namespace("local")
|
|
54
|
+
```
|
|
55
|
+
"""
|
|
56
|
+
session = Session.get(session)
|
|
57
|
+
return session.catalog.metastore.get_namespace(name)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def ls(session: Optional[Session] = None) -> list[Namespace]:
|
|
61
|
+
"""
|
|
62
|
+
Gets a list of all namespaces.
|
|
63
|
+
|
|
64
|
+
Parameters:
|
|
65
|
+
session : Session to use for getting namespaces.
|
|
66
|
+
|
|
67
|
+
Example:
|
|
68
|
+
```py
|
|
69
|
+
import datachain as dc
|
|
70
|
+
namespaces = dc.namespaces.ls()
|
|
71
|
+
```
|
|
72
|
+
"""
|
|
73
|
+
return Session.get(session).catalog.metastore.list_namespaces()
|