datachain 0.21.0__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as potentially problematic by the registry.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +4 -9
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +422 -37
- datachain/data_storage/sqlite.py +136 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +126 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +260 -92
- datachain/lib/dc/datasets.py +104 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +36 -10
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datasets.py
CHANGED

@@ -1,11 +1,17 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

-from datachain.…
+from datachain.dataset import parse_dataset_name
+from datachain.error import (
+    DatasetNotFoundError,
+    DatasetVersionNotFoundError,
+    ProjectNotFoundError,
+)
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     File,
 )
+from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session

@@ -24,12 +30,18 @@ if TYPE_CHECKING:

 def read_dataset(
     name: str,
+    namespace: Optional[str] = None,
+    project: Optional[str] = None,
     version: Optional[Union[str, int]] = None,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
     delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] = …
+    delta_on: Optional[Union[str, Sequence[str]]] = (
+        "file.path",
+        "file.etag",
+        "file.version",
+    ),
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,

@@ -38,47 +50,36 @@ def read_dataset(
     If dataset or version is not found locally, it will try to pull it from Studio.

     Parameters:
-        name …
+        name: The dataset name, which can be a fully qualified name including the
+            namespace and project. Alternatively, it can be a regular name, in which
+            case the explicitly defined namespace and project will be used if they are
+            set; otherwise, default values will be applied.
+        namespace : optional name of namespace in which dataset to read is created
+        project : optional name of project in which dataset to read is created
         version : dataset version
         session : Session to use for the chain.
         settings : Settings to use for the chain.
         fallback_to_studio : Try to pull dataset from Studio if not found locally.
             Default is True.
-        delta: If …
-            … (removed lines truncated in the diff source)
-        delta_result_on: A list of fields in the resulting dataset that correspond
-            to the `delta_on` fields from the source.
-            This is needed to identify rows that have changed in the source but are
-            already present in the current version of the resulting dataset, in order
-            to avoid including outdated versions of those rows in the new dataset.
-            We retain only the latest versions of rows to prevent duplication.
-            There is no need to define this if the `delta_on` fields are present in
-            the final dataset and have not been renamed.
-        delta_compare: A list of fields used to check if the same row has been modified
-            in the new version of the source.
-            If not defined, all fields except those defined in delta_on will be used.
-        delta_retry: Specifies retry behavior for delta processing. If a string,
-            it's the name of a field in the result dataset that indicates an error
-            when not None - records with errors will be reprocessed. If True,
-            records that exist in the source dataset but not in the result dataset
-            will be reprocessed.
+        delta: If True, only process new or changed files instead of reprocessing
+            everything. This saves time by skipping files that were already processed in
+            previous versions. The optimization is working when a new version of the
+            dataset is created.
+            Default is False.
+        delta_on: Field(s) that uniquely identify each record in the source data.
+            Used to detect which records are new or changed.
+            Default is ("file.path", "file.etag", "file.version").
+        delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+            Only needed if you rename the identifying fields during processing.
+            Default is None.
+        delta_compare: Field(s) used to detect if a record has changed.
+            If not specified, all fields except `delta_on` fields are used.
+            Default is None.
+        delta_retry: Controls retry behavior for failed records:
+            - String (field name): Reprocess records where this field is not empty
+              (error mode)
+            - True: Reprocess records missing from the result dataset (missing mode)
+            - None: No retry processing (default)

     Example:
         ```py

@@ -86,6 +87,11 @@ def read_dataset(
         chain = dc.read_dataset("my_cats")
         ```

+        ```py
+        import datachain as dc
+        chain = dc.read_dataset("dev.animals.my_cats")
+        ```
+
         ```py
         chain = dc.read_dataset("my_cats", fallback_to_studio=False)
         ```

@@ -116,6 +122,15 @@ def read_dataset(

     from .datachain import DataChain

+    session = Session.get(session)
+    catalog = session.catalog
+
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = (
+        namespace_name or namespace or catalog.metastore.default_namespace_name
+    )
+    project_name = project_name or project or catalog.metastore.default_project_name
+
     if version is not None:
         try:
             # for backward compatibility we still allow users to put version as integer

@@ -125,7 +140,15 @@ def read_dataset(
             # all 2.* dataset versions). If dataset doesn't have any versions where
             # major part is equal to that input, exception is thrown.
             major = int(version)
-            …
+            try:
+                ds_project = get_project(project_name, namespace_name, session=session)
+            except ProjectNotFoundError:
+                raise DatasetNotFoundError(
+                    f"Dataset {name} not found in namespace {namespace_name} and",
+                    f" project {project_name}",
+                ) from None
+
+            dataset = session.catalog.get_dataset(name, ds_project)
             latest_major = dataset.latest_major_version(major)
             if not latest_major:
                 raise DatasetVersionNotFoundError(

@@ -136,19 +159,22 @@ def read_dataset(
             # version is in new semver string format, continuing as normal
             pass

+    if settings:
+        _settings = Settings(**settings)
+    else:
+        _settings = Settings()
+
     query = DatasetQuery(
         name=name,
+        project_name=project_name,
+        namespace_name=namespace_name,
         version=version,  # type: ignore[arg-type]
         session=session,
         indexing_column_types=File._datachain_column_types,
         fallback_to_studio=fallback_to_studio,
     )
-    telemetry.send_event_once("class", "datachain_init", name=name, version=version)
-    if settings:
-        _settings = Settings(**settings)
-    else:
-        _settings = Settings()

+    telemetry.send_event_once("class", "datachain_init", name=name, version=version)
     signals_schema = SignalSchema({"sys": Sys})
     if query.feature_schema:
         signals_schema |= SignalSchema.deserialize(query.feature_schema)
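Taken together, these read_dataset changes let a dataset be addressed either by a fully qualified "namespace.project.name" string or by explicit namespace/project arguments, with catalog defaults applied otherwise. A minimal sketch of the two spellings (the "dev" and "animals" names are illustrative, as in the docstring example above):

```py
import datachain as dc

# Fully qualified name: namespace "dev", project "animals", dataset "my_cats".
chain = dc.read_dataset("dev.animals.my_cats")

# Same dataset, with namespace and project passed explicitly; a plain name with
# neither argument falls back to the catalog's default namespace and project.
chain = dc.read_dataset("my_cats", namespace="dev", project="animals")
```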
@@ -200,7 +226,7 @@ def datasets(
         import datachain as dc

         chain = dc.datasets(column="dataset")
-        for ds in chain.…
+        for ds in chain.to_iter("dataset"):
             print(f"{ds.name}@v{ds.version}")
         ```
     """
@@ -251,6 +277,8 @@ def datasets(

 def delete_dataset(
     name: str,
+    namespace: Optional[str] = None,
+    project: Optional[str] = None,
     version: Optional[str] = None,
     force: Optional[bool] = False,
     studio: Optional[bool] = False,

@@ -261,11 +289,16 @@ def delete_dataset(
     a force flag.

     Args:
-        name …
+        name: The dataset name, which can be a fully qualified name including the
+            namespace and project. Alternatively, it can be a regular name, in which
+            case the explicitly defined namespace and project will be used if they are
+            set; otherwise, default values will be applied.
+        namespace : optional name of namespace in which dataset to delete is created
+        project : optional name of project in which dataset to delete is created
         version : Optional dataset version
         force: If true, all datasets versions will be removed. Defaults to False.
-        studio: If True, removes dataset from Studio only, …
-            …
+        studio: If True, removes dataset from Studio only, otherwise removes local
+            dataset. Defaults to False.
         session: Optional session instance. If not provided, uses default session.
         in_memory: If True, creates an in-memory session. Defaults to False.

@@ -282,11 +315,32 @@ def delete_dataset(
         dc.delete_dataset("cats", version="1.0.0")
         ```
     """
+    from datachain.studio import remove_studio_dataset

     session = Session.get(session, in_memory=in_memory)
     catalog = session.catalog
+
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = (
+        namespace_name or namespace or catalog.metastore.default_namespace_name
+    )
+    project_name = project_name or project or catalog.metastore.default_project_name
+
+    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+        return remove_studio_dataset(
+            None, name, namespace_name, project_name, version=version, force=force
+        )
+
+    try:
+        ds_project = get_project(project_name, namespace_name, session=session)
+    except ProjectNotFoundError:
+        raise DatasetNotFoundError(
+            f"Dataset {name} not found in namespace {namespace_name} and project",
+            f" {project_name}",
+        ) from None
+
     if not force:
-        version = version or catalog.get_dataset(name).latest_version
+        version = version or catalog.get_dataset(name, ds_project).latest_version
     else:
         version = None
-    catalog.remove_dataset(name, version=version, force=force…
+    catalog.remove_dataset(name, ds_project, version=version, force=force)
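delete_dataset gains the same namespace/project resolution before it removes anything, locally or in Studio. A short sketch under the same illustrative names:

```py
import datachain as dc

# Remove a single version of a dataset addressed by its fully qualified name.
dc.delete_dataset("dev.animals.my_cats", version="1.0.0")

# Remove all versions of a dataset in an explicitly named namespace and project.
dc.delete_dataset("my_cats", namespace="dev", project="animals", force=True)
```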
datachain/lib/dc/listings.py
CHANGED

@@ -37,7 +37,7 @@ class ReadOnlyQueryStep(QueryStep):
             return sa.select(*columns)

         table_name = self.catalog.warehouse.dataset_table_name(
-            self.…
+            self.dataset, self.dataset_version
         )
         dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
         table = dataset_row_cls.new_table(

@@ -51,7 +51,7 @@ class ReadOnlyQueryStep(QueryStep):
         )

         return step_result(
-            q, table.columns, dependencies=[(self.…
+            q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
         )

@@ -142,7 +142,7 @@ def read_listing_dataset(
     _settings = Settings(prefetch=0)
     signal_schema = SignalSchema({"sys": Sys, "file": File})

-    query.starting_step = ReadOnlyQueryStep(query.catalog, …
+    query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
     query.version = version
     # We already know that this is a listing dataset,
     # so we can set the listing function to True
datachain/lib/dc/records.py
CHANGED
datachain/lib/dc/storage.py
CHANGED

@@ -35,7 +35,11 @@ def read_storage(
     update: bool = False,
     anon: bool = False,
     delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] = …
+    delta_on: Optional[Union[str, Sequence[str]]] = (
+        "file.path",
+        "file.etag",
+        "file.version",
+    ),
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
@@ -54,43 +58,25 @@ def read_storage(
         update : force storage reindexing. Default is False.
         anon : If True, we will treat cloud bucket as public one
         client_config : Optional client configuration for the storage client.
-        delta: If …
-            … (removed lines truncated in the diff source)
-        delta_result_on: A list of fields in the resulting dataset that correspond
-            to the `delta_on` fields from the source.
-            This is needed to identify rows that have changed in the source but are
-            already present in the current version of the resulting dataset, in order
-            to avoid including outdated versions of those rows in the new dataset.
-            We retain only the latest versions of rows to prevent duplication.
-            There is no need to define this if the `delta_on` fields are present in
-            the final dataset and have not been renamed.
-        delta_compare: A list of fields used to check if the same row has been modified
-            in the new version of the source.
-            If not defined, all fields except those defined in `delta_on` will be used.
-        delta_retry: Controls which records to reprocess. Can be:
-            - A string specifying a field name: Records where this field is not None
-              will be reprocessed (error checking mode).
-            - True: Records that exist in the source dataset but not in the result
-              dataset (based on delta_on/delta_result_on fields) will be reprocessed
-              (missing records mode).
-            - False or None: No retry processing.
+        delta: If True, only process new or changed files instead of reprocessing
+            everything. This saves time by skipping files that were already processed in
+            previous versions. The optimization is working when a new version of the
+            dataset is created.
+            Default is False.
+        delta_on: Field(s) that uniquely identify each record in the source data.
+            Used to detect which records are new or changed.
+            Default is ("file.path", "file.etag", "file.version").
+        delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+            Only needed if you rename the identifying fields during processing.
+            Default is None.
+        delta_compare: Field(s) used to detect if a record has changed.
+            If not specified, all fields except `delta_on` fields are used.
+            Default is None.
+        delta_retry: Controls retry behavior for failed records:
+            - String (field name): Reprocess records where this field is not empty
+              (error mode)
+            - True: Reprocess records missing from the result dataset (missing mode)
+            - None: No retry processing (default)

     Returns:
         DataChain: A DataChain object containing the file information.
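The delta parameters read as a recipe: identify records with `delta_on`, compare them with `delta_compare`, and pick a retry mode with `delta_retry`. A hedged sketch of one way they might be combined on re-runs; the bucket URI, the `error_of` helper, and the result dataset name are illustrative, not from the package:

```py
import datachain as dc

def error_of(file: dc.File) -> str:
    # Illustrative per-file work; return "" on success or an error message.
    try:
        file.get_file_stem()
        return ""
    except Exception as exc:  # noqa: BLE001
        return str(exc)

chain = (
    dc.read_storage(
        "s3://example-bucket/images/",  # illustrative URI
        delta=True,                     # only new or changed files on re-runs
        delta_on=("file.path", "file.etag", "file.version"),
        delta_retry="error",            # also re-run rows whose `error` field is set
    )
    .map(error=error_of)
    .save("images_processed")           # saving a new version triggers the optimization
)
```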
@@ -144,6 +130,8 @@ def read_storage(
     catalog = session.catalog
     cache = catalog.cache
     client_config = session.catalog.client_config
+    listing_namespace_name = catalog.metastore.system_namespace_name
+    listing_project_name = catalog.metastore.listing_project_name

     uris = uri if isinstance(uri, (list, tuple)) else [uri]

@@ -167,7 +155,13 @@ def read_storage(
             )
             continue

-        dc = read_dataset(…
+        dc = read_dataset(
+            list_ds_name,
+            namespace=listing_namespace_name,
+            project=listing_project_name,
+            session=session,
+            settings=settings,
+        )
         dc._query.update = update
         dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

@@ -182,7 +176,11 @@ def read_storage(
                 settings=settings,
                 in_memory=in_memory,
             )
-            .settings(…
+            .settings(
+                prefetch=0,
+                namespace=listing_namespace_name,
+                project=listing_project_name,
+            )
             .gen(
                 list_bucket(lst_uri, cache, client_config=client_config),
                 output={f"{column}": file_type},
datachain/lib/file.py
CHANGED

@@ -5,13 +5,14 @@ import json
 import logging
 import os
 import posixpath
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
 from contextlib import contextmanager
 from datetime import datetime
 from functools import partial
 from io import BytesIO
-from pathlib import Path, PurePosixPath
+from pathlib import Path, PurePath, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname

@@ -69,7 +70,7 @@ class FileExporter(NodesThreadPool):
         for task in done:
             task.result()

-    def do_task(self, file):
+    def do_task(self, file: "File"):
         file.export(
             self.output,
             self.placement,

@@ -274,8 +275,8 @@ class File(DataModel):

     @field_validator("path", mode="before")
     @classmethod
-    def validate_path(cls, path):
-        return …
+    def validate_path(cls, path: str) -> str:
+        return PurePath(path).as_posix() if path else ""

     def model_dump_custom(self):
         res = self.model_dump()

@@ -337,11 +338,11 @@ class File(DataModel):
         return cls(**{key: row[key] for key in cls._datachain_column_types})

     @property
-    def name(self):
+    def name(self) -> str:
         return PurePosixPath(self.path).name

     @property
-    def parent(self):
+    def parent(self) -> str:
         return str(PurePosixPath(self.path).parent)

     @contextmanager

@@ -391,7 +392,7 @@ class File(DataModel):

         client.upload(self.read(), destination)

-    def _symlink_to(self, destination: str):
+    def _symlink_to(self, destination: str) -> None:
         if self.location:
             raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")

@@ -400,7 +401,7 @@ class File(DataModel):
             source = self.get_local_path()
             assert source, "File was not cached"
         elif self.source.startswith("file://"):
-            source = self.…
+            source = self.get_fs_path()
         else:
             raise OSError(errno.EXDEV, "can't link across filesystems")
@@ -481,27 +482,62 @@ class File(DataModel):

     def get_file_ext(self):
         """Returns last part of file name without `.`."""
-        return PurePosixPath(self.path).suffix.…
+        return PurePosixPath(self.path).suffix.lstrip(".")

     def get_file_stem(self):
         """Returns file name without extension."""
         return PurePosixPath(self.path).stem

     def get_full_name(self):
-        """…
+        """
+        [DEPRECATED] Use `file.path` directly instead.
+
+        Returns name with parent directories.
+        """
+        warnings.warn(
+            "file.get_full_name() is deprecated and will be removed "
+            "in a future version. Use `file.path` directly.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.path

-    def …
+    def get_path_normalized(self) -> str:
+        if not self.path:
+            raise FileError("path must not be empty", self.source, self.path)
+
+        if self.path.endswith("/"):
+            raise FileError("path must not be a directory", self.source, self.path)
+
+        normpath = os.path.normpath(self.path)
+        normpath = PurePath(normpath).as_posix()
+
+        if normpath == ".":
+            raise FileError("path must not be a directory", self.source, self.path)
+
+        if any(part == ".." for part in PurePath(normpath).parts):
+            raise FileError("path must not contain '..'", self.source, self.path)
+
+        return normpath
+
+    def get_uri(self) -> str:
         """Returns file URI."""
-        return f"{self.source}/{self.…
+        return f"{self.source}/{self.get_path_normalized()}"

-    def …
-        """…
+    def get_fs_path(self) -> str:
+        """
+        Returns file path with respect to the filescheme.
+
+        If `normalize` is True, the path is normalized to remove any redundant
+        separators and up-level references.
+
+        If the file scheme is "file", the path is converted to a local file path
+        using `url2pathname`. Otherwise, the original path with scheme is returned.
+        """
         path = unquote(self.get_uri())
-
-        if …
-            path = …
-        path = url2pathname(path)
+        path_parsed = urlparse(path)
+        if path_parsed.scheme == "file":
+            path = url2pathname(path_parsed.path)
         return path

     def get_destination_path(
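`get_path_normalized()` is the new gatekeeper that `get_uri()`, `get_fs_path()`, and the `fullpath` placement build on. A rough illustration of the behaviour it enforces, using hypothetical File values:

```py
from datachain.lib.file import File

f = File(source="s3://bucket", path="dir/./sub//img.jpg")
print(f.get_path_normalized())  # dir/sub/img.jpg  (redundant separators collapsed)
print(f.get_uri())              # s3://bucket/dir/sub/img.jpg

# Empty paths, trailing "/", paths that normalize to ".", or paths containing ".."
# now raise FileError instead of yielding a malformed URI:
# File(source="s3://bucket", path="a/../../secret").get_path_normalized()
```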
@@ -516,7 +552,7 @@ class File(DataModel):
         elif placement == "etag":
             path = f"{self.etag}{self.get_file_suffix()}"
         elif placement == "fullpath":
-            path = unquote(self.…
+            path = unquote(self.get_path_normalized())
             source = urlparse(self.source)
             if source.scheme and source.scheme != "file":
                 path = posixpath.join(source.netloc, path)

@@ -554,8 +590,9 @@ class File(DataModel):
         ) from e

         try:
-            …
-            …
+            normalized_path = self.get_path_normalized()
+            info = client.fs.info(client.get_full_path(normalized_path))
+            converted_info = client.info_to_file(info, normalized_path)
             return type(self)(
                 path=self.path,
                 source=self.source,

@@ -566,8 +603,17 @@ class File(DataModel):
                 last_modified=converted_info.last_modified,
                 location=self.location,
             )
+        except FileError as e:
+            logger.warning(
+                "File error when resolving %s/%s: %s", self.source, self.path, str(e)
+            )
         except (FileNotFoundError, PermissionError, OSError) as e:
-            logger.warning(…
+            logger.warning(
+                "File system error when resolving %s/%s: %s",
+                self.source,
+                self.path,
+                str(e),
+            )

         return type(self)(
             path=self.path,

@@ -583,6 +629,8 @@ class File(DataModel):

 def resolve(file: File) -> File:
     """
+    [DEPRECATED] Use `file.resolve()` directly instead.
+
     Resolve a File object by checking its existence and updating its metadata.

     This function is a wrapper around the File.resolve() method, designed to be

@@ -598,6 +646,12 @@ def resolve(file: File) -> File:
         RuntimeError: If the file's catalog is not set or if
             the file source protocol is unsupported.
     """
+    warnings.warn(
+        "resolve() is deprecated and will be removed "
+        "in a future version. Use file.resolve() directly.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     return file.resolve()

@@ -945,7 +999,7 @@ class ArrowRow(DataModel):
             ds = dataset(path, **self.kwargs)

         else:
-            path = self.file.…
+            path = self.file.get_fs_path()
             ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)

         return ds.take([self.index]).to_reader()
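Two spellings are deprecated rather than removed in this release: `File.get_full_name()` and the module-level `resolve()` wrapper. A small migration sketch; the File values are illustrative:

```py
import warnings

from datachain.lib.file import File

file = File(source="s3://bucket", path="images/cat.jpg")

# The old call still works but now emits DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert file.get_full_name() == file.path
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Preferred spellings going forward:
full_name = file.path  # instead of file.get_full_name()
# file.resolve()       # instead of resolve(file); needs the file's catalog to be set
```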
datachain/lib/listing.py
CHANGED

@@ -123,6 +123,9 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
         f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )

+    # we should remove dots from the name
+    ds_name = ds_name.replace(".", "_")
+
     return ds_name, lst_uri, path

@@ -195,5 +198,4 @@ def get_listing(
         list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"

     ds_name = listing.name if listing else ds_name
-
     return ds_name, list_uri, list_path, bool(listing)
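Because dots now separate namespace, project, and dataset name (see read_dataset above), listing dataset names derived from storage URIs must not keep them. A tiny sketch of the effect; the `lst__` prefix shown is a hypothetical stand-in for `LISTING_PREFIX`:

```py
# Hypothetical generated listing name for a bucket whose host contains dots:
ds_name = "lst__s3://my.bucket.example/data/"
ds_name = ds_name.replace(".", "_")
print(ds_name)  # lst__s3://my_bucket_example/data/
```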
datachain/lib/meta_formats.py
CHANGED

@@ -106,7 +106,7 @@ def read_meta(  # noqa: C901
     from datachain import read_storage

     if schema_from:
-        file = …
+        file = read_storage(schema_from, type="text").limit(1).to_values("file")[0]
         model_code = gen_datamodel_code(
             file, format=format, jmespath=jmespath, model_name=model_name
         )