datachain 0.20.3__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +0 -2
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +65 -180
- datachain/cli/__init__.py +7 -0
- datachain/cli/commands/datasets.py +28 -43
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +35 -1
- datachain/client/fsspec.py +3 -5
- datachain/client/hf.py +0 -10
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +37 -403
- datachain/data_storage/sqlite.py +7 -139
- datachain/data_storage/warehouse.py +7 -26
- datachain/dataset.py +12 -126
- datachain/delta.py +7 -11
- datachain/error.py +0 -36
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +0 -4
- datachain/lib/dc/datachain.py +92 -259
- datachain/lib/dc/datasets.py +49 -87
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +0 -1
- datachain/lib/dc/storage.py +40 -38
- datachain/lib/file.py +23 -77
- datachain/lib/listing.py +1 -3
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +0 -10
- datachain/lib/tar.py +2 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +20 -30
- datachain/listing.py +1 -3
- datachain/query/dataset.py +46 -71
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +26 -61
- datachain/studio.py +7 -23
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
- datachain/lib/namespaces.py +0 -71
- datachain/lib/projects.py +0 -86
- datachain/namespace.py +0 -65
- datachain/project.py +0 -78
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datasets.py
CHANGED
@@ -1,13 +1,11 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

-from datachain.dataset import parse_dataset_name
 from datachain.error import DatasetVersionNotFoundError
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     File,
 )
-from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
@@ -26,18 +24,12 @@ if TYPE_CHECKING:

 def read_dataset(
     name: str,
-    namespace: Optional[str] = None,
-    project: Optional[str] = None,
     version: Optional[Union[str, int]] = None,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
     delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] = (
-        "file.path",
-        "file.etag",
-        "file.version",
-    ),
+    delta_on: Optional[Union[str, Sequence[str]]] = None,
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
@@ -46,36 +38,47 @@ def read_dataset(
     If dataset or version is not found locally, it will try to pull it from Studio.

     Parameters:
-        name:
-            namespace and project. Alternatively, it can be a regular name, in which
-            case the explicitly defined namespace and project will be used if they are
-            set; otherwise, default values will be applied.
-        namespace : optional name of namespace in which dataset to read is created
-        project : optional name of project in which dataset to read is created
+        name : dataset name
         version : dataset version
         session : Session to use for the chain.
         settings : Settings to use for the chain.
         fallback_to_studio : Try to pull dataset from Studio if not found locally.
             Default is True.
-        delta: If True,
-
-
-            dataset
-
-
-
-
-
-
-
-
-
-
-
-
+        delta: If set to True, we optimize the creation of new dataset versions by
+            calculating the diff between the latest version of this storage and the
+            version used to create the most recent version of the resulting chain
+            dataset (the one specified in `.save()`). We then run the "diff" chain
+            using only the diff data, rather than the entire storage data, and merge
+            that diff chain with the latest version of the resulting dataset to create
+            a new version. This approach avoids applying modifications to all records
+            from storage every time, which can be an expensive operation.
+            The diff is calculated using the `DataChain.compare()` method, which
+            compares the `delta_on` fields to find matches and checks the compare
+            fields to determine if a record has changed. Note that this process only
+            considers added and modified records in storage; deleted records are not
+            removed from the new dataset version.
+            This calculation is based on the difference between the current version
+            of the source and the version used to create the dataset.
+        delta_on: A list of fields that uniquely identify rows in the source.
+            If two rows have the same values, they are considered the same (e.g., they
+            could be different versions of the same row in a versioned source).
+            This is used in the delta update to calculate the diff.
+        delta_result_on: A list of fields in the resulting dataset that correspond
+            to the `delta_on` fields from the source.
+            This is needed to identify rows that have changed in the source but are
+            already present in the current version of the resulting dataset, in order
+            to avoid including outdated versions of those rows in the new dataset.
+            We retain only the latest versions of rows to prevent duplication.
+            There is no need to define this if the `delta_on` fields are present in
+            the final dataset and have not been renamed.
+        delta_compare: A list of fields used to check if the same row has been modified
+            in the new version of the source.
+            If not defined, all fields except those defined in delta_on will be used.
+        delta_retry: Specifies retry behavior for delta processing. If a string,
+            it's the name of a field in the result dataset that indicates an error
+            when not None - records with errors will be reprocessed. If True,
+            records that exist in the source dataset but not in the result dataset
+            will be reprocessed.

     Example:
         ```py
@@ -83,11 +86,6 @@ def read_dataset(
         chain = dc.read_dataset("my_cats")
         ```

-        ```py
-        import datachain as dc
-        chain = dc.read_dataset("dev.animals.my_cats")
-        ```
-
         ```py
         chain = dc.read_dataset("my_cats", fallback_to_studio=False)
         ```
@@ -118,15 +116,6 @@ def read_dataset(

     from .datachain import DataChain

-    session = Session.get(session)
-    catalog = session.catalog
-
-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = (
-        namespace_name or namespace or catalog.metastore.default_namespace_name
-    )
-    project_name = project_name or project or catalog.metastore.default_project_name
-
     if version is not None:
         try:
             # for backward compatibility we still allow users to put version as integer
@@ -136,9 +125,7 @@
             # all 2.* dataset versions). If dataset doesn't have any versions where
             # major part is equal to that input, exception is thrown.
             major = int(version)
-            dataset = session.catalog.get_dataset(
-                name, get_project(project_name, namespace_name, session=session)
-            )
+            dataset = Session.get(session).catalog.get_dataset(name)
             latest_major = dataset.latest_major_version(major)
             if not latest_major:
                 raise DatasetVersionNotFoundError(
@@ -149,22 +136,19 @@
             # version is in new semver string format, continuing as normal
             pass

-    if settings:
-        _settings = Settings(**settings)
-    else:
-        _settings = Settings()
-
     query = DatasetQuery(
         name=name,
-        project_name=project_name,
-        namespace_name=namespace_name,
         version=version,  # type: ignore[arg-type]
         session=session,
         indexing_column_types=File._datachain_column_types,
         fallback_to_studio=fallback_to_studio,
     )
-
     telemetry.send_event_once("class", "datachain_init", name=name, version=version)
+    if settings:
+        _settings = Settings(**settings)
+    else:
+        _settings = Settings()
+
     signals_schema = SignalSchema({"sys": Sys})
     if query.feature_schema:
         signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -216,7 +200,7 @@ def datasets(
         import datachain as dc

         chain = dc.datasets(column="dataset")
-        for ds in chain.
+        for ds in chain.collect("dataset"):
             print(f"{ds.name}@v{ds.version}")
         ```
     """
@@ -267,8 +251,6 @@ def datasets(

 def delete_dataset(
     name: str,
-    namespace: Optional[str] = None,
-    project: Optional[str] = None,
     version: Optional[str] = None,
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
@@ -279,16 +261,11 @@ def delete_dataset(
         a force flag.

     Args:
-        name:
-            namespace and project. Alternatively, it can be a regular name, in which
-            case the explicitly defined namespace and project will be used if they are
-            set; otherwise, default values will be applied.
-        namespace : optional name of namespace in which dataset to delete is created
-        project : optional name of project in which dataset to delete is created
+        name : Dataset name
         version : Optional dataset version
         force: If true, all datasets versions will be removed. Defaults to False.
-        studio: If True, removes dataset from Studio only,
-
+        studio: If True, removes dataset from Studio only,
+            otherwise remove from local. Defaults to False.
         session: Optional session instance. If not provided, uses default session.
         in_memory: If True, creates an in-memory session. Defaults to False.

@@ -305,26 +282,11 @@ def delete_dataset(
         dc.delete_dataset("cats", version="1.0.0")
         ```
     """
-    from datachain.studio import remove_studio_dataset

     session = Session.get(session, in_memory=in_memory)
     catalog = session.catalog
-
-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = (
-        namespace_name or namespace or catalog.metastore.default_namespace_name
-    )
-    project_name = project_name or project or catalog.metastore.default_project_name
-
-    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
-        return remove_studio_dataset(
-            None, name, namespace_name, project_name, version=version, force=force
-        )
-
-    ds_project = get_project(project_name, namespace_name, session=session)
-
     if not force:
-        version = version or catalog.get_dataset(name
+        version = version or catalog.get_dataset(name).latest_version
     else:
         version = None
-    catalog.remove_dataset(name,
+    catalog.remove_dataset(name, version=version, force=force, studio=studio)
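Two usage notes on this file: `delta_on` no longer defaults to `("file.path", "file.etag", "file.version")`, so callers that relied on the implicit default must now pass key fields explicitly, and the `namespace`/`project` arguments are gone. A minimal sketch of the resulting 0.21.0 API, based on the docstrings above; the dataset names are illustrative:

```py
import datachain as dc

# Build the next version of "cats_clean" incrementally: with delta=True only
# rows added or changed in "cats_source" since the previous save are
# reprocessed, then merged with the latest "cats_clean" version.
chain = dc.read_dataset(
    "cats_source",
    delta=True,
    delta_on=["file.path", "file.etag"],  # fields that uniquely identify a row
)
chain.save("cats_clean")

# Drop only the latest version; force=True would remove every version.
dc.delete_dataset("cats_clean")
```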
datachain/lib/dc/listings.py
CHANGED
@@ -37,7 +37,7 @@ class ReadOnlyQueryStep(QueryStep):
             return sa.select(*columns)

         table_name = self.catalog.warehouse.dataset_table_name(
-            self.
+            self.dataset_name, self.dataset_version
         )
         dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
         table = dataset_row_cls.new_table(
@@ -51,7 +51,7 @@ class ReadOnlyQueryStep(QueryStep):
         )

         return step_result(
-            q, table.columns, dependencies=[(self.
+            q, table.columns, dependencies=[(self.dataset_name, self.dataset_version)]
         )


@@ -142,7 +142,7 @@ def read_listing_dataset(
     _settings = Settings(prefetch=0)
     signal_schema = SignalSchema({"sys": Sys, "file": File})

-    query.starting_step = ReadOnlyQueryStep(query.catalog,
+    query.starting_step = ReadOnlyQueryStep(query.catalog, name, version)
     query.version = version
     # We already know that this is a listing dataset,
     # so we can set the listing function to True
datachain/lib/dc/records.py
CHANGED
datachain/lib/dc/storage.py
CHANGED
@@ -35,11 +35,7 @@ def read_storage(
     update: bool = False,
     anon: bool = False,
     delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] = (
-        "file.path",
-        "file.etag",
-        "file.version",
-    ),
+    delta_on: Optional[Union[str, Sequence[str]]] = None,
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
@@ -58,25 +54,43 @@ def read_storage(
         update : force storage reindexing. Default is False.
         anon : If True, we will treat cloud bucket as public one
         client_config : Optional client configuration for the storage client.
-        delta: If True,
-
-
-            dataset
-
-
-
-
-
-
-
-
-
-
-
-
+        delta: If set to True, we optimize the creation of new dataset versions by
+            calculating the diff between the latest version of this storage and the
+            version used to create the most recent version of the resulting chain
+            dataset (the one specified in `.save()`). We then run the "diff" chain
+            using only the diff data, rather than the entire storage data, and merge
+            that diff chain with the latest version of the resulting dataset to create
+            a new version. This approach avoids applying modifications to all records
+            from storage every time, which can be an expensive operation.
+            The diff is calculated using the `DataChain.compare()` method, which
+            compares the `delta_on` fields to find matches and checks the compare
+            fields to determine if a record has changed. Note that this process only
+            considers added and modified records in storage; deleted records are not
+            removed from the new dataset version.
+            This calculation is based on the difference between the current version
+            of the source and the version used to create the dataset.
+        delta_on: A list of fields that uniquely identify rows in the source.
+            If two rows have the same values, they are considered the same (e.g., they
+            could be different versions of the same row in a versioned source).
+            This is used in the delta update to calculate the diff.
+        delta_result_on: A list of fields in the resulting dataset that correspond
+            to the `delta_on` fields from the source.
+            This is needed to identify rows that have changed in the source but are
+            already present in the current version of the resulting dataset, in order
+            to avoid including outdated versions of those rows in the new dataset.
+            We retain only the latest versions of rows to prevent duplication.
+            There is no need to define this if the `delta_on` fields are present in
+            the final dataset and have not been renamed.
+        delta_compare: A list of fields used to check if the same row has been modified
+            in the new version of the source.
+            If not defined, all fields except those defined in `delta_on` will be used.
+        delta_retry: Controls which records to reprocess. Can be:
+            - A string specifying a field name: Records where this field is not None
+              will be reprocessed (error checking mode).
+            - True: Records that exist in the source dataset but not in the result
+              dataset (based on delta_on/delta_result_on fields) will be reprocessed
+              (missing records mode).
+            - False or None: No retry processing.

     Returns:
         DataChain: A DataChain object containing the file information.
@@ -130,8 +144,6 @@ def read_storage(
     catalog = session.catalog
     cache = catalog.cache
     client_config = session.catalog.client_config
-    listing_namespace_name = catalog.metastore.system_namespace_name
-    listing_project_name = catalog.metastore.listing_project_name

     uris = uri if isinstance(uri, (list, tuple)) else [uri]

@@ -155,13 +167,7 @@
             )
             continue

-        dc = read_dataset(
-            list_ds_name,
-            namespace=listing_namespace_name,
-            project=listing_project_name,
-            session=session,
-            settings=settings,
-        )
+        dc = read_dataset(list_ds_name, session=session, settings=settings)
         dc._query.update = update
         dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

@@ -176,11 +182,7 @@
             settings=settings,
             in_memory=in_memory,
         )
-        .settings(
-            prefetch=0,
-            namespace=listing_namespace_name,
-            project=listing_project_name,
-        )
+        .settings(prefetch=0)
         .gen(
             list_bucket(lst_uri, cache, client_config=client_config),
             output={f"{column}": file_type},
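A sketch of the two `delta_retry` modes documented above; the bucket path and the `error` field name are illustrative:

```py
import datachain as dc

# Error-checking mode: rows of the result dataset whose `error` field is not
# None are reprocessed on the next run.
chain = dc.read_storage(
    "s3://my-bucket/images/",
    delta=True,
    delta_on=["file.path"],
    delta_retry="error",
)

# Missing-records mode: rows present in the source but absent from the result
# dataset (matched via delta_on/delta_result_on) are reprocessed.
chain = dc.read_storage("s3://my-bucket/images/", delta=True, delta_retry=True)
```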
datachain/lib/file.py
CHANGED
@@ -5,14 +5,13 @@ import json
 import logging
 import os
 import posixpath
-import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
 from contextlib import contextmanager
 from datetime import datetime
 from functools import partial
 from io import BytesIO
-from pathlib import Path,
+from pathlib import Path, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
@@ -70,7 +69,7 @@ class FileExporter(NodesThreadPool):
         for task in done:
             task.result()

-    def do_task(self, file
+    def do_task(self, file):
         file.export(
             self.output,
             self.placement,
@@ -275,8 +274,8 @@ class File(DataModel):

     @field_validator("path", mode="before")
     @classmethod
-    def validate_path(cls, path
-        return
+    def validate_path(cls, path):
+        return Path(path).as_posix() if path else ""

     def model_dump_custom(self):
         res = self.model_dump()
@@ -338,11 +337,11 @@
         return cls(**{key: row[key] for key in cls._datachain_column_types})

     @property
-    def name(self)
+    def name(self):
         return PurePosixPath(self.path).name

     @property
-    def parent(self)
+    def parent(self):
         return str(PurePosixPath(self.path).parent)

     @contextmanager
@@ -392,7 +391,7 @@

         client.upload(self.read(), destination)

-    def _symlink_to(self, destination: str)
+    def _symlink_to(self, destination: str):
         if self.location:
             raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")

@@ -401,7 +400,7 @@
             source = self.get_local_path()
             assert source, "File was not cached"
         elif self.source.startswith("file://"):
-            source = self.
+            source = self.get_path()
         else:
             raise OSError(errno.EXDEV, "can't link across filesystems")

@@ -482,62 +481,27 @@

     def get_file_ext(self):
         """Returns last part of file name without `.`."""
-        return PurePosixPath(self.path).suffix.
+        return PurePosixPath(self.path).suffix.strip(".")

     def get_file_stem(self):
         """Returns file name without extension."""
         return PurePosixPath(self.path).stem

     def get_full_name(self):
-        """
-        [DEPRECATED] Use `file.path` directly instead.
-
-        Returns name with parent directories.
-        """
-        warnings.warn(
-            "file.get_full_name() is deprecated and will be removed "
-            "in a future version. Use `file.path` directly.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
+        """Returns name with parent directories."""
         return self.path

-    def
-        if not self.path:
-            raise FileError("path must not be empty", self.source, self.path)
-
-        if self.path.endswith("/"):
-            raise FileError("path must not be a directory", self.source, self.path)
-
-        normpath = os.path.normpath(self.path)
-        normpath = PurePath(normpath).as_posix()
-
-        if normpath == ".":
-            raise FileError("path must not be a directory", self.source, self.path)
-
-        if any(part == ".." for part in PurePath(normpath).parts):
-            raise FileError("path must not contain '..'", self.source, self.path)
-
-        return normpath
-
-    def get_uri(self) -> str:
+    def get_uri(self):
         """Returns file URI."""
-        return f"{self.source}/{self.
+        return f"{self.source}/{self.get_full_name()}"

-    def
-        """
-        Returns file path with respect to the filescheme.
-
-        If `normalize` is True, the path is normalized to remove any redundant
-        separators and up-level references.
-
-        If the file scheme is "file", the path is converted to a local file path
-        using `url2pathname`. Otherwise, the original path with scheme is returned.
-        """
+    def get_path(self) -> str:
+        """Returns file path."""
         path = unquote(self.get_uri())
-
-        if
-            path =
+        source = urlparse(self.source)
+        if source.scheme == "file":
+            path = urlparse(path).path
+            path = url2pathname(path)
         return path

     def get_destination_path(
@@ -552,7 +516,7 @@
         elif placement == "etag":
             path = f"{self.etag}{self.get_file_suffix()}"
         elif placement == "fullpath":
-            path = unquote(self.
+            path = unquote(self.get_full_name())
             source = urlparse(self.source)
             if source.scheme and source.scheme != "file":
                 path = posixpath.join(source.netloc, path)
@@ -590,9 +554,8 @@
             ) from e

         try:
-
-
-            converted_info = client.info_to_file(info, normalized_path)
+            info = client.fs.info(client.get_full_path(self.path))
+            converted_info = client.info_to_file(info, self.path)
             return type(self)(
                 path=self.path,
                 source=self.source,
@@ -603,17 +566,8 @@
                 last_modified=converted_info.last_modified,
                 location=self.location,
             )
-        except FileError as e:
-            logger.warning(
-                "File error when resolving %s/%s: %s", self.source, self.path, str(e)
-            )
         except (FileNotFoundError, PermissionError, OSError) as e:
-            logger.warning(
-                "File system error when resolving %s/%s: %s",
-                self.source,
-                self.path,
-                str(e),
-            )
+            logger.warning("File system error when resolving %s: %s", self.path, str(e))

         return type(self)(
             path=self.path,
@@ -629,8 +583,6 @@

 def resolve(file: File) -> File:
     """
-    [DEPRECATED] Use `file.resolve()` directly instead.
-
     Resolve a File object by checking its existence and updating its metadata.

     This function is a wrapper around the File.resolve() method, designed to be
@@ -646,12 +598,6 @@ def resolve(file: File) -> File:
         RuntimeError: If the file's catalog is not set or if
             the file source protocol is unsupported.
     """
-    warnings.warn(
-        "resolve() is deprecated and will be removed "
-        "in a future version. Use file.resolve() directly.",
-        DeprecationWarning,
-        stacklevel=2,
-    )
     return file.resolve()


@@ -999,7 +945,7 @@ class ArrowRow(DataModel):
             ds = dataset(path, **self.kwargs)

         else:
-            path = self.file.
+            path = self.file.get_path()
             ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)

         return ds.take([self.index]).to_reader()
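With the path-validation logic and deprecation warnings removed, the `File` path helpers reduce to plain string operations again. A sketch of the restored behavior, with illustrative field values:

```py
from datachain.lib.file import File

f = File(source="s3://my-bucket", path="dir/cats/a.csv")

f.name             # "a.csv"
f.parent           # "dir/cats"
f.get_file_ext()   # "csv"
f.get_full_name()  # "dir/cats/a.csv" (simply f.path again, no DeprecationWarning)
f.get_uri()        # "s3://my-bucket/dir/cats/a.csv"
f.get_path()       # the unquoted URI for non-"file" schemes; a local path for file://
```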
datachain/lib/listing.py
CHANGED
@@ -123,9 +123,6 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
         f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )

-    # we should remove dots from the name
-    ds_name = ds_name.replace(".", "_")
-
     return ds_name, lst_uri, path


@@ -198,4 +195,5 @@ def get_listing(
         list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"

     ds_name = listing.name if listing else ds_name
+
     return ds_name, list_uri, list_path, bool(listing)
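One behavioral change in this hunk: dots in a storage path are no longer rewritten to underscores when deriving the listing dataset name. A sketch, assuming `LISTING_PREFIX` is `"lst__"` and an illustrative bucket:

```py
from datachain.lib.listing import parse_listing_uri

ds_name, lst_uri, path = parse_listing_uri("s3://my-bucket/some.dir/")
# 0.20.3 rewrote dots:    ds_name like "lst__s3://my-bucket/some_dir/"
# 0.21.0 preserves them:  ds_name like "lst__s3://my-bucket/some.dir/"
```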
datachain/lib/meta_formats.py
CHANGED
@@ -106,7 +106,7 @@ def read_meta(  # noqa: C901
     from datachain import read_storage

     if schema_from:
-        file = read_storage(schema_from, type="text").limit(1).
+        file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
         model_code = gen_datamodel_code(
             file, format=format, jmespath=jmespath, model_name=model_name
         )
datachain/lib/pytorch.py
CHANGED
@@ -130,7 +130,7 @@ class PytorchDataset(IterableDataset):
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)
         ds = ds.chunk(total_rank, total_workers)
-        yield from ds.
+        yield from ds.collect()

     def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
         from datachain.lib.udf import _prefetch_inputs
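`PytorchDataset` is normally consumed through a `DataLoader`; after this change its rows stream via `collect()`. A usage sketch, assuming the standard `to_pytorch()` helper and an illustrative bucket:

```py
from torch.utils.data import DataLoader

import datachain as dc

chain = dc.read_storage("s3://my-bucket/images/", type="image", anon=True)
loader = DataLoader(chain.to_pytorch(), batch_size=16, num_workers=2)
for batch in loader:
    ...  # train or evaluate on the batch
```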