datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/storage.py
CHANGED
@@ -1,22 +1,17 @@
-import os
+import os
 from collections.abc import Sequence
 from functools import reduce
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING

-from datachain.lib.listing import (
-
-
-
-
-
-    get_listing,
-    list_bucket,
-    ls,
+from datachain.lib.dc.storage_pattern import (
+    apply_glob_filter,
+    expand_brace_pattern,
+    should_use_recursion,
+    split_uri_pattern,
+    validate_cloud_bucket_name,
 )
+from datachain.lib.file import FileType, get_file_type
+from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.query import Session

 if TYPE_CHECKING:
@@ -24,40 +19,46 @@ if TYPE_CHECKING:


 def read_storage(
-    uri:
+    uri: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     *,
     type: FileType = "binary",
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    recursive:
+    recursive: bool | None = True,
     column: str = "file",
     update: bool = False,
-    anon:
-    delta:
-    delta_on:
+    anon: bool | None = None,
+    delta: bool | None = False,
+    delta_on: str | Sequence[str] | None = (
         "file.path",
         "file.etag",
         "file.version",
     ),
-    delta_result_on:
-    delta_compare:
-    delta_retry:
-
+    delta_result_on: str | Sequence[str] | None = None,
+    delta_compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
+    delta_unsafe: bool = False,
+    client_config: dict | None = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
     It returns the chain itself as usual.

     Parameters:
-        uri
-
-
-
-
-
-
-
-
+        uri: Storage path(s) or URI(s). Can be a local path or start with a
+            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+            Supports glob patterns:
+            - `*` : wildcard
+            - `**` : recursive wildcard
+            - `?` : single character
+            - `{a,b}` : brace expansion list
+            - `{1..9}` : brace numeric or alphabetic range
+        type: read file as "binary", "text", or "image" data. Default is "binary".
+        recursive: search recursively for the given path.
+        column: Column name that will contain File objects. Default is "file".
+        update: force storage reindexing. Default is False.
+        anon: If True, we will treat cloud bucket as public one.
+        client_config: Optional client configuration for the storage client.
         delta: If True, only process new or changed files instead of reprocessing
             everything. This saves time by skipping files that were already processed in
             previous versions. The optimization is working when a new version of the
@@ -77,6 +78,9 @@ def read_storage(
            (error mode)
            - True: Reprocess records missing from the result dataset (missing mode)
            - None: No retry processing (default)
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct. Caller must ensure datasets are consistent and not partially
+            updated.

     Returns:
         DataChain: A DataChain object containing the file information.
@@ -85,37 +89,36 @@ def read_storage(
     Simple call from s3:
     ```python
     import datachain as dc
-
+    dc.read_storage("s3://my-bucket/my-dir")
+    ```
+
+    Match all .json files recursively using glob pattern
+    ```py
+    dc.read_storage("gs://bucket/meta/**/*.json")
+    ```
+
+    Match image file extensions for directories with pattern
+    ```py
+    dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
+    ```
+
+    By ranges in filenames:
+    ```py
+    dc.read_storage("s3://bucket/202{1..4}/**/*.{jpg,jpeg,png}")
     ```

     Multiple URIs:
     ```python
-
-        "s3://bucket1/dir1",
-        "s3://bucket2/dir2"
-    ])
+    dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
     ```

     With AWS S3-compatible storage:
     ```python
-
+    dc.read_storage(
         "s3://my-bucket/my-dir",
         client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
     )
     ```
-
-    Pass existing session
-    ```py
-    session = Session.get()
-    chain = dc.read_storage([
-        "path/to/dir1",
-        "path/to/dir2"
-    ], session=session, recursive=True)
-    ```
-
-    Note:
-        When using multiple URIs with `update=True`, the function optimizes by
-        avoiding redundant updates for URIs pointing to the same storage location.
     """
     from .datachain import DataChain
     from .datasets import read_dataset
@@ -138,13 +141,36 @@ def read_storage(
     if not uris:
         raise ValueError("No URIs provided")

+    # Then expand all URIs that contain brace patterns
+    expanded_uris = []
+    for single_uri in uris:
+        uri_str = str(single_uri)
+        validate_cloud_bucket_name(uri_str)
+        expanded_uris.extend(expand_brace_pattern(uri_str))
+
+    # Now process each expanded URI
     chains = []
     listed_ds_name = set()
     file_values = []

-
+    updated_uris = set()
+
+    for single_uri in expanded_uris:
+        # Check if URI contains glob patterns and split them
+        base_uri, glob_pattern = split_uri_pattern(single_uri)
+
+        # If a pattern is found, use the base_uri for listing
+        # The pattern will be used for filtering later
+        list_uri_to_use = base_uri if glob_pattern else single_uri
+
+        # Avoid double updates for the same URI
+        update_single_uri = False
+        if update and (list_uri_to_use not in updated_uris):
+            updated_uris.add(list_uri_to_use)
+            update_single_uri = True
+
         list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
-
+            list_uri_to_use, session, update=update_single_uri
         )

         # list_ds_name is None if object is a file, we don't want to use cache
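The loop above composes two of the new `storage_pattern` helpers: braces are expanded first, then each resulting URI is split into a listable base and a residual glob. A minimal trace of both steps against the implementations shown later in this diff (the bucket name is illustrative, datachain 0.39.0 assumed):

```python
from datachain.lib.dc.storage_pattern import (
    expand_brace_pattern,
    split_uri_pattern,
)

# Step 1: brace ranges fan out into plain glob URIs.
assert expand_brace_pattern("s3://bucket/202{1..2}/**/*.json") == [
    "s3://bucket/2021/**/*.json",
    "s3://bucket/2022/**/*.json",
]

# Step 2: each expanded URI is split at the first patterned segment;
# the base goes to get_listing(), the pattern is kept for filtering.
assert split_uri_pattern("s3://bucket/2021/**/*.json") == (
    "s3://bucket/2021",
    "**/*.json",
)
```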
@@ -161,6 +187,12 @@ def read_storage(
             project=listing_project_name,
             session=session,
             settings=settings,
+            delta=delta,
+            delta_on=delta_on,
+            delta_result_on=delta_result_on,
+            delta_compare=delta_compare,
+            delta_retry=delta_retry,
+            delta_unsafe=delta_unsafe,
         )
         dc._query.update = update
         dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})
@@ -193,7 +225,21 @@ def read_storage(
             lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
         )

-
+        # If a glob pattern was detected, use it for filtering
+        # Otherwise, use the original list_path from get_listing
+        if glob_pattern:
+            # Determine if we should use recursive listing based on the pattern
+            use_recursive = should_use_recursion(glob_pattern, recursive or False)
+
+            # Apply glob filter - no need for brace expansion here as it's done above
+            chain = apply_glob_filter(
+                dc, glob_pattern, list_path, use_recursive, column
+            )
+            chains.append(chain)
+        else:
+            # No glob pattern detected, use normal ls behavior
+            chains.append(ls(dc, list_path, recursive=recursive, column=column))
+
         listed_ds_name.add(list_ds_name)

     storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
@@ -212,12 +258,4 @@ def read_storage(

     assert storage_chain is not None

-    if delta:
-        storage_chain = storage_chain._as_delta(
-            on=delta_on,
-            right_on=delta_result_on,
-            compare=delta_compare,
-            delta_retry=delta_retry,
-        )
-
     return storage_chain
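With the old post-hoc `_as_delta` call removed, the delta arguments now travel through the `read_dataset` call shown above. A hedged usage sketch based on the new signature (bucket and pattern are illustrative):

```python
import datachain as dc

# On re-runs, only new or changed files (matched on file.path/etag/version
# by default) are processed; delta_retry=True also reprocesses records
# that are missing from the result dataset.
chain = dc.read_storage(
    "s3://my-bucket/images/**/*.{jpg,jpeg,png}",
    delta=True,
    delta_retry=True,
)
```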
datachain/lib/dc/storage_pattern.py
ADDED
@@ -0,0 +1,251 @@
+import glob
+from typing import TYPE_CHECKING
+
+from datachain.client.fsspec import is_cloud_uri
+from datachain.lib.listing import ls
+
+if TYPE_CHECKING:
+    from .datachain import DataChain
+
+
+def validate_cloud_bucket_name(uri: str) -> None:
+    """
+    Validate that cloud storage bucket names don't contain glob patterns.
+
+    Raises:
+        ValueError: If a cloud storage bucket name contains glob patterns
+    """
+    if not is_cloud_uri(uri):
+        return
+
+    if "://" in uri:
+        scheme_end = uri.index("://") + 3
+        path_part = uri[scheme_end:]
+
+        if "/" in path_part:
+            bucket_name = path_part.split("/")[0]
+        else:
+            bucket_name = path_part
+
+        glob_chars = ["*", "?", "[", "]", "{", "}"]
+        if any(char in bucket_name for char in glob_chars):
+            raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
+
+
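A quick sanity check of the new validation, assuming `is_cloud_uri` treats `s3://` URIs as cloud storage (bucket names below are illustrative):

```python
from datachain.lib.dc.storage_pattern import validate_cloud_bucket_name

# Globs anywhere in the path part are fine...
validate_cloud_bucket_name("s3://my-bucket/2024/**/*.json")

# ...but a glob character in the bucket itself is rejected.
try:
    validate_cloud_bucket_name("s3://my-*bucket/data")
except ValueError as exc:
    print(exc)  # Glob patterns in bucket names are not supported: s3://my-*bucket/data
```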
+def split_uri_pattern(uri: str) -> tuple[str, str | None]:
+    """Split a URI into base path and glob pattern."""
+    if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
+        return uri, None
+
+    if "://" in uri:
+        scheme_end = uri.index("://") + 3
+        scheme_part = uri[:scheme_end]
+        path_part = uri[scheme_end:]
+        path_segments = path_part.split("/")
+
+        pattern_start_idx = None
+        for i, segment in enumerate(path_segments):
+            # Check for glob patterns including brace expansion
+            if glob.has_magic(segment) or "{" in segment:
+                pattern_start_idx = i
+                break
+
+        if pattern_start_idx is None:
+            return uri, None
+
+        if pattern_start_idx == 0:
+            base = scheme_part + path_segments[0]
+            pattern = "/".join(path_segments[1:]) if len(path_segments) > 1 else "*"
+        else:
+            base = scheme_part + "/".join(path_segments[:pattern_start_idx])
+            pattern = "/".join(path_segments[pattern_start_idx:])
+
+        return base, pattern
+
+    path_segments = uri.split("/")
+
+    pattern_start_idx = None
+    for i, segment in enumerate(path_segments):
+        if glob.has_magic(segment) or "{" in segment:
+            pattern_start_idx = i
+            break
+
+    if pattern_start_idx is None:
+        return uri, None
+
+    base = "/".join(path_segments[:pattern_start_idx]) if pattern_start_idx > 0 else "/"
+    pattern = "/".join(path_segments[pattern_start_idx:])
+
+    return base, pattern
+
+
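Traced against the implementation above, the split happens at the first path segment that contains a glob or brace character (URIs are illustrative):

```python
from datachain.lib.dc.storage_pattern import split_uri_pattern

# No magic characters: nothing to split.
assert split_uri_pattern("s3://bucket/data") == ("s3://bucket/data", None)

# Everything from the first patterned segment onward becomes the pattern.
assert split_uri_pattern("s3://bucket/meta/**/*.json") == (
    "s3://bucket/meta",
    "**/*.json",
)
assert split_uri_pattern("gs://bucket/202?/img*.png") == (
    "gs://bucket",
    "202?/img*.png",
)
```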
+def should_use_recursion(pattern: str, user_recursive: bool) -> bool:
+    if not user_recursive:
+        return False
+
+    if "**" in pattern:
+        return True
+
+    return "/" in pattern
+
+
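This helper lets a single-level pattern downgrade the listing to non-recursive even when the caller left `recursive=True`; for example:

```python
from datachain.lib.dc.storage_pattern import should_use_recursion

# The caller's recursive=False always wins.
assert should_use_recursion("**/*.json", False) is False

# "**" or a "/" in the pattern requires walking subdirectories...
assert should_use_recursion("**/*.json", True) is True
assert should_use_recursion("sub/*.json", True) is True

# ...while a flat pattern can be served by a non-recursive listing.
assert should_use_recursion("*.json", True) is False
```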
+def expand_brace_pattern(pattern: str) -> list[str]:
+    """
+    Recursively expand brace patterns into multiple glob patterns.
+    Supports:
+    - Comma-separated lists: *.{mp3,wav}
+    - Numeric ranges: file{1..10}
+    - Zero-padded numeric ranges: file{01..10}
+    - Character ranges: file{a..z}
+
+    Examples:
+        "*.{mp3,wav}" -> ["*.mp3", "*.wav"]
+        "file{1..3}" -> ["file1", "file2", "file3"]
+        "file{01..03}" -> ["file01", "file02", "file03"]
+        "file{a..c}" -> ["filea", "fileb", "filec"]
+        "{a,b}/{c,d}" -> ["a/c", "a/d", "b/c", "b/d"]
+    """
+    if "{" not in pattern or "}" not in pattern:
+        return [pattern]
+
+    return _expand_single_braces(pattern)
+
+
+def _expand_single_braces(pattern: str) -> list[str]:
+    if "{" not in pattern or "}" not in pattern:
+        return [pattern]
+
+    start = pattern.index("{")
+    end = start
+    depth = 0
+    for i in range(start, len(pattern)):
+        if pattern[i] == "{":
+            depth += 1
+        elif pattern[i] == "}":
+            depth -= 1
+            if depth == 0:
+                end = i
+                break
+
+    if start >= end:
+        return [pattern]
+
+    prefix = pattern[:start]
+    suffix = pattern[end + 1 :]
+    brace_content = pattern[start + 1 : end]
+
+    if ".." in brace_content:
+        options = _expand_range(brace_content)
+    else:
+        options = [opt.strip() for opt in brace_content.split(",")]
+
+    expanded = []
+    for option in options:
+        combined = prefix + option + suffix
+        expanded.extend(_expand_single_braces(combined))
+
+    return expanded
+
+
+def _expand_range(range_spec: str) -> list[str]:  # noqa: PLR0911
+    if ".." not in range_spec:
+        return [range_spec]
+
+    parts = range_spec.split("..")
+    if len(parts) != 2:
+        return [range_spec]
+
+    start, end = parts[0], parts[1]
+
+    if start.isdigit() and end.isdigit():
+        pad_width = max(len(start), len(end)) if start[0] == "0" or end[0] == "0" else 0
+        start_num = int(start)
+        end_num = int(end)
+
+        if start_num <= end_num:
+            if pad_width > 0:
+                return [str(i).zfill(pad_width) for i in range(start_num, end_num + 1)]
+            return [str(i) for i in range(start_num, end_num + 1)]
+        if pad_width > 0:
+            return [str(i).zfill(pad_width) for i in range(start_num, end_num - 1, -1)]
+        return [str(i) for i in range(start_num, end_num - 1, -1)]
+
+    if len(start) == 1 and len(end) == 1 and start.isalpha() and end.isalpha():
+        start_ord = ord(start)
+        end_ord = ord(end)
+
+        if start_ord <= end_ord:
+            return [chr(i) for i in range(start_ord, end_ord + 1)]
+        return [chr(i) for i in range(start_ord, end_ord - 1, -1)]
+
+    return [range_spec]
+
+
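A few range expansions traced from the code above, including the zero-padding and descending cases (inputs are illustrative; unparseable ranges pass through unchanged):

```python
from datachain.lib.dc.storage_pattern import expand_brace_pattern

# Zero-padding is preserved when either bound has a leading zero.
assert expand_brace_pattern("img{08..10}") == ["img08", "img09", "img10"]

# Descending ranges count down.
assert expand_brace_pattern("v{3..1}") == ["v3", "v2", "v1"]

# Single-character alphabetic ranges.
assert expand_brace_pattern("shard-{a..c}") == ["shard-a", "shard-b", "shard-c"]

# Anything else is returned literally rather than raising.
assert expand_brace_pattern("x{10..z}") == ["x10..z"]
```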
+def convert_globstar_to_glob(filter_pattern: str) -> str:
+    if "**" not in filter_pattern:
+        return filter_pattern
+
+    parts = filter_pattern.split("/")
+    globstar_positions = [i for i, p in enumerate(parts) if p == "**"]
+
+    num_globstars = len(globstar_positions)
+
+    if num_globstars <= 1:
+        if filter_pattern == "**/*":
+            return "*"
+        if filter_pattern.startswith("**/"):
+            remaining = filter_pattern[3:]
+            if "/" not in remaining:
+                # Pattern like **/*.ext or **/temp?.*
+                # The ** means zero or more directories
+                # For zero directories: pattern should be just the filename pattern
+                # For one or more: pattern should be */filename
+                # Since we can't OR in GLOB, we choose the more permissive option
+                # that works with recursive listing
+                # Special handling: if it's a simple extension pattern, match broadly
+                if remaining.startswith("*."):
+                    return remaining
+                return f"*/{remaining}"
+
+        return filter_pattern.replace("**", "*")
+
+    middle_parts = []
+    start_idx = globstar_positions[0] + 1
+    end_idx = globstar_positions[-1]
+    for i in range(start_idx, end_idx):
+        if parts[i] != "**":
+            middle_parts.append(parts[i])
+
+    if not middle_parts:
+        result = filter_pattern.replace("**", "*")
+    else:
+        middle_pattern = "/".join(middle_parts)
+        last_part = parts[-1] if parts[-1] != "**" else "*"
+
+        if last_part != "*":
+            result = f"*{middle_pattern}*{last_part}"
+        else:
+            result = f"*{middle_pattern}*"
+
+    return result
+
+
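Because the warehouse GLOB has no `**` or alternation, this conversion is intentionally lossy and errs on the permissive side. Some conversions traced from the branches above:

```python
from datachain.lib.dc.storage_pattern import convert_globstar_to_glob

# A bare extension pattern is allowed to match at any depth.
assert convert_globstar_to_glob("**/*.json") == "*.json"

# "Anything under the prefix" collapses to a single wildcard.
assert convert_globstar_to_glob("**/*") == "*"

# A globstar in the middle becomes a single-level wildcard.
assert convert_globstar_to_glob("meta/**/*.json") == "meta/*/*.json"

# Non-extension filename patterns keep a directory prefix.
assert convert_globstar_to_glob("**/temp?.csv") == "*/temp?.csv"
```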
+def apply_glob_filter(
+    dc: "DataChain",
+    pattern: str,
+    list_path: str,
+    use_recursive: bool,
+    column: str,
+) -> "DataChain":
+    from datachain.query.schema import Column
+
+    chain = ls(dc, list_path, recursive=use_recursive, column=column)
+
+    if list_path and "/" not in pattern:
+        filter_pattern = f"{list_path.rstrip('/')}/{pattern}"
+    else:
+        filter_pattern = pattern
+
+    glob_pattern = convert_globstar_to_glob(filter_pattern)
+
+    return chain.filter(Column(f"{column}.path").glob(glob_pattern))
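`apply_glob_filter` is essentially a recursive listing plus a path filter; `Column(...).glob(...)` is the exact filter it builds. A hedged sketch of the same effect written against the public API (bucket and pattern are illustrative):

```python
import datachain as dc
from datachain.query.schema import Column

# Roughly what read_storage("s3://bucket/meta/**/*.json") resolves to:
# list the base prefix recursively, then keep only matching paths.
chain = dc.read_storage("s3://bucket/meta", recursive=True)
chain = chain.filter(Column("file.path").glob("*.json"))
```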
datachain/lib/dc/utils.py
CHANGED
@@ -1,12 +1,6 @@
 from collections.abc import Sequence
 from functools import wraps
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Optional,
-    TypeVar,
-    Union,
-)
+from typing import TYPE_CHECKING, TypeVar

 import sqlalchemy
 from sqlalchemy.sql.functions import GenericFunction
@@ -18,7 +12,10 @@ from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.utils import getenv_bool

 if TYPE_CHECKING:
-    from
+    from collections.abc import Callable
+    from typing import Concatenate
+
+    from typing_extensions import ParamSpec

     from .datachain import DataChain
@@ -28,9 +25,15 @@ D = TypeVar("D", bound="DataChain")


 def is_studio() -> bool:
+    """Check if the runtime environment is Studio (not local)."""
     return getenv_bool("DATACHAIN_IS_STUDIO", default=False)


+def is_local() -> bool:
+    """Check if the runtime environment is local (not Studio)."""
+    return not is_studio()
+
+
 def resolve_columns(
     method: "Callable[Concatenate[D, P], D]",
 ) -> "Callable[Concatenate[D, P], D]":
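The new pair is driven by a single environment variable; a small sketch, assuming `getenv_bool` accepts the usual truthy strings:

```python
import os

from datachain.lib.dc.utils import is_local, is_studio

os.environ["DATACHAIN_IS_STUDIO"] = "true"
assert is_studio() and not is_local()

os.environ.pop("DATACHAIN_IS_STUDIO")
assert is_local() and not is_studio()
```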
@@ -70,11 +73,11 @@ class DatasetFromValuesError(DataChainParamsError):
         super().__init__(f"Dataset{name} from values error: {msg}")


-MergeColType =
+MergeColType = str | Function | sqlalchemy.ColumnElement


 def _validate_merge_on(
-    on:
+    on: MergeColType | Sequence[MergeColType],
     ds: "DataChain",
 ) -> Sequence[MergeColType]:
     if isinstance(on, (str, sqlalchemy.ColumnElement)):
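The widened `MergeColType` alias means merge keys can be given as plain column names, `Function` objects, or raw SQLAlchemy expressions. A minimal sketch of the common string form (values are illustrative):

```python
import datachain as dc

left = dc.read_values(id=[1, 2], name=["a", "b"])
right = dc.read_values(id=[1, 2], score=[0.5, 0.9])

# A single string is wrapped into a one-element sequence by
# _validate_merge_on before the merge is planned.
merged = left.merge(right, on="id")
```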
@@ -103,12 +106,12 @@ def _get_merge_error_str(col: MergeColType) -> str:
 class DatasetMergeError(DataChainParamsError):
     def __init__(
         self,
-        on:
-        right_on:
+        on: MergeColType | Sequence[MergeColType],
+        right_on: MergeColType | Sequence[MergeColType] | None,
         msg: str,
     ):
         def _get_str(
-            on:
+            on: MergeColType | Sequence[MergeColType],
         ) -> str:
             if not isinstance(on, Sequence):
                 return str(on)  # type: ignore[unreachable]
@@ -123,7 +126,7 @@ class DatasetMergeError(DataChainParamsError):
         super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")


-OutputType =
+OutputType = DataType | Sequence[str] | dict[str, DataType] | None


 class Sys(DataModel):
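`OutputType` is the alias behind the `output=` argument used across the `read_*` and transformation APIs; a `Sequence[str]` or `dict[str, DataType]` names multiple outputs, per the alias above. A hedged sketch of the single-DataType form (signal names are illustrative):

```python
import datachain as dc

chain = dc.read_values(word=["alpha", "beta"])

# output as a single DataType: the new signal "upper" is typed str.
chain = chain.map(upper=lambda word: word.upper(), output=str)
```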
datachain/lib/dc/values.py
CHANGED
@@ -1,8 +1,5 @@
 from collections.abc import Iterator
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING

 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import dict_to_data_model
@@ -20,8 +17,8 @@ if TYPE_CHECKING:

 def read_values(
     ds_name: str = "",
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
     output: OutputType = None,
     column: str = "",