datachain 0.31.1__py3-none-any.whl → 0.31.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/client/fsspec.py +11 -0
- datachain/lib/dc/storage.py +67 -25
- datachain/lib/dc/storage_pattern.py +300 -0
- {datachain-0.31.1.dist-info → datachain-0.31.2.dist-info}/METADATA +1 -1
- {datachain-0.31.1.dist-info → datachain-0.31.2.dist-info}/RECORD +9 -8
- {datachain-0.31.1.dist-info → datachain-0.31.2.dist-info}/WHEEL +0 -0
- {datachain-0.31.1.dist-info → datachain-0.31.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.31.1.dist-info → datachain-0.31.2.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.31.1.dist-info → datachain-0.31.2.dist-info}/top_level.txt +0 -0
datachain/client/fsspec.py
CHANGED
|
@@ -44,6 +44,7 @@ FETCH_WORKERS = 100
|
|
|
44
44
|
DELIMITER = "/" # Path delimiter.
|
|
45
45
|
|
|
46
46
|
DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
|
|
47
|
+
CLOUD_STORAGE_PROTOCOLS = {"s3", "gs", "az", "hf"}
|
|
47
48
|
|
|
48
49
|
ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
|
|
49
50
|
|
|
@@ -62,6 +63,16 @@ def _is_win_local_path(uri: str) -> bool:
|
|
|
62
63
|
return False
|
|
63
64
|
|
|
64
65
|
|
|
66
|
+
def is_cloud_uri(uri: str) -> bool:
    """Return True if *uri* uses one of the supported cloud storage schemes."""
    return urlparse(uri).scheme in CLOUD_STORAGE_PROTOCOLS
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_cloud_schemes() -> list[str]:
    """Return the supported cloud storage schemes as URI prefixes (e.g. "s3://")."""
    return [scheme + "://" for scheme in CLOUD_STORAGE_PROTOCOLS]
|
|
74
|
+
|
|
75
|
+
|
|
65
76
|
class Bucket(NamedTuple):
|
|
66
77
|
name: str
|
|
67
78
|
uri: "StorageURI"
|
datachain/lib/dc/storage.py
CHANGED
|
@@ -3,6 +3,13 @@ from collections.abc import Sequence
|
|
|
3
3
|
from functools import reduce
|
|
4
4
|
from typing import TYPE_CHECKING, Optional, Union
|
|
5
5
|
|
|
6
|
+
from datachain.lib.dc.storage_pattern import (
|
|
7
|
+
apply_glob_filter,
|
|
8
|
+
expand_brace_pattern,
|
|
9
|
+
should_use_recursion,
|
|
10
|
+
split_uri_pattern,
|
|
11
|
+
validate_cloud_bucket_name,
|
|
12
|
+
)
|
|
6
13
|
from datachain.lib.file import FileType, get_file_type
|
|
7
14
|
from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
|
|
8
15
|
from datachain.query import Session
|
|
@@ -38,14 +45,18 @@ def read_storage(
|
|
|
38
45
|
It returns the chain itself as usual.
|
|
39
46
|
|
|
40
47
|
Parameters:
|
|
41
|
-
uri:
|
|
42
|
-
|
|
43
|
-
|
|
48
|
+
uri: Storage path(s) or URI(s). Can be a local path or start with a
|
|
49
|
+
storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
|
|
50
|
+
Supports glob patterns:
|
|
51
|
+
- `*` : wildcard
|
|
52
|
+
- `**` : recursive wildcard
|
|
53
|
+
- `?` : single character
|
|
54
|
+
- `{a,b}` : brace expansion
|
|
44
55
|
type: read file as "binary", "text", or "image" data. Default is "binary".
|
|
45
56
|
recursive: search recursively for the given path.
|
|
46
|
-
column:
|
|
57
|
+
column: Column name that will contain File objects. Default is "file".
|
|
47
58
|
update: force storage reindexing. Default is False.
|
|
48
|
-
anon: If True, we will treat cloud bucket as public one
|
|
59
|
+
anon: If True, we will treat cloud bucket as public one.
|
|
49
60
|
client_config: Optional client configuration for the storage client.
|
|
50
61
|
delta: If True, only process new or changed files instead of reprocessing
|
|
51
62
|
everything. This saves time by skipping files that were already processed in
|
|
@@ -80,12 +91,19 @@ def read_storage(
|
|
|
80
91
|
chain = dc.read_storage("s3://my-bucket/my-dir")
|
|
81
92
|
```
|
|
82
93
|
|
|
94
|
+
Match all .json files recursively using glob pattern
|
|
95
|
+
```py
|
|
96
|
+
chain = dc.read_storage("gs://bucket/meta/**/*.json")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Match image file extensions for directories with pattern
|
|
100
|
+
```py
|
|
101
|
+
chain = dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
|
|
102
|
+
```
|
|
103
|
+
|
|
83
104
|
Multiple URIs:
|
|
84
105
|
```python
|
|
85
|
-
chain = dc.read_storage([
|
|
86
|
-
"s3://bucket1/dir1",
|
|
87
|
-
"s3://bucket2/dir2"
|
|
88
|
-
])
|
|
106
|
+
chain = dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
|
|
89
107
|
```
|
|
90
108
|
|
|
91
109
|
With AWS S3-compatible storage:
|
|
@@ -95,19 +113,6 @@ def read_storage(
|
|
|
95
113
|
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
96
114
|
)
|
|
97
115
|
```
|
|
98
|
-
|
|
99
|
-
Pass existing session
|
|
100
|
-
```py
|
|
101
|
-
session = Session.get()
|
|
102
|
-
chain = dc.read_storage([
|
|
103
|
-
"path/to/dir1",
|
|
104
|
-
"path/to/dir2"
|
|
105
|
-
], session=session, recursive=True)
|
|
106
|
-
```
|
|
107
|
-
|
|
108
|
-
Note:
|
|
109
|
-
When using multiple URIs with `update=True`, the function optimizes by
|
|
110
|
-
avoiding redundant updates for URIs pointing to the same storage location.
|
|
111
116
|
"""
|
|
112
117
|
from .datachain import DataChain
|
|
113
118
|
from .datasets import read_dataset
|
|
@@ -130,13 +135,36 @@ def read_storage(
|
|
|
130
135
|
if not uris:
|
|
131
136
|
raise ValueError("No URIs provided")
|
|
132
137
|
|
|
138
|
+
# Then expand all URIs that contain brace patterns
|
|
139
|
+
expanded_uris = []
|
|
140
|
+
for single_uri in uris:
|
|
141
|
+
uri_str = str(single_uri)
|
|
142
|
+
validate_cloud_bucket_name(uri_str)
|
|
143
|
+
expanded_uris.extend(expand_brace_pattern(uri_str))
|
|
144
|
+
|
|
145
|
+
# Now process each expanded URI
|
|
133
146
|
chains = []
|
|
134
147
|
listed_ds_name = set()
|
|
135
148
|
file_values = []
|
|
136
149
|
|
|
137
|
-
|
|
150
|
+
updated_uris = set()
|
|
151
|
+
|
|
152
|
+
for single_uri in expanded_uris:
|
|
153
|
+
# Check if URI contains glob patterns and split them
|
|
154
|
+
base_uri, glob_pattern = split_uri_pattern(single_uri)
|
|
155
|
+
|
|
156
|
+
# If a pattern is found, use the base_uri for listing
|
|
157
|
+
# The pattern will be used for filtering later
|
|
158
|
+
list_uri_to_use = base_uri if glob_pattern else single_uri
|
|
159
|
+
|
|
160
|
+
# Avoid double updates for the same URI
|
|
161
|
+
update_single_uri = False
|
|
162
|
+
if update and (list_uri_to_use not in updated_uris):
|
|
163
|
+
updated_uris.add(list_uri_to_use)
|
|
164
|
+
update_single_uri = True
|
|
165
|
+
|
|
138
166
|
list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
|
|
139
|
-
|
|
167
|
+
list_uri_to_use, session, update=update_single_uri
|
|
140
168
|
)
|
|
141
169
|
|
|
142
170
|
# list_ds_name is None if object is a file, we don't want to use cache
|
|
@@ -185,7 +213,21 @@ def read_storage(
|
|
|
185
213
|
lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
|
|
186
214
|
)
|
|
187
215
|
|
|
188
|
-
|
|
216
|
+
# If a glob pattern was detected, use it for filtering
|
|
217
|
+
# Otherwise, use the original list_path from get_listing
|
|
218
|
+
if glob_pattern:
|
|
219
|
+
# Determine if we should use recursive listing based on the pattern
|
|
220
|
+
use_recursive = should_use_recursion(glob_pattern, recursive or False)
|
|
221
|
+
|
|
222
|
+
# Apply glob filter - no need for brace expansion here as it's done above
|
|
223
|
+
chain = apply_glob_filter(
|
|
224
|
+
dc, glob_pattern, list_path, use_recursive, column
|
|
225
|
+
)
|
|
226
|
+
chains.append(chain)
|
|
227
|
+
else:
|
|
228
|
+
# No glob pattern detected, use normal ls behavior
|
|
229
|
+
chains.append(ls(dc, list_path, recursive=recursive, column=column))
|
|
230
|
+
|
|
189
231
|
listed_ds_name.add(list_ds_name)
|
|
190
232
|
|
|
191
233
|
storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
from typing import TYPE_CHECKING, Union
|
|
3
|
+
|
|
4
|
+
from datachain.client.fsspec import is_cloud_uri
|
|
5
|
+
from datachain.lib.listing import ls
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from .datachain import DataChain
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def validate_cloud_bucket_name(uri: str) -> None:
    """
    Ensure a cloud URI's bucket name is free of glob characters.

    Args:
        uri: URI to check; non-cloud URIs are accepted as-is.

    Raises:
        ValueError: If the bucket segment of a cloud URI contains any of
            the glob characters ``* ? [ ] { }``.
    """
    if not is_cloud_uri(uri):
        return

    # The bucket is the first path segment after "<scheme>://".
    _, sep, remainder = uri.partition("://")
    if not sep:
        return

    bucket = remainder.split("/", 1)[0]
    if set(bucket) & set("*?[]{}"):
        raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _first_glob_segment(segments: list[str]) -> Union[int, None]:
    """Return the index of the first path segment containing a glob or
    brace pattern, or None when every segment is literal."""
    for idx, segment in enumerate(segments):
        # glob.has_magic covers * ? [; braces need an explicit check.
        if glob.has_magic(segment) or "{" in segment:
            return idx
    return None


def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
    """
    Split a URI into a literal base path and a trailing glob pattern.

    Args:
        uri: URI that may contain glob patterns (``*``, ``**``, ``?``, ``{}``)

    Returns:
        Tuple of (base_uri, pattern) where pattern is None if no glob
        pattern is found.

    Examples:
        "s3://bucket/dir/*.mp3" -> ("s3://bucket/dir", "*.mp3")
        "s3://bucket/**/*.mp3" -> ("s3://bucket", "**/*.mp3")
        "s3://bucket/dir" -> ("s3://bucket/dir", None)
    """
    # Fast path: no glob characters anywhere in the URI.
    if not any(char in uri for char in "*?[{}"):
        return uri, None

    if "://" in uri:
        # Split into scheme prefix and path, keeping the "<scheme>://" part.
        scheme_end = uri.index("://") + 3
        scheme_part = uri[:scheme_end]
        segments = uri[scheme_end:].split("/")

        idx = _first_glob_segment(segments)
        if idx is None:
            return uri, None

        if idx == 0:
            # Pattern in the first segment (bucket root): keep that segment
            # in the base and match everything below it.
            base = scheme_part + segments[0]
            pattern = "/".join(segments[1:]) if len(segments) > 1 else "*"
        else:
            base = scheme_part + "/".join(segments[:idx])
            pattern = "/".join(segments[idx:])
        return base, pattern

    # Local path: same split, with "/" as the base for root-level patterns.
    segments = uri.split("/")
    idx = _first_glob_segment(segments)
    if idx is None:
        return uri, None

    base = "/".join(segments[:idx]) if idx > 0 else "/"
    return base, "/".join(segments[idx:])
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def should_use_recursion(pattern: str, user_recursive: bool) -> bool:
    """
    Decide whether listing must be recursive for a given glob pattern.

    Args:
        pattern: The glob pattern extracted from the URI.
        user_recursive: The caller's recursive preference; False always wins.

    Returns:
        True if recursive listing should be used.

    Examples:
        "*" -> False (single level only)
        "*.mp3" -> False (single level only)
        "**/*.mp3" -> True (globstar requires recursion)
        "dir/*/file.txt" -> True (multi-level pattern)
    """
    # A globstar or any path separator means the pattern spans directory
    # levels; single-level patterns ("*", "*.txt", "file?") do not.
    return user_recursive and ("**" in pattern or "/" in pattern)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def expand_brace_pattern(pattern: str) -> list[str]:
    """
    Recursively expand brace patterns like *.{mp3,wav} into multiple glob
    patterns. Handles nested and multiple brace patterns.

    Args:
        pattern: Pattern that may contain brace expansion

    Returns:
        List of expanded patterns (a one-element list when there is nothing
        to expand)

    Examples:
        "*.{mp3,wav}" -> ["*.mp3", "*.wav"]
        "{a,b}/{c,d}" -> ["a/c", "a/d", "b/c", "b/d"]
        "*.txt" -> ["*.txt"]
        "{a,{b,c}}" -> ["a", "b", "c"]  # nested groups
    """
    if "{" not in pattern or "}" not in pattern:
        return [pattern]

    return _expand_single_braces(pattern)


def _expand_single_braces(pattern: str) -> list[str]:
    """Expand the first balanced brace group and recurse on the results."""
    if "{" not in pattern or "}" not in pattern:
        return [pattern]

    # Locate the first balanced {...} group.
    start = pattern.index("{")
    end = start
    depth = 0
    for i in range(start, len(pattern)):
        if pattern[i] == "{":
            depth += 1
        elif pattern[i] == "}":
            depth -= 1
            if depth == 0:
                end = i
                break

    if start >= end:
        # Unbalanced braces; nothing sensible to expand.
        return [pattern]

    prefix = pattern[:start]
    suffix = pattern[end + 1 :]
    # Split on top-level commas only, so nested groups such as {a,{b,c}}
    # stay intact and are expanded by the recursive call below.
    options = _split_top_level(pattern[start + 1 : end])

    # Substitute each alternative and recursively expand any remaining
    # braces (in the option itself or in the suffix).
    expanded = []
    for option in options:
        expanded.extend(_expand_single_braces(prefix + option.strip() + suffix))

    return expanded


def _split_top_level(text: str) -> list[str]:
    """Split *text* on commas that are not nested inside braces."""
    parts: list[str] = []
    current: list[str] = []
    depth = 0
    for ch in text:
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
        if ch == "," and depth == 0:
            parts.append("".join(current))
            current = []
        else:
            current.append(ch)
    parts.append("".join(current))
    return parts
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def convert_globstar_to_glob(filter_pattern: str) -> str:
    """Convert globstar patterns to GLOB patterns.

    Standard GLOB doesn't understand ** as recursive wildcard, so globstars
    are approximated with single * wildcards chosen to cooperate with
    recursive listing.

    Args:
        filter_pattern: Pattern that may contain globstars (**)

    Returns:
        GLOB-compatible pattern
    """
    if "**" not in filter_pattern:
        return filter_pattern

    segments = filter_pattern.split("/")
    star_idxs = [i for i, seg in enumerate(segments) if seg == "**"]

    if len(star_idxs) <= 1:
        if filter_pattern == "**/*":
            # "Everything at any depth" collapses to a single wildcard.
            return "*"
        if filter_pattern.startswith("**/"):
            tail = filter_pattern[3:]
            if "/" not in tail:
                # "**/<name>": ** means zero or more directories, which GLOB
                # cannot express exactly. Extension patterns (*.ext) match
                # anywhere under recursive listing; anything else is matched
                # as a filename at least one directory down.
                return tail if tail.startswith("*.") else f"*/{tail}"
        # Single (or embedded) globstar: degrade it to a plain wildcard.
        return filter_pattern.replace("**", "*")

    # Multiple globstars, e.g. "**/a/b/**/*.ext": anchor on the fixed
    # directory names between the first and last ** plus the file part.
    fixed_middle = [
        seg for seg in segments[star_idxs[0] + 1 : star_idxs[-1]] if seg != "**"
    ]
    if not fixed_middle:
        # No fixed middle parts: plain wildcard substitution is the best fit.
        return filter_pattern.replace("**", "*")

    middle = "/".join(fixed_middle)
    tail = segments[-1] if segments[-1] != "**" else "*"
    # Match any path containing the fixed middle (and tail, if specific).
    return f"*{middle}*{tail}" if tail != "*" else f"*{middle}*"
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def apply_glob_filter(
    dc: "DataChain",
    pattern: str,
    list_path: str,
    use_recursive: bool,
    column: str,
) -> "DataChain":
    """List *list_path* on *dc* and keep only entries whose path matches
    *pattern*, applied to the ``<column>.path`` field via a GLOB filter."""
    from datachain.query.schema import Column

    listed = ls(dc, list_path, recursive=use_recursive, column=column)

    # A bare filename pattern is relative to the listed directory, so anchor
    # it there; multi-level patterns are used as-is.
    anchored = (
        f"{list_path.rstrip('/')}/{pattern}"
        if list_path and "/" not in pattern
        else pattern
    )

    path_column = Column(f"{column}.path")
    return listed.filter(path_column.glob(convert_globstar_to_glob(anchored)))
|
|
@@ -41,7 +41,7 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
|
|
|
41
41
|
datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
|
|
42
42
|
datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
|
|
43
43
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
44
|
-
datachain/client/fsspec.py,sha256=
|
|
44
|
+
datachain/client/fsspec.py,sha256=sChjxu931QgU2-n9MdXlmOrhGAiAckXoDVZTxKcNv6M,14336
|
|
45
45
|
datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
|
|
46
46
|
datachain/client/hf.py,sha256=n5xJZdvNLS-SqokxuBCIPfGbhIeC_XfLm_BNYtEVvg4,2677
|
|
47
47
|
datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,4607
|
|
@@ -112,7 +112,8 @@ datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,
|
|
|
112
112
|
datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
|
|
113
113
|
datachain/lib/dc/parquet.py,sha256=STgm19AM-etu7WmOUMJa5Z9GI6tPC-A0P3JO3ulfsKo,1839
|
|
114
114
|
datachain/lib/dc/records.py,sha256=l7TKSKjT6boXGd05KA5vvax-Y-mLMOo46VWrlxPhmdQ,3067
|
|
115
|
-
datachain/lib/dc/storage.py,sha256=
|
|
115
|
+
datachain/lib/dc/storage.py,sha256=pydeiGLMsmDvruVY_bC5GsV6VLpYpRf7szrD0S2pTmE,9688
|
|
116
|
+
datachain/lib/dc/storage_pattern.py,sha256=QDLLSuBd1mdfkdRi3srGXXigs7rHw3vAnQedjE01_H8,9779
|
|
116
117
|
datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
|
|
117
118
|
datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
|
|
118
119
|
datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
|
|
@@ -160,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
160
161
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
161
162
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
162
163
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
163
|
-
datachain-0.31.
|
|
164
|
-
datachain-0.31.
|
|
165
|
-
datachain-0.31.
|
|
166
|
-
datachain-0.31.
|
|
167
|
-
datachain-0.31.
|
|
168
|
-
datachain-0.31.
|
|
164
|
+
datachain-0.31.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
165
|
+
datachain-0.31.2.dist-info/METADATA,sha256=ALo4Vp6w2VSanACVy1xv6aHWzbdasSKzD2U8_SybXBU,13898
|
|
166
|
+
datachain-0.31.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
167
|
+
datachain-0.31.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
168
|
+
datachain-0.31.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
169
|
+
datachain-0.31.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|