datachain 0.31.1__py3-none-any.whl → 0.31.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

This release of datachain has been flagged as potentially problematic.

datachain/client/fsspec.py

@@ -44,6 +44,7 @@ FETCH_WORKERS = 100
 DELIMITER = "/"  # Path delimiter.
 
 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
+CLOUD_STORAGE_PROTOCOLS = {"s3", "gs", "az", "hf"}
 
 ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
 
@@ -62,6 +63,16 @@ def _is_win_local_path(uri: str) -> bool:
     return False
 
 
+def is_cloud_uri(uri: str) -> bool:
+    protocol = urlparse(uri).scheme
+    return protocol in CLOUD_STORAGE_PROTOCOLS
+
+
+def get_cloud_schemes() -> list[str]:
+    """Get list of cloud storage scheme prefixes."""
+    return [f"{p}://" for p in CLOUD_STORAGE_PROTOCOLS]
+
+
 class Bucket(NamedTuple):
     name: str
     uri: "StorageURI"
datachain/lib/dc/storage.py

@@ -3,6 +3,13 @@ from collections.abc import Sequence
 from functools import reduce
 from typing import TYPE_CHECKING, Optional, Union
 
+from datachain.lib.dc.storage_pattern import (
+    apply_glob_filter,
+    expand_brace_pattern,
+    should_use_recursion,
+    split_uri_pattern,
+    validate_cloud_bucket_name,
+)
 from datachain.lib.file import FileType, get_file_type
 from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.query import Session
@@ -38,14 +45,18 @@ def read_storage(
     It returns the chain itself as usual.
 
     Parameters:
-        uri: storage URI with directory or list of URIs.
-            URIs must start with storage prefix such
-            as `s3://`, `gs://`, `az://` or "file:///"
+        uri: Storage path(s) or URI(s). Can be a local path or start with a
+            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+            Supports glob patterns:
+            - `*` : wildcard
+            - `**` : recursive wildcard
+            - `?` : single character
+            - `{a,b}` : brace expansion
         type: read file as "binary", "text", or "image" data. Default is "binary".
         recursive: search recursively for the given path.
-        column: Created column name.
+        column: Column name that will contain File objects. Default is "file".
         update: force storage reindexing. Default is False.
-        anon: If True, we will treat cloud bucket as public one
+        anon: If True, we will treat cloud bucket as public one.
         client_config: Optional client configuration for the storage client.
         delta: If True, only process new or changed files instead of reprocessing
            everything. This saves time by skipping files that were already processed in
@@ -80,12 +91,19 @@ def read_storage(
        chain = dc.read_storage("s3://my-bucket/my-dir")
        ```
 
+       Match all .json files recursively using glob pattern
+       ```py
+       chain = dc.read_storage("gs://bucket/meta/**/*.json")
+       ```
+
+       Match image file extensions for directories with pattern
+       ```py
+       chain = dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
+       ```
+
        Multiple URIs:
        ```python
-       chain = dc.read_storage([
-           "s3://bucket1/dir1",
-           "s3://bucket2/dir2"
-       ])
+       chain = dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
        ```
 
        With AWS S3-compatible storage:
@@ -95,19 +113,6 @@ def read_storage(
            client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
        )
        ```
-
-       Pass existing session
-       ```py
-       session = Session.get()
-       chain = dc.read_storage([
-           "path/to/dir1",
-           "path/to/dir2"
-       ], session=session, recursive=True)
-       ```
-
-    Note:
-        When using multiple URIs with `update=True`, the function optimizes by
-        avoiding redundant updates for URIs pointing to the same storage location.
     """
     from .datachain import DataChain
     from .datasets import read_dataset
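
The docstring changes above pair with the brace-expansion logic added to the function body in the hunks below. A hedged usage sketch (the bucket and directory names are made up) of how one brace pattern fans out into multiple listings:

```python
import datachain as dc

# "{train,val}" is expanded before listing, so this call behaves like reading the
# two expanded URIs and unioning the results; the "**/*.jpg" part is applied as a
# filter on the file paths. Glob characters in the bucket name itself are rejected.
chain = dc.read_storage("s3://my-bucket/{train,val}/**/*.jpg")

# Roughly equivalent, spelled out explicitly:
chain = dc.read_storage([
    "s3://my-bucket/train/**/*.jpg",
    "s3://my-bucket/val/**/*.jpg",
])
```
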
@@ -130,13 +135,36 @@ def read_storage(
     if not uris:
         raise ValueError("No URIs provided")
 
+    # Then expand all URIs that contain brace patterns
+    expanded_uris = []
+    for single_uri in uris:
+        uri_str = str(single_uri)
+        validate_cloud_bucket_name(uri_str)
+        expanded_uris.extend(expand_brace_pattern(uri_str))
+
+    # Now process each expanded URI
     chains = []
     listed_ds_name = set()
     file_values = []
 
-    for single_uri in uris:
+    updated_uris = set()
+
+    for single_uri in expanded_uris:
+        # Check if URI contains glob patterns and split them
+        base_uri, glob_pattern = split_uri_pattern(single_uri)
+
+        # If a pattern is found, use the base_uri for listing
+        # The pattern will be used for filtering later
+        list_uri_to_use = base_uri if glob_pattern else single_uri
+
+        # Avoid double updates for the same URI
+        update_single_uri = False
+        if update and (list_uri_to_use not in updated_uris):
+            updated_uris.add(list_uri_to_use)
+            update_single_uri = True
+
         list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
-            single_uri, session, update=update
+            list_uri_to_use, session, update=update_single_uri
         )
 
         # list_ds_name is None if object is a file, we don't want to use cache
@@ -185,7 +213,21 @@ def read_storage(
                 lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
             )
 
-        chains.append(ls(dc, list_path, recursive=recursive, column=column))
+        # If a glob pattern was detected, use it for filtering
+        # Otherwise, use the original list_path from get_listing
+        if glob_pattern:
+            # Determine if we should use recursive listing based on the pattern
+            use_recursive = should_use_recursion(glob_pattern, recursive or False)
+
+            # Apply glob filter - no need for brace expansion here as it's done above
+            chain = apply_glob_filter(
+                dc, glob_pattern, list_path, use_recursive, column
+            )
+            chains.append(chain)
+        else:
+            # No glob pattern detected, use normal ls behavior
+            chains.append(ls(dc, list_path, recursive=recursive, column=column))
+
         listed_ds_name.add(list_ds_name)
 
     storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
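
To trace the new control flow on one concrete URI: the values in the comments below follow from `split_uri_pattern` and `should_use_recursion` as defined in the new `storage_pattern` module shown next; the URI is the one from the docstring example.

```python
from datachain.lib.dc.storage_pattern import should_use_recursion, split_uri_pattern

base_uri, glob_pattern = split_uri_pattern("gs://bucket/meta/**/*.json")
# base_uri == "gs://bucket/meta"  -> this is what get_listing() receives
# glob_pattern == "**/*.json"     -> this is only used to filter file paths later

use_recursive = should_use_recursion(glob_pattern, True)
# The "**" globstar forces a recursive listing, so use_recursive is True.
```
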
datachain/lib/dc/storage_pattern.py (new file)

@@ -0,0 +1,300 @@
+import glob
+from typing import TYPE_CHECKING, Union
+
+from datachain.client.fsspec import is_cloud_uri
+from datachain.lib.listing import ls
+
+if TYPE_CHECKING:
+    from .datachain import DataChain
+
+
+def validate_cloud_bucket_name(uri: str) -> None:
+    """
+    Validate that cloud storage bucket names don't contain glob patterns.
+
+    Args:
+        uri: URI to validate
+
+    Raises:
+        ValueError: If a cloud storage bucket name contains glob patterns
+    """
+    if not is_cloud_uri(uri):
+        return
+
+    # Extract bucket name (everything between :// and first /)
+    if "://" in uri:
+        scheme_end = uri.index("://") + 3
+        path_part = uri[scheme_end:]
+
+        # Get the bucket name (first segment)
+        if "/" in path_part:
+            bucket_name = path_part.split("/")[0]
+        else:
+            bucket_name = path_part
+
+        # Check if bucket name contains glob patterns
+        glob_chars = ["*", "?", "[", "]", "{", "}"]
+        if any(char in bucket_name for char in glob_chars):
+            raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
+
+
+def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
+    """
+    Split a URI into base path and glob pattern.
+
+    Args:
+        uri: URI that may contain glob patterns (*, **, ?, {})
+
+    Returns:
+        Tuple of (base_uri, pattern) where pattern is None if no glob pattern found
+
+    Examples:
+        "s3://bucket/dir/*.mp3" -> ("s3://bucket/dir", "*.mp3")
+        "s3://bucket/**/*.mp3" -> ("s3://bucket", "**/*.mp3")
+        "s3://bucket/dir" -> ("s3://bucket/dir", None)
+    """
+    if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
+        return uri, None
+
+    # Handle different URI schemes
+    if "://" in uri:
+        # Split into scheme and path
+        scheme_end = uri.index("://") + 3
+        scheme_part = uri[:scheme_end]
+        path_part = uri[scheme_end:]
+
+        # Find where the glob pattern starts
+        path_segments = path_part.split("/")
+
+        # Find first segment with glob pattern
+        pattern_start_idx = None
+        for i, segment in enumerate(path_segments):
+            # Check for glob patterns including brace expansion
+            if glob.has_magic(segment) or "{" in segment:
+                pattern_start_idx = i
+                break
+
+        if pattern_start_idx is None:
+            return uri, None
+
+        # Split into base and pattern
+        if pattern_start_idx == 0:
+            # Pattern at root of bucket
+            base = scheme_part + path_segments[0]
+            pattern = "/".join(path_segments[1:]) if len(path_segments) > 1 else "*"
+        else:
+            base = scheme_part + "/".join(path_segments[:pattern_start_idx])
+            pattern = "/".join(path_segments[pattern_start_idx:])
+
+        return base, pattern
+    # Local path
+    path_segments = uri.split("/")
+
+    # Find first segment with glob pattern
+    pattern_start_idx = None
+    for i, segment in enumerate(path_segments):
+        # Check for glob patterns including brace expansion
+        if glob.has_magic(segment) or "{" in segment:
+            pattern_start_idx = i
+            break
+
+    if pattern_start_idx is None:
+        return uri, None
+
+    # Split into base and pattern
+    base = "/".join(path_segments[:pattern_start_idx]) if pattern_start_idx > 0 else "/"
+    pattern = "/".join(path_segments[pattern_start_idx:])
+
+    return base, pattern
+
+
+def should_use_recursion(pattern: str, user_recursive: bool) -> bool:
+    """
+    Determine if we should use recursive listing based on the pattern.
+
+    Args:
+        pattern: The glob pattern extracted from URI
+        user_recursive: User's recursive preference
+
+    Returns:
+        True if recursive listing should be used
+
+    Examples:
+        "*" -> False (single level only)
+        "*.mp3" -> False (single level only)
+        "**/*.mp3" -> True (globstar requires recursion)
+        "dir/*/file.txt" -> True (multi-level pattern)
+    """
+    if not user_recursive:
+        # If user explicitly wants non-recursive, respect that
+        return False
+
+    # If pattern contains globstar, definitely need recursion
+    if "**" in pattern:
+        return True
+
+    # If pattern contains path separators, it needs recursion
+    # Single-level patterns like "*", "*.txt", "file?" should not be recursive
+    return "/" in pattern
+
+
+def expand_brace_pattern(pattern: str) -> list[str]:
+    """
+    Recursively expand brace patterns like *.{mp3,wav} into multiple glob patterns.
+    Handles nested and multiple brace patterns.
+
+    Args:
+        pattern: Pattern that may contain brace expansion
+
+    Returns:
+        List of expanded patterns
+
+    Examples:
+        "*.{mp3,wav}" -> ["*.mp3", "*.wav"]
+        "{a,b}/{c,d}" -> ["a/c", "a/d", "b/c", "b/d"]
+        "*.txt" -> ["*.txt"]
+        "{{a,b}}" -> ["{a}", "{b}"]  # Handle double braces
+    """
+    if "{" not in pattern or "}" not in pattern:
+        return [pattern]
+
+    return _expand_single_braces(pattern)
+
+
+def _expand_single_braces(pattern: str) -> list[str]:
+    """Helper to expand single-level braces."""
+    if "{" not in pattern or "}" not in pattern:
+        return [pattern]
+
+    # Find the first complete brace pattern
+    start = pattern.index("{")
+    end = start
+    depth = 0
+    for i in range(start, len(pattern)):
+        if pattern[i] == "{":
+            depth += 1
+        elif pattern[i] == "}":
+            depth -= 1
+            if depth == 0:
+                end = i
+                break
+
+    if start >= end:
+        return [pattern]
+
+    prefix = pattern[:start]
+    suffix = pattern[end + 1 :]
+    options = pattern[start + 1 : end].split(",")
+
+    # Generate all combinations and recursively expand
+    expanded = []
+    for option in options:
+        combined = prefix + option.strip() + suffix
+        # Recursively expand any remaining braces
+        expanded.extend(_expand_single_braces(combined))
+
+    return expanded
+
+
+def convert_globstar_to_glob(filter_pattern: str) -> str:
+    """Convert globstar patterns to GLOB patterns.
+
+    Standard GLOB doesn't understand ** as recursive wildcard,
+    so we need to convert patterns appropriately.
+
+    Args:
+        filter_pattern: Pattern that may contain globstars (**)
+
+    Returns:
+        GLOB-compatible pattern
+    """
+    if "**" not in filter_pattern:
+        return filter_pattern
+
+    parts = filter_pattern.split("/")
+    globstar_positions = [i for i, p in enumerate(parts) if p == "**"]
+
+    # Handle different cases based on number of globstars
+    num_globstars = len(globstar_positions)
+
+    if num_globstars <= 1:
+        # Special case: pattern like **/* means zero or more directories
+        # This is tricky because GLOB can't express "zero or more"
+        # We need different handling based on the pattern structure
+
+        if filter_pattern == "**/*":
+            # Match everything
+            return "*"
+        if filter_pattern.startswith("**/"):
+            remaining = filter_pattern[3:]
+            if "/" not in remaining:
+                # Pattern like **/*.ext or **/temp?.*
+                # The ** means zero or more directories
+                # For zero directories: pattern should be just the filename pattern
+                # For one or more: pattern should be */filename
+                # Since we can't OR in GLOB, we choose the more permissive option
+                # that works with recursive listing
+                # Special handling: if it's a simple extension pattern, match broadly
+                if remaining.startswith("*."):
+                    # Pattern like **/*.ext - match any file with this extension
+                    # This matches *.ext at current level and deeper with recursion:
+                    return remaining
+                # Pattern like **/temp?.* - match as filename in subdirs
+                return f"*/{remaining}"
+
+        # Default: Zero or one globstar - simple replacement
+        return filter_pattern.replace("**", "*")
+
+    # Multiple globstars - need more careful handling
+    # For patterns like **/level?/backup/**/*.ext
+    # We want to match any path containing /level?/backup/ and ending with .ext
+
+    # Find middle directories (between first and last **)
+    middle_parts = []
+    start_idx = globstar_positions[0] + 1
+    end_idx = globstar_positions[-1]
+    for i in range(start_idx, end_idx):
+        if parts[i] != "**":
+            middle_parts.append(parts[i])
+
+    if not middle_parts:
+        # No fixed middle parts, just use wildcards
+        result = filter_pattern.replace("**", "*")
+    else:
+        # Create pattern that matches the middle parts
+        middle_pattern = "/".join(middle_parts)
+        # Get the file pattern at the end if any
+        last_part = parts[-1] if parts[-1] != "**" else "*"
+
+        # Match any path containing this pattern
+        if last_part != "*":
+            # Has specific file pattern
+            result = f"*{middle_pattern}*{last_part}"
+        else:
+            result = f"*{middle_pattern}*"
+
+    return result
+
+
+def apply_glob_filter(
+    dc: "DataChain",
+    pattern: str,
+    list_path: str,
+    use_recursive: bool,
+    column: str,
+) -> "DataChain":
+    from datachain.query.schema import Column
+
+    chain = ls(dc, list_path, recursive=use_recursive, column=column)
+
+    # If pattern doesn't contain path separator and list_path is not empty,
+    # prepend the list_path to make the pattern match correctly
+    if list_path and "/" not in pattern:
+        filter_pattern = f"{list_path.rstrip('/')}/{pattern}"
+    else:
+        filter_pattern = pattern
+
+    # Convert globstar patterns to GLOB-compatible patterns
+    glob_pattern = convert_globstar_to_glob(filter_pattern)
+
+    return chain.filter(Column(f"{column}.path").glob(glob_pattern))
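
A short sketch of the pattern helpers in isolation; the expected values are taken from the docstrings and the conversion rules above rather than from running the released wheel.

```python
from datachain.lib.dc.storage_pattern import (
    convert_globstar_to_glob,
    expand_brace_pattern,
)

# Brace expansion happens once, up front, on the whole URI or pattern.
assert expand_brace_pattern("*.{mp3,wav}") == ["*.mp3", "*.wav"]
assert expand_brace_pattern("{a,b}/{c,d}") == ["a/c", "a/d", "b/c", "b/d"]

# "**/" in front of a plain extension pattern collapses to the extension pattern;
# the recursive listing supplies the directory depth.
assert convert_globstar_to_glob("**/*.json") == "*.json"
# A single embedded globstar is replaced with a plain "*" segment.
assert convert_globstar_to_glob("2024/**/*.csv") == "2024/*/*.csv"
```
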

datachain-0.31.2.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.31.1
+Version: 0.31.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0

datachain-0.31.2.dist-info/RECORD

@@ -41,7 +41,7 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=kb_myMWcgGFClY5Rsv6fvHIRblg41dfH5knHJuDbW6w,14015
+datachain/client/fsspec.py,sha256=sChjxu931QgU2-n9MdXlmOrhGAiAckXoDVZTxKcNv6M,14336
 datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
 datachain/client/hf.py,sha256=n5xJZdvNLS-SqokxuBCIPfGbhIeC_XfLm_BNYtEVvg4,2677
 datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,4607
@@ -112,7 +112,8 @@ datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=STgm19AM-etu7WmOUMJa5Z9GI6tPC-A0P3JO3ulfsKo,1839
 datachain/lib/dc/records.py,sha256=l7TKSKjT6boXGd05KA5vvax-Y-mLMOo46VWrlxPhmdQ,3067
-datachain/lib/dc/storage.py,sha256=asqug7UcW6qgyGGEm7b9ZxfbP2UDIeINF07Ngx06KDE,7887
+datachain/lib/dc/storage.py,sha256=pydeiGLMsmDvruVY_bC5GsV6VLpYpRf7szrD0S2pTmE,9688
+datachain/lib/dc/storage_pattern.py,sha256=QDLLSuBd1mdfkdRi3srGXXigs7rHw3vAnQedjE01_H8,9779
 datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -160,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.31.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.31.1.dist-info/METADATA,sha256=5VpADYBwb_LImU9zvCBSHmycEFwnAxf8EZPMziLp6sM,13898
-datachain-0.31.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.31.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.31.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.31.1.dist-info/RECORD,,
+datachain-0.31.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.31.2.dist-info/METADATA,sha256=ALo4Vp6w2VSanACVy1xv6aHWzbdasSKzD2U8_SybXBU,13898
+datachain-0.31.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.31.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.31.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.31.2.dist-info/RECORD,,