datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/storage.py CHANGED
@@ -1,22 +1,17 @@
- import os.path
+ import os
  from collections.abc import Sequence
  from functools import reduce
- from typing import (
-     TYPE_CHECKING,
-     Optional,
-     Union,
- )
+ from typing import TYPE_CHECKING

- from datachain.lib.file import (
-     FileType,
-     get_file_type,
- )
- from datachain.lib.listing import (
-     get_file_info,
-     get_listing,
-     list_bucket,
-     ls,
+ from datachain.lib.dc.storage_pattern import (
+     apply_glob_filter,
+     expand_brace_pattern,
+     should_use_recursion,
+     split_uri_pattern,
+     validate_cloud_bucket_name,
  )
+ from datachain.lib.file import FileType, get_file_type
+ from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
  from datachain.query import Session

  if TYPE_CHECKING:
@@ -24,40 +19,46 @@ if TYPE_CHECKING:


  def read_storage(
-     uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
+     uri: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
      *,
      type: FileType = "binary",
-     session: Optional[Session] = None,
-     settings: Optional[dict] = None,
+     session: Session | None = None,
+     settings: dict | None = None,
      in_memory: bool = False,
-     recursive: Optional[bool] = True,
+     recursive: bool | None = True,
      column: str = "file",
      update: bool = False,
-     anon: Optional[bool] = None,
-     delta: Optional[bool] = False,
-     delta_on: Optional[Union[str, Sequence[str]]] = (
+     anon: bool | None = None,
+     delta: bool | None = False,
+     delta_on: str | Sequence[str] | None = (
          "file.path",
          "file.etag",
          "file.version",
      ),
-     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
-     delta_compare: Optional[Union[str, Sequence[str]]] = None,
-     delta_retry: Optional[Union[bool, str]] = None,
-     client_config: Optional[dict] = None,
+     delta_result_on: str | Sequence[str] | None = None,
+     delta_compare: str | Sequence[str] | None = None,
+     delta_retry: bool | str | None = None,
+     delta_unsafe: bool = False,
+     client_config: dict | None = None,
  ) -> "DataChain":
      """Get data from storage(s) as a list of file with all file attributes.
      It returns the chain itself as usual.

      Parameters:
-         uri : storage URI with directory or list of URIs.
-             URIs must start with storage prefix such
-             as `s3://`, `gs://`, `az://` or "file:///"
-         type : read file as "binary", "text", or "image" data. Default is "binary".
-         recursive : search recursively for the given path.
-         column : Created column name.
-         update : force storage reindexing. Default is False.
-         anon : If True, we will treat cloud bucket as public one
-         client_config : Optional client configuration for the storage client.
+         uri: Storage path(s) or URI(s). Can be a local path or start with a
+             storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+             Supports glob patterns:
+             - `*` : wildcard
+             - `**` : recursive wildcard
+             - `?` : single character
+             - `{a,b}` : brace expansion list
+             - `{1..9}` : brace numeric or alphabetic range
+         type: read file as "binary", "text", or "image" data. Default is "binary".
+         recursive: search recursively for the given path.
+         column: Column name that will contain File objects. Default is "file".
+         update: force storage reindexing. Default is False.
+         anon: If True, we will treat cloud bucket as public one.
+         client_config: Optional client configuration for the storage client.
          delta: If True, only process new or changed files instead of reprocessing
              everything. This saves time by skipping files that were already processed in
              previous versions. The optimization is working when a new version of the
@@ -77,6 +78,9 @@ def read_storage(
              (error mode)
              - True: Reprocess records missing from the result dataset (missing mode)
              - None: No retry processing (default)
+         delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+             distinct. Caller must ensure datasets are consistent and not partially
+             updated.

      Returns:
          DataChain: A DataChain object containing the file information.
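For context, a minimal sketch of how the new `delta_unsafe` flag would be passed alongside delta processing. The URI and the combination of arguments are illustrative, not taken from the diff; only the parameter names and defaults come from the signature above.

```python
import datachain as dc

# Illustrative: delta processing where a downstream step uses one of the
# restricted operations (merge, agg, union, group_by, distinct), which the
# docstring says requires delta_unsafe=True.
chain = dc.read_storage(
    "s3://my-bucket/my-dir",  # hypothetical bucket, not from the diff
    delta=True,
    delta_on=("file.path", "file.etag", "file.version"),
    delta_unsafe=True,  # caller guarantees datasets are consistent, not partially updated
)
```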
@@ -85,37 +89,36 @@ def read_storage(
      Simple call from s3:
      ```python
      import datachain as dc
-     chain = dc.read_storage("s3://my-bucket/my-dir")
+     dc.read_storage("s3://my-bucket/my-dir")
+     ```
+
+     Match all .json files recursively using glob pattern
+     ```py
+     dc.read_storage("gs://bucket/meta/**/*.json")
+     ```
+
+     Match image file extensions for directories with pattern
+     ```py
+     dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
+     ```
+
+     By ranges in filenames:
+     ```py
+     dc.read_storage("s3://bucket/202{1..4}/**/*.{jpg,jpeg,png}")
      ```

      Multiple URIs:
      ```python
-     chain = dc.read_storage([
-         "s3://bucket1/dir1",
-         "s3://bucket2/dir2"
-     ])
+     dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
      ```

      With AWS S3-compatible storage:
      ```python
-     chain = dc.read_storage(
+     dc.read_storage(
          "s3://my-bucket/my-dir",
          client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
      )
      ```
-
-     Pass existing session
-     ```py
-     session = Session.get()
-     chain = dc.read_storage([
-         "path/to/dir1",
-         "path/to/dir2"
-     ], session=session, recursive=True)
-     ```
-
-     Note:
-         When using multiple URIs with `update=True`, the function optimizes by
-         avoiding redundant updates for URIs pointing to the same storage location.
      """
      from .datachain import DataChain
      from .datasets import read_dataset
@@ -138,13 +141,36 @@ def read_storage(
      if not uris:
          raise ValueError("No URIs provided")

+     # Then expand all URIs that contain brace patterns
+     expanded_uris = []
+     for single_uri in uris:
+         uri_str = str(single_uri)
+         validate_cloud_bucket_name(uri_str)
+         expanded_uris.extend(expand_brace_pattern(uri_str))
+
+     # Now process each expanded URI
      chains = []
      listed_ds_name = set()
      file_values = []

-     for single_uri in uris:
+     updated_uris = set()
+
+     for single_uri in expanded_uris:
+         # Check if URI contains glob patterns and split them
+         base_uri, glob_pattern = split_uri_pattern(single_uri)
+
+         # If a pattern is found, use the base_uri for listing
+         # The pattern will be used for filtering later
+         list_uri_to_use = base_uri if glob_pattern else single_uri
+
+         # Avoid double updates for the same URI
+         update_single_uri = False
+         if update and (list_uri_to_use not in updated_uris):
+             updated_uris.add(list_uri_to_use)
+             update_single_uri = True
+
          list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
-             single_uri, session, update=update
+             list_uri_to_use, session, update=update_single_uri
          )

          # list_ds_name is None if object is a file, we don't want to use cache
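To illustrate the loop above, a small sketch of what the two helpers from the new `storage_pattern.py` module (added later in this diff) return for a sample URI; the bucket name is made up, and the outputs are derived from the function definitions shown below.

```python
from datachain.lib.dc.storage_pattern import expand_brace_pattern, split_uri_pattern

# Brace patterns are expanded first, so one input URI may become several:
expand_brace_pattern("s3://bucket/202{1..3}/*.{jpg,png}")
# -> ['s3://bucket/2021/*.jpg', 's3://bucket/2021/*.png',
#     's3://bucket/2022/*.jpg', 's3://bucket/2022/*.png',
#     's3://bucket/2023/*.jpg', 's3://bucket/2023/*.png']

# Each expanded URI is then split into a listable base and a glob filter:
split_uri_pattern("s3://bucket/2021/*.jpg")  # -> ('s3://bucket/2021', '*.jpg')
split_uri_pattern("s3://bucket/2021/data")   # -> ('s3://bucket/2021/data', None)
```

Only the base part is listed (and re-indexed at most once per base when `update=True`); the pattern part is kept for filtering further down.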
@@ -161,6 +187,12 @@
              project=listing_project_name,
              session=session,
              settings=settings,
+             delta=delta,
+             delta_on=delta_on,
+             delta_result_on=delta_result_on,
+             delta_compare=delta_compare,
+             delta_retry=delta_retry,
+             delta_unsafe=delta_unsafe,
          )
          dc._query.update = update
          dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})
@@ -193,7 +225,21 @@
              lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
          )

-         chains.append(ls(dc, list_path, recursive=recursive, column=column))
+         # If a glob pattern was detected, use it for filtering
+         # Otherwise, use the original list_path from get_listing
+         if glob_pattern:
+             # Determine if we should use recursive listing based on the pattern
+             use_recursive = should_use_recursion(glob_pattern, recursive or False)
+
+             # Apply glob filter - no need for brace expansion here as it's done above
+             chain = apply_glob_filter(
+                 dc, glob_pattern, list_path, use_recursive, column
+             )
+             chains.append(chain)
+         else:
+             # No glob pattern detected, use normal ls behavior
+             chains.append(ls(dc, list_path, recursive=recursive, column=column))
+
          listed_ds_name.add(list_ds_name)

      storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
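A short sketch of how the recursion decision behaves for a few patterns, based on `should_use_recursion` as defined in `storage_pattern.py` below; the pattern strings are illustrative.

```python
from datachain.lib.dc.storage_pattern import should_use_recursion

should_use_recursion("*.json", True)      # False: single-level pattern, one listing level is enough
should_use_recursion("**/*.json", True)   # True: globstar implies recursive listing
should_use_recursion("2021/*.jpg", True)  # True: pattern spans directory levels
should_use_recursion("**/*.json", False)  # False: user explicitly disabled recursion
```

When recursion is off, `ls` produces a single-level listing and the glob filter is applied to that level only.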
@@ -212,12 +258,4 @@

      assert storage_chain is not None

-     if delta:
-         storage_chain = storage_chain._as_delta(
-             on=delta_on,
-             right_on=delta_result_on,
-             compare=delta_compare,
-             delta_retry=delta_retry,
-         )
-
      return storage_chain
datachain/lib/dc/storage_pattern.py ADDED
@@ -0,0 +1,251 @@
+ import glob
+ from typing import TYPE_CHECKING
+
+ from datachain.client.fsspec import is_cloud_uri
+ from datachain.lib.listing import ls
+
+ if TYPE_CHECKING:
+     from .datachain import DataChain
+
+
+ def validate_cloud_bucket_name(uri: str) -> None:
+     """
+     Validate that cloud storage bucket names don't contain glob patterns.
+
+     Raises:
+         ValueError: If a cloud storage bucket name contains glob patterns
+     """
+     if not is_cloud_uri(uri):
+         return
+
+     if "://" in uri:
+         scheme_end = uri.index("://") + 3
+         path_part = uri[scheme_end:]
+
+         if "/" in path_part:
+             bucket_name = path_part.split("/")[0]
+         else:
+             bucket_name = path_part
+
+         glob_chars = ["*", "?", "[", "]", "{", "}"]
+         if any(char in bucket_name for char in glob_chars):
+             raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
+
+
+ def split_uri_pattern(uri: str) -> tuple[str, str | None]:
+     """Split a URI into base path and glob pattern."""
+     if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
+         return uri, None
+
+     if "://" in uri:
+         scheme_end = uri.index("://") + 3
+         scheme_part = uri[:scheme_end]
+         path_part = uri[scheme_end:]
+         path_segments = path_part.split("/")
+
+         pattern_start_idx = None
+         for i, segment in enumerate(path_segments):
+             # Check for glob patterns including brace expansion
+             if glob.has_magic(segment) or "{" in segment:
+                 pattern_start_idx = i
+                 break
+
+         if pattern_start_idx is None:
+             return uri, None
+
+         if pattern_start_idx == 0:
+             base = scheme_part + path_segments[0]
+             pattern = "/".join(path_segments[1:]) if len(path_segments) > 1 else "*"
+         else:
+             base = scheme_part + "/".join(path_segments[:pattern_start_idx])
+             pattern = "/".join(path_segments[pattern_start_idx:])
+
+         return base, pattern
+
+     path_segments = uri.split("/")
+
+     pattern_start_idx = None
+     for i, segment in enumerate(path_segments):
+         if glob.has_magic(segment) or "{" in segment:
+             pattern_start_idx = i
+             break
+
+     if pattern_start_idx is None:
+         return uri, None
+
+     base = "/".join(path_segments[:pattern_start_idx]) if pattern_start_idx > 0 else "/"
+     pattern = "/".join(path_segments[pattern_start_idx:])
+
+     return base, pattern
+
+
+ def should_use_recursion(pattern: str, user_recursive: bool) -> bool:
+     if not user_recursive:
+         return False
+
+     if "**" in pattern:
+         return True
+
+     return "/" in pattern
+
+
+ def expand_brace_pattern(pattern: str) -> list[str]:
+     """
+     Recursively expand brace patterns into multiple glob patterns.
+     Supports:
+     - Comma-separated lists: *.{mp3,wav}
+     - Numeric ranges: file{1..10}
+     - Zero-padded numeric ranges: file{01..10}
+     - Character ranges: file{a..z}
+
+     Examples:
+         "*.{mp3,wav}" -> ["*.mp3", "*.wav"]
+         "file{1..3}" -> ["file1", "file2", "file3"]
+         "file{01..03}" -> ["file01", "file02", "file03"]
+         "file{a..c}" -> ["filea", "fileb", "filec"]
+         "{a,b}/{c,d}" -> ["a/c", "a/d", "b/c", "b/d"]
+     """
+     if "{" not in pattern or "}" not in pattern:
+         return [pattern]
+
+     return _expand_single_braces(pattern)
+
+
+ def _expand_single_braces(pattern: str) -> list[str]:
+     if "{" not in pattern or "}" not in pattern:
+         return [pattern]
+
+     start = pattern.index("{")
+     end = start
+     depth = 0
+     for i in range(start, len(pattern)):
+         if pattern[i] == "{":
+             depth += 1
+         elif pattern[i] == "}":
+             depth -= 1
+             if depth == 0:
+                 end = i
+                 break
+
+     if start >= end:
+         return [pattern]
+
+     prefix = pattern[:start]
+     suffix = pattern[end + 1 :]
+     brace_content = pattern[start + 1 : end]
+
+     if ".." in brace_content:
+         options = _expand_range(brace_content)
+     else:
+         options = [opt.strip() for opt in brace_content.split(",")]
+
+     expanded = []
+     for option in options:
+         combined = prefix + option + suffix
+         expanded.extend(_expand_single_braces(combined))
+
+     return expanded
+
+
+ def _expand_range(range_spec: str) -> list[str]:  # noqa: PLR0911
+     if ".." not in range_spec:
+         return [range_spec]
+
+     parts = range_spec.split("..")
+     if len(parts) != 2:
+         return [range_spec]
+
+     start, end = parts[0], parts[1]
+
+     if start.isdigit() and end.isdigit():
+         pad_width = max(len(start), len(end)) if start[0] == "0" or end[0] == "0" else 0
+         start_num = int(start)
+         end_num = int(end)
+
+         if start_num <= end_num:
+             if pad_width > 0:
+                 return [str(i).zfill(pad_width) for i in range(start_num, end_num + 1)]
+             return [str(i) for i in range(start_num, end_num + 1)]
+         if pad_width > 0:
+             return [str(i).zfill(pad_width) for i in range(start_num, end_num - 1, -1)]
+         return [str(i) for i in range(start_num, end_num - 1, -1)]
+
+     if len(start) == 1 and len(end) == 1 and start.isalpha() and end.isalpha():
+         start_ord = ord(start)
+         end_ord = ord(end)
+
+         if start_ord <= end_ord:
+             return [chr(i) for i in range(start_ord, end_ord + 1)]
+         return [chr(i) for i in range(start_ord, end_ord - 1, -1)]
+
+     return [range_spec]
+
+
+ def convert_globstar_to_glob(filter_pattern: str) -> str:
+     if "**" not in filter_pattern:
+         return filter_pattern
+
+     parts = filter_pattern.split("/")
+     globstar_positions = [i for i, p in enumerate(parts) if p == "**"]
+
+     num_globstars = len(globstar_positions)
+
+     if num_globstars <= 1:
+         if filter_pattern == "**/*":
+             return "*"
+         if filter_pattern.startswith("**/"):
+             remaining = filter_pattern[3:]
+             if "/" not in remaining:
+                 # Pattern like **/*.ext or **/temp?.*
+                 # The ** means zero or more directories
+                 # For zero directories: pattern should be just the filename pattern
+                 # For one or more: pattern should be */filename
+                 # Since we can't OR in GLOB, we choose the more permissive option
+                 # that works with recursive listing
+                 # Special handling: if it's a simple extension pattern, match broadly
+                 if remaining.startswith("*."):
+                     return remaining
+                 return f"*/{remaining}"
+
+         return filter_pattern.replace("**", "*")
+
+     middle_parts = []
+     start_idx = globstar_positions[0] + 1
+     end_idx = globstar_positions[-1]
+     for i in range(start_idx, end_idx):
+         if parts[i] != "**":
+             middle_parts.append(parts[i])
+
+     if not middle_parts:
+         result = filter_pattern.replace("**", "*")
+     else:
+         middle_pattern = "/".join(middle_parts)
+         last_part = parts[-1] if parts[-1] != "**" else "*"
+
+         if last_part != "*":
+             result = f"*{middle_pattern}*{last_part}"
+         else:
+             result = f"*{middle_pattern}*"
+
+     return result
+
+
+ def apply_glob_filter(
+     dc: "DataChain",
+     pattern: str,
+     list_path: str,
+     use_recursive: bool,
+     column: str,
+ ) -> "DataChain":
+     from datachain.query.schema import Column
+
+     chain = ls(dc, list_path, recursive=use_recursive, column=column)
+
+     if list_path and "/" not in pattern:
+         filter_pattern = f"{list_path.rstrip('/')}/{pattern}"
+     else:
+         filter_pattern = pattern
+
+     glob_pattern = convert_globstar_to_glob(filter_pattern)
+
+     return chain.filter(Column(f"{column}.path").glob(glob_pattern))
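Since SQL `GLOB` cannot express "zero or more directories", `convert_globstar_to_glob` approximates `**`. A few illustrative inputs with the outputs the code above produces:

```python
from datachain.lib.dc.storage_pattern import convert_globstar_to_glob

convert_globstar_to_glob("**/*")            # -> '*'
convert_globstar_to_glob("**/*.json")       # -> '*.json' (extension patterns match broadly)
convert_globstar_to_glob("**/temp?.txt")    # -> '*/temp?.txt'
convert_globstar_to_glob("meta/**/*.json")  # -> 'meta/*/*.json'
```

`apply_glob_filter` then applies the converted pattern via `chain.filter(Column(f"{column}.path").glob(...))`, as shown in the final lines of the hunk above.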
datachain/lib/dc/utils.py CHANGED
@@ -1,12 +1,6 @@
  from collections.abc import Sequence
  from functools import wraps
- from typing import (
-     TYPE_CHECKING,
-     Callable,
-     Optional,
-     TypeVar,
-     Union,
- )
+ from typing import TYPE_CHECKING, TypeVar

  import sqlalchemy
  from sqlalchemy.sql.functions import GenericFunction
@@ -18,7 +12,10 @@ from datachain.query.schema import DEFAULT_DELIMITER
  from datachain.utils import getenv_bool

  if TYPE_CHECKING:
-     from typing_extensions import Concatenate, ParamSpec
+     from collections.abc import Callable
+     from typing import Concatenate
+
+     from typing_extensions import ParamSpec

      from .datachain import DataChain

@@ -28,9 +25,15 @@ D = TypeVar("D", bound="DataChain")


  def is_studio() -> bool:
+     """Check if the runtime environment is Studio (not local)."""
      return getenv_bool("DATACHAIN_IS_STUDIO", default=False)


+ def is_local() -> bool:
+     """Check if the runtime environment is local (not Studio)."""
+     return not is_studio()
+
+
  def resolve_columns(
      method: "Callable[Concatenate[D, P], D]",
  ) -> "Callable[Concatenate[D, P], D]":
@@ -70,11 +73,11 @@ class DatasetFromValuesError(DataChainParamsError):
          super().__init__(f"Dataset{name} from values error: {msg}")


- MergeColType = Union[str, Function, sqlalchemy.ColumnElement]
+ MergeColType = str | Function | sqlalchemy.ColumnElement


  def _validate_merge_on(
-     on: Union[MergeColType, Sequence[MergeColType]],
+     on: MergeColType | Sequence[MergeColType],
      ds: "DataChain",
  ) -> Sequence[MergeColType]:
      if isinstance(on, (str, sqlalchemy.ColumnElement)):
@@ -103,12 +106,12 @@ def _get_merge_error_str(col: MergeColType) -> str:
  class DatasetMergeError(DataChainParamsError):
      def __init__(
          self,
-         on: Union[MergeColType, Sequence[MergeColType]],
-         right_on: Optional[Union[MergeColType, Sequence[MergeColType]]],
+         on: MergeColType | Sequence[MergeColType],
+         right_on: MergeColType | Sequence[MergeColType] | None,
          msg: str,
      ):
          def _get_str(
-             on: Union[MergeColType, Sequence[MergeColType]],
+             on: MergeColType | Sequence[MergeColType],
          ) -> str:
              if not isinstance(on, Sequence):
                  return str(on)  # type: ignore[unreachable]
@@ -123,7 +126,7 @@ class DatasetMergeError(DataChainParamsError):
          super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")


- OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
+ OutputType = DataType | Sequence[str] | dict[str, DataType] | None


  class Sys(DataModel):
datachain/lib/dc/values.py CHANGED
@@ -1,8 +1,5 @@
  from collections.abc import Iterator
- from typing import (
-     TYPE_CHECKING,
-     Optional,
- )
+ from typing import TYPE_CHECKING

  from datachain.lib.convert.values_to_tuples import values_to_tuples
  from datachain.lib.data_model import dict_to_data_model
@@ -20,8 +17,8 @@ if TYPE_CHECKING:

  def read_values(
      ds_name: str = "",
-     session: Optional[Session] = None,
-     settings: Optional[dict] = None,
+     session: Session | None = None,
+     settings: dict | None = None,
      in_memory: bool = False,
      output: OutputType = None,
      column: str = "",
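The hunk above only modernizes the type annotations of `read_values`. For context, a minimal usage sketch, assuming the public DataChain API where extra keyword arguments map column names to lists of values (that part of the signature is not shown in this diff):

```python
import datachain as dc

# Assumed usage: each keyword becomes a column backed by the given values.
chain = dc.read_values(
    ids=[1, 2, 3],
    names=["a", "b", "c"],
    in_memory=True,  # parameter shown in the diff; keeps the backing DB in memory
)
```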