datachain 0.32.0__py3-none-any.whl → 0.32.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +1 -1
- datachain/dataset.py +2 -2
- datachain/lib/convert/python_to_sql.py +18 -4
- datachain/lib/dc/parquet.py +20 -5
- datachain/lib/dc/storage.py +12 -6
- datachain/lib/dc/storage_pattern.py +50 -99
- datachain/lib/namespaces.py +4 -5
- {datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/METADATA +12 -24
- {datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/RECORD +13 -13
- {datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/WHEEL +0 -0
- {datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED

@@ -37,7 +37,7 @@ from datachain.lib.file import (
     VideoFrame,
 )
 from datachain.lib.model_store import ModelStore
-from datachain.lib.namespaces import …
+from datachain.lib.namespaces import delete_namespace
 from datachain.lib.projects import create as create_project
 from datachain.lib.udf import Aggregator, Generator, Mapper
 from datachain.lib.utils import AbstractUDF, DataChainError
datachain/dataset.py
CHANGED

@@ -619,7 +619,7 @@ class DatasetRecord:
         if not self.versions:
             return "1.0.0"

-        major, minor, patch = semver.parse(self.latest_version)
+        major, _, _ = semver.parse(self.latest_version)
         return semver.create(major + 1, 0, 0)

     @property
@@ -630,7 +630,7 @@ class DatasetRecord:
         if not self.versions:
             return "1.0.0"

-        major, minor, patch = semver.parse(self.latest_version)
+        major, minor, _ = semver.parse(self.latest_version)
         return semver.create(major, minor + 1, 0)

     @property
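These hunks swap unused unpacking targets for `_` placeholders; the version arithmetic itself is unchanged. A minimal sketch of that arithmetic, using stand-in `parse`/`create` helpers rather than the package's own `semver` module:

```py
# Stand-in helpers (not datachain's semver module) mirroring the calls above.
def parse(version: str) -> tuple[int, int, int]:
    major, minor, patch = (int(part) for part in version.split("."))
    return major, minor, patch

def create(major: int, minor: int, patch: int) -> str:
    return f"{major}.{minor}.{patch}"

major, _, _ = parse("1.4.2")
assert create(major + 1, 0, 0) == "2.0.0"      # next major version
major, minor, _ = parse("1.4.2")
assert create(major, minor + 1, 0) == "1.5.0"  # next minor version
```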
datachain/lib/convert/python_to_sql.py
CHANGED

@@ -1,8 +1,14 @@
 import inspect
+import sys
 from datetime import datetime
 from enum import Enum
 from typing import Annotated, Literal, Union, get_args, get_origin

+if sys.version_info >= (3, 10):
+    from types import UnionType
+else:
+    UnionType = None
+
 from pydantic import BaseModel
 from typing_extensions import Literal as LiteralEx

@@ -34,6 +40,13 @@ PYTHON_TO_SQL = {
 }


+def _is_union(orig) -> bool:
+    if orig == Union:
+        return True
+    # some code is unreachab in python<3.10
+    return UnionType is not None and orig is UnionType  # type: ignore[unreachable]
+
+
 def python_to_sql(typ):  # noqa: PLR0911
     if inspect.isclass(typ):
         if issubclass(typ, SQLType):
@@ -69,9 +82,10 @@ def python_to_sql(typ):  # noqa: PLR0911
     if inspect.isclass(orig) and issubclass(dict, orig):
         return JSON

-    if orig == Union:
+    if _is_union(orig):
         if len(args) == 2 and (type(None) in args):
-            …
+            non_none_arg = args[0] if args[0] is not type(None) else args[1]
+            return python_to_sql(non_none_arg)

     if _is_union_str_literal(orig, args):
         return String
@@ -95,7 +109,7 @@ def list_of_args_to_type(args) -> SQLType:


 def _is_json_inside_union(orig, args) -> bool:
-    if orig == Union and len(args) >= 2:
+    if _is_union(orig) and len(args) >= 2:
         # List in JSON: Union[dict, list[dict]]
         args_no_nones = [arg for arg in args if arg != type(None)]  # noqa: E721
         if len(args_no_nones) == 2:
@@ -112,6 +126,6 @@ def _is_json_inside_union(orig, args) -> bool:


 def _is_union_str_literal(orig, args) -> bool:
-    if orig != Union:
+    if not _is_union(orig):
         return False
     return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)
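For context: on Python 3.10+, PEP 604 unions such as `int | None` report `types.UnionType` from `get_origin()` rather than `typing.Union`, which is the case the new `_is_union` helper covers. A small standalone sketch (not taken from the package) of that distinction:

```py
import sys
from typing import Optional, Union, get_origin

if sys.version_info >= (3, 10):
    from types import UnionType
else:
    UnionType = None

def _is_union(orig) -> bool:
    # typing.Union[...] and typing.Optional[...] report typing.Union as origin
    if orig == Union:
        return True
    # PEP 604 unions (int | None) report types.UnionType instead
    return UnionType is not None and orig is UnionType

assert _is_union(get_origin(Optional[int]))   # origin is typing.Union
if sys.version_info >= (3, 10):
    assert _is_union(get_origin(int | None))  # origin is types.UnionType
```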
datachain/lib/dc/parquet.py
CHANGED

@@ -26,8 +26,14 @@ def read_parquet(
     """Generate chain from parquet files.

     Parameters:
-        path: Storage …
-        …
+        path: Storage path(s) or URI(s). Can be a local path or start with a
+            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+            Supports glob patterns:
+                - `*` : wildcard
+                - `**` : recursive wildcard
+                - `?` : single character
+                - `{a,b}` : brace expansion list
+                - `{1..9}` : brace numeric or alphabetic range
         partitioning: Any pyarrow partitioning schema.
         output: Dictionary defining column names and their corresponding types.
         column: Created column name.
@@ -43,10 +49,19 @@ def read_parquet(
         dc.read_parquet("s3://mybucket/file.parquet")
         ```

-        …
+        All files from a directory:
         ```py
-        …
-        …
+        dc.read_parquet("s3://mybucket/dir/")
+        ```
+
+        Only parquet files from a directory, and all it's subdirectories:
+        ```py
+        dc.read_parquet("s3://mybucket/dir/**/*.parquet")
+        ```
+
+        Using filename patterns - numeric, list, starting with zeros:
+        ```py
+        dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
         ```
     """
     from .storage import read_storage
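The new brace-range example above is shorthand for an explicit list of paths. A hedged sketch of what it expands to, calling the `expand_brace_pattern` helper this release adds in `datachain/lib/dc/storage_pattern.py` (import path taken from the diff):

```py
from datachain.lib.dc.storage_pattern import expand_brace_pattern

patterns = expand_brace_pattern("202{1..4}/{yellow,green}-{01..12}.parquet")
assert len(patterns) == 4 * 2 * 12  # 96 concrete relative paths
assert patterns[0] == "2021/yellow-01.parquet"
assert patterns[-1] == "2024/green-12.parquet"
```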
datachain/lib/dc/storage.py
CHANGED

@@ -51,7 +51,8 @@ def read_storage(
             - `*` : wildcard
             - `**` : recursive wildcard
             - `?` : single character
-            - `{a,b}` : brace expansion
+            - `{a,b}` : brace expansion list
+            - `{1..9}` : brace numeric or alphabetic range
         type: read file as "binary", "text", or "image" data. Default is "binary".
         recursive: search recursively for the given path.
         column: Column name that will contain File objects. Default is "file".
@@ -88,27 +89,32 @@ def read_storage(
         Simple call from s3:
         ```python
         import datachain as dc
-        …
+        dc.read_storage("s3://my-bucket/my-dir")
         ```

         Match all .json files recursively using glob pattern
         ```py
-        …
+        dc.read_storage("gs://bucket/meta/**/*.json")
         ```

         Match image file extensions for directories with pattern
         ```py
-        …
+        dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
+        ```
+
+        By ranges in filenames:
+        ```py
+        dc.read_storage("s3://bucket/202{1..4}/**/*.{jpg,jpeg,png}")
         ```

         Multiple URIs:
         ```python
-        …
+        dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
         ```

         With AWS S3-compatible storage:
         ```python
-        …
+        dc.read_storage(
             "s3://my-bucket/my-dir",
             client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
         )
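The glob handling behind these docstring examples lives in `storage_pattern.py` (next section). Based on the examples in the docstrings removed there, a sketch of how a patterned URI is split and how recursion is decided (assumes the module path shown in the diff):

```py
from datachain.lib.dc.storage_pattern import should_use_recursion, split_uri_pattern

# A URI is split into a listable base and the trailing glob pattern
assert split_uri_pattern("s3://bucket/dir/*.mp3") == ("s3://bucket/dir", "*.mp3")
assert split_uri_pattern("s3://bucket/**/*.mp3") == ("s3://bucket", "**/*.mp3")
assert split_uri_pattern("s3://bucket/dir") == ("s3://bucket/dir", None)

# Single-level patterns avoid recursive listing; globstars require it
assert should_use_recursion("*.mp3", user_recursive=True) is False
assert should_use_recursion("**/*.mp3", user_recursive=True) is True
```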
datachain/lib/dc/storage_pattern.py
CHANGED

@@ -12,61 +12,37 @@ def validate_cloud_bucket_name(uri: str) -> None:
     """
     Validate that cloud storage bucket names don't contain glob patterns.

-    Args:
-        uri: URI to validate
-
     Raises:
         ValueError: If a cloud storage bucket name contains glob patterns
     """
     if not is_cloud_uri(uri):
         return

-    # Extract bucket name (everything between :// and first /)
     if "://" in uri:
         scheme_end = uri.index("://") + 3
         path_part = uri[scheme_end:]

-        # Get the bucket name (first segment)
         if "/" in path_part:
             bucket_name = path_part.split("/")[0]
         else:
             bucket_name = path_part

-        # Check if bucket name contains glob patterns
         glob_chars = ["*", "?", "[", "]", "{", "}"]
         if any(char in bucket_name for char in glob_chars):
             raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")


 def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
-    """
-    Split a URI into base path and glob pattern.
-
-    Args:
-        uri: URI that may contain glob patterns (*, **, ?, {})
-
-    Returns:
-        Tuple of (base_uri, pattern) where pattern is None if no glob pattern found
-
-    Examples:
-        "s3://bucket/dir/*.mp3" -> ("s3://bucket/dir", "*.mp3")
-        "s3://bucket/**/*.mp3" -> ("s3://bucket", "**/*.mp3")
-        "s3://bucket/dir" -> ("s3://bucket/dir", None)
-    """
+    """Split a URI into base path and glob pattern."""
     if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
         return uri, None

-    # Handle different URI schemes
     if "://" in uri:
-        # Split into scheme and path
         scheme_end = uri.index("://") + 3
         scheme_part = uri[:scheme_end]
         path_part = uri[scheme_end:]
-
-        # Find where the glob pattern starts
         path_segments = path_part.split("/")

-        # Find first segment with glob pattern
         pattern_start_idx = None
         for i, segment in enumerate(path_segments):
             # Check for glob patterns including brace expansion
@@ -77,9 +53,7 @@ def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
         if pattern_start_idx is None:
             return uri, None

-        # Split into base and pattern
         if pattern_start_idx == 0:
-            # Pattern at root of bucket
             base = scheme_part + path_segments[0]
             pattern = "/".join(path_segments[1:]) if len(path_segments) > 1 else "*"
         else:
@@ -87,13 +61,11 @@ def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
             pattern = "/".join(path_segments[pattern_start_idx:])

         return base, pattern
-…
+
     path_segments = uri.split("/")

-    # Find first segment with glob pattern
     pattern_start_idx = None
     for i, segment in enumerate(path_segments):
-        # Check for glob patterns including brace expansion
         if glob.has_magic(segment) or "{" in segment:
             pattern_start_idx = i
             break
@@ -101,7 +73,6 @@ def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
     if pattern_start_idx is None:
         return uri, None

-    # Split into base and pattern
     base = "/".join(path_segments[:pattern_start_idx]) if pattern_start_idx > 0 else "/"
     pattern = "/".join(path_segments[pattern_start_idx:])

@@ -109,51 +80,30 @@ def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:


 def should_use_recursion(pattern: str, user_recursive: bool) -> bool:
-    """
-    Determine if we should use recursive listing based on the pattern.
-
-    Args:
-        pattern: The glob pattern extracted from URI
-        user_recursive: User's recursive preference
-
-    Returns:
-        True if recursive listing should be used
-
-    Examples:
-        "*" -> False (single level only)
-        "*.mp3" -> False (single level only)
-        "**/*.mp3" -> True (globstar requires recursion)
-        "dir/*/file.txt" -> True (multi-level pattern)
-    """
     if not user_recursive:
-        # If user explicitly wants non-recursive, respect that
         return False

-    # If pattern contains globstar, definitely need recursion
     if "**" in pattern:
         return True

-    # If pattern contains path separators, it needs recursion
-    # Single-level patterns like "*", "*.txt", "file?" should not be recursive
     return "/" in pattern


 def expand_brace_pattern(pattern: str) -> list[str]:
     """
-    Recursively expand brace patterns…
-    …
-    …
-    …
-    …
-    …
-    Returns:
-        List of expanded patterns
+    Recursively expand brace patterns into multiple glob patterns.
+    Supports:
+    - Comma-separated lists: *.{mp3,wav}
+    - Numeric ranges: file{1..10}
+    - Zero-padded numeric ranges: file{01..10}
+    - Character ranges: file{a..z}

     Examples:
         "*.{mp3,wav}" -> ["*.mp3", "*.wav"]
+        "file{1..3}" -> ["file1", "file2", "file3"]
+        "file{01..03}" -> ["file01", "file02", "file03"]
+        "file{a..c}" -> ["filea", "fileb", "filec"]
         "{a,b}/{c,d}" -> ["a/c", "a/d", "b/c", "b/d"]
-        "*.txt" -> ["*.txt"]
-        "{{a,b}}" -> ["{a}", "{b}"]  # Handle double braces
     """
     if "{" not in pattern or "}" not in pattern:
         return [pattern]
@@ -162,11 +112,9 @@ def expand_brace_pattern(pattern: str) -> list[str]:


 def _expand_single_braces(pattern: str) -> list[str]:
-    """Helper to expand single-level braces."""
     if "{" not in pattern or "}" not in pattern:
         return [pattern]

-    # Find the first complete brace pattern
     start = pattern.index("{")
     end = start
     depth = 0
@@ -184,46 +132,66 @@ def _expand_single_braces(pattern: str) -> list[str]:

     prefix = pattern[:start]
     suffix = pattern[end + 1 :]
-    …
+    brace_content = pattern[start + 1 : end]
+
+    if ".." in brace_content:
+        options = _expand_range(brace_content)
+    else:
+        options = [opt.strip() for opt in brace_content.split(",")]

-    # Generate all combinations and recursively expand
     expanded = []
     for option in options:
-        combined = prefix + option
-        # Recursively expand any remaining braces
+        combined = prefix + option + suffix
         expanded.extend(_expand_single_braces(combined))

     return expanded


-def convert_globstar_to_glob(filter_pattern: str) -> str:
-    """
+def _expand_range(range_spec: str) -> list[str]:  # noqa: PLR0911
+    if ".." not in range_spec:
+        return [range_spec]

-    …
-    …
+    parts = range_spec.split("..")
+    if len(parts) != 2:
+        return [range_spec]

-    …
-        filter_pattern: Pattern that may contain globstars (**)
+    start, end = parts[0], parts[1]

-    …
-    …
-    …
+    if start.isdigit() and end.isdigit():
+        pad_width = max(len(start), len(end)) if start[0] == "0" or end[0] == "0" else 0
+        start_num = int(start)
+        end_num = int(end)
+
+        if start_num <= end_num:
+            if pad_width > 0:
+                return [str(i).zfill(pad_width) for i in range(start_num, end_num + 1)]
+            return [str(i) for i in range(start_num, end_num + 1)]
+        if pad_width > 0:
+            return [str(i).zfill(pad_width) for i in range(start_num, end_num - 1, -1)]
+        return [str(i) for i in range(start_num, end_num - 1, -1)]
+
+    if len(start) == 1 and len(end) == 1 and start.isalpha() and end.isalpha():
+        start_ord = ord(start)
+        end_ord = ord(end)
+
+        if start_ord <= end_ord:
+            return [chr(i) for i in range(start_ord, end_ord + 1)]
+        return [chr(i) for i in range(start_ord, end_ord - 1, -1)]
+
+    return [range_spec]
+
+
+def convert_globstar_to_glob(filter_pattern: str) -> str:
     if "**" not in filter_pattern:
         return filter_pattern

     parts = filter_pattern.split("/")
     globstar_positions = [i for i, p in enumerate(parts) if p == "**"]

-    # Handle different cases based on number of globstars
     num_globstars = len(globstar_positions)

     if num_globstars <= 1:
-        # Special case: pattern like **/* means zero or more directories
-        # This is tricky because GLOB can't express "zero or more"
-        # We need different handling based on the pattern structure
-
         if filter_pattern == "**/*":
-            # Match everything
             return "*"
         if filter_pattern.startswith("**/"):
             remaining = filter_pattern[3:]
@@ -236,20 +204,11 @@ def convert_globstar_to_glob(filter_pattern: str) -> str:
             # that works with recursive listing
             # Special handling: if it's a simple extension pattern, match broadly
             if remaining.startswith("*."):
-                # Pattern like **/*.ext - match any file with this extension
-                # This matches *.ext at current level and deeper with recursion:
                 return remaining
-            # Pattern like **/temp?.* - match as filename in subdirs
             return f"*/{remaining}"

-        # Default: Zero or one globstar - simple replacement
         return filter_pattern.replace("**", "*")

-    # Multiple globstars - need more careful handling
-    # For patterns like **/level?/backup/**/*.ext
-    # We want to match any path containing /level?/backup/ and ending with .ext
-
-    # Find middle directories (between first and last **)
     middle_parts = []
     start_idx = globstar_positions[0] + 1
     end_idx = globstar_positions[-1]
@@ -258,17 +217,12 @@ def convert_globstar_to_glob(filter_pattern: str) -> str:
             middle_parts.append(parts[i])

     if not middle_parts:
-        # No fixed middle parts, just use wildcards
         result = filter_pattern.replace("**", "*")
     else:
-        # Create pattern that matches the middle parts
         middle_pattern = "/".join(middle_parts)
-        # Get the file pattern at the end if any
         last_part = parts[-1] if parts[-1] != "**" else "*"

-        # Match any path containing this pattern
         if last_part != "*":
-            # Has specific file pattern
            result = f"*{middle_pattern}*{last_part}"
         else:
             result = f"*{middle_pattern}*"
@@ -287,14 +241,11 @@ def apply_glob_filter(

     chain = ls(dc, list_path, recursive=use_recursive, column=column)

-    # If pattern doesn't contain path separator and list_path is not empty,
-    # prepend the list_path to make the pattern match correctly
     if list_path and "/" not in pattern:
         filter_pattern = f"{list_path.rstrip('/')}/{pattern}"
     else:
         filter_pattern = pattern

-    # Convert globstar patterns to GLOB-compatible patterns
     glob_pattern = convert_globstar_to_glob(filter_pattern)

     return chain.filter(Column(f"{column}.path").glob(glob_pattern))
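Putting the new range support together, the expansion rules documented above can be checked directly (a sketch; `_expand_range` is a private helper and is imported here only for illustration):

```py
from datachain.lib.dc.storage_pattern import _expand_range, expand_brace_pattern

# Docstring examples
assert expand_brace_pattern("*.{mp3,wav}") == ["*.mp3", "*.wav"]
assert expand_brace_pattern("file{01..03}") == ["file01", "file02", "file03"]
assert expand_brace_pattern("{a,b}/{c,d}") == ["a/c", "a/d", "b/c", "b/d"]

# Edge cases visible in the code: descending ranges and malformed specs
assert _expand_range("3..1") == ["3", "2", "1"]
assert _expand_range("c..a") == ["c", "b", "a"]
assert _expand_range("1..2..3") == ["1..2..3"]  # not a simple range; kept literal
```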
datachain/lib/namespaces.py
CHANGED

@@ -77,7 +77,7 @@ def ls(session: Optional[Session] = None) -> list[Namespace]:
     return Session.get(session).catalog.metastore.list_namespaces()


-def delete(name: str, session: Optional[Session]) -> None:
+def delete_namespace(name: str, session: Optional[Session] = None) -> None:
     """
     Removes a namespace by name.

@@ -88,14 +88,13 @@ def delete(name: str, session: Optional[Session]) -> None:
     as these cannot be removed.

     Parameters:
-        name…
-        session…
+        name: The name of the namespace.
+        session: Session to use for getting project.

     Example:
         ```py
         import datachain as dc
-        …
-        delete_namespace("dev")
+        dc.delete_namespace("dev")
         ```
     """
     session = Session.get(session)
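After the rename, the function is reachable from the top-level package (see the `__init__.py` hunk above), and `session` now defaults to `None`. Minimal usage sketch:

```py
import datachain as dc

# Uses the default session when none is passed, per the new signature
dc.delete_namespace("dev")
```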
{datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.32.0
+Version: 0.32.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -102,7 +102,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.…
+Requires-Dist: mypy==1.18.1; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
@@ -210,45 +210,33 @@ datasets that evolve over time and may occasionally have processing errors.
 .. code:: py

     import datachain as dc
-    from datachain import C, File

-    def process_file(file: File):
-        """…
+    def process_file(file: dc.File) -> tuple[str, str, str]:
+        """Analyze a file, may occasionally fail."""
         try:
             # Your processing logic here
             content = file.read_text()
-            result = …
-            return {
-                "content": content,
-                "result": result,
-                "error": None  # No error
-            }
+            result = content.upper()
+            return content, result, ""  # No error
         except Exception as e:
             # Return an error that will trigger reprocessing next time
-            return {
-                "content": None,
-                "result": None,
-                "error": str(e)  # Error field will trigger retry
-            }
+            return "", "", str(e)  # Error field will trigger retry

     # Process files efficiently with delta and retry
+    # Run it many times, keep adding files, to see delta and retry in action
     chain = (
         dc.read_storage(
             "data/",
             update=True,
             delta=True,  # Process only new/changed files
             delta_on="file.path",  # Identify files by path
-            …
+            delta_retry="error",  # Process files with error again
         )
-        .map(…
-        .…
-            content=C("processed_result.content"),
-            result=C("processed_result.result"),
-            error=C("processed_result.error")
-        )
-        .save(name="processed_data")
+        .map(process_file, output=("content", "result", "error"))
+        .save("processed-data")
     )

+
 Example: LLM based text-file evaluation
 ---------------------------------------

{datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/RECORD
CHANGED

@@ -1,9 +1,9 @@
-datachain/__init__.py,sha256=…
+datachain/__init__.py,sha256=BRqfLPoBRRycnndaxyba-i4ZrZCJl0As2pwV9RiNBr8,1822
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=…
+datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
 datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
 datachain/error.py,sha256=comKx1JCdjsBpxabrOWaiRP0aHBspBDZl1mkKFnBSq0,1739
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
@@ -82,7 +82,7 @@ datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
 datachain/lib/model_store.py,sha256=A0pSVQ7uaZ9RvANapzirF8Cqq9N6ysosPpMSkzdRPkU,3226
-datachain/lib/namespaces.py,sha256=…
+datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3775
 datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
 datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
@@ -97,7 +97,7 @@ datachain/lib/webdataset.py,sha256=CkW8FfGigNx6wo2EEK4KMjhEE8FamRHWGs2HZuH7jDY,7
 datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=_5rjGFnN6t1KCX5ftL5rG7tiiNat7j0SdNqajO15KUY,1539
-datachain/lib/convert/python_to_sql.py,sha256=…
+datachain/lib/convert/python_to_sql.py,sha256=ACIHtiPujlG9DRChSlxifcMJCls1PvrB273w_cgR6OQ,3584
 datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8pyeN1MxRrU,670
 datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
 datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
@@ -110,10 +110,10 @@ datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
 datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
-datachain/lib/dc/parquet.py,sha256=…
+datachain/lib/dc/parquet.py,sha256=ASTrT1UhIbss8jcI5171mrlDQZ_sEFDcIA3qBxuPhZQ,2405
 datachain/lib/dc/records.py,sha256=l7TKSKjT6boXGd05KA5vvax-Y-mLMOo46VWrlxPhmdQ,3067
-datachain/lib/dc/storage.py,sha256=…
-datachain/lib/dc/storage_pattern.py,sha256=…
+datachain/lib/dc/storage.py,sha256=5GybJi5zftorrNzSk6HZw-rAda-KU7KEU29putjVRVc,9842
+datachain/lib/dc/storage_pattern.py,sha256=FAEsXRl9QAWz-x1wgrJEC8Ehh049GgoeC_HW3Vlwx-c,7658
 datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -161,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.32.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.32.0.dist-info/METADATA,sha256=…
-datachain-0.32.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.32.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.32.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.32.0.dist-info/RECORD,,
+datachain-0.32.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.32.2.dist-info/METADATA,sha256=S_VstbtQSTQyz-Ac0c1X_giCWOOFU9zJ4CWbvkw7E_o,13607
+datachain-0.32.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.32.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.32.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.32.2.dist-info/RECORD,,
{datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/WHEEL
File without changes

{datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/entry_points.txt
File without changes

{datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/licenses/LICENSE
File without changes

{datachain-0.32.0.dist-info → datachain-0.32.2.dist-info}/top_level.txt
File without changes