datachain 0.32.1__py3-none-any.whl → 0.32.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/data_storage/metastore.py +16 -24
- datachain/dataset.py +2 -2
- datachain/lib/dc/parquet.py +20 -5
- datachain/lib/dc/storage.py +12 -6
- datachain/lib/dc/storage_pattern.py +50 -99
- datachain/lib/file.py +14 -6
- datachain/lib/namespaces.py +1 -1
- {datachain-0.32.1.dist-info → datachain-0.32.3.dist-info}/METADATA +2 -2
- {datachain-0.32.1.dist-info → datachain-0.32.3.dist-info}/RECORD +13 -13
- {datachain-0.32.1.dist-info → datachain-0.32.3.dist-info}/WHEEL +0 -0
- {datachain-0.32.1.dist-info → datachain-0.32.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.32.1.dist-info → datachain-0.32.3.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.32.1.dist-info → datachain-0.32.3.dist-info}/top_level.txt +0 -0
|
@@ -689,9 +689,6 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
689
689
|
return self._projects.select()
|
|
690
690
|
return select(*columns)
|
|
691
691
|
|
|
692
|
-
def _projects_update(self) -> "Update":
|
|
693
|
-
return self._projects.update()
|
|
694
|
-
|
|
695
692
|
def _projects_delete(self) -> "Delete":
|
|
696
693
|
return self._projects.delete()
|
|
697
694
|
|
|
@@ -839,6 +836,16 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
839
836
|
|
|
840
837
|
return self.get_project(name, namespace.name)
|
|
841
838
|
|
|
839
|
+
def _projects_base_query(self) -> "Select":
|
|
840
|
+
n = self._namespaces
|
|
841
|
+
p = self._projects
|
|
842
|
+
|
|
843
|
+
query = self._projects_select(
|
|
844
|
+
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
845
|
+
*(getattr(p.c, f) for f in self._projects_fields),
|
|
846
|
+
)
|
|
847
|
+
return query.select_from(n.join(p, n.c.id == p.c.namespace_id))
|
|
848
|
+
|
|
842
849
|
def get_project(
|
|
843
850
|
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
844
851
|
) -> Project:
|
|
@@ -854,11 +861,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
854
861
|
create = True
|
|
855
862
|
validate = False
|
|
856
863
|
|
|
857
|
-
query = self._projects_select(
|
|
858
|
-
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
859
|
-
*(getattr(p.c, f) for f in self._projects_fields),
|
|
860
|
-
)
|
|
861
|
-
query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
|
|
864
|
+
query = self._projects_base_query().where(
|
|
862
865
|
p.c.name == name, n.c.name == namespace_name
|
|
863
866
|
)
|
|
864
867
|
|
|
@@ -873,16 +876,9 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
873
876
|
|
|
874
877
|
def get_project_by_id(self, project_id: int, conn=None) -> Project:
|
|
875
878
|
"""Gets a single project by id"""
|
|
876
|
-
n = self._namespaces
|
|
877
879
|
p = self._projects
|
|
878
880
|
|
|
879
|
-
query = self._projects_select(
|
|
880
|
-
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
881
|
-
*(getattr(p.c, f) for f in self._projects_fields),
|
|
882
|
-
)
|
|
883
|
-
query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
|
|
884
|
-
p.c.id == project_id
|
|
885
|
-
)
|
|
881
|
+
query = self._projects_base_query().where(p.c.id == project_id)
|
|
886
882
|
|
|
887
883
|
rows = list(self.db.execute(query, conn=conn))
|
|
888
884
|
if not rows:
|
|
@@ -891,7 +887,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
891
887
|
|
|
892
888
|
def count_projects(self, namespace_id: Optional[int] = None) -> int:
|
|
893
889
|
p = self._projects
|
|
894
|
-
|
|
890
|
+
|
|
891
|
+
query = self._projects_base_query()
|
|
895
892
|
if namespace_id:
|
|
896
893
|
query = query.where(p.c.namespace_id == namespace_id)
|
|
897
894
|
|
|
@@ -917,17 +914,12 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
917
914
|
"""
|
|
918
915
|
Gets a list of projects inside some namespace, or in all namespaces
|
|
919
916
|
"""
|
|
920
|
-
n = self._namespaces
|
|
921
917
|
p = self._projects
|
|
922
918
|
|
|
923
|
-
query = self._projects_select(
|
|
924
|
-
*(getattr(n.c, f) for f in self._namespaces_fields),
|
|
925
|
-
*(getattr(p.c, f) for f in self._projects_fields),
|
|
926
|
-
)
|
|
927
|
-
query = query.select_from(n.join(p, n.c.id == p.c.namespace_id))
|
|
919
|
+
query = self._projects_base_query()
|
|
928
920
|
|
|
929
921
|
if namespace_id:
|
|
930
|
-
query = query.where(
|
|
922
|
+
query = query.where(p.c.namespace_id == namespace_id)
|
|
931
923
|
|
|
932
924
|
rows = list(self.db.execute(query, conn=conn))
|
|
933
925
|
|
datachain/dataset.py
CHANGED
|
@@ -619,7 +619,7 @@ class DatasetRecord:
|
|
|
619
619
|
if not self.versions:
|
|
620
620
|
return "1.0.0"
|
|
621
621
|
|
|
622
|
-
major,
|
|
622
|
+
major, _, _ = semver.parse(self.latest_version)
|
|
623
623
|
return semver.create(major + 1, 0, 0)
|
|
624
624
|
|
|
625
625
|
@property
|
|
@@ -630,7 +630,7 @@ class DatasetRecord:
|
|
|
630
630
|
if not self.versions:
|
|
631
631
|
return "1.0.0"
|
|
632
632
|
|
|
633
|
-
major, minor,
|
|
633
|
+
major, minor, _ = semver.parse(self.latest_version)
|
|
634
634
|
return semver.create(major, minor + 1, 0)
|
|
635
635
|
|
|
636
636
|
@property
|
datachain/lib/dc/parquet.py
CHANGED
|
@@ -26,8 +26,14 @@ def read_parquet(
|
|
|
26
26
|
"""Generate chain from parquet files.
|
|
27
27
|
|
|
28
28
|
Parameters:
|
|
29
|
-
path: Storage
|
|
30
|
-
|
|
29
|
+
path: Storage path(s) or URI(s). Can be a local path or start with a
|
|
30
|
+
storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
|
|
31
|
+
Supports glob patterns:
|
|
32
|
+
- `*` : wildcard
|
|
33
|
+
- `**` : recursive wildcard
|
|
34
|
+
- `?` : single character
|
|
35
|
+
- `{a,b}` : brace expansion list
|
|
36
|
+
- `{1..9}` : brace numeric or alphabetic range
|
|
31
37
|
partitioning: Any pyarrow partitioning schema.
|
|
32
38
|
output: Dictionary defining column names and their corresponding types.
|
|
33
39
|
column: Created column name.
|
|
@@ -43,10 +49,19 @@ def read_parquet(
|
|
|
43
49
|
dc.read_parquet("s3://mybucket/file.parquet")
|
|
44
50
|
```
|
|
45
51
|
|
|
46
|
-
|
|
52
|
+
All files from a directory:
|
|
47
53
|
```py
|
|
48
|
-
|
|
49
|
-
|
|
54
|
+
dc.read_parquet("s3://mybucket/dir/")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Only parquet files from a directory, and all its subdirectories:
|
|
58
|
+
```py
|
|
59
|
+
dc.read_parquet("s3://mybucket/dir/**/*.parquet")
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Using filename patterns - numeric, list, starting with zeros:
|
|
63
|
+
```py
|
|
64
|
+
dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
|
|
50
65
|
```
|
|
51
66
|
"""
|
|
52
67
|
from .storage import read_storage
|
datachain/lib/dc/storage.py
CHANGED
|
@@ -51,7 +51,8 @@ def read_storage(
|
|
|
51
51
|
- `*` : wildcard
|
|
52
52
|
- `**` : recursive wildcard
|
|
53
53
|
- `?` : single character
|
|
54
|
-
- `{a,b}` : brace expansion
|
|
54
|
+
- `{a,b}` : brace expansion list
|
|
55
|
+
- `{1..9}` : brace numeric or alphabetic range
|
|
55
56
|
type: read file as "binary", "text", or "image" data. Default is "binary".
|
|
56
57
|
recursive: search recursively for the given path.
|
|
57
58
|
column: Column name that will contain File objects. Default is "file".
|
|
@@ -88,27 +89,32 @@ def read_storage(
|
|
|
88
89
|
Simple call from s3:
|
|
89
90
|
```python
|
|
90
91
|
import datachain as dc
|
|
91
|
-
|
|
92
|
+
dc.read_storage("s3://my-bucket/my-dir")
|
|
92
93
|
```
|
|
93
94
|
|
|
94
95
|
Match all .json files recursively using glob pattern
|
|
95
96
|
```py
|
|
96
|
-
|
|
97
|
+
dc.read_storage("gs://bucket/meta/**/*.json")
|
|
97
98
|
```
|
|
98
99
|
|
|
99
100
|
Match image file extensions for directories with pattern
|
|
100
101
|
```py
|
|
101
|
-
|
|
102
|
+
dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
By ranges in filenames:
|
|
106
|
+
```py
|
|
107
|
+
dc.read_storage("s3://bucket/202{1..4}/**/*.{jpg,jpeg,png}")
|
|
102
108
|
```
|
|
103
109
|
|
|
104
110
|
Multiple URIs:
|
|
105
111
|
```python
|
|
106
|
-
|
|
112
|
+
dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
|
|
107
113
|
```
|
|
108
114
|
|
|
109
115
|
With AWS S3-compatible storage:
|
|
110
116
|
```python
|
|
111
|
-
|
|
117
|
+
dc.read_storage(
|
|
112
118
|
"s3://my-bucket/my-dir",
|
|
113
119
|
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
114
120
|
)
|
|
@@ -12,61 +12,37 @@ def validate_cloud_bucket_name(uri: str) -> None:
|
|
|
12
12
|
"""
|
|
13
13
|
Validate that cloud storage bucket names don't contain glob patterns.
|
|
14
14
|
|
|
15
|
-
Args:
|
|
16
|
-
uri: URI to validate
|
|
17
|
-
|
|
18
15
|
Raises:
|
|
19
16
|
ValueError: If a cloud storage bucket name contains glob patterns
|
|
20
17
|
"""
|
|
21
18
|
if not is_cloud_uri(uri):
|
|
22
19
|
return
|
|
23
20
|
|
|
24
|
-
# Extract bucket name (everything between :// and first /)
|
|
25
21
|
if "://" in uri:
|
|
26
22
|
scheme_end = uri.index("://") + 3
|
|
27
23
|
path_part = uri[scheme_end:]
|
|
28
24
|
|
|
29
|
-
# Get the bucket name (first segment)
|
|
30
25
|
if "/" in path_part:
|
|
31
26
|
bucket_name = path_part.split("/")[0]
|
|
32
27
|
else:
|
|
33
28
|
bucket_name = path_part
|
|
34
29
|
|
|
35
|
-
# Check if bucket name contains glob patterns
|
|
36
30
|
glob_chars = ["*", "?", "[", "]", "{", "}"]
|
|
37
31
|
if any(char in bucket_name for char in glob_chars):
|
|
38
32
|
raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
|
|
39
33
|
|
|
40
34
|
|
|
41
35
|
def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
|
|
42
|
-
"""
|
|
43
|
-
Split a URI into base path and glob pattern.
|
|
44
|
-
|
|
45
|
-
Args:
|
|
46
|
-
uri: URI that may contain glob patterns (*, **, ?, {})
|
|
47
|
-
|
|
48
|
-
Returns:
|
|
49
|
-
Tuple of (base_uri, pattern) where pattern is None if no glob pattern found
|
|
50
|
-
|
|
51
|
-
Examples:
|
|
52
|
-
"s3://bucket/dir/*.mp3" -> ("s3://bucket/dir", "*.mp3")
|
|
53
|
-
"s3://bucket/**/*.mp3" -> ("s3://bucket", "**/*.mp3")
|
|
54
|
-
"s3://bucket/dir" -> ("s3://bucket/dir", None)
|
|
55
|
-
"""
|
|
36
|
+
"""Split a URI into base path and glob pattern."""
|
|
56
37
|
if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
|
|
57
38
|
return uri, None
|
|
58
39
|
|
|
59
|
-
# Handle different URI schemes
|
|
60
40
|
if "://" in uri:
|
|
61
|
-
# Split into scheme and path
|
|
62
41
|
scheme_end = uri.index("://") + 3
|
|
63
42
|
scheme_part = uri[:scheme_end]
|
|
64
43
|
path_part = uri[scheme_end:]
|
|
65
|
-
|
|
66
|
-
# Find where the glob pattern starts
|
|
67
44
|
path_segments = path_part.split("/")
|
|
68
45
|
|
|
69
|
-
# Find first segment with glob pattern
|
|
70
46
|
pattern_start_idx = None
|
|
71
47
|
for i, segment in enumerate(path_segments):
|
|
72
48
|
# Check for glob patterns including brace expansion
|
|
@@ -77,9 +53,7 @@ def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
|
|
|
77
53
|
if pattern_start_idx is None:
|
|
78
54
|
return uri, None
|
|
79
55
|
|
|
80
|
-
# Split into base and pattern
|
|
81
56
|
if pattern_start_idx == 0:
|
|
82
|
-
# Pattern at root of bucket
|
|
83
57
|
base = scheme_part + path_segments[0]
|
|
84
58
|
pattern = "/".join(path_segments[1:]) if len(path_segments) > 1 else "*"
|
|
85
59
|
else:
|
|
@@ -87,13 +61,11 @@ def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
|
|
|
87
61
|
pattern = "/".join(path_segments[pattern_start_idx:])
|
|
88
62
|
|
|
89
63
|
return base, pattern
|
|
90
|
-
|
|
64
|
+
|
|
91
65
|
path_segments = uri.split("/")
|
|
92
66
|
|
|
93
|
-
# Find first segment with glob pattern
|
|
94
67
|
pattern_start_idx = None
|
|
95
68
|
for i, segment in enumerate(path_segments):
|
|
96
|
-
# Check for glob patterns including brace expansion
|
|
97
69
|
if glob.has_magic(segment) or "{" in segment:
|
|
98
70
|
pattern_start_idx = i
|
|
99
71
|
break
|
|
@@ -101,7 +73,6 @@ def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
|
|
|
101
73
|
if pattern_start_idx is None:
|
|
102
74
|
return uri, None
|
|
103
75
|
|
|
104
|
-
# Split into base and pattern
|
|
105
76
|
base = "/".join(path_segments[:pattern_start_idx]) if pattern_start_idx > 0 else "/"
|
|
106
77
|
pattern = "/".join(path_segments[pattern_start_idx:])
|
|
107
78
|
|
|
@@ -109,51 +80,30 @@ def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
|
|
|
109
80
|
|
|
110
81
|
|
|
111
82
|
def should_use_recursion(pattern: str, user_recursive: bool) -> bool:
|
|
112
|
-
"""
|
|
113
|
-
Determine if we should use recursive listing based on the pattern.
|
|
114
|
-
|
|
115
|
-
Args:
|
|
116
|
-
pattern: The glob pattern extracted from URI
|
|
117
|
-
user_recursive: User's recursive preference
|
|
118
|
-
|
|
119
|
-
Returns:
|
|
120
|
-
True if recursive listing should be used
|
|
121
|
-
|
|
122
|
-
Examples:
|
|
123
|
-
"*" -> False (single level only)
|
|
124
|
-
"*.mp3" -> False (single level only)
|
|
125
|
-
"**/*.mp3" -> True (globstar requires recursion)
|
|
126
|
-
"dir/*/file.txt" -> True (multi-level pattern)
|
|
127
|
-
"""
|
|
128
83
|
if not user_recursive:
|
|
129
|
-
# If user explicitly wants non-recursive, respect that
|
|
130
84
|
return False
|
|
131
85
|
|
|
132
|
-
# If pattern contains globstar, definitely need recursion
|
|
133
86
|
if "**" in pattern:
|
|
134
87
|
return True
|
|
135
88
|
|
|
136
|
-
# If pattern contains path separators, it needs recursion
|
|
137
|
-
# Single-level patterns like "*", "*.txt", "file?" should not be recursive
|
|
138
89
|
return "/" in pattern
|
|
139
90
|
|
|
140
91
|
|
|
141
92
|
def expand_brace_pattern(pattern: str) -> list[str]:
|
|
142
93
|
"""
|
|
143
|
-
Recursively expand brace patterns
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
Returns:
|
|
150
|
-
List of expanded patterns
|
|
94
|
+
Recursively expand brace patterns into multiple glob patterns.
|
|
95
|
+
Supports:
|
|
96
|
+
- Comma-separated lists: *.{mp3,wav}
|
|
97
|
+
- Numeric ranges: file{1..10}
|
|
98
|
+
- Zero-padded numeric ranges: file{01..10}
|
|
99
|
+
- Character ranges: file{a..z}
|
|
151
100
|
|
|
152
101
|
Examples:
|
|
153
102
|
"*.{mp3,wav}" -> ["*.mp3", "*.wav"]
|
|
103
|
+
"file{1..3}" -> ["file1", "file2", "file3"]
|
|
104
|
+
"file{01..03}" -> ["file01", "file02", "file03"]
|
|
105
|
+
"file{a..c}" -> ["filea", "fileb", "filec"]
|
|
154
106
|
"{a,b}/{c,d}" -> ["a/c", "a/d", "b/c", "b/d"]
|
|
155
|
-
"*.txt" -> ["*.txt"]
|
|
156
|
-
"{{a,b}}" -> ["{a}", "{b}"] # Handle double braces
|
|
157
107
|
"""
|
|
158
108
|
if "{" not in pattern or "}" not in pattern:
|
|
159
109
|
return [pattern]
|
|
@@ -162,11 +112,9 @@ def expand_brace_pattern(pattern: str) -> list[str]:
|
|
|
162
112
|
|
|
163
113
|
|
|
164
114
|
def _expand_single_braces(pattern: str) -> list[str]:
|
|
165
|
-
"""Helper to expand single-level braces."""
|
|
166
115
|
if "{" not in pattern or "}" not in pattern:
|
|
167
116
|
return [pattern]
|
|
168
117
|
|
|
169
|
-
# Find the first complete brace pattern
|
|
170
118
|
start = pattern.index("{")
|
|
171
119
|
end = start
|
|
172
120
|
depth = 0
|
|
@@ -184,46 +132,66 @@ def _expand_single_braces(pattern: str) -> list[str]:
|
|
|
184
132
|
|
|
185
133
|
prefix = pattern[:start]
|
|
186
134
|
suffix = pattern[end + 1 :]
|
|
187
|
-
|
|
135
|
+
brace_content = pattern[start + 1 : end]
|
|
136
|
+
|
|
137
|
+
if ".." in brace_content:
|
|
138
|
+
options = _expand_range(brace_content)
|
|
139
|
+
else:
|
|
140
|
+
options = [opt.strip() for opt in brace_content.split(",")]
|
|
188
141
|
|
|
189
|
-
# Generate all combinations and recursively expand
|
|
190
142
|
expanded = []
|
|
191
143
|
for option in options:
|
|
192
|
-
combined = prefix + option
|
|
193
|
-
# Recursively expand any remaining braces
|
|
144
|
+
combined = prefix + option + suffix
|
|
194
145
|
expanded.extend(_expand_single_braces(combined))
|
|
195
146
|
|
|
196
147
|
return expanded
|
|
197
148
|
|
|
198
149
|
|
|
199
|
-
def
|
|
200
|
-
""
|
|
150
|
+
def _expand_range(range_spec: str) -> list[str]: # noqa: PLR0911
|
|
151
|
+
if ".." not in range_spec:
|
|
152
|
+
return [range_spec]
|
|
201
153
|
|
|
202
|
-
|
|
203
|
-
|
|
154
|
+
parts = range_spec.split("..")
|
|
155
|
+
if len(parts) != 2:
|
|
156
|
+
return [range_spec]
|
|
204
157
|
|
|
205
|
-
|
|
206
|
-
filter_pattern: Pattern that may contain globstars (**)
|
|
158
|
+
start, end = parts[0], parts[1]
|
|
207
159
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
160
|
+
if start.isdigit() and end.isdigit():
|
|
161
|
+
pad_width = max(len(start), len(end)) if start[0] == "0" or end[0] == "0" else 0
|
|
162
|
+
start_num = int(start)
|
|
163
|
+
end_num = int(end)
|
|
164
|
+
|
|
165
|
+
if start_num <= end_num:
|
|
166
|
+
if pad_width > 0:
|
|
167
|
+
return [str(i).zfill(pad_width) for i in range(start_num, end_num + 1)]
|
|
168
|
+
return [str(i) for i in range(start_num, end_num + 1)]
|
|
169
|
+
if pad_width > 0:
|
|
170
|
+
return [str(i).zfill(pad_width) for i in range(start_num, end_num - 1, -1)]
|
|
171
|
+
return [str(i) for i in range(start_num, end_num - 1, -1)]
|
|
172
|
+
|
|
173
|
+
if len(start) == 1 and len(end) == 1 and start.isalpha() and end.isalpha():
|
|
174
|
+
start_ord = ord(start)
|
|
175
|
+
end_ord = ord(end)
|
|
176
|
+
|
|
177
|
+
if start_ord <= end_ord:
|
|
178
|
+
return [chr(i) for i in range(start_ord, end_ord + 1)]
|
|
179
|
+
return [chr(i) for i in range(start_ord, end_ord - 1, -1)]
|
|
180
|
+
|
|
181
|
+
return [range_spec]
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def convert_globstar_to_glob(filter_pattern: str) -> str:
|
|
211
185
|
if "**" not in filter_pattern:
|
|
212
186
|
return filter_pattern
|
|
213
187
|
|
|
214
188
|
parts = filter_pattern.split("/")
|
|
215
189
|
globstar_positions = [i for i, p in enumerate(parts) if p == "**"]
|
|
216
190
|
|
|
217
|
-
# Handle different cases based on number of globstars
|
|
218
191
|
num_globstars = len(globstar_positions)
|
|
219
192
|
|
|
220
193
|
if num_globstars <= 1:
|
|
221
|
-
# Special case: pattern like **/* means zero or more directories
|
|
222
|
-
# This is tricky because GLOB can't express "zero or more"
|
|
223
|
-
# We need different handling based on the pattern structure
|
|
224
|
-
|
|
225
194
|
if filter_pattern == "**/*":
|
|
226
|
-
# Match everything
|
|
227
195
|
return "*"
|
|
228
196
|
if filter_pattern.startswith("**/"):
|
|
229
197
|
remaining = filter_pattern[3:]
|
|
@@ -236,20 +204,11 @@ def convert_globstar_to_glob(filter_pattern: str) -> str:
|
|
|
236
204
|
# that works with recursive listing
|
|
237
205
|
# Special handling: if it's a simple extension pattern, match broadly
|
|
238
206
|
if remaining.startswith("*."):
|
|
239
|
-
# Pattern like **/*.ext - match any file with this extension
|
|
240
|
-
# This matches *.ext at current level and deeper with recursion:
|
|
241
207
|
return remaining
|
|
242
|
-
# Pattern like **/temp?.* - match as filename in subdirs
|
|
243
208
|
return f"*/{remaining}"
|
|
244
209
|
|
|
245
|
-
# Default: Zero or one globstar - simple replacement
|
|
246
210
|
return filter_pattern.replace("**", "*")
|
|
247
211
|
|
|
248
|
-
# Multiple globstars - need more careful handling
|
|
249
|
-
# For patterns like **/level?/backup/**/*.ext
|
|
250
|
-
# We want to match any path containing /level?/backup/ and ending with .ext
|
|
251
|
-
|
|
252
|
-
# Find middle directories (between first and last **)
|
|
253
212
|
middle_parts = []
|
|
254
213
|
start_idx = globstar_positions[0] + 1
|
|
255
214
|
end_idx = globstar_positions[-1]
|
|
@@ -258,17 +217,12 @@ def convert_globstar_to_glob(filter_pattern: str) -> str:
|
|
|
258
217
|
middle_parts.append(parts[i])
|
|
259
218
|
|
|
260
219
|
if not middle_parts:
|
|
261
|
-
# No fixed middle parts, just use wildcards
|
|
262
220
|
result = filter_pattern.replace("**", "*")
|
|
263
221
|
else:
|
|
264
|
-
# Create pattern that matches the middle parts
|
|
265
222
|
middle_pattern = "/".join(middle_parts)
|
|
266
|
-
# Get the file pattern at the end if any
|
|
267
223
|
last_part = parts[-1] if parts[-1] != "**" else "*"
|
|
268
224
|
|
|
269
|
-
# Match any path containing this pattern
|
|
270
225
|
if last_part != "*":
|
|
271
|
-
# Has specific file pattern
|
|
272
226
|
result = f"*{middle_pattern}*{last_part}"
|
|
273
227
|
else:
|
|
274
228
|
result = f"*{middle_pattern}*"
|
|
@@ -287,14 +241,11 @@ def apply_glob_filter(
|
|
|
287
241
|
|
|
288
242
|
chain = ls(dc, list_path, recursive=use_recursive, column=column)
|
|
289
243
|
|
|
290
|
-
# If pattern doesn't contain path separator and list_path is not empty,
|
|
291
|
-
# prepend the list_path to make the pattern match correctly
|
|
292
244
|
if list_path and "/" not in pattern:
|
|
293
245
|
filter_pattern = f"{list_path.rstrip('/')}/{pattern}"
|
|
294
246
|
else:
|
|
295
247
|
filter_pattern = pattern
|
|
296
248
|
|
|
297
|
-
# Convert globstar patterns to GLOB-compatible patterns
|
|
298
249
|
glob_pattern = convert_globstar_to_glob(filter_pattern)
|
|
299
250
|
|
|
300
251
|
return chain.filter(Column(f"{column}.path").glob(glob_pattern))
|
datachain/lib/file.py
CHANGED
|
@@ -332,7 +332,10 @@ class File(DataModel):
|
|
|
332
332
|
|
|
333
333
|
@classmethod
|
|
334
334
|
def upload(
|
|
335
|
-
cls,
|
|
335
|
+
cls,
|
|
336
|
+
data: bytes,
|
|
337
|
+
path: Union[str, os.PathLike[str]],
|
|
338
|
+
catalog: Optional["Catalog"] = None,
|
|
336
339
|
) -> "Self":
|
|
337
340
|
if catalog is None:
|
|
338
341
|
from datachain.catalog.loader import get_catalog
|
|
@@ -340,8 +343,10 @@ class File(DataModel):
|
|
|
340
343
|
catalog = get_catalog()
|
|
341
344
|
from datachain.client.fsspec import Client
|
|
342
345
|
|
|
343
|
-
|
|
344
|
-
|
|
346
|
+
path_str = stringify_path(path)
|
|
347
|
+
|
|
348
|
+
client_cls = Client.get_implementation(path_str)
|
|
349
|
+
source, rel_path = client_cls.split_url(path_str)
|
|
345
350
|
|
|
346
351
|
client = catalog.get_client(client_cls.get_uri(source))
|
|
347
352
|
file = client.upload(data, rel_path)
|
|
@@ -351,7 +356,9 @@ class File(DataModel):
|
|
|
351
356
|
return file
|
|
352
357
|
|
|
353
358
|
@classmethod
|
|
354
|
-
def at(
|
|
359
|
+
def at(
|
|
360
|
+
cls, uri: Union[str, os.PathLike[str]], session: Optional["Session"] = None
|
|
361
|
+
) -> "Self":
|
|
355
362
|
"""Construct a File from a full URI in one call.
|
|
356
363
|
|
|
357
364
|
Example:
|
|
@@ -364,9 +371,10 @@ class File(DataModel):
|
|
|
364
371
|
if session is None:
|
|
365
372
|
session = Session.get()
|
|
366
373
|
catalog = session.catalog
|
|
374
|
+
uri_str = stringify_path(uri)
|
|
367
375
|
|
|
368
|
-
client_cls = Client.get_implementation(
|
|
369
|
-
source, rel_path = client_cls.split_url(
|
|
376
|
+
client_cls = Client.get_implementation(uri_str)
|
|
377
|
+
source, rel_path = client_cls.split_url(uri_str)
|
|
370
378
|
file = cls(source=client_cls.get_uri(source), path=rel_path)
|
|
371
379
|
file._set_stream(catalog)
|
|
372
380
|
return file
|
datachain/lib/namespaces.py
CHANGED
|
@@ -77,7 +77,7 @@ def ls(session: Optional[Session] = None) -> list[Namespace]:
|
|
|
77
77
|
return Session.get(session).catalog.metastore.list_namespaces()
|
|
78
78
|
|
|
79
79
|
|
|
80
|
-
def delete_namespace(name: str, session: Optional[Session]) -> None:
|
|
80
|
+
def delete_namespace(name: str, session: Optional[Session] = None) -> None:
|
|
81
81
|
"""
|
|
82
82
|
Removes a namespace by name.
|
|
83
83
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.32.1
|
|
3
|
+
Version: 0.32.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -102,7 +102,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
102
102
|
Requires-Dist: ultralytics; extra == "tests"
|
|
103
103
|
Provides-Extra: dev
|
|
104
104
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
105
|
-
Requires-Dist: mypy==1.
|
|
105
|
+
Requires-Dist: mypy==1.18.1; extra == "dev"
|
|
106
106
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
107
107
|
Requires-Dist: types-dateparser; extra == "dev"
|
|
108
108
|
Requires-Dist: types-pytz; extra == "dev"
|
|
@@ -3,7 +3,7 @@ datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
|
3
3
|
datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
|
|
4
4
|
datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
|
|
5
5
|
datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
|
|
6
|
-
datachain/dataset.py,sha256=
|
|
6
|
+
datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
|
|
7
7
|
datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
|
|
8
8
|
datachain/error.py,sha256=comKx1JCdjsBpxabrOWaiRP0aHBspBDZl1mkKFnBSq0,1739
|
|
9
9
|
datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
|
|
@@ -49,7 +49,7 @@ datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
|
|
|
49
49
|
datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
|
|
50
50
|
datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
|
|
51
51
|
datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4,426
|
|
52
|
-
datachain/data_storage/metastore.py,sha256=
|
|
52
|
+
datachain/data_storage/metastore.py,sha256=SrcMeHAjzwTbX8A3WEZ3zzQzVW1n7uamrGDtQXqucyE,55810
|
|
53
53
|
datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
|
|
54
54
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
55
55
|
datachain/data_storage/sqlite.py,sha256=1fIeIhmB3O8oQVzP8dDKap0KUIgI0n2TdBQSyv0R8J4,30345
|
|
@@ -75,14 +75,14 @@ datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
|
|
|
75
75
|
datachain/lib/clip.py,sha256=ae6uoiymOl53rBXwIfqJkbHrk_IA21R1uJwXo5454C4,6145
|
|
76
76
|
datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
|
|
77
77
|
datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
|
|
78
|
-
datachain/lib/file.py,sha256=
|
|
78
|
+
datachain/lib/file.py,sha256=FNM9XBn5uxOwaRedlL-aCYQ1CXboFaeQh5WzJXU3WhA,47505
|
|
79
79
|
datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
|
|
80
80
|
datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
|
|
81
81
|
datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
|
|
82
82
|
datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
|
|
83
83
|
datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
|
|
84
84
|
datachain/lib/model_store.py,sha256=A0pSVQ7uaZ9RvANapzirF8Cqq9N6ysosPpMSkzdRPkU,3226
|
|
85
|
-
datachain/lib/namespaces.py,sha256=
|
|
85
|
+
datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3775
|
|
86
86
|
datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
|
|
87
87
|
datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
|
|
88
88
|
datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
|
|
@@ -110,10 +110,10 @@ datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
|
|
|
110
110
|
datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
|
|
111
111
|
datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
|
|
112
112
|
datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
|
|
113
|
-
datachain/lib/dc/parquet.py,sha256=
|
|
113
|
+
datachain/lib/dc/parquet.py,sha256=ASTrT1UhIbss8jcI5171mrlDQZ_sEFDcIA3qBxuPhZQ,2405
|
|
114
114
|
datachain/lib/dc/records.py,sha256=l7TKSKjT6boXGd05KA5vvax-Y-mLMOo46VWrlxPhmdQ,3067
|
|
115
|
-
datachain/lib/dc/storage.py,sha256=
|
|
116
|
-
datachain/lib/dc/storage_pattern.py,sha256=
|
|
115
|
+
datachain/lib/dc/storage.py,sha256=5GybJi5zftorrNzSk6HZw-rAda-KU7KEU29putjVRVc,9842
|
|
116
|
+
datachain/lib/dc/storage_pattern.py,sha256=FAEsXRl9QAWz-x1wgrJEC8Ehh049GgoeC_HW3Vlwx-c,7658
|
|
117
117
|
datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
|
|
118
118
|
datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
|
|
119
119
|
datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
|
|
@@ -161,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
161
161
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
162
162
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
163
163
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
164
|
-
datachain-0.32.
|
|
165
|
-
datachain-0.32.
|
|
166
|
-
datachain-0.32.
|
|
167
|
-
datachain-0.32.
|
|
168
|
-
datachain-0.32.
|
|
169
|
-
datachain-0.32.
|
|
164
|
+
datachain-0.32.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
165
|
+
datachain-0.32.3.dist-info/METADATA,sha256=MJCn0xaCu7eOuQl8AXKTFX4HTvPqtBPY93rCvcUcoBg,13607
|
|
166
|
+
datachain-0.32.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
167
|
+
datachain-0.32.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
168
|
+
datachain-0.32.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
169
|
+
datachain-0.32.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|