datachain 0.31.0 → 0.31.2 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/client/fsspec.py +11 -0
- datachain/lib/clip.py +9 -9
- datachain/lib/dc/csv.py +15 -19
- datachain/lib/dc/datachain.py +64 -62
- datachain/lib/dc/datasets.py +8 -8
- datachain/lib/dc/hf.py +13 -17
- datachain/lib/dc/json.py +9 -9
- datachain/lib/dc/parquet.py +11 -14
- datachain/lib/dc/records.py +2 -2
- datachain/lib/dc/storage.py +74 -44
- datachain/lib/dc/storage_pattern.py +300 -0
- {datachain-0.31.0.dist-info → datachain-0.31.2.dist-info}/METADATA +1 -1
- {datachain-0.31.0.dist-info → datachain-0.31.2.dist-info}/RECORD +17 -16
- {datachain-0.31.0.dist-info → datachain-0.31.2.dist-info}/WHEEL +0 -0
- {datachain-0.31.0.dist-info → datachain-0.31.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.31.0.dist-info → datachain-0.31.2.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.31.0.dist-info → datachain-0.31.2.dist-info}/top_level.txt +0 -0
datachain/client/fsspec.py  CHANGED

@@ -44,6 +44,7 @@ FETCH_WORKERS = 100
 DELIMITER = "/" # Path delimiter.

 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
+CLOUD_STORAGE_PROTOCOLS = {"s3", "gs", "az", "hf"}

 ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]

@@ -62,6 +63,16 @@ def _is_win_local_path(uri: str) -> bool:
     return False


+def is_cloud_uri(uri: str) -> bool:
+    protocol = urlparse(uri).scheme
+    return protocol in CLOUD_STORAGE_PROTOCOLS
+
+
+def get_cloud_schemes() -> list[str]:
+    """Get list of cloud storage scheme prefixes."""
+    return [f"{p}://" for p in CLOUD_STORAGE_PROTOCOLS]
+
+
 class Bucket(NamedTuple):
     name: str
     uri: "StorageURI"
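The two new helpers are small enough to exercise directly. A minimal sketch of how they might be used (the import path is the one shown in the diff; the URIs are made-up examples):

```python
# Illustrative only; assumes the helpers behave exactly as shown in the diff above.
from datachain.client.fsspec import get_cloud_schemes, is_cloud_uri

print(is_cloud_uri("s3://my-bucket/data/file.csv"))  # True: "s3" is a cloud protocol
print(is_cloud_uri("file:///tmp/data/file.csv"))     # False: "file" is not in the set
print(get_cloud_schemes())  # e.g. ["s3://", "gs://", "az://", "hf://"] (set order varies)
```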
datachain/lib/clip.py  CHANGED

@@ -45,15 +45,15 @@ def clip_similarity_scores(
     Calculate CLIP similarity scores between one or more images and/or text.

     Parameters:
-        images
-        text
-        model
-        preprocess
-        tokenizer
-        prob
-        image_to_text
-            if only one of images or text provided.
-        device
+        images: Images to use as inputs.
+        text: Text to use as inputs.
+        model: Model from clip or open_clip packages.
+        preprocess: Image preprocessor to apply.
+        tokenizer: Text tokenizer.
+        prob: Compute softmax probabilities.
+        image_to_text: Whether to compute for image-to-text or text-to-image. Ignored
+            if only one of the images or text provided.
+        device: Device to use. Default is None - use model's device.


     Example:
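Based on the documented parameters, a hypothetical call could look like the following; the open_clip model setup and the image path are illustrative assumptions, not part of this diff:

```python
# Hypothetical usage based on the documented parameters; not taken from the package docs.
import open_clip
from PIL import Image

from datachain.lib.clip import clip_similarity_scores

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

scores = clip_similarity_scores(
    images=Image.open("cat.jpg"),    # illustrative local file
    text=["a cat", "a dog"],
    model=model,
    preprocess=preprocess,
    tokenizer=tokenizer,
    prob=True,                        # softmax probabilities, per the new docstring
    image_to_text=True,               # ignored when only images or only text is given
)
```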
datachain/lib/dc/csv.py  CHANGED

@@ -1,10 +1,6 @@
+import os
 from collections.abc import Sequence
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Callable, Optional, Union

 from datachain.lib.dc.utils import DatasetPrepareError, OutputType
 from datachain.lib.model_store import ModelStore
@@ -17,14 +13,14 @@ if TYPE_CHECKING:


 def read_csv(
-    path,
+    path: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
     delimiter: Optional[str] = None,
     header: bool = True,
     output: OutputType = None,
     column: str = "",
     model_name: str = "",
     source: bool = True,
-    nrows=None,
+    nrows: Optional[int] = None,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
@@ -34,21 +30,21 @@ def read_csv(
     """Generate chain from csv files.

     Parameters:
-        path
+        path: Storage URI with directory. URI must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///".
-        delimiter
+        delimiter: Character for delimiting columns. Takes precedence if also
             specified in `parse_options`. Defaults to ",".
-        header
-        output
+        header: Whether the files include a header row.
+        output: Dictionary or feature class defining column names and their
             corresponding types. List of column names is also accepted, in which
             case types will be inferred.
-        column
-        model_name
-        source
-        nrows
-        session
-        settings
-        column_types
+        column: Created column name.
+        model_name: Generated model name.
+        source: Whether to include info about the source file.
+        nrows: Optional row limit.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        column_types: Dictionary of column names and their corresponding types.
             It is passed to CSV reader and for each column specified type auto
             inference is disabled.
         parse_options: Tells the parser how to process lines.
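Put together, a hedged sketch of the updated signature (the bucket path and the output schema are placeholders):

```python
# Illustrative sketch; the path and the output schema are placeholders.
import datachain as dc

chain = dc.read_csv(
    "s3://my-bucket/tables/",         # may also be a local path or a list of paths
    delimiter=";",                     # overrides parse_options if both are given
    output={"id": int, "name": str},   # column names and types; omit to infer
    nrows=1_000,                       # optional row limit
)
```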
datachain/lib/dc/datachain.py  CHANGED

@@ -40,11 +40,7 @@ from datachain.lib.data_model import (
     StandardType,
     dict_to_data_model,
 )
-from datachain.lib.file import (
-    EXPORT_FILES_MAX_THREADS,
-    ArrowRow,
-    FileExporter,
-)
+from datachain.lib.file import EXPORT_FILES_MAX_THREADS, ArrowRow, FileExporter
 from datachain.lib.file import ExportPlacement as FileExportPlacement
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
@@ -352,24 +348,28 @@ class DataChain:
         batch_size: Optional[int] = None,
         sys: Optional[bool] = None,
     ) -> "Self":
-        """
-
-
-
+        """
+        Set chain execution parameters. Returns the chain itself, allowing method
+        chaining for subsequent operations. To restore all settings to their default
+        values, use `reset_settings()`.

         Parameters:
-            cache
-
-
-
-
-
-
-
-
-
-
+            cache: Enable files caching to speed up subsequent accesses to the same
+                files from the same or different chains. Defaults to False.
+            prefetch: Enable prefetching of files. This will download files in
+                advance in parallel. If an integer is provided, it specifies the number
+                of files to prefetch concurrently for each process on each worker.
+                Defaults to 2. Set to 0 or False to disable prefetching.
+            parallel: Number of processes to use for processing user-defined functions
+                (UDFs) in parallel. If an integer is provided, it specifies the number
+                of CPUs to use. If True, all available CPUs are used. Defaults to 1.
+            namespace: Namespace to use for the chain by default.
+            project: Project to use for the chain by default.
+            min_task_size: Minimum number of rows per worker/process for parallel
+                processing by UDFs. Defaults to 1.
+            batch_size: Number of rows per insert by UDF to fine tune and balance speed
+                and memory usage. This might be useful when processing large rows
+                or when running into memory issues. Defaults to 2000.

         Example:
             ```py
@@ -398,7 +398,7 @@
         return self._evolve(settings=settings, _sys=sys)

     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
-        """Reset all settings to default values."""
+        """Reset all chain settings to default values."""
         self._settings = settings if settings else Settings()
         return self

@@ -580,14 +580,14 @@
         """Save to a Dataset. It returns the chain itself.

         Parameters:
-            name
-
-                case
-
-            version
+            name: dataset name. This can be either a fully qualified name, including
+                the namespace and project, or just a regular dataset name. In the latter
+                case, the namespace and project will be taken from the settings
+                (if specified) or from the default values otherwise.
+            version: version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
-            description
-            attrs
+            description: description of a dataset.
+            attrs: attributes of a dataset. They can be without value, e.g "NLP",
                 or with a value, e.g "location=US".
             update_version: which part of the dataset version to automatically increase.
                 Available values: `major`, `minor` or `patch`. Default is `patch`.
@@ -661,7 +661,9 @@
             # current latest version instead.
             from .datasets import read_dataset

-            return read_dataset(
+            return read_dataset(
+                name, namespace=namespace_name, project=project_name, **kwargs
+            )

         return self._evolve(
             query=self._query.save(
@@ -704,7 +706,7 @@
         func: Optional[Callable] = None,
         params: Union[None, str, Sequence[str]] = None,
         output: OutputType = None,
-        **signal_map,
+        **signal_map: Any,
     ) -> "Self":
         """Apply a function to each row to create new signals. The function should
         return a new object for each row. It returns a chain itself with new signals.
@@ -712,17 +714,17 @@
         Input-output relationship: 1:1

         Parameters:
-            func
-            params
+            func: Function applied to each row.
+            params: List of column names used as input for the function. Default
                 is taken from function signature.
-            output
+            output: Dictionary defining new signals and their corresponding types.
                 Default type is taken from function signature. Default can be also
                 taken from kwargs - **signal_map (see below).
                 If signal name is defined using signal_map (see below) only a single
                 type value can be used.
-            **signal_map
+            **signal_map: kwargs can be used to define `func` together with its return
                 signal name in format of `map(my_sign=my_func)`. This helps define
-                signal names and
+                signal names and functions in a nicer way.

         Example:
             Using signal_map and single type in output:
@@ -941,7 +943,7 @@
         It accepts the same parameters plus an
         additional parameter:

-            batch
+            batch: Size of each batch passed to `func`. Defaults to 1000.

         Example:
             ```py
@@ -1309,9 +1311,9 @@
         """Yields flattened rows of values as a tuple.

         Args:
-            row_factory
-
-
+            row_factory: A callable to convert row to a custom format.
+                It should accept two arguments: a list of column names and
+                a tuple of row values.
             include_hidden: Whether to include hidden signals from the schema.
         """
         db_signals = self._effective_signals_schema.db_signals(
@@ -1956,19 +1958,19 @@
         model_name: str = "",
         source: bool = True,
         nrows: Optional[int] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> "Self":
         """Generate chain from list of tabular files.

         Parameters:
-            output
+            output: Dictionary or feature class defining column names and their
                 corresponding types. List of column names is also accepted, in which
                 case types will be inferred.
-            column
-            model_name
-            source
-            nrows
-            kwargs
+            column: Generated column name.
+            model_name: Generated model name.
+            source: Whether to include info about the source file.
+            nrows: Optional row limit.
+            kwargs: Parameters to pass to pyarrow.dataset.dataset.

         Example:
             Reading a json lines file:
@@ -2098,12 +2100,12 @@
         """Save chain to parquet file with SignalSchema metadata.

         Parameters:
-            path
+            path: Path or a file-like binary object to save the file. This supports
                 local paths as well as remote paths, such as s3:// or hf:// with fsspec.
-            partition_cols
-            chunk_size
+            partition_cols: Column names by which to partition the dataset.
+            chunk_size: The chunk size of results to read and convert to columnar
                 data, to avoid running out of memory.
-            fs_kwargs
+            fs_kwargs: Optional kwargs to pass to the fsspec filesystem, used only for
                 write, for fsspec-type URLs, such as s3:// or hf:// when
                 provided as the destination path.
         """
@@ -2195,10 +2197,10 @@
         """Save chain to a csv (comma-separated values) file.

         Parameters:
-            path
+            path: Path to save the file. This supports local paths as well as
                 remote paths, such as s3:// or hf:// with fsspec.
-            delimiter
-            fs_kwargs
+            delimiter: Delimiter to use for the resulting file.
+            fs_kwargs: Optional kwargs to pass to the fsspec filesystem, used only for
                 write, for fsspec-type URLs, such as s3:// or hf:// when
                 provided as the destination path.
         """
@@ -2241,12 +2243,12 @@
         """Save chain to a JSON file.

         Parameters:
-            path
+            path: Path to save the file. This supports local paths as well as
                 remote paths, such as s3:// or hf:// with fsspec.
-            fs_kwargs
+            fs_kwargs: Optional kwargs to pass to the fsspec filesystem, used only for
                 write, for fsspec-type URLs, such as s3:// or hf:// when
                 provided as the destination path.
-            include_outer_list
+            include_outer_list: Sets whether to include an outer list for all rows.
                 Setting this to True makes the file valid JSON, while False instead
                 writes in the JSON lines format.
         """
@@ -2301,9 +2303,9 @@
         """Save chain to a JSON lines file.

         Parameters:
-            path
+            path: Path to save the file. This supports local paths as well as
                 remote paths, such as s3:// or hf:// with fsspec.
-            fs_kwargs
+            fs_kwargs: Optional kwargs to pass to the fsspec filesystem, used only for
                 write, for fsspec-type URLs, such as s3:// or hf:// when
                 provided as the destination path.
         """
@@ -2571,9 +2573,9 @@
             The possible values are: "filename", "etag", "fullpath", and "checksum".
             link_type: Method to use for exporting files.
                 Falls back to `'copy'` if symlinking fails.
-            num_threads
-                By default it uses 5 threads.
-            anon: If True, we will treat cloud bucket as public one. Default behavior
+            num_threads: number of threads to use for exporting files.
+                By default, it uses 5 threads.
+            anon: If True, we will treat cloud bucket as a public one. Default behavior
                 depends on the previous session configuration (e.g. happens in the
                 initial `read_storage`) and particular cloud storage client
                 implementation (e.g. S3 fallbacks to anonymous access if no credentials
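The expanded `settings()` docstring reads naturally as a chained call. A short sketch under those documented defaults (the URI and dataset name are placeholders, and the values are examples, not recommendations):

```python
# Sketch of the documented settings; names and values are illustrative only.
import datachain as dc

chain = (
    dc.read_storage("s3://my-bucket/images/")   # placeholder URI
    .settings(cache=True, prefetch=4, parallel=2, batch_size=2000)
    .save("my-images")                           # placeholder dataset name
)
chain.reset_settings()  # restore all chain settings to their defaults
```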
datachain/lib/dc/datasets.py  CHANGED

@@ -51,14 +51,14 @@ def read_dataset(
             namespace and project. Alternatively, it can be a regular name, in which
             case the explicitly defined namespace and project will be used if they are
             set; otherwise, default values will be applied.
-        namespace
-        project
-        version
+        namespace: optional name of namespace in which dataset to read is created
+        project: optional name of project in which dataset to read is created
+        version: dataset version. Supports:
            - Exact version strings: "1.2.3"
            - Legacy integer versions: 1, 2, 3 (finds latest major version)
            - Version specifiers (PEP 440): ">=1.0.0,<2.0.0", "~=1.4.2", "==1.2.*", etc.
-        session
-        settings
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
         delta: If True, only process new or changed files instead of reprocessing
             everything. This saves time by skipping files that were already processed in
             previous versions. The optimization is working when a new version of the
@@ -314,9 +314,9 @@ def delete_dataset(
             namespace and project. Alternatively, it can be a regular name, in which
             case the explicitly defined namespace and project will be used if they are
             set; otherwise, default values will be applied.
-        namespace
-        project
-        version
+        namespace: optional name of namespace in which dataset to delete is created
+        project: optional name of project in which dataset to delete is created
+        version: Optional dataset version
        force: If true, all datasets versions will be removed. Defaults to False.
        studio: If True, removes dataset from Studio only, otherwise removes local
            dataset. Defaults to False.
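The version-specifier support can be summarized with a short sketch (dataset name is a placeholder; the top-level exports mirror the functions documented above):

```python
# Illustrative only; the dataset name is a placeholder.
import datachain as dc

ds = dc.read_dataset("my-dataset", version="1.2.3")             # exact version string
ds = dc.read_dataset("my-dataset", version=1)                   # legacy: latest 1.x.x
ds = dc.read_dataset("my-dataset", version=">=1.0.0,<2.0.0")    # PEP 440 specifier

# Assuming delete_dataset is exported at the package top level like read_dataset:
dc.delete_dataset("my-dataset", force=True)  # remove all versions
```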
datachain/lib/dc/hf.py  CHANGED

@@ -1,8 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, Optional, Union

 from datachain.lib.data_model import dict_to_data_model
 from datachain.query import Session
@@ -20,28 +16,28 @@ if TYPE_CHECKING:


 def read_hf(
     dataset: Union[str, "HFDatasetType"],
-    *args,
+    *args: Any,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     column: str = "",
     model_name: str = "",
     limit: int = 0,
-    **kwargs,
+    **kwargs: Any,
 ) -> "DataChain":
     """Generate chain from Hugging Face Hub dataset.

     Parameters:
-        dataset
+        dataset: Path or name of the dataset to read from Hugging Face Hub,
             or an instance of `datasets.Dataset`-like object.
-        args
-        session
-        settings
-        column
-        model_name
-        limit
-
-
-        kwargs
+        args: Additional positional arguments to pass to `datasets.load_dataset`.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        column: Generated object column name.
+        model_name: Generated model name.
+        limit: The maximum number of items to read from the HF dataset.
+            Applies `take(limit)` to `datasets.load_dataset`.
+            Defaults to 0 (no limit).
+        kwargs: Parameters to pass to `datasets.load_dataset`.

     Example:
         Load from Hugging Face Hub:
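A hedged sketch of `read_hf` with the new `limit` parameter ("beans" is a public Hugging Face dataset used only as an example; the keyword arguments are forwarded to `datasets.load_dataset`):

```python
# Illustrative only; dataset name and split are example values.
import datachain as dc

chain = dc.read_hf("beans", split="train", limit=100)  # stop after 100 items
```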
datachain/lib/dc/json.py  CHANGED

@@ -26,22 +26,22 @@ def read_json(
     column: Optional[str] = "",
     model_name: Optional[str] = None,
     format: Optional[str] = "json",
-    nrows=None,
+    nrows: Optional[int] = None,
     **kwargs,
 ) -> "DataChain":
     """Get data from JSON. It returns the chain itself.

     Parameters:
-        path
+        path: storage URI with directory. URI must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///"
-        type
-        spec
-        schema_from
-        column
-        model_name
+        type: read file as "binary", "text", or "image" data. Default is "text".
+        spec: optional Data Model
+        schema_from: path to sample to infer spec (if schema not provided)
+        column: generated column name
+        model_name: optional generated model name
         format: "json", "jsonl"
-        jmespath
-        nrows
+        jmespath: optional JMESPATH expression to reduce JSON
+        nrows: optional row limit for jsonl and JSON arrays

     Example:
         infer JSON schema from data, reduce using JMESPATH
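For orientation, a sketch of `read_json` with the documented `jmespath` and `nrows` options (the URI and the expression are placeholders):

```python
# Illustrative only; the URI and JMESPath expression are placeholders.
import datachain as dc

chain = dc.read_json(
    "gs://my-bucket/meta/annotations.json",  # storage URI with prefix
    jmespath="images",                        # reduce the JSON before inferring a schema
    nrows=500,                                # only meaningful for jsonl / JSON arrays
)
```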
datachain/lib/dc/parquet.py  CHANGED

@@ -1,8 +1,5 @@
-
-
-    Any,
-    Optional,
-)
+import os
+from typing import TYPE_CHECKING, Any, Optional, Union

 from datachain.lib.data_model import DataType
 from datachain.query import Session
@@ -16,7 +13,7 @@ if TYPE_CHECKING:


 def read_parquet(
-    path,
+    path: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
     partitioning: Any = "hive",
     output: Optional[dict[str, DataType]] = None,
     column: str = "",
@@ -29,15 +26,15 @@ def read_parquet(
     """Generate chain from parquet files.

     Parameters:
-        path
+        path: Storage URI with directory. URI must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///".
-        partitioning
-        output
-        column
-        model_name
-        source
-        session
-        settings
+        partitioning: Any pyarrow partitioning schema.
+        output: Dictionary defining column names and their corresponding types.
+        column: Created column name.
+        model_name: Generated model name.
+        source: Whether to include info about the source file.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.

     Example:
         Reading a single file:
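A hedged sketch of the updated `read_parquet` signature (path and schema are placeholders):

```python
# Illustrative only; the path and output schema are placeholders.
import datachain as dc

chain = dc.read_parquet(
    "s3://my-bucket/dataset/",           # or a list of files / local paths
    partitioning="hive",                  # any pyarrow partitioning schema
    output={"id": int, "score": float},   # optional explicit column types
)
```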
datachain/lib/dc/records.py  CHANGED

@@ -30,9 +30,9 @@ def read_records(
         or other sources.

     Parameters:
-        to_insert
+        to_insert: records (or a single record) to insert. Each record is
             a dictionary of signals and their values.
-        schema
+        schema: describes chain signals and their corresponding types

     Example:
         ```py
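A minimal sketch of `read_records` with the documented parameters (record contents and schema are placeholders):

```python
# Illustrative only; record contents and schema are placeholders.
import datachain as dc

chain = dc.read_records(
    [{"id": 1, "label": "cat"}, {"id": 2, "label": "dog"}],
    schema={"id": int, "label": str},
)
```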
datachain/lib/dc/storage.py  CHANGED

@@ -1,22 +1,17 @@
-import os
+import os
 from collections.abc import Sequence
 from functools import reduce
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Optional, Union

-from datachain.lib.
-
-
-
-
-
-    get_listing,
-    list_bucket,
-    ls,
+from datachain.lib.dc.storage_pattern import (
+    apply_glob_filter,
+    expand_brace_pattern,
+    should_use_recursion,
+    split_uri_pattern,
+    validate_cloud_bucket_name,
 )
+from datachain.lib.file import FileType, get_file_type
+from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.query import Session

 if TYPE_CHECKING:
@@ -50,15 +45,19 @@ def read_storage(
     It returns the chain itself as usual.

     Parameters:
-        uri
-
-
-
-
-
-
-
-
+        uri: Storage path(s) or URI(s). Can be a local path or start with a
+            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+            Supports glob patterns:
+            - `*` : wildcard
+            - `**` : recursive wildcard
+            - `?` : single character
+            - `{a,b}` : brace expansion
+        type: read file as "binary", "text", or "image" data. Default is "binary".
+        recursive: search recursively for the given path.
+        column: Column name that will contain File objects. Default is "file".
+        update: force storage reindexing. Default is False.
+        anon: If True, we will treat cloud bucket as public one.
+        client_config: Optional client configuration for the storage client.
         delta: If True, only process new or changed files instead of reprocessing
             everything. This saves time by skipping files that were already processed in
             previous versions. The optimization is working when a new version of the
@@ -92,12 +91,19 @@ def read_storage(
         chain = dc.read_storage("s3://my-bucket/my-dir")
         ```

+        Match all .json files recursively using glob pattern
+        ```py
+        chain = dc.read_storage("gs://bucket/meta/**/*.json")
+        ```
+
+        Match image file extensions for directories with pattern
+        ```py
+        chain = dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
+        ```
+
         Multiple URIs:
         ```python
-        chain = dc.read_storage([
-            "s3://bucket1/dir1",
-            "s3://bucket2/dir2"
-        ])
+        chain = dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
         ```

         With AWS S3-compatible storage:
@@ -107,19 +113,6 @@ def read_storage(
             client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
         )
         ```
-
-        Pass existing session
-        ```py
-        session = Session.get()
-        chain = dc.read_storage([
-            "path/to/dir1",
-            "path/to/dir2"
-        ], session=session, recursive=True)
-        ```
-
-    Note:
-        When using multiple URIs with `update=True`, the function optimizes by
-        avoiding redundant updates for URIs pointing to the same storage location.
     """
     from .datachain import DataChain
     from .datasets import read_dataset
@@ -142,13 +135,36 @@ def read_storage(
     if not uris:
         raise ValueError("No URIs provided")

+    # Then expand all URIs that contain brace patterns
+    expanded_uris = []
+    for single_uri in uris:
+        uri_str = str(single_uri)
+        validate_cloud_bucket_name(uri_str)
+        expanded_uris.extend(expand_brace_pattern(uri_str))
+
+    # Now process each expanded URI
     chains = []
     listed_ds_name = set()
     file_values = []

-
+    updated_uris = set()
+
+    for single_uri in expanded_uris:
+        # Check if URI contains glob patterns and split them
+        base_uri, glob_pattern = split_uri_pattern(single_uri)
+
+        # If a pattern is found, use the base_uri for listing
+        # The pattern will be used for filtering later
+        list_uri_to_use = base_uri if glob_pattern else single_uri
+
+        # Avoid double updates for the same URI
+        update_single_uri = False
+        if update and (list_uri_to_use not in updated_uris):
+            updated_uris.add(list_uri_to_use)
+            update_single_uri = True
+
         list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
-
+            list_uri_to_use, session, update=update_single_uri
         )

         # list_ds_name is None if object is a file, we don't want to use cache
@@ -197,7 +213,21 @@ def read_storage(
                 lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
             )

-
+        # If a glob pattern was detected, use it for filtering
+        # Otherwise, use the original list_path from get_listing
+        if glob_pattern:
+            # Determine if we should use recursive listing based on the pattern
+            use_recursive = should_use_recursion(glob_pattern, recursive or False)
+
+            # Apply glob filter - no need for brace expansion here as it's done above
+            chain = apply_glob_filter(
+                dc, glob_pattern, list_path, use_recursive, column
+            )
+            chains.append(chain)
+        else:
+            # No glob pattern detected, use normal ls behavior
+            chains.append(ls(dc, list_path, recursive=recursive, column=column))
+
         listed_ds_name.add(list_ds_name)

     storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
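The new pattern flow (validate bucket name → expand braces → split base/pattern → list → glob filter) boils down to calls like these; bucket names are placeholders and the patterns mirror the docstring examples:

```python
# Illustrative only; bucket names are placeholders, patterns follow the docstring examples.
import datachain as dc

# Brace expansion + globstar: expands to *.jpg, *.jpeg, *.png under any 202? year dir.
chain = dc.read_storage("s3://my-bucket/202?/**/*.{jpg,jpeg,png}")

# Single-level glob: only files directly under meta/ (no recursion implied by "*").
chain = dc.read_storage("gs://my-bucket/meta/*.json")
```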
datachain/lib/dc/storage_pattern.py  ADDED

@@ -0,0 +1,300 @@
+import glob
+from typing import TYPE_CHECKING, Union
+
+from datachain.client.fsspec import is_cloud_uri
+from datachain.lib.listing import ls
+
+if TYPE_CHECKING:
+    from .datachain import DataChain
+
+
+def validate_cloud_bucket_name(uri: str) -> None:
+    """
+    Validate that cloud storage bucket names don't contain glob patterns.
+
+    Args:
+        uri: URI to validate
+
+    Raises:
+        ValueError: If a cloud storage bucket name contains glob patterns
+    """
+    if not is_cloud_uri(uri):
+        return
+
+    # Extract bucket name (everything between :// and first /)
+    if "://" in uri:
+        scheme_end = uri.index("://") + 3
+        path_part = uri[scheme_end:]
+
+        # Get the bucket name (first segment)
+        if "/" in path_part:
+            bucket_name = path_part.split("/")[0]
+        else:
+            bucket_name = path_part
+
+        # Check if bucket name contains glob patterns
+        glob_chars = ["*", "?", "[", "]", "{", "}"]
+        if any(char in bucket_name for char in glob_chars):
+            raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
+
+
+def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
+    """
+    Split a URI into base path and glob pattern.
+
+    Args:
+        uri: URI that may contain glob patterns (*, **, ?, {})
+
+    Returns:
+        Tuple of (base_uri, pattern) where pattern is None if no glob pattern found
+
+    Examples:
+        "s3://bucket/dir/*.mp3" -> ("s3://bucket/dir", "*.mp3")
+        "s3://bucket/**/*.mp3" -> ("s3://bucket", "**/*.mp3")
+        "s3://bucket/dir" -> ("s3://bucket/dir", None)
+    """
+    if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
+        return uri, None
+
+    # Handle different URI schemes
+    if "://" in uri:
+        # Split into scheme and path
+        scheme_end = uri.index("://") + 3
+        scheme_part = uri[:scheme_end]
+        path_part = uri[scheme_end:]
+
+        # Find where the glob pattern starts
+        path_segments = path_part.split("/")
+
+        # Find first segment with glob pattern
+        pattern_start_idx = None
+        for i, segment in enumerate(path_segments):
+            # Check for glob patterns including brace expansion
+            if glob.has_magic(segment) or "{" in segment:
+                pattern_start_idx = i
+                break
+
+        if pattern_start_idx is None:
+            return uri, None
+
+        # Split into base and pattern
+        if pattern_start_idx == 0:
+            # Pattern at root of bucket
+            base = scheme_part + path_segments[0]
+            pattern = "/".join(path_segments[1:]) if len(path_segments) > 1 else "*"
+        else:
+            base = scheme_part + "/".join(path_segments[:pattern_start_idx])
+            pattern = "/".join(path_segments[pattern_start_idx:])
+
+        return base, pattern
+    # Local path
+    path_segments = uri.split("/")
+
+    # Find first segment with glob pattern
+    pattern_start_idx = None
+    for i, segment in enumerate(path_segments):
+        # Check for glob patterns including brace expansion
+        if glob.has_magic(segment) or "{" in segment:
+            pattern_start_idx = i
+            break
+
+    if pattern_start_idx is None:
+        return uri, None
+
+    # Split into base and pattern
+    base = "/".join(path_segments[:pattern_start_idx]) if pattern_start_idx > 0 else "/"
+    pattern = "/".join(path_segments[pattern_start_idx:])
+
+    return base, pattern
+
+
+def should_use_recursion(pattern: str, user_recursive: bool) -> bool:
+    """
+    Determine if we should use recursive listing based on the pattern.
+
+    Args:
+        pattern: The glob pattern extracted from URI
+        user_recursive: User's recursive preference
+
+    Returns:
+        True if recursive listing should be used
+
+    Examples:
+        "*" -> False (single level only)
+        "*.mp3" -> False (single level only)
+        "**/*.mp3" -> True (globstar requires recursion)
+        "dir/*/file.txt" -> True (multi-level pattern)
+    """
+    if not user_recursive:
+        # If user explicitly wants non-recursive, respect that
+        return False
+
+    # If pattern contains globstar, definitely need recursion
+    if "**" in pattern:
+        return True
+
+    # If pattern contains path separators, it needs recursion
+    # Single-level patterns like "*", "*.txt", "file?" should not be recursive
+    return "/" in pattern
+
+
+def expand_brace_pattern(pattern: str) -> list[str]:
+    """
+    Recursively expand brace patterns like *.{mp3,wav} into multiple glob patterns.
+    Handles nested and multiple brace patterns.
+
+    Args:
+        pattern: Pattern that may contain brace expansion
+
+    Returns:
+        List of expanded patterns
+
+    Examples:
+        "*.{mp3,wav}" -> ["*.mp3", "*.wav"]
+        "{a,b}/{c,d}" -> ["a/c", "a/d", "b/c", "b/d"]
+        "*.txt" -> ["*.txt"]
+        "{{a,b}}" -> ["{a}", "{b}"]  # Handle double braces
+    """
+    if "{" not in pattern or "}" not in pattern:
+        return [pattern]
+
+    return _expand_single_braces(pattern)
+
+
+def _expand_single_braces(pattern: str) -> list[str]:
+    """Helper to expand single-level braces."""
+    if "{" not in pattern or "}" not in pattern:
+        return [pattern]
+
+    # Find the first complete brace pattern
+    start = pattern.index("{")
+    end = start
+    depth = 0
+    for i in range(start, len(pattern)):
+        if pattern[i] == "{":
+            depth += 1
+        elif pattern[i] == "}":
+            depth -= 1
+            if depth == 0:
+                end = i
+                break
+
+    if start >= end:
+        return [pattern]
+
+    prefix = pattern[:start]
+    suffix = pattern[end + 1 :]
+    options = pattern[start + 1 : end].split(",")
+
+    # Generate all combinations and recursively expand
+    expanded = []
+    for option in options:
+        combined = prefix + option.strip() + suffix
+        # Recursively expand any remaining braces
+        expanded.extend(_expand_single_braces(combined))
+
+    return expanded
+
+
+def convert_globstar_to_glob(filter_pattern: str) -> str:
+    """Convert globstar patterns to GLOB patterns.
+
+    Standard GLOB doesn't understand ** as recursive wildcard,
+    so we need to convert patterns appropriately.
+
+    Args:
+        filter_pattern: Pattern that may contain globstars (**)
+
+    Returns:
+        GLOB-compatible pattern
+    """
+    if "**" not in filter_pattern:
+        return filter_pattern
+
+    parts = filter_pattern.split("/")
+    globstar_positions = [i for i, p in enumerate(parts) if p == "**"]
+
+    # Handle different cases based on number of globstars
+    num_globstars = len(globstar_positions)
+
+    if num_globstars <= 1:
+        # Special case: pattern like **/* means zero or more directories
+        # This is tricky because GLOB can't express "zero or more"
+        # We need different handling based on the pattern structure
+
+        if filter_pattern == "**/*":
+            # Match everything
+            return "*"
+        if filter_pattern.startswith("**/"):
+            remaining = filter_pattern[3:]
+            if "/" not in remaining:
+                # Pattern like **/*.ext or **/temp?.*
+                # The ** means zero or more directories
+                # For zero directories: pattern should be just the filename pattern
+                # For one or more: pattern should be */filename
+                # Since we can't OR in GLOB, we choose the more permissive option
+                # that works with recursive listing
+                # Special handling: if it's a simple extension pattern, match broadly
+                if remaining.startswith("*."):
+                    # Pattern like **/*.ext - match any file with this extension
+                    # This matches *.ext at current level and deeper with recursion:
+                    return remaining
+                # Pattern like **/temp?.* - match as filename in subdirs
+                return f"*/{remaining}"
+
+        # Default: Zero or one globstar - simple replacement
+        return filter_pattern.replace("**", "*")
+
+    # Multiple globstars - need more careful handling
+    # For patterns like **/level?/backup/**/*.ext
+    # We want to match any path containing /level?/backup/ and ending with .ext
+
+    # Find middle directories (between first and last **)
+    middle_parts = []
+    start_idx = globstar_positions[0] + 1
+    end_idx = globstar_positions[-1]
+    for i in range(start_idx, end_idx):
+        if parts[i] != "**":
+            middle_parts.append(parts[i])
+
+    if not middle_parts:
+        # No fixed middle parts, just use wildcards
+        result = filter_pattern.replace("**", "*")
+    else:
+        # Create pattern that matches the middle parts
+        middle_pattern = "/".join(middle_parts)
+        # Get the file pattern at the end if any
+        last_part = parts[-1] if parts[-1] != "**" else "*"
+
+        # Match any path containing this pattern
+        if last_part != "*":
+            # Has specific file pattern
+            result = f"*{middle_pattern}*{last_part}"
+        else:
+            result = f"*{middle_pattern}*"
+
+    return result
+
+
+def apply_glob_filter(
+    dc: "DataChain",
+    pattern: str,
+    list_path: str,
+    use_recursive: bool,
+    column: str,
+) -> "DataChain":
+    from datachain.query.schema import Column
+
+    chain = ls(dc, list_path, recursive=use_recursive, column=column)
+
+    # If pattern doesn't contain path separator and list_path is not empty,
+    # prepend the list_path to make the pattern match correctly
+    if list_path and "/" not in pattern:
+        filter_pattern = f"{list_path.rstrip('/')}/{pattern}"
+    else:
+        filter_pattern = pattern
+
+    # Convert globstar patterns to GLOB-compatible patterns
+    glob_pattern = convert_globstar_to_glob(filter_pattern)
+
+    return chain.filter(Column(f"{column}.path").glob(glob_pattern))
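The helpers above are pure functions, so their behavior can be checked directly; the expected values below follow the docstring examples and assume the module ships exactly as shown:

```python
# Quick checks mirroring the docstring examples in storage_pattern.py.
from datachain.lib.dc.storage_pattern import (
    convert_globstar_to_glob,
    expand_brace_pattern,
    should_use_recursion,
    split_uri_pattern,
)

assert split_uri_pattern("s3://bucket/dir/*.mp3") == ("s3://bucket/dir", "*.mp3")
assert split_uri_pattern("s3://bucket/dir") == ("s3://bucket/dir", None)
assert expand_brace_pattern("*.{mp3,wav}") == ["*.mp3", "*.wav"]
assert should_use_recursion("**/*.mp3", user_recursive=True) is True
assert should_use_recursion("*.mp3", user_recursive=True) is False
assert convert_globstar_to_glob("**/*.json") == "*.json"  # simple-extension special case
```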
{datachain-0.31.0.dist-info → datachain-0.31.2.dist-info}/RECORD  CHANGED

@@ -41,7 +41,7 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=sChjxu931QgU2-n9MdXlmOrhGAiAckXoDVZTxKcNv6M,14336
 datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
 datachain/client/hf.py,sha256=n5xJZdvNLS-SqokxuBCIPfGbhIeC_XfLm_BNYtEVvg4,2677
 datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,4607
@@ -72,7 +72,7 @@ datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=aedsosbFNjIBa6LQIxR2zhIVcA4pVw1p5hCVmrDhWsQ,10781
 datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
-datachain/lib/clip.py,sha256=
+datachain/lib/clip.py,sha256=ae6uoiymOl53rBXwIfqJkbHrk_IA21R1uJwXo5454C4,6145
 datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
 datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
 datachain/lib/file.py,sha256=IGwpCwjsSOpZXlRsatcMKToMmuvYiX6_UtaTjUKAAdg,44511
@@ -102,17 +102,18 @@ datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8p
 datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
 datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
-datachain/lib/dc/csv.py,sha256=
+datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
 datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
-datachain/lib/dc/datachain.py,sha256=
-datachain/lib/dc/datasets.py,sha256
-datachain/lib/dc/hf.py,sha256=
-datachain/lib/dc/json.py,sha256
+datachain/lib/dc/datachain.py,sha256=pDgUmvmf0ENngFepoD0AkxxqiqNIgoRueejfojyuURQ,100458
+datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
+datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
+datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
-datachain/lib/dc/parquet.py,sha256=
-datachain/lib/dc/records.py,sha256=
-datachain/lib/dc/storage.py,sha256=
+datachain/lib/dc/parquet.py,sha256=STgm19AM-etu7WmOUMJa5Z9GI6tPC-A0P3JO3ulfsKo,1839
+datachain/lib/dc/records.py,sha256=l7TKSKjT6boXGd05KA5vvax-Y-mLMOo46VWrlxPhmdQ,3067
+datachain/lib/dc/storage.py,sha256=pydeiGLMsmDvruVY_bC5GsV6VLpYpRf7szrD0S2pTmE,9688
+datachain/lib/dc/storage_pattern.py,sha256=QDLLSuBd1mdfkdRi3srGXXigs7rHw3vAnQedjE01_H8,9779
 datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -160,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.31.
-datachain-0.31.
-datachain-0.31.
-datachain-0.31.
-datachain-0.31.
-datachain-0.31.
+datachain-0.31.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.31.2.dist-info/METADATA,sha256=ALo4Vp6w2VSanACVy1xv6aHWzbdasSKzD2U8_SybXBU,13898
+datachain-0.31.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.31.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.31.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.31.2.dist-info/RECORD,,
The remaining dist-info files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) are carried over without changes.