datachain 0.25.1__py3-none-any.whl → 0.26.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +6 -0
- datachain/lib/audio.py +151 -0
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/dc/datachain.py +125 -23
- datachain/lib/dc/datasets.py +1 -1
- datachain/lib/file.py +190 -1
- datachain/lib/model_store.py +8 -0
- datachain/lib/signal_schema.py +47 -7
- datachain/lib/udf.py +17 -5
- datachain/query/dataset.py +15 -9
- {datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/METADATA +6 -2
- {datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/RECORD +16 -15
- {datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/WHEEL +0 -0
- {datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
|
@@ -21,6 +21,9 @@ from datachain.lib.dc import (
|
|
|
21
21
|
)
|
|
22
22
|
from datachain.lib.file import (
|
|
23
23
|
ArrowRow,
|
|
24
|
+
Audio,
|
|
25
|
+
AudioFile,
|
|
26
|
+
AudioFragment,
|
|
24
27
|
File,
|
|
25
28
|
FileError,
|
|
26
29
|
Image,
|
|
@@ -43,6 +46,9 @@ __all__ = [
|
|
|
43
46
|
"AbstractUDF",
|
|
44
47
|
"Aggregator",
|
|
45
48
|
"ArrowRow",
|
|
49
|
+
"Audio",
|
|
50
|
+
"AudioFile",
|
|
51
|
+
"AudioFragment",
|
|
46
52
|
"C",
|
|
47
53
|
"Column",
|
|
48
54
|
"DataChain",
|
datachain/lib/audio.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import posixpath
|
|
2
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
3
|
+
|
|
4
|
+
from datachain.lib.file import FileError
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from numpy import ndarray
|
|
8
|
+
|
|
9
|
+
from datachain.lib.file import Audio, AudioFile, File
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import torchaudio
|
|
13
|
+
except ImportError as exc:
|
|
14
|
+
raise ImportError(
|
|
15
|
+
"Missing dependencies for processing audio.\n"
|
|
16
|
+
"To install run:\n\n"
|
|
17
|
+
" pip install 'datachain[audio]'\n"
|
|
18
|
+
) from exc
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def audio_info(file: "Union[File, AudioFile]") -> "Audio":
|
|
22
|
+
"""Extract metadata like sample rate, channels, duration, and format."""
|
|
23
|
+
from datachain.lib.file import Audio
|
|
24
|
+
|
|
25
|
+
file = file.as_audio_file()
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
with file.open() as f:
|
|
29
|
+
info = torchaudio.info(f)
|
|
30
|
+
|
|
31
|
+
sample_rate = int(info.sample_rate)
|
|
32
|
+
channels = int(info.num_channels)
|
|
33
|
+
frames = int(info.num_frames)
|
|
34
|
+
duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
|
|
35
|
+
|
|
36
|
+
# Get format information
|
|
37
|
+
format_name = getattr(info, "format", "")
|
|
38
|
+
codec_name = getattr(info, "encoding", "")
|
|
39
|
+
bit_rate = getattr(info, "bits_per_sample", 0) * sample_rate * channels
|
|
40
|
+
|
|
41
|
+
except Exception as exc:
|
|
42
|
+
raise FileError(
|
|
43
|
+
"unable to extract metadata from audio file", file.source, file.path
|
|
44
|
+
) from exc
|
|
45
|
+
|
|
46
|
+
return Audio(
|
|
47
|
+
sample_rate=sample_rate,
|
|
48
|
+
channels=channels,
|
|
49
|
+
duration=duration,
|
|
50
|
+
samples=frames,
|
|
51
|
+
format=format_name,
|
|
52
|
+
codec=codec_name,
|
|
53
|
+
bit_rate=bit_rate,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def audio_fragment_np(
|
|
58
|
+
audio: "AudioFile", start: float = 0, duration: Optional[float] = None
|
|
59
|
+
) -> "tuple[ndarray, int]":
|
|
60
|
+
"""Load audio fragment as numpy array.
|
|
61
|
+
Multi-channel audio is transposed to (samples, channels)."""
|
|
62
|
+
if start < 0:
|
|
63
|
+
raise ValueError("start must be a non-negative float")
|
|
64
|
+
|
|
65
|
+
if duration is not None and duration <= 0:
|
|
66
|
+
raise ValueError("duration must be a positive float")
|
|
67
|
+
|
|
68
|
+
if hasattr(audio, "as_audio_file"):
|
|
69
|
+
audio = audio.as_audio_file()
|
|
70
|
+
|
|
71
|
+
try:
|
|
72
|
+
with audio.open() as f:
|
|
73
|
+
info = torchaudio.info(f)
|
|
74
|
+
sample_rate = info.sample_rate
|
|
75
|
+
|
|
76
|
+
frame_offset = int(start * sample_rate)
|
|
77
|
+
num_frames = int(duration * sample_rate) if duration is not None else -1
|
|
78
|
+
|
|
79
|
+
# Reset file pointer to the beginning
|
|
80
|
+
# This is important to ensure we read from the correct position later
|
|
81
|
+
f.seek(0)
|
|
82
|
+
|
|
83
|
+
waveform, sr = torchaudio.load(
|
|
84
|
+
f, frame_offset=frame_offset, num_frames=num_frames
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
audio_np = waveform.numpy()
|
|
88
|
+
|
|
89
|
+
if audio_np.shape[0] > 1:
|
|
90
|
+
audio_np = audio_np.T
|
|
91
|
+
else:
|
|
92
|
+
audio_np = audio_np.squeeze()
|
|
93
|
+
|
|
94
|
+
return audio_np, int(sr)
|
|
95
|
+
except Exception as exc:
|
|
96
|
+
raise FileError(
|
|
97
|
+
"unable to read audio fragment", audio.source, audio.path
|
|
98
|
+
) from exc
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def audio_fragment_bytes(
|
|
102
|
+
audio: "AudioFile",
|
|
103
|
+
start: float = 0,
|
|
104
|
+
duration: Optional[float] = None,
|
|
105
|
+
format: str = "wav",
|
|
106
|
+
) -> bytes:
|
|
107
|
+
"""Convert audio fragment to bytes using soundfile."""
|
|
108
|
+
y, sr = audio_fragment_np(audio, start, duration)
|
|
109
|
+
|
|
110
|
+
import io
|
|
111
|
+
|
|
112
|
+
import soundfile as sf
|
|
113
|
+
|
|
114
|
+
buffer = io.BytesIO()
|
|
115
|
+
sf.write(buffer, y, sr, format=format)
|
|
116
|
+
return buffer.getvalue()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def save_audio_fragment(
|
|
120
|
+
audio: "AudioFile",
|
|
121
|
+
start: float,
|
|
122
|
+
end: float,
|
|
123
|
+
output: str,
|
|
124
|
+
format: Optional[str] = None,
|
|
125
|
+
) -> "AudioFile":
|
|
126
|
+
"""Save audio fragment with timestamped filename.
|
|
127
|
+
Supports local and remote storage upload."""
|
|
128
|
+
if start < 0 or end < 0 or start >= end:
|
|
129
|
+
raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
|
|
130
|
+
|
|
131
|
+
if format is None:
|
|
132
|
+
format = audio.get_file_ext()
|
|
133
|
+
|
|
134
|
+
duration = end - start
|
|
135
|
+
start_ms = int(start * 1000)
|
|
136
|
+
end_ms = int(end * 1000)
|
|
137
|
+
output_file = posixpath.join(
|
|
138
|
+
output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
try:
|
|
142
|
+
audio_bytes = audio_fragment_bytes(audio, start, duration, format)
|
|
143
|
+
|
|
144
|
+
from datachain.lib.file import AudioFile
|
|
145
|
+
|
|
146
|
+
return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
|
|
147
|
+
|
|
148
|
+
except Exception as exc:
|
|
149
|
+
raise FileError(
|
|
150
|
+
"unable to save audio fragment", audio.source, audio.path
|
|
151
|
+
) from exc
|
|
@@ -9,6 +9,14 @@ def sql_to_python(sql_exp: ColumnElement) -> Any:
|
|
|
9
9
|
type_ = sql_exp.type.python_type
|
|
10
10
|
if type_ == Decimal:
|
|
11
11
|
type_ = float
|
|
12
|
+
elif type_ is list:
|
|
13
|
+
if hasattr(sql_exp.type, "item_type") and hasattr(
|
|
14
|
+
sql_exp.type.item_type, "python_type"
|
|
15
|
+
):
|
|
16
|
+
item_type = getattr(sql_exp.type.item_type, "python_type", Any)
|
|
17
|
+
type_ = list[item_type] # type: ignore[valid-type]
|
|
18
|
+
else:
|
|
19
|
+
type_ = list
|
|
12
20
|
except NotImplementedError:
|
|
13
21
|
type_ = str
|
|
14
22
|
return type_
|
datachain/lib/dc/datachain.py
CHANGED
|
@@ -15,6 +15,7 @@ from typing import (
|
|
|
15
15
|
Optional,
|
|
16
16
|
TypeVar,
|
|
17
17
|
Union,
|
|
18
|
+
cast,
|
|
18
19
|
overload,
|
|
19
20
|
)
|
|
20
21
|
|
|
@@ -39,14 +40,15 @@ from datachain.lib.file import (
|
|
|
39
40
|
FileExporter,
|
|
40
41
|
)
|
|
41
42
|
from datachain.lib.file import ExportPlacement as FileExportPlacement
|
|
43
|
+
from datachain.lib.model_store import ModelStore
|
|
42
44
|
from datachain.lib.settings import Settings
|
|
43
|
-
from datachain.lib.signal_schema import SignalSchema
|
|
45
|
+
from datachain.lib.signal_schema import SignalResolvingError, SignalSchema
|
|
44
46
|
from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
|
|
45
47
|
from datachain.lib.udf_signature import UdfSignature
|
|
46
48
|
from datachain.lib.utils import DataChainColumnError, DataChainParamsError
|
|
47
49
|
from datachain.query import Session
|
|
48
50
|
from datachain.query.dataset import DatasetQuery, PartitionByType
|
|
49
|
-
from datachain.query.schema import DEFAULT_DELIMITER, Column
|
|
51
|
+
from datachain.query.schema import DEFAULT_DELIMITER, Column
|
|
50
52
|
from datachain.sql.functions import path as pathfunc
|
|
51
53
|
from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
|
|
52
54
|
|
|
@@ -758,11 +760,12 @@ class DataChain:
|
|
|
758
760
|
@delta_disabled
|
|
759
761
|
def agg(
|
|
760
762
|
self,
|
|
763
|
+
/,
|
|
761
764
|
func: Optional[Callable] = None,
|
|
762
765
|
partition_by: Optional[PartitionByType] = None,
|
|
763
766
|
params: Union[None, str, Sequence[str]] = None,
|
|
764
767
|
output: OutputType = None,
|
|
765
|
-
**signal_map,
|
|
768
|
+
**signal_map: Callable,
|
|
766
769
|
) -> "Self":
|
|
767
770
|
"""Aggregate rows using `partition_by` statement and apply a function to the
|
|
768
771
|
groups of aggregated rows. The function needs to return new objects for each
|
|
@@ -772,12 +775,28 @@ class DataChain:
|
|
|
772
775
|
|
|
773
776
|
This method bears similarity to `gen()` and `map()`, employing a comparable set
|
|
774
777
|
of parameters, yet differs in two crucial aspects:
|
|
778
|
+
|
|
775
779
|
1. The `partition_by` parameter: This specifies the column name or a list of
|
|
776
780
|
column names that determine the grouping criteria for aggregation.
|
|
777
781
|
2. Group-based UDF function input: Instead of individual rows, the function
|
|
778
|
-
receives a list all rows within each group defined by `partition_by`.
|
|
782
|
+
receives a list of all rows within each group defined by `partition_by`.
|
|
783
|
+
|
|
784
|
+
If `partition_by` is not set or is an empty list, all rows will be placed
|
|
785
|
+
into a single group.
|
|
786
|
+
|
|
787
|
+
Parameters:
|
|
788
|
+
func: Function applied to each group of rows.
|
|
789
|
+
partition_by: Column name(s) to group by. If None, all rows go
|
|
790
|
+
into one group.
|
|
791
|
+
params: List of column names used as input for the function. Default is
|
|
792
|
+
taken from function signature.
|
|
793
|
+
output: Dictionary defining new signals and their corresponding types.
|
|
794
|
+
Default type is taken from function signature.
|
|
795
|
+
**signal_map: kwargs can be used to define `func` together with its return
|
|
796
|
+
signal name in format of `agg(result_column=my_func)`.
|
|
779
797
|
|
|
780
798
|
Examples:
|
|
799
|
+
Basic aggregation with lambda function:
|
|
781
800
|
```py
|
|
782
801
|
chain = chain.agg(
|
|
783
802
|
total=lambda category, amount: [sum(amount)],
|
|
@@ -788,7 +807,6 @@ class DataChain:
|
|
|
788
807
|
```
|
|
789
808
|
|
|
790
809
|
An alternative syntax, when you need to specify a more complex function:
|
|
791
|
-
|
|
792
810
|
```py
|
|
793
811
|
# It automatically resolves which columns to pass to the function
|
|
794
812
|
# by looking at the function signature.
|
|
@@ -806,10 +824,43 @@ class DataChain:
|
|
|
806
824
|
)
|
|
807
825
|
chain.save("new_dataset")
|
|
808
826
|
```
|
|
827
|
+
|
|
828
|
+
Using complex signals for partitioning (`File` or any Pydantic `BaseModel`):
|
|
829
|
+
```py
|
|
830
|
+
def my_agg(files: list[File]) -> Iterator[tuple[File, int]]:
|
|
831
|
+
yield files[0], sum(f.size for f in files)
|
|
832
|
+
|
|
833
|
+
chain = chain.agg(
|
|
834
|
+
my_agg,
|
|
835
|
+
params=("file",),
|
|
836
|
+
output={"file": File, "total": int},
|
|
837
|
+
partition_by="file", # Column referring to all sub-columns of File
|
|
838
|
+
)
|
|
839
|
+
chain.save("new_dataset")
|
|
840
|
+
```
|
|
841
|
+
|
|
842
|
+
Aggregating all rows into a single group (when `partition_by` is not set):
|
|
843
|
+
```py
|
|
844
|
+
chain = chain.agg(
|
|
845
|
+
total_size=lambda file, size: [sum(size)],
|
|
846
|
+
output=int,
|
|
847
|
+
# No partition_by specified - all rows go into one group
|
|
848
|
+
)
|
|
849
|
+
chain.save("new_dataset")
|
|
850
|
+
```
|
|
851
|
+
|
|
852
|
+
Multiple partition columns:
|
|
853
|
+
```py
|
|
854
|
+
chain = chain.agg(
|
|
855
|
+
total=lambda category, subcategory, amount: [sum(amount)],
|
|
856
|
+
output=float,
|
|
857
|
+
partition_by=["category", "subcategory"],
|
|
858
|
+
)
|
|
859
|
+
chain.save("new_dataset")
|
|
860
|
+
```
|
|
809
861
|
"""
|
|
810
|
-
# Convert string partition_by parameters to Column objects
|
|
811
|
-
processed_partition_by = partition_by
|
|
812
862
|
if partition_by is not None:
|
|
863
|
+
# Convert string partition_by parameters to Column objects
|
|
813
864
|
if isinstance(partition_by, (str, Function, ColumnElement)):
|
|
814
865
|
list_partition_by = [partition_by]
|
|
815
866
|
else:
|
|
@@ -818,10 +869,10 @@ class DataChain:
|
|
|
818
869
|
processed_partition_columns: list[ColumnElement] = []
|
|
819
870
|
for col in list_partition_by:
|
|
820
871
|
if isinstance(col, str):
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
processed_partition_columns.
|
|
872
|
+
columns = self.signals_schema.db_signals(name=col, as_columns=True)
|
|
873
|
+
if not columns:
|
|
874
|
+
raise SignalResolvingError([col], "is not found")
|
|
875
|
+
processed_partition_columns.extend(cast("list[Column]", columns))
|
|
825
876
|
elif isinstance(col, Function):
|
|
826
877
|
column = col.get_column(self.signals_schema)
|
|
827
878
|
processed_partition_columns.append(column)
|
|
@@ -830,6 +881,8 @@ class DataChain:
|
|
|
830
881
|
processed_partition_columns.append(col)
|
|
831
882
|
|
|
832
883
|
processed_partition_by = processed_partition_columns
|
|
884
|
+
else:
|
|
885
|
+
processed_partition_by = []
|
|
833
886
|
|
|
834
887
|
udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
|
|
835
888
|
return self._evolve(
|
|
@@ -969,7 +1022,7 @@ class DataChain:
|
|
|
969
1022
|
)
|
|
970
1023
|
|
|
971
1024
|
@delta_disabled # type: ignore[arg-type]
|
|
972
|
-
def group_by(
|
|
1025
|
+
def group_by( # noqa: C901, PLR0912
|
|
973
1026
|
self,
|
|
974
1027
|
*,
|
|
975
1028
|
partition_by: Optional[Union[str, Func, Sequence[Union[str, Func]]]] = None,
|
|
@@ -988,6 +1041,15 @@ class DataChain:
|
|
|
988
1041
|
partition_by=("file_source", "file_ext"),
|
|
989
1042
|
)
|
|
990
1043
|
```
|
|
1044
|
+
|
|
1045
|
+
Using complex signals:
|
|
1046
|
+
```py
|
|
1047
|
+
chain = chain.group_by(
|
|
1048
|
+
total_size=func.sum("file.size"),
|
|
1049
|
+
count=func.count(),
|
|
1050
|
+
partition_by="file", # Uses column name, expands to File's unique keys
|
|
1051
|
+
)
|
|
1052
|
+
```
|
|
991
1053
|
"""
|
|
992
1054
|
if partition_by is None:
|
|
993
1055
|
partition_by = []
|
|
@@ -998,20 +1060,61 @@ class DataChain:
|
|
|
998
1060
|
signal_columns: list[Column] = []
|
|
999
1061
|
schema_fields: dict[str, DataType] = {}
|
|
1000
1062
|
keep_columns: list[str] = []
|
|
1063
|
+
partial_fields: list[str] = [] # Track specific fields for partial creation
|
|
1064
|
+
schema_partition_by: list[str] = []
|
|
1001
1065
|
|
|
1002
|
-
# validate partition_by columns and add them to the schema
|
|
1003
1066
|
for col in partition_by:
|
|
1004
1067
|
if isinstance(col, str):
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1068
|
+
columns = self.signals_schema.db_signals(name=col, as_columns=True)
|
|
1069
|
+
if not columns:
|
|
1070
|
+
raise SignalResolvingError([col], "is not found")
|
|
1071
|
+
partition_by_columns.extend(cast("list[Column]", columns))
|
|
1072
|
+
|
|
1073
|
+
# For nested field references (e.g., "nested.level1.name"),
|
|
1074
|
+
# we need to distinguish between:
|
|
1075
|
+
# 1. References to fields within a complex signal (create partials)
|
|
1076
|
+
# 2. Deep nested references that should be flattened
|
|
1077
|
+
if "." in col:
|
|
1078
|
+
# Split the column reference to analyze it
|
|
1079
|
+
parts = col.split(".")
|
|
1080
|
+
parent_signal = parts[0]
|
|
1081
|
+
parent_type = self.signals_schema.values.get(parent_signal)
|
|
1082
|
+
|
|
1083
|
+
if ModelStore.is_partial(parent_type):
|
|
1084
|
+
if parent_signal not in keep_columns:
|
|
1085
|
+
keep_columns.append(parent_signal)
|
|
1086
|
+
partial_fields.append(col)
|
|
1087
|
+
schema_partition_by.append(col)
|
|
1088
|
+
else:
|
|
1089
|
+
# BaseModel or other - add flattened columns directly
|
|
1090
|
+
for column in cast("list[Column]", columns):
|
|
1091
|
+
col_type = self.signals_schema.get_column_type(column.name)
|
|
1092
|
+
schema_fields[column.name] = col_type
|
|
1093
|
+
schema_partition_by.append(col)
|
|
1094
|
+
else:
|
|
1095
|
+
# no field specifier - but we need to check if it's a complex signal
|
|
1096
|
+
# if it is a complex signal, only include the columns used for partitioning
|
|
1097
|
+
col_type = self.signals_schema.get_column_type(
|
|
1098
|
+
col, with_subtree=True
|
|
1099
|
+
)
|
|
1100
|
+
if isinstance(col_type, type) and issubclass(col_type, BaseModel):
|
|
1101
|
+
# Complex signal - add only the partitioning columns
|
|
1102
|
+
for column in cast("list[Column]", columns):
|
|
1103
|
+
col_type = self.signals_schema.get_column_type(column.name)
|
|
1104
|
+
schema_fields[column.name] = col_type
|
|
1105
|
+
schema_partition_by.append(col)
|
|
1106
|
+
# Simple signal - keep the entire signal
|
|
1107
|
+
else:
|
|
1108
|
+
if col not in keep_columns:
|
|
1109
|
+
keep_columns.append(col)
|
|
1110
|
+
schema_partition_by.append(col)
|
|
1010
1111
|
elif isinstance(col, Function):
|
|
1011
1112
|
column = col.get_column(self.signals_schema)
|
|
1012
1113
|
col_db_name = column.name
|
|
1013
1114
|
col_type = column.type.python_type
|
|
1014
1115
|
schema_fields[col_db_name] = col_type
|
|
1116
|
+
partition_by_columns.append(column)
|
|
1117
|
+
signal_columns.append(column)
|
|
1015
1118
|
else:
|
|
1016
1119
|
raise DataChainColumnError(
|
|
1017
1120
|
col,
|
|
@@ -1020,9 +1123,7 @@ class DataChain:
|
|
|
1020
1123
|
" but expected str or Function"
|
|
1021
1124
|
),
|
|
1022
1125
|
)
|
|
1023
|
-
partition_by_columns.append(column)
|
|
1024
1126
|
|
|
1025
|
-
# validate signal columns and add them to the schema
|
|
1026
1127
|
if not kwargs:
|
|
1027
1128
|
raise ValueError("At least one column should be provided for group_by")
|
|
1028
1129
|
for col_name, func in kwargs.items():
|
|
@@ -1035,9 +1136,9 @@ class DataChain:
|
|
|
1035
1136
|
signal_columns.append(column)
|
|
1036
1137
|
schema_fields[col_name] = func.get_result_type(self.signals_schema)
|
|
1037
1138
|
|
|
1038
|
-
signal_schema =
|
|
1039
|
-
|
|
1040
|
-
|
|
1139
|
+
signal_schema = self.signals_schema.group_by(
|
|
1140
|
+
schema_partition_by, signal_columns
|
|
1141
|
+
)
|
|
1041
1142
|
|
|
1042
1143
|
return self._evolve(
|
|
1043
1144
|
query=self._query.group_by(signal_columns, partition_by_columns),
|
|
@@ -1166,6 +1267,7 @@ class DataChain:
|
|
|
1166
1267
|
db_signals = self._effective_signals_schema.db_signals(
|
|
1167
1268
|
include_hidden=include_hidden
|
|
1168
1269
|
)
|
|
1270
|
+
|
|
1169
1271
|
with self._query.ordered_select(*db_signals).as_iterable() as rows:
|
|
1170
1272
|
if row_factory:
|
|
1171
1273
|
rows = (row_factory(db_signals, r) for r in rows) # type: ignore[assignment]
|
datachain/lib/dc/datasets.py
CHANGED
|
@@ -376,7 +376,7 @@ def move_dataset(
|
|
|
376
376
|
the namespace and project, or a regular name. If a regular name is used,
|
|
377
377
|
default values will be applied. The source dataset will no longer exist
|
|
378
378
|
after the move.
|
|
379
|
-
|
|
379
|
+
dest: The destination dataset name. This can also be a fully qualified
|
|
380
380
|
name with a namespace and project, or just a regular name (default values
|
|
381
381
|
will be used in that case). The original dataset will be moved here.
|
|
382
382
|
session: An optional session instance. If not provided, the default session
|
datachain/lib/file.py
CHANGED
|
@@ -43,7 +43,7 @@ logger = logging.getLogger("datachain")
|
|
|
43
43
|
# how to create file path when exporting
|
|
44
44
|
ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
|
|
45
45
|
|
|
46
|
-
FileType = Literal["binary", "text", "image", "video"]
|
|
46
|
+
FileType = Literal["binary", "text", "image", "video", "audio"]
|
|
47
47
|
EXPORT_FILES_MAX_THREADS = 5
|
|
48
48
|
|
|
49
49
|
|
|
@@ -312,6 +312,14 @@ class File(DataModel):
|
|
|
312
312
|
file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
|
|
313
313
|
return file
|
|
314
314
|
|
|
315
|
+
def as_audio_file(self) -> "AudioFile":
|
|
316
|
+
"""Convert the file to an `AudioFile` object."""
|
|
317
|
+
if isinstance(self, AudioFile):
|
|
318
|
+
return self
|
|
319
|
+
file = AudioFile(**self.model_dump())
|
|
320
|
+
file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
|
|
321
|
+
return file
|
|
322
|
+
|
|
315
323
|
@classmethod
|
|
316
324
|
def upload(
|
|
317
325
|
cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
|
|
@@ -851,6 +859,157 @@ class VideoFile(File):
|
|
|
851
859
|
start += duration
|
|
852
860
|
|
|
853
861
|
|
|
862
|
+
class AudioFile(File):
|
|
863
|
+
"""
|
|
864
|
+
A data model for handling audio files.
|
|
865
|
+
|
|
866
|
+
This model inherits from the `File` model and provides additional functionality
|
|
867
|
+
for reading audio files, extracting audio fragments, and splitting audio into
|
|
868
|
+
fragments.
|
|
869
|
+
"""
|
|
870
|
+
|
|
871
|
+
def get_info(self) -> "Audio":
|
|
872
|
+
"""
|
|
873
|
+
Retrieves metadata and information about the audio file. It does not
|
|
874
|
+
download the file if possible, only reads its header. It thus might be
|
|
875
|
+
a good idea to disable caching and prefetching for UDF if you only need
|
|
876
|
+
audio metadata.
|
|
877
|
+
|
|
878
|
+
Returns:
|
|
879
|
+
Audio: A Model containing audio metadata such as duration,
|
|
880
|
+
sample rate, channels, and codec details.
|
|
881
|
+
"""
|
|
882
|
+
from .audio import audio_info
|
|
883
|
+
|
|
884
|
+
return audio_info(self)
|
|
885
|
+
|
|
886
|
+
def get_fragment(self, start: float, end: float) -> "AudioFragment":
|
|
887
|
+
"""
|
|
888
|
+
Returns an audio fragment from the specified time range. It does not
|
|
889
|
+
download the file, nor does it actually extract the fragment. It returns
|
|
890
|
+
a Model representing the audio fragment, which can be used to read or save
|
|
891
|
+
it later.
|
|
892
|
+
|
|
893
|
+
Args:
|
|
894
|
+
start (float): The start time of the fragment in seconds.
|
|
895
|
+
end (float): The end time of the fragment in seconds.
|
|
896
|
+
|
|
897
|
+
Returns:
|
|
898
|
+
AudioFragment: A Model representing the audio fragment.
|
|
899
|
+
"""
|
|
900
|
+
if start < 0 or end < 0 or start >= end:
|
|
901
|
+
raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
|
|
902
|
+
|
|
903
|
+
return AudioFragment(audio=self, start=start, end=end)
|
|
904
|
+
|
|
905
|
+
def get_fragments(
|
|
906
|
+
self,
|
|
907
|
+
duration: float,
|
|
908
|
+
start: float = 0,
|
|
909
|
+
end: Optional[float] = None,
|
|
910
|
+
) -> "Iterator[AudioFragment]":
|
|
911
|
+
"""
|
|
912
|
+
Splits the audio into multiple fragments of a specified duration.
|
|
913
|
+
|
|
914
|
+
Args:
|
|
915
|
+
duration (float): The duration of each audio fragment in seconds.
|
|
916
|
+
start (float): The starting time in seconds (default: 0).
|
|
917
|
+
end (float, optional): The ending time in seconds. If None, the entire
|
|
918
|
+
remaining audio is processed (default: None).
|
|
919
|
+
|
|
920
|
+
Returns:
|
|
921
|
+
Iterator[AudioFragment]: An iterator yielding audio fragments.
|
|
922
|
+
|
|
923
|
+
Note:
|
|
924
|
+
If end is not specified, number of samples will be taken from the
|
|
925
|
+
audio file; this means the audio file needs to be downloaded.
|
|
926
|
+
"""
|
|
927
|
+
if duration <= 0:
|
|
928
|
+
raise ValueError("duration must be a positive float")
|
|
929
|
+
if start < 0:
|
|
930
|
+
raise ValueError("start must be a non-negative float")
|
|
931
|
+
|
|
932
|
+
if end is None:
|
|
933
|
+
end = self.get_info().duration
|
|
934
|
+
|
|
935
|
+
if end < 0:
|
|
936
|
+
raise ValueError("end must be a non-negative float")
|
|
937
|
+
if start >= end:
|
|
938
|
+
raise ValueError("start must be less than end")
|
|
939
|
+
|
|
940
|
+
while start < end:
|
|
941
|
+
yield self.get_fragment(start, min(start + duration, end))
|
|
942
|
+
start += duration
|
|
943
|
+
|
|
944
|
+
|
|
945
|
+
class AudioFragment(DataModel):
|
|
946
|
+
"""
|
|
947
|
+
A data model for representing an audio fragment.
|
|
948
|
+
|
|
949
|
+
This model represents a specific fragment within an audio file with defined
|
|
950
|
+
start and end times. It allows access to individual fragments and provides
|
|
951
|
+
functionality for reading and saving audio fragments as separate audio files.
|
|
952
|
+
|
|
953
|
+
Attributes:
|
|
954
|
+
audio (AudioFile): The audio file containing the audio fragment.
|
|
955
|
+
start (float): The starting time of the audio fragment in seconds.
|
|
956
|
+
end (float): The ending time of the audio fragment in seconds.
|
|
957
|
+
"""
|
|
958
|
+
|
|
959
|
+
audio: AudioFile
|
|
960
|
+
start: float
|
|
961
|
+
end: float
|
|
962
|
+
|
|
963
|
+
def get_np(self) -> tuple["ndarray", int]:
|
|
964
|
+
"""
|
|
965
|
+
Returns the audio fragment as a NumPy array with sample rate.
|
|
966
|
+
|
|
967
|
+
Returns:
|
|
968
|
+
tuple[ndarray, int]: A tuple containing the audio data as a NumPy array
|
|
969
|
+
and the sample rate.
|
|
970
|
+
"""
|
|
971
|
+
from .audio import audio_fragment_np
|
|
972
|
+
|
|
973
|
+
duration = self.end - self.start
|
|
974
|
+
return audio_fragment_np(self.audio, self.start, duration)
|
|
975
|
+
|
|
976
|
+
def read_bytes(self, format: str = "wav") -> bytes:
|
|
977
|
+
"""
|
|
978
|
+
Returns the audio fragment as audio bytes.
|
|
979
|
+
|
|
980
|
+
Args:
|
|
981
|
+
format (str): The desired audio format (e.g., 'wav', 'mp3').
|
|
982
|
+
Defaults to 'wav'.
|
|
983
|
+
|
|
984
|
+
Returns:
|
|
985
|
+
bytes: The encoded audio fragment as bytes.
|
|
986
|
+
"""
|
|
987
|
+
from .audio import audio_fragment_bytes
|
|
988
|
+
|
|
989
|
+
duration = self.end - self.start
|
|
990
|
+
return audio_fragment_bytes(self.audio, self.start, duration, format)
|
|
991
|
+
|
|
992
|
+
def save(self, output: str, format: Optional[str] = None) -> "AudioFile":
|
|
993
|
+
"""
|
|
994
|
+
Saves the audio fragment as a new audio file.
|
|
995
|
+
|
|
996
|
+
If `output` is a remote path, the audio file will be uploaded to remote storage.
|
|
997
|
+
|
|
998
|
+
Args:
|
|
999
|
+
output (str): The destination path, which can be a local file path
|
|
1000
|
+
or a remote URL.
|
|
1001
|
+
format (str, optional): The output audio format (e.g., 'wav', 'mp3').
|
|
1002
|
+
If None, the format is inferred from the
|
|
1003
|
+
file extension.
|
|
1004
|
+
|
|
1005
|
+
Returns:
|
|
1006
|
+
AudioFile: A Model representing the saved audio file.
|
|
1007
|
+
"""
|
|
1008
|
+
from .audio import save_audio_fragment
|
|
1009
|
+
|
|
1010
|
+
return save_audio_fragment(self.audio, self.start, self.end, output, format)
|
|
1011
|
+
|
|
1012
|
+
|
|
854
1013
|
class VideoFrame(DataModel):
|
|
855
1014
|
"""
|
|
856
1015
|
A data model for representing a video frame.
|
|
@@ -981,6 +1140,34 @@ class Video(DataModel):
|
|
|
981
1140
|
codec: str = Field(default="")
|
|
982
1141
|
|
|
983
1142
|
|
|
1143
|
+
class Audio(DataModel):
|
|
1144
|
+
"""
|
|
1145
|
+
A data model representing metadata for an audio file.
|
|
1146
|
+
|
|
1147
|
+
Attributes:
|
|
1148
|
+
sample_rate (int): The sample rate of the audio (samples per second).
|
|
1149
|
+
Defaults to -1 if unknown.
|
|
1150
|
+
channels (int): The number of audio channels. Defaults to -1 if unknown.
|
|
1151
|
+
duration (float): The total duration of the audio in seconds.
|
|
1152
|
+
Defaults to -1.0 if unknown.
|
|
1153
|
+
samples (int): The total number of samples in the audio.
|
|
1154
|
+
Defaults to -1 if unknown.
|
|
1155
|
+
format (str): The format of the audio file (e.g., 'wav', 'mp3').
|
|
1156
|
+
Defaults to an empty string.
|
|
1157
|
+
codec (str): The codec used for encoding the audio. Defaults to an empty string.
|
|
1158
|
+
bit_rate (int): The bit rate of the audio in bits per second.
|
|
1159
|
+
Defaults to -1 if unknown.
|
|
1160
|
+
"""
|
|
1161
|
+
|
|
1162
|
+
sample_rate: int = Field(default=-1)
|
|
1163
|
+
channels: int = Field(default=-1)
|
|
1164
|
+
duration: float = Field(default=-1.0)
|
|
1165
|
+
samples: int = Field(default=-1)
|
|
1166
|
+
format: str = Field(default="")
|
|
1167
|
+
codec: str = Field(default="")
|
|
1168
|
+
bit_rate: int = Field(default=-1)
|
|
1169
|
+
|
|
1170
|
+
|
|
984
1171
|
class ArrowRow(DataModel):
|
|
985
1172
|
"""`DataModel` for reading row from Arrow-supported file."""
|
|
986
1173
|
|
|
@@ -1018,5 +1205,7 @@ def get_file_type(type_: FileType = "binary") -> type[File]:
|
|
|
1018
1205
|
file = ImageFile # type: ignore[assignment]
|
|
1019
1206
|
elif type_ == "video":
|
|
1020
1207
|
file = VideoFile
|
|
1208
|
+
elif type_ == "audio":
|
|
1209
|
+
file = AudioFile
|
|
1021
1210
|
|
|
1022
1211
|
return file
|
datachain/lib/model_store.py
CHANGED
|
@@ -81,3 +81,11 @@ class ModelStore:
|
|
|
81
81
|
if val is None or not ModelStore.is_pydantic(val):
|
|
82
82
|
return None
|
|
83
83
|
return val
|
|
84
|
+
|
|
85
|
+
@staticmethod
|
|
86
|
+
def is_partial(parent_type) -> bool:
|
|
87
|
+
return (
|
|
88
|
+
parent_type
|
|
89
|
+
and ModelStore.is_pydantic(parent_type)
|
|
90
|
+
and "@" in ModelStore.get_name(parent_type)
|
|
91
|
+
)
|
datachain/lib/signal_schema.py
CHANGED
|
@@ -446,14 +446,14 @@ class SignalSchema:
|
|
|
446
446
|
res[db_name] = python_to_sql(type_)
|
|
447
447
|
return res
|
|
448
448
|
|
|
449
|
-
def row_to_objs(self, row: Sequence[Any]) -> list[
|
|
449
|
+
def row_to_objs(self, row: Sequence[Any]) -> list[Any]:
|
|
450
450
|
self._init_setup_values()
|
|
451
451
|
|
|
452
|
-
objs: list[
|
|
452
|
+
objs: list[Any] = []
|
|
453
453
|
pos = 0
|
|
454
454
|
for name, fr_type in self.values.items():
|
|
455
|
-
if self.setup_values and
|
|
456
|
-
objs.append(
|
|
455
|
+
if self.setup_values and name in self.setup_values:
|
|
456
|
+
objs.append(self.setup_values.get(name))
|
|
457
457
|
elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
|
|
458
458
|
j, pos = unflatten_to_json_pos(fr, row, pos)
|
|
459
459
|
objs.append(fr(**j))
|
|
@@ -589,6 +589,9 @@ class SignalSchema:
|
|
|
589
589
|
]
|
|
590
590
|
|
|
591
591
|
if name:
|
|
592
|
+
if "." in name:
|
|
593
|
+
name = name.replace(".", "__")
|
|
594
|
+
|
|
592
595
|
signals = [
|
|
593
596
|
s
|
|
594
597
|
for s in signals
|
|
@@ -625,6 +628,15 @@ class SignalSchema:
|
|
|
625
628
|
|
|
626
629
|
return curr_type
|
|
627
630
|
|
|
631
|
+
def group_by(
|
|
632
|
+
self, partition_by: Sequence[str], new_column: Sequence[Column]
|
|
633
|
+
) -> "SignalSchema":
|
|
634
|
+
orig_schema = SignalSchema(copy.deepcopy(self.values))
|
|
635
|
+
schema = orig_schema.to_partial(*partition_by)
|
|
636
|
+
|
|
637
|
+
vals = {c.name: sql_to_python(c) for c in new_column}
|
|
638
|
+
return SignalSchema(schema.values | vals)
|
|
639
|
+
|
|
628
640
|
def select_except_signals(self, *args: str) -> "SignalSchema":
|
|
629
641
|
def has_signal(signal: str):
|
|
630
642
|
signal = signal.replace(".", DEFAULT_DELIMITER)
|
|
@@ -888,7 +900,7 @@ class SignalSchema:
|
|
|
888
900
|
|
|
889
901
|
return res
|
|
890
902
|
|
|
891
|
-
def to_partial(self, *columns: str) -> "SignalSchema":
|
|
903
|
+
def to_partial(self, *columns: str) -> "SignalSchema": # noqa: C901
|
|
892
904
|
"""
|
|
893
905
|
Convert the schema to a partial schema with only the specified columns.
|
|
894
906
|
|
|
@@ -931,9 +943,15 @@ class SignalSchema:
|
|
|
931
943
|
partial_versions: dict[str, int] = {}
|
|
932
944
|
|
|
933
945
|
def _type_name_to_partial(signal_name: str, type_name: str) -> str:
|
|
934
|
-
if
|
|
946
|
+
# Check if we need to create a partial for this type
|
|
947
|
+
# Only create partials for custom types that are in the custom_types dict
|
|
948
|
+
if type_name not in custom_types:
|
|
935
949
|
return type_name
|
|
936
|
-
|
|
950
|
+
|
|
951
|
+
if "@" in type_name:
|
|
952
|
+
model_name, _ = ModelStore.parse_name_version(type_name)
|
|
953
|
+
else:
|
|
954
|
+
model_name = type_name
|
|
937
955
|
|
|
938
956
|
if signal_name not in signal_partials:
|
|
939
957
|
partial_versions.setdefault(model_name, 0)
|
|
@@ -957,6 +975,14 @@ class SignalSchema:
|
|
|
957
975
|
parent_type_partial = _type_name_to_partial(signal, parent_type)
|
|
958
976
|
|
|
959
977
|
schema[signal] = parent_type_partial
|
|
978
|
+
|
|
979
|
+
# If this is a complex signal without field specifier (just "file")
|
|
980
|
+
# and it's a custom type, include the entire complex signal
|
|
981
|
+
if len(column_parts) == 1 and parent_type in custom_types:
|
|
982
|
+
# Include the entire complex signal - no need to create partial
|
|
983
|
+
schema[signal] = parent_type
|
|
984
|
+
continue
|
|
985
|
+
|
|
960
986
|
continue
|
|
961
987
|
|
|
962
988
|
if parent_type not in custom_types:
|
|
@@ -971,6 +997,20 @@ class SignalSchema:
|
|
|
971
997
|
f"Field {signal} not found in custom type {parent_type}"
|
|
972
998
|
)
|
|
973
999
|
|
|
1000
|
+
# Check if this is the last part and if the column type is a complex
|
|
1001
|
+
is_last_part = i == len(column_parts) - 1
|
|
1002
|
+
is_complex_signal = signal_type in custom_types
|
|
1003
|
+
|
|
1004
|
+
if is_last_part and is_complex_signal:
|
|
1005
|
+
schema[column] = signal_type
|
|
1006
|
+
# Also need to remove the partial schema entry we created for the
|
|
1007
|
+
# parent since we're promoting the nested complex column to root
|
|
1008
|
+
parent_signal = column_parts[0]
|
|
1009
|
+
schema.pop(parent_signal, None)
|
|
1010
|
+
# Don't create partial types for this case
|
|
1011
|
+
break
|
|
1012
|
+
|
|
1013
|
+
# Create partial type for this field
|
|
974
1014
|
partial_type = _type_name_to_partial(
|
|
975
1015
|
".".join(column_parts[: i + 1]),
|
|
976
1016
|
signal_type,
|
datachain/lib/udf.py
CHANGED
|
@@ -13,8 +13,7 @@ from datachain.asyn import AsyncMapper
|
|
|
13
13
|
from datachain.cache import temporary_cache
|
|
14
14
|
from datachain.dataset import RowDict
|
|
15
15
|
from datachain.lib.convert.flatten import flatten
|
|
16
|
-
from datachain.lib.
|
|
17
|
-
from datachain.lib.file import File
|
|
16
|
+
from datachain.lib.file import DataModel, File
|
|
18
17
|
from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
|
|
19
18
|
from datachain.query.batch import (
|
|
20
19
|
Batch,
|
|
@@ -266,15 +265,28 @@ class UDFBase(AbstractUDF):
|
|
|
266
265
|
|
|
267
266
|
def _parse_row(
|
|
268
267
|
self, row_dict: RowDict, catalog: "Catalog", cache: bool, download_cb: Callback
|
|
269
|
-
) -> list[
|
|
268
|
+
) -> list[Any]:
|
|
270
269
|
assert self.params
|
|
271
270
|
row = [row_dict[p] for p in self.params.to_udf_spec()]
|
|
272
271
|
obj_row = self.params.row_to_objs(row)
|
|
273
272
|
for obj in obj_row:
|
|
274
|
-
|
|
275
|
-
obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
|
|
273
|
+
self._set_stream_recursive(obj, catalog, cache, download_cb)
|
|
276
274
|
return obj_row
|
|
277
275
|
|
|
276
|
+
def _set_stream_recursive(
|
|
277
|
+
self, obj: Any, catalog: "Catalog", cache: bool, download_cb: Callback
|
|
278
|
+
) -> None:
|
|
279
|
+
"""Recursively set the catalog stream on all File objects within an object."""
|
|
280
|
+
if isinstance(obj, File):
|
|
281
|
+
obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
|
|
282
|
+
|
|
283
|
+
# Check all fields for nested File objects, but only for DataModel objects
|
|
284
|
+
if isinstance(obj, DataModel):
|
|
285
|
+
for field_name in obj.model_fields:
|
|
286
|
+
field_value = getattr(obj, field_name, None)
|
|
287
|
+
if isinstance(field_value, DataModel):
|
|
288
|
+
self._set_stream_recursive(field_value, catalog, cache, download_cb)
|
|
289
|
+
|
|
278
290
|
def _prepare_row(self, row, udf_fields, catalog, cache, download_cb):
|
|
279
291
|
row_dict = RowDict(zip(udf_fields, row))
|
|
280
292
|
return self._parse_row(row_dict, catalog, cache, download_cb)
|
datachain/query/dataset.py
CHANGED
|
@@ -1031,16 +1031,22 @@ class SQLGroupBy(SQLClause):
|
|
|
1031
1031
|
c.get_column() if isinstance(c, Function) else c for c in self.group_by
|
|
1032
1032
|
]
|
|
1033
1033
|
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
if isinstance(c, Function)
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1034
|
+
cols_dict: dict[str, Any] = {}
|
|
1035
|
+
for c in (*group_by, *self.cols):
|
|
1036
|
+
if isinstance(c, Function):
|
|
1037
|
+
key = c.name
|
|
1038
|
+
value = c.get_column()
|
|
1039
|
+
elif isinstance(c, (str, C)):
|
|
1040
|
+
key = str(c)
|
|
1041
|
+
value = subquery.c[str(c)]
|
|
1042
|
+
else:
|
|
1043
|
+
key = c.name
|
|
1044
|
+
value = c # type: ignore[assignment]
|
|
1045
|
+
cols_dict[key] = value
|
|
1046
|
+
|
|
1047
|
+
unique_cols = cols_dict.values()
|
|
1042
1048
|
|
|
1043
|
-
return sqlalchemy.select(*
|
|
1049
|
+
return sqlalchemy.select(*unique_cols).select_from(subquery).group_by(*group_by)
|
|
1044
1050
|
|
|
1045
1051
|
|
|
1046
1052
|
def _validate_columns(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.26.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -63,6 +63,9 @@ Provides-Extra: torch
|
|
|
63
63
|
Requires-Dist: torch>=2.1.0; extra == "torch"
|
|
64
64
|
Requires-Dist: torchvision; extra == "torch"
|
|
65
65
|
Requires-Dist: transformers>=4.36.0; extra == "torch"
|
|
66
|
+
Provides-Extra: audio
|
|
67
|
+
Requires-Dist: torchaudio; extra == "audio"
|
|
68
|
+
Requires-Dist: soundfile; extra == "audio"
|
|
66
69
|
Provides-Extra: remote
|
|
67
70
|
Requires-Dist: lz4; extra == "remote"
|
|
68
71
|
Requires-Dist: requests>=2.22.0; extra == "remote"
|
|
@@ -78,7 +81,7 @@ Requires-Dist: ffmpeg-python; extra == "video"
|
|
|
78
81
|
Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
|
|
79
82
|
Requires-Dist: opencv-python; extra == "video"
|
|
80
83
|
Provides-Extra: tests
|
|
81
|
-
Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
|
|
84
|
+
Requires-Dist: datachain[audio,hf,remote,torch,vector,video]; extra == "tests"
|
|
82
85
|
Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
83
86
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
84
87
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
@@ -108,6 +111,7 @@ Requires-Dist: accelerate; extra == "examples"
|
|
|
108
111
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
109
112
|
Requires-Dist: ultralytics; extra == "examples"
|
|
110
113
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
114
|
+
Requires-Dist: openai; extra == "examples"
|
|
111
115
|
Dynamic: license-file
|
|
112
116
|
|
|
113
117
|
================
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
datachain/__init__.py,sha256=
|
|
1
|
+
datachain/__init__.py,sha256=2TZ8ptSB9BtnYF31mDEhWG9N16EQ5pf9vNqQaFr2txs,1712
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
|
|
4
4
|
datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
|
|
@@ -71,24 +71,25 @@ datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
|
|
|
71
71
|
datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
|
|
72
72
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
73
73
|
datachain/lib/arrow.py,sha256=hdEQ8I1JgNmEAaXTaqaU1qvZDi5dgtes1IC69ycthz8,10753
|
|
74
|
+
datachain/lib/audio.py,sha256=J7XJ14ItPF9y6pN-tmMV9In9X9rgwlBwzyzdGOUkPGk,4376
|
|
74
75
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
75
76
|
datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
|
|
76
77
|
datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
|
|
77
|
-
datachain/lib/file.py,sha256=
|
|
78
|
+
datachain/lib/file.py,sha256=tHBBacsh1580UPFC6fAINBNwNiyymNgzj89rpsz1LKc,40817
|
|
78
79
|
datachain/lib/hf.py,sha256=_dCoGTv7n5cBgxhCDfZI-t3hnMCXGHd6sEsxRThcizE,5754
|
|
79
80
|
datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
|
|
80
81
|
datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
|
|
81
82
|
datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
|
|
82
83
|
datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
|
|
83
|
-
datachain/lib/model_store.py,sha256=
|
|
84
|
+
datachain/lib/model_store.py,sha256=dkL2rcT5ag-kbgkhQPL_byEs-TCYr29qvdltroL5NxM,2734
|
|
84
85
|
datachain/lib/namespaces.py,sha256=it52UbbwB8dzhesO2pMs_nThXiPQ1Ph9sD9I3GQkg5s,2099
|
|
85
86
|
datachain/lib/projects.py,sha256=8lN0qV8czX1LGtWURCUvRlSJk-RpO9w9Rra_pOZus6g,2595
|
|
86
87
|
datachain/lib/pytorch.py,sha256=oBBd6cxYrcwaFz7IQajKqhGqDdNnwUZWs0wJPRizrjk,7712
|
|
87
88
|
datachain/lib/settings.py,sha256=9wi0FoHxRxNiyn99pR28IYsMkoo47jQxeXuObQr2Ar0,2929
|
|
88
|
-
datachain/lib/signal_schema.py,sha256=
|
|
89
|
+
datachain/lib/signal_schema.py,sha256=UGbjG6yJKIU2i4B6z9AK1rqaPWtxRjsPnCV6GYbNqGg,38329
|
|
89
90
|
datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
|
|
90
91
|
datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
|
|
91
|
-
datachain/lib/udf.py,sha256=
|
|
92
|
+
datachain/lib/udf.py,sha256=nkcB3HNtSteUspwsGmOKyy3mH2F-Sfo6iW64-Ep47-I,17299
|
|
92
93
|
datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
|
|
93
94
|
datachain/lib/utils.py,sha256=rG2y7NwTqZOuomZZRmrA-Q-ANM_j1cToQYqDJoOeGyU,1480
|
|
94
95
|
datachain/lib/video.py,sha256=u6fLJWj5G6QqsVkpfHnKGklBNpG3BRRg6v3izngnNcU,6767
|
|
@@ -97,14 +98,14 @@ datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0E
|
|
|
97
98
|
datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
98
99
|
datachain/lib/convert/flatten.py,sha256=IZFiUYbgXSxXhPSG5Cqf5IjnJ4ZDZKXMr4o_yCR1NY4,1505
|
|
99
100
|
datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djMP3t8CHJLo,3188
|
|
100
|
-
datachain/lib/convert/sql_to_python.py,sha256=
|
|
101
|
+
datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8pyeN1MxRrU,670
|
|
101
102
|
datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
|
|
102
103
|
datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
|
|
103
104
|
datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
|
|
104
105
|
datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
|
|
105
106
|
datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
|
|
106
|
-
datachain/lib/dc/datachain.py,sha256=
|
|
107
|
-
datachain/lib/dc/datasets.py,sha256=
|
|
107
|
+
datachain/lib/dc/datachain.py,sha256=YJYHp94yTWjd_ZmBXEUOHVeEvOb5jOhjIxgtqu1dnW4,91746
|
|
108
|
+
datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
|
|
108
109
|
datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
|
|
109
110
|
datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
|
|
110
111
|
datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
|
|
@@ -125,7 +126,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
|
|
|
125
126
|
datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
|
|
126
127
|
datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
|
|
127
128
|
datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
|
|
128
|
-
datachain/query/dataset.py,sha256=
|
|
129
|
+
datachain/query/dataset.py,sha256=bhJpm53tNLQzGECuR1nC1tg2Vd6foq6AKST5h1rb41U,61606
|
|
129
130
|
datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
|
|
130
131
|
datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
|
|
131
132
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
@@ -157,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
157
158
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
158
159
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
159
160
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
160
|
-
datachain-0.
|
|
161
|
-
datachain-0.
|
|
162
|
-
datachain-0.
|
|
163
|
-
datachain-0.
|
|
164
|
-
datachain-0.
|
|
165
|
-
datachain-0.
|
|
161
|
+
datachain-0.26.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
162
|
+
datachain-0.26.0.dist-info/METADATA,sha256=4-DhUSU6ciIc8iUiB4UAh1ZKyFczvN5rHZnvd1x2Y9U,13543
|
|
163
|
+
datachain-0.26.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
164
|
+
datachain-0.26.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
165
|
+
datachain-0.26.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
166
|
+
datachain-0.26.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|