datachain 0.25.2__py3-none-any.whl → 0.26.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +6 -0
- datachain/catalog/loader.py +4 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +34 -0
- datachain/lib/audio.py +151 -0
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/dc/datachain.py +227 -67
- datachain/lib/file.py +190 -1
- datachain/lib/model_store.py +8 -0
- datachain/lib/pytorch.py +4 -1
- datachain/lib/signal_schema.py +56 -11
- datachain/lib/udf.py +17 -5
- datachain/query/dataset.py +37 -9
- {datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/METADATA +6 -2
- {datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/RECORD +19 -18
- {datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/WHEEL +0 -0
- {datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
|
@@ -21,6 +21,9 @@ from datachain.lib.dc import (
|
|
|
21
21
|
)
|
|
22
22
|
from datachain.lib.file import (
|
|
23
23
|
ArrowRow,
|
|
24
|
+
Audio,
|
|
25
|
+
AudioFile,
|
|
26
|
+
AudioFragment,
|
|
24
27
|
File,
|
|
25
28
|
FileError,
|
|
26
29
|
Image,
|
|
@@ -43,6 +46,9 @@ __all__ = [
|
|
|
43
46
|
"AbstractUDF",
|
|
44
47
|
"Aggregator",
|
|
45
48
|
"ArrowRow",
|
|
49
|
+
"Audio",
|
|
50
|
+
"AudioFile",
|
|
51
|
+
"AudioFragment",
|
|
46
52
|
"C",
|
|
47
53
|
"Column",
|
|
48
54
|
"DataChain",
|
datachain/catalog/loader.py
CHANGED
|
@@ -18,6 +18,7 @@ WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
|
|
|
18
18
|
WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
|
|
19
19
|
DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
|
|
20
20
|
DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
|
|
21
|
+
DISTRIBUTED_DISABLED = "DATACHAIN_DISTRIBUTED_DISABLED"
|
|
21
22
|
|
|
22
23
|
IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
|
|
23
24
|
|
|
@@ -103,6 +104,9 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
|
|
|
103
104
|
|
|
104
105
|
|
|
105
106
|
def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
|
|
107
|
+
if os.environ.get(DISTRIBUTED_DISABLED) == "True":
|
|
108
|
+
return None
|
|
109
|
+
|
|
106
110
|
if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
|
|
107
111
|
return None
|
|
108
112
|
|
datachain/func/__init__.py
CHANGED
|
@@ -16,7 +16,7 @@ from .aggregate import (
|
|
|
16
16
|
sum,
|
|
17
17
|
)
|
|
18
18
|
from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
|
|
19
|
-
from .conditional import and_, case, greatest, ifelse, isnone, least, or_
|
|
19
|
+
from .conditional import and_, case, greatest, ifelse, isnone, least, not_, or_
|
|
20
20
|
from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
|
|
21
21
|
from .path import file_ext, file_stem, name, parent
|
|
22
22
|
from .random import rand
|
|
@@ -54,6 +54,7 @@ __all__ = [
|
|
|
54
54
|
"max",
|
|
55
55
|
"min",
|
|
56
56
|
"name",
|
|
57
|
+
"not_",
|
|
57
58
|
"or_",
|
|
58
59
|
"parent",
|
|
59
60
|
"path",
|
datachain/func/conditional.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Optional, Union
|
|
|
3
3
|
from sqlalchemy import ColumnElement
|
|
4
4
|
from sqlalchemy import and_ as sql_and
|
|
5
5
|
from sqlalchemy import case as sql_case
|
|
6
|
+
from sqlalchemy import not_ as sql_not
|
|
6
7
|
from sqlalchemy import or_ as sql_or
|
|
7
8
|
|
|
8
9
|
from datachain.lib.utils import DataChainParamsError
|
|
@@ -288,3 +289,36 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
|
|
|
288
289
|
func_args.append(arg)
|
|
289
290
|
|
|
290
291
|
return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def not_(arg: Union[ColumnElement, Func]) -> Func:
    """
    Build a `Func` that applies logical NOT to the given expression.

    Args:
        arg (ColumnElement | Func): The expression to negate.
            A string is treated as the name of a column.
            A Column is treated as a column in the dataset.
            A Func is treated as a function returning a value.

    Returns:
        Func: A `Func` object that represents the NOT function.

    Example:
        ```py
        dc.mutate(
            test=not_(C("value") == 5)
        )
        ```

    Notes:
        - The result column will always be of type bool.
    """
    # Strings and Funcs are handed over as columns; any other expression is
    # handed over as a plain argument.
    as_column = isinstance(arg, (str, Func))
    cols = [arg] if as_column else []
    func_args = [] if as_column else [arg]

    return Func("not", inner=sql_not, cols=cols, args=func_args, result_type=bool)
|
datachain/lib/audio.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import posixpath
|
|
2
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
3
|
+
|
|
4
|
+
from datachain.lib.file import FileError
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from numpy import ndarray
|
|
8
|
+
|
|
9
|
+
from datachain.lib.file import Audio, AudioFile, File
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import torchaudio
|
|
13
|
+
except ImportError as exc:
|
|
14
|
+
raise ImportError(
|
|
15
|
+
"Missing dependencies for processing audio.\n"
|
|
16
|
+
"To install run:\n\n"
|
|
17
|
+
" pip install 'datachain[audio]'\n"
|
|
18
|
+
) from exc
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def audio_info(file: "Union[File, AudioFile]") -> "Audio":
    """Inspect an audio file with torchaudio and return an `Audio` record.

    The input is first converted with `as_audio_file()`. Sample rate, channel
    count and frame count are read from `torchaudio.info`. Duration is
    frames / sample_rate (0.0 when the sample rate is not positive) and the
    bit rate is bits_per_sample * sample_rate * channels.

    Raises:
        FileError: if opening the file or reading its metadata fails.
    """
    from datachain.lib.file import Audio

    file = file.as_audio_file()

    try:
        with file.open() as stream:
            meta = torchaudio.info(stream)

            rate = int(meta.sample_rate)
            n_channels = int(meta.num_channels)
            n_frames = int(meta.num_frames)

            if rate > 0:
                length = float(n_frames / rate)
            else:
                length = 0.0

            # These attributes are looked up defensively; missing ones fall
            # back to an empty string / zero.
            container = getattr(meta, "format", "")
            encoding = getattr(meta, "encoding", "")
            bits = getattr(meta, "bits_per_sample", 0)
            total_bit_rate = bits * rate * n_channels

    except Exception as exc:
        raise FileError(
            "unable to extract metadata from audio file", file.source, file.path
        ) from exc

    return Audio(
        sample_rate=rate,
        channels=n_channels,
        duration=length,
        samples=n_frames,
        format=container,
        codec=encoding,
        bit_rate=total_bit_rate,
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def audio_fragment_np(
    audio: "AudioFile", start: float = 0, duration: Optional[float] = None
) -> "tuple[ndarray, int]":
    """Read a slice of an audio file and return it as a numpy array.

    Args:
        audio: The audio file to read; converted via `as_audio_file()` when
            the object provides that method.
        start: Offset of the fragment in seconds; must not be negative.
        duration: Length of the fragment in seconds; must be positive when
            given. When omitted, -1 is passed to torchaudio as the frame count.

    Returns:
        A `(samples, sample_rate)` tuple. Multi-channel audio is transposed
        to (samples, channels); otherwise the array is squeezed.

    Raises:
        ValueError: if `start` is negative or `duration` is not positive.
        FileError: if the file cannot be opened or decoded.
    """
    if start < 0:
        raise ValueError("start must be a non-negative float")
    if duration is not None and duration <= 0:
        raise ValueError("duration must be a positive float")

    if hasattr(audio, "as_audio_file"):
        audio = audio.as_audio_file()

    try:
        with audio.open() as stream:
            native_rate = torchaudio.info(stream).sample_rate

            # Convert the time window from seconds to frames.
            offset = int(start * native_rate)
            if duration is None:
                frame_count = -1
            else:
                frame_count = int(duration * native_rate)

            # Probing the metadata consumed the stream, so rewind before the
            # actual decode to read from the correct position.
            stream.seek(0)

            tensor, loaded_rate = torchaudio.load(
                stream, frame_offset=offset, num_frames=frame_count
            )

            samples = tensor.numpy()
            has_many_channels = samples.shape[0] > 1
            samples = samples.T if has_many_channels else samples.squeeze()

            return samples, int(loaded_rate)
    except Exception as exc:
        raise FileError(
            "unable to read audio fragment", audio.source, audio.path
        ) from exc
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def audio_fragment_bytes(
    audio: "AudioFile",
    start: float = 0,
    duration: Optional[float] = None,
    format: str = "wav",
) -> bytes:
    """Encode an audio fragment into bytes using soundfile.

    The fragment is loaded with `audio_fragment_np(audio, start, duration)`
    and written to an in-memory buffer in the requested `format`; the buffer
    contents are returned.
    """
    samples, rate = audio_fragment_np(audio, start, duration)

    # Imported here, after the fragment has been read, so these modules are
    # only needed once encoding actually happens.
    import io

    import soundfile as sf

    with io.BytesIO() as sink:
        sf.write(sink, samples, rate, format=format)
        return sink.getvalue()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def save_audio_fragment(
    audio: "AudioFile",
    start: float,
    end: float,
    output: str,
    format: Optional[str] = None,
) -> "AudioFile":
    """Cut the [start, end) window out of `audio` and upload it as a new file.

    The target name is `<stem>_<start_ms>_<end_ms>.<format>` placed under
    `output`, with both timestamps in zero-padded milliseconds. When `format`
    is not given, the extension of the source file is used. The encoded bytes
    are stored through `AudioFile.upload` with the source file's catalog.

    Raises:
        ValueError: if either bound is negative or `start` is not before `end`.
        FileError: if encoding or uploading the fragment fails.
    """
    if start < 0 or end < 0 or start >= end:
        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")

    # Fall back to the source file's extension when no format is requested.
    ext = audio.get_file_ext() if format is None else format

    span = end - start
    first_ms = int(start * 1000)
    last_ms = int(end * 1000)
    filename = f"{audio.get_file_stem()}_{first_ms:06d}_{last_ms:06d}.{ext}"
    target = posixpath.join(output, filename)

    try:
        payload = audio_fragment_bytes(audio, start, span, ext)

        from datachain.lib.file import AudioFile

        return AudioFile.upload(payload, target, catalog=audio._catalog)

    except Exception as exc:
        raise FileError(
            "unable to save audio fragment", audio.source, audio.path
        ) from exc
|
datachain/lib/convert/sql_to_python.py
CHANGED
|
@@ -9,6 +9,14 @@ def sql_to_python(sql_exp: ColumnElement) -> Any:
|
|
|
9
9
|
type_ = sql_exp.type.python_type
|
|
10
10
|
if type_ == Decimal:
|
|
11
11
|
type_ = float
|
|
12
|
+
elif type_ is list:
|
|
13
|
+
if hasattr(sql_exp.type, "item_type") and hasattr(
|
|
14
|
+
sql_exp.type.item_type, "python_type"
|
|
15
|
+
):
|
|
16
|
+
item_type = getattr(sql_exp.type.item_type, "python_type", Any)
|
|
17
|
+
type_ = list[item_type] # type: ignore[valid-type]
|
|
18
|
+
else:
|
|
19
|
+
type_ = list
|
|
12
20
|
except NotImplementedError:
|
|
13
21
|
type_ = str
|
|
14
22
|
return type_
|