datachain 0.25.2__py3-none-any.whl → 0.26.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/__init__.py CHANGED
@@ -21,6 +21,9 @@ from datachain.lib.dc import (
21
21
  )
22
22
  from datachain.lib.file import (
23
23
  ArrowRow,
24
+ Audio,
25
+ AudioFile,
26
+ AudioFragment,
24
27
  File,
25
28
  FileError,
26
29
  Image,
@@ -43,6 +46,9 @@ __all__ = [
43
46
  "AbstractUDF",
44
47
  "Aggregator",
45
48
  "ArrowRow",
49
+ "Audio",
50
+ "AudioFile",
51
+ "AudioFragment",
46
52
  "C",
47
53
  "Column",
48
54
  "DataChain",
@@ -18,6 +18,7 @@ WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
18
18
  WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
19
19
  DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
20
20
  DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
21
+ DISTRIBUTED_DISABLED = "DATACHAIN_DISTRIBUTED_DISABLED"
21
22
 
22
23
  IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
23
24
 
@@ -103,6 +104,9 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
103
104
 
104
105
 
105
106
  def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
107
+ if os.environ.get(DISTRIBUTED_DISABLED) == "True":
108
+ return None
109
+
106
110
  if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
107
111
  return None
108
112
 
@@ -16,7 +16,7 @@ from .aggregate import (
16
16
  sum,
17
17
  )
18
18
  from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
19
- from .conditional import and_, case, greatest, ifelse, isnone, least, or_
19
+ from .conditional import and_, case, greatest, ifelse, isnone, least, not_, or_
20
20
  from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
21
21
  from .path import file_ext, file_stem, name, parent
22
22
  from .random import rand
@@ -54,6 +54,7 @@ __all__ = [
54
54
  "max",
55
55
  "min",
56
56
  "name",
57
+ "not_",
57
58
  "or_",
58
59
  "parent",
59
60
  "path",
@@ -3,6 +3,7 @@ from typing import Optional, Union
3
3
  from sqlalchemy import ColumnElement
4
4
  from sqlalchemy import and_ as sql_and
5
5
  from sqlalchemy import case as sql_case
6
+ from sqlalchemy import not_ as sql_not
6
7
  from sqlalchemy import or_ as sql_or
7
8
 
8
9
  from datachain.lib.utils import DataChainParamsError
@@ -288,3 +289,36 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
288
289
  func_args.append(arg)
289
290
 
290
291
  return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
292
+
293
+
def not_(arg: Union[ColumnElement, Func]) -> Func:
    """
    Build a `Func` that applies a logical NOT to a single expression.

    Args:
        arg (ColumnElement | Func): The expression to negate.
            A string or a `Func` is handed to the resulting `Func` through
            its `cols` argument; any other value (for example a column
            expression such as `C("value") == 5`) is handed over through
            its `args` argument.

    Returns:
        Func: A `Func` named "not" whose inner function is SQLAlchemy's `not_`.

    Example:
        ```py
        dc.mutate(
            test=not_(C("value") == 5)
        )
        ```

    Notes:
        - The result type of the returned `Func` is always bool.
    """
    # Exactly one of the two lists receives the argument.
    if isinstance(arg, (str, Func)):
        column_inputs, expression_inputs = [arg], []
    else:
        column_inputs, expression_inputs = [], [arg]

    return Func(
        "not",
        inner=sql_not,
        cols=column_inputs,
        args=expression_inputs,
        result_type=bool,
    )
datachain/lib/audio.py ADDED
@@ -0,0 +1,151 @@
1
+ import posixpath
2
+ from typing import TYPE_CHECKING, Optional, Union
3
+
4
+ from datachain.lib.file import FileError
5
+
6
+ if TYPE_CHECKING:
7
+ from numpy import ndarray
8
+
9
+ from datachain.lib.file import Audio, AudioFile, File
10
+
11
+ try:
12
+ import torchaudio
13
+ except ImportError as exc:
14
+ raise ImportError(
15
+ "Missing dependencies for processing audio.\n"
16
+ "To install run:\n\n"
17
+ " pip install 'datachain[audio]'\n"
18
+ ) from exc
19
+
20
+
def audio_info(file: "Union[File, AudioFile]") -> "Audio":
    """Read an audio file's metadata and return it as an `Audio` object.

    The file is converted with `as_audio_file()`, opened, and inspected with
    `torchaudio.info`. Duration is the frame count divided by the sample rate
    (0.0 when the reported sample rate is not positive); the bit rate is
    bits-per-sample * sample rate * channels.

    Raises:
        FileError: If opening the file or extracting its metadata fails.
    """
    from datachain.lib.file import Audio

    file = file.as_audio_file()

    try:
        with file.open() as stream:
            meta = torchaudio.info(stream)

            rate = int(meta.sample_rate)
            n_channels = int(meta.num_channels)
            n_frames = int(meta.num_frames)

            if rate > 0:
                seconds = float(n_frames / rate)
            else:
                seconds = 0.0

            # These attributes are read defensively: a missing one falls back
            # to an empty string / zero instead of failing.
            container = getattr(meta, "format", "")
            encoding = getattr(meta, "encoding", "")
            bits_per_sample = getattr(meta, "bits_per_sample", 0)
            bits_per_second = bits_per_sample * rate * n_channels

    except Exception as exc:
        raise FileError(
            "unable to extract metadata from audio file", file.source, file.path
        ) from exc

    return Audio(
        sample_rate=rate,
        channels=n_channels,
        duration=seconds,
        samples=n_frames,
        format=container,
        codec=encoding,
        bit_rate=bits_per_second,
    )
55
+
56
+
def audio_fragment_np(
    audio: "AudioFile", start: float = 0, duration: Optional[float] = None
) -> "tuple[ndarray, int]":
    """Load a fragment of an audio file as a numpy array.

    Args:
        audio: Audio file to read. Objects exposing `as_audio_file()` are
            converted with it first.
        start: Offset of the fragment from the beginning of the file, in seconds.
        duration: Length of the fragment in seconds; `None` reads to the end.

    Returns:
        A `(samples, sample_rate)` tuple. Multi-channel audio is transposed to
        `(samples, channels)`; single-channel audio is returned as a 1-D array,
        including when the fragment holds a single frame.

    Raises:
        ValueError: If `start` is negative or `duration` is not positive.
        FileError: If the audio cannot be opened or decoded.
    """
    if start < 0:
        raise ValueError("start must be a non-negative float")

    if duration is not None and duration <= 0:
        raise ValueError("duration must be a positive float")

    if hasattr(audio, "as_audio_file"):
        audio = audio.as_audio_file()

    try:
        with audio.open() as f:
            info = torchaudio.info(f)
            sample_rate = info.sample_rate

            # Seconds -> frames. num_frames=-1 asks torchaudio.load to read
            # everything after frame_offset.
            frame_offset = int(start * sample_rate)
            num_frames = int(duration * sample_rate) if duration is not None else -1

            # torchaudio.info has already read from the stream; rewind so that
            # torchaudio.load decodes from the beginning of the file.
            f.seek(0)

            waveform, sr = torchaudio.load(
                f, frame_offset=frame_offset, num_frames=num_frames
            )

            audio_np = waveform.numpy()

            if audio_np.shape[0] > 1:
                audio_np = audio_np.T
            else:
                # Flatten the channel axis only. A bare squeeze() would also
                # drop the frame axis of a one-frame fragment and yield a 0-d
                # array instead of a 1-D array of length 1.
                audio_np = audio_np.reshape(-1)

            return audio_np, int(sr)
    except Exception as exc:
        raise FileError(
            "unable to read audio fragment", audio.source, audio.path
        ) from exc
99
+
100
+
def audio_fragment_bytes(
    audio: "AudioFile",
    start: float = 0,
    duration: Optional[float] = None,
    format: str = "wav",
) -> bytes:
    """Encode a fragment of an audio file and return the encoded bytes.

    The fragment is decoded with `audio_fragment_np` and then written with
    `soundfile` into an in-memory buffer, using `format` as the output format.
    """
    samples, rate = audio_fragment_np(audio, start, duration)

    # Imported lazily, after decoding, right where they are first needed.
    import io

    import soundfile as sf

    sink = io.BytesIO()
    sf.write(sink, samples, rate, format=format)
    return sink.getvalue()
117
+
118
+
def save_audio_fragment(
    audio: "AudioFile",
    start: float,
    end: float,
    output: str,
    format: Optional[str] = None,
) -> "AudioFile":
    """Cut the `[start, end)` range out of `audio` and upload it under `output`.

    The uploaded file is named `<stem>_<start ms>_<end ms>.<format>`, with both
    millisecond values zero-padded to at least six digits. When `format` is not
    given, the extension of the source audio file is used.

    Raises:
        ValueError: If `start` or `end` is negative, or `start` is not before `end`.
        FileError: If encoding or uploading the fragment fails.
    """
    if start < 0 or end < 0 or start >= end:
        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")

    extension = audio.get_file_ext() if format is None else format

    length = end - start
    first_ms = int(start * 1000)
    last_ms = int(end * 1000)
    fragment_name = (
        f"{audio.get_file_stem()}_{first_ms:06d}_{last_ms:06d}.{extension}"
    )
    destination = posixpath.join(output, fragment_name)

    try:
        payload = audio_fragment_bytes(audio, start, length, extension)

        from datachain.lib.file import AudioFile

        return AudioFile.upload(payload, destination, catalog=audio._catalog)

    except Exception as exc:
        raise FileError(
            "unable to save audio fragment", audio.source, audio.path
        ) from exc
@@ -9,6 +9,14 @@ def sql_to_python(sql_exp: ColumnElement) -> Any:
9
9
  type_ = sql_exp.type.python_type
10
10
  if type_ == Decimal:
11
11
  type_ = float
12
+ elif type_ is list:
13
+ if hasattr(sql_exp.type, "item_type") and hasattr(
14
+ sql_exp.type.item_type, "python_type"
15
+ ):
16
+ item_type = getattr(sql_exp.type.item_type, "python_type", Any)
17
+ type_ = list[item_type] # type: ignore[valid-type]
18
+ else:
19
+ type_ = list
12
20
  except NotImplementedError:
13
21
  type_ = str
14
22
  return type_