lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. lattifai/__init__.py +61 -47
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/alignment/lattice1_worker.py +185 -0
  5. lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/gemini_reader.py +30 -30
  12. lattifai/{io → caption}/gemini_writer.py +17 -17
  13. lattifai/{io → caption}/supervision.py +4 -3
  14. lattifai/caption/text_parser.py +145 -0
  15. lattifai/cli/__init__.py +17 -0
  16. lattifai/cli/alignment.py +153 -0
  17. lattifai/cli/caption.py +204 -0
  18. lattifai/cli/server.py +19 -0
  19. lattifai/cli/transcribe.py +197 -0
  20. lattifai/cli/youtube.py +128 -0
  21. lattifai/client.py +460 -251
  22. lattifai/config/__init__.py +20 -0
  23. lattifai/config/alignment.py +73 -0
  24. lattifai/config/caption.py +178 -0
  25. lattifai/config/client.py +46 -0
  26. lattifai/config/diarization.py +67 -0
  27. lattifai/config/media.py +335 -0
  28. lattifai/config/transcription.py +84 -0
  29. lattifai/diarization/__init__.py +5 -0
  30. lattifai/diarization/lattifai.py +89 -0
  31. lattifai/errors.py +98 -91
  32. lattifai/logging.py +116 -0
  33. lattifai/mixin.py +552 -0
  34. lattifai/server/app.py +420 -0
  35. lattifai/transcription/__init__.py +76 -0
  36. lattifai/transcription/base.py +108 -0
  37. lattifai/transcription/gemini.py +219 -0
  38. lattifai/transcription/lattifai.py +103 -0
  39. lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
  40. lattifai/types.py +30 -0
  41. lattifai/utils.py +16 -44
  42. lattifai/workflow/__init__.py +22 -0
  43. lattifai/workflow/agents.py +6 -0
  44. lattifai/{workflows → workflow}/base.py +22 -22
  45. lattifai/{workflows → workflow}/file_manager.py +239 -215
  46. lattifai/workflow/youtube.py +564 -0
  47. lattifai-1.0.0.dist-info/METADATA +736 -0
  48. lattifai-1.0.0.dist-info/RECORD +52 -0
  49. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  50. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  51. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
  52. lattifai/base_client.py +0 -126
  53. lattifai/bin/__init__.py +0 -3
  54. lattifai/bin/agent.py +0 -325
  55. lattifai/bin/align.py +0 -296
  56. lattifai/bin/cli_base.py +0 -25
  57. lattifai/bin/subtitle.py +0 -210
  58. lattifai/io/__init__.py +0 -42
  59. lattifai/io/reader.py +0 -85
  60. lattifai/io/text_parser.py +0 -75
  61. lattifai/io/utils.py +0 -15
  62. lattifai/io/writer.py +0 -90
  63. lattifai/tokenizer/__init__.py +0 -3
  64. lattifai/workers/__init__.py +0 -3
  65. lattifai/workers/lattice1_alpha.py +0 -284
  66. lattifai/workflows/__init__.py +0 -34
  67. lattifai/workflows/agents.py +0 -10
  68. lattifai/workflows/gemini.py +0 -167
  69. lattifai/workflows/prompts/README.md +0 -22
  70. lattifai/workflows/prompts/gemini/README.md +0 -24
  71. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  72. lattifai/workflows/youtube.py +0 -931
  73. lattifai-0.4.5.dist-info/METADATA +0 -808
  74. lattifai-0.4.5.dist-info/RECORD +0 -39
  75. lattifai-0.4.5.dist-info/entry_points.txt +0 -3
  76. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/audio2.py ADDED
@@ -0,0 +1,211 @@
1
+ """Audio loading and resampling utilities."""
2
+
3
+ from collections import namedtuple
4
+ from pathlib import Path
5
+ from typing import BinaryIO, Iterable, Optional, Tuple, Union
6
+
7
+ import numpy as np
8
+ import soundfile as sf
9
+ import torch
10
+ from lhotse.augmentation import get_or_create_resampler
11
+ from lhotse.utils import Pathlike
12
+
13
+ from lattifai.errors import AudioLoadError
14
+
15
# Accepted channel selectors: an ``int`` or a ``str``.
# The wider alias that also admits an iterable of ints is kept for reference but is not enabled:
# ChannelSelectorType = Union[int, Iterable[int], str]
ChannelSelectorType = Union[int, str]
17
+
18
+
19
class AudioData(namedtuple("AudioData", "sampling_rate ndarray tensor device path")):
    """Immutable record describing one loaded audio signal.

    Fields, in positional order: ``sampling_rate``, ``ndarray``, ``tensor``,
    ``device`` and ``path``.
    """

    def __str__(self) -> str:
        # The string form of a record is its stored ``path`` field, returned as-is.
        source = self.path
        return source

    @property
    def duration(self) -> float:
        """Length of the signal in seconds: last ``ndarray`` axis divided by ``sampling_rate``."""
        num_samples = self.ndarray.shape[-1]
        return num_samples / self.sampling_rate
29
+
30
+
31
class AudioLoader:
    """Load audio from a path or binary stream and return it as ``AudioData``.

    Decoding is attempted with ``soundfile`` first; anything it rejects is
    retried with PyAV. The decoded signal is reduced to the requested
    channel(s), moved to ``device`` and resampled to the requested rate.
    Resamplers are cached per ``(source_rate, target_rate, device)``.
    """

    def __init__(
        self,
        device: str = "cpu",
    ):
        """Initialize AudioLoader.

        Args:
            device: Device to load audio tensors on (default: "cpu").
        """
        self.device = device
        # Maps (source_rate, target_rate, device) -> resampler already moved to that device.
        self._resampler_cache = {}

    def _resample_audio(
        self,
        audio_sr: Tuple[torch.Tensor, int],
        sampling_rate: int,
        device: Optional[str],
        channel_selector: "Optional[ChannelSelectorType]",
    ) -> torch.Tensor:
        """Select channels, move to ``device`` and resample to ``sampling_rate``.

        Args:
            audio_sr: Tuple of (audio_tensor, original_sample_rate); the tensor is
                indexed as (channels, samples).
            sampling_rate: Target sampling rate.
            device: Device to perform resampling on.
            channel_selector: ``None`` keeps every channel, an ``int`` picks a single
                channel (negative values count from the last channel), and the
                string ``"average"`` takes the mean over channels.

        Returns:
            Audio tensor of shape (1, T) or (C, T) at ``sampling_rate``.

        Raises:
            ValueError: If ``channel_selector`` is an out-of-range index, a string
                other than ``"average"``, or of an unsupported type.
        """
        audio, sr = audio_sr

        if channel_selector is None:
            # keep the original multi-channel signal
            tensor = audio
        elif isinstance(channel_selector, int):
            num_channels = audio.shape[0]
            # Normalise negative indices so that -1 means "last channel"; the plain
            # slice audio[-1:0] would otherwise be silently empty.
            index = channel_selector + num_channels if channel_selector < 0 else channel_selector
            # Valid indices are 0..num_channels-1. Checking explicitly (instead of
            # asserting) keeps the validation active under ``python -O``.
            if not 0 <= index < num_channels:
                raise ValueError(f"Invalid channel: {channel_selector} (signal has {num_channels} channel(s))")
            tensor = audio[index : index + 1].clone()
            del audio
        elif isinstance(channel_selector, str):
            if channel_selector != "average":
                raise ValueError(f"Unsupported channel_selector: {channel_selector}")
            tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
            del audio
        else:
            raise ValueError(f"Unsupported channel_selector: {channel_selector}")

        tensor = tensor.to(device)
        if sr != sampling_rate:
            cache_key = (sr, sampling_rate, device)
            if cache_key not in self._resampler_cache:
                self._resampler_cache[cache_key] = get_or_create_resampler(sr, sampling_rate).to(device=device)
            resampler = self._resampler_cache[cache_key]

            # Long signals are resampled piecewise and concatenated. The piece
            # length is ``sampling_rate * 3600`` samples of the *input* tensor.
            length = tensor.size(-1)
            chunk_size = sampling_rate * 3600
            if length > chunk_size:
                resampled_chunks = [
                    resampler(tensor[..., start : start + chunk_size]) for start in range(0, length, chunk_size)
                ]
                tensor = torch.cat(resampled_chunks, dim=-1)
            else:
                tensor = resampler(tensor)

        return tensor

    def _load_audio(
        self,
        audio: "Union[Pathlike, BinaryIO]",
        sampling_rate: int,
        channel_selector: "Optional[ChannelSelectorType]",
    ) -> torch.Tensor:
        """Load audio from file or binary stream and resample to target rate.

        Args:
            audio: Path to audio file or binary stream.
            sampling_rate: Target sampling rate.
            channel_selector: How to select channels.

        Returns:
            Resampled audio tensor.

        Raises:
            AudioLoadError: If ``soundfile`` cannot read the input and PyAV is not installed.
            RuntimeError: If the PyAV fallback fails, including the cases where the
                container has no audio stream or yields no audio frames.
            ValueError: If ``channel_selector`` is invalid.
        """
        # Explicit (str, Path) check: isinstance() against a typing.Union alias
        # raises TypeError on Python < 3.10.
        if isinstance(audio, (str, Path)):
            audio = str(Path(str(audio)).expanduser())

        # Remember where a stream started so the fallback decoder can begin at the
        # same offset after soundfile has consumed part of it.
        stream_start = None
        if not isinstance(audio, str):
            try:
                stream_start = audio.tell()
            except (AttributeError, OSError, ValueError):
                stream_start = None

        # load audio
        try:
            waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")  # numpy array
            waveform = waveform.T  # (channels, samples)
        except Exception as primary_error:
            # Fallback to PyAV for formats not supported by soundfile
            try:
                import av
            except ImportError:
                raise AudioLoadError(
                    "PyAV (av) is required for loading certain audio formats. "
                    f"Install it with: pip install av\n"
                    f"Primary error was: {primary_error}"
                ) from primary_error

            try:
                if stream_start is not None:
                    audio.seek(stream_start)

                # The context manager closes the container even when decoding raises.
                with av.open(audio) as container:
                    audio_stream = next((s for s in container.streams if s.type == "audio"), None)

                    if audio_stream is None:
                        raise ValueError(f"No audio stream found in file: {audio}")

                    # Request 32-bit float samples from the decoder.
                    audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float

                    frames = []
                    for frame in container.decode(audio_stream):
                        # Convert frame to numpy array
                        array = frame.to_ndarray()
                        # Ensure shape is (channels, samples)
                        # NOTE(review): this heuristic assumes to_ndarray() returns either
                        # (channels, samples) or (samples, channels); interleaved packed
                        # output of shape (1, samples * channels) would pass through as a
                        # single channel - confirm against PyAV's AudioFrame.to_ndarray.
                        if array.ndim == 1:
                            array = array.reshape(1, -1)
                        elif array.ndim == 2 and array.shape[0] > array.shape[1]:
                            array = array.T
                        frames.append(array)

                    sample_rate = audio_stream.codec_context.sample_rate

                if not frames:
                    raise ValueError(f"No audio data found in file: {audio}")

                # Concatenate all frames
                waveform = np.concatenate(frames, axis=1).astype(np.float32)  # (channels, samples)
            except Exception as e:
                raise RuntimeError(f"Failed to load audio file {audio}: {e}") from e

        return self._resample_audio(
            (torch.from_numpy(waveform), sample_rate),
            sampling_rate,
            device=self.device,
            channel_selector=channel_selector,
        )

    def __call__(
        self,
        audio: "Union[Pathlike, BinaryIO]",
        sampling_rate: int = 16000,
        channel_selector: "Optional[ChannelSelectorType]" = "average",
    ) -> "AudioData":
        """Load ``audio`` and package it as an ``AudioData`` record.

        Args:
            audio: Path to audio file or binary stream.
            sampling_rate: Target sampling rate (default: 16000).
            channel_selector: How to select channels (default: "average").

        Returns:
            AudioData namedtuple with sampling_rate, ndarray, tensor, device and
            path fields. ``path`` is ``"<BinaryIO>"`` when ``audio`` is a stream.
        """
        tensor = self._load_audio(audio, sampling_rate, channel_selector)

        # tensor is (1, T) or (C, T)
        ndarray = tensor.cpu().numpy()

        return AudioData(
            sampling_rate=sampling_rate,
            ndarray=ndarray,
            tensor=tensor,
            device=self.device,
            path=str(audio) if isinstance(audio, (str, Path)) else "<BinaryIO>",
        )
@@ -0,0 +1,20 @@
1
+ from typing import List, Optional
2
+
3
+ from lhotse.utils import Pathlike
4
+
5
+ from ..config.caption import InputCaptionFormat
6
+ from .caption import Caption
7
+ from .gemini_reader import GeminiReader, GeminiSegment
8
+ from .gemini_writer import GeminiWriter
9
+ from .supervision import Supervision
10
+ from .text_parser import normalize_text
11
+
12
# Names exported by ``from <this package> import *``; each one is imported above.
__all__ = [
    "Caption",
    "Supervision",
    "GeminiReader",
    "GeminiWriter",
    "GeminiSegment",
    "normalize_text",
    "InputCaptionFormat",
]