omnius 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4959 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +630665 -0
- package/dist/launcher.cjs +78 -0
- package/dist/postinstall-daemon.cjs +776 -0
- package/dist/preinstall.cjs +92 -0
- package/dist/scripts/autoresearch-prepare.py +459 -0
- package/dist/scripts/autoresearch-train.py +661 -0
- package/dist/scripts/crawlee-scraper.py +358 -0
- package/dist/scripts/live-nemotron.py +478 -0
- package/dist/scripts/live-whisper.py +242 -0
- package/dist/scripts/ocr-advanced.py +571 -0
- package/dist/scripts/start-moondream.py +112 -0
- package/dist/scripts/tor/UPSTREAM-README.md +148 -0
- package/dist/scripts/tor/destroy_tor.sh +29 -0
- package/dist/scripts/tor/tor_setup.sh +163 -0
- package/dist/scripts/transcribe-file.py +63 -0
- package/dist/scripts/web_scrape.py +1295 -0
- package/npm-shrinkwrap.json +7412 -0
- package/package.json +142 -0
- package/prompts/agentic/system-large.md +569 -0
- package/prompts/agentic/system-medium.md +211 -0
- package/prompts/agentic/system-small.md +114 -0
- package/prompts/compaction/context-compaction.md +44 -0
- package/prompts/personality/level-1-minimal.md +3 -0
- package/prompts/personality/level-2-concise.md +3 -0
- package/prompts/personality/level-4-explanatory.md +3 -0
- package/prompts/personality/level-5-thorough.md +3 -0
- package/prompts/personality/level-autist.md +3 -0
- package/prompts/personality/level-stark.md +3 -0
- package/prompts/runners/dispatcher.md +24 -0
- package/prompts/runners/editor.md +44 -0
- package/prompts/runners/evaluator.md +30 -0
- package/prompts/runners/merge-summary.md +9 -0
- package/prompts/runners/normalizer.md +23 -0
- package/prompts/runners/planner.md +33 -0
- package/prompts/runners/scout.md +39 -0
- package/prompts/runners/verifier.md +36 -0
- package/prompts/skill-builder/seed-analysis.md +30 -0
- package/prompts/skill-builder/skill-expansion.md +76 -0
- package/prompts/skill-builder/skill-validation.md +31 -0
- package/prompts/templates/analysis.md +14 -0
- package/prompts/templates/code-review.md +16 -0
- package/prompts/templates/code.md +13 -0
- package/prompts/templates/document.md +13 -0
- package/prompts/templates/error-diagnosis.md +14 -0
- package/prompts/templates/general.md +9 -0
- package/prompts/templates/plan.md +15 -0
- package/prompts/templates/system.md +16 -0
- package/prompts/tui/dmn-gather.md +128 -0
- package/prompts/tui/dream-consolidate.md +48 -0
- package/prompts/tui/dream-lucid-eval.md +17 -0
- package/prompts/tui/dream-lucid-implement.md +14 -0
- package/prompts/tui/dream-stages.md +19 -0
- package/prompts/tui/emotion-behavioral.md +2 -0
- package/prompts/tui/emotion-center.md +12 -0
- package/voices/personaplex/OverBarn.pt +0 -0
- package/voices/personaplex/clone-voice.py +384 -0
- package/voices/personaplex/dequant-loader.py +174 -0
- package/voices/personaplex/quantize-weights.py +167 -0
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
live-nemotron.py — Self-contained streaming ASR worker using NVIDIA's
|
|
4
|
+
nvidia/nemotron-speech-streaming-en-0.6b model.
|
|
5
|
+
|
|
6
|
+
Parallel to live-whisper.py. Same stdin/stdout protocol so the same
|
|
7
|
+
pipelines (nexus voice subsystem, asr_listen tool, eval harness) can
|
|
8
|
+
swap backends by pointing at a different script.
|
|
9
|
+
|
|
10
|
+
Protocol:
|
|
11
|
+
stdin — raw PCM16 (16kHz, mono, 16-bit signed little-endian)
|
|
12
|
+
stdout — JSON lines:
|
|
13
|
+
{"type":"status","message":"Creating venv..."}
|
|
14
|
+
{"type":"status","message":"Installing dependencies..."}
|
|
15
|
+
{"type":"status","message":"Loading model..."}
|
|
16
|
+
{"type":"ready"}
|
|
17
|
+
{"type":"transcript","text":"hello world","isFinal":false}
|
|
18
|
+
{"type":"transcript","text":"hello world how are you","isFinal":true}
|
|
19
|
+
{"type":"error","message":"..."}
|
|
20
|
+
|
|
21
|
+
Usage:
|
|
22
|
+
# Live stream from mic:
|
|
23
|
+
arecord -f S16_LE -r 16000 -c 1 -t raw -q - | python3 live-nemotron.py
|
|
24
|
+
# Single file transcription (write path + read transcript JSON):
|
|
25
|
+
python3 live-nemotron.py --file recording.wav
|
|
26
|
+
|
|
27
|
+
Backend selection:
|
|
28
|
+
1. NeMo toolkit (nvidia NeMo) — native streaming support for Parakeet-
|
|
29
|
+
style models. Preferred when available.
|
|
30
|
+
2. transformers + torchaudio — fallback via HuggingFace's generic
|
|
31
|
+
ASR pipeline. Works for file-based transcription even when NeMo
|
|
32
|
+
install fails (common on macOS / no-CUDA setups). Does NOT do
|
|
33
|
+
streaming — buffers the full window each chunk.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
import sys
|
|
37
|
+
import os
|
|
38
|
+
import json
|
|
39
|
+
import subprocess
|
|
40
|
+
import struct
|
|
41
|
+
import time
|
|
42
|
+
import threading
|
|
43
|
+
import argparse
|
|
44
|
+
from pathlib import Path
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# Configuration
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
# Directory holding this script; the private venv lives next to it.
SCRIPT_DIR = Path(__file__).resolve().parent
VENV = SCRIPT_DIR / ".nemotron-venv"

# venv puts its executables in "Scripts" (with an .exe suffix) on Windows
# and in "bin" everywhere else, so derive the layout from the host OS
# instead of assuming POSIX.
_VENV_BIN = VENV / ("Scripts" if os.name == "nt" else "bin")
_EXE_SUFFIX = ".exe" if os.name == "nt" else ""
PY = _VENV_BIN / f"python{_EXE_SUFFIX}"
PIP = _VENV_BIN / f"pip{_EXE_SUFFIX}"

# Input audio format expected on stdin (see the protocol in the module docstring).
SAMPLE_RATE = 16000
CHANNELS = 1
SAMPLE_WIDTH = 2  # 16-bit
CHUNK_SECONDS = 2.0  # Nemotron is a streaming model — shorter chunks than whisper
WINDOW_SECONDS = 8.0

# HuggingFace model identifier
MODEL_ID = "nvidia/nemotron-speech-streaming-en-0.6b"
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# Output helpers (JSON lines to stdout)
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
def emit(event: dict):
    """Serialize *event* as a single JSON line on stdout and flush at once.

    Flushing after every event keeps the line-oriented protocol responsive
    for a parent process reading this worker's stdout.
    """
    out = sys.stdout
    line = f"{json.dumps(event)}\n"
    out.write(line)
    out.flush()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def emit_status(msg: str):
    """Emit a ``status`` event whose ``message`` field is *msg*."""
    payload = {"type": "status", "message": msg}
    emit(payload)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def emit_error(msg: str):
    """Emit an ``error`` event whose ``message`` field is *msg*."""
    payload = {"type": "error", "message": msg}
    emit(payload)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def emit_transcript(text: str, is_final: bool = False, backend: str = "nemotron"):
    """Emit a ``transcript`` event.

    *text* is the recognized text, *is_final* is reported as ``isFinal``,
    and *backend* names the engine that produced the text.
    """
    payload = {
        "type": "transcript",
        "text": text,
        "isFinal": is_final,
        "backend": backend,
    }
    emit(payload)
|
|
83
|
+
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
# Venv bootstrap (same pattern as live-whisper.py)
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
|
|
88
|
+
def _in_venv() -> bool:
    """Return True when running in a virtual environment under SCRIPT_DIR.

    Two conditions must hold: the interpreter is a venv (its prefix differs
    from the base prefix) and the prefix path contains this script's
    directory.
    """
    if sys.prefix == sys.base_prefix:
        # Not a virtual environment at all.
        return False
    return str(SCRIPT_DIR) in sys.prefix
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _ensure_venv():
    """Create the private venv (with pip) if its interpreter is missing.

    The check is on the venv's Python executable rather than on the venv
    directory: a creation that was interrupted part-way leaves the directory
    behind without an interpreter, and treating that as "already set up"
    would make the later re-exec fail on every run. A leftover directory is
    cleared before being rebuilt.

    Raises subprocess.CalledProcessError if upgrading pip/wheel fails.
    """
    if PY.exists():
        return
    emit_status("Creating Python venv for Nemotron ASR...")
    import venv

    # clear=True wipes a half-built directory; it is not needed (and not
    # requested) when the directory does not exist yet.
    builder = venv.EnvBuilder(with_pip=True, clear=VENV.exists())
    builder.create(str(VENV))
    # pip's own output is silenced: stdout is reserved for protocol JSON lines.
    subprocess.check_call(
        [str(PY), "-m", "pip", "install", "--upgrade", "pip", "wheel"],
        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _pip_stderr_tail(err, limit: int = 500) -> str:
    """Return the last *limit* characters of the stderr captured on *err*.

    *err* is a subprocess exception; its ``stderr`` may be bytes, str or
    None. An empty string is returned when nothing was captured.
    """
    raw = getattr(err, "stderr", None)
    if not raw:
        return ""
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8", errors="replace")
    return raw.strip()[-limit:]


def _ensure_deps():
    """Install torch + either nemo_toolkit[asr] or transformers as fallback.

    Core packages (numpy, torch, soundfile, transformers) are required: if
    installing them fails, an error event is emitted and the process exits
    with status 1. NeMo is optional: an install failure or a timeout only
    produces a status event.

    pip is run through subprocess.run, which drains the stderr pipe while it
    waits. check_call with stderr=PIPE never reads the pipe, so a chatty pip
    could block forever once the OS pipe buffer fills.
    """
    core = ("numpy", "torch", "soundfile", "transformers")
    need = []
    for name in core:
        try:
            __import__(name)
        except ImportError:
            need.append(name)

    if need:
        emit_status(f"Installing core deps: {', '.join(need)}...")
        try:
            subprocess.run(
                [str(PIP), "install", *need],
                stdout=subprocess.DEVNULL, stderr=subprocess.PIPE,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            tail = _pip_stderr_tail(e)
            detail = f" — {tail}" if tail else ""
            emit_error(f"pip install failed: {e}{detail}")
            sys.exit(1)
        # Force reimport: drop any cached entries and let the import system
        # notice the packages that were just written to site-packages.
        import importlib

        importlib.invalidate_caches()
        for mod in core:
            sys.modules.pop(mod, None)

    # NeMo toolkit is large and optional — try to install it but fall
    # back gracefully if it's unavailable on this platform.
    try:
        import nemo.collections.asr  # noqa: F401
    except ImportError:
        emit_status("Installing nemo_toolkit[asr] (large — may take a few minutes)...")
        try:
            subprocess.run(
                [str(PIP), "install", "nemo_toolkit[asr]"],
                stdout=subprocess.DEVNULL, stderr=subprocess.PIPE,
                timeout=600,
                check=True,
            )
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            emit_status(f"NeMo install skipped ({e}) — will use transformers fallback")
|
|
153
|
+
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
# Bootstrap: re-exec inside venv
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
|
|
158
|
+
# --check short-circuit — runs on the host Python without any venv or
|
|
159
|
+
# dependency install so CI and smoke tests can verify the script parses
|
|
160
|
+
# + is callable without triggering a 5-minute NeMo download.
|
|
161
|
+
# Answer --check immediately, before any venv or dependency work happens.
if "--check" in sys.argv:
    _check_event = {"type": "check", "ok": True, "script": str(Path(__file__).resolve())}
    emit(_check_event)
    raise SystemExit(0)

# Outside the private venv: create it if needed, then replace this process
# with the venv's interpreter running the same command line.
if not _in_venv():
    _ensure_venv()
    _venv_python = str(PY)
    os.execv(_venv_python, [_venv_python, *sys.argv])

# From here on we are running inside the venv.
_ensure_deps()

# Now safe to import
import numpy as np  # noqa: E402
|
|
173
|
+
|
|
174
|
+
# ---------------------------------------------------------------------------
|
|
175
|
+
# Backend loaders
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
|
|
178
|
+
def _load_nemo_model(model_id: str = MODEL_ID, force_cpu: bool = False):
    """Load *model_id* through the NeMo toolkit.

    Returns ``(model, device)`` where device is ``"cpu"`` or ``"cuda"``, or
    ``(None, None)`` when NeMo/torch cannot be imported or loading fails.

    When loading fails with a message that mentions cuDNN / CUDA / device
    problems and *force_cpu* was not already set, the load is retried once
    with the GPU hidden from torch.
    """
    if force_cpu:
        # Hide the GPU before the imports below get a chance to touch CUDA.
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    try:
        import nemo.collections.asr as nemo_asr
        import torch
    except ImportError:
        return (None, None)

    try:
        emit_status(f"Loading NeMo model {model_id}...")
        model = nemo_asr.models.ASRModel.from_pretrained(model_id)
        model.eval()
        run_on_cpu = force_cpu or not torch.cuda.is_available()
        if run_on_cpu:
            # Best effort: a failed move keeps the model as loaded.
            try:
                model = model.cpu()
            except Exception:
                pass
            return (model, "cpu")
        return (model, "cuda")
    except Exception as exc:
        reason = str(exc)
        emit_status(f"NeMo load failed: {reason[:200]}")
        if force_cpu:
            # Already on the CPU-only path; nothing left to retry.
            return (None, None)
        # Retry on CPU if the error looks like a cuDNN / device compat issue
        for marker in ("cuDNN", "SM <", "CUDA", "device side"):
            if marker in reason:
                emit_status("Retrying NeMo load on CPU only...")
                return _load_nemo_model(model_id, force_cpu=True)
        return (None, None)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _load_transformers_model(model_id: str = MODEL_ID):
    """Fallback loader: build a HuggingFace ASR pipeline for *model_id*.

    Returns the pipeline object, or None when transformers is not
    importable or building the pipeline raises.
    """
    try:
        from transformers import pipeline
    except ImportError:
        return None

    try:
        emit_status(f"Loading transformers pipeline for {model_id}...")
        # Device index passed to pipeline(): 0 when torch reports CUDA, else -1.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1
        options = {
            "task": "automatic-speech-recognition",
            "model": model_id,
            "device": device,
            "return_timestamps": False,
            "chunk_length_s": 30,
            "stride_length_s": 5,
        }
        return pipeline(**options)
    except Exception as exc:
        emit_status(f"transformers load failed: {exc}")
        return None
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _extract_hypothesis_text(r0) -> str:
|
|
245
|
+
"""Extract the transcript string from a NeMo result item. Handles
|
|
246
|
+
plain strings, Hypothesis objects (with possibly empty text), and
|
|
247
|
+
nested lists of Hypotheses returned by RNNT models. Returns an
|
|
248
|
+
empty string for silent input rather than dumping the repr."""
|
|
249
|
+
if r0 is None:
|
|
250
|
+
return ""
|
|
251
|
+
if isinstance(r0, str):
|
|
252
|
+
return r0.strip()
|
|
253
|
+
# Nested list of hypotheses (some RNNT decoders)
|
|
254
|
+
if isinstance(r0, list):
|
|
255
|
+
if not r0:
|
|
256
|
+
return ""
|
|
257
|
+
return _extract_hypothesis_text(r0[0])
|
|
258
|
+
# Hypothesis object — may have text="" for silent audio, which is
|
|
259
|
+
# a VALID transcript (just empty). Return it without falling through
|
|
260
|
+
# to str(r0) which would dump the whole repr.
|
|
261
|
+
if hasattr(r0, "text"):
|
|
262
|
+
return str(r0.text or "").strip()
|
|
263
|
+
# best_hypothesis() method (rare)
|
|
264
|
+
if hasattr(r0, "best_hypothesis"):
|
|
265
|
+
try:
|
|
266
|
+
bh = r0.best_hypothesis()
|
|
267
|
+
if bh and hasattr(bh, "text"):
|
|
268
|
+
return str(bh.text or "").strip()
|
|
269
|
+
except Exception:
|
|
270
|
+
pass
|
|
271
|
+
return ""
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _transcribe_buffer_nemo(model, audio: np.ndarray) -> str:
    """Transcribe a 16kHz mono float32 numpy array via NeMo.

    Tries multiple invocation signatures across NeMo versions:
      - transcribe([np.ndarray])                    — newest
      - transcribe(paths2audio_files=["file.wav"])  — legacy, requires tmp wav

    Returns the transcript text, or "" when nothing was recognized or an
    error occurred (errors are reported as error events, never raised).
    """
    try:
        # Newest API: pass audio arrays directly
        result = model.transcribe([audio], batch_size=1, verbose=False)
        if not result:
            return ""
        return _extract_hypothesis_text(result[0])
    except TypeError:
        # Older NeMo — fall through to file path invocation
        pass
    except Exception as e:
        emit_error(f"NeMo transcribe error: {e}")
        return ""

    # Fallback: write audio to a temp WAV and pass the path.
    # tmp_path is recorded as soon as the file exists so the finally clause
    # removes it even when writing or transcribing fails.
    tmp_path = None
    try:
        import soundfile as sf
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        # The handle is closed before writing: re-opening a still-open
        # temporary file by name is not possible on every platform.
        sf.write(tmp_path, audio, SAMPLE_RATE, subtype="PCM_16")
        result = model.transcribe(paths2audio_files=[tmp_path], batch_size=1, verbose=False)
        if result and result[0] is not None:
            return _extract_hypothesis_text(result[0])
    except Exception as e:
        emit_error(f"NeMo legacy transcribe error: {e}")
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # best effort: a leftover temp file is not fatal
    return ""
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _transcribe_buffer_transformers(pipe, audio: np.ndarray) -> str:
    """Transcribe *audio* with a transformers ASR pipeline.

    The pipeline is called with the array and SAMPLE_RATE. The result may be
    a dict or a non-empty list whose first element is a dict; the stripped
    ``"text"`` entry is returned. Any other shape yields "". Exceptions are
    reported as an error event and also yield "".
    """
    try:
        payload = {"array": audio, "sampling_rate": SAMPLE_RATE}
        output = pipe(payload)
        # A list result is represented by its first element.
        if isinstance(output, list) and output:
            candidate = output[0]
        else:
            candidate = output
        if not isinstance(candidate, dict):
            return ""
        return str(candidate.get("text", "")).strip()
    except Exception as exc:
        emit_error(f"transformers transcribe error: {exc}")
        return ""
|
|
327
|
+
|
|
328
|
+
# ---------------------------------------------------------------------------
|
|
329
|
+
# File transcription mode (single-shot)
|
|
330
|
+
# ---------------------------------------------------------------------------
|
|
331
|
+
|
|
332
|
+
def transcribe_file(path: str, language: str = "en") -> int:
    """Single-file transcription — reads an audio file, prints one
    transcript JSON line, exits.

    The audio is downmixed to mono and, if its rate differs from
    SAMPLE_RATE, resampled by linear interpolation. The model is always
    loaded with the module default MODEL_ID. *language* is accepted for
    interface compatibility but is not used by this function.

    Returns 0 on success (including silent audio, which yields an empty
    transcript) and 1 when the file cannot be loaded or no backend is
    available.
    """
    try:
        import soundfile as sf
        audio, sr = sf.read(path, dtype="float32")
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # downmix to mono
        if sr != SAMPLE_RATE:
            # Resample via simple linear interpolation (avoids scipy dep).
            # Sample positions are kept in float64: float32 cannot represent
            # consecutive integers above 2**24, so long recordings would get
            # duplicated / quantised positions and a distorted result.
            ratio = SAMPLE_RATE / sr
            new_len = int(len(audio) * ratio)
            src_positions = np.arange(len(audio), dtype=np.float64)
            dst_positions = np.linspace(0, len(audio) - 1, new_len, dtype=np.float64)
            audio = np.interp(dst_positions, src_positions, audio).astype(np.float32)
    except Exception as e:
        emit_error(f"Failed to load audio file {path}: {e}")
        return 1

    (model, device) = _load_nemo_model()
    backend = "nemo"
    if model is None:
        model = _load_transformers_model()
        backend = "transformers"
    if model is None:
        emit_error("No nemotron backend available (tried NeMo + transformers)")
        return 1

    emit({"type": "ready", "backend": backend, "device": device or "cpu"})

    t0 = time.time()
    if backend == "nemo":
        text = _transcribe_buffer_nemo(model, audio)
    else:
        text = _transcribe_buffer_transformers(model, audio)
    elapsed = time.time() - t0

    # Silent / no-speech audio is NOT an error — it's a valid transcript
    # (empty string). Emit the full envelope so the caller can distinguish
    # "no speech" from "engine crashed". Exit 0 either way.
    emit({
        "type": "transcript",
        "text": text or "",
        "isFinal": True,
        "backend": f"nemotron-{backend}",
        "latencyMs": int(elapsed * 1000),
        "audioSeconds": float(len(audio) / SAMPLE_RATE),
        "empty": not bool(text),
    })
    return 0
|
|
382
|
+
|
|
383
|
+
# ---------------------------------------------------------------------------
|
|
384
|
+
# Streaming mode (stdin → transcripts)
|
|
385
|
+
# ---------------------------------------------------------------------------
|
|
386
|
+
|
|
387
|
+
def stream_stdin(args) -> int:
    """Streaming mode: read raw PCM16 from stdin and emit transcripts.

    *args* must provide ``model``, ``chunk_seconds`` and ``window_seconds``.
    A background thread appends decoded samples to a shared buffer. Every
    ``chunk_seconds`` the main loop transcribes the last ``window_seconds``
    of audio and emits a non-final transcript when the text changed. After
    stdin closes (or Ctrl-C) the whole buffer is transcribed once more and
    emitted as the final transcript if non-empty.

    Returns 0 on normal completion, 1 when no backend could be loaded.
    """
    (model, _device) = _load_nemo_model(args.model)
    backend = "nemo"
    if model is None:
        model = _load_transformers_model(args.model)
        backend = "transformers"
    if model is None:
        emit_error("No nemotron backend available (tried NeMo + transformers)")
        return 1

    emit({"type": "ready"})

    audio_buf = np.zeros(0, dtype=np.float32)
    buf_lock = threading.Lock()
    # Read whole samples only: round the byte count down to a multiple of
    # SAMPLE_WIDTH (and never below one sample) so a fractional
    # --chunk-seconds cannot produce an odd-sized read.
    requested_bytes = int(args.chunk_seconds * SAMPLE_RATE * SAMPLE_WIDTH)
    chunk_bytes = max(SAMPLE_WIDTH, requested_bytes - requested_bytes % SAMPLE_WIDTH)
    window_samples = int(args.window_seconds * SAMPLE_RATE)
    last_text = ""
    running = True

    def read_stdin():
        """Reader thread: decode stdin into float32 samples until EOF."""
        nonlocal audio_buf, running
        pending = b""  # stray byte of a sample split across two reads
        try:
            while running:
                data = sys.stdin.buffer.read(chunk_bytes)
                if not data:
                    break
                data = pending + data
                sample_count = len(data) // SAMPLE_WIDTH
                pending = data[sample_count * SAMPLE_WIDTH:]
                if sample_count == 0:
                    continue
                # "<i2" = 16-bit signed little-endian, as the protocol states;
                # count= stops before a trailing odd byte instead of raising.
                pcm = np.frombuffer(data, dtype="<i2", count=sample_count)
                samples = pcm.astype(np.float32) / 32768.0
                with buf_lock:
                    audio_buf = np.concatenate([audio_buf, samples])
        except Exception:
            # Best effort: a failing reader simply ends the stream.
            pass
        finally:
            running = False

    reader = threading.Thread(target=read_stdin, daemon=True)
    reader.start()

    try:
        while running:
            time.sleep(args.chunk_seconds)
            with buf_lock:
                # Wait until at least one second of audio has arrived.
                if len(audio_buf) < SAMPLE_RATE:
                    continue
                window = audio_buf[-window_samples:].copy() if len(audio_buf) > window_samples else audio_buf.copy()
            if backend == "nemo":
                text = _transcribe_buffer_nemo(model, window)
            else:
                text = _transcribe_buffer_transformers(model, window)
            if text and text != last_text:
                last_text = text
                emit_transcript(text, is_final=False, backend=f"nemotron-{backend}")
    except KeyboardInterrupt:
        pass

    # Final pass over everything that was received.
    with buf_lock:
        full_audio = audio_buf.copy()
    if len(full_audio) >= SAMPLE_RATE:
        if backend == "nemo":
            text = _transcribe_buffer_nemo(model, full_audio)
        else:
            text = _transcribe_buffer_transformers(model, full_audio)
        if text:
            emit_transcript(text, is_final=True, backend=f"nemotron-{backend}")
    running = False
    return 0
|
|
452
|
+
|
|
453
|
+
# ---------------------------------------------------------------------------
|
|
454
|
+
# Main
|
|
455
|
+
# ---------------------------------------------------------------------------
|
|
456
|
+
|
|
457
|
+
def main():
    """Parse the command line and dispatch to the selected mode.

    ``--check`` emits a check event and returns 0; ``--file`` runs
    single-file transcription; otherwise audio is streamed from stdin.
    Returns the process exit code.
    """
    parser = argparse.ArgumentParser(description="Nemotron streaming ASR worker")
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ("--model", {"default": MODEL_ID, "help": "HuggingFace model id (default: nvidia/nemotron-speech-streaming-en-0.6b)"}),
        ("--file", {"default": None, "help": "Transcribe a single audio file instead of stdin"}),
        ("--language", {"default": "en", "help": "Language code"}),
        ("--chunk-seconds", {"type": float, "default": CHUNK_SECONDS, "help": "Transcribe interval"}),
        ("--window-seconds", {"type": float, "default": WINDOW_SECONDS, "help": "Sliding window size"}),
        ("--stdin", {"action": "store_true", "help": "Explicit stdin mode (default when no --file)"}),
        ("--check", {"action": "store_true", "help": "Just verify the script parses + imports; no model load"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    if args.check:
        script_path = str(Path(__file__).resolve())
        emit({"type": "check", "ok": True, "script": script_path})
        return 0

    if not args.file:
        return stream_stdin(args)
    return transcribe_file(args.file, args.language)
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|