PayPerTranscript 0.2.8__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/PKG-INFO +1 -1
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/PayPerTranscript.egg-info/PKG-INFO +1 -1
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/__init__.py +1 -1
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/config.py +2 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/cost_tracker.py +18 -7
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/hotkey.py +21 -1
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/text_inserter.py +23 -8
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/pipeline/transcription.py +46 -3
- paypertranscript-0.3.0/paypertranscript/providers/groq_provider.py +273 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/app.py +73 -6
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/constants.py +9 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/overlay.py +43 -1
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/pages/home_page.py +2 -2
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/pages/settings_page.py +70 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/tray.py +15 -9
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/pyproject.toml +1 -1
- paypertranscript-0.2.8/paypertranscript/providers/groq_provider.py +0 -182
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/LICENSE +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/PayPerTranscript.egg-info/SOURCES.txt +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/PayPerTranscript.egg-info/dependency_links.txt +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/PayPerTranscript.egg-info/entry_points.txt +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/PayPerTranscript.egg-info/requires.txt +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/PayPerTranscript.egg-info/top_level.txt +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/README.md +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/__main__.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/icons/app.ico +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/icons/app.png +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/icons/app_big.png +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/icons/arrow_down.svg +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/icons/tray.png +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/icons/tray_green.png +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/icons/tray_orange.png +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/sounds/start.wav +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/sounds/stop.wav +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/assets/styles/dark.qss +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/__init__.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/audio_manager.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/logging.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/paths.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/recorder.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/session_logger.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/updater.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/core/window_detector.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/pipeline/__init__.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/providers/__init__.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/providers/base.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/__init__.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/animated.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/main_window.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/pages/__init__.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/pages/statistics_page.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/pages/window_mapping_page.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/pages/word_list_page.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/setup_wizard.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/sidebar.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/ui/widgets.py +0 -0
- {paypertranscript-0.2.8 → paypertranscript-0.3.0}/setup.cfg +0 -0
|
@@ -33,6 +33,7 @@ DEFAULT_CONFIG: dict[str, Any] = {
|
|
|
33
33
|
"provider": "groq",
|
|
34
34
|
"stt_model": "whisper-large-v3-turbo",
|
|
35
35
|
"llm_model": "openai/gpt-oss-20b",
|
|
36
|
+
"llm_temperature": 1.0,
|
|
36
37
|
},
|
|
37
38
|
"words": {
|
|
38
39
|
"misspelled_words": [],
|
|
@@ -87,6 +88,7 @@ _SCHEMA: dict[str, type | tuple[type, ...]] = {
|
|
|
87
88
|
"api.provider": str,
|
|
88
89
|
"api.stt_model": str,
|
|
89
90
|
"api.llm_model": str,
|
|
91
|
+
"api.llm_temperature": (int, float),
|
|
90
92
|
"words.misspelled_words": list,
|
|
91
93
|
"formatting.window_mappings": dict,
|
|
92
94
|
"formatting.categories": dict,
|
|
@@ -6,12 +6,18 @@ Keine I/O, keine Seiteneffekte - einfach testbar.
|
|
|
6
6
|
|
|
7
7
|
from dataclasses import dataclass
|
|
8
8
|
|
|
9
|
-
# STT
|
|
9
|
+
# STT API-Preise (Stand: 2026-02)
|
|
10
10
|
STT_PRICE_PER_HOUR_USD = 0.04
|
|
11
11
|
STT_MIN_BILLED_SECONDS = 10 # API-seitiges Minimum-Billing
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
# LLM-Preise pro Modell: (Input USD/M Tokens, Output USD/M Tokens)
|
|
14
|
+
LLM_PRICES: dict[str, tuple[float, float]] = {
|
|
15
|
+
"openai/gpt-oss-20b": (0.075, 0.30),
|
|
16
|
+
"openai/gpt-oss-120b": (0.15, 0.60),
|
|
17
|
+
"moonshotai/kimi-k2-instruct-0905": (1.00, 3.00),
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
_DEFAULT_LLM_PRICES = (0.075, 0.30) # Fallback
|
|
15
21
|
|
|
16
22
|
|
|
17
23
|
@dataclass(frozen=True)
|
|
@@ -41,19 +47,22 @@ def calculate_stt_cost(audio_duration_seconds: float) -> tuple[float, float]:
|
|
|
41
47
|
return billed, cost
|
|
42
48
|
|
|
43
49
|
|
|
44
|
-
def calculate_llm_cost(
|
|
50
|
+
def calculate_llm_cost(
|
|
51
|
+
input_tokens: int, output_tokens: int, model: str = "",
|
|
52
|
+
) -> float:
|
|
45
53
|
"""Berechnet LLM-Kosten.
|
|
46
54
|
|
|
47
55
|
Args:
|
|
48
56
|
input_tokens: Anzahl Input-Tokens.
|
|
49
57
|
output_tokens: Anzahl Output-Tokens.
|
|
58
|
+
model: LLM-Modellname fuer modellspezifische Preise.
|
|
50
59
|
|
|
51
60
|
Returns:
|
|
52
61
|
Kosten in USD.
|
|
53
62
|
"""
|
|
63
|
+
input_price, output_price = LLM_PRICES.get(model, _DEFAULT_LLM_PRICES)
|
|
54
64
|
return (
|
|
55
|
-
input_tokens *
|
|
56
|
-
+ output_tokens * LLM_OUTPUT_PRICE_PER_M_TOKENS
|
|
65
|
+
input_tokens * input_price + output_tokens * output_price
|
|
57
66
|
) / 1_000_000
|
|
58
67
|
|
|
59
68
|
|
|
@@ -61,6 +70,7 @@ def calculate_total_cost(
|
|
|
61
70
|
audio_duration_seconds: float,
|
|
62
71
|
llm_input_tokens: int = 0,
|
|
63
72
|
llm_output_tokens: int = 0,
|
|
73
|
+
llm_model: str = "",
|
|
64
74
|
) -> CostResult:
|
|
65
75
|
"""Berechnet Gesamtkosten einer Transkription.
|
|
66
76
|
|
|
@@ -68,12 +78,13 @@ def calculate_total_cost(
|
|
|
68
78
|
audio_duration_seconds: Audio-Dauer in Sekunden.
|
|
69
79
|
llm_input_tokens: LLM Input-Tokens (0 wenn kein LLM).
|
|
70
80
|
llm_output_tokens: LLM Output-Tokens (0 wenn kein LLM).
|
|
81
|
+
llm_model: LLM-Modellname fuer modellspezifische Preise.
|
|
71
82
|
|
|
72
83
|
Returns:
|
|
73
84
|
CostResult mit allen Kosten-Details.
|
|
74
85
|
"""
|
|
75
86
|
billed, stt_cost = calculate_stt_cost(audio_duration_seconds)
|
|
76
|
-
llm_cost = calculate_llm_cost(llm_input_tokens, llm_output_tokens)
|
|
87
|
+
llm_cost = calculate_llm_cost(llm_input_tokens, llm_output_tokens, llm_model)
|
|
77
88
|
return CostResult(
|
|
78
89
|
audio_duration_seconds=audio_duration_seconds,
|
|
79
90
|
billed_seconds=billed,
|
|
@@ -54,6 +54,11 @@ _MODIFIER_GROUPS: dict[str, set[keyboard.Key]] = {
|
|
|
54
54
|
"cmd": {keyboard.Key.cmd, keyboard.Key.cmd_l, keyboard.Key.cmd_r},
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
# Alle Modifier-Keys (flach) fuer Exakt-Match-Pruefung
|
|
58
|
+
_ALL_MODIFIER_KEYS: set[keyboard.Key] = set()
|
|
59
|
+
for _grp in _MODIFIER_GROUPS.values():
|
|
60
|
+
_ALL_MODIFIER_KEYS |= _grp
|
|
61
|
+
|
|
57
62
|
# Alt-Keys fuer Menu-Bar-Workaround (Windows aktiviert Menueleiste bei bare Alt-Release)
|
|
58
63
|
_ALT_KEYS: set[keyboard.Key] = {keyboard.Key.alt_l, keyboard.Key.alt_r}
|
|
59
64
|
|
|
@@ -149,18 +154,33 @@ class HotkeyListener:
|
|
|
149
154
|
target_keys: list[keyboard.Key | keyboard.KeyCode],
|
|
150
155
|
modifier_groups: list[set[keyboard.Key]],
|
|
151
156
|
) -> bool:
|
|
152
|
-
"""Prüft ob eine Tastenkombination aktuell gedrückt ist.
|
|
157
|
+
"""Prüft ob eine Tastenkombination aktuell gedrückt ist.
|
|
158
|
+
|
|
159
|
+
Exaktes Modifier-Matching: es muessen genau die konfigurierten Modifier
|
|
160
|
+
gedrueckt sein, keine zusaetzlichen. Damit wird verhindert, dass z.B.
|
|
161
|
+
Ctrl+Win auch durch Ctrl+Shift+Alt+F9 ausgeloest wird.
|
|
162
|
+
"""
|
|
153
163
|
if not target_keys:
|
|
154
164
|
return False
|
|
155
165
|
|
|
166
|
+
# Sammle welche Modifier-Gruppen zum Hotkey gehoeren
|
|
167
|
+
required_modifier_keys: set[keyboard.Key] = set()
|
|
168
|
+
|
|
156
169
|
for i, target_key in enumerate(target_keys):
|
|
157
170
|
# Für Modifier: prüfe ob *irgendein* Key aus der Gruppe gedrückt ist
|
|
158
171
|
if i < len(modifier_groups) and modifier_groups[i]:
|
|
159
172
|
if not (modifier_groups[i] & self._pressed_keys):
|
|
160
173
|
return False
|
|
174
|
+
required_modifier_keys |= modifier_groups[i]
|
|
161
175
|
else:
|
|
162
176
|
if target_key not in self._pressed_keys:
|
|
163
177
|
return False
|
|
178
|
+
|
|
179
|
+
# Pruefe ob Extra-Modifier gedrueckt sind, die nicht zum Hotkey gehoeren
|
|
180
|
+
extra_modifiers = (self._pressed_keys & _ALL_MODIFIER_KEYS) - required_modifier_keys
|
|
181
|
+
if extra_modifiers:
|
|
182
|
+
return False
|
|
183
|
+
|
|
164
184
|
return True
|
|
165
185
|
|
|
166
186
|
def _combo_uses_alt(self, target_keys: list[keyboard.Key | keyboard.KeyCode]) -> bool:
|
|
@@ -19,6 +19,27 @@ log = get_logger("core.text_inserter")
|
|
|
19
19
|
pyautogui.FAILSAFE = False
|
|
20
20
|
pyautogui.PAUSE = 0
|
|
21
21
|
|
|
22
|
+
# Clipboard-Wiederherstellung: Retry-Konfiguration
|
|
23
|
+
_CLIPBOARD_RESTORE_RETRIES = 3
|
|
24
|
+
_CLIPBOARD_RESTORE_DELAY = 0.05 # 50ms zwischen Versuchen
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _restore_clipboard(content: str) -> None:
    """Restore the clipboard to ``content``, retrying on failure.

    Other applications (clipboard managers, password managers) can lock the
    clipboard for a short moment, so the write is attempted up to
    ``_CLIPBOARD_RESTORE_RETRIES`` times with ``_CLIPBOARD_RESTORE_DELAY``
    seconds between attempts.

    This is best-effort cleanup: if the last attempt fails as well, a warning
    that includes the final error is logged and nothing is raised.

    Args:
        content: Text to write back to the clipboard.
    """
    for attempt in range(1, _CLIPBOARD_RESTORE_RETRIES + 1):
        try:
            pyperclip.copy(content)
            return
        except Exception as e:  # deliberately broad: restoring must never raise
            if attempt < _CLIPBOARD_RESTORE_RETRIES:
                time.sleep(_CLIPBOARD_RESTORE_DELAY)
                continue
            # Last attempt failed: report the cause instead of dropping it.
            log.warning(
                "Zwischenablage konnte nicht wiederhergestellt werden (nach %d Versuchen): %s",
                _CLIPBOARD_RESTORE_RETRIES,
                e,
            )
|
|
42
|
+
|
|
22
43
|
|
|
23
44
|
def insert_text(text: str) -> None:
|
|
24
45
|
"""Fügt Text an der aktuellen Cursor-Position ein.
|
|
@@ -62,10 +83,7 @@ def insert_text(text: str) -> None:
|
|
|
62
83
|
|
|
63
84
|
finally:
|
|
64
85
|
# 5. Alte Zwischenablage wiederherstellen
|
|
65
|
-
|
|
66
|
-
pyperclip.copy(old_clipboard)
|
|
67
|
-
except Exception:
|
|
68
|
-
log.debug("Zwischenablage konnte nicht wiederhergestellt werden")
|
|
86
|
+
_restore_clipboard(old_clipboard)
|
|
69
87
|
|
|
70
88
|
|
|
71
89
|
# Intervall (Sekunden) zwischen Chunk-Pastes bei Streaming-Typing
|
|
@@ -125,7 +143,4 @@ def insert_text_streaming(chunks: Iterator[str]) -> None:
|
|
|
125
143
|
log.error("Auch Fallback-Paste fehlgeschlagen")
|
|
126
144
|
|
|
127
145
|
finally:
|
|
128
|
-
|
|
129
|
-
pyperclip.copy(old_clipboard)
|
|
130
|
-
except Exception:
|
|
131
|
-
log.debug("Zwischenablage konnte nicht wiederhergestellt werden")
|
|
146
|
+
_restore_clipboard(old_clipboard)
|
{paypertranscript-0.2.8 → paypertranscript-0.3.0}/paypertranscript/pipeline/transcription.py
RENAMED
|
@@ -26,9 +26,38 @@ STATUS_STT_DONE = "stt_done"
|
|
|
26
26
|
STATUS_LLM_START = "llm_start"
|
|
27
27
|
STATUS_DONE = "done"
|
|
28
28
|
STATUS_ERROR = "error"
|
|
29
|
+
STATUS_LLM_FALLBACK = "llm_fallback"
|
|
29
30
|
|
|
30
31
|
log = get_logger("pipeline.transcription")
|
|
31
32
|
|
|
33
|
+
# Halluzinationsfilter: Whisper halluziniert bei kurzen Aufnahmen ohne Sprache
|
|
34
|
+
_HALLUCINATION_PATTERNS = [
|
|
35
|
+
"copyright", "untertitel", "subtitles by",
|
|
36
|
+
"thanks for watching", "thank you for watching",
|
|
37
|
+
"sous-titres", "amara.org",
|
|
38
|
+
]
|
|
39
|
+
_HALLUCINATION_MAX_DURATION = 5.0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _is_hallucination(text: str, audio_duration: float) -> bool:
|
|
43
|
+
"""Prueft ob ein STT-Ergebnis eine Whisper-Halluzination ist.
|
|
44
|
+
|
|
45
|
+
Bei kurzen Aufnahmen (< 5s) ohne Sprache halluziniert Whisper
|
|
46
|
+
stereotypische Strings wie "Copyright Australian Broadcasting Corporation".
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
text: STT-Ergebnis.
|
|
50
|
+
audio_duration: Audio-Dauer in Sekunden.
|
|
51
|
+
|
|
52
|
+
Returns:
|
|
53
|
+
True wenn der Text als Halluzination erkannt wurde.
|
|
54
|
+
"""
|
|
55
|
+
if audio_duration >= _HALLUCINATION_MAX_DURATION:
|
|
56
|
+
return False
|
|
57
|
+
text_lower = text.lower()
|
|
58
|
+
return any(pattern in text_lower for pattern in _HALLUCINATION_PATTERNS)
|
|
59
|
+
|
|
60
|
+
|
|
32
61
|
# Maximale Prompt-Laenge fuer Whisper (224 Tokens).
|
|
33
62
|
# Konservative Schaetzung: ~4 Zeichen pro Token fuer gemischten DE/EN Text.
|
|
34
63
|
_MAX_PROMPT_CHARS = 896
|
|
@@ -86,6 +115,7 @@ class TranscriptionPipeline:
|
|
|
86
115
|
self._config = config
|
|
87
116
|
self._session_logger = session_logger
|
|
88
117
|
self.last_transcription: str | None = None
|
|
118
|
+
self.last_wav_path: Path | None = None
|
|
89
119
|
log.info(
|
|
90
120
|
"TranscriptionPipeline initialisiert (LLM: %s, Tracking: %s)",
|
|
91
121
|
"aktiv" if llm_provider else "deaktiviert",
|
|
@@ -165,6 +195,7 @@ class TranscriptionPipeline:
|
|
|
165
195
|
audio_duration_seconds=audio_duration,
|
|
166
196
|
llm_input_tokens=llm_input_tokens,
|
|
167
197
|
llm_output_tokens=llm_output_tokens,
|
|
198
|
+
llm_model=self._config.get("api.llm_model", ""),
|
|
168
199
|
)
|
|
169
200
|
|
|
170
201
|
session_data = {
|
|
@@ -218,6 +249,8 @@ class TranscriptionPipeline:
|
|
|
218
249
|
except Exception:
|
|
219
250
|
pass
|
|
220
251
|
|
|
252
|
+
self.last_wav_path = wav_path
|
|
253
|
+
|
|
221
254
|
try:
|
|
222
255
|
# Audio-Dauer: entweder uebergeben oder aus WAV-Datei berechnen
|
|
223
256
|
if audio_duration is None:
|
|
@@ -240,7 +273,16 @@ class TranscriptionPipeline:
|
|
|
240
273
|
|
|
241
274
|
if not text:
|
|
242
275
|
log.info("Pipeline: STT lieferte leeren Text - uebersprungen")
|
|
243
|
-
_notify(
|
|
276
|
+
_notify(STATUS_ERROR, "Kein Text erkannt")
|
|
277
|
+
return
|
|
278
|
+
|
|
279
|
+
if _is_hallucination(text, audio_duration):
|
|
280
|
+
log.info(
|
|
281
|
+
"Pipeline: Halluzination erkannt (%.1fs, '%s') - uebersprungen",
|
|
282
|
+
audio_duration,
|
|
283
|
+
text[:80],
|
|
284
|
+
)
|
|
285
|
+
_notify(STATUS_ERROR, "Keine Sprache erkannt")
|
|
244
286
|
return
|
|
245
287
|
|
|
246
288
|
# LLM-Formatierung (falls Window-Mapping existiert)
|
|
@@ -261,7 +303,7 @@ class TranscriptionPipeline:
|
|
|
261
303
|
except Exception as e:
|
|
262
304
|
insert_ok = False
|
|
263
305
|
log.error("Pipeline: Text-Einfuegung fehlgeschlagen: %s", e)
|
|
264
|
-
_notify(STATUS_ERROR, "Text
|
|
306
|
+
_notify(STATUS_ERROR, f"Text-Einfuegung fehlgeschlagen: {e}")
|
|
265
307
|
|
|
266
308
|
def _do_insert_stream(chunks_iter: object) -> None:
|
|
267
309
|
nonlocal insert_ok
|
|
@@ -270,7 +312,7 @@ class TranscriptionPipeline:
|
|
|
270
312
|
except Exception as e:
|
|
271
313
|
insert_ok = False
|
|
272
314
|
log.error("Pipeline: Streaming-Einfuegung fehlgeschlagen: %s", e)
|
|
273
|
-
_notify(STATUS_ERROR, "Text
|
|
315
|
+
_notify(STATUS_ERROR, f"Text-Einfuegung fehlgeschlagen: {e}")
|
|
274
316
|
|
|
275
317
|
if system_prompt and self._llm:
|
|
276
318
|
_notify(STATUS_LLM_START)
|
|
@@ -298,6 +340,7 @@ class TranscriptionPipeline:
|
|
|
298
340
|
|
|
299
341
|
except ProviderError as e:
|
|
300
342
|
log.warning("Pipeline: LLM-Fehler - Fallback auf Rohtext: %s", e)
|
|
343
|
+
_notify(STATUS_LLM_FALLBACK, str(e))
|
|
301
344
|
_do_insert(text)
|
|
302
345
|
else:
|
|
303
346
|
# Kein Mapping oder kein LLM-Provider -> Rohtext direkt einfuegen
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
"""GroqCloud API-Provider für PayPerTranscript.
|
|
2
|
+
|
|
3
|
+
Implementiert STT (Whisper) und LLM-Formatierung über die GroqCloud API.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import Iterator
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import groq
|
|
11
|
+
|
|
12
|
+
from paypertranscript.core.logging import get_logger
|
|
13
|
+
from paypertranscript.providers.base import AbstractLLMProvider, AbstractSTTProvider, ProviderError
|
|
14
|
+
|
|
15
|
+
log = get_logger("providers.groq")
|
|
16
|
+
|
|
17
|
+
# Retry-Konfiguration fuer transiente API-Fehler
|
|
18
|
+
_MAX_RETRIES = 3
|
|
19
|
+
_RETRY_BASE_DELAY = 1.0 # Sekunden (exponential: 1s, 2s, 4s)
|
|
20
|
+
_RETRYABLE_ERRORS = (groq.RateLimitError, groq.APITimeoutError, groq.APIConnectionError)
|
|
21
|
+
|
|
22
|
+
# Minimale WAV-Dateigroesse (44 Bytes = WAV-Header ohne Audio-Daten)
|
|
23
|
+
_MIN_WAV_SIZE = 44
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class GroqSTTProvider(AbstractSTTProvider):
    """GroqCloud Whisper speech-to-text provider.

    Uses ``whisper-large-v3-turbo`` by default. The Groq client is created
    once in ``__init__`` and reused for every request (connection pooling
    via httpx).
    """

    def __init__(
        self,
        api_key: str | None = None,
        model: str = "whisper-large-v3-turbo",
    ) -> None:
        """Create the provider and its Groq client.

        Args:
            api_key: API key passed to ``groq.Groq``; ``None`` is passed
                through unchanged.
            model: Name of the STT model to request.

        Raises:
            ProviderError: If the Groq client cannot be created.
        """
        self._model = model
        try:
            self._client = groq.Groq(api_key=api_key)
        except groq.GroqError as e:
            raise ProviderError(f"Groq-Client konnte nicht erstellt werden: {e}") from e
        log.info("GroqSTTProvider initialisiert (Modell: %s)", self._model)

    def transcribe(self, audio_path: Path, language: str, prompt: str = "") -> str:
        """Transcribe a WAV file via the GroqCloud Whisper API.

        Args:
            audio_path: Path of the WAV file to upload.
            language: Language code forwarded to the API.
            prompt: Optional prompt forwarded to the API.

        Returns:
            The transcribed text with surrounding whitespace stripped.

        Raises:
            ProviderError: If the file is missing, unreadable, empty or
                header-only, if authentication fails, if a non-retryable API
                error occurs, or if all retry attempts are exhausted.
        """
        # EAFP: one stat() both proves the file exists and yields its size,
        # so there is no window between an exists() check and the stat().
        try:
            file_size = audio_path.stat().st_size
        except FileNotFoundError as e:
            raise ProviderError(f"Audio-Datei nicht gefunden: {audio_path}") from e
        except OSError as e:
            raise ProviderError(f"Audio-Datei nicht lesbar: {audio_path} ({e})") from e

        # V05: validate the audio file (a bare WAV header is 44 bytes; avoid
        # uploading an empty recording).
        if file_size <= _MIN_WAV_SIZE:
            raise ProviderError(
                f"Audio-Datei ist leer oder beschädigt ({file_size} Bytes)"
            )

        log.info(
            "STT-Anfrage: %s (Sprache: %s, Modell: %s)",
            audio_path.name,
            language,
            self._model,
        )
        if prompt:
            log.info("STT-Prompt: %s", prompt)

        transcription = self._request_transcription(audio_path, language, prompt)

        # response_format="text" returns a plain string; fall back to the
        # ``.text`` attribute for object responses.
        if isinstance(transcription, str):
            text = transcription.strip()
        else:
            text = transcription.text.strip()

        log.info("STT-Ergebnis: %d Zeichen", len(text))
        return text

    def _request_transcription(self, audio_path: Path, language: str, prompt: str):
        """Send the transcription request, retrying transient API errors.

        V01: errors listed in ``_RETRYABLE_ERRORS`` are retried up to
        ``_MAX_RETRIES`` times with exponential backoff starting at
        ``_RETRY_BASE_DELAY`` seconds. The file is reopened on every attempt
        so each upload starts at the beginning of the file.

        Returns:
            The raw response of ``audio.transcriptions.create``.

        Raises:
            ProviderError: On unreadable file, authentication failure,
                non-retryable API error, or exhausted retries.
        """
        last_error: Exception | None = None
        for attempt in range(1, _MAX_RETRIES + 1):
            # Open separately so that only genuine file errors are reported
            # as file errors.
            try:
                audio_file = open(audio_path, "rb")
            except OSError as e:
                raise ProviderError(
                    f"Audio-Datei nicht lesbar: {audio_path} ({e})"
                ) from e

            try:
                with audio_file:
                    return self._client.audio.transcriptions.create(
                        model=self._model,
                        file=audio_file,
                        language=language,
                        prompt=prompt,
                        response_format="text",
                        temperature=0.0,
                    )
            except groq.AuthenticationError as e:
                # A bad key will not fix itself; do not retry.
                raise ProviderError(f"API-Key ungültig: {e}") from e
            except _RETRYABLE_ERRORS as e:
                last_error = e
                if attempt < _MAX_RETRIES:
                    delay = _RETRY_BASE_DELAY * (2 ** (attempt - 1))
                    log.warning(
                        "STT-Versuch %d/%d fehlgeschlagen: %s - Retry in %.1fs",
                        attempt, _MAX_RETRIES, e, delay,
                    )
                    time.sleep(delay)
                else:
                    log.error("STT: Alle %d Versuche fehlgeschlagen", _MAX_RETRIES)
            except groq.APIError as e:
                raise ProviderError(f"GroqCloud API-Fehler: {e}") from e

        # All retries exhausted: map the last transient error to a message.
        if isinstance(last_error, groq.RateLimitError):
            raise ProviderError(f"Rate Limit erreicht: {last_error}") from last_error
        if isinstance(last_error, groq.APITimeoutError):
            raise ProviderError(f"GroqCloud Timeout: {last_error}") from last_error
        raise ProviderError(f"Keine Verbindung zu GroqCloud: {last_error}") from last_error
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class GroqLLMProvider(AbstractLLMProvider):
    """GroqCloud LLM provider for text formatting.

    Uses ``openai/gpt-oss-20b`` by default for context-dependent formatting.
    The Groq client is created once in ``__init__`` and reused.
    """

    def __init__(
        self,
        api_key: str | None = None,
        model: str = "openai/gpt-oss-20b",
        temperature: float | None = None,
    ) -> None:
        """Create the provider and its Groq client.

        Args:
            api_key: API key passed to ``groq.Groq``; ``None`` is passed
                through unchanged.
            model: Name of the chat model to request.
            temperature: Sampling temperature; when ``None`` the parameter is
                omitted from the request entirely.

        Raises:
            ProviderError: If the Groq client cannot be created.
        """
        self._model = model
        self._temperature = temperature
        self._last_usage: dict[str, int] | None = None
        try:
            self._client = groq.Groq(api_key=api_key)
        except groq.GroqError as e:
            raise ProviderError(f"Groq-Client konnte nicht erstellt werden: {e}") from e
        log.info("GroqLLMProvider initialisiert (Modell: %s, Temperature: %s)", self._model, self._temperature)

    @property
    def last_usage(self) -> dict[str, int] | None:
        """Token usage of the most recent LLM request, or ``None`` if unknown."""
        return self._last_usage

    def _build_messages(
        self, system_prompt: str, text: str
    ) -> list[dict[str, str]]:
        """Build the chat messages: system prompt plus the transcript wrapped in ``<transcript>`` tags."""
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"<transcript>{text}</transcript>"},
        ]

    def _completion_kwargs(self) -> dict:
        """Build the kwargs shared by all ``chat.completions.create`` calls."""
        kwargs: dict = {}
        if self._temperature is not None:
            kwargs["temperature"] = self._temperature
        return kwargs

    def _create_completion(self, system_prompt: str, text: str, *, stream: bool, label: str):
        """Call ``chat.completions.create``, retrying transient API errors.

        V01: errors listed in ``_RETRYABLE_ERRORS`` are retried up to
        ``_MAX_RETRIES`` times with exponential backoff starting at
        ``_RETRY_BASE_DELAY`` seconds.

        Args:
            system_prompt: System prompt for the request.
            text: Transcript to format.
            stream: Forwarded as the ``stream`` argument of the API call.
            label: Prefix used in log messages ("LLM" or "LLM-Stream").

        Returns:
            The raw response (or stream object) returned by the client.

        Raises:
            ProviderError: On authentication failure, non-retryable API
                error, or exhausted retries.
        """
        last_error: Exception | None = None
        for attempt in range(1, _MAX_RETRIES + 1):
            try:
                return self._client.chat.completions.create(
                    model=self._model,
                    messages=self._build_messages(system_prompt, text),
                    stream=stream,
                    **self._completion_kwargs(),
                )
            except groq.AuthenticationError as e:
                # A bad key will not fix itself; do not retry.
                raise ProviderError(f"API-Key ungültig: {e}") from e
            except _RETRYABLE_ERRORS as e:
                last_error = e
                if attempt < _MAX_RETRIES:
                    delay = _RETRY_BASE_DELAY * (2 ** (attempt - 1))
                    log.warning(
                        "%s-Versuch %d/%d fehlgeschlagen: %s - Retry in %.1fs",
                        label, attempt, _MAX_RETRIES, e, delay,
                    )
                    time.sleep(delay)
                else:
                    log.error("%s: Alle %d Versuche fehlgeschlagen", label, _MAX_RETRIES)
            except groq.APIError as e:
                raise ProviderError(f"GroqCloud API-Fehler: {e}") from e

        # All retries exhausted: map the last transient error to a message.
        if isinstance(last_error, groq.RateLimitError):
            raise ProviderError(f"Rate Limit erreicht: {last_error}") from last_error
        if isinstance(last_error, groq.APITimeoutError):
            raise ProviderError(f"GroqCloud Timeout: {last_error}") from last_error
        raise ProviderError(f"Keine Verbindung zu GroqCloud: {last_error}") from last_error

    def format_text(self, system_prompt: str, text: str) -> str:
        """Format ``text`` with the LLM in a single, non-streaming request.

        Args:
            system_prompt: System prompt describing the desired formatting.
            text: Transcript to format.

        Returns:
            The stripped content of the first choice, or an empty string if
            that content is empty.

        Raises:
            ProviderError: On authentication failure, API error, or exhausted
                retries.
        """
        log.info("LLM-Anfrage (non-streaming, Modell: %s, Temperature: %s)", self._model, self._temperature)
        self._last_usage = None

        response = self._create_completion(system_prompt, text, stream=False, label="LLM")

        # Record usage data when the response carries it.
        usage = getattr(response, "usage", None)
        if usage:
            self._last_usage = {
                "prompt_tokens": usage.prompt_tokens or 0,
                "completion_tokens": usage.completion_tokens or 0,
            }

        result = (response.choices[0].message.content or "").strip()
        log.info("LLM-Ergebnis: %d Zeichen", len(result))
        return result

    def format_text_stream(self, system_prompt: str, text: str) -> Iterator[str]:
        """Format ``text`` with the LLM and yield the result incrementally.

        This is a generator, so the request (including retries) is only sent
        once iteration starts.

        Args:
            system_prompt: System prompt describing the desired formatting.
            text: Transcript to format.

        Yields:
            Non-empty content deltas in the order they arrive.

        Raises:
            ProviderError: If the stream cannot be opened, or if it fails
                mid-iteration (the message states how many characters had
                already been yielded).
        """
        log.info("LLM-Anfrage (streaming, Modell: %s, Temperature: %s)", self._model, self._temperature)
        self._last_usage = None

        stream = self._create_completion(system_prompt, text, stream=True, label="LLM-Stream")

        # V02: iterate inside try/except so a connection drop during streaming
        # is reported as ProviderError.
        total_chars = 0
        try:
            for chunk in stream:
                # A chunk without choices carries no text; skip it instead of
                # aborting an otherwise healthy stream with an IndexError.
                delta = chunk.choices[0].delta.content if chunk.choices else None
                if delta:
                    total_chars += len(delta)
                    yield delta
                # Groq streaming: usage arrives via ``x_groq`` on a chunk.
                x_groq = getattr(chunk, "x_groq", None)
                usage = getattr(x_groq, "usage", None) if x_groq else None
                if usage:
                    self._last_usage = {
                        "prompt_tokens": getattr(usage, "prompt_tokens", 0) or 0,
                        "completion_tokens": getattr(usage, "completion_tokens", 0) or 0,
                    }
        except Exception as e:
            # groq.APIError is an Exception subclass, so one handler covers
            # both API failures and unexpected errors during iteration.
            raise ProviderError(
                f"LLM-Stream abgebrochen nach {total_chars} Zeichen: {e}"
            ) from e
        log.info("LLM-Stream abgeschlossen: %d Zeichen", total_chars)
|