karaoke-gen 0.81.1__py3-none-any.whl → 0.86.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- karaoke_gen/instrumental_review/static/index.html +2 -2
- karaoke_gen/lyrics_processor.py +9 -5
- {karaoke_gen-0.81.1.dist-info → karaoke_gen-0.86.5.dist-info}/METADATA +2 -2
- {karaoke_gen-0.81.1.dist-info → karaoke_gen-0.86.5.dist-info}/RECORD +20 -21
- lyrics_transcriber/core/controller.py +16 -5
- lyrics_transcriber/correction/agentic/agent.py +19 -7
- lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +178 -5
- lyrics_transcriber/correction/agentic/prompts/__init__.py +23 -0
- lyrics_transcriber/correction/agentic/prompts/classifier.py +66 -6
- lyrics_transcriber/correction/agentic/prompts/langfuse_prompts.py +298 -0
- lyrics_transcriber/correction/agentic/providers/config.py +19 -6
- lyrics_transcriber/correction/agentic/providers/constants.py +1 -1
- lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +125 -20
- lyrics_transcriber/correction/agentic/providers/model_factory.py +58 -25
- lyrics_transcriber/correction/agentic/providers/response_parser.py +18 -6
- lyrics_transcriber/correction/agentic/router.py +2 -1
- lyrics_transcriber/correction/corrector.py +44 -49
- lyrics_transcriber/correction/handlers/llm.py +0 -293
- lyrics_transcriber/correction/handlers/llm_providers.py +0 -60
- {karaoke_gen-0.81.1.dist-info → karaoke_gen-0.86.5.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.81.1.dist-info → karaoke_gen-0.86.5.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.81.1.dist-info → karaoke_gen-0.86.5.dist-info}/licenses/LICENSE +0 -0
lyrics_transcriber/correction/agentic/providers/response_parser.py

@@ -51,22 +51,34 @@ class ResponseParser:
 
     def _attempt_json_fix(self, content: str) -> str:
         """Attempt to fix common JSON formatting issues.
-
+
         Args:
            content: Raw JSON string
-
+
        Returns:
            Fixed JSON string (or original if no fixes applied)
        """
+        import re
+
+        # Fix 0: Strip markdown code fences (```json ... ``` or ``` ... ```)
+        # Models often wrap JSON in markdown code blocks
+        fixed = content.strip()
+        if fixed.startswith("```"):
+            # Remove opening fence (with optional language identifier)
+            fixed = re.sub(r'^```\w*\s*\n?', '', fixed)
+            # Remove closing fence
+            fixed = re.sub(r'\n?```\s*$', '', fixed)
+            fixed = fixed.strip()
+            logger.debug("🤖 Stripped markdown code fences from response")
+
         # Fix 1: Replace invalid escape sequences like \' with '
         # (JSON only allows \", \\, \/, \b, \f, \n, \r, \t)
-        fixed = content.replace("\\'", "'")
-
+        fixed = fixed.replace("\\'", "'")
+
         # Fix 2: Remove any trailing commas before } or ]
-        import re
         fixed = re.sub(r',\s*}', '}', fixed)
         fixed = re.sub(r',\s*]', ']', fixed)
-
+
         return fixed
 
     def _normalize_json_response(self, data: Any) -> List[Dict[str, Any]]:
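As an illustration of the new clean-up sequence, here is a minimal standalone sketch (the real logic lives in `ResponseParser._attempt_json_fix` and logs through the package logger; the sample input below is invented):

```python
import json
import re

def attempt_json_fix(content: str) -> str:
    fixed = content.strip()
    # Strip markdown code fences such as ```json ... ```
    if fixed.startswith("```"):
        fixed = re.sub(r'^```\w*\s*\n?', '', fixed)
        fixed = re.sub(r'\n?```\s*$', '', fixed)
        fixed = fixed.strip()
    # Replace the invalid \' escape (JSON allows only \", \\, \/, \b, \f, \n, \r, \t)
    fixed = fixed.replace("\\'", "'")
    # Drop trailing commas before } or ]
    fixed = re.sub(r',\s*}', '}', fixed)
    fixed = re.sub(r',\s*]', ']', fixed)
    return fixed

raw = "```json\n{\"word\": \"don\\'t\", \"confidence\": 0.9,}\n```"
print(json.loads(attempt_json_fix(raw)))  # {'word': "don't", 'confidence': 0.9}
```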
lyrics_transcriber/correction/agentic/router.py

@@ -6,6 +6,7 @@ from typing import Dict, Any
 from .providers.config import ProviderConfig
 
 # Default model for cloud deployments - Gemini 3 Flash via Vertex AI
+# Note: Gemini 3 models require 'global' location (not regional like us-central1)
 DEFAULT_CLOUD_MODEL = "vertexai/gemini-3-flash-preview"
 
 
@@ -33,7 +34,7 @@ class ModelRouter:
         if self._config.privacy_mode:
            return "ollama/llama3.2:latest"
 
-        # Default to Gemini 3 Flash for all cases (fast, cost-effective)
+        # Default to Gemini 3 Flash for all cases (fast, cost-effective, latest capabilities)
         return DEFAULT_CLOUD_MODEL
 
 
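The resulting routing rule is small enough to restate as a sketch, reduced here to a free function for illustration (the real `ModelRouter` reads `privacy_mode` from `ProviderConfig`):

```python
DEFAULT_CLOUD_MODEL = "vertexai/gemini-3-flash-preview"

def select_model(privacy_mode: bool) -> str:
    # Privacy mode keeps inference local via Ollama
    if privacy_mode:
        return "ollama/llama3.2:latest"
    # Otherwise use Gemini 3 Flash; note it needs the 'global'
    # Vertex AI location rather than a regional one like us-central1
    return DEFAULT_CLOUD_MODEL

assert select_model(True).startswith("ollama/")
assert select_model(False) == DEFAULT_CLOUD_MODEL
```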
lyrics_transcriber/correction/corrector.py

@@ -4,9 +4,9 @@ from pathlib import Path
 from copy import deepcopy
 import os
 import shortuuid
+import time
 
 from lyrics_transcriber.correction.handlers.levenshtein import LevenshteinHandler
-from lyrics_transcriber.correction.handlers.llm import LLMHandler
 from lyrics_transcriber.correction.handlers.no_space_punct_match import NoSpacePunctuationMatchHandler
 from lyrics_transcriber.correction.handlers.relaxed_word_count_match import RelaxedWordCountMatchHandler
 from lyrics_transcriber.correction.handlers.repeat import RepeatCorrectionHandler
@@ -27,7 +27,6 @@ from lyrics_transcriber.correction.anchor_sequence import AnchorSequenceFinder
 from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
 from lyrics_transcriber.correction.handlers.extend_anchor import ExtendAnchorHandler
 from lyrics_transcriber.utils.word_utils import WordUtils
-from lyrics_transcriber.correction.handlers.llm_providers import OllamaProvider, OpenAIProvider
 
 
 class LyricsCorrector:
@@ -57,60 +56,18 @@ class LyricsCorrector:
         ]
 
         # Create all handlers but respect enabled_handlers if provided
+        # Note: Legacy LLMHandler removed - use AgenticCorrector via USE_AGENTIC_AI=1 instead
         all_handlers = [
             ("ExtendAnchorHandler", ExtendAnchorHandler(logger=self.logger)),
             ("WordCountMatchHandler", WordCountMatchHandler(logger=self.logger)),
             ("SyllablesMatchHandler", SyllablesMatchHandler(logger=self.logger)),
             ("RelaxedWordCountMatchHandler", RelaxedWordCountMatchHandler(logger=self.logger)),
             ("NoSpacePunctuationMatchHandler", NoSpacePunctuationMatchHandler(logger=self.logger)),
-            (
-                "LLMHandler_Ollama_R17B",
-                LLMHandler(
-                    provider=OllamaProvider(model="deepseek-r1:7b", logger=self.logger),
-                    name="LLMHandler_Ollama_R17B",
-                    logger=self.logger,
-                    cache_dir=self._cache_dir,
-                ),
-            ),
             ("RepeatCorrectionHandler", RepeatCorrectionHandler(logger=self.logger)),
             ("SoundAlikeHandler", SoundAlikeHandler(logger=self.logger)),
             ("LevenshteinHandler", LevenshteinHandler(logger=self.logger)),
         ]
 
-        # Add OpenRouter handlers only if API key is available
-        if os.getenv("OPENROUTER_API_KEY"):
-            openrouter_handlers = [
-                (
-                    "LLMHandler_OpenRouter_Sonnet",
-                    LLMHandler(
-                        provider=OpenAIProvider(
-                            model="anthropic/claude-3-sonnet",
-                            api_key=os.getenv("OPENROUTER_API_KEY"),
-                            base_url="https://openrouter.ai/api/v1",
-                            logger=self.logger,
-                        ),
-                        name="LLMHandler_OpenRouter_Sonnet",
-                        logger=self.logger,
-                        cache_dir=self._cache_dir,
-                    ),
-                ),
-                (
-                    "LLMHandler_OpenRouter_R1",
-                    LLMHandler(
-                        provider=OpenAIProvider(
-                            model="deepseek/deepseek-r1",
-                            api_key=os.getenv("OPENROUTER_API_KEY"),
-                            base_url="https://openrouter.ai/api/v1",
-                            logger=self.logger,
-                        ),
-                        name="LLMHandler_OpenRouter_R1",
-                        logger=self.logger,
-                        cache_dir=self._cache_dir,
-                    ),
-                ),
-            ]
-            all_handlers.extend(openrouter_handlers)
-
         # Store all handler information
         self.all_handlers = [
             {
@@ -151,8 +108,24 @@ class LyricsCorrector:
         transcription_results: List[TranscriptionResult],
         lyrics_results: Dict[str, LyricsData],
         metadata: Optional[Dict[str, Any]] = None,
+        agentic_deadline: Optional[float] = None,
     ) -> CorrectionResult:
-        """Execute the correction process."""
+        """Execute the correction process.
+
+        Args:
+            transcription_results: List of transcription results to correct.
+            lyrics_results: Dictionary of lyrics data from various sources.
+            metadata: Optional metadata including artist, title, audio file hash.
+            agentic_deadline: Optional Unix timestamp (from time.time()). If agentic
+                correction is still running after this time, it will abort and return
+                uncorrected results for human review.
+
+        Note:
+            The deadline is checked between gap iterations, not during LLM processing.
+            A single long-running LLM call may exceed the deadline. The caller should
+            wrap this method with an outer timeout (e.g., asyncio.wait_for) as a safety
+            net for hung operations.
+        """
         # Optional agentic routing flag from environment; default off for safety
         agentic_enabled = os.getenv("USE_AGENTIC_AI", "").lower() in {"1", "true", "yes"}
         self.logger.info(f"🤖 AGENTIC MODE: {'ENABLED' if agentic_enabled else 'DISABLED'} (USE_AGENTIC_AI={os.getenv('USE_AGENTIC_AI', 'NOT_SET')})")
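The Note above recommends an outer timeout; here is a minimal caller-side sketch of that pattern, assuming the public entry point is `LyricsCorrector.run` and a five-minute budget (both are assumptions for illustration, not confirmed by this diff):

```python
import asyncio
import time

AGENTIC_BUDGET_SECS = 300  # assumed overall budget for agentic correction

async def correct_with_timeout(corrector, transcription_results, lyrics_results, metadata=None):
    deadline = time.time() + AGENTIC_BUDGET_SECS
    # The outer timeout is padded because the deadline is only checked between
    # gap iterations, so one long LLM call can overshoot it.
    return await asyncio.wait_for(
        asyncio.to_thread(
            corrector.run,  # assumed entry point on LyricsCorrector
            transcription_results,
            lyrics_results,
            metadata=metadata,
            agentic_deadline=deadline,
        ),
        timeout=AGENTIC_BUDGET_SECS + 60,
    )
```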
@@ -176,9 +149,9 @@ class LyricsCorrector:
         # Store anchor sequences for use in correction handlers
         self._anchor_sequences = anchor_sequences
 
-        # Process corrections with metadata
+        # Process corrections with metadata and optional deadline for agentic timeout
         corrections, corrected_segments, correction_steps, word_id_map, segment_id_map = self._process_corrections(
-            primary_transcription.segments, gap_sequences, metadata=metadata
+            primary_transcription.segments, gap_sequences, metadata=metadata, deadline=agentic_deadline
         )
 
         # Calculate correction ratio
@@ -222,10 +195,22 @@ class LyricsCorrector:
         return leading_space + new_word.strip() + trailing_space
 
     def _process_corrections(
-        self, segments: List[LyricsSegment], gap_sequences: List[GapSequence], metadata: Optional[Dict[str, Any]] = None
+        self, segments: List[LyricsSegment], gap_sequences: List[GapSequence], metadata: Optional[Dict[str, Any]] = None,
+        deadline: Optional[float] = None
     ) -> Tuple[List[WordCorrection], List[LyricsSegment], List[CorrectionStep], Dict[str, str], Dict[str, str]]:
         """Process corrections using handlers.
 
+        Args:
+            segments: List of lyrics segments to process.
+            gap_sequences: List of gap sequences to correct.
+            metadata: Optional metadata including artist, title, audio file hash.
+            deadline: Optional Unix timestamp (from time.time()). When agentic mode is
+                enabled and this deadline is exceeded, remaining gaps are skipped and
+                the method returns with whatever corrections have been made (likely none).
+
+        Returns:
+            Tuple of (corrections, corrected_segments, correction_steps, word_id_map, segment_id_map).
+
         The correction flow works as follows:
         1. First pass: Process all gaps
            - Iterate through each gap sequence
@@ -459,6 +444,16 @@ class LyricsCorrector:
             # === END TEMPORARY CODE ===
 
         for i, gap in enumerate(gap_sequences, 1):
+            # Check deadline before processing each gap (agentic mode only)
+            # This allows us to abort early and return uncorrected results for human review
+            if deadline and use_agentic_env and time.time() > deadline:
+                self.logger.warning(
+                    f"⏰ AGENTIC TIMEOUT: Deadline exceeded after processing {i-1}/{len(gap_sequences)} gaps. "
+                    "Skipping remaining gaps - human review will correct any issues."
+                )
+                # Break out of loop - continue with whatever corrections we have (likely none)
+                break
+
             self.logger.info(f"Processing gap {i}/{len(gap_sequences)} at position {gap.transcription_position}")
 
             # Get the actual words for logging
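A toy sketch of why this timeout is "soft": the check runs only between iterations, so an item that has already started always finishes (here `time.sleep` stands in for an LLM call):

```python
import time

def process(gaps, deadline):
    handled = []
    for gap in gaps:
        # Give up between iterations once the budget is spent
        if time.time() > deadline:
            break  # remaining gaps are left for human review
        time.sleep(gap)          # stand-in for one LLM call lasting `gap` seconds
        handled.append(gap)
    return handled

# With a 1-second budget, the 5-second "gap" still completes once started:
print(process([0.5, 5, 0.5], deadline=time.time() + 1))  # -> [0.5, 5]
```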
lyrics_transcriber/correction/handlers/llm.py (file removed)

@@ -1,293 +0,0 @@
-from typing import List, Optional, Tuple, Dict, Any, Union
-import logging
-import json
-from datetime import datetime
-from pathlib import Path
-
-from lyrics_transcriber.types import GapSequence, WordCorrection
-from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
-from lyrics_transcriber.correction.handlers.word_operations import WordOperations
-from lyrics_transcriber.correction.handlers.llm_providers import LLMProvider
-
-
-class LLMHandler(GapCorrectionHandler):
-    """Uses an LLM to analyze and correct gaps by comparing with reference lyrics."""
-
-    def __init__(
-        self, provider: LLMProvider, name: str, logger: Optional[logging.Logger] = None, cache_dir: Optional[Union[str, Path]] = None
-    ):
-        super().__init__(logger)
-        self.logger = logger or logging.getLogger(__name__)
-        self.provider = provider
-        self.name = name
-        self.cache_dir = Path(cache_dir) if cache_dir else None
-
-    def _format_prompt(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> str:
-        """Format the prompt for the LLM with context about the gap and reference lyrics."""
-        word_map = data.get("word_map", {})
-        metadata = data.get("metadata", {}) if data else {}
-
-        if not word_map:
-            self.logger.error("No word_map provided in data")
-            return ""
-
-        # Format transcribed words with their IDs
-        transcribed_words = [{"id": word_id, "text": word_map[word_id].text} for word_id in gap.transcribed_word_ids if word_id in word_map]
-
-        prompt = (
-            "You are a lyrics correction expert. You will be given transcribed lyrics that may contain errors "
-            "and reference lyrics from multiple sources. Your task is to analyze each word in the transcribed text "
-            "and suggest specific corrections based on the reference lyrics.\n\n"
-            "Each word has a unique ID. When suggesting corrections, you must specify the ID of the word being corrected. "
-            "This ensures accuracy in applying your corrections.\n\n"
-            "For each correction, specify:\n"
-            "1. The word ID being corrected\n"
-            "2. The correction type ('replace', 'split', 'combine', or 'delete')\n"
-            "3. The corrected text\n"
-            "4. Your confidence level\n"
-            "5. The reason for the correction\n\n"
-        )
-
-        # Add song context if available
-        if metadata and metadata.get("artist") and metadata.get("title"):
-            prompt += f"Song: {metadata['title']}\nArtist: {metadata['artist']}\n\n"
-
-        # Format transcribed words with IDs
-        prompt += "Transcribed words:\n"
-        for word in transcribed_words:
-            prompt += f"- ID: {word['id']}, Text: '{word['text']}'\n"
-
-        prompt += "\nReference lyrics from different sources:\n"
-
-        # Add each reference source with words and their IDs
-        for source, word_ids in gap.reference_word_ids.items():
-            reference_words = [{"id": word_id, "text": word_map[word_id].text} for word_id in word_ids if word_id in word_map]
-            prompt += f"\n{source} immediate context:\n"
-            for word in reference_words:
-                prompt += f"- ID: {word['id']}, Text: '{word['text']}'\n"
-
-            # Add full lyrics if available
-            if metadata and metadata.get("full_reference_texts", {}).get(source):
-                prompt += f"\nFull {source} lyrics:\n{metadata['full_reference_texts'][source]}\n"
-
-        # Add context about surrounding anchors if available
-        if gap.preceding_anchor_id:
-            preceding_anchor = next((a.anchor for a in data.get("anchor_sequences", []) if a.anchor.id == gap.preceding_anchor_id), None)
-            if preceding_anchor:
-                anchor_words = [
-                    {"id": word_id, "text": word_map[word_id].text}
-                    for word_id in preceding_anchor.transcribed_word_ids
-                    if word_id in word_map
-                ]
-                prompt += "\nPreceding correct words:\n"
-                for word in anchor_words:
-                    prompt += f"- ID: {word['id']}, Text: '{word['text']}'\n"
-
-        prompt += (
-            "\nProvide corrections in the following JSON format:\n"
-            "{\n"
-            '  "corrections": [\n'
-            "    {\n"
-            '      "word_id": "id_of_word_to_correct",\n'
-            '      "type": "replace|split|combine|delete",\n'
-            '      "corrected_text": "new text",\n'
-            '      "reference_word_id": "id_from_reference_lyrics", // Optional, use when matching a specific reference word\n'
-            '      "confidence": 0.9,\n'
-            '      "reason": "explanation of correction"\n'
-            "    }\n"
-            "  ]\n"
-            "}\n\n"
-            "Important rules:\n"
-            "1. Always include the word_id for each correction\n"
-            "2. For 'split' type, corrected_text should contain the space-separated words\n"
-            "3. For 'combine' type, word_id should be the first word to combine\n"
-            "4. Include reference_word_id when the correction matches a specific reference word\n"
-            "5. Only suggest corrections when you're confident they improve the lyrics\n"
-            "6. Preserve any existing words that match the reference lyrics\n"
-            "7. Respond ONLY with the JSON object, no other text"
-        )
-
-        return prompt
-
-    def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
-        """LLM handler can attempt to handle any gap with reference words."""
-        if not gap.reference_word_ids:
-            self.logger.debug("No reference words available")
-            return False, {}
-
-        return True, {}
-
-    def _write_debug_info(self, prompt: str, response: str, gap_index: int, audio_file_hash: Optional[str] = None) -> None:
-        """Write prompt and response to debug files."""
-        if not self.cache_dir:
-            self.logger.warning("No cache directory provided, skipping LLM debug output")
-            return
-
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        debug_dir = self.cache_dir / "llm_debug"
-        debug_dir.mkdir(exist_ok=True, parents=True)
-
-        hash_prefix = f"{audio_file_hash}_" if audio_file_hash else ""
-        filename = debug_dir / f"llm_debug_{hash_prefix}{gap_index}_{timestamp}.txt"
-
-        debug_content = "=== LLM PROMPT ===\n" f"{prompt}\n\n" "=== LLM RESPONSE ===\n" f"{response}\n"
-
-        try:
-            with open(filename, "w", encoding="utf-8") as f:
-                f.write(debug_content)
-        except IOError as e:
-            self.logger.error(f"Failed to write LLM debug file: {e}")
-
-    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
-        """Process the gap using the LLM and create corrections based on its response."""
-        if not data or "word_map" not in data:
-            self.logger.error("No word_map provided in data")
-            return []
-
-        word_map = data["word_map"]
-        transcribed_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids if word_id in word_map]
-
-        # Calculate reference positions using the centralized method
-        reference_positions = (
-            WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", [])) or {}
-        )  # Ensure empty dict if None
-
-        prompt = self._format_prompt(gap, data)
-        if not prompt:
-            return []
-
-        # Get a unique index for this gap based on its position
-        gap_index = gap.transcription_position
-
-        try:
-            self.logger.debug(f"Processing gap words: {transcribed_words}")
-            self.logger.debug(f"Reference word IDs: {gap.reference_word_ids}")
-
-            response = self.provider.generate_response(prompt)
-
-            # Write debug info to files
-            self._write_debug_info(prompt, response, gap_index, audio_file_hash=data.get("audio_file_hash"))
-
-            try:
-                corrections_data = json.loads(response)
-            except json.JSONDecodeError as e:
-                self.logger.error(f"Failed to parse LLM response as JSON: {e}")
-                self.logger.error(f"Raw response content: {response}")
-                return []
-
-            # Check if corrections exist and are non-empty
-            if not corrections_data.get("corrections"):
-                self.logger.debug("No corrections suggested by LLM")
-                return []
-
-            corrections = []
-            for correction in corrections_data["corrections"]:
-                # Validate word_id exists in gap
-                if correction["word_id"] not in gap.transcribed_word_ids:
-                    self.logger.error(f"LLM suggested correction for word_id {correction['word_id']} which is not in the gap")
-                    continue
-
-                # Get original word from word map
-                original_word = word_map[correction["word_id"]]
-                position = gap.transcription_position + gap.transcribed_word_ids.index(correction["word_id"])
-
-                self.logger.debug(f"Processing correction: {correction}")
-
-                if correction["type"] == "replace":
-                    self.logger.debug(
-                        f"Creating replacement: '{original_word.text}' -> '{correction['corrected_text']}' " f"at position {position}"
-                    )
-                    corrections.append(
-                        WordOperations.create_word_replacement_correction(
-                            original_word=original_word.text,
-                            corrected_word=correction["corrected_text"],
-                            original_position=position,
-                            source="LLM",
-                            confidence=correction["confidence"],
-                            reason=correction["reason"],
-                            handler=self.name,
-                            reference_positions=reference_positions,
-                            original_word_id=correction["word_id"],
-                            corrected_word_id=correction.get("reference_word_id"),
-                        )
-                    )
-                elif correction["type"] == "split":
-                    split_words = correction["corrected_text"].split()
-                    self.logger.debug(f"Creating split: '{original_word.text}' -> {split_words} " f"at position {position}")
-
-                    # Get reference word IDs if provided
-                    reference_word_ids = correction.get("reference_word_ids", [None] * len(split_words))
-
-                    corrections.extend(
-                        WordOperations.create_word_split_corrections(
-                            original_word=original_word.text,
-                            reference_words=split_words,
-                            original_position=position,
-                            source="LLM",
-                            confidence=correction["confidence"],
-                            reason=correction["reason"],
-                            handler=self.name,
-                            reference_positions=reference_positions,
-                            original_word_id=correction["word_id"],
-                            corrected_word_ids=reference_word_ids,
-                        )
-                    )
-                elif correction["type"] == "combine":
-                    # Get all word IDs to combine
-                    word_ids_to_combine = []
-                    current_idx = gap.transcribed_word_ids.index(correction["word_id"])
-                    words_needed = len(correction["corrected_text"].split())
-
-                    if current_idx + words_needed <= len(gap.transcribed_word_ids):
-                        word_ids_to_combine = gap.transcribed_word_ids[current_idx : current_idx + words_needed]
-                    else:
-                        self.logger.error(f"Not enough words available to combine at position {position}")
-                        continue
-
-                    words_to_combine = [word_map[word_id].text for word_id in word_ids_to_combine]
-
-                    self.logger.debug(
-                        f"Creating combine: {words_to_combine} -> '{correction['corrected_text']}' " f"at position {position}"
-                    )
-
-                    corrections.extend(
-                        WordOperations.create_word_combine_corrections(
-                            original_words=words_to_combine,
-                            reference_word=correction["corrected_text"],
-                            original_position=position,
-                            source="LLM",
-                            confidence=correction["confidence"],
-                            combine_reason=correction["reason"],
-                            delete_reason=f"Part of combining words: {correction['reason']}",
-                            handler=self.name,
-                            reference_positions=reference_positions,
-                            original_word_ids=word_ids_to_combine,
-                            corrected_word_id=correction.get("reference_word_id"),
-                        )
-                    )
-                elif correction["type"] == "delete":
-                    self.logger.debug(f"Creating deletion: '{original_word.text}' at position {position}")
-                    corrections.append(
-                        WordCorrection(
-                            original_word=original_word.text,
-                            corrected_word="",
-                            segment_index=0,
-                            original_position=position,
-                            confidence=correction["confidence"],
-                            source="LLM",
-                            reason=correction["reason"],
-                            alternatives={},
-                            is_deletion=True,
-                            handler=self.name,
-                            reference_positions=reference_positions,
-                            word_id=correction["word_id"],
-                            corrected_word_id=None,
-                        )
-                    )
-
-            self.logger.debug(f"Created {len(corrections)} corrections: {[f'{c.original_word}->{c.corrected_word}' for c in corrections]}")
-            return corrections
-
-        except Exception as e:
-            self.logger.error(f"Unexpected error in LLM handler: {e}")
-            return []
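For reference, the removed prompt asked the model to answer with JSON of this shape (the IDs and values below are made up for illustration):

```python
# Example response matching the removed prompt's JSON schema (all values hypothetical)
example_response = {
    "corrections": [
        {
            "word_id": "word_42",          # ID of the transcribed word being corrected
            "type": "replace",             # one of: replace | split | combine | delete
            "corrected_text": "tonight",
            "reference_word_id": "ref_7",  # optional: matching reference word
            "confidence": 0.9,
            "reason": "matches both reference sources",
        }
    ]
}
```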
lyrics_transcriber/correction/handlers/llm_providers.py (file removed)

@@ -1,60 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Optional
-import logging
-from ollama import chat as ollama_chat
-import openai
-
-
-class LLMProvider(ABC):
-    """Abstract base class for LLM providers."""
-
-    def __init__(self, logger: Optional[logging.Logger] = None):
-        self.logger = logger or logging.getLogger(__name__)
-
-    @abstractmethod
-    def generate_response(self, prompt: str, **kwargs) -> str:
-        """Generate a response from the LLM.
-
-        Args:
-            prompt: The prompt to send to the LLM
-            **kwargs: Additional provider-specific parameters
-
-        Returns:
-            str: The LLM's response
-        """
-        pass
-
-
-class OllamaProvider(LLMProvider):
-    """Provider for local Ollama models."""
-
-    def __init__(self, model: str, logger: Optional[logging.Logger] = None):
-        super().__init__(logger)
-        self.model = model
-
-    def generate_response(self, prompt: str, **kwargs) -> str:
-        try:
-            response = ollama_chat(model=self.model, messages=[{"role": "user", "content": prompt}], format="json")
-            return response.message.content
-        except Exception as e:
-            self.logger.error(f"Error generating Ollama response: {e}")
-            raise
-
-
-class OpenAIProvider(LLMProvider):
-    """Provider for OpenAI-compatible APIs (including OpenRouter)."""
-
-    def __init__(self, model: str, api_key: str, base_url: Optional[str] = None, logger: Optional[logging.Logger] = None):
-        super().__init__(logger)
-        self.model = model
-        self.client = openai.OpenAI(api_key=api_key, base_url=base_url)
-
-    def generate_response(self, prompt: str, **kwargs) -> str:
-        try:
-            response = self.client.chat.completions.create(
-                model=self.model, messages=[{"role": "user", "content": prompt}], response_format={"type": "json_object"}, **kwargs
-            )
-            return response.choices[0].message.content
-        except Exception as e:
-            self.logger.error(f"Error generating OpenAI response: {e}")
-            raise