supervertaler 1.9.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic; see the registry's advisory page for more details.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Translation Memory Module - SQLite Database Backend
|
|
3
|
+
|
|
4
|
+
Manages translation memory with fuzzy matching capabilities using SQLite.
|
|
5
|
+
Supports multiple TMs: Project TM, Big Mama TM, and custom TMX files.
|
|
6
|
+
|
|
7
|
+
Migrated from in-memory dictionaries to SQLite for scalability.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import xml.etree.ElementTree as ET
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from difflib import SequenceMatcher
|
|
14
|
+
from typing import Dict, List, Optional, Tuple
|
|
15
|
+
from modules.database_manager import DatabaseManager
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TM:
    """A single named translation memory held in memory.

    Stores source -> target pairs plus bookkeeping metadata, and supports
    exact lookup as well as fuzzy matching via difflib.  TMs flagged
    read-only silently ignore writes.
    """

    def __init__(self, name: str, tm_id: str, enabled: bool = True, read_only: bool = False):
        self.name = name
        self.tm_id = tm_id
        self.enabled = enabled
        self.read_only = read_only
        # source -> target mapping
        self.entries: Dict[str, str] = {}
        self.metadata = {
            'source_lang': None,
            'target_lang': None,
            'file_path': None,
            'created': datetime.now().isoformat(),
            'modified': datetime.now().isoformat(),
        }
        # Minimum similarity ratio for get_fuzzy_matches()
        self.fuzzy_threshold = 0.75

    def add_entry(self, source: str, target: str):
        """Store a translation pair; no-op when read-only or either side is empty."""
        if self.read_only or not source or not target:
            return
        self.entries[source.strip()] = target.strip()
        self.metadata['modified'] = datetime.now().isoformat()

    def get_exact_match(self, source: str) -> Optional[str]:
        """Return the stored target for *source* (whitespace-stripped), or None."""
        return self.entries.get(source.strip())

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Case-insensitive similarity ratio between two texts, in [0, 1]."""
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

    def get_fuzzy_matches(self, source: str, max_matches: int = 5) -> List[Dict]:
        """Return up to *max_matches* entries scoring at or above the threshold.

        Results are dictionaries sorted by descending similarity.
        """
        needle = source.strip()
        hits = [
            {
                'source': tm_source,
                'target': tm_target,
                'similarity': score,
                'match_pct': int(score * 100),
                'tm_name': self.name,
                'tm_id': self.tm_id,
            }
            for tm_source, tm_target in self.entries.items()
            if (score := self.calculate_similarity(needle, tm_source)) >= self.fuzzy_threshold
        ]
        hits.sort(key=lambda hit: hit['similarity'], reverse=True)
        return hits[:max_matches]

    def get_entry_count(self) -> int:
        """Number of stored translation pairs."""
        return len(self.entries)

    def to_dict(self) -> Dict:
        """Serialize this TM to a plain dictionary suitable for JSON storage."""
        return {
            'name': self.name,
            'tm_id': self.tm_id,
            'enabled': self.enabled,
            'read_only': self.read_only,
            'entries': self.entries,
            'metadata': self.metadata,
            'fuzzy_threshold': self.fuzzy_threshold,
        }

    @staticmethod
    def from_dict(data: Dict) -> 'TM':
        """Rebuild a TM from a dictionary produced by to_dict()."""
        tm = TM(
            name=data.get('name', 'Unnamed TM'),
            tm_id=data.get('tm_id', 'unknown'),
            enabled=data.get('enabled', True),
            read_only=data.get('read_only', False),
        )
        tm.entries = data.get('entries', {})
        tm.metadata = data.get('metadata', {})
        tm.fuzzy_threshold = data.get('fuzzy_threshold', 0.75)
        return tm
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class TMDatabase:
    """Manages multiple Translation Memories using SQLite backend.

    All persistence is delegated to DatabaseManager; this class adds
    language-pair handling, a TM metadata cache, TMX import with progress
    reporting, and legacy JSON (de)serialization on top of it.
    """

    def __init__(self, source_lang: str = None, target_lang: str = None, db_path: str = None, log_callback=None):
        """
        Initialize TM database

        Args:
            source_lang: Source language (e.g., "en" or "English")
            target_lang: Target language (e.g., "nl" or "Dutch")
            db_path: Path to SQLite database file (default: user_data/supervertaler.db)
            log_callback: Logging function
        """
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.log = log_callback if log_callback else print

        # Initialize database manager
        self.db = DatabaseManager(db_path=db_path, log_callback=log_callback)
        self.db.connect()

        # Set language metadata if provided (normalizes to ISO codes)
        if source_lang and target_lang:
            self.set_tm_languages(source_lang, target_lang)

        # Global fuzzy threshold
        self.fuzzy_threshold = 0.75

        # TM metadata cache (populated from database as needed)
        # Note: Legacy 'project' and 'big_mama' TMs are no longer used.
        # All TMs are now managed through TMMetadataManager and stored in
        # the translation_memories table.
        self.tm_metadata = {}

    def set_tm_languages(self, source_lang: str, target_lang: str):
        """Set language pair for TMs (converted to simple ISO codes)."""
        # Local import avoids a circular import at module load time.
        from modules.tmx_generator import get_simple_lang_code
        self.source_lang = get_simple_lang_code(source_lang)
        self.target_lang = get_simple_lang_code(target_lang)

    def _enabled_tm_ids(self) -> List[str]:
        """IDs of all TMs currently marked enabled in the metadata cache."""
        return [tm_id for tm_id, meta in self.tm_metadata.items() if meta.get('enabled', True)]

    def _tm_display_name(self, tm_id: str) -> str:
        """Human-readable TM name from the metadata cache (falls back to the id)."""
        return self.tm_metadata.get(tm_id, {}).get('name', tm_id)

    def add_entry(self, source: str, target: str, tm_id: str = 'project',
                  context_before: str = None, context_after: str = None, notes: str = None):
        """
        Add translation pair to TM

        Args:
            source: Source text
            target: Target text
            tm_id: TM identifier ('project', 'big_mama', or custom)
            context_before: Previous segment for context
            context_after: Next segment for context
            notes: Optional notes
        """
        if not source or not target:
            return

        self.db.add_translation_unit(
            source=source.strip(),
            target=target.strip(),
            source_lang=self.source_lang or 'en',
            target_lang=self.target_lang or 'nl',
            tm_id=tm_id,
            context_before=context_before,
            context_after=context_after,
            notes=notes
        )

    def add_to_project_tm(self, source: str, target: str):
        """Add entry to Project TM (convenience method)"""
        self.add_entry(source, target, tm_id='project')

    def get_exact_match(self, source: str, tm_ids: List[str] = None) -> Optional[str]:
        """
        Get exact match from TM(s)

        Args:
            source: Source text to match
            tm_ids: List of TM IDs to search (None = all enabled)

        Returns: Target text or None
        """
        if tm_ids is None:
            # Search all enabled TMs
            tm_ids = self._enabled_tm_ids()

        match = self.db.get_exact_match(
            source=source,
            tm_ids=tm_ids,
            source_lang=self.source_lang,
            target_lang=self.target_lang
        )

        return match['target_text'] if match else None

    def search_all(self, source: str, tm_ids: List[str] = None, enabled_only: bool = True, max_matches: int = 5) -> List[Dict]:
        """
        Search across multiple TMs for fuzzy matches

        Args:
            source: Source text to search for
            tm_ids: Specific TM IDs to search (None = search all)
            enabled_only: Only search enabled TMs
            max_matches: Maximum number of results

        Returns:
            List of match dictionaries sorted by similarity
        """
        # Determine which TMs to search.
        if tm_ids is None and enabled_only:
            tm_ids = self._enabled_tm_ids()

        # An empty list means "no filter": pass None so the DB searches ALL TMs.
        if tm_ids is not None and len(tm_ids) == 0:
            tm_ids = None

        # First try exact match: a 100% hit short-circuits the fuzzy search.
        exact_match = self.db.get_exact_match(
            source=source,
            tm_ids=tm_ids,
            source_lang=self.source_lang,
            target_lang=self.target_lang
        )

        if exact_match:
            return [{
                'source': exact_match['source_text'],
                'target': exact_match['target_text'],
                'similarity': 1.0,
                'match_pct': 100,
                'tm_name': self._tm_display_name(exact_match['tm_id']),
                'tm_id': exact_match['tm_id']
            }]

        # Fall back to fuzzy matching.
        fuzzy_matches = self.db.search_fuzzy_matches(
            source=source,
            tm_ids=tm_ids,
            threshold=self.fuzzy_threshold,
            max_results=max_matches,
            source_lang=self.source_lang,
            target_lang=self.target_lang
        )

        # Format matches for UI.
        return [{
            'source': match['source_text'],
            'target': match['target_text'],
            'similarity': match.get('similarity', 0.85),
            'match_pct': match.get('match_pct', 85),
            'tm_name': self._tm_display_name(match['tm_id']),
            'tm_id': match['tm_id']
        } for match in fuzzy_matches]

    def concordance_search(self, query: str, tm_ids: List[str] = None, direction: str = 'both',
                           source_lang: str = None, target_lang: str = None) -> List[Dict]:
        """
        Search for text in both source and target

        Args:
            query: Search query
            tm_ids: TM IDs to search (None = all)
            direction: 'source' = search source only, 'target' = search target only, 'both' = bidirectional
            source_lang: Filter by source language (None = any)
            target_lang: Filter by target language (None = any)

        Returns: List of matching entries
        """
        results = self.db.concordance_search(query=query, tm_ids=tm_ids, direction=direction,
                                             source_lang=source_lang, target_lang=target_lang)

        # Format for UI
        return [{
            'source': result['source_text'],
            'target': result['target_text'],
            'tm_name': self._tm_display_name(result['tm_id']),
            'tm_id': result['tm_id'],
            'created': result.get('created_date', ''),
            'usage_count': result.get('usage_count', 0)
        } for result in results]

    def get_tm_entries(self, tm_id: str, limit: int = None) -> List[Dict]:
        """
        Get all entries from a specific TM

        Args:
            tm_id: TM identifier
            limit: Maximum number of entries (None = all)

        Returns: List of entry dictionaries
        """
        entries = self.db.get_tm_entries(tm_id=tm_id, limit=limit)

        # Format for UI
        return [{
            'source': entry['source_text'],
            'target': entry['target_text'],
            'created': entry.get('created_date', ''),
            'modified': entry.get('modified_date', ''),
            'usage_count': entry.get('usage_count', 0),
            'notes': entry.get('notes', '')
        } for entry in entries]

    def get_entry_count(self, tm_id: str = None, enabled_only: bool = False) -> int:
        """
        Get entry count for TM(s)

        Args:
            tm_id: Specific TM ID (None = all)
            enabled_only: Only count enabled TMs

        Returns: Total entry count
        """
        if tm_id:
            return self.db.get_tm_count(tm_id=tm_id)

        # Count across all TMs
        if enabled_only:
            return sum(self.db.get_tm_count(t) for t in self._enabled_tm_ids())
        return self.db.get_tm_count()

    def clear_tm(self, tm_id: str):
        """Clear all entries from a TM"""
        self.db.clear_tm(tm_id=tm_id)

    def delete_entry(self, tm_id: str, source: str, target: str):
        """Delete a specific entry from a TM"""
        self.db.delete_entry(tm_id, source, target)

    def add_custom_tm(self, name: str, tm_id: str = None, read_only: bool = False):
        """Register a custom TM in the metadata cache and return its id."""
        if tm_id is None:
            tm_id = f"custom_{len(self.tm_metadata)}"

        self.tm_metadata[tm_id] = {
            'name': name,
            'enabled': True,
            'read_only': read_only
        }

        return tm_id

    def remove_custom_tm(self, tm_id: str) -> bool:
        """Remove a custom TM and its entries; built-in TMs are protected."""
        if tm_id in self.tm_metadata and tm_id not in ['project', 'big_mama']:
            # Clear entries from database
            self.clear_tm(tm_id)
            # Remove metadata
            del self.tm_metadata[tm_id]
            return True
        return False

    def get_tm_list(self, enabled_only: bool = False) -> List[Dict]:
        """
        Get list of all TMs with metadata

        Returns: List of TM info dictionaries
        """
        tm_list = []
        for tm_id, meta in self.tm_metadata.items():
            if enabled_only and not meta.get('enabled', True):
                continue

            tm_list.append({
                'tm_id': tm_id,
                'name': meta.get('name', tm_id),
                'enabled': meta.get('enabled', True),
                'read_only': meta.get('read_only', False),
                'entry_count': self.db.get_tm_count(tm_id)
            })

        return tm_list

    def get_all_tms(self, enabled_only: bool = False) -> List[Dict]:
        """Alias for get_tm_list() for backward compatibility"""
        return self.get_tm_list(enabled_only=enabled_only)

    def load_tmx_file(self, filepath: str, src_lang: str, tgt_lang: str,
                      tm_name: str = None, read_only: bool = False,
                      strip_variants: bool = True, progress_callback=None) -> tuple[str, int]:
        """
        Load TMX file into a new custom TM

        Args:
            filepath: Path to TMX file
            src_lang: Source language code
            tgt_lang: Target language code
            tm_name: Custom name for TM (default: filename)
            read_only: Make TM read-only
            strip_variants: Match base languages ignoring regional variants (default: True)
            progress_callback: Optional callback function(current, total, message) for progress updates

        Returns: (tm_id, entry_count)
        """
        if tm_name is None:
            # removesuffix (not str.replace) so '.tmx' is only stripped as a
            # trailing extension, not anywhere it appears inside the name.
            tm_name = os.path.basename(filepath).removesuffix('.tmx')

        # Create custom TM
        tm_id = f"custom_{os.path.basename(filepath).replace('.', '_')}"
        self.add_custom_tm(tm_name, tm_id, read_only=read_only)

        # Load TMX content
        loaded_count = self._load_tmx_into_db(filepath, src_lang, tgt_lang, tm_id,
                                              strip_variants=strip_variants,
                                              progress_callback=progress_callback)

        self.log(f"✓ Loaded {loaded_count} entries from {os.path.basename(filepath)}")

        return tm_id, loaded_count

    def _flush_chunk(self, chunk_buffer: List[tuple], src_lang: str, tgt_lang: str, tm_id: str):
        """Write buffered (source, target) pairs to the database and empty the buffer."""
        for src, tgt in chunk_buffer:
            self.db.add_translation_unit(
                source=src,
                target=tgt,
                source_lang=src_lang,
                target_lang=tgt_lang,
                tm_id=tm_id
            )
        chunk_buffer.clear()

    def _load_tmx_into_db(self, filepath: str, src_lang: str, tgt_lang: str, tm_id: str,
                          strip_variants: bool = False, progress_callback=None) -> int:
        """
        Internal: Load TMX content into database with chunked processing

        Args:
            filepath: Path to TMX file
            src_lang: Target source language code
            tgt_lang: Target target language code
            tm_id: TM identifier
            strip_variants: If True, match base languages ignoring regional variants
            progress_callback: Optional callback function(current, total, message) for progress updates

        Returns: Number of entries imported (0 on error).
        """
        loaded_count = 0
        chunk_size = 1000  # Process in chunks for responsiveness
        chunk_buffer = []

        try:
            # First pass: count total TUs for progress bar
            if progress_callback:
                progress_callback(0, 0, "Counting translation units...")

            tree = ET.parse(filepath)
            root = tree.getroot()
            total_tus = len(root.findall('.//tu'))

            if progress_callback:
                progress_callback(0, total_tus, f"Processing 0 / {total_tus:,} entries...")

            xml_ns = "http://www.w3.org/XML/1998/namespace"

            # Normalize language codes (local import avoids circular import)
            from modules.tmx_generator import get_simple_lang_code, get_base_lang_code
            src_lang_normalized = get_simple_lang_code(src_lang)
            tgt_lang_normalized = get_simple_lang_code(tgt_lang)

            # If stripping variants, get base codes for comparison
            if strip_variants:
                src_base = get_base_lang_code(src_lang_normalized)
                tgt_base = get_base_lang_code(tgt_lang_normalized)

            processed = 0
            for tu in root.findall('.//tu'):
                src_text, tgt_text = None, None

                for tuv_node in tu.findall('tuv'):
                    lang_attr = tuv_node.get(f'{{{xml_ns}}}lang')
                    if not lang_attr:
                        continue

                    tmx_lang = get_simple_lang_code(lang_attr)

                    seg_node = tuv_node.find('seg')
                    if seg_node is not None:
                        try:
                            text = ET.tostring(seg_node, encoding='unicode', method='text').strip()
                        except Exception:
                            # Fallback for segments tostring() cannot serialize
                            text = "".join(seg_node.itertext()).strip()

                        # Match languages (exact or base code match if stripping variants)
                        if strip_variants:
                            if get_base_lang_code(tmx_lang) == src_base:
                                src_text = text
                            elif get_base_lang_code(tmx_lang) == tgt_base:
                                tgt_text = text
                        else:
                            if tmx_lang == src_lang_normalized:
                                src_text = text
                            elif tmx_lang == tgt_lang_normalized:
                                tgt_text = text

                if src_text and tgt_text:
                    chunk_buffer.append((src_text, tgt_text))
                    loaded_count += 1

                    # Flush when buffer is full, then report progress
                    if len(chunk_buffer) >= chunk_size:
                        self._flush_chunk(chunk_buffer, src_lang_normalized, tgt_lang_normalized, tm_id)

                        if progress_callback:
                            progress_callback(processed + 1, total_tus,
                                              f"Processing {loaded_count:,} / {total_tus:,} entries...")

                processed += 1

            # Flush any remaining entries in the buffer
            if chunk_buffer:
                self._flush_chunk(chunk_buffer, src_lang_normalized, tgt_lang_normalized, tm_id)

            # Final progress update
            if progress_callback:
                progress_callback(total_tus, total_tus, f"Completed: {loaded_count:,} entries imported")

            return loaded_count
        except Exception as e:
            self.log(f"✗ Error loading TMX: {e}")
            return 0

    def detect_tmx_languages(self, filepath: str) -> List[str]:
        """Detect all language codes present in a TMX file (sorted, [] on error)."""
        try:
            tree = ET.parse(filepath)
            root = tree.getroot()
            xml_ns = "http://www.w3.org/XML/1998/namespace"

            languages = set()
            for tuv in root.findall('.//tuv'):
                lang_attr = tuv.get(f'{{{xml_ns}}}lang')
                if lang_attr:
                    languages.add(lang_attr)

            return sorted(languages)
        except Exception:
            # Unparseable / missing file: report no languages rather than crash
            return []

    def check_language_compatibility(self, tmx_langs: List[str], target_src: str, target_tgt: str) -> dict:
        """
        Analyze if TMX languages match target TM languages, handling variants.
        Returns dict with compatibility info and suggestions.
        """
        from modules.tmx_generator import get_base_lang_code

        if len(tmx_langs) < 2:
            return {'compatible': False, 'reason': 'tmx_incomplete'}

        # Base codes of the requested pair
        target_src_base = get_base_lang_code(target_src)
        target_tgt_base = get_base_lang_code(target_tgt)

        # Find the first TMX language matching each side's base code
        src_match = None
        tgt_match = None

        for tmx_lang in tmx_langs:
            if get_base_lang_code(tmx_lang) == target_src_base and src_match is None:
                src_match = tmx_lang
            if get_base_lang_code(tmx_lang) == target_tgt_base and tgt_match is None:
                tgt_match = tmx_lang

        if not src_match or not tgt_match:
            return {
                'compatible': False,
                'reason': 'no_match',
                'tmx_langs': tmx_langs,
                'target_langs': [target_src, target_tgt]
            }

        # Check if exact match or variant match
        exact_match = (src_match == target_src and tgt_match == target_tgt)

        return {
            'compatible': True,
            'exact_match': exact_match,
            'variant_match': not exact_match,
            'tmx_source': src_match,
            'tmx_target': tgt_match,
            'target_source': target_src,
            'target_target': target_tgt
        }

    def close(self):
        """Close database connection"""
        if self.db:
            self.db.close()

    def __del__(self):
        """Ensure database is closed on cleanup"""
        # Guard against partially-constructed instances (__init__ raised before
        # self.db existed) and errors raised during interpreter shutdown.
        try:
            self.close()
        except Exception:
            pass

    # Legacy compatibility methods for old JSON format
    def to_dict(self) -> Dict:
        """Export to legacy dictionary format (for JSON serialization)"""
        # NOTE: This is a legacy method - new code should use database directly
        # Exporting large databases to JSON is not recommended
        self.log("⚠️ Warning: Exporting database to dict format. Use TMX export for large datasets.")

        return {
            'project_tm': {'entries': {e['source']: e['target'] for e in self.get_tm_entries('project')}},
            'big_mama_tm': {'entries': {e['source']: e['target'] for e in self.get_tm_entries('big_mama')}},
            'custom_tms': {},
            'fuzzy_threshold': self.fuzzy_threshold
        }

    @staticmethod
    def from_dict(data: Dict, db_path: str = None, log_callback=None) -> 'TMDatabase':
        """Import from legacy dictionary format (for JSON deserialization)"""
        # NOTE: This is a legacy method - new code should use database directly
        db = TMDatabase(db_path=db_path, log_callback=log_callback)

        # Import Project TM
        if 'project_tm' in data and 'entries' in data['project_tm']:
            for src, tgt in data['project_tm']['entries'].items():
                db.add_entry(src, tgt, tm_id='project')

        # Import Big Mama TM
        if 'big_mama_tm' in data and 'entries' in data['big_mama_tm']:
            for src, tgt in data['big_mama_tm']['entries'].items():
                db.add_entry(src, tgt, tm_id='big_mama')
        elif 'main_tm' in data and 'entries' in data['main_tm']:  # Legacy support
            for src, tgt in data['main_tm']['entries'].items():
                db.add_entry(src, tgt, tm_id='big_mama')

        db.fuzzy_threshold = data.get('fuzzy_threshold', 0.75)

        return db
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
class TMAgent:
    """Legacy wrapper kept for backwards compatibility.

    Every call is forwarded to an underlying TMDatabase; new code should
    use TMDatabase directly.
    """

    def __init__(self, db_path: str = None):
        self.tm_database = TMDatabase(db_path=db_path)
        self.fuzzy_threshold = 0.75

    @property
    def tm_data(self):
        """Legacy property - returns Project TM entries as dictionary"""
        return {
            row['source']: row['target']
            for row in self.tm_database.get_tm_entries('project')
        }

    @tm_data.setter
    def tm_data(self, value: Dict[str, str]):
        """Legacy property setter - loads entries into Project TM"""
        # Replace the entire Project TM with the supplied mapping.
        self.tm_database.clear_tm('project')
        for src, tgt in value.items():
            self.tm_database.add_entry(src, tgt, tm_id='project')

    def add_entry(self, source: str, target: str):
        """Add to Project TM"""
        self.tm_database.add_to_project_tm(source, target)

    def get_exact_match(self, source: str) -> Optional[str]:
        """Search all enabled TMs for exact match"""
        return self.tm_database.get_exact_match(source)

    def get_fuzzy_matches(self, source: str, max_matches: int = 5) -> List[Tuple[str, str, float]]:
        """Legacy format - returns (source, target, similarity) tuples"""
        hits = self.tm_database.search_all(source, enabled_only=True, max_matches=max_matches)
        return [(hit['source'], hit['target'], hit['similarity']) for hit in hits]

    def get_best_match(self, source: str) -> Optional[Tuple[str, str, float]]:
        """Get best match in legacy format"""
        top = self.get_fuzzy_matches(source, max_matches=1)
        return top[0] if top else None

    def load_from_tmx(self, filepath: str, src_lang: str = "en", tgt_lang: str = "nl") -> int:
        """Legacy TMX load - loads into a new custom TM"""
        _tm_id, count = self.tm_database.load_tmx_file(filepath, src_lang, tgt_lang)
        return count

    def get_entry_count(self) -> int:
        """Get total entry count"""
        return self.tm_database.get_entry_count(enabled_only=False)

    def clear(self):
        """Clear Project TM only"""
        self.tm_database.clear_tm('project')

    def delete_entry(self, tm_id: str, source: str, target: str):
        """Delete a specific entry from a TM"""
        self.tm_database.delete_entry(tm_id, source, target)
|