supervertaler 1.9.153__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85)
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,715 @@
1
+ """
2
+ Translation Memory Module - SQLite Database Backend
3
+
4
+ Manages translation memory with fuzzy matching capabilities using SQLite.
5
+ Supports multiple TMs: Project TM, Big Mama TM, and custom TMX files.
6
+
7
+ Migrated from in-memory dictionaries to SQLite for scalability.
8
+ """
9
+
10
+ import os
11
+ import xml.etree.ElementTree as ET
12
+ from datetime import datetime
13
+ from difflib import SequenceMatcher
14
+ from typing import Dict, List, Optional, Tuple
15
+ from modules.database_manager import DatabaseManager
16
+
17
+
18
class TM:
    """A single in-memory Translation Memory.

    Holds source -> target pairs in a plain dict, plus bookkeeping metadata,
    and supports exact lookup and difflib-based fuzzy lookup.
    """

    def __init__(self, name: str, tm_id: str, enabled: bool = True, read_only: bool = False):
        self.name = name
        self.tm_id = tm_id
        self.enabled = enabled
        self.read_only = read_only
        # Mapping of source segment -> target segment.
        self.entries: Dict[str, str] = {}
        self.metadata = {
            'source_lang': None,
            'target_lang': None,
            'file_path': None,
            'created': datetime.now().isoformat(),
            'modified': datetime.now().isoformat()
        }
        # Minimum similarity for a fuzzy hit (0.0 - 1.0).
        self.fuzzy_threshold = 0.75

    def add_entry(self, source: str, target: str):
        """Store a translation pair unless the TM is read-only or a side is empty."""
        if self.read_only or not source or not target:
            return
        self.entries[source.strip()] = target.strip()
        self.metadata['modified'] = datetime.now().isoformat()

    def get_exact_match(self, source: str) -> Optional[str]:
        """Return the stored target for *source* (whitespace-stripped), or None."""
        return self.entries.get(source.strip())

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Case-insensitive similarity ratio between two texts (0.0 - 1.0)."""
        matcher = SequenceMatcher(None, text1.lower(), text2.lower())
        return matcher.ratio()

    def get_fuzzy_matches(self, source: str, max_matches: int = 5) -> List[Dict]:
        """Return up to *max_matches* entries scoring at or above
        ``fuzzy_threshold``, best matches first."""
        needle = source.strip()
        hits = [
            {
                'source': cand_source,
                'target': cand_target,
                'similarity': score,
                'match_pct': int(score * 100),
                'tm_name': self.name,
                'tm_id': self.tm_id
            }
            for cand_source, cand_target in self.entries.items()
            if (score := self.calculate_similarity(needle, cand_source)) >= self.fuzzy_threshold
        ]
        hits.sort(key=lambda hit: hit['similarity'], reverse=True)
        return hits[:max_matches]

    def get_entry_count(self) -> int:
        """Number of stored translation pairs."""
        return len(self.entries)

    def to_dict(self) -> Dict:
        """Serialize this TM to a plain dictionary (for JSON storage)."""
        return {
            'name': self.name,
            'tm_id': self.tm_id,
            'enabled': self.enabled,
            'read_only': self.read_only,
            'entries': self.entries,
            'metadata': self.metadata,
            'fuzzy_threshold': self.fuzzy_threshold
        }

    @staticmethod
    def from_dict(data: Dict) -> 'TM':
        """Rebuild a TM from a dictionary produced by to_dict()."""
        restored = TM(
            name=data.get('name', 'Unnamed TM'),
            tm_id=data.get('tm_id', 'unknown'),
            enabled=data.get('enabled', True),
            read_only=data.get('read_only', False)
        )
        restored.entries = data.get('entries', {})
        restored.metadata = data.get('metadata', {})
        restored.fuzzy_threshold = data.get('fuzzy_threshold', 0.75)
        return restored
101
+ class TMDatabase:
102
+ """Manages multiple Translation Memories using SQLite backend"""
103
+
104
+ def __init__(self, source_lang: str = None, target_lang: str = None, db_path: str = None, log_callback=None):
105
+ """
106
+ Initialize TM database
107
+
108
+ Args:
109
+ source_lang: Source language (e.g., "en" or "English")
110
+ target_lang: Target language (e.g., "nl" or "Dutch")
111
+ db_path: Path to SQLite database file (default: user_data/supervertaler.db)
112
+ log_callback: Logging function
113
+ """
114
+ self.source_lang = source_lang
115
+ self.target_lang = target_lang
116
+ self.log = log_callback if log_callback else print
117
+
118
+ # Initialize database manager
119
+ self.db = DatabaseManager(db_path=db_path, log_callback=log_callback)
120
+ self.db.connect()
121
+
122
+ # Set language metadata if provided
123
+ if source_lang and target_lang:
124
+ self.set_tm_languages(source_lang, target_lang)
125
+
126
+ # Global fuzzy threshold
127
+ self.fuzzy_threshold = 0.75
128
+
129
+ # TM metadata cache (populated from database as needed)
130
+ # Note: Legacy 'project' and 'big_mama' TMs are no longer used.
131
+ # All TMs are now managed through TMMetadataManager and stored in translation_memories table.
132
+ self.tm_metadata = {}
133
+
134
+ def set_tm_languages(self, source_lang: str, target_lang: str):
135
+ """Set language pair for TMs"""
136
+ # Convert to ISO codes
137
+ from modules.tmx_generator import get_simple_lang_code
138
+ self.source_lang = get_simple_lang_code(source_lang)
139
+ self.target_lang = get_simple_lang_code(target_lang)
140
+
141
+ def add_entry(self, source: str, target: str, tm_id: str = 'project',
142
+ context_before: str = None, context_after: str = None, notes: str = None):
143
+ """
144
+ Add translation pair to TM
145
+
146
+ Args:
147
+ source: Source text
148
+ target: Target text
149
+ tm_id: TM identifier ('project', 'big_mama', or custom)
150
+ context_before: Previous segment for context
151
+ context_after: Next segment for context
152
+ notes: Optional notes
153
+ """
154
+ if not source or not target:
155
+ return
156
+
157
+ self.db.add_translation_unit(
158
+ source=source.strip(),
159
+ target=target.strip(),
160
+ source_lang=self.source_lang or 'en',
161
+ target_lang=self.target_lang or 'nl',
162
+ tm_id=tm_id,
163
+ context_before=context_before,
164
+ context_after=context_after,
165
+ notes=notes
166
+ )
167
+
168
+ def add_to_project_tm(self, source: str, target: str):
169
+ """Add entry to Project TM (convenience method)"""
170
+ self.add_entry(source, target, tm_id='project')
171
+
172
+ def get_exact_match(self, source: str, tm_ids: List[str] = None) -> Optional[str]:
173
+ """
174
+ Get exact match from TM(s)
175
+
176
+ Args:
177
+ source: Source text to match
178
+ tm_ids: List of TM IDs to search (None = all enabled)
179
+
180
+ Returns: Target text or None
181
+ """
182
+ if tm_ids is None:
183
+ # Search all enabled TMs
184
+ tm_ids = [tm_id for tm_id, meta in self.tm_metadata.items() if meta.get('enabled', True)]
185
+
186
+ match = self.db.get_exact_match(
187
+ source=source,
188
+ tm_ids=tm_ids,
189
+ source_lang=self.source_lang,
190
+ target_lang=self.target_lang
191
+ )
192
+
193
+ return match['target_text'] if match else None
194
+
195
+ def search_all(self, source: str, tm_ids: List[str] = None, enabled_only: bool = True, max_matches: int = 5) -> List[Dict]:
196
+ """
197
+ Search across multiple TMs for fuzzy matches
198
+
199
+ Args:
200
+ source: Source text to search for
201
+ tm_ids: Specific TM IDs to search (None = search all)
202
+ enabled_only: Only search enabled TMs
203
+ max_matches: Maximum number of results
204
+
205
+ Returns:
206
+ List of match dictionaries sorted by similarity
207
+ """
208
+ print(f"[DEBUG] TMDatabase.search_all: source='{source[:50]}...', tm_ids={tm_ids}")
209
+
210
+ # Determine which TMs to search
211
+ # If tm_ids is None or empty, search ALL TMs (don't filter by tm_id)
212
+ if tm_ids is None and enabled_only:
213
+ tm_ids = [tm_id for tm_id, meta in self.tm_metadata.items() if meta.get('enabled', True)]
214
+ print(f"[DEBUG] TMDatabase.search_all: No tm_ids provided, using from metadata: {tm_ids}")
215
+
216
+ # If tm_ids is still empty, set to None to search ALL TMs
217
+ if tm_ids is not None and len(tm_ids) == 0:
218
+ tm_ids = None
219
+ print(f"[DEBUG] TMDatabase.search_all: Empty tm_ids, setting to None to search ALL")
220
+
221
+ print(f"[DEBUG] TMDatabase.search_all: Final tm_ids to search: {tm_ids}")
222
+
223
+ # First try exact match
224
+ exact_match = self.db.get_exact_match(
225
+ source=source,
226
+ tm_ids=tm_ids,
227
+ source_lang=self.source_lang,
228
+ target_lang=self.target_lang
229
+ )
230
+ print(f"[DEBUG] TMDatabase.search_all: Exact match result: {exact_match}")
231
+
232
+ if exact_match:
233
+ # Format as match dictionary
234
+ return [{
235
+ 'source': exact_match['source_text'],
236
+ 'target': exact_match['target_text'],
237
+ 'similarity': 1.0,
238
+ 'match_pct': 100,
239
+ 'tm_name': self.tm_metadata.get(exact_match['tm_id'], {}).get('name', exact_match['tm_id']),
240
+ 'tm_id': exact_match['tm_id']
241
+ }]
242
+
243
+ # Try fuzzy matches
244
+ print(f"[DEBUG] TMDatabase.search_all: Calling fuzzy search with source_lang={self.source_lang}, target_lang={self.target_lang}")
245
+ fuzzy_matches = self.db.search_fuzzy_matches(
246
+ source=source,
247
+ tm_ids=tm_ids,
248
+ threshold=self.fuzzy_threshold,
249
+ max_results=max_matches,
250
+ source_lang=self.source_lang,
251
+ target_lang=self.target_lang
252
+ )
253
+ print(f"[DEBUG] TMDatabase.search_all: Fuzzy search returned {len(fuzzy_matches)} matches")
254
+
255
+ # Format matches for UI
256
+ formatted_matches = []
257
+ for match in fuzzy_matches:
258
+ formatted_matches.append({
259
+ 'source': match['source_text'],
260
+ 'target': match['target_text'],
261
+ 'similarity': match.get('similarity', 0.85),
262
+ 'match_pct': match.get('match_pct', 85),
263
+ 'tm_name': self.tm_metadata.get(match['tm_id'], {}).get('name', match['tm_id']),
264
+ 'tm_id': match['tm_id']
265
+ })
266
+
267
+ return formatted_matches
268
+
269
+ def concordance_search(self, query: str, tm_ids: List[str] = None, direction: str = 'both',
270
+ source_lang: str = None, target_lang: str = None) -> List[Dict]:
271
+ """
272
+ Search for text in both source and target
273
+
274
+ Args:
275
+ query: Search query
276
+ tm_ids: TM IDs to search (None = all)
277
+ direction: 'source' = search source only, 'target' = search target only, 'both' = bidirectional
278
+ source_lang: Filter by source language (None = any)
279
+ target_lang: Filter by target language (None = any)
280
+
281
+ Returns: List of matching entries
282
+ """
283
+ results = self.db.concordance_search(query=query, tm_ids=tm_ids, direction=direction,
284
+ source_lang=source_lang, target_lang=target_lang)
285
+
286
+ # Format for UI
287
+ formatted = []
288
+ for result in results:
289
+ formatted.append({
290
+ 'source': result['source_text'],
291
+ 'target': result['target_text'],
292
+ 'tm_name': self.tm_metadata.get(result['tm_id'], {}).get('name', result['tm_id']),
293
+ 'tm_id': result['tm_id'],
294
+ 'created': result.get('created_date', ''),
295
+ 'usage_count': result.get('usage_count', 0)
296
+ })
297
+
298
+ return formatted
299
+
300
+ def get_tm_entries(self, tm_id: str, limit: int = None) -> List[Dict]:
301
+ """
302
+ Get all entries from a specific TM
303
+
304
+ Args:
305
+ tm_id: TM identifier
306
+ limit: Maximum number of entries (None = all)
307
+
308
+ Returns: List of entry dictionaries
309
+ """
310
+ entries = self.db.get_tm_entries(tm_id=tm_id, limit=limit)
311
+
312
+ # Format for UI
313
+ formatted = []
314
+ for entry in entries:
315
+ formatted.append({
316
+ 'source': entry['source_text'],
317
+ 'target': entry['target_text'],
318
+ 'created': entry.get('created_date', ''),
319
+ 'modified': entry.get('modified_date', ''),
320
+ 'usage_count': entry.get('usage_count', 0),
321
+ 'notes': entry.get('notes', '')
322
+ })
323
+
324
+ return formatted
325
+
326
+ def get_entry_count(self, tm_id: str = None, enabled_only: bool = False) -> int:
327
+ """
328
+ Get entry count for TM(s)
329
+
330
+ Args:
331
+ tm_id: Specific TM ID (None = all)
332
+ enabled_only: Only count enabled TMs
333
+
334
+ Returns: Total entry count
335
+ """
336
+ if tm_id:
337
+ return self.db.get_tm_count(tm_id=tm_id)
338
+
339
+ # Count all TMs
340
+ if enabled_only:
341
+ tm_ids = [tm_id for tm_id, meta in self.tm_metadata.items() if meta.get('enabled', True)]
342
+ return sum(self.db.get_tm_count(tm_id) for tm_id in tm_ids)
343
+ else:
344
+ return self.db.get_tm_count()
345
+
346
+ def clear_tm(self, tm_id: str):
347
+ """Clear all entries from a TM"""
348
+ self.db.clear_tm(tm_id=tm_id)
349
+
350
+ def delete_entry(self, tm_id: str, source: str, target: str):
351
+ """Delete a specific entry from a TM"""
352
+ self.db.delete_entry(tm_id, source, target)
353
+
354
+ def add_custom_tm(self, name: str, tm_id: str = None, read_only: bool = False):
355
+ """Register a custom TM"""
356
+ if tm_id is None:
357
+ tm_id = f"custom_{len(self.tm_metadata)}"
358
+
359
+ self.tm_metadata[tm_id] = {
360
+ 'name': name,
361
+ 'enabled': True,
362
+ 'read_only': read_only
363
+ }
364
+
365
+ return tm_id
366
+
367
+ def remove_custom_tm(self, tm_id: str) -> bool:
368
+ """Remove a custom TM and its entries"""
369
+ if tm_id in self.tm_metadata and tm_id not in ['project', 'big_mama']:
370
+ # Clear entries from database
371
+ self.clear_tm(tm_id)
372
+ # Remove metadata
373
+ del self.tm_metadata[tm_id]
374
+ return True
375
+ return False
376
+
377
+ def get_tm_list(self, enabled_only: bool = False) -> List[Dict]:
378
+ """
379
+ Get list of all TMs with metadata
380
+
381
+ Returns: List of TM info dictionaries
382
+ """
383
+ tm_list = []
384
+ for tm_id, meta in self.tm_metadata.items():
385
+ if enabled_only and not meta.get('enabled', True):
386
+ continue
387
+
388
+ tm_list.append({
389
+ 'tm_id': tm_id,
390
+ 'name': meta.get('name', tm_id),
391
+ 'enabled': meta.get('enabled', True),
392
+ 'read_only': meta.get('read_only', False),
393
+ 'entry_count': self.db.get_tm_count(tm_id)
394
+ })
395
+
396
+ return tm_list
397
+
398
+ def get_all_tms(self, enabled_only: bool = False) -> List[Dict]:
399
+ """Alias for get_tm_list() for backward compatibility"""
400
+ return self.get_tm_list(enabled_only=enabled_only)
401
+
402
+ def load_tmx_file(self, filepath: str, src_lang: str, tgt_lang: str,
403
+ tm_name: str = None, read_only: bool = False,
404
+ strip_variants: bool = True, progress_callback=None) -> tuple[str, int]:
405
+ """
406
+ Load TMX file into a new custom TM
407
+
408
+ Args:
409
+ filepath: Path to TMX file
410
+ src_lang: Source language code
411
+ tgt_lang: Target language code
412
+ tm_name: Custom name for TM (default: filename)
413
+ read_only: Make TM read-only
414
+ strip_variants: Match base languages ignoring regional variants (default: True)
415
+ progress_callback: Optional callback function(current, total, message) for progress updates
416
+
417
+ Returns: (tm_id, entry_count)
418
+ """
419
+ if tm_name is None:
420
+ tm_name = os.path.basename(filepath).replace('.tmx', '')
421
+
422
+ # Create custom TM
423
+ tm_id = f"custom_{os.path.basename(filepath).replace('.', '_')}"
424
+ self.add_custom_tm(tm_name, tm_id, read_only=read_only)
425
+
426
+ # Load TMX content
427
+ loaded_count = self._load_tmx_into_db(filepath, src_lang, tgt_lang, tm_id,
428
+ strip_variants=strip_variants,
429
+ progress_callback=progress_callback)
430
+
431
+ self.log(f"✓ Loaded {loaded_count} entries from {os.path.basename(filepath)}")
432
+
433
+ return tm_id, loaded_count
434
+
435
+ def _load_tmx_into_db(self, filepath: str, src_lang: str, tgt_lang: str, tm_id: str,
436
+ strip_variants: bool = False, progress_callback=None) -> int:
437
+ """
438
+ Internal: Load TMX content into database with chunked processing
439
+
440
+ Args:
441
+ filepath: Path to TMX file
442
+ src_lang: Target source language code
443
+ tgt_lang: Target target language code
444
+ tm_id: TM identifier
445
+ strip_variants: If True, match base languages ignoring regional variants
446
+ progress_callback: Optional callback function(current, total, message) for progress updates
447
+ """
448
+ loaded_count = 0
449
+ chunk_size = 1000 # Process in chunks for responsiveness
450
+ chunk_buffer = []
451
+
452
+ try:
453
+ # First pass: count total TUs for progress bar
454
+ if progress_callback:
455
+ progress_callback(0, 0, "Counting translation units...")
456
+
457
+ tree = ET.parse(filepath)
458
+ root = tree.getroot()
459
+ total_tus = len(root.findall('.//tu'))
460
+
461
+ if progress_callback:
462
+ progress_callback(0, total_tus, f"Processing 0 / {total_tus:,} entries...")
463
+
464
+ xml_ns = "http://www.w3.org/XML/1998/namespace"
465
+
466
+ # Normalize language codes
467
+ from modules.tmx_generator import get_simple_lang_code, get_base_lang_code
468
+ src_lang_normalized = get_simple_lang_code(src_lang)
469
+ tgt_lang_normalized = get_simple_lang_code(tgt_lang)
470
+
471
+ # If stripping variants, get base codes for comparison
472
+ if strip_variants:
473
+ src_base = get_base_lang_code(src_lang_normalized)
474
+ tgt_base = get_base_lang_code(tgt_lang_normalized)
475
+
476
+ processed = 0
477
+ for tu in root.findall('.//tu'):
478
+ src_text, tgt_text = None, None
479
+
480
+ for tuv_node in tu.findall('tuv'):
481
+ lang_attr = tuv_node.get(f'{{{xml_ns}}}lang')
482
+ if not lang_attr:
483
+ continue
484
+
485
+ tmx_lang = get_simple_lang_code(lang_attr)
486
+
487
+ seg_node = tuv_node.find('seg')
488
+ if seg_node is not None:
489
+ try:
490
+ text = ET.tostring(seg_node, encoding='unicode', method='text').strip()
491
+ except:
492
+ text = "".join(seg_node.itertext()).strip()
493
+
494
+ # Match languages (exact or base code match if stripping variants)
495
+ if strip_variants:
496
+ if get_base_lang_code(tmx_lang) == src_base:
497
+ src_text = text
498
+ elif get_base_lang_code(tmx_lang) == tgt_base:
499
+ tgt_text = text
500
+ else:
501
+ if tmx_lang == src_lang_normalized:
502
+ src_text = text
503
+ elif tmx_lang == tgt_lang_normalized:
504
+ tgt_text = text
505
+
506
+ if src_text and tgt_text:
507
+ chunk_buffer.append((src_text, tgt_text))
508
+ loaded_count += 1
509
+
510
+ # Process chunk when buffer is full
511
+ if len(chunk_buffer) >= chunk_size:
512
+ for src, tgt in chunk_buffer:
513
+ self.db.add_translation_unit(
514
+ source=src,
515
+ target=tgt,
516
+ source_lang=src_lang_normalized,
517
+ target_lang=tgt_lang_normalized,
518
+ tm_id=tm_id
519
+ )
520
+ chunk_buffer.clear()
521
+
522
+ # Update progress
523
+ if progress_callback:
524
+ progress_callback(processed + 1, total_tus,
525
+ f"Processing {loaded_count:,} / {total_tus:,} entries...")
526
+
527
+ processed += 1
528
+
529
+ # Process remaining entries in buffer
530
+ if chunk_buffer:
531
+ for src, tgt in chunk_buffer:
532
+ self.db.add_translation_unit(
533
+ source=src,
534
+ target=tgt,
535
+ source_lang=src_lang_normalized,
536
+ target_lang=tgt_lang_normalized,
537
+ tm_id=tm_id
538
+ )
539
+ chunk_buffer.clear()
540
+
541
+ # Final progress update
542
+ if progress_callback:
543
+ progress_callback(total_tus, total_tus, f"Completed: {loaded_count:,} entries imported")
544
+
545
+ return loaded_count
546
+ except Exception as e:
547
+ self.log(f"✗ Error loading TMX: {e}")
548
+ return 0
549
+
550
+ def detect_tmx_languages(self, filepath: str) -> List[str]:
551
+ """Detect all language codes present in a TMX file"""
552
+ try:
553
+ tree = ET.parse(filepath)
554
+ root = tree.getroot()
555
+ xml_ns = "http://www.w3.org/XML/1998/namespace"
556
+
557
+ languages = set()
558
+ for tuv in root.findall('.//tuv'):
559
+ lang_attr = tuv.get(f'{{{xml_ns}}}lang')
560
+ if lang_attr:
561
+ languages.add(lang_attr)
562
+
563
+ return sorted(list(languages))
564
+ except:
565
+ return []
566
+
567
+ def check_language_compatibility(self, tmx_langs: List[str], target_src: str, target_tgt: str) -> dict:
568
+ """
569
+ Analyze if TMX languages match target TM languages, handling variants.
570
+ Returns dict with compatibility info and suggestions.
571
+ """
572
+ from modules.tmx_generator import get_base_lang_code, languages_are_compatible
573
+
574
+ if len(tmx_langs) < 2:
575
+ return {'compatible': False, 'reason': 'tmx_incomplete'}
576
+
577
+ # Get base codes
578
+ tmx_bases = [get_base_lang_code(lang) for lang in tmx_langs]
579
+ target_src_base = get_base_lang_code(target_src)
580
+ target_tgt_base = get_base_lang_code(target_tgt)
581
+
582
+ # Check if we can find matching pair
583
+ src_match = None
584
+ tgt_match = None
585
+
586
+ for tmx_lang in tmx_langs:
587
+ if get_base_lang_code(tmx_lang) == target_src_base and src_match is None:
588
+ src_match = tmx_lang
589
+ if get_base_lang_code(tmx_lang) == target_tgt_base and tgt_match is None:
590
+ tgt_match = tmx_lang
591
+
592
+ if not src_match or not tgt_match:
593
+ return {
594
+ 'compatible': False,
595
+ 'reason': 'no_match',
596
+ 'tmx_langs': tmx_langs,
597
+ 'target_langs': [target_src, target_tgt]
598
+ }
599
+
600
+ # Check if exact match or variant match
601
+ exact_match = (src_match == target_src and tgt_match == target_tgt)
602
+
603
+ return {
604
+ 'compatible': True,
605
+ 'exact_match': exact_match,
606
+ 'variant_match': not exact_match,
607
+ 'tmx_source': src_match,
608
+ 'tmx_target': tgt_match,
609
+ 'target_source': target_src,
610
+ 'target_target': target_tgt
611
+ }
612
+
613
+ def close(self):
614
+ """Close database connection"""
615
+ if self.db:
616
+ self.db.close()
617
+
618
+ def __del__(self):
619
+ """Ensure database is closed on cleanup"""
620
+ self.close()
621
+
622
+ # Legacy compatibility methods for old JSON format
623
+ def to_dict(self) -> Dict:
624
+ """Export to legacy dictionary format (for JSON serialization)"""
625
+ # NOTE: This is a legacy method - new code should use database directly
626
+ # Exporting large databases to JSON is not recommended
627
+ self.log("⚠️ Warning: Exporting database to dict format. Use TMX export for large datasets.")
628
+
629
+ return {
630
+ 'project_tm': {'entries': {e['source']: e['target'] for e in self.get_tm_entries('project')}},
631
+ 'big_mama_tm': {'entries': {e['source']: e['target'] for e in self.get_tm_entries('big_mama')}},
632
+ 'custom_tms': {},
633
+ 'fuzzy_threshold': self.fuzzy_threshold
634
+ }
635
+
636
+ @staticmethod
637
+ def from_dict(data: Dict, db_path: str = None, log_callback=None) -> 'TMDatabase':
638
+ """Import from legacy dictionary format (for JSON deserialization)"""
639
+ # NOTE: This is a legacy method - new code should use database directly
640
+ db = TMDatabase(db_path=db_path, log_callback=log_callback)
641
+
642
+ # Import Project TM
643
+ if 'project_tm' in data and 'entries' in data['project_tm']:
644
+ for src, tgt in data['project_tm']['entries'].items():
645
+ db.add_entry(src, tgt, tm_id='project')
646
+
647
+ # Import Big Mama TM
648
+ if 'big_mama_tm' in data and 'entries' in data['big_mama_tm']:
649
+ for src, tgt in data['big_mama_tm']['entries'].items():
650
+ db.add_entry(src, tgt, tm_id='big_mama')
651
+ elif 'main_tm' in data and 'entries' in data['main_tm']: # Legacy support
652
+ for src, tgt in data['main_tm']['entries'].items():
653
+ db.add_entry(src, tgt, tm_id='big_mama')
654
+
655
+ db.fuzzy_threshold = data.get('fuzzy_threshold', 0.75)
656
+
657
+ return db
658
+
659
+
660
class TMAgent:
    """Backwards-compatibility facade: preserves the old TMAgent API while
    delegating all storage to a TMDatabase instance."""

    def __init__(self, db_path: str = None):
        self.tm_database = TMDatabase(db_path=db_path)
        self.fuzzy_threshold = 0.75

    @property
    def tm_data(self):
        """Legacy view: Project TM contents as a {source: target} dict."""
        return {row['source']: row['target']
                for row in self.tm_database.get_tm_entries('project')}

    @tm_data.setter
    def tm_data(self, value: Dict[str, str]):
        """Legacy setter: replace the Project TM with the given mapping."""
        self.tm_database.clear_tm('project')  # drop whatever was there
        for src, tgt in value.items():
            self.tm_database.add_entry(src, tgt, tm_id='project')

    def add_entry(self, source: str, target: str):
        """Store a pair in the Project TM."""
        self.tm_database.add_to_project_tm(source, target)

    def get_exact_match(self, source: str) -> Optional[str]:
        """Exact match across all enabled TMs."""
        return self.tm_database.get_exact_match(source)

    def get_fuzzy_matches(self, source: str, max_matches: int = 5) -> List[Tuple[str, str, float]]:
        """Fuzzy matches in the legacy (source, target, similarity) tuple form."""
        found = self.tm_database.search_all(source, enabled_only=True, max_matches=max_matches)
        return [(hit['source'], hit['target'], hit['similarity']) for hit in found]

    def get_best_match(self, source: str) -> Optional[Tuple[str, str, float]]:
        """Single best match, or None when nothing clears the threshold."""
        ranked = self.get_fuzzy_matches(source, max_matches=1)
        return ranked[0] if ranked else None

    def load_from_tmx(self, filepath: str, src_lang: str = "en", tgt_lang: str = "nl") -> int:
        """Legacy TMX import: loads into a fresh custom TM, returns the entry count."""
        _, imported = self.tm_database.load_tmx_file(filepath, src_lang, tgt_lang)
        return imported

    def get_entry_count(self) -> int:
        """Total entries across every TM (enabled or not)."""
        return self.tm_database.get_entry_count(enabled_only=False)

    def clear(self):
        """Empty the Project TM; other TMs are untouched."""
        self.tm_database.clear_tm('project')

    def delete_entry(self, tm_id: str, source: str, target: str):
        """Remove one source/target pair from the given TM."""
        self.tm_database.delete_entry(tm_id, source, target)