supervertaler-1.9.163-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. Supervertaler.py +48473 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1911 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +351 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1176 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.163.dist-info/METADATA +906 -0
  81. supervertaler-1.9.163.dist-info/RECORD +85 -0
  82. supervertaler-1.9.163.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.163.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1911 @@
1
+ """
2
+ Database Manager Module
3
+
4
+ SQLite database backend for Translation Memories, Glossaries, and related resources.
5
+ Replaces in-memory JSON-based storage with efficient database storage.
6
+
7
+ Schema includes:
8
+ - Translation units (TM entries)
9
+ - Termbase terms
10
+ - Non-translatables
11
+ - Segmentation rules
12
+ - Project metadata
13
+ - Resource file references
14
+ """
15
+
16
+ import sqlite3
17
+ import os
18
+ import json
19
+ import hashlib
20
+ from datetime import datetime
21
+ from typing import List, Dict, Optional, Tuple
22
+ from pathlib import Path
23
+ from difflib import SequenceMatcher
24
+
25
+
26
+ class DatabaseManager:
27
+ """Manages SQLite database for translation resources"""
28
+
29
+ def __init__(self, db_path: str = None, log_callback=None):
30
+ """
31
+ Initialize database manager
32
+
33
+ Args:
34
+ db_path: Path to SQLite database file (default: user_data/supervertaler.db)
35
+ log_callback: Optional logging function
36
+ """
37
+ self.log = log_callback if log_callback else print
38
+
39
+ # Set default database path if not provided
40
+ if db_path is None:
41
+ # Will be set by application - defaults to user_data folder
42
+ self.db_path = "supervertaler.db"
43
+ else:
44
+ self.db_path = db_path
45
+
46
+ self.connection = None
47
+ self.cursor = None
48
+
49
+ def connect(self):
50
+ """Connect to database and create tables if needed"""
51
+ try:
52
+ # Create directory if it doesn't exist
53
+ os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
54
+
55
+ # Connect to database
56
+ self.connection = sqlite3.connect(self.db_path)
57
+ self.connection.row_factory = sqlite3.Row # Access columns by name
58
+ self.cursor = self.connection.cursor()
59
+
60
+ # Enable foreign keys
61
+ self.cursor.execute("PRAGMA foreign_keys = ON")
62
+
63
+ # Create tables
64
+ self._create_tables()
65
+
66
+ # Run database migrations (adds new columns/tables as needed)
67
+ try:
68
+ from modules.database_migrations import check_and_migrate
69
+ migration_success = check_and_migrate(self)
70
+ if not migration_success:
71
+ self.log("[WARNING] Database migration reported failure")
72
+ except Exception as e:
73
+ self.log(f"[WARNING] Database migration check failed: {e}")
74
+ import traceback
75
+ traceback.print_exc()
76
+
77
+ # Auto-sync FTS5 index if out of sync
78
+ try:
79
+ fts_status = self.check_fts_index()
80
+ if not fts_status.get('in_sync', True):
81
+ self.log(f"[TM] FTS5 index out of sync ({fts_status.get('fts_count', 0)} vs {fts_status.get('main_count', 0)}), rebuilding...")
82
+ self.rebuild_fts_index()
83
+ except Exception as e:
84
+ self.log(f"[WARNING] FTS5 index check failed: {e}")
85
+
86
+ self.log(f"[OK] Database connected: {os.path.basename(self.db_path)}")
87
+ return True
88
+
89
+ except Exception as e:
90
+ self.log(f"[ERROR] Database connection failed: {e}")
91
+ return False
92
+
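A minimal usage sketch of the manager above (assuming the class is importable as modules.database_manager, as the file list suggests; paths and data are illustrative only):

from modules.database_manager import DatabaseManager

db = DatabaseManager(db_path="user_data/supervertaler.db", log_callback=print)
if db.connect():                      # creates tables, runs migrations, syncs the FTS5 index
    print(db.get_tm_count())          # total number of translation units across all TMs
    db.close()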
93
+ def _create_tables(self):
94
+ """Create database schema"""
95
+ print("📊 Creating database tables...")
96
+
97
+ # ============================================
98
+ # TRANSLATION MEMORY TABLES
99
+ # ============================================
100
+
101
+ self.cursor.execute("""
102
+ CREATE TABLE IF NOT EXISTS translation_units (
103
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
104
+ source_text TEXT NOT NULL,
105
+ target_text TEXT NOT NULL,
106
+ source_lang TEXT NOT NULL,
107
+ target_lang TEXT NOT NULL,
108
+ tm_id TEXT NOT NULL,
109
+ project_id TEXT,
110
+
111
+ -- Context for better matching
112
+ context_before TEXT,
113
+ context_after TEXT,
114
+
115
+ -- Fast exact matching
116
+ source_hash TEXT NOT NULL,
117
+
118
+ -- Metadata
119
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
120
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
121
+ usage_count INTEGER DEFAULT 0,
122
+ created_by TEXT,
123
+ notes TEXT,
124
+
125
+ -- Indexes
126
+ UNIQUE(source_hash, target_text, tm_id)
127
+ )
128
+ """)
129
+
130
+ # Indexes for translation_units
131
+ self.cursor.execute("""
132
+ CREATE INDEX IF NOT EXISTS idx_tu_source_hash
133
+ ON translation_units(source_hash)
134
+ """)
135
+
136
+ self.cursor.execute("""
137
+ CREATE INDEX IF NOT EXISTS idx_tu_tm_id
138
+ ON translation_units(tm_id)
139
+ """)
140
+
141
+ self.cursor.execute("""
142
+ CREATE INDEX IF NOT EXISTS idx_tu_project_id
143
+ ON translation_units(project_id)
144
+ """)
145
+
146
+ self.cursor.execute("""
147
+ CREATE INDEX IF NOT EXISTS idx_tu_langs
148
+ ON translation_units(source_lang, target_lang)
149
+ """)
150
+
151
+ # Full-text search for fuzzy matching
152
+ self.cursor.execute("""
153
+ CREATE VIRTUAL TABLE IF NOT EXISTS translation_units_fts
154
+ USING fts5(
155
+ source_text,
156
+ target_text,
157
+ content=translation_units,
158
+ content_rowid=id
159
+ )
160
+ """)
161
+
162
+ # Triggers to keep FTS index in sync
163
+ self.cursor.execute("""
164
+ CREATE TRIGGER IF NOT EXISTS tu_fts_insert AFTER INSERT ON translation_units BEGIN
165
+ INSERT INTO translation_units_fts(rowid, source_text, target_text)
166
+ VALUES (new.id, new.source_text, new.target_text);
167
+ END
168
+ """)
169
+
170
+ self.cursor.execute("""
171
+ CREATE TRIGGER IF NOT EXISTS tu_fts_delete AFTER DELETE ON translation_units BEGIN
172
+ DELETE FROM translation_units_fts WHERE rowid = old.id;
173
+ END
174
+ """)
175
+
176
+ self.cursor.execute("""
177
+ CREATE TRIGGER IF NOT EXISTS tu_fts_update AFTER UPDATE ON translation_units BEGIN
178
+ DELETE FROM translation_units_fts WHERE rowid = old.id;
179
+ INSERT INTO translation_units_fts(rowid, source_text, target_text)
180
+ VALUES (new.id, new.source_text, new.target_text);
181
+ END
182
+ """)
183
+
184
+ # ============================================
185
+ # TRANSLATION MEMORY METADATA
186
+ # ============================================
187
+
188
+ # Translation Memories table - tracks individual TM names/metadata
189
+ self.cursor.execute("""
190
+ CREATE TABLE IF NOT EXISTS translation_memories (
191
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
192
+ name TEXT NOT NULL UNIQUE,
193
+ description TEXT,
194
+ source_lang TEXT,
195
+ target_lang TEXT,
196
+ tm_id TEXT NOT NULL UNIQUE, -- The tm_id used in translation_units table
197
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
198
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
199
+ entry_count INTEGER DEFAULT 0, -- Cached count, updated on changes
200
+ last_used TIMESTAMP,
201
+ is_project_tm BOOLEAN DEFAULT 0, -- Whether this is the special project TM
202
+ read_only BOOLEAN DEFAULT 1, -- Whether this TM should not be updated (default: read-only, Write unchecked)
203
+ project_id INTEGER -- Which project this TM belongs to (NULL = global)
204
+ )
205
+ """)
206
+
207
+ # TM activation (tracks which TMs are active for which projects)
208
+ self.cursor.execute("""
209
+ CREATE TABLE IF NOT EXISTS tm_activation (
210
+ tm_id INTEGER NOT NULL,
211
+ project_id INTEGER NOT NULL,
212
+ is_active BOOLEAN DEFAULT 1,
213
+ activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
214
+ PRIMARY KEY (tm_id, project_id),
215
+ FOREIGN KEY (tm_id) REFERENCES translation_memories(id) ON DELETE CASCADE
216
+ )
217
+ """)
218
+
219
+ # Index for fast tm_id lookups
220
+ self.cursor.execute("""
221
+ CREATE INDEX IF NOT EXISTS idx_tm_tm_id
222
+ ON translation_memories(tm_id)
223
+ """)
224
+
225
+ # Migration: Add is_project_tm, read_only, and project_id columns if they don't exist
226
+ try:
227
+ self.cursor.execute("PRAGMA table_info(translation_memories)")
228
+ columns = [row[1] for row in self.cursor.fetchall()]
229
+
230
+ if 'is_project_tm' not in columns:
231
+ self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN is_project_tm BOOLEAN DEFAULT 0")
232
+ print("✓ Added is_project_tm column to translation_memories")
233
+
234
+ if 'read_only' not in columns:
235
+ self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN read_only BOOLEAN DEFAULT 1")
236
+ print("✓ Added read_only column to translation_memories (default: read-only)")
237
+
238
+ if 'project_id' not in columns:
239
+ self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN project_id INTEGER")
240
+ print("✓ Added project_id column to translation_memories")
241
+
242
+ self.connection.commit()
243
+ except Exception as e:
244
+ print(f"Migration info: {e}")
245
+
246
+ # ============================================
247
+ # TERMBASE TABLES
248
+ # ============================================
249
+
250
+ # Termbases container table ("termbase" is the preferred term; the "glossaries" table below is kept only as a legacy alias)
251
+ self.cursor.execute("""
252
+ CREATE TABLE IF NOT EXISTS termbases (
253
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
254
+ name TEXT NOT NULL UNIQUE,
255
+ description TEXT,
256
+ source_lang TEXT,
257
+ target_lang TEXT,
258
+ project_id INTEGER, -- NULL = global, set = project-specific
259
+ is_global BOOLEAN DEFAULT 1,
260
+ is_project_termbase BOOLEAN DEFAULT 0, -- True if this is a project-specific termbase
261
+ priority INTEGER DEFAULT 50, -- DEPRECATED: Use ranking instead
262
+ ranking INTEGER, -- Termbase activation ranking: 1 = highest priority, 2 = second highest, etc. Only for activated termbases.
263
+ read_only BOOLEAN DEFAULT 1, -- Whether this termbase should not be updated (default: read-only, Write unchecked)
264
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
265
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
266
+ )
267
+ """)
268
+
269
+ # Migration: Add priority column if it doesn't exist (for existing databases)
270
+ try:
271
+ self.cursor.execute("ALTER TABLE termbases ADD COLUMN priority INTEGER DEFAULT 50")
272
+ self.connection.commit()
273
+ except Exception:
274
+ # Column already exists, ignore
275
+ pass
276
+
277
+ # Migration: Add is_project_termbase column if it doesn't exist
278
+ try:
279
+ self.cursor.execute("ALTER TABLE termbases ADD COLUMN is_project_termbase BOOLEAN DEFAULT 0")
280
+ self.connection.commit()
281
+ except Exception:
282
+ # Column already exists, ignore
283
+ pass
284
+
285
+ # Migration: Add ranking column if it doesn't exist
286
+ try:
287
+ self.cursor.execute("ALTER TABLE termbases ADD COLUMN ranking INTEGER")
288
+ self.connection.commit()
289
+ except Exception:
290
+ # Column already exists, ignore
291
+ pass
292
+
293
+ # Migration: Add read_only column if it doesn't exist
294
+ try:
295
+ self.cursor.execute("ALTER TABLE termbases ADD COLUMN read_only BOOLEAN DEFAULT 1")
296
+ self.connection.commit()
297
+ except Exception:
298
+ # Column already exists, ignore
299
+ pass
300
+
301
+ # Data Migration: Set is_project_termbase=1 for termbases with non-NULL project_id
302
+ # This ensures existing project termbases are correctly flagged
303
+ try:
304
+ self.cursor.execute("""
305
+ UPDATE termbases
306
+ SET is_project_termbase = 1
307
+ WHERE project_id IS NOT NULL
308
+ AND (is_project_termbase IS NULL OR is_project_termbase = 0)
309
+ """)
310
+ updated_count = self.cursor.rowcount
311
+ if updated_count > 0:
312
+ self.log(f"✅ Data migration: Updated {updated_count} project termbase(s) with is_project_termbase=1")
313
+ self.connection.commit()
314
+ except Exception as e:
315
+ self.log(f"⚠️ Data migration warning (is_project_termbase): {e}")
316
+ pass
317
+
318
+ # Legacy support: create glossaries as alias for termbases
319
+ self.cursor.execute("""
320
+ CREATE TABLE IF NOT EXISTS glossaries (
321
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
322
+ name TEXT NOT NULL UNIQUE,
323
+ description TEXT,
324
+ source_lang TEXT,
325
+ target_lang TEXT,
326
+ project_id INTEGER,
327
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
328
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
329
+ )
330
+ """)
331
+
332
+ # Termbase activation (tracks which termbases are active for which projects)
333
+ self.cursor.execute("""
334
+ CREATE TABLE IF NOT EXISTS termbase_activation (
335
+ termbase_id INTEGER NOT NULL,
336
+ project_id INTEGER NOT NULL,
337
+ is_active BOOLEAN DEFAULT 1,
338
+ activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
339
+ priority INTEGER, -- Manual priority (1=highest, 2=second, etc.). Multiple termbases can share same priority.
340
+ PRIMARY KEY (termbase_id, project_id),
341
+ FOREIGN KEY (termbase_id) REFERENCES termbases(id) ON DELETE CASCADE
342
+ )
343
+ """)
344
+
345
+ # Migration: Add priority column to termbase_activation if it doesn't exist
346
+ try:
347
+ self.cursor.execute("ALTER TABLE termbase_activation ADD COLUMN priority INTEGER")
348
+ self.connection.commit()
349
+ except Exception:
350
+ # Column already exists, ignore
351
+ pass
352
+
353
+ # Legacy support: termbase_project_activation as alias
354
+ # Note: Foreign key now references termbases for consistency with Qt version
355
+ self.cursor.execute("""
356
+ CREATE TABLE IF NOT EXISTS termbase_project_activation (
357
+ termbase_id INTEGER NOT NULL,
358
+ project_id INTEGER NOT NULL,
359
+ activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
360
+ PRIMARY KEY (termbase_id, project_id),
361
+ FOREIGN KEY (termbase_id) REFERENCES termbases(id) ON DELETE CASCADE
362
+ )
363
+ """)
364
+
365
+ self.cursor.execute("""
366
+ CREATE TABLE IF NOT EXISTS termbase_terms (
367
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
368
+ source_term TEXT NOT NULL,
369
+ target_term TEXT NOT NULL,
370
+ source_lang TEXT DEFAULT 'unknown',
371
+ target_lang TEXT DEFAULT 'unknown',
372
+ termbase_id TEXT NOT NULL,
373
+ priority INTEGER DEFAULT 99,
374
+ project_id TEXT,
375
+
376
+ -- Terminology-specific fields
377
+ synonyms TEXT,
378
+ forbidden_terms TEXT,
379
+ definition TEXT,
380
+ context TEXT,
381
+ part_of_speech TEXT,
382
+ domain TEXT,
383
+ case_sensitive BOOLEAN DEFAULT 0,
384
+ forbidden BOOLEAN DEFAULT 0,
385
+
386
+ -- Link to TM entry (optional)
387
+ tm_source_id INTEGER,
388
+
389
+ -- Metadata
390
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
391
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
392
+ usage_count INTEGER DEFAULT 0,
393
+ notes TEXT,
394
+ note TEXT,
395
+ project TEXT,
396
+ client TEXT,
397
+ term_uuid TEXT,
398
+
399
+ FOREIGN KEY (tm_source_id) REFERENCES translation_units(id) ON DELETE SET NULL
400
+ )
401
+ """)
402
+
403
+ # Indexes for termbase_terms
404
+ self.cursor.execute("""
405
+ CREATE INDEX IF NOT EXISTS idx_gt_source_term
406
+ ON termbase_terms(source_term)
407
+ """)
408
+
409
+ self.cursor.execute("""
410
+ CREATE INDEX IF NOT EXISTS idx_gt_termbase_id
411
+ ON termbase_terms(termbase_id)
412
+ """)
413
+
414
+ self.cursor.execute("""
415
+ CREATE INDEX IF NOT EXISTS idx_gt_project_id
416
+ ON termbase_terms(project_id)
417
+ """)
418
+
419
+ self.cursor.execute("""
420
+ CREATE INDEX IF NOT EXISTS idx_gt_domain
421
+ ON termbase_terms(domain)
422
+ """)
423
+
424
+ # Full-text search for termbase
425
+ self.cursor.execute("""
426
+ CREATE VIRTUAL TABLE IF NOT EXISTS termbase_terms_fts
427
+ USING fts5(
428
+ source_term,
429
+ target_term,
430
+ definition,
431
+ content=termbase_terms,
432
+ content_rowid=id
433
+ )
434
+ """)
435
+
436
+ # ============================================
437
+ # NON-TRANSLATABLES
438
+ # ============================================
439
+
440
+ self.cursor.execute("""
441
+ CREATE TABLE IF NOT EXISTS non_translatables (
442
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
443
+ pattern TEXT NOT NULL UNIQUE,
444
+ pattern_type TEXT DEFAULT 'regex',
445
+ description TEXT,
446
+ project_id TEXT,
447
+ enabled BOOLEAN DEFAULT 1,
448
+ example_text TEXT,
449
+ category TEXT,
450
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
451
+ )
452
+ """)
453
+
454
+ self.cursor.execute("""
455
+ CREATE INDEX IF NOT EXISTS idx_nt_project_id
456
+ ON non_translatables(project_id)
457
+ """)
458
+
459
+ self.cursor.execute("""
460
+ CREATE INDEX IF NOT EXISTS idx_nt_category
461
+ ON non_translatables(category)
462
+ """)
463
+
464
+ # ============================================
465
+ # SEGMENTATION RULES
466
+ # ============================================
467
+
468
+ self.cursor.execute("""
469
+ CREATE TABLE IF NOT EXISTS segmentation_rules (
470
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
471
+ rule_name TEXT NOT NULL,
472
+ source_lang TEXT,
473
+ rule_type TEXT NOT NULL,
474
+ pattern TEXT NOT NULL,
475
+ description TEXT,
476
+ priority INTEGER DEFAULT 100,
477
+ enabled BOOLEAN DEFAULT 1,
478
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
479
+ )
480
+ """)
481
+
482
+ self.cursor.execute("""
483
+ CREATE INDEX IF NOT EXISTS idx_sr_source_lang
484
+ ON segmentation_rules(source_lang)
485
+ """)
486
+
487
+ self.cursor.execute("""
488
+ CREATE INDEX IF NOT EXISTS idx_sr_priority
489
+ ON segmentation_rules(priority)
490
+ """)
491
+
492
+ # ============================================
493
+ # PROJECT METADATA
494
+ # ============================================
495
+
496
+ self.cursor.execute("""
497
+ CREATE TABLE IF NOT EXISTS projects (
498
+ id TEXT PRIMARY KEY,
499
+ name TEXT NOT NULL,
500
+ source_lang TEXT,
501
+ target_lang TEXT,
502
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
503
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
504
+ last_opened TIMESTAMP,
505
+
506
+ -- Linked resources (JSON arrays)
507
+ active_tm_ids TEXT,
508
+ active_termbase_ids TEXT,
509
+ active_prompt_file TEXT,
510
+ active_style_guide TEXT,
511
+
512
+ -- Statistics
513
+ segment_count INTEGER DEFAULT 0,
514
+ translated_count INTEGER DEFAULT 0,
515
+
516
+ -- Settings (JSON blob)
517
+ settings TEXT
518
+ )
519
+ """)
520
+
521
+ # ============================================
522
+ # FILE METADATA (for prompts and style guides)
523
+ # ============================================
524
+
525
+ self.cursor.execute("""
526
+ CREATE TABLE IF NOT EXISTS prompt_files (
527
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
528
+ file_path TEXT NOT NULL UNIQUE,
529
+ file_type TEXT NOT NULL,
530
+ name TEXT NOT NULL,
531
+ description TEXT,
532
+ last_used TIMESTAMP,
533
+ use_count INTEGER DEFAULT 0
534
+ )
535
+ """)
536
+
537
+ # ============================================
538
+ # TMX EDITOR TABLES (for database-backed TMX files)
539
+ # ============================================
540
+
541
+ self.cursor.execute("""
542
+ CREATE TABLE IF NOT EXISTS tmx_files (
543
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
544
+ file_path TEXT NOT NULL UNIQUE,
545
+ file_name TEXT NOT NULL,
546
+ original_file_path TEXT, -- Original file path when imported
547
+ load_mode TEXT NOT NULL, -- 'ram' or 'database'
548
+ file_size INTEGER, -- File size in bytes
549
+
550
+ -- Header metadata (JSON)
551
+ header_data TEXT NOT NULL,
552
+
553
+ -- Statistics
554
+ tu_count INTEGER DEFAULT 0,
555
+ languages TEXT, -- JSON array of language codes
556
+
557
+ -- Timestamps
558
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
559
+ last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
560
+ last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP
561
+ )
562
+ """)
563
+
564
+ self.cursor.execute("""
565
+ CREATE TABLE IF NOT EXISTS tmx_translation_units (
566
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
567
+ tmx_file_id INTEGER NOT NULL,
568
+ tu_id INTEGER NOT NULL, -- Original TU ID from TMX file
569
+
570
+ -- System attributes
571
+ creation_date TEXT,
572
+ creation_id TEXT,
573
+ change_date TEXT,
574
+ change_id TEXT,
575
+ srclang TEXT,
576
+
577
+ -- Custom attributes (JSON)
578
+ custom_attributes TEXT,
579
+
580
+ -- Comments (JSON array)
581
+ comments TEXT,
582
+
583
+ FOREIGN KEY (tmx_file_id) REFERENCES tmx_files(id) ON DELETE CASCADE,
584
+ UNIQUE(tmx_file_id, tu_id)
585
+ )
586
+ """)
587
+
588
+ self.cursor.execute("""
589
+ CREATE TABLE IF NOT EXISTS tmx_segments (
590
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
591
+ tu_id INTEGER NOT NULL, -- References tmx_translation_units.id
592
+ lang TEXT NOT NULL,
593
+ text TEXT NOT NULL,
594
+
595
+ -- Language-specific attributes
596
+ creation_date TEXT,
597
+ creation_id TEXT,
598
+ change_date TEXT,
599
+ change_id TEXT,
600
+
601
+ FOREIGN KEY (tu_id) REFERENCES tmx_translation_units(id) ON DELETE CASCADE,
602
+ UNIQUE(tu_id, lang)
603
+ )
604
+ """)
605
+
606
+ # Indexes for TMX tables
607
+ self.cursor.execute("""
608
+ CREATE INDEX IF NOT EXISTS idx_tmx_tu_file_id
609
+ ON tmx_translation_units(tmx_file_id)
610
+ """)
611
+
612
+ self.cursor.execute("""
613
+ CREATE INDEX IF NOT EXISTS idx_tmx_tu_tu_id
614
+ ON tmx_translation_units(tu_id)
615
+ """)
616
+
617
+ self.cursor.execute("""
618
+ CREATE INDEX IF NOT EXISTS idx_tmx_seg_tu_id
619
+ ON tmx_segments(tu_id)
620
+ """)
621
+
622
+ self.cursor.execute("""
623
+ CREATE INDEX IF NOT EXISTS idx_tmx_seg_lang
624
+ ON tmx_segments(lang)
625
+ """)
626
+
627
+ self.cursor.execute("""
628
+ CREATE TABLE IF NOT EXISTS style_guide_files (
629
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
630
+ file_path TEXT NOT NULL UNIQUE,
631
+ language TEXT NOT NULL,
632
+ last_used TIMESTAMP,
633
+ use_count INTEGER DEFAULT 0
634
+ )
635
+ """)
636
+
637
+ # Commit schema
638
+ try:
639
+ self.connection.commit()
640
+ print("✅ Database tables created and committed successfully")
641
+ except Exception as e:
642
+ print(f"❌ Error committing database schema: {e}")
643
+ import traceback
644
+ traceback.print_exc()
645
+ raise
646
+
647
+ def close(self):
648
+ """Close database connection"""
649
+ if self.connection:
650
+ self.connection.close()
651
+ self.connection = None
652
+ self.cursor = None
653
+
654
+ # ============================================
655
+ # TRANSLATION MEMORY METHODS
656
+ # ============================================
657
+
658
+ def add_translation_unit(self, source: str, target: str, source_lang: str,
659
+ target_lang: str, tm_id: str = 'project',
660
+ project_id: str = None, context_before: str = None,
661
+ context_after: str = None, notes: str = None) -> int:
662
+ """
663
+ Add translation unit to database
664
+
665
+ Returns: ID of inserted/updated entry
666
+ """
667
+ # Generate hash for fast exact matching
668
+ source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
669
+
670
+ try:
671
+ self.cursor.execute("""
672
+ INSERT INTO translation_units
673
+ (source_text, target_text, source_lang, target_lang, tm_id,
674
+ project_id, context_before, context_after, source_hash, notes)
675
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
676
+ ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
677
+ usage_count = usage_count + 1,
678
+ modified_date = CURRENT_TIMESTAMP
679
+ """, (source, target, source_lang, target_lang, tm_id,
680
+ project_id, context_before, context_after, source_hash, notes))
681
+
682
+ self.connection.commit()
683
+ return self.cursor.lastrowid
684
+
685
+ except Exception as e:
686
+ self.log(f"Error adding translation unit: {e}")
687
+ return None
688
+
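Continuing the usage sketch following connect() above, adding an entry is a single call; the ON CONFLICT clause turns repeated inserts of the same source/target/tm_id into usage-count updates (hypothetical example data):

tu_id = db.add_translation_unit(
    "Dit is een testzin.", "This is a test sentence.",
    source_lang="nl-NL", target_lang="en-US",
    tm_id="project", notes="added from a usage sketch")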
689
+ def get_exact_match(self, source: str, tm_ids: List[str] = None,
690
+ source_lang: str = None, target_lang: str = None,
691
+ bidirectional: bool = True) -> Optional[Dict]:
692
+ """
693
+ Get exact match from TM
694
+
695
+ Args:
696
+ source: Source text to match
697
+ tm_ids: List of TM IDs to search (None = all)
698
+ source_lang: Filter by source language (base code matching: 'en' matches 'en-US', 'en-GB', etc.)
699
+ target_lang: Filter by target language (base code matching)
700
+ bidirectional: If True, search both directions (nl→en AND en→nl)
701
+
702
+ Returns: Dictionary with match data or None
703
+ """
704
+ from modules.tmx_generator import get_base_lang_code
705
+
706
+ source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
707
+
708
+ # Get base language codes for comparison
709
+ src_base = get_base_lang_code(source_lang) if source_lang else None
710
+ tgt_base = get_base_lang_code(target_lang) if target_lang else None
711
+
712
+ query = """
713
+ SELECT * FROM translation_units
714
+ WHERE source_hash = ? AND source_text = ?
715
+ """
716
+ params = [source_hash, source]
717
+
718
+ if tm_ids:
719
+ placeholders = ','.join('?' * len(tm_ids))
720
+ query += f" AND tm_id IN ({placeholders})"
721
+ params.extend(tm_ids)
722
+
723
+ # Use flexible language matching (matches 'nl', 'nl-NL', 'Dutch', etc.)
724
+ from modules.tmx_generator import get_lang_match_variants
725
+ if src_base:
726
+ src_variants = get_lang_match_variants(source_lang)
727
+ src_conditions = []
728
+ for variant in src_variants:
729
+ src_conditions.append("source_lang = ?")
730
+ params.append(variant)
731
+ src_conditions.append("source_lang LIKE ?")
732
+ params.append(f"{variant}-%")
733
+ query += f" AND ({' OR '.join(src_conditions)})"
734
+
735
+ if tgt_base:
736
+ tgt_variants = get_lang_match_variants(target_lang)
737
+ tgt_conditions = []
738
+ for variant in tgt_variants:
739
+ tgt_conditions.append("target_lang = ?")
740
+ params.append(variant)
741
+ tgt_conditions.append("target_lang LIKE ?")
742
+ params.append(f"{variant}-%")
743
+ query += f" AND ({' OR '.join(tgt_conditions)})"
744
+
745
+ query += " ORDER BY usage_count DESC, modified_date DESC LIMIT 1"
746
+
747
+ self.cursor.execute(query, params)
748
+ row = self.cursor.fetchone()
749
+
750
+ if row:
751
+ # Update usage count
752
+ self.cursor.execute("""
753
+ UPDATE translation_units
754
+ SET usage_count = usage_count + 1
755
+ WHERE id = ?
756
+ """, (row['id'],))
757
+ self.connection.commit()
758
+
759
+ return dict(row)
760
+
761
+ # If bidirectional and no forward match, try reverse direction
762
+ if bidirectional and src_base and tgt_base:
763
+ # Search where our source text is in the target field (reverse direction)
764
+ query = """
765
+ SELECT * FROM translation_units
766
+ WHERE target_text = ?
767
+ """
768
+ params = [source]
769
+
770
+ if tm_ids:
771
+ placeholders = ','.join('?' * len(tm_ids))
772
+ query += f" AND tm_id IN ({placeholders})"
773
+ params.extend(tm_ids)
774
+
775
+ # Reversed: search where TM source_lang matches our target_lang (flexible matching)
776
+ # Note: for reverse, we swap - TM source_lang should match our target_lang
777
+ tgt_variants = get_lang_match_variants(target_lang)
778
+ src_variants = get_lang_match_variants(source_lang)
779
+
780
+ src_conditions = []
781
+ for variant in tgt_variants: # TM source_lang = our target_lang
782
+ src_conditions.append("source_lang = ?")
783
+ params.append(variant)
784
+ src_conditions.append("source_lang LIKE ?")
785
+ params.append(f"{variant}-%")
786
+
787
+ tgt_conditions = []
788
+ for variant in src_variants: # TM target_lang = our source_lang
789
+ tgt_conditions.append("target_lang = ?")
790
+ params.append(variant)
791
+ tgt_conditions.append("target_lang LIKE ?")
792
+ params.append(f"{variant}-%")
793
+
794
+ query += f" AND ({' OR '.join(src_conditions)}) AND ({' OR '.join(tgt_conditions)})"
795
+
796
+ query += " ORDER BY usage_count DESC, modified_date DESC LIMIT 1"
797
+
798
+ self.cursor.execute(query, params)
799
+ row = self.cursor.fetchone()
800
+
801
+ if row:
802
+ # Update usage count
803
+ self.cursor.execute("""
804
+ UPDATE translation_units
805
+ SET usage_count = usage_count + 1
806
+ WHERE id = ?
807
+ """, (row['id'],))
808
+ self.connection.commit()
809
+
810
+ # Swap source/target since this is a reverse match
811
+ result = dict(row)
812
+ result['source_text'], result['target_text'] = result['target_text'], result['source_text']
813
+ result['source_lang'], result['target_lang'] = result['target_lang'], result['source_lang']
814
+ result['reverse_match'] = True
815
+ return result
816
+
817
+ return None
818
+
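A lookup of the pair stored in the sketch above might then look like this (a sketch; reversed-direction matching depends on the get_lang_match_variants helper in modules.tmx_generator):

match = db.get_exact_match("Dit is een testzin.", tm_ids=["project"],
                           source_lang="nl-NL", target_lang="en-US")
if match:
    print(match["target_text"])       # "This is a test sentence."
    # a hit found in the reverse direction comes back with source/target
    # already swapped and match["reverse_match"] set to True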
819
+ def calculate_similarity(self, text1: str, text2: str) -> float:
820
+ """
821
+ Calculate similarity ratio between two texts using SequenceMatcher.
822
+ Tags are stripped before comparison for better matching accuracy.
823
+
824
+ Returns: Similarity score from 0.0 to 1.0
825
+ """
826
+ import re
827
+ # Strip HTML/XML tags for comparison
828
+ clean1 = re.sub(r'<[^>]+>', '', text1).lower()
829
+ clean2 = re.sub(r'<[^>]+>', '', text2).lower()
830
+ return SequenceMatcher(None, clean1, clean2).ratio()
831
+
832
+ def search_fuzzy_matches(self, source: str, tm_ids: List[str] = None,
833
+ threshold: float = 0.75, max_results: int = 5,
834
+ source_lang: str = None, target_lang: str = None,
835
+ bidirectional: bool = True) -> List[Dict]:
836
+ """
837
+ Search for fuzzy matches using FTS5 with proper similarity calculation
838
+
839
+ Args:
840
+ bidirectional: If True, search both directions (nl→en AND en→nl)
841
+
842
+ Returns: List of matches with similarity scores
843
+
844
+ Note: When multiple TMs are provided, searches each TM separately to ensure
845
+ good matches from smaller TMs aren't pushed out by BM25 keyword ranking
846
+ from larger TMs. Results are merged and sorted by actual similarity.
847
+ """
848
+ # For better FTS5 matching, tokenize the query and escape special chars
849
+ # FTS5 special characters: " ( ) - : , . ! ?
850
+ import re
851
+ from modules.tmx_generator import get_base_lang_code, get_lang_match_variants
852
+
853
+ # Strip HTML/XML tags from source for clean text search
854
+ text_without_tags = re.sub(r'<[^>]+>', '', source)
855
+
856
+ # Remove special FTS5 characters and split into words (from tag-stripped text)
857
+ clean_text = re.sub(r'[^\w\s]', ' ', text_without_tags) # Replace special chars with spaces
858
+ search_terms_clean = [term for term in clean_text.strip().split() if len(term) > 2] # Min 3 chars
859
+
860
+ # Also get search terms from original source (in case TM was indexed with tags)
861
+ clean_text_with_tags = re.sub(r'[^\w\s]', ' ', source)
862
+ search_terms_with_tags = [term for term in clean_text_with_tags.strip().split() if len(term) > 2]
863
+
864
+ # Combine both sets of search terms (deduplicated)
865
+ all_search_terms = list(dict.fromkeys(search_terms_clean + search_terms_with_tags))
866
+
867
+ # For long segments, prioritize longer/rarer words to get better FTS5 candidates
868
+ # Sort by length (longer words are usually more discriminating)
869
+ all_search_terms.sort(key=len, reverse=True)
870
+
871
+ # Limit search terms to avoid overly complex queries (top 20 longest words)
872
+ # This helps find similar long segments more reliably
873
+ search_terms_for_query = all_search_terms[:20]
874
+
875
+ if not search_terms_for_query:
876
+ # If no valid terms, return empty results
877
+ return []
878
+
879
+ # Quote each term to prevent FTS5 syntax errors
880
+ fts_query = ' OR '.join(f'"{term}"' for term in search_terms_for_query)
881
+
882
+ # Get base language codes for comparison
883
+ src_base = get_base_lang_code(source_lang) if source_lang else None
884
+ tgt_base = get_base_lang_code(target_lang) if target_lang else None
885
+
886
+ # MULTI-TM FIX: Search each TM separately to avoid BM25 ranking issues
887
+ # When a large TM is combined with a small TM, the large TM's many keyword matches
888
+ # push down genuinely similar sentences from the small TM
889
+ tms_to_search = tm_ids if tm_ids else [None] # None means search all TMs together
890
+
891
+ all_results = []
892
+
893
+ for tm_id in tms_to_search:
894
+ # Search this specific TM (or all if tm_id is None)
895
+ tm_results = self._search_single_tm_fuzzy(
896
+ source, fts_query, [tm_id] if tm_id else None,
897
+ threshold, max_results, src_base, tgt_base,
898
+ source_lang, target_lang, bidirectional
899
+ )
900
+ all_results.extend(tm_results)
901
+
902
+ # Deduplicate by source_text (keep highest similarity for each unique source)
903
+ seen = {}
904
+ for result in all_results:
905
+ key = result['source_text']
906
+ if key not in seen or result['similarity'] > seen[key]['similarity']:
907
+ seen[key] = result
908
+
909
+ deduped_results = list(seen.values())
910
+
911
+ # Sort ALL results by similarity (highest first) - this ensures the 76% match
912
+ # appears before 40% matches regardless of which TM they came from
913
+ deduped_results.sort(key=lambda x: x['similarity'], reverse=True)
914
+
915
+ return deduped_results[:max_results]
916
+
917
+ def _search_single_tm_fuzzy(self, source: str, fts_query: str, tm_ids: List[str],
918
+ threshold: float, max_results: int,
919
+ src_base: str, tgt_base: str,
920
+ source_lang: str, target_lang: str,
921
+ bidirectional: bool) -> List[Dict]:
922
+ """Search a single TM (or all TMs if tm_ids is None) for fuzzy matches"""
923
+ from modules.tmx_generator import get_lang_match_variants
924
+
925
+ # Build query for this TM
926
+ query = """
927
+ SELECT tu.*,
928
+ bm25(translation_units_fts) as relevance
929
+ FROM translation_units tu
930
+ JOIN translation_units_fts ON tu.id = translation_units_fts.rowid
931
+ WHERE translation_units_fts MATCH ?
932
+ """
933
+ params = [fts_query]
934
+
935
+ if tm_ids and tm_ids[0] is not None:
936
+ placeholders = ','.join('?' * len(tm_ids))
937
+ query += f" AND tu.tm_id IN ({placeholders})"
938
+ params.extend(tm_ids)
939
+
940
+ # Use flexible language matching (matches 'nl', 'nl-NL', 'Dutch', etc.)
941
+ if src_base:
942
+ src_variants = get_lang_match_variants(source_lang)
943
+ src_conditions = []
944
+ for variant in src_variants:
945
+ src_conditions.append("tu.source_lang = ?")
946
+ params.append(variant)
947
+ src_conditions.append("tu.source_lang LIKE ?")
948
+ params.append(f"{variant}-%")
949
+ query += f" AND ({' OR '.join(src_conditions)})"
950
+
951
+ if tgt_base:
952
+ tgt_variants = get_lang_match_variants(target_lang)
953
+ tgt_conditions = []
954
+ for variant in tgt_variants:
955
+ tgt_conditions.append("tu.target_lang = ?")
956
+ params.append(variant)
957
+ tgt_conditions.append("tu.target_lang LIKE ?")
958
+ params.append(f"{variant}-%")
959
+ query += f" AND ({' OR '.join(tgt_conditions)})"
960
+
961
+ # Per-TM candidate limit - INCREASED to catch more potential fuzzy matches
962
+ # When multiple TMs are searched, BM25 ranking can push genuinely similar
963
+ # entries far down the list due to common word matches in other entries
964
+ candidate_limit = max(500, max_results * 50)
965
+ query += f" ORDER BY relevance DESC LIMIT {candidate_limit}"
966
+
967
+ try:
968
+ self.cursor.execute(query, params)
969
+ all_rows = self.cursor.fetchall()
970
+ except Exception as e:
971
+ print(f"[DEBUG] _search_single_tm_fuzzy (forward): SQL ERROR: {e}")
+ return []
972
+
973
+ results = []
974
+
975
+ for row in all_rows:
976
+ match_dict = dict(row)
977
+ # Calculate actual similarity using SequenceMatcher
978
+ similarity = self.calculate_similarity(source, match_dict['source_text'])
979
+
980
+ # Only include matches above threshold
981
+ if similarity >= threshold:
982
+ match_dict['similarity'] = similarity
983
+ match_dict['match_pct'] = int(similarity * 100)
984
+ results.append(match_dict)
985
+
986
+ # If bidirectional, also search reverse direction
987
+ if bidirectional and src_base and tgt_base:
988
+ query = """
989
+ SELECT tu.*,
990
+ bm25(translation_units_fts) as relevance
991
+ FROM translation_units tu
992
+ JOIN translation_units_fts ON tu.id = translation_units_fts.rowid
993
+ WHERE translation_units_fts MATCH ?
994
+ """
995
+ params = [fts_query]
996
+
997
+ if tm_ids and tm_ids[0] is not None:
998
+ placeholders = ','.join('?' * len(tm_ids))
999
+ query += f" AND tu.tm_id IN ({placeholders})"
1000
+ params.extend(tm_ids)
1001
+
1002
+ # Reversed language filters with flexible matching
1003
+ src_variants = get_lang_match_variants(source_lang)
1004
+ tgt_variants = get_lang_match_variants(target_lang)
1005
+
1006
+ # TM target_lang = our source_lang
1007
+ tgt_conditions = []
1008
+ for variant in src_variants:
1009
+ tgt_conditions.append("tu.target_lang = ?")
1010
+ params.append(variant)
1011
+ tgt_conditions.append("tu.target_lang LIKE ?")
1012
+ params.append(f"{variant}-%")
1013
+ query += f" AND ({' OR '.join(tgt_conditions)})"
1014
+
1015
+ # TM source_lang = our target_lang
1016
+ src_conditions = []
1017
+ for variant in tgt_variants:
1018
+ src_conditions.append("tu.source_lang = ?")
1019
+ params.append(variant)
1020
+ src_conditions.append("tu.source_lang LIKE ?")
1021
+ params.append(f"{variant}-%")
1022
+ query += f" AND ({' OR '.join(src_conditions)})"
1023
+
1024
+ query += f" ORDER BY relevance DESC LIMIT {max_results * 5}"
1025
+
1026
+ try:
1027
+ self.cursor.execute(query, params)
1028
+
1029
+ for row in self.cursor.fetchall():
1030
+ match_dict = dict(row)
1031
+ # Calculate similarity against target_text (since we're reversing)
1032
+ similarity = self.calculate_similarity(source, match_dict['target_text'])
1033
+
1034
+ # Only include matches above threshold
1035
+ if similarity >= threshold:
1036
+ # Swap source/target for reverse match
1037
+ match_dict['source_text'], match_dict['target_text'] = match_dict['target_text'], match_dict['source_text']
1038
+ match_dict['source_lang'], match_dict['target_lang'] = match_dict['target_lang'], match_dict['source_lang']
1039
+ match_dict['similarity'] = similarity
1040
+ match_dict['match_pct'] = int(similarity * 100)
1041
+ match_dict['reverse_match'] = True
1042
+ results.append(match_dict)
1043
+ except Exception as e:
1044
+ print(f"[DEBUG] _search_single_tm_fuzzy (reverse): SQL ERROR: {e}")
1045
+
1046
+ return results
1047
+
1048
+ def search_all(self, source: str, tm_ids: List[str] = None, enabled_only: bool = True,
1049
+ threshold: float = 0.75, max_results: int = 10) -> List[Dict]:
1050
+ """
1051
+ Search for matches across TMs (both exact and fuzzy)
1052
+
1053
+ Args:
1054
+ source: Source text to search for
1055
+ tm_ids: List of TM IDs to search (None = all)
1056
+ enabled_only: Currently ignored (all TMs enabled)
1057
+ threshold: Minimum similarity threshold (0.0-1.0)
1058
+ max_results: Maximum number of results
1059
+
1060
+ Returns:
1061
+ List of matches with source, target, match_pct, tm_name
1062
+ """
1063
+ # First try exact match
1064
+ exact = self.get_exact_match(source, tm_ids=tm_ids)
1065
+ if exact:
1066
+ return [{
1067
+ 'source': exact['source_text'],
1068
+ 'target': exact['target_text'],
1069
+ 'match_pct': 100,
1070
+ 'tm_name': exact['tm_id'].replace('_', ' ').title(),
1071
+ 'tm_id': exact['tm_id']
1072
+ }]
1073
+
1074
+ # No exact match, try fuzzy
1075
+ fuzzy_matches = self.search_fuzzy_matches(
1076
+ source,
1077
+ tm_ids=tm_ids,
1078
+ threshold=threshold,
1079
+ max_results=max_results
1080
+ )
1081
+
1082
+ results = []
1083
+ for match in fuzzy_matches:
1084
+ results.append({
1085
+ 'source': match['source_text'],
1086
+ 'target': match['target_text'],
1087
+ 'match_pct': match['match_pct'],
1088
+ 'tm_name': match['tm_id'].replace('_', ' ').title(),
1089
+ 'tm_id': match['tm_id']
1090
+ })
1091
+
1092
+ return results
1093
+
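Continuing the sketch above, a combined lookup through search_all returns the flattened dictionaries described in its docstring:

hits = db.search_all("Dit is een testzin", tm_ids=["project"],
                     threshold=0.75, max_results=5)
for hit in hits:
    print(f"{hit['match_pct']}%  [{hit['tm_name']}]  {hit['target']}")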
1094
+ def get_tm_entries(self, tm_id: str, limit: int = None) -> List[Dict]:
1095
+ """Get all entries from a specific TM"""
1096
+ query = "SELECT * FROM translation_units WHERE tm_id = ? ORDER BY id"
1097
+ params = [tm_id]
1098
+
1099
+ if limit:
1100
+ query += f" LIMIT {limit}"
1101
+
1102
+ self.cursor.execute(query, params)
1103
+ return [dict(row) for row in self.cursor.fetchall()]
1104
+
1105
+ def get_tm_count(self, tm_id: str = None) -> int:
1106
+ """Get entry count for TM(s)"""
1107
+ if tm_id:
1108
+ self.cursor.execute("""
1109
+ SELECT COUNT(*) FROM translation_units WHERE tm_id = ?
1110
+ """, (tm_id,))
1111
+ else:
1112
+ self.cursor.execute("SELECT COUNT(*) FROM translation_units")
1113
+
1114
+ return self.cursor.fetchone()[0]
1115
+
1116
+ def clear_tm(self, tm_id: str):
1117
+ """Clear all entries from a TM"""
1118
+ self.cursor.execute("""
1119
+ DELETE FROM translation_units WHERE tm_id = ?
1120
+ """, (tm_id,))
1121
+ self.connection.commit()
1122
+
1123
+ def delete_entry(self, tm_id: str, source: str, target: str):
1124
+ """Delete a specific entry from a TM"""
1125
+ # Get the ID first
1126
+ self.cursor.execute("""
1127
+ SELECT id FROM translation_units
1128
+ WHERE tm_id = ? AND source_text = ? AND target_text = ?
1129
+ """, (tm_id, source, target))
1130
+
1131
+ result = self.cursor.fetchone()
1132
+ if not result:
1133
+ return # Entry not found
1134
+
1135
+ entry_id = result['id']
1136
+
1137
+ # Delete from FTS5 index first
1138
+ try:
1139
+ self.cursor.execute("""
1140
+ DELETE FROM tm_fts WHERE rowid = ?
1141
+ """, (entry_id,))
1142
+ except Exception:
1143
+ pass # FTS5 table might not exist
1144
+
1145
+ # Delete from main table
1146
+ self.cursor.execute("""
1147
+ DELETE FROM translation_units
1148
+ WHERE id = ?
1149
+ """, (entry_id,))
1150
+
1151
+ self.connection.commit()
1152
+
1153
+ def concordance_search(self, query: str, tm_ids: List[str] = None, direction: str = 'both',
1154
+ source_lang = None, target_lang = None) -> List[Dict]:
1155
+ """
1156
+ Search for text in source and/or target (concordance search)
1157
+ Uses FTS5 full-text search for fast matching on millions of segments.
1158
+ Falls back to LIKE queries if FTS5 fails.
1159
+
1160
+ Language filters define what you're searching FOR and what translation you want:
1161
+ - "From: Dutch, To: English" = Search for Dutch text, show English translations
1162
+ - Searches ALL TMs (regardless of their stored language pair direction)
1163
+ - Automatically swaps columns when needed (e.g., finds Dutch in target column of EN→NL TM)
1164
+ - This is MORE intuitive than traditional CAT tools that only search specific TM directions
1165
+
1166
+ Args:
1167
+ query: Text to search for
1168
+ tm_ids: List of TM IDs to search (None = all)
1169
+ direction: 'source' = search source only, 'target' = search target only, 'both' = bidirectional
1170
+ source_lang: Filter by source language - can be a string OR a list of language variants (None = any)
1171
+ target_lang: Filter by target language - can be a string OR a list of language variants (None = any)
1172
+ """
1173
+ # Normalize language filters to lists for consistent handling
1174
+ source_langs = source_lang if isinstance(source_lang, list) else ([source_lang] if source_lang else None)
1175
+ target_langs = target_lang if isinstance(target_lang, list) else ([target_lang] if target_lang else None)
1176
+
1177
+ # Escape embedded double quotes so the query can be run as a quoted FTS5 phrase
1178
+ # FTS5 special chars: " * ( ) : ^
1179
+ fts_query = query.replace('"', '""')
1180
+ # Wrap in quotes for phrase search
1181
+ fts_query = f'"{fts_query}"'
1182
+
1183
+ # When language filters specified, we need to search intelligently:
1184
+ # - Don't filter by TM language pair (search ALL TMs)
1185
+ # - Search in BOTH columns to find text
1186
+ # - Swap columns if needed to show correct language order
1187
+ use_smart_search = (source_langs or target_langs)
1188
+
1189
+ try:
1190
+ # Use FTS5 for fast full-text search
1191
+ if direction == 'source':
1192
+ fts_sql = """
1193
+ SELECT tu.* FROM translation_units tu
1194
+ JOIN translation_units_fts fts ON tu.id = fts.rowid
1195
+ WHERE fts.source_text MATCH ?
1196
+ """
1197
+ params = [fts_query]
1198
+ elif direction == 'target':
1199
+ fts_sql = """
1200
+ SELECT tu.* FROM translation_units tu
1201
+ JOIN translation_units_fts fts ON tu.id = fts.rowid
1202
+ WHERE fts.target_text MATCH ?
1203
+ """
1204
+ params = [fts_query]
1205
+ else:
1206
+ # Both directions - search in combined FTS index
1207
+ fts_sql = """
1208
+ SELECT tu.* FROM translation_units tu
1209
+ JOIN translation_units_fts fts ON tu.id = fts.rowid
1210
+ WHERE translation_units_fts MATCH ?
1211
+ """
1212
+ params = [fts_query]
1213
+
1214
+ if tm_ids:
1215
+ placeholders = ','.join('?' * len(tm_ids))
1216
+ fts_sql += f" AND tu.tm_id IN ({placeholders})"
1217
+ params.extend(tm_ids)
1218
+
1219
+ # DON'T filter by language when smart search active
1220
+ # (we need to search all TMs and figure out which column has our language)
1221
+ if not use_smart_search:
1222
+ # Traditional filtering when no language filters
1223
+ if source_langs:
1224
+ placeholders = ','.join('?' * len(source_langs))
1225
+ fts_sql += f" AND tu.source_lang IN ({placeholders})"
1226
+ params.extend(source_langs)
1227
+ if target_langs:
1228
+ placeholders = ','.join('?' * len(target_langs))
1229
+ fts_sql += f" AND tu.target_lang IN ({placeholders})"
1230
+ params.extend(target_langs)
1231
+
1232
+ fts_sql += " ORDER BY tu.modified_date DESC LIMIT 100"
1233
+
1234
+ self.cursor.execute(fts_sql, params)
1235
+ raw_results = [dict(row) for row in self.cursor.fetchall()]
1236
+
1237
+ # Smart search: Filter and swap based on language metadata
1238
+ if use_smart_search:
1239
+ processed_results = []
1240
+ for row in raw_results:
1241
+ row_src_lang = row.get('source_lang', '')
1242
+ row_tgt_lang = row.get('target_lang', '')
1243
+
1244
+ # Check if this row matches our language requirements
1245
+ # If "From: Dutch, To: English":
1246
+ # - Accept if source=nl and target=en (normal)
1247
+ # - Accept if source=en and target=nl (swap needed)
1248
+
1249
+ matches = False
1250
+ needs_swap = False
1251
+
1252
+ if source_langs and target_langs:
1253
+ # Both filters specified
1254
+ if row_src_lang in source_langs and row_tgt_lang in target_langs:
1255
+ # Perfect match - no swap
1256
+ matches = True
1257
+ needs_swap = False
1258
+ elif row_src_lang in target_langs and row_tgt_lang in source_langs:
1259
+ # Reversed - needs swap
1260
+ matches = True
1261
+ needs_swap = True
1262
+ elif source_langs:
1263
+ # Only "From" specified - just check if Dutch is in EITHER column
1264
+ if row_src_lang in source_langs:
1265
+ matches = True
1266
+ needs_swap = False
1267
+ elif row_tgt_lang in source_langs:
1268
+ matches = True
1269
+ needs_swap = True
1270
+ elif target_langs:
1271
+ # Only "To" specified - just check if English is in EITHER column
1272
+ if row_tgt_lang in target_langs:
1273
+ matches = True
1274
+ needs_swap = False
1275
+ elif row_src_lang in target_langs:
1276
+ matches = True
1277
+ needs_swap = True
1278
+
1279
+ if matches:
1280
+ # CRITICAL CHECK: Verify the search text is actually in the correct column
1281
+ # If user searches for Dutch with "From: Dutch", the text must be in the source column (after any swap)
1282
+ # This prevents finding Dutch text when user asks to search FOR English
1283
+
1284
+ if needs_swap:
1285
+ # After swap, check if query is in the NEW source column (was target)
1286
+ text_to_check = row['target_text'].lower()
1287
+ else:
1288
+ # No swap, check if query is in source column
1289
+ text_to_check = row['source_text'].lower()
1290
+
1291
+ # Only include if query text is actually in the source column
1292
+ if query.lower() in text_to_check:
1293
+ if needs_swap:
1294
+ # Swap columns to show correct language order
1295
+ swapped_row = row.copy()
1296
+ swapped_row['source'] = row['target_text']
1297
+ swapped_row['target'] = row['source_text']
1298
+ swapped_row['source_lang'] = row['target_lang']
1299
+ swapped_row['target_lang'] = row['source_lang']
1300
+ processed_results.append(swapped_row)
1301
+ else:
1302
+ # No swap needed - just rename columns
1303
+ processed_row = row.copy()
1304
+ processed_row['source'] = row['source_text']
1305
+ processed_row['target'] = row['target_text']
1306
+ processed_results.append(processed_row)
1307
+
1308
+ return processed_results
1309
+ else:
1310
+ # No language filters - just rename columns
1311
+ processed_results = []
1312
+ for row in raw_results:
1313
+ processed_row = row.copy()
1314
+ processed_row['source'] = row['source_text']
1315
+ processed_row['target'] = row['target_text']
1316
+ processed_results.append(processed_row)
1317
+ return processed_results
1318
+
1319
+ except Exception as e:
1320
+ # Fallback to LIKE query if FTS5 fails (e.g., index not built)
1321
+ print(f"[TM] FTS5 search failed, falling back to LIKE: {e}")
1322
+ search_query = f"%{query}%"
1323
+
1324
+ if direction == 'source':
1325
+ sql = """
1326
+ SELECT * FROM translation_units
1327
+ WHERE source_text LIKE ?
1328
+ """
1329
+ params = [search_query]
1330
+ elif direction == 'target':
1331
+ sql = """
1332
+ SELECT * FROM translation_units
1333
+ WHERE target_text LIKE ?
1334
+ """
1335
+ params = [search_query]
1336
+ else:
1337
+ sql = """
1338
+ SELECT * FROM translation_units
1339
+ WHERE (source_text LIKE ? OR target_text LIKE ?)
1340
+ """
1341
+ params = [search_query, search_query]
1342
+
1343
+ if tm_ids:
1344
+ placeholders = ','.join('?' * len(tm_ids))
1345
+ sql += f" AND tm_id IN ({placeholders})"
1346
+ params.extend(tm_ids)
1347
+
1348
+ # Add language filters (support for list of variants)
1349
+ if source_langs:
1350
+ placeholders = ','.join('?' * len(source_langs))
1351
+ sql += f" AND source_lang IN ({placeholders})"
1352
+ params.extend(source_langs)
1353
+ if target_langs:
1354
+ placeholders = ','.join('?' * len(target_langs))
1355
+ sql += f" AND target_lang IN ({placeholders})"
1356
+ params.extend(target_langs)
1357
+
1358
+ sql += " ORDER BY modified_date DESC LIMIT 100"
1359
+
1360
+ self.cursor.execute(sql, params)
1361
+ return [dict(row) for row in self.cursor.fetchall()]
1362
+
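Concordance searches accept either a single language code or a list of variants, as the docstring notes; a sketch with hypothetical variant lists:

rows = db.concordance_search("factuur", direction="both",
                             source_lang=["nl", "nl-NL"],
                             target_lang=["en", "en-US", "en-GB"])
for row in rows:
    # smart search adds normalized 'source'/'target' keys; the LIKE fallback returns raw rows
    print(row.get("source", row["source_text"]), "->", row.get("target", row["target_text"]))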
1363
+ def rebuild_fts_index(self) -> int:
1364
+ """
1365
+ Rebuild the FTS5 full-text search index from scratch.
1366
+ Use this after importing TMs or if FTS search isn't returning results.
1367
+
1368
+ Returns:
1369
+ Number of entries indexed
1370
+ """
1371
+ try:
1372
+ # Clear existing FTS data
1373
+ self.cursor.execute("DELETE FROM translation_units_fts")
1374
+
1375
+ # Repopulate from translation_units table
1376
+ self.cursor.execute("""
1377
+ INSERT INTO translation_units_fts(rowid, source_text, target_text)
1378
+ SELECT id, source_text, target_text FROM translation_units
1379
+ """)
1380
+
1381
+ self.connection.commit()
1382
+
1383
+ # Get count
1384
+ self.cursor.execute("SELECT COUNT(*) FROM translation_units_fts")
1385
+ count = self.cursor.fetchone()[0]
1386
+ print(f"[TM] FTS5 index rebuilt with {count:,} entries")
1387
+ return count
1388
+ except Exception as e:
1389
+ print(f"[TM] Error rebuilding FTS index: {e}")
1390
+ return 0
1391
+
1392
+ def check_fts_index(self) -> Dict:
1393
+ """
1394
+ Check if FTS5 index is in sync with main table.
1395
+
1396
+ Returns:
1397
+ Dict with 'main_count', 'fts_count', 'in_sync' keys
1398
+ """
1399
+ try:
1400
+ self.cursor.execute("SELECT COUNT(*) FROM translation_units")
1401
+ main_count = self.cursor.fetchone()[0]
1402
+
1403
+ self.cursor.execute("SELECT COUNT(*) FROM translation_units_fts")
1404
+ fts_count = self.cursor.fetchone()[0]
1405
+
1406
+ return {
1407
+ 'main_count': main_count,
1408
+ 'fts_count': fts_count,
1409
+ 'in_sync': main_count == fts_count
1410
+ }
1411
+ except Exception as e:
1412
+ return {'main_count': 0, 'fts_count': 0, 'in_sync': False, 'error': str(e)}
1413
+
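The same check-then-rebuild pattern that connect() applies automatically can also be run by hand, for example after a bulk TM import (continuing the sketch above):

status = db.check_fts_index()
if not status.get("in_sync", True):
    db.rebuild_fts_index()            # repopulates translation_units_fts from translation_units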
1414
+ # ============================================
1415
+ # TERMBASE METHODS (add_termbase_term is still a Phase 3 placeholder)
1416
+ # ============================================
1417
+
1418
+ def add_termbase_term(self, source_term: str, target_term: str,
1419
+ source_lang: str, target_lang: str,
1420
+ termbase_id: str = 'main', **kwargs) -> int:
1421
+ """Add term to termbase (Phase 3)"""
1422
+ # TODO: Implement in Phase 3
1423
+ pass
1424
+
+ def search_termbases(self, search_term: str, source_lang: str = None,
+ target_lang: str = None, project_id: str = None,
+ min_length: int = 0) -> List[Dict]:
+ """
+ Search termbases for matching source terms
+
+ Args:
+ search_term: Source term to search for
+ source_lang: Filter by source language (optional)
+ target_lang: Filter by target language (optional)
+ project_id: Filter by project (optional)
+ min_length: Minimum term length to return
+
+ Returns:
+ List of termbase hits, sorted by priority (lower = higher priority)
+ """
+ # Build query with filters - include termbase name and ranking via JOIN
+ # Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in termbases
+ # Use CAST to ensure proper comparison
+ # IMPORTANT: Join with termbase_activation to get the ACTUAL priority for this project
+ # CRITICAL FIX: Also match when search_term starts with the glossary term
+ # This handles cases like searching for "ca." when the glossary has "ca"
+ # AND searching for "ca" when the glossary has "ca."
+ # We also strip trailing punctuation from glossary terms for comparison
+ query = """
+ SELECT
+ t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
+ t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
+ t.notes, t.project, t.client,
+ tb.name as termbase_name,
+ tb.source_lang as termbase_source_lang,
+ tb.target_lang as termbase_target_lang,
+ tb.is_project_termbase,
+ COALESCE(ta.priority, tb.ranking) as ranking
+ FROM termbase_terms t
+ LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
+ LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
+ WHERE (
+ LOWER(t.source_term) = LOWER(?) OR
+ LOWER(t.source_term) LIKE LOWER(?) OR
+ LOWER(t.source_term) LIKE LOWER(?) OR
+ LOWER(t.source_term) LIKE LOWER(?) OR
+ LOWER(RTRIM(t.source_term, '.!?,;:')) = LOWER(?) OR
+ LOWER(?) LIKE LOWER(t.source_term) || '%' OR
+ LOWER(?) = LOWER(RTRIM(t.source_term, '.!?,;:'))
+ )
+ AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
+ """
+ # Matching patterns:
+ # 1. Exact match: source_term = search_term
+ # 2. Glossary term starts with search: source_term LIKE "search_term %"
+ # 3. Glossary term ends with search: source_term LIKE "% search_term"
+ # 4. Glossary term contains search: source_term LIKE "% search_term %"
+ # 5. Glossary term (stripped) = search_term: RTRIM(source_term) = search_term (handles "ca." = "ca")
+ # 6. Search starts with glossary term: search_term LIKE source_term || '%'
+ # 7. Search = glossary term stripped: search_term = RTRIM(source_term)
+ params = [
+ project_id if project_id else 0, # Use 0 if no project (won't match any activation records)
+ search_term,
+ f"{search_term} %",
+ f"% {search_term}",
+ f"% {search_term} %",
+ search_term, # For RTRIM comparison
+ search_term, # For reverse LIKE
+ search_term # For reverse RTRIM comparison
+ ]
+
+ # Language filters - if term has no language, use termbase language for filtering
+ if source_lang:
+ query += """ AND (
+ t.source_lang = ? OR
+ (t.source_lang IS NULL AND tb.source_lang = ?) OR
+ (t.source_lang IS NULL AND tb.source_lang IS NULL)
+ )"""
+ params.extend([source_lang, source_lang])
+
+ if target_lang:
+ query += """ AND (
+ t.target_lang = ? OR
+ (t.target_lang IS NULL AND tb.target_lang = ?) OR
+ (t.target_lang IS NULL AND tb.target_lang IS NULL)
+ )"""
+ params.extend([target_lang, target_lang])
+
+ # Project filter: match project-specific terms OR global terms (project_id IS NULL)
+ if project_id:
+ query += " AND (t.project_id = ? OR t.project_id IS NULL)"
+ params.append(project_id)
+
+ if min_length > 0:
+ query += f" AND LENGTH(t.source_term) >= {min_length}"
+
+ # Sort by ranking (lower number = higher priority)
+ # Project termbases (ranking IS NULL) appear first, then by ranking, then alphabetically
+ # Use COALESCE to treat NULL as -1 (highest priority)
+ query += " ORDER BY COALESCE(tb.ranking, -1) ASC, t.source_term ASC"
+
+ self.cursor.execute(query, params)
+ results = []
+ for row in self.cursor.fetchall():
+ result_dict = dict(row)
+ # SQLite stores booleans as 0/1, explicitly convert to Python bool
+ if 'is_project_termbase' in result_dict:
+ result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])
+
+ # Fetch target synonyms for this term and include them in the result
+ term_id = result_dict.get('id')
+ if term_id:
+ try:
+ self.cursor.execute("""
+ SELECT synonym_text, forbidden FROM termbase_synonyms
+ WHERE term_id = ? AND language = 'target'
+ ORDER BY display_order ASC
+ """, (term_id,))
+ synonyms = []
+ for syn_row in self.cursor.fetchall():
+ syn_text = syn_row[0]
+ syn_forbidden = bool(syn_row[1])
+ if not syn_forbidden: # Only include non-forbidden synonyms
+ synonyms.append(syn_text)
+ result_dict['target_synonyms'] = synonyms
+ except Exception:
+ result_dict['target_synonyms'] = []
+
+ results.append(result_dict)
+ return results
+
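A usage sketch for search_termbases() above, assuming db is a connected instance of this manager class. The field names (source_term, target_term, forbidden, termbase_name, ranking, target_synonyms) come from the SELECT and post-processing shown above; the language codes 'en' and 'nl' and the helper name are illustrative only.

def lookup_terms(db, text: str, project_id: str = None):
    """Print allowed termbase hits for a source phrase, highest priority first."""
    hits = db.search_termbases(text, source_lang='en', target_lang='nl',
                               project_id=project_id, min_length=2)
    for hit in hits:
        if hit.get('forbidden'):
            continue  # skip forbidden terms
        targets = [hit['target_term']] + hit.get('target_synonyms', [])
        print(f"{hit['source_term']} -> {', '.join(targets)} "
              f"[{hit.get('termbase_name')}, rank {hit.get('ranking')}]")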
+ # ============================================
+ # UTILITY METHODS
+ # ============================================
+
+ def get_all_tms(self, enabled_only: bool = True) -> List[Dict]:
+ """
+ Get list of all translation memories
+
+ Args:
+ enabled_only: If True, only return enabled TMs
+
+ Returns:
+ List of TM info dictionaries with tm_id, name, entry_count, enabled
+ """
+ # Get distinct TM IDs from translation_units
+ query = "SELECT DISTINCT tm_id FROM translation_units ORDER BY tm_id"
+ self.cursor.execute(query)
+ tm_ids = [row[0] for row in self.cursor.fetchall()]
+
+ tm_list = []
+ for tm_id in tm_ids:
+ entry_count = self.get_tm_count(tm_id)
+ tm_info = {
+ 'tm_id': tm_id,
+ 'name': tm_id.replace('_', ' ').title(),
+ 'entry_count': entry_count,
+ 'enabled': True, # For now, all TMs are enabled
+ 'read_only': False
+ }
+ tm_list.append(tm_info)
+
+ return tm_list
+
+ def get_tm_list(self, enabled_only: bool = True) -> List[Dict]:
+ """Alias for get_all_tms for backward compatibility"""
+ return self.get_all_tms(enabled_only=enabled_only)
+
+ def get_entry_count(self, enabled_only: bool = True) -> int:
+ """
+ Get total number of translation entries
+
+ Args:
+ enabled_only: Currently ignored (all TMs enabled)
+
+ Returns:
+ Total number of translation units
+ """
+ return self.get_tm_count()
+
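A short inventory sketch built on the utility methods above, assuming a connected manager instance db; the dictionary keys come from get_all_tms() as shown, and the helper name is hypothetical.

def print_tm_inventory(db):
    """List every TM with its entry count, then the grand total."""
    for tm in db.get_all_tms():
        flag = "enabled" if tm['enabled'] else "disabled"
        print(f"{tm['name']:<30} {tm['entry_count']:>8,} entries ({flag})")
    print(f"Total translation units: {db.get_entry_count():,}")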
+ def vacuum(self):
+ """Optimize database (VACUUM)"""
+ self.cursor.execute("VACUUM")
+ self.connection.commit()
+
+ # ============================================
+ # TMX EDITOR METHODS (database-backed TMX files)
+ # ============================================
+
+ def tmx_store_file(self, file_path: str, file_name: str, original_file_path: str,
+ load_mode: str, file_size: int, header_data: dict,
+ tu_count: int, languages: List[str]) -> int:
+ """
+ Store TMX file metadata in database
+
+ Returns:
+ tmx_file_id (int)
+ """
+ languages_json = json.dumps(languages)
+ header_json = json.dumps(header_data)
+
+ # Check if file already exists
+ self.cursor.execute("SELECT id FROM tmx_files WHERE file_path = ?", (file_path,))
+ existing = self.cursor.fetchone()
+
+ if existing:
+ # Update existing
+ self.cursor.execute("""
+ UPDATE tmx_files
+ SET file_name = ?, original_file_path = ?, load_mode = ?, file_size = ?,
+ header_data = ?, tu_count = ?, languages = ?, last_accessed = CURRENT_TIMESTAMP
+ WHERE id = ?
+ """, (file_name, original_file_path, load_mode, file_size, header_json,
+ tu_count, languages_json, existing['id']))
+ self.connection.commit()
+ return existing['id']
+ else:
+ # Insert new
+ self.cursor.execute("""
+ INSERT INTO tmx_files
+ (file_path, file_name, original_file_path, load_mode, file_size,
+ header_data, tu_count, languages)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+ """, (file_path, file_name, original_file_path, load_mode, file_size,
+ header_json, tu_count, languages_json))
+ self.connection.commit()
+ return self.cursor.lastrowid
+
+ def tmx_store_translation_unit(self, tmx_file_id: int, tu_id: int,
+ creation_date: str = None, creation_id: str = None,
+ change_date: str = None, change_id: str = None,
+ srclang: str = None, custom_attributes: dict = None,
+ comments: List[str] = None, commit: bool = True) -> int:
+ """
+ Store a translation unit in database
+
+ Args:
+ commit: If False, don't commit (for batch operations)
+
+ Returns:
+ Internal TU ID (for referencing segments)
+ """
+ custom_attrs_json = json.dumps(custom_attributes) if custom_attributes else None
+ comments_json = json.dumps(comments) if comments else None
+
+ self.cursor.execute("""
+ INSERT OR REPLACE INTO tmx_translation_units
+ (tmx_file_id, tu_id, creation_date, creation_id, change_date, change_id,
+ srclang, custom_attributes, comments)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+ """, (tmx_file_id, tu_id, creation_date, creation_id, change_date, change_id,
+ srclang, custom_attrs_json, comments_json))
+ if commit:
+ self.connection.commit()
+ return self.cursor.lastrowid
+
+ def tmx_store_segment(self, tu_db_id: int, lang: str, text: str,
+ creation_date: str = None, creation_id: str = None,
+ change_date: str = None, change_id: str = None,
+ commit: bool = True):
+ """
+ Store a segment (language variant) for a translation unit
+
+ Args:
+ commit: If False, don't commit (for batch operations)
+ """
+ self.cursor.execute("""
+ INSERT OR REPLACE INTO tmx_segments
+ (tu_id, lang, text, creation_date, creation_id, change_date, change_id)
+ VALUES (?, ?, ?, ?, ?, ?, ?)
+ """, (tu_db_id, lang, text, creation_date, creation_id, change_date, change_id))
+ if commit:
+ self.connection.commit()
+
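A batched-import sketch using the three storage methods above. It assumes a connected manager instance db, and that db.connection is the underlying SQLite connection (as the commits in the code above suggest). The parsed_tus structure, the load_mode value, and the function name are hypothetical stand-ins for whatever the TMX parser elsewhere in the package produces.

import os

def import_parsed_tmx(db, file_path, header, parsed_tus, languages):
    """Store one parsed TMX file, committing once at the end for speed.

    parsed_tus: list of dicts like
        {'tu_id': 1, 'srclang': 'en', 'segments': {'en': 'Hello', 'nl': 'Hallo'}}
    (a hypothetical shape, for illustration only).
    """
    tmx_file_id = db.tmx_store_file(
        file_path=file_path,
        file_name=os.path.basename(file_path),
        original_file_path=file_path,
        load_mode='full',            # assumed value; load_mode semantics are not shown here
        file_size=os.path.getsize(file_path),
        header_data=header,
        tu_count=len(parsed_tus),
        languages=languages,
    )
    for tu in parsed_tus:
        tu_db_id = db.tmx_store_translation_unit(
            tmx_file_id, tu['tu_id'], srclang=tu.get('srclang'), commit=False)
        for lang, text in tu['segments'].items():
            db.tmx_store_segment(tu_db_id, lang, text, commit=False)
    db.connection.commit()  # single commit for the whole batch
    return tmx_file_id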
+ def tmx_get_file_id(self, file_path: str) -> Optional[int]:
+ """Get TMX file ID by file path"""
+ self.cursor.execute("SELECT id FROM tmx_files WHERE file_path = ?", (file_path,))
+ row = self.cursor.fetchone()
+ return row['id'] if row else None
+
+ def tmx_get_translation_units(self, tmx_file_id: int, offset: int = 0,
+ limit: int = 50, src_lang: str = None,
+ tgt_lang: str = None, src_filter: str = None,
+ tgt_filter: str = None, ignore_case: bool = True) -> List[Dict]:
+ """
+ Get translation units with pagination and filtering
+
+ Returns:
+ List of dicts with TU data including segments
+ """
+ # Build base query
+ query = """
+ SELECT tu.id as tu_db_id, tu.tu_id, tu.creation_date, tu.creation_id,
+ tu.change_date, tu.change_id, tu.srclang, tu.custom_attributes, tu.comments
+ FROM tmx_translation_units tu
+ WHERE tu.tmx_file_id = ?
+ """
+ params = [tmx_file_id]
+
+ # Add filters
+ if src_filter or tgt_filter:
+ query += """
+ AND EXISTS (
+ SELECT 1 FROM tmx_segments seg1
+ WHERE seg1.tu_id = tu.id
+ """
+ if src_lang:
+ query += " AND seg1.lang = ?"
+ params.append(src_lang)
+ if src_filter:
+ if ignore_case:
+ query += " AND LOWER(seg1.text) LIKE LOWER(?)"
+ params.append(f"%{src_filter}%")
+ else:
+ query += " AND seg1.text LIKE ?"
+ params.append(f"%{src_filter}%")
+
+ if tgt_filter:
+ query += """
+ AND EXISTS (
+ SELECT 1 FROM tmx_segments seg2
+ WHERE seg2.tu_id = tu.id
+ """
+ if tgt_lang:
+ query += " AND seg2.lang = ?"
+ params.append(tgt_lang)
+ if ignore_case:
+ query += " AND LOWER(seg2.text) LIKE LOWER(?)"
+ params.append(f"%{tgt_filter}%")
+ else:
+ query += " AND seg2.text LIKE ?"
+ params.append(f"%{tgt_filter}%")
+ query += ")"
+
+ query += ")"
+
+ query += " ORDER BY tu.tu_id LIMIT ? OFFSET ?"
+ params.extend([limit, offset])
+
+ self.cursor.execute(query, params)
+ rows = self.cursor.fetchall()
+
+ # Fetch segments for each TU
+ result = []
+ for row in rows:
+ tu_data = dict(row)
+ # Get segments
+ self.cursor.execute("""
+ SELECT lang, text, creation_date, creation_id, change_date, change_id
+ FROM tmx_segments
+ WHERE tu_id = ?
+ """, (tu_data['tu_db_id'],))
+ segments = {}
+ for seg_row in self.cursor.fetchall():
+ seg_dict = dict(seg_row)
+ segments[seg_dict['lang']] = seg_dict
+
+ tu_data['segments'] = segments
+ if tu_data['custom_attributes']:
+ tu_data['custom_attributes'] = json.loads(tu_data['custom_attributes'])
+ if tu_data['comments']:
+ tu_data['comments'] = json.loads(tu_data['comments'])
+
+ result.append(tu_data)
+
+ return result
+
+ def tmx_count_translation_units(self, tmx_file_id: int, src_lang: str = None,
+ tgt_lang: str = None, src_filter: str = None,
+ tgt_filter: str = None, ignore_case: bool = True) -> int:
+ """Count translation units matching filters"""
+ query = """
+ SELECT COUNT(DISTINCT tu.id)
+ FROM tmx_translation_units tu
+ WHERE tu.tmx_file_id = ?
+ """
+ params = [tmx_file_id]
+
+ # Add same filters as tmx_get_translation_units
+ if src_filter or tgt_filter:
+ query += """
+ AND EXISTS (
+ SELECT 1 FROM tmx_segments seg1
+ WHERE seg1.tu_id = tu.id
+ """
+ if src_lang:
+ query += " AND seg1.lang = ?"
+ params.append(src_lang)
+ if src_filter:
+ if ignore_case:
+ query += " AND LOWER(seg1.text) LIKE LOWER(?)"
+ params.append(f"%{src_filter}%")
+ else:
+ query += " AND seg1.text LIKE ?"
+ params.append(f"%{src_filter}%")
+
+ if tgt_filter:
+ query += """
+ AND EXISTS (
+ SELECT 1 FROM tmx_segments seg2
+ WHERE seg2.tu_id = tu.id
+ """
+ if tgt_lang:
+ query += " AND seg2.lang = ?"
+ params.append(tgt_lang)
+ if ignore_case:
+ query += " AND LOWER(seg2.text) LIKE LOWER(?)"
+ params.append(f"%{tgt_filter}%")
+ else:
+ query += " AND seg2.text LIKE ?"
+ params.append(f"%{tgt_filter}%")
+ query += ")"
+
+ query += ")"
+
+ self.cursor.execute(query, params)
+ return self.cursor.fetchone()[0]
+
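A paging sketch that combines the two retrieval methods above, assuming a connected manager instance db; the page size, filter value, and the 'en'/'nl' language codes in the commented example are arbitrary.

def iter_tmx_pages(db, tmx_file_id, page_size=50, src_filter=None, tgt_filter=None):
    """Yield translation units page by page, using the count method for the total."""
    total = db.tmx_count_translation_units(
        tmx_file_id, src_filter=src_filter, tgt_filter=tgt_filter)
    for offset in range(0, total, page_size):
        yield db.tmx_get_translation_units(
            tmx_file_id, offset=offset, limit=page_size,
            src_filter=src_filter, tgt_filter=tgt_filter)

# Example: print source/target text for every TU containing "invoice".
# for page in iter_tmx_pages(db, tmx_file_id, src_filter="invoice"):
#     for tu in page:
#         segs = tu['segments']
#         print(segs.get('en', {}).get('text'), "->", segs.get('nl', {}).get('text'))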
+ def tmx_update_segment(self, tmx_file_id: int, tu_id: int, lang: str, text: str):
+ """Update a segment text"""
+ # Get internal TU ID
+ self.cursor.execute("""
+ SELECT tu.id FROM tmx_translation_units tu
+ WHERE tu.tmx_file_id = ? AND tu.tu_id = ?
+ """, (tmx_file_id, tu_id))
+ tu_row = self.cursor.fetchone()
+ if not tu_row:
+ return False
+
+ tu_db_id = tu_row['id']
+ change_date = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
+
+ # Update segment
+ self.cursor.execute("""
+ UPDATE tmx_segments
+ SET text = ?, change_date = ?
+ WHERE tu_id = ? AND lang = ?
+ """, (text, change_date, tu_db_id, lang))
+
+ # Update TU change date
+ self.cursor.execute("""
+ UPDATE tmx_translation_units
+ SET change_date = ?
+ WHERE id = ?
+ """, (change_date, tu_db_id))
+
+ # Update file last_modified
+ self.cursor.execute("""
+ UPDATE tmx_files
+ SET last_modified = CURRENT_TIMESTAMP
+ WHERE id = ?
+ """, (tmx_file_id,))
+
+ self.connection.commit()
+ return True
+
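A small edit sketch for the update method above, again assuming a connected manager instance db; the ids, text, and helper name are placeholders.

def edit_segment(db, tmx_file_id, tu_id, lang, new_text):
    """Apply an edit and report whether the TU actually existed."""
    if db.tmx_update_segment(tmx_file_id, tu_id, lang, new_text):
        print(f"TU {tu_id} [{lang}] updated")
    else:
        print(f"TU {tu_id} not found in file {tmx_file_id}")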
+ def tmx_delete_file(self, tmx_file_id: int):
+ """Delete TMX file and all its data (CASCADE will handle TUs and segments)"""
+ self.cursor.execute("DELETE FROM tmx_files WHERE id = ?", (tmx_file_id,))
+ self.connection.commit()
+
+ def tmx_get_file_info(self, tmx_file_id: int) -> Optional[Dict]:
+ """Get TMX file metadata"""
+ self.cursor.execute("""
+ SELECT id, file_path, file_name, original_file_path, load_mode,
+ file_size, header_data, tu_count, languages,
+ created_date, last_accessed, last_modified
+ FROM tmx_files
+ WHERE id = ?
+ """, (tmx_file_id,))
+ row = self.cursor.fetchone()
+ if not row:
+ return None
+
+ info = dict(row)
+ info['header_data'] = json.loads(info['header_data'])
+ info['languages'] = json.loads(info['languages'])
+ return info
+
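A lookup sketch tying tmx_get_file_id() and tmx_get_file_info() together, assuming a connected manager instance db; the path argument and helper name are placeholders, and the printed keys come from the SELECT shown above.

def describe_tmx(db, file_path):
    """Print basic metadata for a stored TMX file, if it exists."""
    tmx_file_id = db.tmx_get_file_id(file_path)
    if tmx_file_id is None:
        print(f"Not in database: {file_path}")
        return
    info = db.tmx_get_file_info(tmx_file_id)
    print(f"{info['file_name']}: {info['tu_count']:,} TUs, "
          f"languages {', '.join(info['languages'])}, mode {info['load_mode']}")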
+ def get_database_info(self) -> Dict:
+ """Get database statistics"""
+ info = {
+ 'path': self.db_path,
+ 'size_bytes': os.path.getsize(self.db_path) if os.path.exists(self.db_path) else 0,
+ 'tm_entries': self.get_tm_count(),
+ }
+
+ # Get size in MB
+ info['size_mb'] = round(info['size_bytes'] / (1024 * 1024), 2)
+
+ return info
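Finally, a maintenance sketch built on get_database_info() and vacuum() above, assuming a connected manager instance db; the 100 MB threshold and the function name are arbitrary choices, not part of the package.

def maybe_compact(db, max_size_mb=100):
    """VACUUM the database when it grows beyond a size threshold."""
    info = db.get_database_info()
    print(f"{info['path']}: {info['size_mb']} MB, {info['tm_entries']:,} TM entries")
    if info['size_mb'] > max_size_mb:
        db.vacuum()
        print("Database compacted with VACUUM")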