supervertaler 1.9.109__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. Supervertaler.py +44945 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1766 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +904 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +325 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +248 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1161 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +670 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +656 -0
  76. modules/unified_prompt_manager_qt.py +3715 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.109.dist-info/METADATA +788 -0
  81. supervertaler-1.9.109.dist-info/RECORD +85 -0
  82. supervertaler-1.9.109.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.109.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.109.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.109.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1766 @@
1
+ """
2
+ Database Manager Module
3
+
4
+ SQLite database backend for Translation Memories, Glossaries, and related resources.
5
+ Replaces in-memory JSON-based storage with efficient database storage.
6
+
7
+ Schema includes:
8
+ - Translation units (TM entries)
9
+ - Termbase terms
10
+ - Non-translatables
11
+ - Segmentation rules
12
+ - Project metadata
13
+ - Resource file references
14
+ """
15
+
16
+ import sqlite3
17
+ import os
18
+ import json
19
+ import hashlib
20
+ from datetime import datetime
21
+ from typing import List, Dict, Optional, Tuple
22
+ from pathlib import Path
23
+ from difflib import SequenceMatcher
24
+
25
+
26
+ class DatabaseManager:
27
+ """Manages SQLite database for translation resources"""
28
+
29
+ def __init__(self, db_path: str = None, log_callback=None):
30
+ """
31
+ Initialize database manager
32
+
33
+ Args:
34
+ db_path: Path to SQLite database file (default: user_data/supervertaler.db)
35
+ log_callback: Optional logging function
36
+ """
37
+ self.log = log_callback if log_callback else print
38
+
39
+ # Set default database path if not provided
40
+ if db_path is None:
41
+ # Will be set by application - defaults to user_data folder
42
+ self.db_path = "supervertaler.db"
43
+ else:
44
+ self.db_path = db_path
45
+
46
+ self.connection = None
47
+ self.cursor = None
48
+
49
+ def connect(self):
50
+ """Connect to database and create tables if needed"""
51
+ try:
52
+ # Create directory if it doesn't exist
53
+ os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
54
+
55
+ # Connect to database
56
+ self.connection = sqlite3.connect(self.db_path)
57
+ self.connection.row_factory = sqlite3.Row # Access columns by name
58
+ self.cursor = self.connection.cursor()
59
+
60
+ # Enable foreign keys
61
+ self.cursor.execute("PRAGMA foreign_keys = ON")
62
+
63
+ # Create tables
64
+ self._create_tables()
65
+
66
+ # Run database migrations (adds new columns/tables as needed)
67
+ try:
68
+ from modules.database_migrations import check_and_migrate
69
+ migration_success = check_and_migrate(self)
70
+ if not migration_success:
71
+ self.log("[WARNING] Database migration reported failure")
72
+ except Exception as e:
73
+ self.log(f"[WARNING] Database migration check failed: {e}")
74
+ import traceback
75
+ traceback.print_exc()
76
+
77
+ # Auto-sync FTS5 index if out of sync
78
+ try:
79
+ fts_status = self.check_fts_index()
80
+ if not fts_status.get('in_sync', True):
81
+ self.log(f"[TM] FTS5 index out of sync ({fts_status.get('fts_count', 0)} vs {fts_status.get('main_count', 0)}), rebuilding...")
82
+ self.rebuild_fts_index()
83
+ except Exception as e:
84
+ self.log(f"[WARNING] FTS5 index check failed: {e}")
85
+
86
+ self.log(f"[OK] Database connected: {os.path.basename(self.db_path)}")
87
+ return True
88
+
89
+ except Exception as e:
90
+ self.log(f"[ERROR] Database connection failed: {e}")
91
+ return False
92
+
93
+ def _create_tables(self):
94
+ """Create database schema"""
95
+ print("📊 Creating database tables...")
96
+
97
+ # ============================================
98
+ # TRANSLATION MEMORY TABLES
99
+ # ============================================
100
+
101
+ self.cursor.execute("""
102
+ CREATE TABLE IF NOT EXISTS translation_units (
103
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
104
+ source_text TEXT NOT NULL,
105
+ target_text TEXT NOT NULL,
106
+ source_lang TEXT NOT NULL,
107
+ target_lang TEXT NOT NULL,
108
+ tm_id TEXT NOT NULL,
109
+ project_id TEXT,
110
+
111
+ -- Context for better matching
112
+ context_before TEXT,
113
+ context_after TEXT,
114
+
115
+ -- Fast exact matching
116
+ source_hash TEXT NOT NULL,
117
+
118
+ -- Metadata
119
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
120
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
121
+ usage_count INTEGER DEFAULT 0,
122
+ created_by TEXT,
123
+ notes TEXT,
124
+
125
+ -- Indexes
126
+ UNIQUE(source_hash, target_text, tm_id)
127
+ )
128
+ """)
129
+
130
+ # Indexes for translation_units
131
+ self.cursor.execute("""
132
+ CREATE INDEX IF NOT EXISTS idx_tu_source_hash
133
+ ON translation_units(source_hash)
134
+ """)
135
+
136
+ self.cursor.execute("""
137
+ CREATE INDEX IF NOT EXISTS idx_tu_tm_id
138
+ ON translation_units(tm_id)
139
+ """)
140
+
141
+ self.cursor.execute("""
142
+ CREATE INDEX IF NOT EXISTS idx_tu_project_id
143
+ ON translation_units(project_id)
144
+ """)
145
+
146
+ self.cursor.execute("""
147
+ CREATE INDEX IF NOT EXISTS idx_tu_langs
148
+ ON translation_units(source_lang, target_lang)
149
+ """)
150
+
151
+ # Full-text search for fuzzy matching
152
+ self.cursor.execute("""
153
+ CREATE VIRTUAL TABLE IF NOT EXISTS translation_units_fts
154
+ USING fts5(
155
+ source_text,
156
+ target_text,
157
+ content=translation_units,
158
+ content_rowid=id
159
+ )
160
+ """)
161
+
162
+ # Triggers to keep FTS index in sync
163
+ self.cursor.execute("""
164
+ CREATE TRIGGER IF NOT EXISTS tu_fts_insert AFTER INSERT ON translation_units BEGIN
165
+ INSERT INTO translation_units_fts(rowid, source_text, target_text)
166
+ VALUES (new.id, new.source_text, new.target_text);
167
+ END
168
+ """)
169
+
170
+ self.cursor.execute("""
171
+ CREATE TRIGGER IF NOT EXISTS tu_fts_delete AFTER DELETE ON translation_units BEGIN
172
+ DELETE FROM translation_units_fts WHERE rowid = old.id;
173
+ END
174
+ """)
175
+
176
+ self.cursor.execute("""
177
+ CREATE TRIGGER IF NOT EXISTS tu_fts_update AFTER UPDATE ON translation_units BEGIN
178
+ DELETE FROM translation_units_fts WHERE rowid = old.id;
179
+ INSERT INTO translation_units_fts(rowid, source_text, target_text)
180
+ VALUES (new.id, new.source_text, new.target_text);
181
+ END
182
+ """)
183
+
184
+ # ============================================
185
+ # TRANSLATION MEMORY METADATA
186
+ # ============================================
187
+
188
+ # Translation Memories table - tracks individual TM names/metadata
189
+ self.cursor.execute("""
190
+ CREATE TABLE IF NOT EXISTS translation_memories (
191
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
192
+ name TEXT NOT NULL UNIQUE,
193
+ description TEXT,
194
+ source_lang TEXT,
195
+ target_lang TEXT,
196
+ tm_id TEXT NOT NULL UNIQUE, -- The tm_id used in translation_units table
197
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
198
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
199
+ entry_count INTEGER DEFAULT 0, -- Cached count, updated on changes
200
+ last_used TIMESTAMP,
201
+ is_project_tm BOOLEAN DEFAULT 0, -- Whether this is the special project TM
202
+ read_only BOOLEAN DEFAULT 1, -- Whether this TM should not be updated (default: read-only, Write unchecked)
203
+ project_id INTEGER -- Which project this TM belongs to (NULL = global)
204
+ )
205
+ """)
206
+
207
+ # TM activation (tracks which TMs are active for which projects)
208
+ self.cursor.execute("""
209
+ CREATE TABLE IF NOT EXISTS tm_activation (
210
+ tm_id INTEGER NOT NULL,
211
+ project_id INTEGER NOT NULL,
212
+ is_active BOOLEAN DEFAULT 1,
213
+ activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
214
+ PRIMARY KEY (tm_id, project_id),
215
+ FOREIGN KEY (tm_id) REFERENCES translation_memories(id) ON DELETE CASCADE
216
+ )
217
+ """)
218
+
219
+ # Index for fast tm_id lookups
220
+ self.cursor.execute("""
221
+ CREATE INDEX IF NOT EXISTS idx_tm_tm_id
222
+ ON translation_memories(tm_id)
223
+ """)
224
+
225
+ # Migration: Add is_project_tm, read_only, and project_id columns if they don't exist
226
+ try:
227
+ self.cursor.execute("PRAGMA table_info(translation_memories)")
228
+ columns = [row[1] for row in self.cursor.fetchall()]
229
+
230
+ if 'is_project_tm' not in columns:
231
+ self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN is_project_tm BOOLEAN DEFAULT 0")
232
+ print("✓ Added is_project_tm column to translation_memories")
233
+
234
+ if 'read_only' not in columns:
235
+ self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN read_only BOOLEAN DEFAULT 1")
236
+ print("✓ Added read_only column to translation_memories (default: read-only)")
237
+
238
+ if 'project_id' not in columns:
239
+ self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN project_id INTEGER")
240
+ print("✓ Added project_id column to translation_memories")
241
+
242
+ self.connection.commit()
243
+ except Exception as e:
244
+ print(f"Migration info: {e}")
245
+
246
+ # ============================================
247
+ # TERMBASE TABLES
248
+ # ============================================
249
+
250
+ # Termbases container table (terminology, never "termbase")
251
+ self.cursor.execute("""
252
+ CREATE TABLE IF NOT EXISTS termbases (
253
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
254
+ name TEXT NOT NULL UNIQUE,
255
+ description TEXT,
256
+ source_lang TEXT,
257
+ target_lang TEXT,
258
+ project_id INTEGER, -- NULL = global, set = project-specific
259
+ is_global BOOLEAN DEFAULT 1,
260
+ is_project_termbase BOOLEAN DEFAULT 0, -- True if this is a project-specific termbase
261
+ priority INTEGER DEFAULT 50, -- DEPRECATED: Use ranking instead
262
+ ranking INTEGER, -- Termbase activation ranking: 1 = highest priority, 2 = second highest, etc. Only for activated termbases.
263
+ read_only BOOLEAN DEFAULT 1, -- Whether this termbase should not be updated (default: read-only, Write unchecked)
264
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
265
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
266
+ )
267
+ """)
268
+
269
+ # Migration: Add priority column if it doesn't exist (for existing databases)
270
+ try:
271
+ self.cursor.execute("ALTER TABLE termbases ADD COLUMN priority INTEGER DEFAULT 50")
272
+ self.connection.commit()
273
+ except Exception:
274
+ # Column already exists, ignore
275
+ pass
276
+
277
+ # Migration: Add is_project_termbase column if it doesn't exist
278
+ try:
279
+ self.cursor.execute("ALTER TABLE termbases ADD COLUMN is_project_termbase BOOLEAN DEFAULT 0")
280
+ self.connection.commit()
281
+ except Exception:
282
+ # Column already exists, ignore
283
+ pass
284
+
285
+ # Migration: Add ranking column if it doesn't exist
286
+ try:
287
+ self.cursor.execute("ALTER TABLE termbases ADD COLUMN ranking INTEGER")
288
+ self.connection.commit()
289
+ except Exception:
290
+ # Column already exists, ignore
291
+ pass
292
+
293
+ # Migration: Add read_only column if it doesn't exist
294
+ try:
295
+ self.cursor.execute("ALTER TABLE termbases ADD COLUMN read_only BOOLEAN DEFAULT 1")
296
+ self.connection.commit()
297
+ except Exception:
298
+ # Column already exists, ignore
299
+ pass
300
+
301
+ # Data Migration: Set is_project_termbase=1 for termbases with non-NULL project_id
302
+ # This ensures existing project termbases are correctly flagged
303
+ try:
304
+ self.cursor.execute("""
305
+ UPDATE termbases
306
+ SET is_project_termbase = 1
307
+ WHERE project_id IS NOT NULL
308
+ AND (is_project_termbase IS NULL OR is_project_termbase = 0)
309
+ """)
310
+ updated_count = self.cursor.rowcount
311
+ if updated_count > 0:
312
+ self.log(f"✅ Data migration: Updated {updated_count} project termbase(s) with is_project_termbase=1")
313
+ self.connection.commit()
314
+ except Exception as e:
315
+ self.log(f"⚠️ Data migration warning (is_project_termbase): {e}")
316
+ pass
317
+
318
+ # Legacy support: create glossaries as alias for termbases
319
+ self.cursor.execute("""
320
+ CREATE TABLE IF NOT EXISTS glossaries (
321
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
322
+ name TEXT NOT NULL UNIQUE,
323
+ description TEXT,
324
+ source_lang TEXT,
325
+ target_lang TEXT,
326
+ project_id INTEGER,
327
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
328
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
329
+ )
330
+ """)
331
+
332
+ # Termbase activation (tracks which termbases are active for which projects)
333
+ self.cursor.execute("""
334
+ CREATE TABLE IF NOT EXISTS termbase_activation (
335
+ termbase_id INTEGER NOT NULL,
336
+ project_id INTEGER NOT NULL,
337
+ is_active BOOLEAN DEFAULT 1,
338
+ activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
339
+ priority INTEGER, -- Manual priority (1=highest, 2=second, etc.). Multiple termbases can share same priority.
340
+ PRIMARY KEY (termbase_id, project_id),
341
+ FOREIGN KEY (termbase_id) REFERENCES termbases(id) ON DELETE CASCADE
342
+ )
343
+ """)
344
+
345
+ # Migration: Add priority column to termbase_activation if it doesn't exist
346
+ try:
347
+ self.cursor.execute("ALTER TABLE termbase_activation ADD COLUMN priority INTEGER")
348
+ self.connection.commit()
349
+ except Exception:
350
+ # Column already exists, ignore
351
+ pass
352
+
353
+ # Legacy support: termbase_project_activation as alias
354
+ # Note: Foreign key now references termbases for consistency with Qt version
355
+ self.cursor.execute("""
356
+ CREATE TABLE IF NOT EXISTS termbase_project_activation (
357
+ termbase_id INTEGER NOT NULL,
358
+ project_id INTEGER NOT NULL,
359
+ activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
360
+ PRIMARY KEY (termbase_id, project_id),
361
+ FOREIGN KEY (termbase_id) REFERENCES termbases(id) ON DELETE CASCADE
362
+ )
363
+ """)
364
+
365
+ self.cursor.execute("""
366
+ CREATE TABLE IF NOT EXISTS termbase_terms (
367
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
368
+ source_term TEXT NOT NULL,
369
+ target_term TEXT NOT NULL,
370
+ source_lang TEXT DEFAULT 'unknown',
371
+ target_lang TEXT DEFAULT 'unknown',
372
+ termbase_id TEXT NOT NULL,
373
+ priority INTEGER DEFAULT 99,
374
+ project_id TEXT,
375
+
376
+ -- Terminology-specific fields
377
+ synonyms TEXT,
378
+ forbidden_terms TEXT,
379
+ definition TEXT,
380
+ context TEXT,
381
+ part_of_speech TEXT,
382
+ domain TEXT,
383
+ case_sensitive BOOLEAN DEFAULT 0,
384
+ forbidden BOOLEAN DEFAULT 0,
385
+
386
+ -- Link to TM entry (optional)
387
+ tm_source_id INTEGER,
388
+
389
+ -- Metadata
390
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
391
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
392
+ usage_count INTEGER DEFAULT 0,
393
+ notes TEXT,
394
+ note TEXT,
395
+ project TEXT,
396
+ client TEXT,
397
+ term_uuid TEXT,
398
+
399
+ FOREIGN KEY (tm_source_id) REFERENCES translation_units(id) ON DELETE SET NULL
400
+ )
401
+ """)
402
+
403
+ # Indexes for termbase_terms
404
+ self.cursor.execute("""
405
+ CREATE INDEX IF NOT EXISTS idx_gt_source_term
406
+ ON termbase_terms(source_term)
407
+ """)
408
+
409
+ self.cursor.execute("""
410
+ CREATE INDEX IF NOT EXISTS idx_gt_termbase_id
411
+ ON termbase_terms(termbase_id)
412
+ """)
413
+
414
+ self.cursor.execute("""
415
+ CREATE INDEX IF NOT EXISTS idx_gt_project_id
416
+ ON termbase_terms(project_id)
417
+ """)
418
+
419
+ self.cursor.execute("""
420
+ CREATE INDEX IF NOT EXISTS idx_gt_domain
421
+ ON termbase_terms(domain)
422
+ """)
423
+
424
+ # Full-text search for termbase
425
+ self.cursor.execute("""
426
+ CREATE VIRTUAL TABLE IF NOT EXISTS termbase_terms_fts
427
+ USING fts5(
428
+ source_term,
429
+ target_term,
430
+ definition,
431
+ content=termbase_terms,
432
+ content_rowid=id
433
+ )
434
+ """)
435
+
436
+ # ============================================
437
+ # NON-TRANSLATABLES
438
+ # ============================================
439
+
440
+ self.cursor.execute("""
441
+ CREATE TABLE IF NOT EXISTS non_translatables (
442
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
443
+ pattern TEXT NOT NULL UNIQUE,
444
+ pattern_type TEXT DEFAULT 'regex',
445
+ description TEXT,
446
+ project_id TEXT,
447
+ enabled BOOLEAN DEFAULT 1,
448
+ example_text TEXT,
449
+ category TEXT,
450
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
451
+ )
452
+ """)
453
+
454
+ self.cursor.execute("""
455
+ CREATE INDEX IF NOT EXISTS idx_nt_project_id
456
+ ON non_translatables(project_id)
457
+ """)
458
+
459
+ self.cursor.execute("""
460
+ CREATE INDEX IF NOT EXISTS idx_nt_category
461
+ ON non_translatables(category)
462
+ """)
463
+
464
+ # ============================================
465
+ # SEGMENTATION RULES
466
+ # ============================================
467
+
468
+ self.cursor.execute("""
469
+ CREATE TABLE IF NOT EXISTS segmentation_rules (
470
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
471
+ rule_name TEXT NOT NULL,
472
+ source_lang TEXT,
473
+ rule_type TEXT NOT NULL,
474
+ pattern TEXT NOT NULL,
475
+ description TEXT,
476
+ priority INTEGER DEFAULT 100,
477
+ enabled BOOLEAN DEFAULT 1,
478
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
479
+ )
480
+ """)
481
+
482
+ self.cursor.execute("""
483
+ CREATE INDEX IF NOT EXISTS idx_sr_source_lang
484
+ ON segmentation_rules(source_lang)
485
+ """)
486
+
487
+ self.cursor.execute("""
488
+ CREATE INDEX IF NOT EXISTS idx_sr_priority
489
+ ON segmentation_rules(priority)
490
+ """)
491
+
492
+ # ============================================
493
+ # PROJECT METADATA
494
+ # ============================================
495
+
496
+ self.cursor.execute("""
497
+ CREATE TABLE IF NOT EXISTS projects (
498
+ id TEXT PRIMARY KEY,
499
+ name TEXT NOT NULL,
500
+ source_lang TEXT,
501
+ target_lang TEXT,
502
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
503
+ modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
504
+ last_opened TIMESTAMP,
505
+
506
+ -- Linked resources (JSON arrays)
507
+ active_tm_ids TEXT,
508
+ active_termbase_ids TEXT,
509
+ active_prompt_file TEXT,
510
+ active_style_guide TEXT,
511
+
512
+ -- Statistics
513
+ segment_count INTEGER DEFAULT 0,
514
+ translated_count INTEGER DEFAULT 0,
515
+
516
+ -- Settings (JSON blob)
517
+ settings TEXT
518
+ )
519
+ """)
520
+
521
+ # ============================================
522
+ # FILE METADATA (for prompts and style guides)
523
+ # ============================================
524
+
525
+ self.cursor.execute("""
526
+ CREATE TABLE IF NOT EXISTS prompt_files (
527
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
528
+ file_path TEXT NOT NULL UNIQUE,
529
+ file_type TEXT NOT NULL,
530
+ name TEXT NOT NULL,
531
+ description TEXT,
532
+ last_used TIMESTAMP,
533
+ use_count INTEGER DEFAULT 0
534
+ )
535
+ """)
536
+
537
+ # ============================================
538
+ # TMX EDITOR TABLES (for database-backed TMX files)
539
+ # ============================================
540
+
541
+ self.cursor.execute("""
542
+ CREATE TABLE IF NOT EXISTS tmx_files (
543
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
544
+ file_path TEXT NOT NULL UNIQUE,
545
+ file_name TEXT NOT NULL,
546
+ original_file_path TEXT, -- Original file path when imported
547
+ load_mode TEXT NOT NULL, -- 'ram' or 'database'
548
+ file_size INTEGER, -- File size in bytes
549
+
550
+ -- Header metadata (JSON)
551
+ header_data TEXT NOT NULL,
552
+
553
+ -- Statistics
554
+ tu_count INTEGER DEFAULT 0,
555
+ languages TEXT, -- JSON array of language codes
556
+
557
+ -- Timestamps
558
+ created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
559
+ last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
560
+ last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP
561
+ )
562
+ """)
563
+
564
+ self.cursor.execute("""
565
+ CREATE TABLE IF NOT EXISTS tmx_translation_units (
566
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
567
+ tmx_file_id INTEGER NOT NULL,
568
+ tu_id INTEGER NOT NULL, -- Original TU ID from TMX file
569
+
570
+ -- System attributes
571
+ creation_date TEXT,
572
+ creation_id TEXT,
573
+ change_date TEXT,
574
+ change_id TEXT,
575
+ srclang TEXT,
576
+
577
+ -- Custom attributes (JSON)
578
+ custom_attributes TEXT,
579
+
580
+ -- Comments (JSON array)
581
+ comments TEXT,
582
+
583
+ FOREIGN KEY (tmx_file_id) REFERENCES tmx_files(id) ON DELETE CASCADE,
584
+ UNIQUE(tmx_file_id, tu_id)
585
+ )
586
+ """)
587
+
588
+ self.cursor.execute("""
589
+ CREATE TABLE IF NOT EXISTS tmx_segments (
590
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
591
+ tu_id INTEGER NOT NULL, -- References tmx_translation_units.id
592
+ lang TEXT NOT NULL,
593
+ text TEXT NOT NULL,
594
+
595
+ -- Language-specific attributes
596
+ creation_date TEXT,
597
+ creation_id TEXT,
598
+ change_date TEXT,
599
+ change_id TEXT,
600
+
601
+ FOREIGN KEY (tu_id) REFERENCES tmx_translation_units(id) ON DELETE CASCADE,
602
+ UNIQUE(tu_id, lang)
603
+ )
604
+ """)
605
+
606
+ # Indexes for TMX tables
607
+ self.cursor.execute("""
608
+ CREATE INDEX IF NOT EXISTS idx_tmx_tu_file_id
609
+ ON tmx_translation_units(tmx_file_id)
610
+ """)
611
+
612
+ self.cursor.execute("""
613
+ CREATE INDEX IF NOT EXISTS idx_tmx_tu_tu_id
614
+ ON tmx_translation_units(tu_id)
615
+ """)
616
+
617
+ self.cursor.execute("""
618
+ CREATE INDEX IF NOT EXISTS idx_tmx_seg_tu_id
619
+ ON tmx_segments(tu_id)
620
+ """)
621
+
622
+ self.cursor.execute("""
623
+ CREATE INDEX IF NOT EXISTS idx_tmx_seg_lang
624
+ ON tmx_segments(lang)
625
+ """)
626
+
627
+ self.cursor.execute("""
628
+ CREATE TABLE IF NOT EXISTS style_guide_files (
629
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
630
+ file_path TEXT NOT NULL UNIQUE,
631
+ language TEXT NOT NULL,
632
+ last_used TIMESTAMP,
633
+ use_count INTEGER DEFAULT 0
634
+ )
635
+ """)
636
+
637
+ # Commit schema
638
+ try:
639
+ self.connection.commit()
640
+ print("✅ Database tables created and committed successfully")
641
+ except Exception as e:
642
+ print(f"❌ Error committing database schema: {e}")
643
+ import traceback
644
+ traceback.print_exc()
645
+ raise
646
+
647
+ def close(self):
648
+ """Close database connection"""
649
+ if self.connection:
650
+ self.connection.close()
651
+ self.connection = None
652
+ self.cursor = None
653
+
654
+ # ============================================
655
+ # TRANSLATION MEMORY METHODS
656
+ # ============================================
657
+
658
+ def add_translation_unit(self, source: str, target: str, source_lang: str,
659
+ target_lang: str, tm_id: str = 'project',
660
+ project_id: str = None, context_before: str = None,
661
+ context_after: str = None, notes: str = None) -> int:
662
+ """
663
+ Add translation unit to database
664
+
665
+ Returns: ID of inserted/updated entry
666
+ """
667
+ # Generate hash for fast exact matching
668
+ source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
669
+
670
+ try:
671
+ self.cursor.execute("""
672
+ INSERT INTO translation_units
673
+ (source_text, target_text, source_lang, target_lang, tm_id,
674
+ project_id, context_before, context_after, source_hash, notes)
675
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
676
+ ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
677
+ usage_count = usage_count + 1,
678
+ modified_date = CURRENT_TIMESTAMP
679
+ """, (source, target, source_lang, target_lang, tm_id,
680
+ project_id, context_before, context_after, source_hash, notes))
681
+
682
+ self.connection.commit()
683
+ return self.cursor.lastrowid
684
+
685
+ except Exception as e:
686
+ self.log(f"Error adding translation unit: {e}")
687
+ return None
688
+
689
def get_exact_match(self, source: str, tm_ids: List[str] = None,
                    source_lang: str = None, target_lang: str = None,
                    bidirectional: bool = True) -> Optional[Dict]:
    """
    Look up an exact TM match for ``source``.

    The forward lookup matches on source_hash plus source_text. If it finds
    nothing and ``bidirectional`` is set (and both languages were given), a
    reverse lookup searches for ``source`` in target_text with the language
    filters swapped. Whichever row is returned has its usage_count
    incremented and committed.

    Args:
        source: Source text to match
        tm_ids: TM IDs to restrict the search to (None or empty = all)
        source_lang: Source language filter; every variant returned by
            get_lang_match_variants matches exactly or as a "variant-..." prefix
        target_lang: Target language filter (same matching rules)
        bidirectional: If True, also try the reverse direction (nl→en AND en→nl)

    Returns:
        The matching row as a dict (best usage_count, then newest
        modified_date), or None. A reverse match comes back with source/target
        text and languages swapped and ``reverse_match`` set to True.
    """
    from modules.tmx_generator import get_base_lang_code
    from modules.tmx_generator import get_lang_match_variants

    best_first = " ORDER BY usage_count DESC, modified_date DESC LIMIT 1"

    def language_filter(column, lang):
        """Build "(col = ? OR col LIKE ? OR ...)" plus its parameters for one language."""
        pieces, values = [], []
        for variant in get_lang_match_variants(lang):
            pieces.append(f"{column} = ?")
            values.append(variant)
            pieces.append(f"{column} LIKE ?")
            values.append(f"{variant}-%")
        return f"({' OR '.join(pieces)})", values

    def fetch_best(sql, args):
        """Run the lookup, count the hit on the returned row, and hand the row back."""
        self.cursor.execute(sql + best_first, args)
        hit = self.cursor.fetchone()
        if hit:
            # Update usage count
            self.cursor.execute("""
                UPDATE translation_units
                SET usage_count = usage_count + 1
                WHERE id = ?
            """, (hit['id'],))
            self.connection.commit()
        return hit

    digest = hashlib.md5(source.encode('utf-8')).hexdigest()

    # A language filter is applied only when a base code can be derived.
    has_src = get_base_lang_code(source_lang) if source_lang else None
    has_tgt = get_base_lang_code(target_lang) if target_lang else None

    # Optional restriction to specific TMs, shared by both directions.
    if tm_ids:
        marks = ','.join('?' * len(tm_ids))
        tm_sql, tm_args = f" AND tm_id IN ({marks})", list(tm_ids)
    else:
        tm_sql, tm_args = "", []

    # ---- forward direction: our source sits in the TM's source column ----
    sql = """
        SELECT * FROM translation_units
        WHERE source_hash = ? AND source_text = ?
    """ + tm_sql
    args = [digest, source] + tm_args

    if has_src:
        clause, values = language_filter("source_lang", source_lang)
        sql += f" AND {clause}"
        args += values

    if has_tgt:
        clause, values = language_filter("target_lang", target_lang)
        sql += f" AND {clause}"
        args += values

    forward = fetch_best(sql, args)
    if forward:
        return dict(forward)

    # ---- reverse direction: our source sits in the TM's target column ----
    if not (bidirectional and has_src and has_tgt):
        return None

    sql = """
        SELECT * FROM translation_units
        WHERE target_text = ?
    """ + tm_sql
    args = [source] + tm_args

    # Swapped on purpose: the TM's source_lang must match our target_lang
    # and the TM's target_lang must match our source_lang.
    src_clause, src_values = language_filter("source_lang", target_lang)
    tgt_clause, tgt_values = language_filter("target_lang", source_lang)
    sql += f" AND {src_clause} AND {tgt_clause}"
    args += src_values + tgt_values

    backward = fetch_best(sql, args)
    if not backward:
        return None

    # Present the reverse hit from the caller's point of view.
    result = dict(backward)
    result['source_text'], result['target_text'] = result['target_text'], result['source_text']
    result['source_lang'], result['target_lang'] = result['target_lang'], result['source_lang']
    result['reverse_match'] = True
    return result
818
+
819
def calculate_similarity(self, text1: str, text2: str) -> float:
    """
    Return how similar two texts are, ignoring markup tags and letter case.

    Anything of the form <...> is removed from both texts and the remainder
    is lower-cased before difflib.SequenceMatcher compares them.

    Returns: Similarity score from 0.0 to 1.0
    """
    import re

    tag_pattern = re.compile(r'<[^>]+>')
    # Normalise both sides the same way: drop tags, then fold case.
    left, right = (tag_pattern.sub('', text).lower() for text in (text1, text2))

    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
831
+
832
def search_fuzzy_matches(self, source: str, tm_ids: List[str] = None,
                         threshold: float = 0.75, max_results: int = 5,
                         source_lang: str = None, target_lang: str = None,
                         bidirectional: bool = True) -> List[Dict]:
    """
    Search for fuzzy matches: FTS5 candidate retrieval, then SequenceMatcher scoring.

    Args:
        source: Text to find matches for (tags are stripped for term extraction)
        tm_ids: Restrict the search to these TM IDs (None/empty = all)
        threshold: Minimum similarity (0.0-1.0) a candidate must reach
        max_results: Maximum number of matches returned
        source_lang: Our source language (None = no source-language filter)
        target_lang: Our target language (None = no target-language filter)
        bidirectional: If True and both languages are given, also search the
            reverse direction (nl→en AND en→nl)

    Returns: List of match dicts (row columns plus 'similarity', 'match_pct',
        and 'reverse_match' for reversed hits), best similarity first.
        An empty list is returned when no usable search terms exist or the
        forward query fails.
    """
    import re
    from modules.tmx_generator import get_base_lang_code, get_lang_match_variants

    # Strip HTML/XML tags from source for clean text search
    text_without_tags = re.sub(r'<[^>]+>', '', source)

    # Replace FTS5 special characters with spaces and keep words of 3+ chars
    clean_text = re.sub(r'[^\w\s]', ' ', text_without_tags)
    search_terms_clean = [term for term in clean_text.split() if len(term) > 2]

    # Also get search terms from original source (in case TM was indexed with tags)
    clean_text_with_tags = re.sub(r'[^\w\s]', ' ', source)
    search_terms_with_tags = [term for term in clean_text_with_tags.split() if len(term) > 2]

    # Combine both sets of search terms (deduplicated, order preserved)
    all_search_terms = list(dict.fromkeys(search_terms_clean + search_terms_with_tags))

    # Longer words are usually more discriminating, so prefer them and cap the
    # query at the 20 longest terms to keep it manageable
    all_search_terms.sort(key=len, reverse=True)
    search_terms_for_query = all_search_terms[:20]

    print(f"[DEBUG] search_fuzzy_matches: source='{source[:50]}...', {len(all_search_terms)} terms")

    if not search_terms_for_query:
        # If no valid terms, return empty results
        print(f"[DEBUG] search_fuzzy_matches: No valid search terms, returning empty")
        return []

    # Quote each term to prevent FTS5 syntax errors
    fts_query = ' OR '.join(f'"{term}"' for term in search_terms_for_query)
    print(f"[DEBUG] search_fuzzy_matches: FTS query terms = {search_terms_for_query[:10]}...")

    # Get base language codes; they only decide whether a filter is applied
    src_base = get_base_lang_code(source_lang) if source_lang else None
    tgt_base = get_base_lang_code(target_lang) if target_lang else None

    base_query = """
        SELECT tu.*,
               bm25(translation_units_fts) as relevance
        FROM translation_units tu
        JOIN translation_units_fts ON tu.id = translation_units_fts.rowid
        WHERE translation_units_fts MATCH ?
    """

    def lang_clause(column, lang):
        # Flexible language matching: exact variant OR variant with a region suffix
        conditions = []
        values = []
        for variant in get_lang_match_variants(lang):
            conditions.append(f"{column} = ?")
            values.append(variant)
            conditions.append(f"{column} LIKE ?")
            values.append(f"{variant}-%")
        return f" AND ({' OR '.join(conditions)})", values

    def build_query(lang_filters, limit):
        # Assemble the candidate query for one search direction
        sql = base_query
        bound = [fts_query]
        if tm_ids:
            placeholders = ','.join('?' * len(tm_ids))
            sql += f" AND tu.tm_id IN ({placeholders})"
            bound.extend(tm_ids)
        for column, lang in lang_filters:
            clause, values = lang_clause(column, lang)
            sql += clause
            bound.extend(values)
        # FTS5 bm25() returns numerically LOWER values for better matches,
        # so ascending order keeps the most relevant candidates under LIMIT
        sql += " ORDER BY relevance ASC LIMIT ?"
        bound.append(int(limit))
        return sql, bound

    # Forward direction: TM source/target languages match ours
    forward_filters = []
    if src_base:
        forward_filters.append(("tu.source_lang", source_lang))
    if tgt_base:
        forward_filters.append(("tu.target_lang", target_lang))

    # Fetch many more candidates than needed; final ranking is by similarity
    candidate_limit = max(500, max_results * 50)
    query, params = build_query(forward_filters, candidate_limit)

    print(f"[DEBUG] search_fuzzy_matches: Executing query (limit={candidate_limit})...")

    try:
        self.cursor.execute(query, params)
        all_rows = self.cursor.fetchall()
    except Exception as e:
        print(f"[DEBUG] search_fuzzy_matches: SQL ERROR: {e}")
        return []

    results = []

    for row in all_rows:
        match_dict = dict(row)
        # Calculate actual similarity using SequenceMatcher
        similarity = self.calculate_similarity(source, match_dict['source_text'])

        # Only include matches above threshold
        if similarity >= threshold:
            match_dict['similarity'] = similarity
            match_dict['match_pct'] = int(similarity * 100)
            results.append(match_dict)

    print(f"[DEBUG] search_fuzzy_matches: After threshold filter ({threshold}): {len(results)} matches")

    # If bidirectional, also search reverse direction
    if bidirectional and src_base and tgt_base:
        # Reversed filters: TM target_lang = our source_lang, TM source_lang = our target_lang
        reverse_filters = [("tu.target_lang", source_lang), ("tu.source_lang", target_lang)]
        query, params = build_query(reverse_filters, max_results * 5)

        try:
            self.cursor.execute(query, params)
            reverse_rows = self.cursor.fetchall()
        except Exception as e:
            # Keep the forward matches instead of failing the whole search
            print(f"[DEBUG] search_fuzzy_matches: SQL ERROR (reverse): {e}")
            reverse_rows = []

        for row in reverse_rows:
            match_dict = dict(row)
            # Calculate similarity against target_text (since we're reversing)
            similarity = self.calculate_similarity(source, match_dict['target_text'])

            # Only include matches above threshold
            if similarity >= threshold:
                # Swap source/target for reverse match
                match_dict['source_text'], match_dict['target_text'] = match_dict['target_text'], match_dict['source_text']
                match_dict['source_lang'], match_dict['target_lang'] = match_dict['target_lang'], match_dict['source_lang']
                match_dict['similarity'] = similarity
                match_dict['match_pct'] = int(similarity * 100)
                match_dict['reverse_match'] = True
                results.append(match_dict)

    # Sort by similarity (highest first) and limit results
    results.sort(key=lambda x: x['similarity'], reverse=True)
    return results[:max_results]
1014
+
1015
def search_all(self, source: str, tm_ids: List[str] = None, enabled_only: bool = True,
               threshold: float = 0.75, max_results: int = 10) -> List[Dict]:
    """
    Look up a source text in the TMs: exact match first, fuzzy matches otherwise.

    Args:
        source: Source text to search for
        tm_ids: List of TM IDs to search (None = all)
        enabled_only: Currently ignored (not read by this method)
        threshold: Minimum similarity threshold (0.0-1.0) for fuzzy matches
        max_results: Maximum number of fuzzy results

    Returns:
        List of dicts with source, target, match_pct, tm_name, tm_id.
        An exact hit yields a single entry with match_pct 100.
    """
    def to_hit(unit, pct):
        # tm_name is a display form of the TM id: underscores -> spaces, title-cased
        return {
            'source': unit['source_text'],
            'target': unit['target_text'],
            'match_pct': pct,
            'tm_name': unit['tm_id'].replace('_', ' ').title(),
            'tm_id': unit['tm_id']
        }

    exact_hit = self.get_exact_match(source, tm_ids=tm_ids)
    if exact_hit:
        return [to_hit(exact_hit, 100)]

    # No exact match, fall back to fuzzy search
    candidates = self.search_fuzzy_matches(
        source,
        tm_ids=tm_ids,
        threshold=threshold,
        max_results=max_results
    )
    return [to_hit(candidate, candidate['match_pct']) for candidate in candidates]
1060
+
1061
def get_tm_entries(self, tm_id: str, limit: int = None) -> List[Dict]:
    """
    Get entries from a specific TM, ordered by id.

    Args:
        tm_id: TM whose translation units are returned
        limit: Maximum number of rows; None or 0 means no limit

    Returns:
        List of row dicts from translation_units

    Raises:
        ValueError/TypeError: if limit cannot be converted to an integer
    """
    query = "SELECT * FROM translation_units WHERE tm_id = ? ORDER BY id"
    params = [tm_id]

    if limit:
        # Bind the limit instead of formatting it into the SQL text, so a
        # non-integer value can never alter the statement
        query += " LIMIT ?"
        params.append(int(limit))

    self.cursor.execute(query, params)
    return [dict(row) for row in self.cursor.fetchall()]
1071
+
1072
def get_tm_count(self, tm_id: str = None) -> int:
    """
    Count translation units.

    A truthy tm_id restricts the count to that TM; otherwise (None or empty
    string) every row in translation_units is counted.
    """
    if not tm_id:
        self.cursor.execute("SELECT COUNT(*) FROM translation_units")
    else:
        self.cursor.execute("""
            SELECT COUNT(*) FROM translation_units WHERE tm_id = ?
        """, (tm_id,))

    count_row = self.cursor.fetchone()
    return count_row[0]
1082
+
1083
def clear_tm(self, tm_id: str):
    """Delete every translation unit belonging to tm_id and commit."""
    delete_sql = """
        DELETE FROM translation_units WHERE tm_id = ?
    """
    self.cursor.execute(delete_sql, (tm_id,))
    self.connection.commit()
1089
+
1090
def delete_entry(self, tm_id: str, source: str, target: str):
    """
    Delete a specific entry from a TM.

    The first row matching (tm_id, source, target) is removed from the
    translation_units table and from the translation_units_fts index.
    Nothing happens when no such row exists.

    Args:
        tm_id: TM the entry belongs to
        source: Exact source text of the entry
        target: Exact target text of the entry
    """
    # Get the ID first
    self.cursor.execute("""
        SELECT id FROM translation_units
        WHERE tm_id = ? AND source_text = ? AND target_text = ?
    """, (tm_id, source, target))

    result = self.cursor.fetchone()
    if not result:
        return  # Entry not found

    entry_id = result['id']

    # Delete from the FTS5 index first. The index table used by the search
    # and rebuild methods is translation_units_fts (keyed by rowid = id).
    try:
        self.cursor.execute("""
            DELETE FROM translation_units_fts WHERE rowid = ?
        """, (entry_id,))
    except Exception:
        pass  # Best effort: FTS5 table might not exist

    # Delete from main table
    self.cursor.execute("""
        DELETE FROM translation_units
        WHERE id = ?
    """, (entry_id,))

    self.connection.commit()
1119
+
1120
def concordance_search(self, query: str, tm_ids: List[str] = None, direction: str = 'both',
                       source_lang = None, target_lang = None) -> List[Dict]:
    """
    Concordance search: find translation units containing the given text.

    The query is first run as a quoted phrase against the translation_units_fts
    index. If that attempt raises for any reason, the search is repeated with
    LIKE '%query%' on the translation_units table.

    Args:
        query: Text to search for
        tm_ids: List of TM IDs to search (None = all)
        direction: 'source' = source text only, 'target' = target text only,
            anything else = both
        source_lang: Source-language filter - a string OR a list of variants (None = any)
        target_lang: Target-language filter - a string OR a list of variants (None = any)

    Returns:
        Up to 100 row dicts, most recently modified first
    """
    def as_list(value):
        # Lists pass through untouched; a truthy scalar becomes a one-item list
        if isinstance(value, list):
            return value
        return [value] if value else None

    source_langs = as_list(source_lang)
    target_langs = as_list(target_lang)

    def with_in_filters(sql, bound, prefix):
        # Append the optional tm_id / language IN (...) filters in a fixed order
        for column, allowed in (("tm_id", tm_ids),
                                ("source_lang", source_langs),
                                ("target_lang", target_langs)):
            if allowed:
                placeholders = ','.join('?' * len(allowed))
                sql += f" AND {prefix}{column} IN ({placeholders})"
                bound.extend(allowed)
        return sql

    # Double embedded quotes and wrap the whole text in quotes for phrase search
    fts_query = '"' + query.replace('"', '""') + '"'

    try:
        # Pick the MATCH target: one column, or the whole index for 'both'
        if direction == 'source':
            match_target = "fts.source_text"
        elif direction == 'target':
            match_target = "fts.target_text"
        else:
            match_target = "translation_units_fts"

        fts_sql = f"""
            SELECT tu.* FROM translation_units tu
            JOIN translation_units_fts fts ON tu.id = fts.rowid
            WHERE {match_target} MATCH ?
        """
        params = [fts_query]
        fts_sql = with_in_filters(fts_sql, params, "tu.")
        fts_sql += " ORDER BY tu.modified_date DESC LIMIT 100"

        self.cursor.execute(fts_sql, params)
        return [dict(row) for row in self.cursor.fetchall()]

    except Exception as e:
        # Fallback to LIKE query if FTS5 fails (e.g., index not built)
        print(f"[TM] FTS5 search failed, falling back to LIKE: {e}")
        search_query = f"%{query}%"

        if direction == 'source':
            where_clause, params = "source_text LIKE ?", [search_query]
        elif direction == 'target':
            where_clause, params = "target_text LIKE ?", [search_query]
        else:
            where_clause = "(source_text LIKE ? OR target_text LIKE ?)"
            params = [search_query, search_query]

        sql = f"""
            SELECT * FROM translation_units
            WHERE {where_clause}
        """
        sql = with_in_filters(sql, params, "")
        sql += " ORDER BY modified_date DESC LIMIT 100"

        self.cursor.execute(sql, params)
        return [dict(row) for row in self.cursor.fetchall()]
1232
+
1233
def rebuild_fts_index(self) -> int:
    """
    Rebuild the FTS5 full-text search index from scratch.
    Use this after importing TMs or if FTS search isn't returning results.

    All rows are removed from translation_units_fts and the index is
    repopulated from translation_units (rowid = id), then committed.

    Returns:
        Number of entries indexed, or 0 if any step failed (the error is
        printed, not raised)
    """
    try:
        # Clear existing FTS data
        self.cursor.execute("DELETE FROM translation_units_fts")

        # Repopulate from translation_units table
        self.cursor.execute("""
            INSERT INTO translation_units_fts(rowid, source_text, target_text)
            SELECT id, source_text, target_text FROM translation_units
        """)

        # Commit through self.connection, the attribute every other method
        # uses; a wrong attribute name here would be swallowed by the except
        # below and leave the rebuild uncommitted
        self.connection.commit()

        # Get count
        self.cursor.execute("SELECT COUNT(*) FROM translation_units_fts")
        count = self.cursor.fetchone()[0]
        print(f"[TM] FTS5 index rebuilt with {count:,} entries")
        return count
    except Exception as e:
        print(f"[TM] Error rebuilding FTS index: {e}")
        return 0
1261
+
1262
def check_fts_index(self) -> Dict:
    """
    Compare the row counts of translation_units and translation_units_fts.

    Returns:
        Dict with 'main_count', 'fts_count' and 'in_sync' (True when the two
        counts are equal). If a query fails, the counts are 0, 'in_sync' is
        False and an 'error' key holds the exception text.
    """
    try:
        counts = []
        for table_name in ("translation_units", "translation_units_fts"):
            self.cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            counts.append(self.cursor.fetchone()[0])

        main_count, fts_count = counts
        report = {'main_count': main_count, 'fts_count': fts_count}
        report['in_sync'] = main_count == fts_count
        return report
    except Exception as e:
        return {'main_count': 0, 'fts_count': 0, 'in_sync': False, 'error': str(e)}
1283
+
1284
+ # ============================================
1285
+ # TERMBASE METHODS (add_termbase_term is still a Phase 3 placeholder)
1286
+ # ============================================
1287
+
1288
def add_termbase_term(self, source_term: str, target_term: str,
                      source_lang: str, target_lang: str,
                      termbase_id: str = 'main', **kwargs) -> int:
    """
    Add term to termbase (Phase 3).

    Not implemented yet: the arguments are ignored and None is returned,
    despite the declared int return type.
    """
    # TODO: Implement in Phase 3
    return None
1294
+
1295
def search_termbases(self, search_term: str, source_lang: str = None,
                     target_lang: str = None, project_id: str = None,
                     min_length: int = 0) -> List[Dict]:
    """
    Search termbases for matching source terms

    A term matches (case-insensitively) when its source term equals the
    search term or contains it as a space-delimited word at the start, end
    or middle. Only terms from termbases that are active for the project or
    flagged as project termbases are returned.

    Args:
        search_term: Source term to search for
        source_lang: Filter by source language (optional)
        target_lang: Filter by target language (optional)
        project_id: Filter by project (optional)
        min_length: Minimum source-term length to return (0 = no minimum)

    Returns:
        List of termbase hit dicts ordered by termbase ranking, then source
        term. Each hit with a truthy id also carries 'target_synonyms': the
        non-forbidden target synonyms in display order ([] if the synonym
        lookup fails).
    """
    # Build query with filters - include termbase name and ranking via JOIN
    # Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in termbases,
    # hence the CAST in the join condition
    # The join with termbase_activation supplies the per-project priority
    query = """
        SELECT
            t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
            t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
            t.notes, t.project, t.client,
            tb.name as termbase_name,
            tb.source_lang as termbase_source_lang,
            tb.target_lang as termbase_target_lang,
            tb.is_project_termbase,
            COALESCE(ta.priority, tb.ranking) as ranking
        FROM termbase_terms t
        LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
        LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
        WHERE (
            LOWER(t.source_term) = LOWER(?) OR
            LOWER(t.source_term) LIKE LOWER(?) OR
            LOWER(t.source_term) LIKE LOWER(?) OR
            LOWER(t.source_term) LIKE LOWER(?)
        )
        AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
    """
    # Exact match, word at start, word at end, word in middle
    # IMPORTANT: project_id must be first param for the LEFT JOIN ta.project_id = ? above
    params = [
        project_id if project_id else 0,  # Use 0 if no project (won't match any activation records)
        search_term,
        f"{search_term} %",
        f"% {search_term}",
        f"% {search_term} %"
    ]

    # Language filters - if term has no language, use termbase language for filtering
    if source_lang:
        query += """ AND (
            t.source_lang = ? OR
            (t.source_lang IS NULL AND tb.source_lang = ?) OR
            (t.source_lang IS NULL AND tb.source_lang IS NULL)
        )"""
        params.extend([source_lang, source_lang])

    if target_lang:
        query += """ AND (
            t.target_lang = ? OR
            (t.target_lang IS NULL AND tb.target_lang = ?) OR
            (t.target_lang IS NULL AND tb.target_lang IS NULL)
        )"""
        params.extend([target_lang, target_lang])

    # Project filter: match project-specific terms OR global terms (project_id IS NULL)
    if project_id:
        query += " AND (t.project_id = ? OR t.project_id IS NULL)"
        params.append(project_id)

    if min_length > 0:
        # Bound as a parameter rather than formatted into the SQL text
        query += " AND LENGTH(t.source_term) >= ?"
        params.append(min_length)

    # Sort by ranking (lower number = higher priority); NULL ranking sorts first (-1)
    # NOTE(review): this orders by tb.ranking, not the selected
    # COALESCE(ta.priority, tb.ranking) - confirm that is intended
    query += " ORDER BY COALESCE(tb.ranking, -1) ASC, t.source_term ASC"

    self.cursor.execute(query, params)
    results = []
    # fetchall() materialises the rows, so the cursor can be reused below
    for row in self.cursor.fetchall():
        result_dict = dict(row)
        # SQLite stores booleans as 0/1, explicitly convert to Python bool
        if 'is_project_termbase' in result_dict:
            result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])

        # Fetch target synonyms for this term and include them in the result
        term_id = result_dict.get('id')
        if term_id:
            try:
                self.cursor.execute("""
                    SELECT synonym_text, forbidden FROM termbase_synonyms
                    WHERE term_id = ? AND language = 'target'
                    ORDER BY display_order ASC
                """, (term_id,))
                # Only include non-forbidden synonyms
                result_dict['target_synonyms'] = [
                    syn_row[0] for syn_row in self.cursor.fetchall() if not bool(syn_row[1])
                ]
            except Exception:
                # Best effort: a failing synonym lookup must not drop the hit
                result_dict['target_synonyms'] = []

        results.append(result_dict)
    return results
1406
+
1407
+ # ============================================
1408
+ # UTILITY METHODS
1409
+ # ============================================
1410
+
1411
def get_all_tms(self, enabled_only: bool = True) -> List[Dict]:
    """
    Get list of all translation memories

    TMs are derived from the distinct tm_id values in translation_units;
    ids and entry counts come from a single grouped query.

    Args:
        enabled_only: Currently ignored (every TM is reported as enabled)

    Returns:
        List of TM info dictionaries with tm_id, name, entry_count, enabled,
        read_only, ordered by tm_id
    """
    # One GROUP BY query instead of one COUNT query per TM
    self.cursor.execute("""
        SELECT tm_id, COUNT(*) FROM translation_units
        GROUP BY tm_id ORDER BY tm_id
    """)

    return [
        {
            'tm_id': tm_id,
            # Display name: underscores -> spaces, title-cased
            'name': tm_id.replace('_', ' ').title(),
            'entry_count': entry_count,
            'enabled': True,  # For now, all TMs are enabled
            'read_only': False
        }
        for tm_id, entry_count in self.cursor.fetchall()
    ]
1439
+
1440
def get_tm_list(self, enabled_only: bool = True) -> List[Dict]:
    """Backward-compatible alias: forwards to get_all_tms with the same flag."""
    tm_infos = self.get_all_tms(enabled_only=enabled_only)
    return tm_infos
1443
+
1444
def get_entry_count(self, enabled_only: bool = True) -> int:
    """
    Return the total number of translation units.

    Args:
        enabled_only: Currently ignored (not read by this method)

    Returns:
        Whatever get_tm_count() reports when called without a TM id
    """
    total_entries = self.get_tm_count()
    return total_entries
1455
+
1456
def vacuum(self):
    """
    Optimize database (VACUUM).

    Pending changes are committed first, because SQLite refuses to VACUUM
    while a transaction is open on the connection.
    """
    # Close any implicit transaction left open by earlier writes
    self.connection.commit()
    self.cursor.execute("VACUUM")
    self.connection.commit()
1460
+
1461
+ # ============================================
1462
+ # TMX EDITOR METHODS (database-backed TMX files)
1463
+ # ============================================
1464
+
1465
def tmx_store_file(self, file_path: str, file_name: str, original_file_path: str,
                   load_mode: str, file_size: int, header_data: dict,
                   tu_count: int, languages: List[str]) -> int:
    """
    Insert or refresh the tmx_files row for a TMX file, keyed by file_path.

    header_data and languages are stored as JSON text. When a row with the
    same file_path already exists it is updated (and its last_accessed set to
    the current timestamp); otherwise a new row is inserted. Both paths commit.

    Returns:
        tmx_file_id (int) of the updated or newly inserted row
    """
    languages_json = json.dumps(languages)
    header_json = json.dumps(header_data)

    # Look for a previous record of the same file
    self.cursor.execute("SELECT id FROM tmx_files WHERE file_path = ?", (file_path,))
    known = self.cursor.fetchone()

    if not known:
        # First time this path is stored
        self.cursor.execute("""
            INSERT INTO tmx_files
            (file_path, file_name, original_file_path, load_mode, file_size,
             header_data, tu_count, languages)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, (file_path, file_name, original_file_path, load_mode, file_size,
              header_json, tu_count, languages_json))
        self.connection.commit()
        return self.cursor.lastrowid

    # Refresh the metadata of the existing record
    file_id = known['id']
    self.cursor.execute("""
        UPDATE tmx_files
        SET file_name = ?, original_file_path = ?, load_mode = ?, file_size = ?,
            header_data = ?, tu_count = ?, languages = ?, last_accessed = CURRENT_TIMESTAMP
        WHERE id = ?
    """, (file_name, original_file_path, load_mode, file_size, header_json,
          tu_count, languages_json, file_id))
    self.connection.commit()
    return known['id']
1503
+
1504
def tmx_store_translation_unit(self, tmx_file_id: int, tu_id: int,
                               creation_date: str = None, creation_id: str = None,
                               change_date: str = None, change_id: str = None,
                               srclang: str = None, custom_attributes: dict = None,
                               comments: List[str] = None, commit: bool = True) -> int:
    """
    Write one translation unit row (INSERT OR REPLACE) into tmx_translation_units.

    custom_attributes and comments are serialised to JSON; empty or missing
    values are stored as NULL.

    Args:
        commit: If False, don't commit (for batch operations)

    Returns:
        The cursor's lastrowid after the write - the internal TU ID used
        when storing segments
    """
    comments_json = None
    custom_attrs_json = None
    if custom_attributes:
        custom_attrs_json = json.dumps(custom_attributes)
    if comments:
        comments_json = json.dumps(comments)

    row_values = (tmx_file_id, tu_id, creation_date, creation_id, change_date,
                  change_id, srclang, custom_attrs_json, comments_json)
    self.cursor.execute("""
        INSERT OR REPLACE INTO tmx_translation_units
        (tmx_file_id, tu_id, creation_date, creation_id, change_date, change_id,
         srclang, custom_attributes, comments)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, row_values)

    if commit:
        self.connection.commit()
    return self.cursor.lastrowid
1531
+
1532
def tmx_store_segment(self, tu_db_id: int, lang: str, text: str,
                      creation_date: str = None, creation_id: str = None,
                      change_date: str = None, change_id: str = None,
                      commit: bool = True):
    """
    Write one language variant of a translation unit (INSERT OR REPLACE) into tmx_segments.

    Args:
        tu_db_id: Stored in the segment's tu_id column (internal TU ID)
        commit: If False, don't commit (for batch operations)
    """
    segment_values = (tu_db_id, lang, text, creation_date, creation_id,
                      change_date, change_id)
    self.cursor.execute("""
        INSERT OR REPLACE INTO tmx_segments
        (tu_id, lang, text, creation_date, creation_id, change_date, change_id)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """, segment_values)

    if not commit:
        return
    self.connection.commit()
1549
+
1550
def tmx_get_file_id(self, file_path: str) -> Optional[int]:
    """Return the id of the tmx_files row with this file_path, or None if there is none."""
    self.cursor.execute("SELECT id FROM tmx_files WHERE file_path = ?", (file_path,))
    found = self.cursor.fetchone()
    if not found:
        return None
    return found['id']
1555
+
1556
def tmx_get_translation_units(self, tmx_file_id: int, offset: int = 0,
                              limit: int = 50, src_lang: str = None,
                              tgt_lang: str = None, src_filter: str = None,
                              tgt_filter: str = None, ignore_case: bool = True) -> List[Dict]:
    """
    Fetch one page of translation units of a TMX file, optionally filtered by text.

    Filtering only applies when src_filter or tgt_filter is given: the unit
    must then have a segment (restricted to src_lang if set) containing
    src_filter, and - when tgt_filter is given - a segment (restricted to
    tgt_lang if set) containing tgt_filter.

    Args:
        offset / limit: Pagination window, applied after ordering by tu_id
        ignore_case: Wrap both sides of the LIKE comparison in LOWER()
            NOTE(review): the ignore_case=False branch still uses plain LIKE;
            confirm it is case-sensitive under this database's settings.

    Returns:
        List of dicts with TU columns, decoded 'custom_attributes'/'comments'
        and a 'segments' dict keyed by language
    """
    # One fragment per comparison style; {alias} is seg1 (source) or seg2 (target)
    if ignore_case:
        like_template = " AND LOWER({alias}.text) LIKE LOWER(?)"
    else:
        like_template = " AND {alias}.text LIKE ?"

    sql_parts = ["""
        SELECT tu.id as tu_db_id, tu.tu_id, tu.creation_date, tu.creation_id,
               tu.change_date, tu.change_id, tu.srclang, tu.custom_attributes, tu.comments
        FROM tmx_translation_units tu
        WHERE tu.tmx_file_id = ?
    """]
    bound = [tmx_file_id]

    if src_filter or tgt_filter:
        sql_parts.append("""
            AND EXISTS (
                SELECT 1 FROM tmx_segments seg1
                WHERE seg1.tu_id = tu.id
        """)
        if src_lang:
            sql_parts.append(" AND seg1.lang = ?")
            bound.append(src_lang)
        if src_filter:
            sql_parts.append(like_template.format(alias="seg1"))
            bound.append(f"%{src_filter}%")

        if tgt_filter:
            # Target condition is nested inside the source EXISTS
            sql_parts.append("""
                AND EXISTS (
                    SELECT 1 FROM tmx_segments seg2
                    WHERE seg2.tu_id = tu.id
            """)
            if tgt_lang:
                sql_parts.append(" AND seg2.lang = ?")
                bound.append(tgt_lang)
            sql_parts.append(like_template.format(alias="seg2"))
            bound.append(f"%{tgt_filter}%")
            sql_parts.append(")")

        sql_parts.append(")")

    sql_parts.append(" ORDER BY tu.tu_id LIMIT ? OFFSET ?")
    bound += [limit, offset]

    self.cursor.execute("".join(sql_parts), bound)
    tu_rows = self.cursor.fetchall()

    # Attach segments and decode JSON columns for each TU
    units = []
    for tu_row in tu_rows:
        unit = dict(tu_row)
        self.cursor.execute("""
            SELECT lang, text, creation_date, creation_id, change_date, change_id
            FROM tmx_segments
            WHERE tu_id = ?
        """, (unit['tu_db_id'],))

        by_lang = {}
        for segment in map(dict, self.cursor.fetchall()):
            by_lang[segment['lang']] = segment
        unit['segments'] = by_lang

        for json_column in ('custom_attributes', 'comments'):
            if unit[json_column]:
                unit[json_column] = json.loads(unit[json_column])

        units.append(unit)

    return units
1642
+
1643
def tmx_count_translation_units(self, tmx_file_id: int, src_lang: str = None,
                                tgt_lang: str = None, src_filter: str = None,
                                tgt_filter: str = None, ignore_case: bool = True) -> int:
    """
    Count the translation units of a TMX file that satisfy the text filters.

    The filter clauses are built the same way as in tmx_get_translation_units:
    they only apply when src_filter or tgt_filter is given, and the target
    condition is nested inside the source EXISTS sub-query.
    """
    def text_condition(alias):
        # LOWER() on both sides when ignore_case is set, plain LIKE otherwise
        if ignore_case:
            return f" AND LOWER({alias}.text) LIKE LOWER(?)"
        return f" AND {alias}.text LIKE ?"

    count_sql = """
        SELECT COUNT(DISTINCT tu.id)
        FROM tmx_translation_units tu
        WHERE tu.tmx_file_id = ?
    """
    values = [tmx_file_id]

    filtering = src_filter or tgt_filter
    if filtering:
        count_sql += """
            AND EXISTS (
                SELECT 1 FROM tmx_segments seg1
                WHERE seg1.tu_id = tu.id
        """
        if src_lang:
            count_sql += " AND seg1.lang = ?"
            values.append(src_lang)
        if src_filter:
            count_sql += text_condition("seg1")
            values.append(f"%{src_filter}%")

        if tgt_filter:
            count_sql += """
                AND EXISTS (
                    SELECT 1 FROM tmx_segments seg2
                    WHERE seg2.tu_id = tu.id
            """
            if tgt_lang:
                count_sql += " AND seg2.lang = ?"
                values.append(tgt_lang)
            count_sql += text_condition("seg2")
            values.append(f"%{tgt_filter}%")
            count_sql += ")"

        count_sql += ")"

    self.cursor.execute(count_sql, values)
    (matching,) = tuple(self.cursor.fetchone())[:1]
    return matching
1693
+
1694
def tmx_update_segment(self, tmx_file_id: int, tu_id: int, lang: str, text: str) -> bool:
    """Update the text of one language segment of a translation unit.

    The translation unit is looked up by ``(tmx_file_id, tu_id)`` to obtain its
    internal row id. On success the segment text and change date, the unit's
    change date and the file's ``last_modified`` are updated and committed.

    Args:
        tmx_file_id: Row id of the TMX file in ``tmx_files``.
        tu_id: Value of the ``tu_id`` column of the translation unit (not the
            internal primary key).
        lang: Language code of the segment to update.
        text: New segment text.

    Returns:
        ``True`` when a segment was updated; ``False`` when the translation
        unit does not exist or has no segment for ``lang``.
    """
    # Imported locally because only ``datetime`` is known to be in scope here.
    from datetime import timezone

    # Get internal TU ID
    self.cursor.execute("""
        SELECT tu.id FROM tmx_translation_units tu
        WHERE tu.tmx_file_id = ? AND tu.tu_id = ?
    """, (tmx_file_id, tu_id))
    tu_row = self.cursor.fetchone()
    if not tu_row:
        return False

    tu_db_id = tu_row['id']
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to the same timestamp string.
    change_date = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Update segment
    self.cursor.execute("""
        UPDATE tmx_segments
        SET text = ?, change_date = ?
        WHERE tu_id = ? AND lang = ?
    """, (text, change_date, tu_db_id, lang))

    if self.cursor.rowcount == 0:
        # No segment exists for this language: nothing was changed, so do not
        # bump the TU/file timestamps or report success. The commit only ends
        # the transaction implicitly opened by the UPDATE statement.
        self.connection.commit()
        return False

    # Update TU change date
    self.cursor.execute("""
        UPDATE tmx_translation_units
        SET change_date = ?
        WHERE id = ?
    """, (change_date, tu_db_id))

    # Update file last_modified
    self.cursor.execute("""
        UPDATE tmx_files
        SET last_modified = CURRENT_TIMESTAMP
        WHERE id = ?
    """, (tmx_file_id,))

    self.connection.commit()
    return True
1731
+
1732
def tmx_delete_file(self, tmx_file_id: int):
    """Delete the ``tmx_files`` row with the given id and commit.

    Only the file row is deleted here; removal of its translation units and
    segments is left to ON DELETE CASCADE.
    NOTE(review): that relies on the schema declaring the cascade and on
    foreign-key enforcement being enabled for this connection - confirm.
    """
    delete_sql = "DELETE FROM tmx_files WHERE id = ?"
    bind_values = (tmx_file_id,)
    self.cursor.execute(delete_sql, bind_values)
    self.connection.commit()
1736
+
1737
def tmx_get_file_info(self, tmx_file_id: int) -> Optional[Dict]:
    """Get the metadata row of a TMX file as a dictionary.

    Args:
        tmx_file_id: Row id of the file in ``tmx_files``.

    Returns:
        ``None`` when no such row exists. Otherwise a dict of the selected
        columns in which ``header_data`` and ``languages`` are decoded from
        JSON; a column that is NULL or empty is returned unchanged instead of
        being passed to ``json.loads``.
    """
    self.cursor.execute("""
        SELECT id, file_path, file_name, original_file_path, load_mode,
               file_size, header_data, tu_count, languages,
               created_date, last_accessed, last_modified
        FROM tmx_files
        WHERE id = ?
    """, (tmx_file_id,))
    row = self.cursor.fetchone()
    if not row:
        return None

    info = dict(row)
    # Decode the JSON columns only when they hold a value: json.loads(None)
    # raises TypeError. This mirrors the guard applied to the JSON columns of
    # translation units elsewhere in this class.
    for json_column in ('header_data', 'languages'):
        if info[json_column]:
            info[json_column] = json.loads(info[json_column])
    return info
1754
+
1755
def get_database_info(self) -> Dict:
    """Collect basic statistics about the database file.

    Returns:
        A dict with ``path`` (``self.db_path``), ``size_bytes`` (file size, or
        0 when the path does not exist), ``tm_entries`` (result of
        ``self.get_tm_count()``) and ``size_mb`` (``size_bytes`` converted to
        mebibytes, rounded to two decimals).
    """
    if os.path.exists(self.db_path):
        byte_count = os.path.getsize(self.db_path)
    else:
        byte_count = 0

    stats = {
        'path': self.db_path,
        'size_bytes': byte_count,
        'tm_entries': self.get_tm_count(),
    }

    # 1024 * 1024 bytes per MB
    bytes_per_mb = 1024 * 1024
    stats['size_mb'] = round(stats['size_bytes'] / bytes_per_mb, 2)

    return stats