supervertaler 1.9.163__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Supervertaler.py +48473 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1911 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +351 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1176 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.163.dist-info/METADATA +906 -0
- supervertaler-1.9.163.dist-info/RECORD +85 -0
- supervertaler-1.9.163.dist-info/WHEEL +5 -0
- supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.163.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1911 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Database Manager Module
|
|
3
|
+
|
|
4
|
+
SQLite database backend for Translation Memories, Glossaries, and related resources.
|
|
5
|
+
Replaces in-memory JSON-based storage with efficient database storage.
|
|
6
|
+
|
|
7
|
+
Schema includes:
|
|
8
|
+
- Translation units (TM entries)
|
|
9
|
+
- Termbase terms
|
|
10
|
+
- Non-translatables
|
|
11
|
+
- Segmentation rules
|
|
12
|
+
- Project metadata
|
|
13
|
+
- Resource file references
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import sqlite3
|
|
17
|
+
import os
|
|
18
|
+
import json
|
|
19
|
+
import hashlib
|
|
20
|
+
from datetime import datetime
|
|
21
|
+
from typing import List, Dict, Optional, Tuple
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from difflib import SequenceMatcher
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DatabaseManager:
|
|
27
|
+
"""Manages SQLite database for translation resources"""
|
|
28
|
+
|
|
29
|
+
def __init__(self, db_path: str = None, log_callback=None):
    """
    Initialize database manager

    Args:
        db_path: Path to SQLite database file (default: user_data/supervertaler.db)
        log_callback: Optional logging function
    """
    # Fall back to plain print() when no logger is supplied.
    self.log = log_callback if log_callback else print

    # The application normally supplies the real path (user_data folder);
    # until then, default to a database file in the working directory.
    self.db_path = db_path if db_path is not None else "supervertaler.db"

    # Populated by connect(); None while disconnected.
    self.connection = None
    self.cursor = None
|
|
48
|
+
|
|
49
|
+
def connect(self) -> bool:
    """Connect to database and create tables if needed.

    Opens (or creates) the SQLite file at self.db_path, ensures the
    schema exists, runs migrations, and repairs the FTS5 index if it
    has drifted out of sync with the main table.

    Returns:
        True on success, False if the connection could not be opened.
    """
    try:
        # Create directory if it doesn't exist (dirname is "" for a
        # bare filename, hence the "." fallback).
        os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)

        # Connect to database
        self.connection = sqlite3.connect(self.db_path)
        self.connection.row_factory = sqlite3.Row  # Access columns by name
        self.cursor = self.connection.cursor()

        # Enable foreign keys (off by default in SQLite; needed for the
        # ON DELETE CASCADE constraints in the schema)
        self.cursor.execute("PRAGMA foreign_keys = ON")

        # Create tables (idempotent — all DDL uses IF NOT EXISTS)
        self._create_tables()

        # Run database migrations (adds new columns/tables as needed).
        # Failures are logged but non-fatal so an older database still opens.
        try:
            from modules.database_migrations import check_and_migrate
            migration_success = check_and_migrate(self)
            if not migration_success:
                self.log("[WARNING] Database migration reported failure")
        except Exception as e:
            self.log(f"[WARNING] Database migration check failed: {e}")
            import traceback
            traceback.print_exc()

        # Auto-sync FTS5 index if out of sync (e.g. after bulk edits made
        # outside the triggers). Also best-effort: a failure here must not
        # block opening the database.
        try:
            fts_status = self.check_fts_index()
            if not fts_status.get('in_sync', True):
                self.log(f"[TM] FTS5 index out of sync ({fts_status.get('fts_count', 0)} vs {fts_status.get('main_count', 0)}), rebuilding...")
                self.rebuild_fts_index()
        except Exception as e:
            self.log(f"[WARNING] FTS5 index check failed: {e}")

        self.log(f"[OK] Database connected: {os.path.basename(self.db_path)}")
        return True

    except Exception as e:
        self.log(f"[ERROR] Database connection failed: {e}")
        return False
|
|
92
|
+
|
|
93
|
+
def _create_tables(self):
    """Create database schema.

    All DDL uses IF NOT EXISTS, so this is safe to run on every connect.
    The inline try/except ALTER TABLE statements act as lightweight
    in-place migrations for databases created by older app versions.
    """
    print("📊 Creating database tables...")

    # ============================================
    # TRANSLATION MEMORY TABLES
    # ============================================

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS translation_units (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_text TEXT NOT NULL,
            target_text TEXT NOT NULL,
            source_lang TEXT NOT NULL,
            target_lang TEXT NOT NULL,
            tm_id TEXT NOT NULL,
            project_id TEXT,

            -- Context for better matching
            context_before TEXT,
            context_after TEXT,

            -- Fast exact matching
            source_hash TEXT NOT NULL,

            -- Metadata
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            usage_count INTEGER DEFAULT 0,
            created_by TEXT,
            notes TEXT,

            -- Indexes
            UNIQUE(source_hash, target_text, tm_id)
        )
    """)

    # Indexes for translation_units
    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tu_source_hash
        ON translation_units(source_hash)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tu_tm_id
        ON translation_units(tm_id)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tu_project_id
        ON translation_units(project_id)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tu_langs
        ON translation_units(source_lang, target_lang)
    """)

    # Full-text search for fuzzy matching (external-content FTS5 table
    # mirroring translation_units; kept in sync by the triggers below)
    self.cursor.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS translation_units_fts
        USING fts5(
            source_text,
            target_text,
            content=translation_units,
            content_rowid=id
        )
    """)

    # Triggers to keep FTS index in sync
    self.cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS tu_fts_insert AFTER INSERT ON translation_units BEGIN
            INSERT INTO translation_units_fts(rowid, source_text, target_text)
            VALUES (new.id, new.source_text, new.target_text);
        END
    """)

    self.cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS tu_fts_delete AFTER DELETE ON translation_units BEGIN
            DELETE FROM translation_units_fts WHERE rowid = old.id;
        END
    """)

    self.cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS tu_fts_update AFTER UPDATE ON translation_units BEGIN
            DELETE FROM translation_units_fts WHERE rowid = old.id;
            INSERT INTO translation_units_fts(rowid, source_text, target_text)
            VALUES (new.id, new.source_text, new.target_text);
        END
    """)

    # ============================================
    # TRANSLATION MEMORY METADATA
    # ============================================

    # Translation Memories table - tracks individual TM names/metadata
    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS translation_memories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL UNIQUE,
            description TEXT,
            source_lang TEXT,
            target_lang TEXT,
            tm_id TEXT NOT NULL UNIQUE, -- The tm_id used in translation_units table
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            entry_count INTEGER DEFAULT 0, -- Cached count, updated on changes
            last_used TIMESTAMP,
            is_project_tm BOOLEAN DEFAULT 0, -- Whether this is the special project TM
            read_only BOOLEAN DEFAULT 1, -- Whether this TM should not be updated (default: read-only, Write unchecked)
            project_id INTEGER -- Which project this TM belongs to (NULL = global)
        )
    """)

    # TM activation (tracks which TMs are active for which projects)
    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS tm_activation (
            tm_id INTEGER NOT NULL,
            project_id INTEGER NOT NULL,
            is_active BOOLEAN DEFAULT 1,
            activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (tm_id, project_id),
            FOREIGN KEY (tm_id) REFERENCES translation_memories(id) ON DELETE CASCADE
        )
    """)

    # Index for fast tm_id lookups
    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tm_tm_id
        ON translation_memories(tm_id)
    """)

    # Migration: Add is_project_tm, read_only, and project_id columns if they don't exist.
    # Uses PRAGMA table_info to inspect the live schema, so it works on
    # databases created before these columns existed.
    try:
        self.cursor.execute("PRAGMA table_info(translation_memories)")
        columns = [row[1] for row in self.cursor.fetchall()]

        if 'is_project_tm' not in columns:
            self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN is_project_tm BOOLEAN DEFAULT 0")
            print("✓ Added is_project_tm column to translation_memories")

        if 'read_only' not in columns:
            self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN read_only BOOLEAN DEFAULT 1")
            print("✓ Added read_only column to translation_memories (default: read-only)")

        if 'project_id' not in columns:
            self.cursor.execute("ALTER TABLE translation_memories ADD COLUMN project_id INTEGER")
            print("✓ Added project_id column to translation_memories")

        self.connection.commit()
    except Exception as e:
        print(f"Migration info: {e}")

    # ============================================
    # TERMBASE TABLES
    # ============================================

    # Termbases container table (terminology, never "termbase")
    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS termbases (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL UNIQUE,
            description TEXT,
            source_lang TEXT,
            target_lang TEXT,
            project_id INTEGER, -- NULL = global, set = project-specific
            is_global BOOLEAN DEFAULT 1,
            is_project_termbase BOOLEAN DEFAULT 0, -- True if this is a project-specific termbase
            priority INTEGER DEFAULT 50, -- DEPRECATED: Use ranking instead
            ranking INTEGER, -- Termbase activation ranking: 1 = highest priority, 2 = second highest, etc. Only for activated termbases.
            read_only BOOLEAN DEFAULT 1, -- Whether this termbase should not be updated (default: read-only, Write unchecked)
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # Migration: Add priority column if it doesn't exist (for existing databases).
    # ALTER TABLE raises if the column already exists; the bare pass is intentional.
    try:
        self.cursor.execute("ALTER TABLE termbases ADD COLUMN priority INTEGER DEFAULT 50")
        self.connection.commit()
    except Exception:
        # Column already exists, ignore
        pass

    # Migration: Add is_project_termbase column if it doesn't exist
    try:
        self.cursor.execute("ALTER TABLE termbases ADD COLUMN is_project_termbase BOOLEAN DEFAULT 0")
        self.connection.commit()
    except Exception:
        # Column already exists, ignore
        pass

    # Migration: Add ranking column if it doesn't exist
    try:
        self.cursor.execute("ALTER TABLE termbases ADD COLUMN ranking INTEGER")
        self.connection.commit()
    except Exception:
        # Column already exists, ignore
        pass

    # Migration: Add read_only column if it doesn't exist
    try:
        self.cursor.execute("ALTER TABLE termbases ADD COLUMN read_only BOOLEAN DEFAULT 1")
        self.connection.commit()
    except Exception:
        # Column already exists, ignore
        pass

    # Data Migration: Set is_project_termbase=1 for termbases with non-NULL project_id
    # This ensures existing project termbases are correctly flagged
    try:
        self.cursor.execute("""
            UPDATE termbases
            SET is_project_termbase = 1
            WHERE project_id IS NOT NULL
            AND (is_project_termbase IS NULL OR is_project_termbase = 0)
        """)
        updated_count = self.cursor.rowcount
        if updated_count > 0:
            self.log(f"✅ Data migration: Updated {updated_count} project termbase(s) with is_project_termbase=1")
        self.connection.commit()
    except Exception as e:
        self.log(f"⚠️ Data migration warning (is_project_termbase): {e}")
        pass

    # Legacy support: create glossaries as alias for termbases
    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS glossaries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL UNIQUE,
            description TEXT,
            source_lang TEXT,
            target_lang TEXT,
            project_id INTEGER,
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # Termbase activation (tracks which termbases are active for which projects)
    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS termbase_activation (
            termbase_id INTEGER NOT NULL,
            project_id INTEGER NOT NULL,
            is_active BOOLEAN DEFAULT 1,
            activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            priority INTEGER, -- Manual priority (1=highest, 2=second, etc.). Multiple termbases can share same priority.
            PRIMARY KEY (termbase_id, project_id),
            FOREIGN KEY (termbase_id) REFERENCES termbases(id) ON DELETE CASCADE
        )
    """)

    # Migration: Add priority column to termbase_activation if it doesn't exist
    try:
        self.cursor.execute("ALTER TABLE termbase_activation ADD COLUMN priority INTEGER")
        self.connection.commit()
    except Exception:
        # Column already exists, ignore
        pass

    # Legacy support: termbase_project_activation as alias
    # Note: Foreign key now references termbases for consistency with Qt version
    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS termbase_project_activation (
            termbase_id INTEGER NOT NULL,
            project_id INTEGER NOT NULL,
            activated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (termbase_id, project_id),
            FOREIGN KEY (termbase_id) REFERENCES termbases(id) ON DELETE CASCADE
        )
    """)

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS termbase_terms (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_term TEXT NOT NULL,
            target_term TEXT NOT NULL,
            source_lang TEXT DEFAULT 'unknown',
            target_lang TEXT DEFAULT 'unknown',
            termbase_id TEXT NOT NULL,
            priority INTEGER DEFAULT 99,
            project_id TEXT,

            -- Terminology-specific fields
            synonyms TEXT,
            forbidden_terms TEXT,
            definition TEXT,
            context TEXT,
            part_of_speech TEXT,
            domain TEXT,
            case_sensitive BOOLEAN DEFAULT 0,
            forbidden BOOLEAN DEFAULT 0,

            -- Link to TM entry (optional)
            tm_source_id INTEGER,

            -- Metadata
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            usage_count INTEGER DEFAULT 0,
            notes TEXT,
            note TEXT,
            project TEXT,
            client TEXT,
            term_uuid TEXT,

            FOREIGN KEY (tm_source_id) REFERENCES translation_units(id) ON DELETE SET NULL
        )
    """)

    # Indexes for termbase_terms
    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_gt_source_term
        ON termbase_terms(source_term)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_gt_termbase_id
        ON termbase_terms(termbase_id)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_gt_project_id
        ON termbase_terms(project_id)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_gt_domain
        ON termbase_terms(domain)
    """)

    # Full-text search for termbase
    # NOTE(review): unlike translation_units_fts, no sync triggers are
    # created here for this FTS table — presumably handled elsewhere; verify.
    self.cursor.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS termbase_terms_fts
        USING fts5(
            source_term,
            target_term,
            definition,
            content=termbase_terms,
            content_rowid=id
        )
    """)

    # ============================================
    # NON-TRANSLATABLES
    # ============================================

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS non_translatables (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pattern TEXT NOT NULL UNIQUE,
            pattern_type TEXT DEFAULT 'regex',
            description TEXT,
            project_id TEXT,
            enabled BOOLEAN DEFAULT 1,
            example_text TEXT,
            category TEXT,
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_nt_project_id
        ON non_translatables(project_id)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_nt_category
        ON non_translatables(category)
    """)

    # ============================================
    # SEGMENTATION RULES
    # ============================================

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS segmentation_rules (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            rule_name TEXT NOT NULL,
            source_lang TEXT,
            rule_type TEXT NOT NULL,
            pattern TEXT NOT NULL,
            description TEXT,
            priority INTEGER DEFAULT 100,
            enabled BOOLEAN DEFAULT 1,
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_sr_source_lang
        ON segmentation_rules(source_lang)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_sr_priority
        ON segmentation_rules(priority)
    """)

    # ============================================
    # PROJECT METADATA
    # ============================================

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS projects (
            id TEXT PRIMARY KEY,
            name TEXT NOT NULL,
            source_lang TEXT,
            target_lang TEXT,
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            modified_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_opened TIMESTAMP,

            -- Linked resources (JSON arrays)
            active_tm_ids TEXT,
            active_termbase_ids TEXT,
            active_prompt_file TEXT,
            active_style_guide TEXT,

            -- Statistics
            segment_count INTEGER DEFAULT 0,
            translated_count INTEGER DEFAULT 0,

            -- Settings (JSON blob)
            settings TEXT
        )
    """)

    # ============================================
    # FILE METADATA (for prompts and style guides)
    # ============================================

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS prompt_files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_path TEXT NOT NULL UNIQUE,
            file_type TEXT NOT NULL,
            name TEXT NOT NULL,
            description TEXT,
            last_used TIMESTAMP,
            use_count INTEGER DEFAULT 0
        )
    """)

    # ============================================
    # TMX EDITOR TABLES (for database-backed TMX files)
    # ============================================

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS tmx_files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_path TEXT NOT NULL UNIQUE,
            file_name TEXT NOT NULL,
            original_file_path TEXT, -- Original file path when imported
            load_mode TEXT NOT NULL, -- 'ram' or 'database'
            file_size INTEGER, -- File size in bytes

            -- Header metadata (JSON)
            header_data TEXT NOT NULL,

            -- Statistics
            tu_count INTEGER DEFAULT 0,
            languages TEXT, -- JSON array of language codes

            -- Timestamps
            created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS tmx_translation_units (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            tmx_file_id INTEGER NOT NULL,
            tu_id INTEGER NOT NULL, -- Original TU ID from TMX file

            -- System attributes
            creation_date TEXT,
            creation_id TEXT,
            change_date TEXT,
            change_id TEXT,
            srclang TEXT,

            -- Custom attributes (JSON)
            custom_attributes TEXT,

            -- Comments (JSON array)
            comments TEXT,

            FOREIGN KEY (tmx_file_id) REFERENCES tmx_files(id) ON DELETE CASCADE,
            UNIQUE(tmx_file_id, tu_id)
        )
    """)

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS tmx_segments (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            tu_id INTEGER NOT NULL, -- References tmx_translation_units.id
            lang TEXT NOT NULL,
            text TEXT NOT NULL,

            -- Language-specific attributes
            creation_date TEXT,
            creation_id TEXT,
            change_date TEXT,
            change_id TEXT,

            FOREIGN KEY (tu_id) REFERENCES tmx_translation_units(id) ON DELETE CASCADE,
            UNIQUE(tu_id, lang)
        )
    """)

    # Indexes for TMX tables
    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tmx_tu_file_id
        ON tmx_translation_units(tmx_file_id)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tmx_tu_tu_id
        ON tmx_translation_units(tu_id)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tmx_seg_tu_id
        ON tmx_segments(tu_id)
    """)

    self.cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_tmx_seg_lang
        ON tmx_segments(lang)
    """)

    self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS style_guide_files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_path TEXT NOT NULL UNIQUE,
            language TEXT NOT NULL,
            last_used TIMESTAMP,
            use_count INTEGER DEFAULT 0
        )
    """)

    # Commit schema — re-raise on failure because the application cannot
    # run against a half-created schema.
    try:
        self.connection.commit()
        print("✅ Database tables created and committed successfully")
    except Exception as e:
        print(f"❌ Error committing database schema: {e}")
        import traceback
        traceback.print_exc()
        raise
|
|
646
|
+
|
|
647
|
+
def close(self):
    """Close database connection"""
    if not self.connection:
        return
    self.connection.close()
    # Reset handles so a later connect() starts from a clean state.
    self.connection = None
    self.cursor = None
|
|
653
|
+
|
|
654
|
+
# ============================================
|
|
655
|
+
# TRANSLATION MEMORY METHODS
|
|
656
|
+
# ============================================
|
|
657
|
+
|
|
658
|
+
def add_translation_unit(self, source: str, target: str, source_lang: str,
                         target_lang: str, tm_id: str = 'project',
                         project_id: str = None, context_before: str = None,
                         context_after: str = None, notes: str = None) -> int:
    """
    Add translation unit to database (upsert).

    If an identical (source, target, tm_id) combination already exists,
    its usage_count is incremented and modified_date refreshed instead of
    inserting a duplicate row.

    Args:
        source: Source-language text
        target: Target-language text
        source_lang: Source language code
        target_lang: Target language code
        tm_id: Identifier of the TM this unit belongs to
        project_id: Optional owning project identifier
        context_before: Optional preceding-segment context
        context_after: Optional following-segment context
        notes: Optional free-form notes

    Returns: ID of inserted/updated entry, or None on error
    """
    # Generate hash for fast exact matching (MD5 used as a lookup key,
    # not for security)
    source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()

    try:
        self.cursor.execute("""
            INSERT INTO translation_units
            (source_text, target_text, source_lang, target_lang, tm_id,
             project_id, context_before, context_after, source_hash, notes)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
                usage_count = usage_count + 1,
                modified_date = CURRENT_TIMESTAMP
        """, (source, target, source_lang, target_lang, tm_id,
              project_id, context_before, context_after, source_hash, notes))

        self.connection.commit()

        # BUG FIX: cursor.lastrowid is not reliable when the upsert takes
        # the DO UPDATE branch (it reflects the last *insert*, not the
        # updated row), so resolve the row id explicitly via the conflict
        # key to honor the documented return contract.
        self.cursor.execute("""
            SELECT id FROM translation_units
            WHERE source_hash = ? AND target_text = ? AND tm_id = ?
        """, (source_hash, target, tm_id))
        row = self.cursor.fetchone()
        return row[0] if row else self.cursor.lastrowid

    except Exception as e:
        self.log(f"Error adding translation unit: {e}")
        return None
|
|
688
|
+
|
|
689
|
+
def get_exact_match(self, source: str, tm_ids: List[str] = None,
                    source_lang: str = None, target_lang: str = None,
                    bidirectional: bool = True) -> Optional[Dict]:
    """
    Get exact match from TM

    Args:
        source: Source text to match
        tm_ids: List of TM IDs to search (None = all)
        source_lang: Filter by source language (base code matching: 'en' matches 'en-US', 'en-GB', etc.)
        target_lang: Filter by target language (base code matching)
        bidirectional: If True, search both directions (nl→en AND en→nl)

    Returns: Dictionary with match data or None
    """
    from modules.tmx_generator import get_base_lang_code

    # MD5 of the source narrows candidates via the indexed source_hash column;
    # the additional source_text equality below guards against collisions.
    source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()

    # Get base language codes for comparison
    src_base = get_base_lang_code(source_lang) if source_lang else None
    tgt_base = get_base_lang_code(target_lang) if target_lang else None

    query = """
        SELECT * FROM translation_units
        WHERE source_hash = ? AND source_text = ?
    """
    params = [source_hash, source]

    if tm_ids:
        placeholders = ','.join('?' * len(tm_ids))
        query += f" AND tm_id IN ({placeholders})"
        params.extend(tm_ids)

    # Use flexible language matching (matches 'nl', 'nl-NL', 'Dutch', etc.)
    from modules.tmx_generator import get_lang_match_variants
    if src_base:
        src_variants = get_lang_match_variants(source_lang)
        src_conditions = []
        for variant in src_variants:
            # Each variant matches either exactly or as a region-prefixed code
            # ('nl' also matches 'nl-BE', 'nl-NL', ...).
            src_conditions.append("source_lang = ?")
            params.append(variant)
            src_conditions.append("source_lang LIKE ?")
            params.append(f"{variant}-%")
        query += f" AND ({' OR '.join(src_conditions)})"

    if tgt_base:
        tgt_variants = get_lang_match_variants(target_lang)
        tgt_conditions = []
        for variant in tgt_variants:
            tgt_conditions.append("target_lang = ?")
            params.append(variant)
            tgt_conditions.append("target_lang LIKE ?")
            params.append(f"{variant}-%")
        query += f" AND ({' OR '.join(tgt_conditions)})"

    # Prefer the most-used, most recently modified unit when several match.
    query += " ORDER BY usage_count DESC, modified_date DESC LIMIT 1"

    self.cursor.execute(query, params)
    row = self.cursor.fetchone()

    if row:
        # Update usage count
        self.cursor.execute("""
            UPDATE translation_units
            SET usage_count = usage_count + 1
            WHERE id = ?
        """, (row['id'],))
        self.connection.commit()

        return dict(row)

    # If bidirectional and no forward match, try reverse direction
    if bidirectional and src_base and tgt_base:
        # Search where our source text is in the target field (reverse direction)
        query = """
            SELECT * FROM translation_units
            WHERE target_text = ?
        """
        params = [source]

        if tm_ids:
            placeholders = ','.join('?' * len(tm_ids))
            query += f" AND tm_id IN ({placeholders})"
            params.extend(tm_ids)

        # Reversed: search where TM source_lang matches our target_lang (flexible matching)
        # Note: for reverse, we swap - TM source_lang should match our target_lang
        tgt_variants = get_lang_match_variants(target_lang)
        src_variants = get_lang_match_variants(source_lang)

        src_conditions = []
        for variant in tgt_variants:  # TM source_lang = our target_lang
            src_conditions.append("source_lang = ?")
            params.append(variant)
            src_conditions.append("source_lang LIKE ?")
            params.append(f"{variant}-%")

        tgt_conditions = []
        for variant in src_variants:  # TM target_lang = our source_lang
            tgt_conditions.append("target_lang = ?")
            params.append(variant)
            tgt_conditions.append("target_lang LIKE ?")
            params.append(f"{variant}-%")

        query += f" AND ({' OR '.join(src_conditions)}) AND ({' OR '.join(tgt_conditions)})"

        query += " ORDER BY usage_count DESC, modified_date DESC LIMIT 1"

        self.cursor.execute(query, params)
        row = self.cursor.fetchone()

        if row:
            # Update usage count
            self.cursor.execute("""
                UPDATE translation_units
                SET usage_count = usage_count + 1
                WHERE id = ?
            """, (row['id'],))
            self.connection.commit()

            # Swap source/target since this is a reverse match
            result = dict(row)
            result['source_text'], result['target_text'] = result['target_text'], result['source_text']
            result['source_lang'], result['target_lang'] = result['target_lang'], result['source_lang']
            result['reverse_match'] = True
            return result

    return None
|
|
818
|
+
|
|
819
|
+
def calculate_similarity(self, text1: str, text2: str) -> float:
    """Return a similarity ratio (0.0-1.0) between two texts.

    HTML/XML tags are removed and both texts are lower-cased before the
    SequenceMatcher comparison, so markup and casing differences do not
    penalise the score.
    """
    import re

    def _normalize(text: str) -> str:
        # Drop anything that looks like a tag, then fold case.
        return re.sub(r'<[^>]+>', '', text).lower()

    return SequenceMatcher(None, _normalize(text1), _normalize(text2)).ratio()
|
|
831
|
+
|
|
832
|
+
def search_fuzzy_matches(self, source: str, tm_ids: List[str] = None,
                         threshold: float = 0.75, max_results: int = 5,
                         source_lang: str = None, target_lang: str = None,
                         bidirectional: bool = True) -> List[Dict]:
    """Find fuzzy TM matches for *source* using FTS5 candidate retrieval.

    Args:
        source: Source text to match.
        tm_ids: TM IDs to search (None = all).
        threshold: Minimum SequenceMatcher similarity (0.0-1.0).
        max_results: Maximum number of matches returned.
        source_lang: Optional source-language filter (base-code matching).
        target_lang: Optional target-language filter (base-code matching).
        bidirectional: If True, also search the reverse direction.

    Returns: List of match dicts with 'similarity' and 'match_pct' added,
    sorted by similarity (highest first).

    Note: when several TMs are given, each TM is searched separately so that
    good matches from small TMs are not pushed out by BM25 keyword ranking
    from large TMs; results are merged and re-sorted by real similarity.
    """
    import re
    from modules.tmx_generator import get_base_lang_code, get_lang_match_variants

    # Collect candidate keywords from the tag-stripped text first, then from
    # the raw text (the TM may have been indexed with tags included).
    tagless = re.sub(r'<[^>]+>', '', source)
    raw_terms = []
    for candidate_text in (tagless, source):
        # Replace FTS5 special characters with spaces, keep words > 2 chars.
        normalized = re.sub(r'[^\w\s]', ' ', candidate_text)
        raw_terms.extend(word for word in normalized.strip().split() if len(word) > 2)

    # Deduplicate (first occurrence wins), then prefer longer words — they
    # discriminate better — and cap at 20 to keep the FTS5 query manageable.
    unique_terms = list(dict.fromkeys(raw_terms))
    unique_terms.sort(key=len, reverse=True)
    query_terms = unique_terms[:20]
    if not query_terms:
        return []

    # Quote every term so FTS5 syntax characters cannot break the query.
    fts_query = ' OR '.join(f'"{term}"' for term in query_terms)

    src_base = get_base_lang_code(source_lang) if source_lang else None
    tgt_base = get_base_lang_code(target_lang) if target_lang else None

    # Search each TM on its own (None = all TMs together) to sidestep
    # cross-TM BM25 ranking distortion.
    collected = []
    for tm_id in (tm_ids or [None]):
        collected.extend(self._search_single_tm_fuzzy(
            source, fts_query, [tm_id] if tm_id else None,
            threshold, max_results, src_base, tgt_base,
            source_lang, target_lang, bidirectional
        ))

    # Keep only the best-scoring hit per distinct source text.
    best_per_source = {}
    for hit in collected:
        key = hit['source_text']
        if key not in best_per_source or hit['similarity'] > best_per_source[key]['similarity']:
            best_per_source[key] = hit

    # Highest real similarity first, regardless of originating TM.
    ranked = sorted(best_per_source.values(),
                    key=lambda hit: hit['similarity'], reverse=True)
    return ranked[:max_results]
|
|
916
|
+
|
|
917
|
+
def _search_single_tm_fuzzy(self, source: str, fts_query: str, tm_ids: List[str],
                            threshold: float, max_results: int,
                            src_base: str, tgt_base: str,
                            source_lang: str, target_lang: str,
                            bidirectional: bool) -> List[Dict]:
    """Search a single TM (or all TMs if tm_ids is None) for fuzzy matches"""
    from modules.tmx_generator import get_lang_match_variants

    # Build query for this TM: FTS5 retrieves keyword candidates ranked by
    # BM25; real similarity is computed afterwards with SequenceMatcher.
    query = """
        SELECT tu.*,
               bm25(translation_units_fts) as relevance
        FROM translation_units tu
        JOIN translation_units_fts ON tu.id = translation_units_fts.rowid
        WHERE translation_units_fts MATCH ?
    """
    params = [fts_query]

    # tm_ids may be [None] when the caller wants all TMs searched together.
    if tm_ids and tm_ids[0] is not None:
        placeholders = ','.join('?' * len(tm_ids))
        query += f" AND tu.tm_id IN ({placeholders})"
        params.extend(tm_ids)

    # Use flexible language matching (matches 'nl', 'nl-NL', 'Dutch', etc.)
    if src_base:
        src_variants = get_lang_match_variants(source_lang)
        src_conditions = []
        for variant in src_variants:
            src_conditions.append("tu.source_lang = ?")
            params.append(variant)
            src_conditions.append("tu.source_lang LIKE ?")
            params.append(f"{variant}-%")
        query += f" AND ({' OR '.join(src_conditions)})"

    if tgt_base:
        tgt_variants = get_lang_match_variants(target_lang)
        tgt_conditions = []
        for variant in tgt_variants:
            tgt_conditions.append("tu.target_lang = ?")
            params.append(variant)
            tgt_conditions.append("tu.target_lang LIKE ?")
            params.append(f"{variant}-%")
        query += f" AND ({' OR '.join(tgt_conditions)})"

    # Per-TM candidate limit - INCREASED to catch more potential fuzzy matches
    # When multiple TMs are searched, BM25 ranking can push genuinely similar
    # entries far down the list due to common word matches in other entries
    candidate_limit = max(500, max_results * 50)
    query += f" ORDER BY relevance DESC LIMIT {candidate_limit}"

    try:
        self.cursor.execute(query, params)
        all_rows = self.cursor.fetchall()
    except Exception as e:
        # FTS5 query failure (e.g. index missing) → no forward candidates.
        return []

    results = []

    for row in all_rows:
        match_dict = dict(row)
        # Calculate actual similarity using SequenceMatcher
        similarity = self.calculate_similarity(source, match_dict['source_text'])

        # Only include matches above threshold
        if similarity >= threshold:
            match_dict['similarity'] = similarity
            match_dict['match_pct'] = int(similarity * 100)
            results.append(match_dict)

    # If bidirectional, also search reverse direction
    if bidirectional and src_base and tgt_base:
        query = """
            SELECT tu.*,
                   bm25(translation_units_fts) as relevance
            FROM translation_units tu
            JOIN translation_units_fts ON tu.id = translation_units_fts.rowid
            WHERE translation_units_fts MATCH ?
        """
        params = [fts_query]

        if tm_ids and tm_ids[0] is not None:
            placeholders = ','.join('?' * len(tm_ids))
            query += f" AND tu.tm_id IN ({placeholders})"
            params.extend(tm_ids)

        # Reversed language filters with flexible matching
        src_variants = get_lang_match_variants(source_lang)
        tgt_variants = get_lang_match_variants(target_lang)

        # TM target_lang = our source_lang
        tgt_conditions = []
        for variant in src_variants:
            tgt_conditions.append("tu.target_lang = ?")
            params.append(variant)
            tgt_conditions.append("tu.target_lang LIKE ?")
            params.append(f"{variant}-%")
        query += f" AND ({' OR '.join(tgt_conditions)})"

        # TM source_lang = our target_lang
        src_conditions = []
        for variant in tgt_variants:
            src_conditions.append("tu.source_lang = ?")
            params.append(variant)
            src_conditions.append("tu.source_lang LIKE ?")
            params.append(f"{variant}-%")
        query += f" AND ({' OR '.join(src_conditions)})"

        query += f" ORDER BY relevance DESC LIMIT {max_results * 5}"

        try:
            self.cursor.execute(query, params)

            for row in self.cursor.fetchall():
                match_dict = dict(row)
                # Calculate similarity against target_text (since we're reversing)
                similarity = self.calculate_similarity(source, match_dict['target_text'])

                # Only include matches above threshold
                if similarity >= threshold:
                    # Swap source/target for reverse match
                    match_dict['source_text'], match_dict['target_text'] = match_dict['target_text'], match_dict['source_text']
                    match_dict['source_lang'], match_dict['target_lang'] = match_dict['target_lang'], match_dict['source_lang']
                    match_dict['similarity'] = similarity
                    match_dict['match_pct'] = int(similarity * 100)
                    match_dict['reverse_match'] = True
                    results.append(match_dict)
        except Exception as e:
            print(f"[DEBUG] _search_single_tm_fuzzy (reverse): SQL ERROR: {e}")

    return results
|
|
1047
|
+
|
|
1048
|
+
def search_all(self, source: str, tm_ids: List[str] = None, enabled_only: bool = True,
               threshold: float = 0.75, max_results: int = 10) -> List[Dict]:
    """Search TMs for *source*, exact match first, then fuzzy.

    Args:
        source: Source text to search for.
        tm_ids: TM IDs to search (None = all).
        enabled_only: Currently ignored (all TMs are treated as enabled).
        threshold: Minimum fuzzy similarity (0.0-1.0).
        max_results: Maximum number of fuzzy results.

    Returns:
        List of dicts with 'source', 'target', 'match_pct', 'tm_name', 'tm_id'.
    """
    # An exact hit short-circuits the fuzzy search and is reported as 100%.
    exact = self.get_exact_match(source, tm_ids=tm_ids)
    if exact:
        return [{
            'source': exact['source_text'],
            'target': exact['target_text'],
            'match_pct': 100,
            'tm_name': exact['tm_id'].replace('_', ' ').title(),
            'tm_id': exact['tm_id'],
        }]

    # No exact hit: fall through to fuzzy matching.
    fuzzy = self.search_fuzzy_matches(
        source,
        tm_ids=tm_ids,
        threshold=threshold,
        max_results=max_results,
    )

    return [
        {
            'source': match['source_text'],
            'target': match['target_text'],
            'match_pct': match['match_pct'],
            'tm_name': match['tm_id'].replace('_', ' ').title(),
            'tm_id': match['tm_id'],
        }
        for match in fuzzy
    ]
|
|
1093
|
+
|
|
1094
|
+
def get_tm_entries(self, tm_id: str, limit: int = None) -> List[Dict]:
    """Return all entries from a specific TM, ordered by insertion id.

    Args:
        tm_id: TM identifier whose entries are fetched.
        limit: Optional maximum number of rows (None/0 = no limit).

    Returns:
        List of row dictionaries.
    """
    query = "SELECT * FROM translation_units WHERE tm_id = ? ORDER BY id"
    params = [tm_id]

    if limit:
        # FIX: bind the limit as a parameter instead of interpolating it
        # into the SQL string (consistent with the rest of this class and
        # safe even if a non-integer slips through).
        query += " LIMIT ?"
        params.append(limit)

    self.cursor.execute(query, params)
    return [dict(row) for row in self.cursor.fetchall()]
|
|
1104
|
+
|
|
1105
|
+
def get_tm_count(self, tm_id: str = None) -> int:
    """Return the number of translation units for *tm_id* (all TMs when falsy)."""
    if not tm_id:
        # No TM specified: count across the whole table.
        self.cursor.execute("SELECT COUNT(*) FROM translation_units")
    else:
        self.cursor.execute(
            """
            SELECT COUNT(*) FROM translation_units WHERE tm_id = ?
        """, (tm_id,))

    return self.cursor.fetchone()[0]
|
|
1115
|
+
|
|
1116
|
+
def clear_tm(self, tm_id: str):
    """Delete every translation unit belonging to *tm_id* and commit."""
    delete_sql = """
            DELETE FROM translation_units WHERE tm_id = ?
        """
    self.cursor.execute(delete_sql, (tm_id,))
    self.connection.commit()
|
|
1122
|
+
|
|
1123
|
+
def delete_entry(self, tm_id: str, source: str, target: str):
    """Delete a specific (source, target) entry from a TM.

    Also removes the matching row from the FTS5 index so search results
    stay in sync. Silently returns if the entry does not exist.
    """
    # Get the ID first so the FTS index row can be removed by rowid
    self.cursor.execute("""
        SELECT id FROM translation_units
        WHERE tm_id = ? AND source_text = ? AND target_text = ?
    """, (tm_id, source, target))

    result = self.cursor.fetchone()
    if not result:
        return  # Entry not found

    entry_id = result['id']

    # Delete from FTS5 index first.
    # BUG FIX: the index table is translation_units_fts (the name used by
    # rebuild_fts_index and all search queries), not tm_fts; with the old
    # name the delete always failed and was silently swallowed, leaving a
    # stale FTS row behind.
    try:
        self.cursor.execute("""
            DELETE FROM translation_units_fts WHERE rowid = ?
        """, (entry_id,))
    except Exception:
        pass  # FTS5 table might not exist

    # Delete from main table
    self.cursor.execute("""
        DELETE FROM translation_units
        WHERE id = ?
    """, (entry_id,))

    self.connection.commit()
|
|
1152
|
+
|
|
1153
|
+
def concordance_search(self, query: str, tm_ids: List[str] = None, direction: str = 'both',
                       source_lang = None, target_lang = None) -> List[Dict]:
    """
    Search for text in source and/or target (concordance search)
    Uses FTS5 full-text search for fast matching on millions of segments.
    Falls back to LIKE queries if FTS5 fails.

    Language filters define what you're searching FOR and what translation you want:
    - "From: Dutch, To: English" = Search for Dutch text, show English translations
    - Searches ALL TMs (regardless of their stored language pair direction)
    - Automatically swaps columns when needed (e.g., finds Dutch in target column of EN→NL TM)
    - This is MORE intuitive than traditional CAT tools that only search specific TM directions

    Args:
        query: Text to search for
        tm_ids: List of TM IDs to search (None = all)
        direction: 'source' = search source only, 'target' = search target only, 'both' = bidirectional
        source_lang: Filter by source language - can be a string OR a list of language variants (None = any)
        target_lang: Filter by target language - can be a string OR a list of language variants (None = any)
    """
    # Normalize language filters to lists for consistent handling
    source_langs = source_lang if isinstance(source_lang, list) else ([source_lang] if source_lang else None)
    target_langs = target_lang if isinstance(target_lang, list) else ([target_lang] if target_lang else None)

    # Escape FTS5 special characters and wrap words for prefix matching
    # FTS5 special chars: " * ( ) : ^
    fts_query = query.replace('"', '""')
    # Wrap in quotes for phrase search
    fts_query = f'"{fts_query}"'

    # When language filters specified, we need to search intelligently:
    # - Don't filter by TM language pair (search ALL TMs)
    # - Search in BOTH columns to find text
    # - Swap columns if needed to show correct language order
    use_smart_search = (source_langs or target_langs)

    try:
        # Use FTS5 for fast full-text search
        if direction == 'source':
            fts_sql = """
                SELECT tu.* FROM translation_units tu
                JOIN translation_units_fts fts ON tu.id = fts.rowid
                WHERE fts.source_text MATCH ?
            """
            params = [fts_query]
        elif direction == 'target':
            fts_sql = """
                SELECT tu.* FROM translation_units tu
                JOIN translation_units_fts fts ON tu.id = fts.rowid
                WHERE fts.target_text MATCH ?
            """
            params = [fts_query]
        else:
            # Both directions - search in combined FTS index
            fts_sql = """
                SELECT tu.* FROM translation_units tu
                JOIN translation_units_fts fts ON tu.id = fts.rowid
                WHERE translation_units_fts MATCH ?
            """
            params = [fts_query]

        if tm_ids:
            placeholders = ','.join('?' * len(tm_ids))
            fts_sql += f" AND tu.tm_id IN ({placeholders})"
            params.extend(tm_ids)

        # DON'T filter by language when smart search active
        # (we need to search all TMs and figure out which column has our language)
        if not use_smart_search:
            # Traditional filtering when no language filters
            if source_langs:
                placeholders = ','.join('?' * len(source_langs))
                fts_sql += f" AND tu.source_lang IN ({placeholders})"
                params.extend(source_langs)
            if target_langs:
                placeholders = ','.join('?' * len(target_langs))
                fts_sql += f" AND tu.target_lang IN ({placeholders})"
                params.extend(target_langs)

        fts_sql += " ORDER BY tu.modified_date DESC LIMIT 100"

        self.cursor.execute(fts_sql, params)
        raw_results = [dict(row) for row in self.cursor.fetchall()]

        # Smart search: Filter and swap based on language metadata
        if use_smart_search:
            processed_results = []
            for row in raw_results:
                row_src_lang = row.get('source_lang', '')
                row_tgt_lang = row.get('target_lang', '')

                # Check if this row matches our language requirements
                # If "From: Dutch, To: English":
                #   - Accept if source=nl and target=en (normal)
                #   - Accept if source=en and target=nl (swap needed)

                matches = False
                needs_swap = False

                if source_langs and target_langs:
                    # Both filters specified
                    if row_src_lang in source_langs and row_tgt_lang in target_langs:
                        # Perfect match - no swap
                        matches = True
                        needs_swap = False
                    elif row_src_lang in target_langs and row_tgt_lang in source_langs:
                        # Reversed - needs swap
                        matches = True
                        needs_swap = True
                elif source_langs:
                    # Only "From" specified - just check if Dutch is in EITHER column
                    if row_src_lang in source_langs:
                        matches = True
                        needs_swap = False
                    elif row_tgt_lang in source_langs:
                        matches = True
                        needs_swap = True
                elif target_langs:
                    # Only "To" specified - just check if English is in EITHER column
                    if row_tgt_lang in target_langs:
                        matches = True
                        needs_swap = False
                    elif row_src_lang in target_langs:
                        matches = True
                        needs_swap = True

                if matches:
                    # CRITICAL CHECK: Verify the search text is actually in the correct column
                    # If user searches for Dutch with "From: Dutch", the text must be in the source column (after any swap)
                    # This prevents finding Dutch text when user asks to search FOR English

                    if needs_swap:
                        # After swap, check if query is in the NEW source column (was target)
                        text_to_check = row['target_text'].lower()
                    else:
                        # No swap, check if query is in source column
                        text_to_check = row['source_text'].lower()

                    # Only include if query text is actually in the source column
                    if query.lower() in text_to_check:
                        if needs_swap:
                            # Swap columns to show correct language order
                            swapped_row = row.copy()
                            swapped_row['source'] = row['target_text']
                            swapped_row['target'] = row['source_text']
                            swapped_row['source_lang'] = row['target_lang']
                            swapped_row['target_lang'] = row['source_lang']
                            processed_results.append(swapped_row)
                        else:
                            # No swap needed - just rename columns
                            processed_row = row.copy()
                            processed_row['source'] = row['source_text']
                            processed_row['target'] = row['target_text']
                            processed_results.append(processed_row)

            return processed_results
        else:
            # No language filters - just rename columns
            processed_results = []
            for row in raw_results:
                processed_row = row.copy()
                processed_row['source'] = row['source_text']
                processed_row['target'] = row['target_text']
                processed_results.append(processed_row)
            return processed_results

    except Exception as e:
        # Fallback to LIKE query if FTS5 fails (e.g., index not built)
        print(f"[TM] FTS5 search failed, falling back to LIKE: {e}")
        search_query = f"%{query}%"

        if direction == 'source':
            sql = """
                SELECT * FROM translation_units
                WHERE source_text LIKE ?
            """
            params = [search_query]
        elif direction == 'target':
            sql = """
                SELECT * FROM translation_units
                WHERE target_text LIKE ?
            """
            params = [search_query]
        else:
            sql = """
                SELECT * FROM translation_units
                WHERE (source_text LIKE ? OR target_text LIKE ?)
            """
            params = [search_query, search_query]

        if tm_ids:
            placeholders = ','.join('?' * len(tm_ids))
            sql += f" AND tm_id IN ({placeholders})"
            params.extend(tm_ids)

        # Add language filters (support for list of variants)
        if source_langs:
            placeholders = ','.join('?' * len(source_langs))
            sql += f" AND source_lang IN ({placeholders})"
            params.extend(source_langs)
        if target_langs:
            placeholders = ','.join('?' * len(target_langs))
            sql += f" AND target_lang IN ({placeholders})"
            params.extend(target_langs)

        sql += " ORDER BY modified_date DESC LIMIT 100"

        self.cursor.execute(sql, params)
        return [dict(row) for row in self.cursor.fetchall()]
|
|
1362
|
+
|
|
1363
|
+
def rebuild_fts_index(self) -> int:
    """
    Rebuild the FTS5 full-text search index from scratch.
    Use this after importing TMs or if FTS search isn't returning results.

    Returns:
        Number of entries indexed (0 on failure)
    """
    try:
        # Clear existing FTS data
        self.cursor.execute("DELETE FROM translation_units_fts")

        # Repopulate from translation_units table
        self.cursor.execute("""
            INSERT INTO translation_units_fts(rowid, source_text, target_text)
            SELECT id, source_text, target_text FROM translation_units
        """)

        # BUG FIX: was self.conn.commit() - the attribute used everywhere
        # else in this class is self.connection, so the old code raised
        # AttributeError, the commit never ran, and the rebuild was
        # (incorrectly) reported as failed.
        self.connection.commit()

        # Get count
        self.cursor.execute("SELECT COUNT(*) FROM translation_units_fts")
        count = self.cursor.fetchone()[0]
        print(f"[TM] FTS5 index rebuilt with {count:,} entries")
        return count
    except Exception as e:
        print(f"[TM] Error rebuilding FTS index: {e}")
        return 0
|
|
1391
|
+
|
|
1392
|
+
def check_fts_index(self) -> Dict:
    """
    Check if FTS5 index is in sync with main table.

    Returns:
        Dict with 'main_count', 'fts_count', 'in_sync' keys; an 'error'
        key is added (with zeroed counts) when a count query fails.
    """
    try:
        counts = {}
        # Count both the main table and its FTS shadow table.
        for key, table in (('main_count', 'translation_units'),
                           ('fts_count', 'translation_units_fts')):
            self.cursor.execute(f"SELECT COUNT(*) FROM {table}")
            counts[key] = self.cursor.fetchone()[0]

        counts['in_sync'] = counts['main_count'] == counts['fts_count']
        return counts
    except Exception as e:
        return {'main_count': 0, 'fts_count': 0, 'in_sync': False, 'error': str(e)}
|
|
1413
|
+
|
|
1414
|
+
# ============================================
# TERMBASE METHODS (Placeholder for Phase 3)
# ============================================
|
|
1417
|
+
|
|
1418
|
+
def add_termbase_term(self, source_term: str, target_term: str,
                      source_lang: str, target_lang: str,
                      termbase_id: str = 'main', **kwargs) -> int:
    """Add a term pair to a termbase.

    Not implemented yet (scheduled for Phase 3); currently a no-op that
    returns None.
    """
    # TODO: Implement in Phase 3
    return None
|
|
1424
|
+
|
|
1425
|
+
def search_termbases(self, search_term: str, source_lang: Optional[str] = None,
                     target_lang: Optional[str] = None, project_id: Optional[str] = None,
                     min_length: int = 0) -> List[Dict]:
    """
    Search active termbases for matching source terms.

    Args:
        search_term: Source term to search for.
        source_lang: Filter by source language (optional).
        target_lang: Filter by target language (optional).
        project_id: Filter by project (optional); also used to resolve
            per-project termbase activation/priority.
        min_length: Minimum source-term length to return.

    Returns:
        List of termbase hit dicts, sorted by ranking (lower = higher
        priority; project termbases with NULL ranking come first). Each
        hit includes a 'target_synonyms' list of non-forbidden synonyms.
    """
    # Build query with filters - include termbase name and ranking via JOIN.
    # Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in
    # termbases, so CAST is needed for the join.
    # The termbase_activation join supplies the project-specific priority.
    # Matching also covers trailing-punctuation variants (e.g. "ca." vs "ca")
    # and the case where the search term starts with the glossary term.
    query = """
        SELECT
            t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
            t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
            t.notes, t.project, t.client,
            tb.name as termbase_name,
            tb.source_lang as termbase_source_lang,
            tb.target_lang as termbase_target_lang,
            tb.is_project_termbase,
            COALESCE(ta.priority, tb.ranking) as ranking
        FROM termbase_terms t
        LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
        LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
        WHERE (
            LOWER(t.source_term) = LOWER(?) OR
            LOWER(t.source_term) LIKE LOWER(?) OR
            LOWER(t.source_term) LIKE LOWER(?) OR
            LOWER(t.source_term) LIKE LOWER(?) OR
            LOWER(RTRIM(t.source_term, '.!?,;:')) = LOWER(?) OR
            LOWER(?) LIKE LOWER(t.source_term) || '%' OR
            LOWER(?) = LOWER(RTRIM(t.source_term, '.!?,;:'))
        )
        AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
    """
    # Matching patterns, in parameter order:
    # 1. Exact match
    # 2. Glossary term starts with search: "search_term %"
    # 3. Glossary term ends with search: "% search_term"
    # 4. Glossary term contains search: "% search_term %"
    # 5. Glossary term stripped of trailing punctuation equals search
    # 6. Search starts with glossary term
    # 7. Search equals glossary term stripped of trailing punctuation
    params = [
        project_id if project_id else 0,  # 0 matches no activation records
        search_term,
        f"{search_term} %",
        f"% {search_term}",
        f"% {search_term} %",
        search_term,  # For RTRIM comparison
        search_term,  # For reverse LIKE
        search_term   # For reverse RTRIM comparison
    ]

    # Language filters - terms without their own language fall back to the
    # termbase language (or match when both are NULL).
    if source_lang:
        query += """ AND (
            t.source_lang = ? OR
            (t.source_lang IS NULL AND tb.source_lang = ?) OR
            (t.source_lang IS NULL AND tb.source_lang IS NULL)
        )"""
        params.extend([source_lang, source_lang])

    if target_lang:
        query += """ AND (
            t.target_lang = ? OR
            (t.target_lang IS NULL AND tb.target_lang = ?) OR
            (t.target_lang IS NULL AND tb.target_lang IS NULL)
        )"""
        params.extend([target_lang, target_lang])

    # Project filter: match project-specific terms OR global terms
    if project_id:
        query += " AND (t.project_id = ? OR t.project_id IS NULL)"
        params.append(project_id)

    # FIX: bind min_length as a parameter instead of f-string interpolation
    # (keeps the query fully parameterized; no value ever reaches the SQL text)
    if min_length > 0:
        query += " AND LENGTH(t.source_term) >= ?"
        params.append(min_length)

    # Sort by ranking (lower number = higher priority); NULL ranking
    # (project termbases) sorts first via COALESCE(-1), then alphabetically.
    query += " ORDER BY COALESCE(tb.ranking, -1) ASC, t.source_term ASC"

    self.cursor.execute(query, params)
    results = []
    for row in self.cursor.fetchall():
        result_dict = dict(row)
        # SQLite stores booleans as 0/1; normalize to Python bool
        if 'is_project_termbase' in result_dict:
            result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])

        # Attach the non-forbidden target-language synonyms for this term
        term_id = result_dict.get('id')
        if term_id:
            try:
                self.cursor.execute("""
                    SELECT synonym_text, forbidden FROM termbase_synonyms
                    WHERE term_id = ? AND language = 'target'
                    ORDER BY display_order ASC
                """, (term_id,))
                synonyms = []
                for syn_row in self.cursor.fetchall():
                    syn_text = syn_row[0]
                    syn_forbidden = bool(syn_row[1])
                    if not syn_forbidden:  # Only include non-forbidden synonyms
                        synonyms.append(syn_text)
                result_dict['target_synonyms'] = synonyms
            except Exception:
                # Synonym table may be absent in older databases; degrade softly
                result_dict['target_synonyms'] = []

        results.append(result_dict)
    return results
|
|
1551
|
+
|
|
1552
|
+
# ============================================
|
|
1553
|
+
# UTILITY METHODS
|
|
1554
|
+
# ============================================
|
|
1555
|
+
|
|
1556
|
+
def get_all_tms(self, enabled_only: bool = True) -> List[Dict]:
    """
    List every translation memory present in the database.

    Args:
        enabled_only: If True, only return enabled TMs (currently every
            TM is reported as enabled, so this has no effect).

    Returns:
        List of dicts with 'tm_id', 'name', 'entry_count', 'enabled'
        and 'read_only' keys, ordered by tm_id.
    """
    self.cursor.execute("SELECT DISTINCT tm_id FROM translation_units ORDER BY tm_id")
    ids = [record[0] for record in self.cursor.fetchall()]

    # Display name: underscores become spaces, title-cased.
    return [
        {
            'tm_id': tm_id,
            'name': tm_id.replace('_', ' ').title(),
            'entry_count': self.get_tm_count(tm_id),
            'enabled': True,  # for now, all TMs are enabled
            'read_only': False,
        }
        for tm_id in ids
    ]
|
|
1584
|
+
|
|
1585
|
+
def get_tm_list(self, enabled_only: bool = True) -> List[Dict]:
    """Backward-compatible alias: delegates to get_all_tms()."""
    return self.get_all_tms(enabled_only=enabled_only)
|
|
1588
|
+
|
|
1589
|
+
def get_entry_count(self, enabled_only: bool = True) -> int:
    """
    Total number of translation entries across all TMs.

    Args:
        enabled_only: Accepted for API symmetry; currently ignored
            because every TM is treated as enabled.

    Returns:
        Total number of translation units.
    """
    return self.get_tm_count()
|
|
1600
|
+
|
|
1601
|
+
def vacuum(self):
    """Compact and defragment the database file via SQLite's VACUUM."""
    self.cursor.execute("VACUUM")
    self.connection.commit()
|
|
1605
|
+
|
|
1606
|
+
# ============================================
|
|
1607
|
+
# TMX EDITOR METHODS (database-backed TMX files)
|
|
1608
|
+
# ============================================
|
|
1609
|
+
|
|
1610
|
+
def tmx_store_file(self, file_path: str, file_name: str, original_file_path: str,
                   load_mode: str, file_size: int, header_data: dict,
                   tu_count: int, languages: List[str]) -> int:
    """
    Insert or update the metadata row for a TMX file (keyed on file_path).

    Returns:
        tmx_file_id (int) of the stored row.
    """
    header_json = json.dumps(header_data)
    languages_json = json.dumps(languages)

    # Upsert: reuse the existing row for this path if one exists.
    self.cursor.execute("SELECT id FROM tmx_files WHERE file_path = ?", (file_path,))
    found = self.cursor.fetchone()

    if found:
        file_id = found['id']
        self.cursor.execute("""
            UPDATE tmx_files
            SET file_name = ?, original_file_path = ?, load_mode = ?, file_size = ?,
                header_data = ?, tu_count = ?, languages = ?, last_accessed = CURRENT_TIMESTAMP
            WHERE id = ?
        """, (file_name, original_file_path, load_mode, file_size, header_json,
              tu_count, languages_json, file_id))
        self.connection.commit()
        return file_id

    self.cursor.execute("""
        INSERT INTO tmx_files
        (file_path, file_name, original_file_path, load_mode, file_size,
         header_data, tu_count, languages)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, (file_path, file_name, original_file_path, load_mode, file_size,
          header_json, tu_count, languages_json))
    self.connection.commit()
    return self.cursor.lastrowid
|
|
1648
|
+
|
|
1649
|
+
def tmx_store_translation_unit(self, tmx_file_id: int, tu_id: int,
                               creation_date: str = None, creation_id: str = None,
                               change_date: str = None, change_id: str = None,
                               srclang: str = None, custom_attributes: dict = None,
                               comments: List[str] = None, commit: bool = True) -> int:
    """
    Insert (or replace) a translation unit row for a TMX file.

    Args:
        commit: If False, skip the commit (caller is batching writes).

    Returns:
        Internal database id of the TU (used as FK by its segments).
    """
    # JSON-encode the optional metadata; empty/None collapses to NULL.
    attrs_json = json.dumps(custom_attributes) if custom_attributes else None
    comments_json = json.dumps(comments) if comments else None

    record = (tmx_file_id, tu_id, creation_date, creation_id, change_date, change_id,
              srclang, attrs_json, comments_json)
    self.cursor.execute(
        "INSERT OR REPLACE INTO tmx_translation_units "
        "(tmx_file_id, tu_id, creation_date, creation_id, change_date, change_id, "
        "srclang, custom_attributes, comments) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
        record)
    if commit:
        self.connection.commit()
    return self.cursor.lastrowid
|
|
1676
|
+
|
|
1677
|
+
def tmx_store_segment(self, tu_db_id: int, lang: str, text: str,
                      creation_date: str = None, creation_id: str = None,
                      change_date: str = None, change_id: str = None,
                      commit: bool = True):
    """
    Persist one language variant (segment) of a translation unit.

    Args:
        tu_db_id: Internal database id of the owning TU.
        commit: If False, skip the commit (caller is batching writes).
    """
    record = (tu_db_id, lang, text, creation_date, creation_id, change_date, change_id)
    self.cursor.execute(
        "INSERT OR REPLACE INTO tmx_segments "
        "(tu_id, lang, text, creation_date, creation_id, change_date, change_id) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)",
        record)
    if commit:
        self.connection.commit()
|
|
1694
|
+
|
|
1695
|
+
def tmx_get_file_id(self, file_path: str) -> Optional[int]:
    """Return the database id of the TMX file at file_path, or None if absent."""
    self.cursor.execute("SELECT id FROM tmx_files WHERE file_path = ?", (file_path,))
    match = self.cursor.fetchone()
    return None if match is None else match['id']
|
|
1700
|
+
|
|
1701
|
+
def tmx_get_translation_units(self, tmx_file_id: int, offset: int = 0,
                              limit: int = 50, src_lang: str = None,
                              tgt_lang: str = None, src_filter: str = None,
                              tgt_filter: str = None, ignore_case: bool = True) -> List[Dict]:
    """
    Get translation units with pagination and filtering.

    Args:
        tmx_file_id: Database id of the TMX file to read from.
        offset: Number of matching TUs to skip (pagination).
        limit: Maximum number of TUs to return.
        src_lang: Language code the source-side filter applies to (optional).
        tgt_lang: Language code the target-side filter applies to (optional).
        src_filter: Substring a source segment must contain (optional).
        tgt_filter: Substring a target segment must contain (optional).
        ignore_case: When True, substring filters compare case-insensitively.

    Returns:
        List of dicts with TU data including segments; each dict carries a
        'segments' mapping of lang -> segment dict, and the JSON columns
        'custom_attributes' / 'comments' decoded when non-empty.
    """
    # Build base query
    query = """
        SELECT tu.id as tu_db_id, tu.tu_id, tu.creation_date, tu.creation_id,
               tu.change_date, tu.change_id, tu.srclang, tu.custom_attributes, tu.comments
        FROM tmx_translation_units tu
        WHERE tu.tmx_file_id = ?
    """
    params = [tmx_file_id]

    # Add filters
    if src_filter or tgt_filter:
        # Open an EXISTS sub-query over segments (seg1) for the source side.
        # NOTE(review): this clause is opened even when only tgt_filter is
        # set; in that case seg1 is only constrained by lang (if given).
        query += """
            AND EXISTS (
                SELECT 1 FROM tmx_segments seg1
                WHERE seg1.tu_id = tu.id
        """
        if src_lang:
            query += " AND seg1.lang = ?"
            params.append(src_lang)
        if src_filter:
            if ignore_case:
                query += " AND LOWER(seg1.text) LIKE LOWER(?)"
                params.append(f"%{src_filter}%")
            else:
                query += " AND seg1.text LIKE ?"
                params.append(f"%{src_filter}%")

        if tgt_filter:
            # Second EXISTS (seg2) for the target side; it only references
            # tu.id, so nesting inside seg1's EXISTS is equivalent to a
            # sibling clause.
            query += """
                AND EXISTS (
                    SELECT 1 FROM tmx_segments seg2
                    WHERE seg2.tu_id = tu.id
            """
            if tgt_lang:
                query += " AND seg2.lang = ?"
                params.append(tgt_lang)
            if ignore_case:
                query += " AND LOWER(seg2.text) LIKE LOWER(?)"
                params.append(f"%{tgt_filter}%")
            else:
                query += " AND seg2.text LIKE ?"
                params.append(f"%{tgt_filter}%")
            query += ")"  # close seg2 EXISTS

        query += ")"  # close seg1 EXISTS

    query += " ORDER BY tu.tu_id LIMIT ? OFFSET ?"
    params.extend([limit, offset])

    self.cursor.execute(query, params)
    rows = self.cursor.fetchall()

    # Fetch segments for each TU
    result = []
    for row in rows:
        tu_data = dict(row)
        # Get segments
        self.cursor.execute("""
            SELECT lang, text, creation_date, creation_id, change_date, change_id
            FROM tmx_segments
            WHERE tu_id = ?
        """, (tu_data['tu_db_id'],))
        segments = {}
        for seg_row in self.cursor.fetchall():
            seg_dict = dict(seg_row)
            segments[seg_dict['lang']] = seg_dict

        tu_data['segments'] = segments
        # JSON-decode the optional metadata columns (left as-is when empty)
        if tu_data['custom_attributes']:
            tu_data['custom_attributes'] = json.loads(tu_data['custom_attributes'])
        if tu_data['comments']:
            tu_data['comments'] = json.loads(tu_data['comments'])

        result.append(tu_data)

    return result
|
|
1787
|
+
|
|
1788
|
+
def tmx_count_translation_units(self, tmx_file_id: int, src_lang: str = None,
                                tgt_lang: str = None, src_filter: str = None,
                                tgt_filter: str = None, ignore_case: bool = True) -> int:
    """
    Count translation units matching filters.

    Mirrors the filter-building logic of tmx_get_translation_units so that
    pagination totals agree with the listed rows; see that method for the
    meaning of each filter argument.
    """
    query = """
        SELECT COUNT(DISTINCT tu.id)
        FROM tmx_translation_units tu
        WHERE tu.tmx_file_id = ?
    """
    params = [tmx_file_id]

    # Add same filters as tmx_get_translation_units
    if src_filter or tgt_filter:
        # EXISTS over source-side segments (seg1); opened whenever either
        # filter is set (see note in tmx_get_translation_units).
        query += """
            AND EXISTS (
                SELECT 1 FROM tmx_segments seg1
                WHERE seg1.tu_id = tu.id
        """
        if src_lang:
            query += " AND seg1.lang = ?"
            params.append(src_lang)
        if src_filter:
            if ignore_case:
                query += " AND LOWER(seg1.text) LIKE LOWER(?)"
                params.append(f"%{src_filter}%")
            else:
                query += " AND seg1.text LIKE ?"
                params.append(f"%{src_filter}%")

        if tgt_filter:
            # EXISTS over target-side segments (seg2)
            query += """
                AND EXISTS (
                    SELECT 1 FROM tmx_segments seg2
                    WHERE seg2.tu_id = tu.id
            """
            if tgt_lang:
                query += " AND seg2.lang = ?"
                params.append(tgt_lang)
            if ignore_case:
                query += " AND LOWER(seg2.text) LIKE LOWER(?)"
                params.append(f"%{tgt_filter}%")
            else:
                query += " AND seg2.text LIKE ?"
                params.append(f"%{tgt_filter}%")
            query += ")"  # close seg2 EXISTS

        query += ")"  # close seg1 EXISTS

    self.cursor.execute(query, params)
    return self.cursor.fetchone()[0]
|
|
1838
|
+
|
|
1839
|
+
def tmx_update_segment(self, tmx_file_id: int, tu_id: int, lang: str, text: str):
    """
    Update one segment's text and refresh the change timestamps.

    Args:
        tmx_file_id: Database id of the TMX file.
        tu_id: The TU's id within the TMX file (not the internal row id).
        lang: Language code of the segment to update.
        text: New segment text.

    Returns:
        True when the TU was found and updated, False otherwise.
    """
    # Local import keeps the change self-contained (module header untouched)
    from datetime import timezone

    # Resolve the internal (database) TU id from the per-file tu_id
    self.cursor.execute("""
        SELECT tu.id FROM tmx_translation_units tu
        WHERE tu.tmx_file_id = ? AND tu.tu_id = ?
    """, (tmx_file_id, tu_id))
    tu_row = self.cursor.fetchone()
    if not tu_row:
        return False

    tu_db_id = tu_row['id']
    # FIX: datetime.utcnow() is deprecated since Python 3.12; use an aware
    # UTC datetime instead — the formatted TMX timestamp is identical.
    change_date = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Update segment
    self.cursor.execute("""
        UPDATE tmx_segments
        SET text = ?, change_date = ?
        WHERE tu_id = ? AND lang = ?
    """, (text, change_date, tu_db_id, lang))

    # Update TU change date
    self.cursor.execute("""
        UPDATE tmx_translation_units
        SET change_date = ?
        WHERE id = ?
    """, (change_date, tu_db_id))

    # Update file last_modified
    self.cursor.execute("""
        UPDATE tmx_files
        SET last_modified = CURRENT_TIMESTAMP
        WHERE id = ?
    """, (tmx_file_id,))

    self.connection.commit()
    return True
|
|
1876
|
+
|
|
1877
|
+
def tmx_delete_file(self, tmx_file_id: int):
    """Remove a TMX file row; FK CASCADE removes its TUs and segments."""
    self.cursor.execute("DELETE FROM tmx_files WHERE id = ?", (tmx_file_id,))
    self.connection.commit()
|
|
1881
|
+
|
|
1882
|
+
def tmx_get_file_info(self, tmx_file_id: int) -> Optional[Dict]:
    """
    Fetch metadata for one TMX file.

    Returns:
        Dict of tmx_files columns with the JSON columns ('header_data',
        'languages') decoded, or None when the id does not exist.
    """
    self.cursor.execute("""
        SELECT id, file_path, file_name, original_file_path, load_mode,
               file_size, header_data, tu_count, languages,
               created_date, last_accessed, last_modified
        FROM tmx_files
        WHERE id = ?
    """, (tmx_file_id,))
    record = self.cursor.fetchone()
    if record is None:
        return None

    info = dict(record)
    # Decode the JSON-encoded columns back into Python objects
    for key in ('header_data', 'languages'):
        info[key] = json.loads(info[key])
    return info
|
|
1899
|
+
|
|
1900
|
+
def get_database_info(self) -> Dict:
    """
    Gather basic statistics about the database.

    Returns:
        Dict with 'path', 'size_bytes', 'tm_entries' and 'size_mb'
        (file size rounded to two decimals in megabytes).
    """
    # A missing database file counts as zero bytes rather than raising.
    size_bytes = os.path.getsize(self.db_path) if os.path.exists(self.db_path) else 0

    info = {
        'path': self.db_path,
        'size_bytes': size_bytes,
        'tm_entries': self.get_tm_count(),
    }
    info['size_mb'] = round(size_bytes / (1024 * 1024), 2)
    return info
|