supervertaler 1.9.153__py3-none-any.whl → 1.9.185__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +3450 -1135
- modules/database_manager.py +313 -120
- modules/database_migrations.py +54 -7
- modules/extract_tm.py +518 -0
- modules/keyboard_shortcuts_widget.py +7 -0
- modules/mqxliff_handler.py +71 -2
- modules/project_tm.py +320 -0
- modules/superlookup.py +12 -8
- modules/tag_manager.py +20 -2
- modules/termbase_manager.py +105 -2
- modules/termview_widget.py +82 -42
- modules/theme_manager.py +41 -4
- modules/tm_metadata_manager.py +59 -13
- modules/translation_memory.py +4 -13
- modules/translation_results_panel.py +0 -7
- modules/unified_prompt_library.py +2 -2
- modules/unified_prompt_manager_qt.py +47 -18
- supervertaler-1.9.185.dist-info/METADATA +151 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/RECORD +23 -21
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/WHEEL +1 -1
- supervertaler-1.9.153.dist-info/METADATA +0 -896
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/entry_points.txt +0 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/licenses/LICENSE +0 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/top_level.txt +0 -0
modules/database_migrations.py
CHANGED
|
@@ -186,9 +186,13 @@ def run_all_migrations(db_manager) -> bool:
|
|
|
186
186
|
# Migration 3: Add display_order and forbidden fields to synonyms
|
|
187
187
|
if not migrate_synonym_fields(db_manager):
|
|
188
188
|
success = False
|
|
189
|
-
|
|
189
|
+
|
|
190
|
+
# Migration 4: Add ai_inject field to termbases
|
|
191
|
+
if not migrate_termbase_ai_inject(db_manager):
|
|
192
|
+
success = False
|
|
193
|
+
|
|
190
194
|
print("="*60)
|
|
191
|
-
|
|
195
|
+
|
|
192
196
|
return success
|
|
193
197
|
|
|
194
198
|
|
|
@@ -221,18 +225,26 @@ def check_and_migrate(db_manager) -> bool:
|
|
|
221
225
|
|
|
222
226
|
# Check if synonyms table exists
|
|
223
227
|
cursor.execute("""
|
|
224
|
-
SELECT name FROM sqlite_master
|
|
228
|
+
SELECT name FROM sqlite_master
|
|
225
229
|
WHERE type='table' AND name='termbase_synonyms'
|
|
226
230
|
""")
|
|
227
231
|
needs_synonyms_table = cursor.fetchone() is None
|
|
228
|
-
|
|
232
|
+
|
|
233
|
+
# Check if termbases table has ai_inject column
|
|
234
|
+
cursor.execute("PRAGMA table_info(termbases)")
|
|
235
|
+
termbase_columns = {row[1] for row in cursor.fetchall()}
|
|
236
|
+
needs_ai_inject = 'ai_inject' not in termbase_columns
|
|
237
|
+
|
|
229
238
|
if needs_migration:
|
|
230
239
|
print(f"⚠️ Migration needed - missing columns: {', '.join([c for c in ['project', 'client', 'term_uuid', 'note'] if c not in columns])}")
|
|
231
|
-
|
|
240
|
+
|
|
232
241
|
if needs_synonyms_table:
|
|
233
242
|
print("⚠️ Migration needed - termbase_synonyms table missing")
|
|
234
|
-
|
|
235
|
-
if
|
|
243
|
+
|
|
244
|
+
if needs_ai_inject:
|
|
245
|
+
print("⚠️ Migration needed - termbases.ai_inject column missing")
|
|
246
|
+
|
|
247
|
+
if needs_migration or needs_synonyms_table or needs_ai_inject:
|
|
236
248
|
success = run_all_migrations(db_manager)
|
|
237
249
|
if success:
|
|
238
250
|
# Generate UUIDs for terms that don't have them
|
|
@@ -316,6 +328,41 @@ def migrate_synonym_fields(db_manager) -> bool:
|
|
|
316
328
|
return False
|
|
317
329
|
|
|
318
330
|
|
|
331
|
+
def migrate_termbase_ai_inject(db_manager) -> bool:
    """
    Ensure the termbases table has an ai_inject column.

    The column is created as ``BOOLEAN DEFAULT 0`` when it is missing; when it
    is already present nothing is altered. Any exception raised along the way
    is printed together with its traceback and reported as a failure.

    Args:
        db_manager: Object exposing ``cursor`` and ``connection`` attributes
            for the termbase database

    Returns:
        True if the column exists once the function returns, False if an
        error occurred
    """
    try:
        cur = db_manager.cursor

        # Second field of each PRAGMA table_info row is the column name
        cur.execute("PRAGMA table_info(termbases)")
        existing = set()
        for info in cur.fetchall():
            existing.add(info[1])

        if 'ai_inject' in existing:
            print("✅ termbases.ai_inject column already exists")
            return True

        print("📊 Adding 'ai_inject' column to termbases...")
        cur.execute("ALTER TABLE termbases ADD COLUMN ai_inject BOOLEAN DEFAULT 0")
        db_manager.connection.commit()
        print(" ✓ Column 'ai_inject' added successfully")
        return True

    except Exception as exc:
        print(f"❌ ai_inject migration failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
|
364
|
+
|
|
365
|
+
|
|
319
366
|
def generate_missing_uuids(db_manager) -> bool:
|
|
320
367
|
"""
|
|
321
368
|
Generate UUIDs for any termbase terms that don't have them.
|
modules/extract_tm.py
ADDED
|
@@ -0,0 +1,518 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ExtractTM - Persistent TM extraction saved to .svtm files
|
|
3
|
+
|
|
4
|
+
This module implements TM extraction that saves relevant segments from existing TMs
|
|
5
|
+
to a .svtm file (SQLite database) next to the project file. Unlike the in-memory
|
|
6
|
+
ProjectTM, this persists across sessions.
|
|
7
|
+
|
|
8
|
+
File format: .svtm (Supervertaler TM) - SQLite database internally
|
|
9
|
+
Filename pattern: {ProjectName}_Extract.svtm
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import sqlite3
|
|
13
|
+
import threading
|
|
14
|
+
import os
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from difflib import SequenceMatcher
|
|
17
|
+
from typing import Dict, List, Optional, Callable, Tuple
|
|
18
|
+
import re
|
|
19
|
+
import time
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ExtractTM:
    """
    Persistent TM extraction saved to disk as .svtm file.

    Extracts relevant segments from selected TMs and saves them to a SQLite
    database file next to the project. This persists across sessions, so
    extraction only needs to happen once per project.

    All access to the SQLite connection goes through ``self.lock`` because the
    connection is opened with ``check_same_thread=False``.

    Usage:
        extract_tm = ExtractTM()

        # Extract and save
        extract_tm.extract_and_save(
            output_path="MyProject_Extract.svtm",
            db_manager=db_manager,
            project_segments=segments,
            tm_ids=['tm1', 'tm2'],
            threshold=0.80,
            progress_callback=lambda cur, total, msg: print(f"{cur}/{total} - {msg}")
        )

        # Load existing extraction
        extract_tm.load("MyProject_Extract.svtm")

        # Search
        matches = extract_tm.search("source text")
    """

    SCHEMA_VERSION = 1

    # Metadata keys mirrored into ``self.metadata`` and the fallback value
    # used when a key is absent from the file.
    _METADATA_DEFAULTS = (
        ('project_name', 'Unknown'),
        ('source_lang', ''),
        ('target_lang', ''),
        ('threshold', '0.80'),
        ('tm_ids', ''),
        ('tm_names', ''),
        ('created_at', ''),
        ('segment_count', '0'),
        ('extraction_time', ''),
        ('failed_searches', '0'),
        ('last_error', ''),
    )

    def __init__(self):
        """Initialize ExtractTM (not connected to any file yet)"""
        self.conn = None
        self.file_path = None
        self.lock = threading.Lock()
        self.is_loaded = False
        self.segment_count = 0
        self.metadata = {}

    def _create_schema(self):
        """Create the metadata/segments tables, the index and the FTS table"""
        with self.lock:
            cursor = self.conn.cursor()

            # Metadata table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS metadata (
                    key TEXT PRIMARY KEY,
                    value TEXT
                )
            """)

            # Segments table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS segments (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    source_text TEXT NOT NULL,
                    target_text TEXT NOT NULL,
                    source_lower TEXT NOT NULL,
                    tm_id TEXT,
                    tm_name TEXT,
                    similarity REAL,
                    original_id INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            # Indexes
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_source_lower ON segments(source_lower)")

            # FTS5 for fuzzy text search. The rebuild and the fuzzy search
            # are already best-effort, so a SQLite build without FTS5 only
            # costs fuzzy matching instead of aborting the whole extraction.
            try:
                cursor.execute("""
                    CREATE VIRTUAL TABLE IF NOT EXISTS segments_fts USING fts5(
                        source_text,
                        content=segments,
                        content_rowid=id
                    )
                """)
            except sqlite3.OperationalError:
                pass

            # Store schema version
            cursor.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES ('schema_version', ?)",
                           (str(self.SCHEMA_VERSION),))

            self.conn.commit()

    def _set_metadata(self, key: str, value: str):
        """Store metadata in the database"""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)", (key, value))
            self.conn.commit()

    def _get_metadata(self, key: str, default: str = None) -> Optional[str]:
        """Retrieve metadata from the database"""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute("SELECT value FROM metadata WHERE key = ?", (key,))
            row = cursor.fetchone()
            return row[0] if row else default

    def _read_all_metadata(self) -> Dict[str, str]:
        """Read every known metadata key from the open file, applying defaults"""
        return {key: self._get_metadata(key, default)
                for key, default in self._METADATA_DEFAULTS}

    @staticmethod
    def _collect_unique_sources(project_segments: List) -> Dict[str, str]:
        """Map lower-cased, stripped source text to its first stripped spelling"""
        unique_sources = {}
        for seg in project_segments:
            # Try both 'source' and 'source_text' attributes (different segment types use different names)
            source = getattr(seg, 'source', None) or getattr(seg, 'source_text', None)
            if source and source.strip():
                unique_sources.setdefault(source.strip().lower(), source.strip())
        return unique_sources

    def _insert_match(self, cursor, match: Dict, seen_sources: set) -> bool:
        """Insert one TM match unless it is incomplete or a duplicate source.

        The caller must hold ``self.lock``. Returns True when a row was added.
        """
        match_source = match.get('source_text', '')
        match_target = match.get('target_text', '')
        if not match_source or not match_target:
            return False

        # Deduplicate on the normalized source text
        source_key = match_source.strip().lower()
        if source_key in seen_sources:
            return False
        seen_sources.add(source_key)

        cursor.execute("""
            INSERT INTO segments (source_text, target_text, source_lower,
                                  tm_id, tm_name, similarity, original_id)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, (
            match_source,
            match_target,
            source_key,
            match.get('tm_id'),
            # A match may carry tm_name=None; store a usable label so that
            # search() can append its " (Extract)" suffix.
            match.get('tm_name') or 'Unknown',
            match.get('similarity', 0),
            match.get('id')
        ))
        return True

    def extract_and_save(
        self,
        output_path: str,
        db_manager,
        project_segments: List,
        tm_ids: List[str],
        tm_names: List[str] = None,
        source_lang: str = None,
        target_lang: str = None,
        threshold: float = 0.80,
        project_name: str = None,
        progress_callback: Optional[Callable[[int, int, str], None]] = None
    ) -> Tuple[int, str]:
        """
        Extract segments from TMs and save to .svtm file.

        Any existing file at ``output_path`` is deleted and recreated. TM
        lookups are best-effort: a lookup that raises is skipped, counted in
        the ``failed_searches`` metadata entry, and its message is kept in
        ``last_error``. ``self.metadata`` is refreshed before returning, so
        ``get_info()`` and ``export_to_tmx()`` work without a separate
        ``load()`` call.

        Args:
            output_path: Path for the .svtm file
            db_manager: The main database manager with TM data; must provide
                ``search_fuzzy_matches``
            project_segments: List of project segments to find matches for
            tm_ids: List of TM IDs to extract from (non-string IDs are
                converted with ``str`` for the metadata entry)
            tm_names: List of TM names (for display/metadata)
            source_lang: Source language filter
            target_lang: Target language filter
            threshold: Minimum similarity threshold (0.0-1.0)
            project_name: Project name for metadata
            progress_callback: Optional callback(current, total, message)

        Returns:
            Tuple of (segments_extracted, output_path)
        """
        start_time = time.time()

        # Close any existing connection
        self.close()

        # Remove existing file if present
        if os.path.exists(output_path):
            os.remove(output_path)

        # Create new database file
        self.file_path = output_path
        self.conn = sqlite3.connect(output_path, check_same_thread=False)
        self.conn.row_factory = sqlite3.Row

        # Create schema
        self._create_schema()

        # Store metadata
        tm_id_strs = [str(tm_id) for tm_id in tm_ids] if tm_ids else []
        tm_name_strs = [str(name) for name in tm_names] if tm_names else []
        self._set_metadata('project_name', project_name or 'Unknown')
        self._set_metadata('source_lang', source_lang or '')
        self._set_metadata('target_lang', target_lang or '')
        self._set_metadata('threshold', str(threshold))
        self._set_metadata('tm_ids', ','.join(tm_id_strs))
        self._set_metadata('tm_names', ','.join(tm_name_strs))
        self._set_metadata('created_at', time.strftime('%Y-%m-%d %H:%M:%S'))

        # Get unique source texts from project
        unique_sources = {}
        if project_segments and db_manager and tm_ids:
            unique_sources = self._collect_unique_sources(project_segments)

        total = len(unique_sources)
        if total == 0:
            # Nothing to search for: leave an empty but valid extract behind
            self.metadata = self._read_all_metadata()
            self.is_loaded = True
            self.segment_count = 0
            return 0, output_path

        extracted_count = 0
        failed_searches = 0
        last_error = ''
        seen_sources = set()
        cursor = self.conn.cursor()

        tm_names_str = ', '.join(tm_name_strs) if tm_name_strs else 'Selected TMs'

        for i, source_text in enumerate(unique_sources.values()):
            if progress_callback:
                progress_callback(i, total, f"Searching: {tm_names_str}")

            try:
                # Search TMs for fuzzy matches
                matches = db_manager.search_fuzzy_matches(
                    source_text,
                    tm_ids=tm_ids,
                    threshold=threshold,
                    max_results=10,
                    source_lang=source_lang,
                    target_lang=target_lang,
                    bidirectional=True
                )

                with self.lock:
                    for match in matches or []:
                        if self._insert_match(cursor, match, seen_sources):
                            extracted_count += 1

            except Exception as e:
                # Continue on errors, but keep a trace of them instead of
                # discarding them silently.
                failed_searches += 1
                last_error = f"{type(e).__name__}: {e}"

        # Commit and rebuild FTS
        with self.lock:
            self.conn.commit()
            try:
                cursor.execute("INSERT INTO segments_fts(segments_fts) VALUES('rebuild')")
                self.conn.commit()
            except sqlite3.Error:
                # No FTS table (see _create_schema): exact matching still works
                pass

        # Update metadata with final count
        elapsed = time.time() - start_time
        self._set_metadata('segment_count', str(extracted_count))
        self._set_metadata('extraction_time', f"{elapsed:.1f}s")
        self._set_metadata('failed_searches', str(failed_searches))
        self._set_metadata('last_error', last_error)

        if progress_callback:
            progress_callback(total, total, f"Complete: {extracted_count} segments")

        self.metadata = self._read_all_metadata()
        self.is_loaded = True
        self.segment_count = extracted_count

        return extracted_count, output_path

    def load(self, file_path: str) -> bool:
        """
        Load an existing .svtm file.

        On failure the connection is closed again so the instance is left in
        a clean "not loaded" state.

        Args:
            file_path: Path to the .svtm file

        Returns:
            True if loaded successfully, False otherwise
        """
        if not os.path.exists(file_path):
            return False

        try:
            # Close existing connection
            self.close()

            self.file_path = file_path
            self.conn = sqlite3.connect(file_path, check_same_thread=False)
            self.conn.row_factory = sqlite3.Row

            # Load metadata
            self.metadata = self._read_all_metadata()

            # Get actual segment count
            with self.lock:
                cursor = self.conn.cursor()
                cursor.execute("SELECT COUNT(*) FROM segments")
                self.segment_count = cursor.fetchone()[0]

            self.is_loaded = True
            return True

        except sqlite3.Error:
            # Not a readable .svtm database; do not keep a half-open handle
            self.close()
            return False

    def _row_to_match(self, row, similarity: float) -> Dict:
        """Convert a segments row into the match dictionary returned by search()"""
        return {
            'source_text': row['source_text'],
            'target_text': row['target_text'],
            'tm_id': row['tm_id'],
            # tm_name is nullable in the schema; never concatenate None
            'tm_name': (row['tm_name'] or 'Unknown') + ' (Extract)',
            'similarity': similarity,
            'match_pct': int(similarity * 100),
            'id': row['original_id']
        }

    def search(self, source_text: str, max_results: int = 5) -> List[Dict]:
        """
        Search ExtractTM for matches.

        A case-insensitive exact match on the stripped text is returned alone
        with similarity 1.0. Otherwise FTS candidates are re-scored with
        ``_calculate_similarity`` and those scoring at least 0.5 are returned,
        best first.

        Args:
            source_text: Source text to search for
            max_results: Maximum results to return

        Returns:
            List of match dictionaries
        """
        if not self.is_loaded or not source_text or not self.conn:
            return []

        source_lower = source_text.strip().lower()

        with self.lock:
            cursor = self.conn.cursor()

            # 1. Exact match
            cursor.execute("SELECT * FROM segments WHERE source_lower = ? LIMIT 1", (source_lower,))
            exact = cursor.fetchone()
            if exact:
                return [self._row_to_match(exact, 1.0)]

            # 2. FTS5 fuzzy search
            clean_text = re.sub(r'[^\w\s]', ' ', source_text)
            search_terms = [t for t in clean_text.split() if len(t) > 2]
            if not search_terms:
                return []

            fts_query = ' OR '.join(f'"{term}"' for term in search_terms[:10])
            try:
                cursor.execute("""
                    SELECT s.*, bm25(segments_fts) as rank
                    FROM segments s
                    JOIN segments_fts ON s.id = segments_fts.rowid
                    WHERE segments_fts MATCH ?
                    ORDER BY rank
                    LIMIT ?
                """, (fts_query, max_results * 3))
                candidates = cursor.fetchall()
            except sqlite3.Error:
                # FTS table missing or query rejected: no fuzzy results
                candidates = []

        results = []
        for row in candidates:
            similarity = self._calculate_similarity(source_text, row['source_text'])
            if similarity >= 0.5:
                results.append(self._row_to_match(row, similarity))

        results.sort(key=lambda x: x['similarity'], reverse=True)
        return results[:max_results]

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Calculate similarity between two texts, ignoring <...> tags and case"""
        clean1 = re.sub(r'<[^>]+>', '', text1).lower()
        clean2 = re.sub(r'<[^>]+>', '', text2).lower()
        return SequenceMatcher(None, clean1, clean2).ratio()

    def export_to_tmx(self, output_path: str, progress_callback: Optional[Callable[[int, int], None]] = None) -> int:
        """
        Export the ExtractTM to a TMX file.

        Languages come from the stored metadata; 'en' / 'nl' are used when a
        language is missing or stored as an empty string.

        Args:
            output_path: Path for the TMX file
            progress_callback: Optional callback(current, total)

        Returns:
            Number of segments exported
        """
        if not self.is_loaded or not self.conn:
            return 0

        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute("SELECT * FROM segments")
            rows = cursor.fetchall()

        if not rows:
            return 0

        # `or` rather than a .get() default: the languages are stored as ''
        # when unknown, which would otherwise produce xml:lang="".
        source_lang = self._escape_xml(self.metadata.get('source_lang') or 'en')
        target_lang = self._escape_xml(self.metadata.get('target_lang') or 'nl')

        # Build TMX content
        tmx_header = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<!DOCTYPE tmx SYSTEM "tmx14.dtd">\n'
            '<tmx version="1.4">\n'
            '  <header creationtool="Supervertaler" creationtoolversion="1.0"\n'
            '          datatype="plaintext" segtype="sentence"\n'
            f'          adminlang="en" srclang="{source_lang}" o-tmf="Supervertaler">\n'
            '  </header>\n'
            '  <body>\n'
        )
        tmx_footer = '  </body>\n</tmx>\n'

        total = len(rows)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(tmx_header)

            for i, row in enumerate(rows):
                if progress_callback and i % 100 == 0:
                    progress_callback(i, total)

                source = self._escape_xml(row['source_text'])
                target = self._escape_xml(row['target_text'])

                f.write(
                    '    <tu>\n'
                    f'      <tuv xml:lang="{source_lang}">\n'
                    f'        <seg>{source}</seg>\n'
                    '      </tuv>\n'
                    f'      <tuv xml:lang="{target_lang}">\n'
                    f'        <seg>{target}</seg>\n'
                    '      </tuv>\n'
                    '    </tu>\n'
                )

            f.write(tmx_footer)

        if progress_callback:
            progress_callback(total, total)

        return total

    def _escape_xml(self, text: str) -> str:
        """Escape the five XML special characters (empty/None becomes '')"""
        if not text:
            return ''
        # '&' must be replaced first so the entities added below stay intact
        return (text
                .replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;')
                .replace('"', '&quot;')
                .replace("'", '&apos;'))

    def get_info(self) -> Dict:
        """Get information about the loaded ExtractTM"""
        # NOTE(review): metadata is spread last, so its string
        # 'segment_count' replaces the integer self.segment_count in the
        # result; kept as-is because callers may rely on it - confirm.
        return {
            'file_path': self.file_path,
            'is_loaded': self.is_loaded,
            'segment_count': self.segment_count,
            **self.metadata
        }

    def close(self):
        """Close the database connection"""
        if self.conn:
            self.conn.close()
            self.conn = None
        self.is_loaded = False
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def get_extract_path(project_path: str) -> str:
    """
    Build the Extract TM location that belongs to a project file.

    The result lives in the same directory as the project and is named after
    the project file with its extension replaced by ``_Extract.svtm``.

    Args:
        project_path: Path to the project file (.sproj)

    Returns:
        Path to the Extract TM file (.svtm)
    """
    folder, filename = os.path.split(project_path)
    stem, _extension = os.path.splitext(filename)
    return os.path.join(folder, f"{stem}_Extract.svtm")
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def extract_exists(project_path: str) -> bool:
    """Report whether the Extract TM file for the given project is on disk"""
    candidate = get_extract_path(project_path)
    return os.path.exists(candidate)
|
|
@@ -301,6 +301,10 @@ class KeyboardShortcutsWidget(QWidget):
|
|
|
301
301
|
|
|
302
302
|
def load_shortcuts(self):
|
|
303
303
|
"""Load shortcuts into the table"""
|
|
304
|
+
# CRITICAL: Disable sorting during table modifications to prevent
|
|
305
|
+
# items from becoming disassociated from their rows (causes vanishing text bug)
|
|
306
|
+
self.table.setSortingEnabled(False)
|
|
307
|
+
|
|
304
308
|
self.table.setRowCount(0)
|
|
305
309
|
|
|
306
310
|
all_shortcuts = self.manager.get_all_shortcuts()
|
|
@@ -362,6 +366,9 @@ class KeyboardShortcutsWidget(QWidget):
|
|
|
362
366
|
self.table.setItem(row, 4, status_item)
|
|
363
367
|
|
|
364
368
|
row += 1
|
|
369
|
+
|
|
370
|
+
# Re-enable sorting after all modifications are complete
|
|
371
|
+
self.table.setSortingEnabled(True)
|
|
365
372
|
|
|
366
373
|
def _on_enabled_changed(self, state):
|
|
367
374
|
"""Handle checkbox state change for enabling/disabling shortcuts"""
|