supervertaler 1.9.153__py3-none-any.whl → 1.9.189__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of supervertaler has been flagged as possibly problematic.

@@ -186,9 +186,13 @@ def run_all_migrations(db_manager) -> bool:
     # Migration 3: Add display_order and forbidden fields to synonyms
     if not migrate_synonym_fields(db_manager):
         success = False
-
+
+    # Migration 4: Add ai_inject field to termbases
+    if not migrate_termbase_ai_inject(db_manager):
+        success = False
+
     print("="*60)
-
+
     return success
 
 
@@ -221,18 +225,26 @@ def check_and_migrate(db_manager) -> bool:
 
     # Check if synonyms table exists
     cursor.execute("""
-        SELECT name FROM sqlite_master
+        SELECT name FROM sqlite_master
         WHERE type='table' AND name='termbase_synonyms'
     """)
     needs_synonyms_table = cursor.fetchone() is None
-
+
+    # Check if termbases table has ai_inject column
+    cursor.execute("PRAGMA table_info(termbases)")
+    termbase_columns = {row[1] for row in cursor.fetchall()}
+    needs_ai_inject = 'ai_inject' not in termbase_columns
+
     if needs_migration:
         print(f"⚠️ Migration needed - missing columns: {', '.join([c for c in ['project', 'client', 'term_uuid', 'note'] if c not in columns])}")
-
+
     if needs_synonyms_table:
         print("⚠️ Migration needed - termbase_synonyms table missing")
-
-    if needs_migration or needs_synonyms_table:
+
+    if needs_ai_inject:
+        print("⚠️ Migration needed - termbases.ai_inject column missing")
+
+    if needs_migration or needs_synonyms_table or needs_ai_inject:
         success = run_all_migrations(db_manager)
         if success:
             # Generate UUIDs for terms that don't have them
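
A note on the detection idiom this hunk relies on: PRAGMA table_info(...) returns one row per column, shaped as (cid, name, type, notnull, dflt_value, pk), so index 1 is the column name. That is why both checks build a set of names from row[1]. A minimal standalone illustration against a throwaway in-memory database:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE termbases (id INTEGER PRIMARY KEY, name TEXT)")

# Each PRAGMA row is (cid, name, type, notnull, dflt_value, pk); index 1 is the name
columns = {row[1] for row in conn.execute("PRAGMA table_info(termbases)")}
print(columns)                        # {'id', 'name'} (order may vary)
print('ai_inject' not in columns)     # True -> migration needed
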
@@ -316,6 +328,41 @@ def migrate_synonym_fields(db_manager) -> bool:
         return False
 
 
+def migrate_termbase_ai_inject(db_manager) -> bool:
+    """
+    Add ai_inject column to termbases table.
+    When enabled, the termbase's terms will be injected into LLM translation prompts.
+
+    Args:
+        db_manager: DatabaseManager instance
+
+    Returns:
+        True if migration successful
+    """
+    try:
+        cursor = db_manager.cursor
+
+        # Check which columns exist
+        cursor.execute("PRAGMA table_info(termbases)")
+        columns = {row[1] for row in cursor.fetchall()}
+
+        if 'ai_inject' not in columns:
+            print("📊 Adding 'ai_inject' column to termbases...")
+            cursor.execute("ALTER TABLE termbases ADD COLUMN ai_inject BOOLEAN DEFAULT 0")
+            db_manager.connection.commit()
+            print(" ✓ Column 'ai_inject' added successfully")
+        else:
+            print("✅ termbases.ai_inject column already exists")
+
+        return True
+
+    except Exception as e:
+        print(f"❌ ai_inject migration failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
 def generate_missing_uuids(db_manager) -> bool:
     """
     Generate UUIDs for any termbase terms that don't have them.
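
The new migration follows the same idempotent pattern as the earlier ones: probe the schema, ALTER only when the column is missing, and succeed either way. A quick sanity-check sketch of that behavior; the SimpleNamespace stub is hypothetical and merely exposes the two attributes (cursor, connection) the migration actually touches, and migrate_termbase_ai_inject is assumed to be imported from the migrations module (whose path this diff does not show):

import sqlite3
from types import SimpleNamespace
# migrate_termbase_ai_inject: import from the migrations module shown above (path not in this diff)

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE termbases (id INTEGER PRIMARY KEY, name TEXT)")
conn.execute("INSERT INTO termbases (name) VALUES ('Legal NL-EN')")

# Hypothetical stand-in for DatabaseManager; only .cursor and .connection are used
db_manager = SimpleNamespace(cursor=conn.cursor(), connection=conn)

assert migrate_termbase_ai_inject(db_manager)   # first run adds the column
assert migrate_termbase_ai_inject(db_manager)   # second run is a no-op

# Pre-existing rows read back DEFAULT 0, i.e. AI injection starts disabled
print(conn.execute("SELECT name, ai_inject FROM termbases").fetchall())
# [('Legal NL-EN', 0)]
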
modules/extract_tm.py ADDED
@@ -0,0 +1,518 @@
+"""
+ExtractTM - Persistent TM extraction saved to .svtm files
+
+This module implements TM extraction that saves relevant segments from existing TMs
+to a .svtm file (SQLite database) next to the project file. Unlike the in-memory
+ProjectTM, this persists across sessions.
+
+File format: .svtm (Supervertaler TM) - SQLite database internally
+Filename pattern: {ProjectName}_Extract.svtm
+"""
+
+import sqlite3
+import threading
+import os
+from pathlib import Path
+from difflib import SequenceMatcher
+from typing import Dict, List, Optional, Callable, Tuple
+import re
+import time
+
+
+class ExtractTM:
+    """
+    Persistent TM extraction saved to disk as .svtm file.
+
+    Extracts relevant segments from selected TMs and saves them to a SQLite
+    database file next to the project. This persists across sessions, so
+    extraction only needs to happen once per project.
+
+    Usage:
+        extract_tm = ExtractTM()
+
+        # Extract and save
+        extract_tm.extract_and_save(
+            output_path="MyProject_Extract.svtm",
+            db_manager=db_manager,
+            project_segments=segments,
+            tm_ids=['tm1', 'tm2'],
+            threshold=0.80,
+            progress_callback=lambda cur, total, msg: print(f"{cur}/{total} - {msg}")
+        )
+
+        # Load existing extraction
+        extract_tm.load("MyProject_Extract.svtm")
+
+        # Search
+        matches = extract_tm.search("source text")
+    """
+
+    SCHEMA_VERSION = 1
+
+    def __init__(self):
+        """Initialize ExtractTM (not connected to any file yet)"""
+        self.conn = None
+        self.file_path = None
+        self.lock = threading.Lock()
+        self.is_loaded = False
+        self.segment_count = 0
+        self.metadata = {}
+
+    def _create_schema(self):
+        """Create the database schema"""
+        with self.lock:
+            cursor = self.conn.cursor()
+
+            # Metadata table
+            cursor.execute("""
+                CREATE TABLE IF NOT EXISTS metadata (
+                    key TEXT PRIMARY KEY,
+                    value TEXT
+                )
+            """)
+
+            # Segments table
+            cursor.execute("""
+                CREATE TABLE IF NOT EXISTS segments (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    source_text TEXT NOT NULL,
+                    target_text TEXT NOT NULL,
+                    source_lower TEXT NOT NULL,
+                    tm_id TEXT,
+                    tm_name TEXT,
+                    similarity REAL,
+                    original_id INTEGER,
+                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                )
+            """)
+
+            # Indexes
+            cursor.execute("CREATE INDEX IF NOT EXISTS idx_source_lower ON segments(source_lower)")
+
+            # FTS5 for fuzzy text search
+            cursor.execute("""
+                CREATE VIRTUAL TABLE IF NOT EXISTS segments_fts USING fts5(
+                    source_text,
+                    content=segments,
+                    content_rowid=id
+                )
+            """)
+
+            # Store schema version
+            cursor.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES ('schema_version', ?)",
+                           (str(self.SCHEMA_VERSION),))
+
+            self.conn.commit()
+
+    def _set_metadata(self, key: str, value: str):
+        """Store metadata in the database"""
+        with self.lock:
+            cursor = self.conn.cursor()
+            cursor.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)", (key, value))
+            self.conn.commit()
+
+    def _get_metadata(self, key: str, default: str = None) -> Optional[str]:
+        """Retrieve metadata from the database"""
+        with self.lock:
+            cursor = self.conn.cursor()
+            cursor.execute("SELECT value FROM metadata WHERE key = ?", (key,))
+            row = cursor.fetchone()
+            return row[0] if row else default
+
+    def extract_and_save(
+        self,
+        output_path: str,
+        db_manager,
+        project_segments: List,
+        tm_ids: List[str],
+        tm_names: List[str] = None,
+        source_lang: str = None,
+        target_lang: str = None,
+        threshold: float = 0.80,
+        project_name: str = None,
+        progress_callback: Optional[Callable[[int, int, str], None]] = None
+    ) -> Tuple[int, str]:
+        """
+        Extract segments from TMs and save to .svtm file.
+
+        Args:
+            output_path: Path for the .svtm file
+            db_manager: The main database manager with TM data
+            project_segments: List of project segments to find matches for
+            tm_ids: List of TM IDs to extract from
+            tm_names: List of TM names (for display/metadata)
+            source_lang: Source language filter
+            target_lang: Target language filter
+            threshold: Minimum similarity threshold (0.0-1.0)
+            project_name: Project name for metadata
+            progress_callback: Optional callback(current, total, message)
+
+        Returns:
+            Tuple of (segments_extracted, output_path)
+        """
+        start_time = time.time()
+
+        # Close any existing connection
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+
+        # Remove existing file if present
+        if os.path.exists(output_path):
+            os.remove(output_path)
+
+        # Create new database file
+        self.file_path = output_path
+        self.conn = sqlite3.connect(output_path, check_same_thread=False)
+        self.conn.row_factory = sqlite3.Row
+
+        # Create schema
+        self._create_schema()
+
+        # Store metadata
+        self._set_metadata('project_name', project_name or 'Unknown')
+        self._set_metadata('source_lang', source_lang or '')
+        self._set_metadata('target_lang', target_lang or '')
+        self._set_metadata('threshold', str(threshold))
+        self._set_metadata('tm_ids', ','.join(tm_ids) if tm_ids else '')
+        self._set_metadata('tm_names', ','.join(tm_names) if tm_names else '')
+        self._set_metadata('created_at', time.strftime('%Y-%m-%d %H:%M:%S'))
+
+        if not project_segments or not db_manager or not tm_ids:
+            self.is_loaded = True
+            self.segment_count = 0
+            return 0, output_path
+
+        # Get unique source texts from project
+        unique_sources = {}
+        for seg in project_segments:
+            # Try both 'source' and 'source_text' attributes (different segment types use different names)
+            source = getattr(seg, 'source', None) or getattr(seg, 'source_text', None)
+            if source and source.strip():
+                key = source.strip().lower()
+                if key not in unique_sources:
+                    unique_sources[key] = source.strip()
+
+        total = len(unique_sources)
+        if total == 0:
+            self.is_loaded = True
+            self.segment_count = 0
+            return 0, output_path
+
+        extracted_count = 0
+        seen_sources = set()
+        cursor = self.conn.cursor()
+
+        tm_names_str = ', '.join(tm_names) if tm_names else 'Selected TMs'
+
+        for i, (key, source_text) in enumerate(unique_sources.items()):
+            if progress_callback:
+                progress_callback(i, total, f"Searching: {tm_names_str}")
+
+            try:
+                # Search TMs for fuzzy matches
+                matches = db_manager.search_fuzzy_matches(
+                    source_text,
+                    tm_ids=tm_ids,
+                    threshold=threshold,
+                    max_results=10,
+                    source_lang=source_lang,
+                    target_lang=target_lang,
+                    bidirectional=True
+                )
+
+                for match in matches:
+                    match_source = match.get('source_text', '')
+                    match_target = match.get('target_text', '')
+
+                    if not match_source or not match_target:
+                        continue
+
+                    # Deduplicate
+                    source_key = match_source.strip().lower()
+                    if source_key in seen_sources:
+                        continue
+                    seen_sources.add(source_key)
+
+                    cursor.execute("""
+                        INSERT INTO segments (source_text, target_text, source_lower,
+                                              tm_id, tm_name, similarity, original_id)
+                        VALUES (?, ?, ?, ?, ?, ?, ?)
+                    """, (
+                        match_source,
+                        match_target,
+                        source_key,
+                        match.get('tm_id'),
+                        match.get('tm_name', 'Unknown'),
+                        match.get('similarity', 0),
+                        match.get('id')
+                    ))
+                    extracted_count += 1
+
+            except Exception as e:
+                pass  # Continue on errors
+
+        # Commit and rebuild FTS
+        self.conn.commit()
+
+        try:
+            cursor.execute("INSERT INTO segments_fts(segments_fts) VALUES('rebuild')")
+            self.conn.commit()
+        except Exception:
+            pass
+
+        # Update metadata with final count
+        elapsed = time.time() - start_time
+        self._set_metadata('segment_count', str(extracted_count))
+        self._set_metadata('extraction_time', f"{elapsed:.1f}s")
+
+        if progress_callback:
+            progress_callback(total, total, f"Complete: {extracted_count} segments")
+
+        self.is_loaded = True
+        self.segment_count = extracted_count
+
+        return extracted_count, output_path
+
+    def load(self, file_path: str) -> bool:
+        """
+        Load an existing .svtm file.
+
+        Args:
+            file_path: Path to the .svtm file
+
+        Returns:
+            True if loaded successfully, False otherwise
+        """
+        if not os.path.exists(file_path):
+            return False
+
+        try:
+            # Close existing connection
+            if self.conn:
+                self.conn.close()
+
+            self.file_path = file_path
+            self.conn = sqlite3.connect(file_path, check_same_thread=False)
+            self.conn.row_factory = sqlite3.Row
+
+            # Load metadata
+            self.metadata = {
+                'project_name': self._get_metadata('project_name', 'Unknown'),
+                'source_lang': self._get_metadata('source_lang', ''),
+                'target_lang': self._get_metadata('target_lang', ''),
+                'threshold': self._get_metadata('threshold', '0.80'),
+                'tm_ids': self._get_metadata('tm_ids', ''),
+                'tm_names': self._get_metadata('tm_names', ''),
+                'created_at': self._get_metadata('created_at', ''),
+                'segment_count': self._get_metadata('segment_count', '0'),
+                'extraction_time': self._get_metadata('extraction_time', ''),
+            }
+
+            # Get actual segment count
+            cursor = self.conn.cursor()
+            cursor.execute("SELECT COUNT(*) FROM segments")
+            self.segment_count = cursor.fetchone()[0]
+
+            self.is_loaded = True
+            return True
+
+        except Exception as e:
+            self.is_loaded = False
+            return False
+
+    def search(self, source_text: str, max_results: int = 5) -> List[Dict]:
+        """
+        Search ExtractTM for matches.
+
+        Args:
+            source_text: Source text to search for
+            max_results: Maximum results to return
+
+        Returns:
+            List of match dictionaries
+        """
+        if not self.is_loaded or not source_text or not self.conn:
+            return []
+
+        source_lower = source_text.strip().lower()
+        results = []
+
+        with self.lock:
+            cursor = self.conn.cursor()
+
+            # 1. Exact match
+            cursor.execute("SELECT * FROM segments WHERE source_lower = ? LIMIT 1", (source_lower,))
+            exact = cursor.fetchone()
+
+            if exact:
+                results.append({
+                    'source_text': exact['source_text'],
+                    'target_text': exact['target_text'],
+                    'tm_id': exact['tm_id'],
+                    'tm_name': exact['tm_name'] + ' (Extract)',
+                    'similarity': 1.0,
+                    'match_pct': 100,
+                    'id': exact['original_id']
+                })
+                return results
+
+            # 2. FTS5 fuzzy search
+            try:
+                clean_text = re.sub(r'[^\w\s]', ' ', source_text)
+                search_terms = [t for t in clean_text.split() if len(t) > 2]
+
+                if search_terms:
+                    fts_query = ' OR '.join(f'"{term}"' for term in search_terms[:10])
+
+                    cursor.execute("""
+                        SELECT s.*, bm25(segments_fts) as rank
+                        FROM segments s
+                        JOIN segments_fts ON s.id = segments_fts.rowid
+                        WHERE segments_fts MATCH ?
+                        ORDER BY rank
+                        LIMIT ?
+                    """, (fts_query, max_results * 3))
+
+                    candidates = cursor.fetchall()
+
+                    for row in candidates:
+                        similarity = self._calculate_similarity(source_text, row['source_text'])
+                        if similarity >= 0.5:
+                            results.append({
+                                'source_text': row['source_text'],
+                                'target_text': row['target_text'],
+                                'tm_id': row['tm_id'],
+                                'tm_name': row['tm_name'] + ' (Extract)',
+                                'similarity': similarity,
+                                'match_pct': int(similarity * 100),
+                                'id': row['original_id']
+                            })
+
+                    results.sort(key=lambda x: x['similarity'], reverse=True)
+                    results = results[:max_results]
+
+            except Exception:
+                pass
+
+        return results
+
+    def _calculate_similarity(self, text1: str, text2: str) -> float:
+        """Calculate similarity between two texts"""
+        clean1 = re.sub(r'<[^>]+>', '', text1).lower()
+        clean2 = re.sub(r'<[^>]+>', '', text2).lower()
+        return SequenceMatcher(None, clean1, clean2).ratio()
+
+    def export_to_tmx(self, output_path: str, progress_callback: Optional[Callable[[int, int], None]] = None) -> int:
+        """
+        Export the ExtractTM to a TMX file.
+
+        Args:
+            output_path: Path for the TMX file
+            progress_callback: Optional callback(current, total)
+
+        Returns:
+            Number of segments exported
+        """
+        if not self.is_loaded or not self.conn:
+            return 0
+
+        with self.lock:
+            cursor = self.conn.cursor()
+            cursor.execute("SELECT * FROM segments")
+            rows = cursor.fetchall()
+
+        if not rows:
+            return 0
+
+        source_lang = self.metadata.get('source_lang', 'en')
+        target_lang = self.metadata.get('target_lang', 'nl')
+
+        # Build TMX content
+        tmx_header = f'''<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE tmx SYSTEM "tmx14.dtd">
+<tmx version="1.4">
+  <header creationtool="Supervertaler" creationtoolversion="1.0"
+          datatype="plaintext" segtype="sentence"
+          adminlang="en" srclang="{source_lang}" o-tmf="Supervertaler">
+  </header>
+  <body>
+'''
+        tmx_footer = '''  </body>
+</tmx>
+'''
+
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(tmx_header)
+
+            for i, row in enumerate(rows):
+                if progress_callback and i % 100 == 0:
+                    progress_callback(i, len(rows))
+
+                source = self._escape_xml(row['source_text'])
+                target = self._escape_xml(row['target_text'])
+
+                tu = f'''    <tu>
+      <tuv xml:lang="{source_lang}">
+        <seg>{source}</seg>
+      </tuv>
+      <tuv xml:lang="{target_lang}">
+        <seg>{target}</seg>
+      </tuv>
+    </tu>
+'''
+                f.write(tu)
+
+            f.write(tmx_footer)
+
+        if progress_callback:
+            progress_callback(len(rows), len(rows))
+
+        return len(rows)
+
+    def _escape_xml(self, text: str) -> str:
+        """Escape XML special characters"""
+        if not text:
+            return ''
+        return (text
+                .replace('&', '&amp;')
+                .replace('<', '&lt;')
+                .replace('>', '&gt;')
+                .replace('"', '&quot;')
+                .replace("'", '&apos;'))
+
+    def get_info(self) -> Dict:
+        """Get information about the loaded ExtractTM"""
+        return {
+            'file_path': self.file_path,
+            'is_loaded': self.is_loaded,
+            'segment_count': self.segment_count,
+            **self.metadata
+        }
+
+    def close(self):
+        """Close the database connection"""
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+        self.is_loaded = False
+
+
+def get_extract_path(project_path: str) -> str:
+    """
+    Get the expected Extract TM path for a project.
+
+    Args:
+        project_path: Path to the project file (.sproj)
+
+    Returns:
+        Path to the Extract TM file (.svtm)
+    """
+    project_dir = os.path.dirname(project_path)
+    project_name = os.path.splitext(os.path.basename(project_path))[0]
+    return os.path.join(project_dir, f"{project_name}_Extract.svtm")
+
+
+def extract_exists(project_path: str) -> bool:
+    """Check if an Extract TM exists for a project"""
+    return os.path.exists(get_extract_path(project_path))
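
Taken together, the module's intended per-project flow is: compute the sidecar path, reuse an existing extraction if one is on disk, otherwise build it once, then query it. A sketch of that flow, assuming the host application provides db_manager (with the search_fuzzy_matches method used above) and the project's segment list; the .sproj filename is illustrative:

from modules.extract_tm import ExtractTM, get_extract_path, extract_exists

project_path = "MyProject.sproj"             # illustrative project file
svtm_path = get_extract_path(project_path)   # -> ".../MyProject_Extract.svtm"

extract_tm = ExtractTM()
if extract_exists(project_path):
    extract_tm.load(svtm_path)               # reuse last session's extraction
else:
    extract_tm.extract_and_save(             # one-time extraction for this project
        output_path=svtm_path,
        db_manager=db_manager,               # assumed: main DB manager
        project_segments=segments,           # assumed: project segment objects
        tm_ids=["tm1", "tm2"],
        threshold=0.80,
    )

for m in extract_tm.search("source text"):
    print(m["match_pct"], m["tm_name"], "->", m["target_text"])

extract_tm.close()

One design consequence worth noting: segments_fts is an external-content FTS5 table that is only rebuilt once, at the end of extract_and_save, so the .svtm file is effectively read-only after creation; re-running the extraction deletes the existing file and rebuilds it from scratch.
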
@@ -301,6 +301,10 @@ class KeyboardShortcutsWidget(QWidget):
 
     def load_shortcuts(self):
         """Load shortcuts into the table"""
+        # CRITICAL: Disable sorting during table modifications to prevent
+        # items from becoming disassociated from their rows (causes vanishing text bug)
+        self.table.setSortingEnabled(False)
+
         self.table.setRowCount(0)
 
         all_shortcuts = self.manager.get_all_shortcuts()
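
This hunk, with its companion hunk below that re-enables sorting, addresses a documented QTableWidget pitfall: when sorting is enabled and an item is set in the current sort column, Qt may immediately move the row to its sorted position, so the row index the loop is holding no longer points at the row just written, and subsequent setItem calls land on the wrong row. That is the "vanishing text" the new comment describes. A minimal sketch of the disable/repopulate/re-enable pattern, assuming PyQt6 (these hunks do not show which Qt binding the file imports):

from PyQt6.QtWidgets import QApplication, QTableWidget, QTableWidgetItem

app = QApplication([])
table = QTableWidget(0, 2)
table.setSortingEnabled(True)        # e.g. the user previously clicked a header

def load_rows(rows):
    table.setSortingEnabled(False)   # freeze row order while repopulating
    table.setRowCount(0)
    for name, keys in rows:
        row = table.rowCount()
        table.insertRow(row)
        # 'row' stays valid because nothing can re-sort mid-insert
        table.setItem(row, 0, QTableWidgetItem(name))
        table.setItem(row, 1, QTableWidgetItem(keys))
    table.setSortingEnabled(True)    # restore the user's sort afterwards

load_rows([("Save", "Ctrl+S"), ("Open", "Ctrl+O")])
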
@@ -362,6 +366,9 @@ class KeyboardShortcutsWidget(QWidget):
             self.table.setItem(row, 4, status_item)
 
             row += 1
+
+        # Re-enable sorting after all modifications are complete
+        self.table.setSortingEnabled(True)
 
     def _on_enabled_changed(self, state):
         """Handle checkbox state change for enabling/disabling shortcuts"""