supervertaler 1.9.153-py3-none-any.whl → 1.9.189-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -159,9 +159,78 @@ class MQXLIFFHandler:
 
             segment = FormattedSegment(trans_unit_id, plain_text, formatted_xml)
             segments.append(segment)
-
+
         return segments
-
+
+    def extract_bilingual_segments(self) -> List[Dict]:
+        """
+        Extract all source AND target segments from the MQXLIFF file.
+        Used for importing pretranslated mqxliff files.
+
+        Returns:
+            List of dicts with 'id', 'source', 'target', 'status' keys
+        """
+        segments = []
+
+        if self.body_element is None:
+            return segments
+
+        # Find all trans-unit elements (with or without namespace)
+        trans_units = self.body_element.findall('.//xliff:trans-unit', self.NAMESPACES)
+        if not trans_units:
+            trans_units = self.body_element.findall('.//trans-unit')
+
+        for trans_unit in trans_units:
+            trans_unit_id = trans_unit.get('id', 'unknown')
+
+            # Skip auxiliary segments (like hyperlink URLs with mq:nosplitjoin="true")
+            nosplitjoin = trans_unit.get('{MQXliff}nosplitjoin', 'false')
+            if nosplitjoin == 'true':
+                continue
+
+            # Find source element
+            source_elem = trans_unit.find('xliff:source', self.NAMESPACES)
+            if source_elem is None:
+                source_elem = trans_unit.find('source')
+
+            # Find target element
+            target_elem = trans_unit.find('xliff:target', self.NAMESPACES)
+            if target_elem is None:
+                target_elem = trans_unit.find('target')
+
+            source_text = ""
+            target_text = ""
+
+            if source_elem is not None:
+                source_text = self._extract_plain_text(source_elem)
+
+            if target_elem is not None:
+                target_text = self._extract_plain_text(target_elem)
+
+            # Get memoQ status if available
+            mq_status = trans_unit.get('{MQXliff}status', '')
+
+            # Map memoQ status to internal status
+            # memoQ statuses: "NotStarted", "Editing", "Confirmed", "Reviewed", "Rejected", etc.
+            status = 'not_started'
+            if mq_status in ['Confirmed', 'ProofRead', 'Reviewed']:
+                status = 'confirmed'
+            elif mq_status == 'Editing':
+                status = 'translated'
+            elif target_text.strip():
+                # Has target but unknown status - mark as pre-translated
+                status = 'pre_translated'
+
+            segments.append({
+                'id': trans_unit_id,
+                'source': source_text,
+                'target': target_text,
+                'status': status,
+                'mq_status': mq_status
+            })
+
+        return segments
+
     def _extract_plain_text(self, element: ET.Element) -> str:
         """
         Recursively extract plain text from an XML element, stripping all tags.
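
A minimal sketch of how the new bilingual import could be consumed (hypothetical: the handler's constructor and file path are assumptions, since only extract_bilingual_segments() appears in this diff):

    # Assumed usage - MQXLIFFHandler's constructor signature is not shown above.
    handler = MQXLIFFHandler("pretranslated.mqxliff")
    for seg in handler.extract_bilingual_segments():
        if seg['status'] in ('confirmed', 'pre_translated'):
            print(f"{seg['id']}: {seg['source']} -> {seg['target']}")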
modules/project_tm.py ADDED
@@ -0,0 +1,320 @@
+"""
+ProjectTM - In-memory TM for instant grid lookups (Total Recall architecture)
+
+This module implements a lightweight in-memory Translation Memory that extracts
+relevant segments from the full TM database on project load. This makes grid
+navigation instant while keeping the full TM for concordance searches.
+
+Inspired by CafeTran's "Total Recall" feature.
+"""
+
+import sqlite3
+import threading
+from difflib import SequenceMatcher
+from typing import Dict, List, Optional, Callable
+import re
+
+
+class ProjectTM:
+    """
+    Lightweight in-memory TM extracted from the main TM database.
+
+    On project load, extracts segments that are relevant to the current project
+    (fuzzy matches above threshold) into an in-memory SQLite database for
+    instant lookups during grid navigation.
+
+    Usage:
+        project_tm = ProjectTM()
+        project_tm.extract_from_database(
+            db_manager,
+            project_segments,
+            tm_ids=['tm1', 'tm2'],
+            threshold=0.75,
+            progress_callback=lambda cur, total: print(f"{cur}/{total}")
+        )
+
+        # Fast lookup during grid navigation
+        matches = project_tm.search("source text to translate")
+    """
+
+    def __init__(self):
+        """Initialize in-memory SQLite database for ProjectTM"""
+        self.conn = sqlite3.connect(":memory:", check_same_thread=False)
+        self.conn.row_factory = sqlite3.Row
+        self.lock = threading.Lock()
+        self.is_built = False
+        self.segment_count = 0
+
+        # Create the schema
+        self._create_schema()
+
+    def _create_schema(self):
+        """Create the in-memory database schema"""
+        with self.lock:
+            cursor = self.conn.cursor()
+            cursor.execute("""
+                CREATE TABLE IF NOT EXISTS segments (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    source_text TEXT NOT NULL,
+                    target_text TEXT NOT NULL,
+                    source_lower TEXT NOT NULL,
+                    tm_id TEXT,
+                    tm_name TEXT,
+                    similarity REAL,
+                    original_id INTEGER
+                )
+            """)
+            # Index for fast exact match lookups
+            cursor.execute("CREATE INDEX IF NOT EXISTS idx_source_lower ON segments(source_lower)")
+            # FTS5 for fuzzy text search
+            cursor.execute("""
+                CREATE VIRTUAL TABLE IF NOT EXISTS segments_fts USING fts5(
+                    source_text,
+                    content=segments,
+                    content_rowid=id
+                )
+            """)
+            self.conn.commit()
+
+    def clear(self):
+        """Clear all segments from the ProjectTM"""
+        with self.lock:
+            cursor = self.conn.cursor()
+            cursor.execute("DELETE FROM segments")
+            cursor.execute("DELETE FROM segments_fts")
+            self.conn.commit()
+            self.is_built = False
+            self.segment_count = 0
+
+    def extract_from_database(
+        self,
+        db_manager,
+        project_segments: List,
+        tm_ids: List[str] = None,
+        source_lang: str = None,
+        target_lang: str = None,
+        threshold: float = 0.75,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
+        log_callback: Optional[Callable[[str], None]] = None
+    ) -> int:
+        """
+        Extract relevant segments from the main TM database into ProjectTM.
+
+        For each unique source text in the project, searches the TM for fuzzy
+        matches above the threshold and stores them in memory.
+
+        Args:
+            db_manager: The main database manager with TM data
+            project_segments: List of project segments to find matches for
+            tm_ids: List of TM IDs to search (None = all active TMs)
+            source_lang: Source language filter
+            target_lang: Target language filter
+            threshold: Minimum similarity threshold (0.0-1.0)
+            progress_callback: Optional callback(current, total) for progress
+            log_callback: Optional callback(message) for logging
+
+        Returns:
+            Number of TM segments extracted
+        """
+        def log(msg):
+            if log_callback:
+                log_callback(msg)
+            else:
+                print(msg)
+
+        self.clear()
+
+        if not project_segments or not db_manager:
+            log(f"[ProjectTM] Early exit: segments={bool(project_segments)}, db={bool(db_manager)}")
+            return 0
+
+        # Get unique source texts from project
+        unique_sources = {}
+        for seg in project_segments:
+            # Try both 'source' and 'source_text' attributes (different segment types use different names)
+            source = getattr(seg, 'source', None) or getattr(seg, 'source_text', None)
+            if source and source.strip():
+                # Normalize: strip and lowercase for deduplication
+                key = source.strip().lower()
+                if key not in unique_sources:
+                    unique_sources[key] = source.strip()
+
+        total = len(unique_sources)
+        log(f"[ProjectTM] Found {total} unique source texts from {len(project_segments)} segments")
+        if total == 0:
+            return 0
+
+        extracted_count = 0
+        seen_sources = set()  # Deduplicate TM entries
+
+        cursor = self.conn.cursor()
+
+        log(f"[ProjectTM] Searching TMs: {tm_ids}, threshold={threshold}, langs={source_lang}->{target_lang}")
+
+        for i, (key, source_text) in enumerate(unique_sources.items()):
+            if progress_callback and i % 10 == 0:
+                progress_callback(i, total)
+
+            try:
+                # Search main TM database for fuzzy matches
+                matches = db_manager.search_fuzzy_matches(
+                    source_text,
+                    tm_ids=tm_ids,
+                    threshold=threshold,
+                    max_results=10,  # Keep top 10 matches per source
+                    source_lang=source_lang,
+                    target_lang=target_lang,
+                    bidirectional=True
+                )
+
+                # Debug: log first search
+                if i == 0:
+                    log(f"[ProjectTM] First search '{source_text[:50]}...' returned {len(matches)} matches")
+
+                for match in matches:
+                    match_source = match.get('source_text', '')
+                    match_target = match.get('target_text', '')
+
+                    if not match_source or not match_target:
+                        continue
+
+                    # Deduplicate by source text
+                    source_key = match_source.strip().lower()
+                    if source_key in seen_sources:
+                        continue
+                    seen_sources.add(source_key)
+
+                    # Insert into ProjectTM
+                    cursor.execute("""
+                        INSERT INTO segments (source_text, target_text, source_lower,
+                                              tm_id, tm_name, similarity, original_id)
+                        VALUES (?, ?, ?, ?, ?, ?, ?)
+                    """, (
+                        match_source,
+                        match_target,
+                        source_key,
+                        match.get('tm_id'),
+                        match.get('tm_name', 'Unknown TM'),
+                        match.get('similarity', 0),
+                        match.get('id')
+                    ))
+                    extracted_count += 1
+
+            except Exception as e:
+                # Log but continue - don't fail extraction for one bad segment
+                pass
+
+        # Commit all inserts
+        self.conn.commit()
+
+        # Rebuild FTS5 index
+        try:
+            cursor.execute("INSERT INTO segments_fts(segments_fts) VALUES('rebuild')")
+            self.conn.commit()
+        except Exception:
+            pass  # FTS rebuild may fail if no data, that's OK
+
+        if progress_callback:
+            progress_callback(total, total)
+
+        self.is_built = True
+        self.segment_count = extracted_count
+
+        return extracted_count
+
+    def search(self, source_text: str, max_results: int = 5) -> List[Dict]:
+        """
+        Search ProjectTM for matches (instant lookup).
+
+        First checks for exact matches, then falls back to fuzzy search.
+
+        Args:
+            source_text: Source text to search for
+            max_results: Maximum number of results to return
+
+        Returns:
+            List of match dictionaries with source_text, target_text, similarity, etc.
+        """
+        if not self.is_built or not source_text:
+            return []
+
+        source_lower = source_text.strip().lower()
+        results = []
+
+        with self.lock:
+            cursor = self.conn.cursor()
+
+            # 1. Check for exact match first (fastest)
+            cursor.execute("""
+                SELECT * FROM segments WHERE source_lower = ? LIMIT 1
+            """, (source_lower,))
+            exact = cursor.fetchone()
+
+            if exact:
+                results.append({
+                    'source_text': exact['source_text'],
+                    'target_text': exact['target_text'],
+                    'tm_id': exact['tm_id'],
+                    'tm_name': exact['tm_name'],
+                    'similarity': 1.0,  # Exact match
+                    'match_pct': 100,
+                    'id': exact['original_id']
+                })
+                return results  # Exact match - no need to search further
+
+            # 2. FTS5 fuzzy search
+            try:
+                # Tokenize query for FTS5
+                clean_text = re.sub(r'[^\w\s]', ' ', source_text)
+                search_terms = [t for t in clean_text.split() if len(t) > 2]
+
+                if search_terms:
+                    fts_query = ' OR '.join(f'"{term}"' for term in search_terms[:10])
+
+                    cursor.execute("""
+                        SELECT s.*, bm25(segments_fts) as rank
+                        FROM segments s
+                        JOIN segments_fts ON s.id = segments_fts.rowid
+                        WHERE segments_fts MATCH ?
+                        ORDER BY rank
+                        LIMIT ?
+                    """, (fts_query, max_results * 3))  # Get more candidates for re-ranking
+
+                    candidates = cursor.fetchall()
+
+                    # Re-rank by actual similarity
+                    for row in candidates:
+                        similarity = self._calculate_similarity(source_text, row['source_text'])
+                        if similarity >= 0.5:  # Lower threshold for ProjectTM (pre-filtered)
+                            results.append({
+                                'source_text': row['source_text'],
+                                'target_text': row['target_text'],
+                                'tm_id': row['tm_id'],
+                                'tm_name': row['tm_name'],
+                                'similarity': similarity,
+                                'match_pct': int(similarity * 100),
+                                'id': row['original_id']
+                            })
+
+                    # Sort by similarity and limit
+                    results.sort(key=lambda x: x['similarity'], reverse=True)
+                    results = results[:max_results]
+
+            except Exception:
+                pass  # FTS search may fail, return what we have
+
+        return results
+
+    def _calculate_similarity(self, text1: str, text2: str) -> float:
+        """Calculate similarity ratio between two texts"""
+        # Strip HTML/XML tags for comparison
+        clean1 = re.sub(r'<[^>]+>', '', text1).lower()
+        clean2 = re.sub(r'<[^>]+>', '', text2).lower()
+        return SequenceMatcher(None, clean1, clean2).ratio()
+
+    def get_stats(self) -> Dict:
+        """Get statistics about the ProjectTM"""
+        return {
+            'is_built': self.is_built,
+            'segment_count': self.segment_count
+        }
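
One detail worth noting in the schema above: segments_fts is an FTS5 external-content table (content=segments), which does not index rows automatically unless triggers are defined, hence the 'rebuild' insert after bulk extraction. A standalone sketch of that pattern (not Supervertaler code; requires an SQLite build with FTS5, which stock CPython ships):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE segments (id INTEGER PRIMARY KEY, source_text TEXT)")
    conn.execute("""CREATE VIRTUAL TABLE segments_fts USING fts5(
        source_text, content=segments, content_rowid=id)""")
    conn.execute("INSERT INTO segments (source_text) VALUES ('translation memory lookup')")
    # Without triggers the external-content index is stale; 'rebuild' resyncs
    # it from the content table in a single pass.
    conn.execute("INSERT INTO segments_fts(segments_fts) VALUES('rebuild')")
    print(conn.execute(
        "SELECT rowid FROM segments_fts WHERE segments_fts MATCH 'lookup'").fetchall())
    # -> [(1,)]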
modules/superbrowser.py CHANGED
@@ -160,6 +160,20 @@ class ChatColumn(QWidget):
         """Update URL bar when page changes"""
         self.url_input.setText(url.toString())
 
+    def cleanup(self):
+        """Clean up web engine resources before deletion"""
+        try:
+            from PyQt6.QtCore import QUrl
+            if hasattr(self, 'web_view'):
+                self.web_view.stop()
+                self.web_view.setPage(None)
+                self.web_view.setUrl(QUrl('about:blank'))
+                self.web_view.deleteLater()
+            if hasattr(self, 'profile'):
+                self.profile.deleteLater()
+        except:
+            pass
+
 
 class SuperbrowserWidget(QWidget):
     """
@@ -304,6 +318,14 @@ class SuperbrowserWidget(QWidget):
         self.claude_column.go_home()
         self.gemini_column.go_home()
 
+    def cleanup(self):
+        """Clean up all web engine resources before widget deletion"""
+        try:
+            for column in self.chat_columns:
+                column.cleanup()
+        except:
+            pass
+
 
 # ============================================================================
 # STANDALONE USAGE
@@ -88,14 +88,18 @@ class SuperlookupEngine:
@@ -88,14 +88,18 @@ class SuperlookupEngine:
             Captured text or None if failed
         """
         try:
-            import keyboard
-
-            # Wait for hotkey to release before sending Ctrl+C
-            time.sleep(0.2)
-
-            # Use keyboard library to send Ctrl+C
-            keyboard.press_and_release('ctrl+c')
-            time.sleep(0.2)
+            # keyboard module is Windows-only
+            try:
+                import keyboard
+                # Wait for hotkey to release before sending Ctrl+C
+                time.sleep(0.2)
+                # Use keyboard library to send Ctrl+C
+                keyboard.press_and_release('ctrl+c')
+                time.sleep(0.2)
+            except ImportError:
+                # On non-Windows, just try to get clipboard content directly
+                # (user needs to have copied text manually)
+                pass
 
             # Get clipboard
             text = pyperclip.paste()
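
Consolidated, the capture logic now follows this cross-platform pattern (a simplified, self-contained sketch of the method above, not the full SuperlookupEngine):

    import time
    import pyperclip

    def capture_selected_text():
        """Copy the current selection via simulated Ctrl+C where the
        'keyboard' module is available; elsewhere fall back to whatever
        the user already copied to the clipboard."""
        try:
            import keyboard  # effectively Windows-only in this context
            time.sleep(0.2)                       # let the hotkey release
            keyboard.press_and_release('ctrl+c')  # simulate the copy
            time.sleep(0.2)                       # give the OS time to fill the clipboard
        except ImportError:
            pass  # non-Windows: clipboard must already hold the text
        text = pyperclip.paste()
        return text or None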
modules/tag_manager.py CHANGED
@@ -77,15 +77,33 @@ class TagManager:
         runs = []
         current_pos = 0
 
+        # Check if paragraph style has bold/italic formatting
+        # This handles cases like "Subtitle" or "Title" styles that are bold
+        style_bold = False
+        style_italic = False
+        try:
+            if paragraph.style and paragraph.style.font:
+                if paragraph.style.font.bold:
+                    style_bold = True
+                if paragraph.style.font.italic:
+                    style_italic = True
+        except Exception:
+            pass  # If we can't read style, just use run-level formatting
+
         for run in paragraph.runs:
             text = run.text
             if not text:
                 continue
 
+            # Combine run-level formatting with style-level formatting
+            # run.bold can be True, False, or None (None means inherit from style)
+            is_bold = run.bold if run.bold is not None else style_bold
+            is_italic = run.italic if run.italic is not None else style_italic
+
             run_info = FormattingRun(
                 text=text,
-                bold=run.bold or False,
-                italic=run.italic or False,
+                bold=is_bold or False,
+                italic=is_italic or False,
                 underline=run.underline or False,
                 subscript=run.font.subscript or False if run.font else False,
                 superscript=run.font.superscript or False if run.font else False,
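
The fix hinges on python-docx's tri-state booleans: run.bold and run.italic are None when the run inherits from its paragraph style, which the old "run.bold or False" collapsed to False. A minimal illustration of the resolution rule (standalone, not Supervertaler code):

    def resolve_flag(run_value, style_value):
        """None means 'inherit from the style'; True/False are explicit overrides."""
        return run_value if run_value is not None else style_value

    assert resolve_flag(None, True) is True    # run inherits the style's bold
    assert resolve_flag(False, True) is False  # an explicit run-level override wins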
@@ -409,7 +409,111 @@ class TermbaseManager:
         except Exception as e:
             self.log(f"✗ Error setting termbase read_only: {e}")
             return False
-
+
+    def get_termbase_ai_inject(self, termbase_id: int) -> bool:
+        """Get whether termbase terms should be injected into LLM prompts"""
+        try:
+            cursor = self.db_manager.cursor
+            cursor.execute("SELECT ai_inject FROM termbases WHERE id = ?", (termbase_id,))
+            result = cursor.fetchone()
+            return bool(result[0]) if result and result[0] else False
+        except Exception as e:
+            self.log(f"✗ Error getting termbase ai_inject: {e}")
+            return False
+
+    def set_termbase_ai_inject(self, termbase_id: int, ai_inject: bool) -> bool:
+        """Set whether termbase terms should be injected into LLM prompts"""
+        try:
+            cursor = self.db_manager.cursor
+            cursor.execute("""
+                UPDATE termbases SET ai_inject = ? WHERE id = ?
+            """, (1 if ai_inject else 0, termbase_id))
+            self.db_manager.connection.commit()
+            status = "enabled" if ai_inject else "disabled"
+            self.log(f"✓ AI injection {status} for termbase {termbase_id}")
+            return True
+        except Exception as e:
+            self.log(f"✗ Error setting termbase ai_inject: {e}")
+            return False
+
+    def get_ai_inject_termbases(self, project_id: Optional[int] = None) -> List[Dict]:
+        """
+        Get all termbases with ai_inject enabled that are active for the given project.
+
+        Args:
+            project_id: Project ID (0 or None for global)
+
+        Returns:
+            List of termbase dictionaries with all terms
+        """
+        try:
+            cursor = self.db_manager.cursor
+            proj_id = project_id if project_id else 0
+
+            cursor.execute("""
+                SELECT t.id, t.name, t.source_lang, t.target_lang
+                FROM termbases t
+                LEFT JOIN termbase_activation ta ON t.id = ta.termbase_id AND ta.project_id = ?
+                WHERE t.ai_inject = 1
+                  AND (ta.is_active = 1 OR (t.is_global = 1 AND ta.is_active IS NULL))
+                ORDER BY ta.priority ASC, t.name ASC
+            """, (proj_id,))
+
+            termbases = []
+            for row in cursor.fetchall():
+                termbases.append({
+                    'id': row[0],
+                    'name': row[1],
+                    'source_lang': row[2],
+                    'target_lang': row[3]
+                })
+            return termbases
+        except Exception as e:
+            self.log(f"✗ Error getting AI inject termbases: {e}")
+            return []
+
+    def get_ai_inject_terms(self, project_id: Optional[int] = None) -> List[Dict]:
+        """
+        Get all terms from AI-inject-enabled termbases for the given project.
+
+        Args:
+            project_id: Project ID (0 or None for global)
+
+        Returns:
+            List of term dictionaries with source_term, target_term, forbidden, termbase_name
+        """
+        try:
+            # First get all AI-inject termbases
+            ai_termbases = self.get_ai_inject_termbases(project_id)
+            if not ai_termbases:
+                return []
+
+            all_terms = []
+            cursor = self.db_manager.cursor
+
+            for tb in ai_termbases:
+                cursor.execute("""
+                    SELECT source_term, target_term, forbidden, priority
+                    FROM termbase_terms
+                    WHERE termbase_id = ?
+                    ORDER BY priority ASC, source_term ASC
+                """, (tb['id'],))
+
+                for row in cursor.fetchall():
+                    all_terms.append({
+                        'source_term': row[0],
+                        'target_term': row[1],
+                        'forbidden': bool(row[2]) if row[2] else False,
+                        'priority': row[3] or 99,
+                        'termbase_name': tb['name']
+                    })
+
+            self.log(f"📚 Retrieved {len(all_terms)} terms from {len(ai_termbases)} AI-inject glossar{'y' if len(ai_termbases) == 1 else 'ies'}")
+            return all_terms
+        except Exception as e:
+            self.log(f"✗ Error getting AI inject terms: {e}")
+            return []
+
     def set_termbase_priority(self, termbase_id: int, project_id: int, priority: int) -> bool:
         """
         Set manual priority for a termbase in a specific project.
@@ -505,7 +609,6 @@ class TermbaseManager:
             """, (project_id,))
 
             active_ids = [row[0] for row in cursor.fetchall()]
-            self.log(f"📋 Found {len(active_ids)} active termbases for project {project_id}: {active_ids}")
             return active_ids
         except Exception as e:
             self.log(f"✗ Error getting active termbase IDs: {e}")
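
Downstream, the term list returned by get_ai_inject_terms() presumably feeds the LLM prompt builder; a hypothetical sketch of that glue (the actual prompt-assembly site is not part of this diff):

    def build_glossary_block(terms):
        """Format AI-inject terms as a prompt section; forbidden terms
        are flagged so the model avoids them."""
        lines = ["Glossary (use these translations):"]
        for t in terms:
            marker = " [do NOT use]" if t['forbidden'] else ""
            lines.append(f"- {t['source_term']} -> {t['target_term']}{marker}")
        return "\n".join(lines)

    # terms = termbase_manager.get_ai_inject_terms(project_id=0)
    # prompt = build_glossary_block(terms) + "\n\n" + segment_source_text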