supervertaler 1.9.116__py3-none-any.whl → 1.9.172__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic; consult the package registry's advisory for more details.

modules/superlookup.py CHANGED
@@ -88,14 +88,18 @@ class SuperlookupEngine:
88
88
  Captured text or None if failed
89
89
  """
90
90
  try:
91
- import keyboard
92
-
93
- # Wait for hotkey to release before sending Ctrl+C
94
- time.sleep(0.2)
95
-
96
- # Use keyboard library to send Ctrl+C
97
- keyboard.press_and_release('ctrl+c')
98
- time.sleep(0.2)
91
+ # keyboard module is Windows-only
92
+ try:
93
+ import keyboard
94
+ # Wait for hotkey to release before sending Ctrl+C
95
+ time.sleep(0.2)
96
+ # Use keyboard library to send Ctrl+C
97
+ keyboard.press_and_release('ctrl+c')
98
+ time.sleep(0.2)
99
+ except ImportError:
100
+ # On non-Windows, just try to get clipboard content directly
101
+ # (user needs to have copied text manually)
102
+ pass
99
103
 
100
104
  # Get clipboard
101
105
  text = pyperclip.paste()
@@ -157,9 +161,13 @@ class SuperlookupEngine:
157
161
 
158
162
  # Convert to LookupResult format (limit results)
159
163
  for match in matches[:max_results]:
164
+ # Use 'source' and 'target' keys (matches database column names)
165
+ source_text = match.get('source', '')
166
+ target_text = match.get('target', '')
167
+ print(f"[Superlookup] Extracted: source='{source_text[:50]}...', target='{target_text[:50]}...'")
160
168
  results.append(LookupResult(
161
- source=match.get('source', ''),
162
- target=match.get('target', ''),
169
+ source=source_text,
170
+ target=target_text,
163
171
  match_percent=100, # Concordance = contains the text
164
172
  source_type='tm',
165
173
  metadata={
modules/tag_manager.py CHANGED
@@ -77,15 +77,33 @@ class TagManager:
77
77
  runs = []
78
78
  current_pos = 0
79
79
 
80
+ # Check if paragraph style has bold/italic formatting
81
+ # This handles cases like "Subtitle" or "Title" styles that are bold
82
+ style_bold = False
83
+ style_italic = False
84
+ try:
85
+ if paragraph.style and paragraph.style.font:
86
+ if paragraph.style.font.bold:
87
+ style_bold = True
88
+ if paragraph.style.font.italic:
89
+ style_italic = True
90
+ except Exception:
91
+ pass # If we can't read style, just use run-level formatting
92
+
80
93
  for run in paragraph.runs:
81
94
  text = run.text
82
95
  if not text:
83
96
  continue
84
97
 
98
+ # Combine run-level formatting with style-level formatting
99
+ # run.bold can be True, False, or None (None means inherit from style)
100
+ is_bold = run.bold if run.bold is not None else style_bold
101
+ is_italic = run.italic if run.italic is not None else style_italic
102
+
85
103
  run_info = FormattingRun(
86
104
  text=text,
87
- bold=run.bold or False,
88
- italic=run.italic or False,
105
+ bold=is_bold or False,
106
+ italic=is_italic or False,
89
107
  underline=run.underline or False,
90
108
  subscript=run.font.subscript or False if run.font else False,
91
109
  superscript=run.font.superscript or False if run.font else False,
@@ -515,6 +515,9 @@ class TermviewWidget(QWidget):
515
515
  self.current_target_lang = None
516
516
  self.current_project_id = None # Store project ID for termbase priority lookup
517
517
 
518
+ # Debug mode - disable verbose tokenization logging by default (performance)
519
+ self.debug_tokenize = False
520
+
518
521
  # Default font settings (will be updated from main app settings)
519
522
  self.current_font_family = "Segoe UI"
520
523
  self.current_font_size = 10
@@ -750,7 +753,10 @@ class TermviewWidget(QWidget):
750
753
  if not source_term or not target_term:
751
754
  continue
752
755
 
753
- key = source_term.lower()
756
+ # Strip punctuation from key to match lookup normalization
757
+ # This ensures "ca." in glossary matches "ca." token stripped to "ca"
758
+ PUNCT_CHARS_FOR_KEY = '.,;:!?\"\'\u201C\u201D\u201E\u00AB\u00BB\u2018\u2019\u201A\u2039\u203A()[]'
759
+ key = source_term.lower().strip(PUNCT_CHARS_FOR_KEY)
754
760
  if key not in matches_dict:
755
761
  matches_dict[key] = []
756
762
 
@@ -803,7 +809,8 @@ class TermviewWidget(QWidget):
803
809
 
804
810
  # Comprehensive set of quote and punctuation characters to strip
805
811
  # Using Unicode escapes to avoid encoding issues
806
- PUNCT_CHARS = '.,;:!?\"\'\u201C\u201D\u201E\u00AB\u00BB\u2018\u2019\u201A\u2039\u203A'
812
+ # Include brackets for terms like "(typisch)" to match "typisch"
813
+ PUNCT_CHARS = '.,;:!?\"\'\u201C\u201D\u201E\u00AB\u00BB\u2018\u2019\u201A\u2039\u203A()[]'
807
814
 
808
815
  # Track which terms have already been assigned shortcuts (avoid duplicates)
809
816
  assigned_shortcuts = set()
@@ -816,7 +823,6 @@ class TermviewWidget(QWidget):
816
823
 
817
824
  # Check if this is a non-translatable
818
825
  if lookup_key in nt_dict:
819
- # Create NT block
820
826
  nt_block = NTBlock(token, nt_dict[lookup_key], self, theme_manager=self.theme_manager,
821
827
  font_size=self.current_font_size, font_family=self.current_font_family,
822
828
  font_bold=self.current_font_bold)
@@ -941,13 +947,20 @@ class TermviewWidget(QWidget):
941
947
  for quote_char in '\"\'\u201C\u201D\u201E\u00AB\u00BB\u2018\u2019\u201A\u2039\u203A':
942
948
  normalized_text = normalized_text.replace(quote_char, ' ')
943
949
 
950
+ # CRITICAL FIX v1.9.118: Strip punctuation from glossary term before matching
951
+ # This allows entries like "...problemen." (with period) to match source text
952
+ # where tokenization strips the period during word splitting
953
+ # Comprehensive set of quote and punctuation characters to strip
954
+ PUNCT_CHARS = '.,;:!?\"\'\u201C\u201D\u201E\u00AB\u00BB\u2018\u2019\u201A\u2039\u203A'
955
+ normalized_term = source_lower.rstrip(PUNCT_CHARS).lstrip(PUNCT_CHARS)
956
+
944
957
  # Use word boundaries to match complete words/phrases only
945
958
  if ' ' in source_term:
946
959
  # Multi-word term - must exist as exact phrase
947
- pattern = r'\b' + re.escape(source_lower) + r'\b'
960
+ pattern = r'\b' + re.escape(normalized_term) + r'\b'
948
961
  else:
949
962
  # Single word
950
- pattern = r'\b' + re.escape(source_lower) + r'\b'
963
+ pattern = r'\b' + re.escape(normalized_term) + r'\b'
951
964
 
952
965
  # Try matching on normalized text first, then original
953
966
  if not re.search(pattern, normalized_text) and not re.search(pattern, text_lower):
@@ -985,9 +998,9 @@ class TermviewWidget(QWidget):
985
998
  Returns:
986
999
  List of tokens (words/phrases/numbers), with multi-word terms kept together
987
1000
  """
988
- # DEBUG: Log multi-word terms we're looking for
1001
+ # DEBUG: Log multi-word terms we're looking for (only if debug_tokenize enabled)
989
1002
  multi_word_terms = [k for k in matches.keys() if ' ' in k]
990
- if multi_word_terms:
1003
+ if multi_word_terms and self.debug_tokenize:
991
1004
  self.log(f"🔍 Tokenize: Looking for {len(multi_word_terms)} multi-word terms:")
992
1005
  for term in sorted(multi_word_terms, key=len, reverse=True)[:3]:
993
1006
  self.log(f" - '{term}'")
@@ -1012,11 +1025,12 @@ class TermviewWidget(QWidget):
1012
1025
  else:
1013
1026
  pattern = r'\b' + term_escaped + r'\b'
1014
1027
 
1015
- # DEBUG: Check if multi-word term is found
1028
+ # DEBUG: Check if multi-word term is found (only if debug_tokenize enabled)
1016
1029
  found = re.search(pattern, text_lower)
1017
- self.log(f"🔍 Tokenize: Pattern '{pattern}' for '{term}' → {'FOUND' if found else 'NOT FOUND'}")
1018
- if found:
1019
- self.log(f" Match at position {found.span()}: '{text[found.start():found.end()]}'")
1030
+ if self.debug_tokenize:
1031
+ self.log(f"🔍 Tokenize: Pattern '{pattern}' for '{term}' → {'FOUND' if found else 'NOT FOUND'}")
1032
+ if found:
1033
+ self.log(f" Match at position {found.span()}: '{text[found.start():found.end()]}'")
1020
1034
 
1021
1035
  # Find all matches using regex
1022
1036
  for match in re.finditer(pattern, text_lower):
@@ -1029,10 +1043,11 @@ class TermviewWidget(QWidget):
1029
1043
  original_term = text[pos:pos + len(term)]
1030
1044
  tokens_with_positions.append((pos, len(term), original_term))
1031
1045
  used_positions.update(term_positions)
1032
- self.log(f" ✅ Added multi-word token: '{original_term}' covering positions {pos}-{pos+len(term)}")
1046
+ if self.debug_tokenize:
1047
+ self.log(f" ✅ Added multi-word token: '{original_term}' covering positions {pos}-{pos+len(term)}")
1033
1048
 
1034
- # DEBUG: Log used_positions after first pass
1035
- if ' ' in sorted(matches.keys(), key=len, reverse=True)[0]:
1049
+ # DEBUG: Log used_positions after first pass (only if debug_tokenize enabled)
1050
+ if matches and ' ' in sorted(matches.keys(), key=len, reverse=True)[0] and self.debug_tokenize:
1036
1051
  self.log(f"🔍 After first pass: {len(used_positions)} positions marked as used")
1037
1052
  self.log(f" Used positions: {sorted(list(used_positions))[:20]}...")
1038
1053
 
@@ -396,6 +396,47 @@ class TMMetadataManager:
396
396
  self.log(f"✗ Error fetching active tm_ids: {e}")
397
397
  return []
398
398
 
399
+ def get_writable_tm_ids(self, project_id: Optional[int]) -> List[str]:
400
+ """
401
+ Get list of writable tm_id strings for a project.
402
+
403
+ Returns TMs where:
404
+ - The TM has an activation record for this project AND
405
+ - read_only = 0 (Write checkbox is enabled)
406
+
407
+ This is used for SAVING segments to TM, separate from get_active_tm_ids()
408
+ which is used for READING/matching from TM.
409
+
410
+ Returns:
411
+ List of tm_id strings that are writable for the project
412
+ """
413
+ if project_id is None:
414
+ # No project - return all writable TMs
415
+ try:
416
+ cursor = self.db_manager.cursor
417
+ cursor.execute("SELECT tm_id FROM translation_memories WHERE read_only = 0")
418
+ return [row[0] for row in cursor.fetchall()]
419
+ except Exception as e:
420
+ self.log(f"✗ Error fetching all writable tm_ids: {e}")
421
+ return []
422
+
423
+ try:
424
+ cursor = self.db_manager.cursor
425
+
426
+ # Return TMs where Write checkbox is enabled (read_only = 0)
427
+ # AND the TM has an activation record for this project
428
+ cursor.execute("""
429
+ SELECT tm.tm_id
430
+ FROM translation_memories tm
431
+ INNER JOIN tm_activation ta ON tm.id = ta.tm_id
432
+ WHERE ta.project_id = ? AND tm.read_only = 0
433
+ """, (project_id,))
434
+
435
+ return [row[0] for row in cursor.fetchall()]
436
+ except Exception as e:
437
+ self.log(f"✗ Error fetching writable tm_ids: {e}")
438
+ return []
439
+
399
440
  # ========================================================================
400
441
  # PROJECT TM MANAGEMENT (similar to termbases)
401
442
  # ========================================================================
modules/tmx_editor_qt.py CHANGED
@@ -2655,7 +2655,7 @@ if __name__ == "__main__":
2655
2655
  os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".supervertaler.local")
2656
2656
  )
2657
2657
  user_data_path = Path("user_data_private" if ENABLE_PRIVATE_FEATURES else "user_data")
2658
- db_path = user_data_path / "Translation_Resources" / "supervertaler.db"
2658
+ db_path = user_data_path / "resources" / "supervertaler.db"
2659
2659
 
2660
2660
  # Ensure database directory exists
2661
2661
  db_path.parent.mkdir(parents=True, exist_ok=True)
@@ -123,8 +123,8 @@ class TMDatabase:
123
123
  if source_lang and target_lang:
124
124
  self.set_tm_languages(source_lang, target_lang)
125
125
 
126
- # Global fuzzy threshold
127
- self.fuzzy_threshold = 0.75
126
+ # Global fuzzy threshold (70% minimum similarity for fuzzy matches)
127
+ self.fuzzy_threshold = 0.7
128
128
 
129
129
  # TM metadata cache (populated from database as needed)
130
130
  # Note: Legacy 'project' and 'big_mama' TMs are no longer used.
@@ -401,7 +401,7 @@ class TMDatabase:
401
401
 
402
402
  def load_tmx_file(self, filepath: str, src_lang: str, tgt_lang: str,
403
403
  tm_name: str = None, read_only: bool = False,
404
- strip_variants: bool = True) -> tuple[str, int]:
404
+ strip_variants: bool = True, progress_callback=None) -> tuple[str, int]:
405
405
  """
406
406
  Load TMX file into a new custom TM
407
407
 
@@ -412,6 +412,7 @@ class TMDatabase:
412
412
  tm_name: Custom name for TM (default: filename)
413
413
  read_only: Make TM read-only
414
414
  strip_variants: Match base languages ignoring regional variants (default: True)
415
+ progress_callback: Optional callback function(current, total, message) for progress updates
415
416
 
416
417
  Returns: (tm_id, entry_count)
417
418
  """
@@ -423,16 +424,18 @@ class TMDatabase:
423
424
  self.add_custom_tm(tm_name, tm_id, read_only=read_only)
424
425
 
425
426
  # Load TMX content
426
- loaded_count = self._load_tmx_into_db(filepath, src_lang, tgt_lang, tm_id, strip_variants=strip_variants)
427
+ loaded_count = self._load_tmx_into_db(filepath, src_lang, tgt_lang, tm_id,
428
+ strip_variants=strip_variants,
429
+ progress_callback=progress_callback)
427
430
 
428
431
  self.log(f"✓ Loaded {loaded_count} entries from {os.path.basename(filepath)}")
429
432
 
430
433
  return tm_id, loaded_count
431
434
 
432
435
  def _load_tmx_into_db(self, filepath: str, src_lang: str, tgt_lang: str, tm_id: str,
433
- strip_variants: bool = False) -> int:
436
+ strip_variants: bool = False, progress_callback=None) -> int:
434
437
  """
435
- Internal: Load TMX content into database
438
+ Internal: Load TMX content into database with chunked processing
436
439
 
437
440
  Args:
438
441
  filepath: Path to TMX file
@@ -440,12 +443,24 @@ class TMDatabase:
440
443
  tgt_lang: Target language code
441
444
  tm_id: TM identifier
442
445
  strip_variants: If True, match base languages ignoring regional variants
446
+ progress_callback: Optional callback function(current, total, message) for progress updates
443
447
  """
444
448
  loaded_count = 0
449
+ chunk_size = 1000 # Process in chunks for responsiveness
450
+ chunk_buffer = []
445
451
 
446
452
  try:
453
+ # First pass: count total TUs for progress bar
454
+ if progress_callback:
455
+ progress_callback(0, 0, "Counting translation units...")
456
+
447
457
  tree = ET.parse(filepath)
448
458
  root = tree.getroot()
459
+ total_tus = len(root.findall('.//tu'))
460
+
461
+ if progress_callback:
462
+ progress_callback(0, total_tus, f"Processing 0 / {total_tus:,} entries...")
463
+
449
464
  xml_ns = "http://www.w3.org/XML/1998/namespace"
450
465
 
451
466
  # Normalize language codes
@@ -458,6 +473,7 @@ class TMDatabase:
458
473
  src_base = get_base_lang_code(src_lang_normalized)
459
474
  tgt_base = get_base_lang_code(tgt_lang_normalized)
460
475
 
476
+ processed = 0
461
477
  for tu in root.findall('.//tu'):
462
478
  src_text, tgt_text = None, None
463
479
 
@@ -488,14 +504,43 @@ class TMDatabase:
488
504
  tgt_text = text
489
505
 
490
506
  if src_text and tgt_text:
507
+ chunk_buffer.append((src_text, tgt_text))
508
+ loaded_count += 1
509
+
510
+ # Process chunk when buffer is full
511
+ if len(chunk_buffer) >= chunk_size:
512
+ for src, tgt in chunk_buffer:
513
+ self.db.add_translation_unit(
514
+ source=src,
515
+ target=tgt,
516
+ source_lang=src_lang_normalized,
517
+ target_lang=tgt_lang_normalized,
518
+ tm_id=tm_id
519
+ )
520
+ chunk_buffer.clear()
521
+
522
+ # Update progress
523
+ if progress_callback:
524
+ progress_callback(processed + 1, total_tus,
525
+ f"Processing {loaded_count:,} / {total_tus:,} entries...")
526
+
527
+ processed += 1
528
+
529
+ # Process remaining entries in buffer
530
+ if chunk_buffer:
531
+ for src, tgt in chunk_buffer:
491
532
  self.db.add_translation_unit(
492
- source=src_text,
493
- target=tgt_text,
533
+ source=src,
534
+ target=tgt,
494
535
  source_lang=src_lang_normalized,
495
536
  target_lang=tgt_lang_normalized,
496
537
  tm_id=tm_id
497
538
  )
498
- loaded_count += 1
539
+ chunk_buffer.clear()
540
+
541
+ # Final progress update
542
+ if progress_callback:
543
+ progress_callback(total_tus, total_tus, f"Completed: {loaded_count:,} entries imported")
499
544
 
500
545
  return loaded_count
501
546
  except Exception as e:
@@ -30,7 +30,7 @@ class UnifiedPromptLibrary:
30
30
  Initialize the Unified Prompt Library.
31
31
 
32
32
  Args:
33
- library_dir: Path to unified library directory (user_data/Prompt_Library/Library)
33
+ library_dir: Path to unified library directory (user_data/prompt_library)
34
34
  log_callback: Function to call for logging messages
35
35
  """
36
36
  self.library_dir = Path(library_dir) if library_dir else None
@@ -171,11 +171,14 @@ class UnifiedPromptLibrary:
171
171
  # Backward compatibility: quick_run is the legacy field; internally we
172
172
  # treat it as the "QuickMenu (future app menu)" flag.
173
173
  prompt_data.setdefault('quick_run', False)
174
- prompt_data['quickmenu_quickmenu'] = bool(
175
- prompt_data.get('quickmenu_quickmenu', prompt_data.get('quick_run', False))
174
+ # Support legacy quickmenu_quickmenu field (rename to sv_quickmenu)
175
+ if 'quickmenu_quickmenu' in prompt_data:
176
+ prompt_data['sv_quickmenu'] = prompt_data['quickmenu_quickmenu']
177
+ prompt_data['sv_quickmenu'] = bool(
178
+ prompt_data.get('sv_quickmenu', prompt_data.get('quick_run', False))
176
179
  )
177
180
  # Keep legacy field in sync so older code/versions still behave.
178
- prompt_data['quick_run'] = bool(prompt_data['quickmenu_quickmenu'])
181
+ prompt_data['quick_run'] = bool(prompt_data['sv_quickmenu'])
179
182
 
180
183
  # New QuickMenu fields
181
184
  prompt_data.setdefault('quickmenu_grid', False)
@@ -270,7 +273,7 @@ class UnifiedPromptLibrary:
270
273
  'name', 'description', 'domain', 'version', 'task_type',
271
274
  'favorite',
272
275
  # QuickMenu
273
- 'quickmenu_label', 'quickmenu_grid', 'quickmenu_quickmenu',
276
+ 'quickmenu_label', 'quickmenu_grid', 'sv_quickmenu',
274
277
  # Legacy (kept for backward compatibility)
275
278
  'quick_run',
276
279
  'folder', 'tags',
@@ -309,8 +312,8 @@ class UnifiedPromptLibrary:
309
312
  prompt_data['_relative_path'] = relative_path
310
313
 
311
314
  # Keep legacy field in sync
312
- if 'quickmenu_quickmenu' in prompt_data:
313
- prompt_data['quick_run'] = bool(prompt_data.get('quickmenu_quickmenu', False))
315
+ if 'sv_quickmenu' in prompt_data:
316
+ prompt_data['quick_run'] = bool(prompt_data.get('sv_quickmenu', False))
314
317
  self.prompts[relative_path] = prompt_data
315
318
 
316
319
  self.log(f"✓ Saved prompt: {prompt_data.get('name', relative_path)}")
@@ -456,8 +459,8 @@ class UnifiedPromptLibrary:
456
459
  return False
457
460
 
458
461
  prompt_data = self.prompts[relative_path]
459
- new_value = not bool(prompt_data.get('quickmenu_quickmenu', prompt_data.get('quick_run', False)))
460
- prompt_data['quickmenu_quickmenu'] = new_value
462
+ new_value = not bool(prompt_data.get('sv_quickmenu', prompt_data.get('quick_run', False)))
463
+ prompt_data['sv_quickmenu'] = new_value
461
464
  prompt_data['quick_run'] = new_value # keep legacy in sync
462
465
  prompt_data['modified'] = datetime.now().strftime("%Y-%m-%d")
463
466
 
@@ -493,7 +496,7 @@ class UnifiedPromptLibrary:
493
496
  """Update cached QuickMenu (future app menu) list (legacy name: quick_run)."""
494
497
  self._quick_run = []
495
498
  for path, data in self.prompts.items():
496
- is_enabled = bool(data.get('quickmenu_quickmenu', data.get('quick_run', False)))
499
+ is_enabled = bool(data.get('sv_quickmenu', data.get('quick_run', False)))
497
500
  if not is_enabled:
498
501
  continue
499
502
  label = (data.get('quickmenu_label') or data.get('name') or Path(path).stem).strip()