supervertaler-1.9.153-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic.

Files changed (85)
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
modules/term_extractor.py

@@ -0,0 +1,270 @@
+"""
+Term Extractor Module
+
+Extracts potential terminology from source text for project termbases.
+Can be used as a standalone tool or integrated into Supervertaler.
+
+Author: Michael Beijer
+License: MIT
+"""
+
+import re
+from typing import List, Dict, Set, Optional, Tuple
+from collections import Counter
+import string
+
+
+class TermExtractor:
+    """Extract terminology from source text using various algorithms"""
+
+    def __init__(self, source_lang: str = "en", min_frequency: int = 2,
+                 min_word_length: int = 3, max_ngram: int = 3):
+        """
+        Initialize term extractor
+
+        Args:
+            source_lang: Source language code (e.g., 'en', 'nl', 'de')
+            min_frequency: Minimum number of occurrences to consider as term
+            min_word_length: Minimum character length for single words
+            max_ngram: Maximum n-gram size (1=single words, 2=bigrams, 3=trigrams)
+        """
+        self.source_lang = source_lang.lower()
+        self.min_frequency = min_frequency
+        self.min_word_length = min_word_length
+        self.max_ngram = max_ngram
+
+        # Common stop words by language
+        self.stop_words = self._get_stop_words(source_lang)
+
+    def _get_stop_words(self, lang: str) -> Set[str]:
+        """Get stop words for a language"""
+        # Basic stop words - can be expanded
+        stop_words = {
+            'en': {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+                   'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
+                   'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+                   'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this',
+                   'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'},
+            'nl': {'de', 'het', 'een', 'en', 'of', 'maar', 'in', 'op', 'aan', 'te', 'voor',
+                   'van', 'met', 'bij', 'uit', 'als', 'is', 'was', 'zijn', 'waren', 'wordt',
+                   'worden', 'werd', 'werden', 'hebben', 'heeft', 'had', 'hadden', 'zal',
+                   'zou', 'kunnen', 'kan', 'moet', 'mag', 'dit', 'dat', 'deze', 'die',
+                   'ik', 'je', 'jij', 'hij', 'zij', 'het', 'wij', 'ze'},
+            'de': {'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'einem',
+                   'einen', 'eines', 'und', 'oder', 'aber', 'in', 'an', 'auf', 'zu', 'für',
+                   'von', 'mit', 'bei', 'aus', 'als', 'ist', 'war', 'sind', 'waren', 'wird',
+                   'werden', 'wurde', 'wurden', 'haben', 'hat', 'hatte', 'hatten', 'ich',
+                   'du', 'er', 'sie', 'es', 'wir', 'ihr'},
+            'fr': {'le', 'la', 'les', 'un', 'une', 'des', 'et', 'ou', 'mais', 'dans', 'sur',
+                   'à', 'de', 'pour', 'avec', 'par', 'comme', 'est', 'était', 'sont', 'étaient',
+                   'être', 'avoir', 'a', 'avait', 'je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils'},
+            'es': {'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'y', 'o', 'pero',
+                   'en', 'a', 'de', 'para', 'con', 'por', 'como', 'es', 'era', 'son', 'eran',
+                   'ser', 'estar', 'haber', 'he', 'ha', 'yo', 'tú', 'él', 'ella', 'nosotros'},
+        }
+        return stop_words.get(lang, set())
+
+    def extract_terms(self, text: str, use_frequency: bool = True,
+                      use_capitalization: bool = True,
+                      use_special_chars: bool = True) -> List[Dict[str, any]]:
+        """
+        Extract potential terms from text
+
+        Args:
+            text: Source text to analyze
+            use_frequency: Consider term frequency in ranking
+            use_capitalization: Give higher weight to capitalized terms
+            use_special_chars: Consider terms with hyphens, underscores, etc.
+
+        Returns:
+            List of term dictionaries with fields: term, frequency, score, type
+        """
+        if not text:
+            return []
+
+        # Collect all candidate terms
+        candidates = {}
+
+        # Extract n-grams (1 to max_ngram)
+        for n in range(1, self.max_ngram + 1):
+            ngrams = self._extract_ngrams(text, n)
+            for ngram, freq in ngrams.items():
+                if ngram not in candidates:
+                    candidates[ngram] = {
+                        'term': ngram,
+                        'frequency': freq,
+                        'ngram_size': n,
+                        'is_capitalized': ngram[0].isupper() if ngram else False,
+                        'has_special_chars': bool(re.search(r'[-_./]', ngram))
+                    }
+
+        # Score and rank terms
+        scored_terms = []
+        for term_info in candidates.values():
+            score = self._calculate_score(
+                term_info,
+                use_frequency=use_frequency,
+                use_capitalization=use_capitalization,
+                use_special_chars=use_special_chars
+            )
+
+            if score > 0:  # Only include terms with positive score
+                scored_terms.append({
+                    'term': term_info['term'],
+                    'frequency': term_info['frequency'],
+                    'score': score,
+                    'type': self._classify_term(term_info)
+                })
+
+        # Sort by score (highest first)
+        scored_terms.sort(key=lambda x: x['score'], reverse=True)
+
+        return scored_terms
+
+    def _extract_ngrams(self, text: str, n: int) -> Dict[str, int]:
+        """Extract n-grams from text"""
+        # Tokenize text into words
+        words = re.findall(r'\b[\w-]+\b', text.lower())
+
+        # Generate n-grams
+        ngrams = []
+        for i in range(len(words) - n + 1):
+            ngram_words = words[i:i+n]
+
+            # Skip if contains stop words (except for longer n-grams where they might be acceptable)
+            if n == 1 and ngram_words[0] in self.stop_words:
+                continue
+
+            # Skip if too short
+            if n == 1 and len(ngram_words[0]) < self.min_word_length:
+                continue
+
+            # Create n-gram string
+            ngram = ' '.join(ngram_words)
+            ngrams.append(ngram)
+
+        # Count frequencies
+        return dict(Counter(ngrams))
+
+    def _calculate_score(self, term_info: Dict, use_frequency: bool,
+                         use_capitalization: bool, use_special_chars: bool) -> float:
+        """Calculate term score based on various factors"""
+        score = 0.0
+
+        # Base score from frequency
+        if use_frequency and term_info['frequency'] >= self.min_frequency:
+            # Logarithmic scale for frequency (diminishing returns)
+            import math
+            score += math.log(term_info['frequency'] + 1) * 2
+        elif term_info['frequency'] < self.min_frequency:
+            return 0.0  # Below minimum threshold
+
+        # Bonus for capitalization (likely proper nouns or technical terms)
+        if use_capitalization and term_info['is_capitalized']:
+            score += 3.0
+
+        # Bonus for special characters (technical terms, compound words)
+        if use_special_chars and term_info['has_special_chars']:
+            score += 2.0
+
+        # Bonus for longer n-grams (multi-word terms often more valuable)
+        if term_info['ngram_size'] > 1:
+            score += term_info['ngram_size'] * 1.5
+
+        return score
+
+    def _classify_term(self, term_info: Dict) -> str:
+        """Classify term type"""
+        if term_info['is_capitalized']:
+            return 'proper_noun'
+        elif term_info['has_special_chars']:
+            return 'technical'
+        elif term_info['ngram_size'] > 1:
+            return 'phrase'
+        else:
+            return 'word'
+
+    def extract_from_segments(self, segments: List[str]) -> List[Dict[str, any]]:
+        """
+        Extract terms from a list of segments (e.g., translation project)
+
+        Args:
+            segments: List of source text segments
+
+        Returns:
+            List of extracted term dictionaries
+        """
+        # Combine all segments into one text
+        combined_text = '\n'.join(segments)
+        return self.extract_terms(combined_text)
+
+    def filter_by_frequency(self, terms: List[Dict], min_freq: int = None,
+                            max_freq: int = None) -> List[Dict]:
+        """Filter terms by frequency range"""
+        if min_freq is None:
+            min_freq = self.min_frequency
+
+        filtered = [t for t in terms if t['frequency'] >= min_freq]
+
+        if max_freq:
+            filtered = [t for t in filtered if t['frequency'] <= max_freq]
+
+        return filtered
+
+    def filter_by_type(self, terms: List[Dict], term_types: List[str]) -> List[Dict]:
+        """Filter terms by type"""
+        return [t for t in terms if t['type'] in term_types]
+
+    def filter_by_score(self, terms: List[Dict], min_score: float) -> List[Dict]:
+        """Filter terms by minimum score"""
+        return [t for t in terms if t['score'] >= min_score]
+
+    def deduplicate_terms(self, terms: List[Dict]) -> List[Dict]:
+        """Remove duplicate terms (case-insensitive)"""
+        seen = set()
+        unique = []
+
+        for term in terms:
+            term_lower = term['term'].lower()
+            if term_lower not in seen:
+                seen.add(term_lower)
+                unique.append(term)
+
+        return unique
+
+
+# Convenience function for quick extraction
+def extract_terms_from_text(text: str, source_lang: str = "en",
+                            min_frequency: int = 2, max_terms: int = 100) -> List[str]:
+    """
+    Quick term extraction - returns just the term strings
+
+    Args:
+        text: Source text
+        source_lang: Language code
+        min_frequency: Minimum occurrences
+        max_terms: Maximum number of terms to return
+
+    Returns:
+        List of term strings
+    """
+    extractor = TermExtractor(source_lang=source_lang, min_frequency=min_frequency)
+    terms = extractor.extract_terms(text)
+    return [t['term'] for t in terms[:max_terms]]
+
+
+# Example usage
+if __name__ == "__main__":
+    sample_text = """
+    The system architecture includes a database manager, termbase manager, and
+    translation memory. The database manager handles all database operations.
+    The termbase manager provides terminology management functionality.
+    Translation memory stores previously translated segments for reuse.
+    """
+
+    extractor = TermExtractor(source_lang="en", min_frequency=2)
+    terms = extractor.extract_terms(sample_text)
+
+    print("Extracted Terms:")
+    for term in terms[:10]:
+        print(f"  {term['term']:<30} freq={term['frequency']:<3} score={term['score']:.2f} type={term['type']}")