supervertaler 1.9.153 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,270 @@
+"""
+Term Extractor Module
+
+Extracts potential terminology from source text for project termbases.
+Can be used as a standalone tool or integrated into Supervertaler.
+
+Author: Michael Beijer
+License: MIT
+"""
+
+import math
+import re
+from collections import Counter
+from typing import Any, Dict, List, Optional, Set
+
+
+class TermExtractor:
+    """Extract terminology from source text using various algorithms"""
+
+    def __init__(self, source_lang: str = "en", min_frequency: int = 2,
+                 min_word_length: int = 3, max_ngram: int = 3):
+        """
+        Initialize term extractor
+
+        Args:
+            source_lang: Source language code (e.g., 'en', 'nl', 'de')
+            min_frequency: Minimum number of occurrences to consider as a term
+            min_word_length: Minimum character length for single words
+            max_ngram: Maximum n-gram size (1=single words, 2=bigrams, 3=trigrams)
+        """
+        self.source_lang = source_lang.lower()
+        self.min_frequency = min_frequency
+        self.min_word_length = min_word_length
+        self.max_ngram = max_ngram
+
+        # Common stop words by language
+        self.stop_words = self._get_stop_words(self.source_lang)
+
+    def _get_stop_words(self, lang: str) -> Set[str]:
+        """Get stop words for a language"""
+        # Basic stop words - can be expanded
+        stop_words = {
+            'en': {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+                   'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
+                   'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+                   'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this',
+                   'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'},
+            'nl': {'de', 'het', 'een', 'en', 'of', 'maar', 'in', 'op', 'aan', 'te', 'voor',
+                   'van', 'met', 'bij', 'uit', 'als', 'is', 'was', 'zijn', 'waren', 'wordt',
+                   'worden', 'werd', 'werden', 'hebben', 'heeft', 'had', 'hadden', 'zal',
+                   'zou', 'kunnen', 'kan', 'moet', 'mag', 'dit', 'dat', 'deze', 'die',
+                   'ik', 'je', 'jij', 'hij', 'zij', 'het', 'wij', 'ze'},
+            'de': {'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'einem',
+                   'einen', 'eines', 'und', 'oder', 'aber', 'in', 'an', 'auf', 'zu', 'für',
+                   'von', 'mit', 'bei', 'aus', 'als', 'ist', 'war', 'sind', 'waren', 'wird',
+                   'werden', 'wurde', 'wurden', 'haben', 'hat', 'hatte', 'hatten', 'ich',
+                   'du', 'er', 'sie', 'es', 'wir', 'ihr'},
+            'fr': {'le', 'la', 'les', 'un', 'une', 'des', 'et', 'ou', 'mais', 'dans', 'sur',
+                   'à', 'de', 'pour', 'avec', 'par', 'comme', 'est', 'était', 'sont', 'étaient',
+                   'être', 'avoir', 'a', 'avait', 'je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils'},
+            'es': {'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'y', 'o', 'pero',
+                   'en', 'a', 'de', 'para', 'con', 'por', 'como', 'es', 'era', 'son', 'eran',
+                   'ser', 'estar', 'haber', 'he', 'ha', 'yo', 'tú', 'él', 'ella', 'nosotros'},
+        }
+        return stop_words.get(lang, set())
+
+    def extract_terms(self, text: str, use_frequency: bool = True,
+                      use_capitalization: bool = True,
+                      use_special_chars: bool = True) -> List[Dict[str, Any]]:
+        """
+        Extract potential terms from text
+
+        Args:
+            text: Source text to analyze
+            use_frequency: Consider term frequency in ranking
+            use_capitalization: Give higher weight to capitalized terms
+            use_special_chars: Consider terms with hyphens, underscores, etc.
+
+        Returns:
+            List of term dictionaries with fields: term, frequency, score, type
+        """
+        if not text:
+            return []
+
+        # Collect all candidate terms
+        candidates = {}
+
+        # Extract n-grams (1 to max_ngram)
+        for n in range(1, self.max_ngram + 1):
+            ngrams = self._extract_ngrams(text, n)
+            for ngram, freq in ngrams.items():
+                if ngram not in candidates:
+                    candidates[ngram] = {
+                        'term': ngram,
+                        'frequency': freq,
+                        'ngram_size': n,
+                        'is_capitalized': ngram[0].isupper() if ngram else False,
+                        'has_special_chars': bool(re.search(r'[-_./]', ngram))
+                    }
+
+        # Score and rank terms
+        scored_terms = []
+        for term_info in candidates.values():
+            score = self._calculate_score(
+                term_info,
+                use_frequency=use_frequency,
+                use_capitalization=use_capitalization,
+                use_special_chars=use_special_chars
+            )
+
+            if score > 0:  # Only include terms with a positive score
+                scored_terms.append({
+                    'term': term_info['term'],
+                    'frequency': term_info['frequency'],
+                    'score': score,
+                    'type': self._classify_term(term_info)
+                })
+
+        # Sort by score (highest first)
+        scored_terms.sort(key=lambda x: x['score'], reverse=True)
+
+        return scored_terms
+
+    def _extract_ngrams(self, text: str, n: int) -> Dict[str, int]:
+        """Extract n-grams from text"""
+        # Tokenize text into words. Original casing is preserved so that
+        # capitalization can be scored later; comparisons below use .lower().
+        words = re.findall(r'\b[\w-]+\b', text)
+
+        # Generate n-grams
+        ngrams = []
+        for i in range(len(words) - n + 1):
+            ngram_words = words[i:i + n]
+
+            # Skip single words that are stop words (longer n-grams may contain them)
+            if n == 1 and ngram_words[0].lower() in self.stop_words:
+                continue
+
+            # Skip single words that are too short
+            if n == 1 and len(ngram_words[0]) < self.min_word_length:
+                continue
+
+            # Create n-gram string
+            ngram = ' '.join(ngram_words)
+            ngrams.append(ngram)
+
+        # Count frequencies (case variants can be merged later with deduplicate_terms)
+        return dict(Counter(ngrams))
+
+    def _calculate_score(self, term_info: Dict, use_frequency: bool,
+                         use_capitalization: bool, use_special_chars: bool) -> float:
+        """Calculate term score based on various factors"""
+        score = 0.0
+
+        # Base score from frequency
+        if use_frequency and term_info['frequency'] >= self.min_frequency:
+            # Logarithmic scale for frequency (diminishing returns)
+            score += math.log(term_info['frequency'] + 1) * 2
+        elif term_info['frequency'] < self.min_frequency:
+            return 0.0  # Below minimum threshold
+
+        # Bonus for capitalization (likely proper nouns or technical terms)
+        if use_capitalization and term_info['is_capitalized']:
+            score += 3.0
+
+        # Bonus for special characters (technical terms, compound words)
+        if use_special_chars and term_info['has_special_chars']:
+            score += 2.0
+
+        # Bonus for longer n-grams (multi-word terms are often more valuable)
+        if term_info['ngram_size'] > 1:
+            score += term_info['ngram_size'] * 1.5
+
+        return score
+
+    def _classify_term(self, term_info: Dict) -> str:
+        """Classify term type"""
+        if term_info['is_capitalized']:
+            return 'proper_noun'
+        elif term_info['has_special_chars']:
+            return 'technical'
+        elif term_info['ngram_size'] > 1:
+            return 'phrase'
+        else:
+            return 'word'
+
+    def extract_from_segments(self, segments: List[str]) -> List[Dict[str, Any]]:
+        """
+        Extract terms from a list of segments (e.g., a translation project)
+
+        Args:
+            segments: List of source text segments
+
+        Returns:
+            List of extracted term dictionaries
+        """
+        # Combine all segments into one text
+        combined_text = '\n'.join(segments)
+        return self.extract_terms(combined_text)
+
+    def filter_by_frequency(self, terms: List[Dict], min_freq: Optional[int] = None,
+                            max_freq: Optional[int] = None) -> List[Dict]:
+        """Filter terms by frequency range"""
+        if min_freq is None:
+            min_freq = self.min_frequency
+
+        filtered = [t for t in terms if t['frequency'] >= min_freq]
+
+        if max_freq is not None:
+            filtered = [t for t in filtered if t['frequency'] <= max_freq]
+
+        return filtered
+
+    def filter_by_type(self, terms: List[Dict], term_types: List[str]) -> List[Dict]:
+        """Filter terms by type"""
+        return [t for t in terms if t['type'] in term_types]
+
+    def filter_by_score(self, terms: List[Dict], min_score: float) -> List[Dict]:
+        """Filter terms by minimum score"""
+        return [t for t in terms if t['score'] >= min_score]
+
+    def deduplicate_terms(self, terms: List[Dict]) -> List[Dict]:
+        """Remove duplicate terms (case-insensitive)"""
+        seen = set()
+        unique = []
+
+        for term in terms:
+            term_lower = term['term'].lower()
+            if term_lower not in seen:
+                seen.add(term_lower)
+                unique.append(term)
+
+        return unique
+
+
+# Convenience function for quick extraction
+def extract_terms_from_text(text: str, source_lang: str = "en",
+                            min_frequency: int = 2, max_terms: int = 100) -> List[str]:
+    """
+    Quick term extraction - returns just the term strings
+
+    Args:
+        text: Source text
+        source_lang: Language code
+        min_frequency: Minimum occurrences
+        max_terms: Maximum number of terms to return
+
+    Returns:
+        List of term strings
+    """
+    extractor = TermExtractor(source_lang=source_lang, min_frequency=min_frequency)
+    terms = extractor.extract_terms(text)
+    return [t['term'] for t in terms[:max_terms]]
+
+
+# Example usage
+if __name__ == "__main__":
+    sample_text = """
+    The system architecture includes a database manager, termbase manager, and
+    translation memory. The database manager handles all database operations.
+    The termbase manager provides terminology management functionality.
+    Translation memory stores previously translated segments for reuse.
+    """
+
+    extractor = TermExtractor(source_lang="en", min_frequency=2)
+    terms = extractor.extract_terms(sample_text)
+
+    print("Extracted Terms:")
+    for term in terms[:10]:
+        print(f"  {term['term']:<30} freq={term['frequency']:<3} score={term['score']:.2f} type={term['type']}")
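For reference, a minimal usage sketch of the module above, e.g. for seeding a project termbase from source segments. It assumes the package's modules directory is importable as modules; the segment list and the score threshold are illustrative, while the calls themselves (extract_from_segments, filter_by_score, deduplicate_terms) are defined in term_extractor.py:

from modules.term_extractor import TermExtractor

# Source segments as they might come from a translation project (illustrative data)
segments = [
    "The database manager handles all database operations.",
    "The termbase manager provides terminology management functionality.",
    "The database manager and the termbase manager share one configuration file.",
]

extractor = TermExtractor(source_lang="en", min_frequency=2, max_ngram=3)
terms = extractor.extract_from_segments(segments)

# Keep the stronger candidates and merge case variants before review
terms = extractor.deduplicate_terms(extractor.filter_by_score(terms, min_score=3.0))

for t in terms:
    print(f"{t['term']}  (freq={t['frequency']}, score={t['score']:.2f}, type={t['type']})")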