tritopic-0.1.0-py3-none-any.whl → tritopic-1.0.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- tritopic/__init__.py +22 -32
- tritopic/config.py +305 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/METADATA +92 -48
- tritopic-1.0.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/top_level.txt +0 -0
tritopic/multilingual/__init__.py
@@ -0,0 +1,38 @@
+"""
+TriTopic Multilingual Module
+
+Provides language detection, tokenization, and stopwords for 60+ languages.
+"""
+
+from .detection import (
+    detect_language,
+    detect_corpus_language,
+    normalize_language_code,
+    LANGUAGE_ALIASES,
+)
+
+from .tokenizers import (
+    TokenizerFactory,
+    tokenize_documents,
+)
+
+from .stopwords import (
+    get_stopwords,
+    get_available_languages,
+    STOPWORDS,
+)
+
+__all__ = [
+    # Detection
+    'detect_language',
+    'detect_corpus_language',
+    'normalize_language_code',
+    'LANGUAGE_ALIASES',
+    # Tokenizers
+    'TokenizerFactory',
+    'tokenize_documents',
+    # Stopwords
+    'get_stopwords',
+    'get_available_languages',
+    'STOPWORDS',
+]
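The new __init__.py above (+38 lines) simply re-exports the detection, tokenizer, and stopword helpers at the subpackage level. A minimal import sketch against the 1.0.0 wheel, using only the names listed in __all__ (anything about their behaviour beyond what this diff shows is not assumed here):

# Sketch only: these names come from the __all__ list above.
from tritopic.multilingual import (
    detect_language,          # single-text detection -> (language code, confidence)
    detect_corpus_language,   # corpus-level detection -> summary dict
    normalize_language_code,  # free-form language name -> ISO 639-1 code
    get_stopwords,            # per-language stopword lists
    TokenizerFactory,         # language-aware tokenizers
)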
tritopic/multilingual/detection.py
@@ -0,0 +1,208 @@
+"""
+Language Detection Module
+
+Automatic language detection for document collections.
+"""
+
+from typing import List, Dict, Tuple, Optional
+from collections import Counter
+import warnings
+
+
+def detect_language(text: str) -> Tuple[str, float]:
+    """
+    Detect the language of a single text.
+
+    Parameters
+    ----------
+    text : str
+        The text to analyze
+
+    Returns
+    -------
+    Tuple[str, float]
+        Language code and confidence score
+    """
+    # Try fasttext first (faster and more accurate)
+    try:
+        from fasttext_langdetect import detect
+        result = detect(text, low_memory=True)
+        return result['lang'], result['score']
+    except ImportError:
+        pass
+
+    # Fall back to langdetect
+    try:
+        from langdetect import detect_langs
+        results = detect_langs(text)
+        if results:
+            return results[0].lang, results[0].prob
+    except ImportError:
+        warnings.warn(
+            "No language detection library found. "
+            "Install 'langdetect' or 'fasttext-langdetect' for automatic language detection."
+        )
+    except Exception:
+        pass
+
+    return "en", 0.5  # Default to English with low confidence
+
+
+def detect_corpus_language(
+    documents: List[str],
+    sample_size: int = 100,
+    min_length: int = 20
+) -> Dict:
+    """
+    Detect the dominant language(s) in a document collection.
+
+    Parameters
+    ----------
+    documents : List[str]
+        List of documents to analyze
+    sample_size : int
+        Number of documents to sample for detection
+    min_length : int
+        Minimum character length for a document to be considered
+
+    Returns
+    -------
+    Dict
+        Dictionary with detection results:
+        - 'dominant_language': str - The most common language
+        - 'confidence': float - Confidence in the dominant language
+        - 'distribution': Dict[str, float] - Language distribution
+        - 'is_multilingual': bool - True if multiple languages detected
+        - 'detected_languages': List[str] - All detected languages
+    """
+    import random
+
+    # Filter valid documents
+    valid_docs = [d for d in documents if isinstance(d, str) and len(d.strip()) >= min_length]
+
+    if not valid_docs:
+        return {
+            'dominant_language': 'en',
+            'confidence': 0.5,
+            'distribution': {'en': 1.0},
+            'is_multilingual': False,
+            'detected_languages': ['en']
+        }
+
+    # Sample documents
+    if len(valid_docs) > sample_size:
+        sample = random.sample(valid_docs, sample_size)
+    else:
+        sample = valid_docs
+
+    # Detect language for each document
+    detections = []
+    for doc in sample:
+        try:
+            # Take first 500 chars for efficiency
+            lang, conf = detect_language(doc[:500])
+            detections.append((lang, conf))
+        except Exception:
+            continue
+
+    if not detections:
+        return {
+            'dominant_language': 'en',
+            'confidence': 0.5,
+            'distribution': {'en': 1.0},
+            'is_multilingual': False,
+            'detected_languages': ['en']
+        }
+
+    # Calculate distribution
+    lang_counts = Counter(lang for lang, _ in detections)
+    total = len(detections)
+    distribution = {lang: count / total for lang, count in lang_counts.items()}
+
+    # Get dominant language
+    dominant_language = lang_counts.most_common(1)[0][0]
+    dominant_ratio = distribution[dominant_language]
+
+    # Calculate average confidence for dominant language
+    dominant_confidences = [conf for lang, conf in detections if lang == dominant_language]
+    avg_confidence = sum(dominant_confidences) / len(dominant_confidences) if dominant_confidences else 0.5
+
+    # Determine if corpus is multilingual
+    # If dominant language is less than 80% of corpus, consider it multilingual
+    is_multilingual = dominant_ratio < 0.8 and len(lang_counts) > 1
+
+    return {
+        'dominant_language': dominant_language,
+        'confidence': avg_confidence * dominant_ratio,  # Combined confidence
+        'distribution': distribution,
+        'is_multilingual': is_multilingual,
+        'detected_languages': list(lang_counts.keys())
+    }
+
+
+# Language code normalization
+LANGUAGE_ALIASES = {
+    'german': 'de',
+    'deutsch': 'de',
+    'english': 'en',
+    'french': 'fr',
+    'français': 'fr',
+    'spanish': 'es',
+    'español': 'es',
+    'italian': 'it',
+    'italiano': 'it',
+    'portuguese': 'pt',
+    'português': 'pt',
+    'dutch': 'nl',
+    'chinese': 'zh',
+    'zh-cn': 'zh',
+    'zh-tw': 'zh',
+    'japanese': 'ja',
+    'korean': 'ko',
+    'russian': 'ru',
+    'arabic': 'ar',
+    'turkish': 'tr',
+    'polish': 'pl',
+    'swedish': 'sv',
+    'norwegian': 'no',
+    'danish': 'da',
+    'finnish': 'fi',
+    'greek': 'el',
+    'hebrew': 'he',
+    'thai': 'th',
+    'vietnamese': 'vi',
+    'indonesian': 'id',
+    'hindi': 'hi',
+    'czech': 'cs',
+    'hungarian': 'hu',
+    'romanian': 'ro',
+    'ukrainian': 'uk',
+}
+
+
+def normalize_language_code(language: str) -> str:
+    """
+    Normalize a language identifier to ISO 639-1 code.
+
+    Parameters
+    ----------
+    language : str
+        Language name or code
+
+    Returns
+    -------
+    str
+        ISO 639-1 language code
+    """
+    lang_lower = language.lower().strip()
+
+    # Check if it's already a valid ISO code
+    if len(lang_lower) == 2:
+        return lang_lower
+
+    # Check aliases
+    if lang_lower in LANGUAGE_ALIASES:
+        return LANGUAGE_ALIASES[lang_lower]
+
+    # Return as-is if unknown
+    return lang_lower[:2] if len(lang_lower) > 2 else lang_lower
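The detection module added above exposes three public helpers. The sketch below is based only on the signatures and return keys shown in this diff; the sample documents are placeholders, and the detected languages and confidence values depend on which backend (fasttext-langdetect or langdetect) is installed.

from tritopic.multilingual.detection import detect_corpus_language, normalize_language_code

# Placeholder corpus; each document exceeds the default min_length of 20 characters.
docs = [
    "The quick brown fox jumps over the lazy dog.",
    "Der schnelle braune Fuchs springt über den faulen Hund.",
    "Le renard brun rapide saute par-dessus le chien paresseux.",
]

info = detect_corpus_language(docs, sample_size=100, min_length=20)
print(info['dominant_language'], info['confidence'])  # backend-dependent; confidence = avg score * dominant ratio
print(info['is_multilingual'], info['distribution'])  # multilingual once the dominant language drops below 80%

# normalize_language_code is a pure string mapping, so these follow directly from the code above:
assert normalize_language_code("Deutsch") == "de"
assert normalize_language_code("zh-TW") == "zh"
assert normalize_language_code("en") == "en"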