tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

tritopic/multilingual/__init__.py
@@ -0,0 +1,38 @@
+ """
+ TriTopic Multilingual Module
+
+ Provides language detection, tokenization, and stopwords for 60+ languages.
+ """
+
+ from .detection import (
+     detect_language,
+     detect_corpus_language,
+     normalize_language_code,
+     LANGUAGE_ALIASES,
+ )
+
+ from .tokenizers import (
+     TokenizerFactory,
+     tokenize_documents,
+ )
+
+ from .stopwords import (
+     get_stopwords,
+     get_available_languages,
+     STOPWORDS,
+ )
+
+ __all__ = [
+     # Detection
+     'detect_language',
+     'detect_corpus_language',
+     'normalize_language_code',
+     'LANGUAGE_ALIASES',
+     # Tokenizers
+     'TokenizerFactory',
+     'tokenize_documents',
+     # Stopwords
+     'get_stopwords',
+     'get_available_languages',
+     'STOPWORDS',
+ ]
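
For orientation, a hypothetical usage sketch of the API re-exported here. The import path tritopic.multilingual and the get_stopwords(code) signature are inferred from this diff, not from confirmed documentation:

from tritopic.multilingual import (
    detect_language,
    get_stopwords,
    normalize_language_code,
)

# Detect a single text, then fetch matching stopwords (assumed signature)
lang, confidence = detect_language("Der schnelle braune Fuchs springt.")
stopwords = get_stopwords(normalize_language_code(lang))
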
tritopic/multilingual/detection.py
@@ -0,0 +1,208 @@
+ """
+ Language Detection Module
+
+ Automatic language detection for document collections.
+ """
+
+ from typing import List, Dict, Tuple
+ from collections import Counter
+ import warnings
+
+
+ def detect_language(text: str) -> Tuple[str, float]:
+     """
+     Detect the language of a single text.
+
+     Parameters
+     ----------
+     text : str
+         The text to analyze
+
+     Returns
+     -------
+     Tuple[str, float]
+         Language code and confidence score
+     """
+     # Try fasttext first (faster and more accurate)
+     try:
+         from ftlangdetect import detect  # 'fasttext-langdetect' installs as 'ftlangdetect'
+         result = detect(text, low_memory=True)
+         return result['lang'], result['score']
+     except ImportError:
+         pass
+
+     # Fall back to langdetect
+     try:
+         from langdetect import detect_langs
+         results = detect_langs(text)
+         if results:
+             return results[0].lang, results[0].prob
+     except ImportError:
+         warnings.warn(
+             "No language detection library found. "
+             "Install 'langdetect' or 'fasttext-langdetect' for automatic language detection."
+         )
+     except Exception:
+         pass
+
+     return "en", 0.5  # Default to English with low confidence
+
+
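A quick sketch of the fallback chain above. The values are illustrative; the exact score depends on which backend is installed:

lang, score = detect_language("Le chat dort sur le canapé.")
print(lang, score)   # e.g. ('fr', 0.99) with ftlangdetect installed

# Very short or ambiguous input may still return a guess; with neither
# backend installed, the function warns once and returns ('en', 0.5).
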
+ def detect_corpus_language(
+     documents: List[str],
+     sample_size: int = 100,
+     min_length: int = 20
+ ) -> Dict:
+     """
+     Detect the dominant language(s) in a document collection.
+
+     Parameters
+     ----------
+     documents : List[str]
+         List of documents to analyze
+     sample_size : int
+         Number of documents to sample for detection
+     min_length : int
+         Minimum character length for a document to be considered
+
+     Returns
+     -------
+     Dict
+         Dictionary with detection results:
+         - 'dominant_language': str - The most common language
+         - 'confidence': float - Confidence in the dominant language
+         - 'distribution': Dict[str, float] - Language distribution
+         - 'is_multilingual': bool - True if multiple languages detected
+         - 'detected_languages': List[str] - All detected languages
+     """
+     import random
+
+     # Filter valid documents
+     valid_docs = [d for d in documents if isinstance(d, str) and len(d.strip()) >= min_length]
+
+     if not valid_docs:
+         return {
+             'dominant_language': 'en',
+             'confidence': 0.5,
+             'distribution': {'en': 1.0},
+             'is_multilingual': False,
+             'detected_languages': ['en']
+         }
+
+     # Sample documents
+     if len(valid_docs) > sample_size:
+         sample = random.sample(valid_docs, sample_size)
+     else:
+         sample = valid_docs
+
+     # Detect language for each document
+     detections = []
+     for doc in sample:
+         try:
+             # Take first 500 chars for efficiency
+             lang, conf = detect_language(doc[:500])
+             detections.append((lang, conf))
+         except Exception:
+             continue
+
+     if not detections:
+         return {
+             'dominant_language': 'en',
+             'confidence': 0.5,
+             'distribution': {'en': 1.0},
+             'is_multilingual': False,
+             'detected_languages': ['en']
+         }
+
+     # Calculate distribution
+     lang_counts = Counter(lang for lang, _ in detections)
+     total = len(detections)
+     distribution = {lang: count / total for lang, count in lang_counts.items()}
+
+     # Get dominant language
+     dominant_language = lang_counts.most_common(1)[0][0]
+     dominant_ratio = distribution[dominant_language]
+
+     # Calculate average confidence for dominant language
+     dominant_confidences = [conf for lang, conf in detections if lang == dominant_language]
+     avg_confidence = sum(dominant_confidences) / len(dominant_confidences) if dominant_confidences else 0.5
+
+     # Determine if corpus is multilingual
+     # If dominant language is less than 80% of corpus, consider it multilingual
+     is_multilingual = dominant_ratio < 0.8 and len(lang_counts) > 1
+
+     return {
+         'dominant_language': dominant_language,
+         'confidence': avg_confidence * dominant_ratio,  # Combined confidence
+         'distribution': distribution,
+         'is_multilingual': is_multilingual,
+         'detected_languages': list(lang_counts.keys())
+     }
+
+
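A sketch of how the result dict reads on a small mixed corpus. The distribution values depend on the backend and on sampling:

docs = [
    "The weather is lovely today and the sky is clear.",
    "Das Wetter ist heute wirklich schön und sonnig.",
    "The meeting was moved to Thursday afternoon at three.",
]
info = detect_corpus_language(docs)
info['dominant_language']   # e.g. 'en'
info['distribution']        # e.g. {'en': 0.67, 'de': 0.33}
info['is_multilingual']     # True: dominant share is below the 0.8 threshold
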
+ # Language code normalization
+ LANGUAGE_ALIASES = {
+     'german': 'de',
+     'deutsch': 'de',
+     'english': 'en',
+     'french': 'fr',
+     'français': 'fr',
+     'spanish': 'es',
+     'español': 'es',
+     'italian': 'it',
+     'italiano': 'it',
+     'portuguese': 'pt',
+     'português': 'pt',
+     'dutch': 'nl',
+     'chinese': 'zh',
+     'zh-cn': 'zh',
+     'zh-tw': 'zh',
+     'japanese': 'ja',
+     'korean': 'ko',
+     'russian': 'ru',
+     'arabic': 'ar',
+     'turkish': 'tr',
+     'polish': 'pl',
+     'swedish': 'sv',
+     'norwegian': 'no',
+     'danish': 'da',
+     'finnish': 'fi',
+     'greek': 'el',
+     'hebrew': 'he',
+     'thai': 'th',
+     'vietnamese': 'vi',
+     'indonesian': 'id',
+     'hindi': 'hi',
+     'czech': 'cs',
+     'hungarian': 'hu',
+     'romanian': 'ro',
+     'ukrainian': 'uk',
+ }
+
+
+ def normalize_language_code(language: str) -> str:
+     """
+     Normalize a language identifier to ISO 639-1 code.
+
+     Parameters
+     ----------
+     language : str
+         Language name or code
+
+     Returns
+     -------
+     str
+         ISO 639-1 language code
+     """
+     lang_lower = language.lower().strip()
+
+     # Assume two-letter inputs are already ISO 639-1 codes (not validated)
+     if len(lang_lower) == 2:
+         return lang_lower
+
+     # Check aliases
+     if lang_lower in LANGUAGE_ALIASES:
+         return LANGUAGE_ALIASES[lang_lower]
+
+     # Unknown identifier: fall back to the first two characters (e.g. 'de-DE' -> 'de')
+     return lang_lower[:2] if len(lang_lower) > 2 else lang_lower
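
Illustrative calls against the alias table above. Note that the two-letter early return does not check an actual ISO 639-1 table, so unknown two-letter inputs pass through unchanged:

normalize_language_code('Deutsch')  # 'de'  (alias lookup)
normalize_language_code('zh-TW')    # 'zh'  (alias lookup, lowercased first)
normalize_language_code('pt-BR')    # 'pt'  (first-two-characters fallback)
normalize_language_code('EN')       # 'en'  (two-letter passthrough)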