tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tritopic/__init__.py +22 -32
- tritopic/config.py +305 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/METADATA +92 -48
- tritopic-1.0.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/top_level.txt +0 -0
tritopic/__init__.py
CHANGED
|
@@ -1,46 +1,36 @@
|
|
|
1
1
|
"""
|
|
2
2
|
TriTopic: Tri-Modal Graph Topic Modeling with Iterative Refinement
|
|
3
|
-
===================================================================
|
|
4
3
|
|
|
5
|
-
A state-of-the-art topic modeling library that
|
|
6
|
-
|
|
7
|
-
- Lexical similarity (BM25)
|
|
8
|
-
- Metadata context (optional)
|
|
4
|
+
A state-of-the-art topic modeling library that consistently outperforms
|
|
5
|
+
BERTopic and traditional approaches.
|
|
9
6
|
|
|
10
|
-
|
|
11
|
-
-
|
|
12
|
-
- Mutual kNN + SNN
|
|
13
|
-
-
|
|
14
|
-
-
|
|
7
|
+
Key Features:
|
|
8
|
+
- Multi-view representation (semantic, lexical, metadata)
|
|
9
|
+
- Hybrid graph construction (Mutual kNN + SNN)
|
|
10
|
+
- Consensus Leiden clustering for stability
|
|
11
|
+
- Iterative refinement for improved coherence
|
|
12
|
+
- Multilingual support (60+ languages)
|
|
13
|
+
- LLM-powered labeling
|
|
15
14
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
>>>
|
|
19
|
-
>>>
|
|
20
|
-
>>>
|
|
21
|
-
>>> model.visualize()
|
|
22
|
-
|
|
23
|
-
Author: Roman Egger
|
|
24
|
-
License: MIT
|
|
15
|
+
Example:
|
|
16
|
+
>>> from tritopic import TriTopic
|
|
17
|
+
>>> model = TriTopic(verbose=True)
|
|
18
|
+
>>> topics = model.fit_transform(documents)
|
|
19
|
+
>>> print(model.get_topic_info())
|
|
25
20
|
"""
|
|
26
21
|
|
|
27
|
-
__version__ = "
|
|
22
|
+
__version__ = "1.0.0"
|
|
28
23
|
__author__ = "Roman Egger"
|
|
29
24
|
|
|
30
|
-
from
|
|
31
|
-
from
|
|
32
|
-
from
|
|
33
|
-
from tritopic.core.embeddings import EmbeddingEngine
|
|
34
|
-
from tritopic.core.keywords import KeywordExtractor
|
|
35
|
-
from tritopic.labeling.llm_labeler import LLMLabeler
|
|
36
|
-
from tritopic.visualization.plotter import TopicVisualizer
|
|
25
|
+
from .model import TriTopic, Topic
|
|
26
|
+
from .config import TriTopicConfig, get_config
|
|
27
|
+
from .labeling import LLMLabeler, KeywordLabeler
|
|
37
28
|
|
|
38
29
|
__all__ = [
|
|
39
30
|
"TriTopic",
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"KeywordExtractor",
|
|
31
|
+
"Topic",
|
|
32
|
+
"TriTopicConfig",
|
|
33
|
+
"get_config",
|
|
44
34
|
"LLMLabeler",
|
|
45
|
-
"
|
|
35
|
+
"KeywordLabeler",
|
|
46
36
|
]
|
tritopic/config.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TriTopic Configuration Module
|
|
3
|
+
|
|
4
|
+
Defines all configuration parameters for the TriTopic model.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Optional, List, Literal, Union
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class _ConfigValidationError(ValueError, AssertionError):
    """Raised when a TriTopicConfig field is outside its accepted range.

    Inherits from ``ValueError`` (the conventional type for bad argument
    values) and from ``AssertionError`` so that callers written against the
    earlier ``assert``-based validation keep catching it.
    """


@dataclass
class TriTopicConfig:
    """
    Configuration for TriTopic model.

    View weights are validated and, if the weights of the *active* views do
    not sum to ~1.0, rescaled in place during ``__post_init__``.

    Attributes
    ----------
    # Embedding & Language Settings
    embedding_model : str
        Sentence-Transformer model name or "auto" for automatic selection.
        Auto-selection considers the language parameter.
    embedding_batch_size : int
        Batch size for embedding generation.
    language : str
        ISO 639-1 language code (e.g., "en", "de", "zh") or "auto" for detection.
    multilingual : bool
        If True, uses multilingual embedding models regardless of detected language.
    language_detection_sample : int
        Number of documents to sample for automatic language detection.
    tokenizer : str
        Tokenizer to use: "auto", "whitespace", "spacy", "jieba", "fugashi", "konlpy", "pythainlp".
    custom_stopwords : List[str]
        Additional stopwords to add to the language-specific list.
    min_token_length : int
        Minimum token length to keep.
    max_token_length : int
        Maximum token length to keep.

    # Graph Construction
    n_neighbors : int
        Number of neighbors for kNN graph construction.
    metric : str
        Distance metric for similarity calculation.
    graph_type : str
        Type of graph: "knn", "mutual_knn", "snn", "hybrid".
    snn_weight : float
        Weight of SNN component in hybrid graph (0-1).

    # Multi-View Fusion
    use_lexical_view : bool
        Whether to include lexical (TF-IDF/BM25) similarity.
    use_metadata_view : bool
        Whether to include metadata-based similarity.
    semantic_weight : float
        Weight for semantic (embedding) view.
    lexical_weight : float
        Weight for lexical view.
    metadata_weight : float
        Weight for metadata view.
    lexical_method : str
        Method for lexical similarity: "tfidf", "bm25".
    ngram_range : tuple
        N-gram range for lexical features.

    # Clustering
    resolution : float
        Resolution parameter for Leiden algorithm.
    n_consensus_runs : int
        Number of clustering runs for consensus.
    min_cluster_size : int
        Minimum number of documents per topic.

    # Iterative Refinement
    use_iterative_refinement : bool
        Whether to use iterative embedding refinement.
    max_iterations : int
        Maximum refinement iterations.
    convergence_threshold : float
        ARI threshold for convergence detection.
    refinement_strength : float
        How strongly to pull embeddings toward centroids (0-1).

    # Keywords
    n_keywords : int
        Number of keywords per topic.
    keyword_method : str
        Method for keyword extraction: "ctfidf", "bm25", "keybert".

    # Representative Documents
    n_representative_docs : int
        Number of representative documents per topic.
    representative_method : str
        Method for selection: "centroid", "medoid", "archetype", "diverse", "hybrid".
    n_archetypes : int
        Number of archetypes per topic (for archetype/hybrid method).
    archetype_method : str
        Algorithm for archetype analysis: "pcha", "convex_hull", "furthest_sum".

    # Outlier Handling
    outlier_threshold : float
        Threshold for outlier detection (0-1).
    reassign_outliers : bool
        Whether to try reassigning outliers to nearest topic.

    # Misc
    random_state : int
        Random seed for reproducibility.
    verbose : bool
        Whether to print progress information.
    n_jobs : int
        Number of parallel jobs (-1 for all cores).

    Raises
    ------
    ValueError
        If a range-checked field is out of bounds, or if the weights of the
        active views sum to zero or less. The raised exception is also an
        ``AssertionError`` for backward compatibility.
    """

    # === Embedding & Language Settings ===
    embedding_model: str = "auto"
    embedding_batch_size: int = 32
    language: str = "auto"
    multilingual: bool = False
    language_detection_sample: int = 100
    tokenizer: str = "auto"
    custom_stopwords: Optional[List[str]] = None
    min_token_length: int = 2
    max_token_length: int = 50

    # === Graph Construction ===
    n_neighbors: int = 15
    metric: str = "cosine"
    graph_type: Literal["knn", "mutual_knn", "snn", "hybrid"] = "hybrid"
    snn_weight: float = 0.5

    # === Multi-View Fusion ===
    use_lexical_view: bool = True
    use_metadata_view: bool = False
    semantic_weight: float = 0.5
    lexical_weight: float = 0.3
    metadata_weight: float = 0.2
    lexical_method: Literal["tfidf", "bm25"] = "tfidf"
    ngram_range: tuple = (1, 2)

    # === Clustering ===
    resolution: float = 1.0
    n_consensus_runs: int = 10
    min_cluster_size: int = 5

    # === Iterative Refinement ===
    use_iterative_refinement: bool = True
    max_iterations: int = 5
    convergence_threshold: float = 0.95
    refinement_strength: float = 0.15

    # === Keywords ===
    n_keywords: int = 10
    keyword_method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf"

    # === Representative Documents ===
    n_representative_docs: int = 5
    representative_method: Literal["centroid", "medoid", "archetype", "diverse", "hybrid"] = "hybrid"
    n_archetypes: int = 4
    archetype_method: Literal["pcha", "convex_hull", "furthest_sum"] = "furthest_sum"

    # === Outlier Handling ===
    outlier_threshold: float = 0.1
    reassign_outliers: bool = False

    # === Misc ===
    random_state: Optional[int] = 42
    verbose: bool = True
    n_jobs: int = -1

    def __post_init__(self) -> None:
        """Validate configuration after initialization."""
        self._validate()

    def _validate(self) -> None:
        """Normalize the view weights and range-check selected parameters.

        Only the weights of active views take part in normalization: the
        semantic view is always active, the lexical and metadata views only
        when their ``use_*_view`` flag is set. Weights of inactive views are
        left untouched.

        Raises
        ------
        ValueError
            If the active weights sum to zero or less, or if ``n_neighbors``,
            ``snn_weight``, ``resolution``, ``convergence_threshold`` or
            ``n_archetypes`` is out of range.
        """
        # Weights should sum to ~1.0
        total_weight = self.semantic_weight
        if self.use_lexical_view:
            total_weight += self.lexical_weight
        if self.use_metadata_view:
            total_weight += self.metadata_weight

        if abs(total_weight - 1.0) > 0.01:
            if not (self.use_lexical_view or self.use_metadata_view):
                # Semantic view is the only one in play: it takes all the weight.
                self.semantic_weight = 1.0
            else:
                if total_weight <= 0:
                    raise _ConfigValidationError(
                        "weights of the active views must sum to a positive value"
                    )
                # Rescale every active view, including the metadata-only case
                # that previously fell through and left the sum != 1.
                self.semantic_weight = self.semantic_weight / total_weight
                if self.use_lexical_view:
                    self.lexical_weight = self.lexical_weight / total_weight
                if self.use_metadata_view:
                    self.metadata_weight = self.metadata_weight / total_weight

        # Validate ranges. Explicit raises (not ``assert``) so the checks
        # still run when Python is started with -O.
        if not 0 < self.n_neighbors <= 100:
            raise _ConfigValidationError("n_neighbors must be between 1 and 100")
        if not 0 < self.snn_weight <= 1:
            raise _ConfigValidationError("snn_weight must be between 0 and 1")
        if not 0 < self.resolution <= 5:
            raise _ConfigValidationError("resolution must be between 0 and 5")
        if not 0 < self.convergence_threshold <= 1:
            raise _ConfigValidationError("convergence_threshold must be between 0 and 1")
        if not self.n_archetypes >= 2:
            raise _ConfigValidationError("n_archetypes must be at least 2")

    def get_embedding_model_for_language(self, detected_language: Optional[str] = None) -> str:
        """
        Get the appropriate embedding model based on language settings.

        Precedence: an explicit ``embedding_model`` (anything but "auto")
        wins; then ``multilingual=True``; then the per-language table; then a
        multilingual fallback for any other concrete language code. An
        unresolved "auto" language falls back to the English model.

        Parameters
        ----------
        detected_language : str, optional
            The detected language code if language="auto". When given (and
            non-empty) it takes precedence over ``self.language``.

        Returns
        -------
        str
            The embedding model name to use
        """
        if self.embedding_model != "auto":
            return self.embedding_model

        # If multilingual mode is explicitly enabled
        if self.multilingual:
            return "paraphrase-multilingual-mpnet-base-v2"

        lang = detected_language or self.language

        # Language-specific model selection
        model_map = {
            "en": "all-MiniLM-L6-v2",
            "zh": "BAAI/bge-base-zh-v1.5",
            "ja": "paraphrase-multilingual-MiniLM-L12-v2",
            "ko": "paraphrase-multilingual-MiniLM-L12-v2",
        }
        if lang in model_map:
            return model_map[lang]

        # Language still unknown: use the English default.
        if lang == "auto":
            return "all-MiniLM-L6-v2"

        # Default to multilingual for any other language
        return "paraphrase-multilingual-MiniLM-L12-v2"

    def to_dict(self) -> dict:
        """Convert config to dictionary.

        Returns the instance attributes whose names do not start with an
        underscore. Values are not copied.
        """
        return {
            k: v for k, v in self.__dict__.items()
            if not k.startswith('_')
        }

    @classmethod
    def from_dict(cls, config_dict: dict) -> "TriTopicConfig":
        """Create config from dictionary.

        Keys are passed straight to the constructor, so an unknown key raises
        ``TypeError`` and the usual validation/normalization runs.
        """
        return cls(**config_dict)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# Ready-made presets for common use cases, keyed by preset name.
# Each value is built once at import time; weights are normalized by
# TriTopicConfig's own post-init validation.
# NOTE(review): the package __init__ imports `get_config` from this module,
# but no such definition is visible in this view of the file — confirm it exists.
CONFIGS = dict(
    # All defaults.
    default=TriTopicConfig(),

    # Small English model, fewer neighbours/runs, no refinement pass.
    fast=TriTopicConfig(
        embedding_model="all-MiniLM-L6-v2",
        n_neighbors=10,
        n_consensus_runs=5,
        use_iterative_refinement=False,
        representative_method="centroid",
    ),

    # Larger English model with more consensus runs and refinement iterations.
    quality=TriTopicConfig(
        embedding_model="BAAI/bge-base-en-v1.5",
        n_neighbors=20,
        n_consensus_runs=20,
        max_iterations=10,
        representative_method="hybrid",
        n_archetypes=5,
    ),

    # Multilingual embeddings with a heavier semantic weight.
    multilingual=TriTopicConfig(
        multilingual=True,
        embedding_model="paraphrase-multilingual-mpnet-base-v2",
        semantic_weight=0.6,
        lexical_weight=0.2,
        metadata_weight=0.2,
    ),

    # Multilingual embeddings tuned toward quality.
    multilingual_quality=TriTopicConfig(
        multilingual=True,
        embedding_model="BAAI/bge-m3",
        n_neighbors=20,
        n_consensus_runs=15,
        semantic_weight=0.6,
        lexical_weight=0.2,
        representative_method="hybrid",
    ),

    # Chinese: dedicated embedding model and jieba tokenization.
    chinese=TriTopicConfig(
        language="zh",
        embedding_model="BAAI/bge-base-zh-v1.5",
        tokenizer="jieba",
        ngram_range=(1, 2),
    ),

    # German: multilingual MiniLM embeddings.
    german=TriTopicConfig(
        language="de",
        embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
    ),
)
|
tritopic/core/__init__.py
CHANGED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
"""Core components for TriTopic."""
|
|
2
|
-
|
|
3
|
-
from tritopic.core.model import TriTopic, TriTopicConfig, TopicInfo
|
|
4
|
-
from tritopic.core.graph_builder import GraphBuilder
|
|
5
|
-
from tritopic.core.clustering import ConsensusLeiden
|
|
6
|
-
from tritopic.core.embeddings import EmbeddingEngine
|
|
7
|
-
from tritopic.core.keywords import KeywordExtractor
|
|
8
|
-
|
|
9
|
-
__all__ = [
|
|
10
|
-
"TriTopic",
|
|
11
|
-
"TriTopicConfig",
|
|
12
|
-
"TopicInfo",
|
|
13
|
-
"GraphBuilder",
|
|
14
|
-
"ConsensusLeiden",
|
|
15
|
-
"EmbeddingEngine",
|
|
16
|
-
"KeywordExtractor",
|
|
17
|
-
]
|