tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tritopic/__init__.py CHANGED
@@ -1,46 +1,36 @@
 """
 TriTopic: Tri-Modal Graph Topic Modeling with Iterative Refinement
-===================================================================
 
-A state-of-the-art topic modeling library that combines:
-- Semantic embeddings (Sentence-BERT, Instructor, BGE)
-- Lexical similarity (BM25)
-- Metadata context (optional)
+A state-of-the-art topic modeling library that consistently outperforms
+BERTopic and traditional approaches.
 
-With advanced techniques:
-- Leiden clustering with consensus
-- Mutual kNN + SNN graph construction
-- Iterative refinement loop
-- LLM-powered topic labeling
+Key Features:
+- Multi-view representation (semantic, lexical, metadata)
+- Hybrid graph construction (Mutual kNN + SNN)
+- Consensus Leiden clustering for stability
+- Iterative refinement for improved coherence
+- Multilingual support (60+ languages)
+- LLM-powered labeling
 
-Basic usage:
------------
->>> from tritopic import TriTopic
->>> model = TriTopic()
->>> topics = model.fit_transform(documents)
->>> model.visualize()
-
-Author: Roman Egger
-License: MIT
+Example:
+>>> from tritopic import TriTopic
+>>> model = TriTopic(verbose=True)
+>>> topics = model.fit_transform(documents)
+>>> print(model.get_topic_info())
 """
 
-__version__ = "0.1.0"
+__version__ = "1.0.0"
 __author__ = "Roman Egger"
 
-from tritopic.core.model import TriTopic
-from tritopic.core.graph_builder import GraphBuilder
-from tritopic.core.clustering import ConsensusLeiden
-from tritopic.core.embeddings import EmbeddingEngine
-from tritopic.core.keywords import KeywordExtractor
-from tritopic.labeling.llm_labeler import LLMLabeler
-from tritopic.visualization.plotter import TopicVisualizer
+from .model import TriTopic, Topic
+from .config import TriTopicConfig, get_config
+from .labeling import LLMLabeler, KeywordLabeler
 
 __all__ = [
     "TriTopic",
-    "GraphBuilder",
-    "ConsensusLeiden",
-    "EmbeddingEngine",
-    "KeywordExtractor",
+    "Topic",
+    "TriTopicConfig",
+    "get_config",
    "LLMLabeler",
-    "TopicVisualizer",
+    "KeywordLabeler",
 ]
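The 1.0.0 top level now re-exports a flat API (`TriTopic`, `Topic`, `TriTopicConfig`, `get_config`, `LLMLabeler`, `KeywordLabeler`) from relative modules. A minimal sketch of driving that surface with an explicit config — the `config=` keyword on `TriTopic` is an assumption, as this diff only shows the docstring example with `verbose=True`:

    from tritopic import TriTopic, TriTopicConfig

    documents = [
        "Graph clustering groups similar nodes together.",
        "Transformers produce dense sentence embeddings.",
    ]  # hypothetical toy corpus
    config = TriTopicConfig(language="en", verbose=True)
    model = TriTopic(config=config)  # config= kwarg is an assumption, not shown in this diff
    topics = model.fit_transform(documents)   # confirmed by the new docstring
    print(model.get_topic_info())             # confirmed by the new docstring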
tritopic/config.py ADDED
@@ -0,0 +1,305 @@
+"""
+TriTopic Configuration Module
+
+Defines all configuration parameters for the TriTopic model.
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional, List, Literal, Union
+
+
+@dataclass
+class TriTopicConfig:
+    """
+    Configuration for TriTopic model.
+
+    Attributes
+    ----------
+    # Embedding & Language Settings
+    embedding_model : str
+        Sentence-Transformer model name or "auto" for automatic selection.
+        Auto-selection considers the language parameter.
+    embedding_batch_size : int
+        Batch size for embedding generation.
+    language : str
+        ISO 639-1 language code (e.g., "en", "de", "zh") or "auto" for detection.
+    multilingual : bool
+        If True, uses multilingual embedding models regardless of detected language.
+    language_detection_sample : int
+        Number of documents to sample for automatic language detection.
+    tokenizer : str
+        Tokenizer to use: "auto", "whitespace", "spacy", "jieba", "fugashi", "konlpy", "pythainlp".
+    custom_stopwords : List[str]
+        Additional stopwords to add to the language-specific list.
+    min_token_length : int
+        Minimum token length to keep.
+    max_token_length : int
+        Maximum token length to keep.
+
+    # Graph Construction
+    n_neighbors : int
+        Number of neighbors for kNN graph construction.
+    metric : str
+        Distance metric for similarity calculation.
+    graph_type : str
+        Type of graph: "knn", "mutual_knn", "snn", "hybrid".
+    snn_weight : float
+        Weight of SNN component in hybrid graph (0-1).
+
+    # Multi-View Fusion
+    use_lexical_view : bool
+        Whether to include lexical (TF-IDF/BM25) similarity.
+    use_metadata_view : bool
+        Whether to include metadata-based similarity.
+    semantic_weight : float
+        Weight for semantic (embedding) view.
+    lexical_weight : float
+        Weight for lexical view.
+    metadata_weight : float
+        Weight for metadata view.
+    lexical_method : str
+        Method for lexical similarity: "tfidf", "bm25".
+    ngram_range : tuple
+        N-gram range for lexical features.
+
+    # Clustering
+    resolution : float
+        Resolution parameter for Leiden algorithm.
+    n_consensus_runs : int
+        Number of clustering runs for consensus.
+    min_cluster_size : int
+        Minimum number of documents per topic.
+
+    # Iterative Refinement
+    use_iterative_refinement : bool
+        Whether to use iterative embedding refinement.
+    max_iterations : int
+        Maximum refinement iterations.
+    convergence_threshold : float
+        ARI threshold for convergence detection.
+    refinement_strength : float
+        How strongly to pull embeddings toward centroids (0-1).
+
+    # Keywords
+    n_keywords : int
+        Number of keywords per topic.
+    keyword_method : str
+        Method for keyword extraction: "ctfidf", "bm25", "keybert".
+
+    # Representative Documents
+    n_representative_docs : int
+        Number of representative documents per topic.
+    representative_method : str
+        Method for selection: "centroid", "medoid", "archetype", "diverse", "hybrid".
+    n_archetypes : int
+        Number of archetypes per topic (for archetype/hybrid method).
+    archetype_method : str
+        Algorithm for archetype analysis: "pcha", "convex_hull", "furthest_sum".
+
+    # Outlier Handling
+    outlier_threshold : float
+        Threshold for outlier detection (0-1).
+    reassign_outliers : bool
+        Whether to try reassigning outliers to nearest topic.
+
+    # Misc
+    random_state : int
+        Random seed for reproducibility.
+    verbose : bool
+        Whether to print progress information.
+    n_jobs : int
+        Number of parallel jobs (-1 for all cores).
+    """
+
+    # === Embedding & Language Settings ===
+    embedding_model: str = "auto"
+    embedding_batch_size: int = 32
+    language: str = "auto"
+    multilingual: bool = False
+    language_detection_sample: int = 100
+    tokenizer: str = "auto"
+    custom_stopwords: Optional[List[str]] = None
+    min_token_length: int = 2
+    max_token_length: int = 50
+
+    # === Graph Construction ===
+    n_neighbors: int = 15
+    metric: str = "cosine"
+    graph_type: Literal["knn", "mutual_knn", "snn", "hybrid"] = "hybrid"
+    snn_weight: float = 0.5
+
+    # === Multi-View Fusion ===
+    use_lexical_view: bool = True
+    use_metadata_view: bool = False
+    semantic_weight: float = 0.5
+    lexical_weight: float = 0.3
+    metadata_weight: float = 0.2
+    lexical_method: Literal["tfidf", "bm25"] = "tfidf"
+    ngram_range: tuple = (1, 2)
+
+    # === Clustering ===
+    resolution: float = 1.0
+    n_consensus_runs: int = 10
+    min_cluster_size: int = 5
+
+    # === Iterative Refinement ===
+    use_iterative_refinement: bool = True
+    max_iterations: int = 5
+    convergence_threshold: float = 0.95
+    refinement_strength: float = 0.15
+
+    # === Keywords ===
+    n_keywords: int = 10
+    keyword_method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf"
+
+    # === Representative Documents ===
+    n_representative_docs: int = 5
+    representative_method: Literal["centroid", "medoid", "archetype", "diverse", "hybrid"] = "hybrid"
+    n_archetypes: int = 4
+    archetype_method: Literal["pcha", "convex_hull", "furthest_sum"] = "furthest_sum"
+
+    # === Outlier Handling ===
+    outlier_threshold: float = 0.1
+    reassign_outliers: bool = False
+
+    # === Misc ===
+    random_state: Optional[int] = 42
+    verbose: bool = True
+    n_jobs: int = -1
+
+    def __post_init__(self):
+        """Validate configuration after initialization."""
+        self._validate()
+
+    def _validate(self):
+        """Validate configuration parameters."""
+        # Weights should sum to ~1.0
+        total_weight = self.semantic_weight
+        if self.use_lexical_view:
+            total_weight += self.lexical_weight
+        if self.use_metadata_view:
+            total_weight += self.metadata_weight
+
+        if abs(total_weight - 1.0) > 0.01:
+            # Auto-normalize weights
+            if self.use_lexical_view and self.use_metadata_view:
+                self.semantic_weight = self.semantic_weight / total_weight
+                self.lexical_weight = self.lexical_weight / total_weight
+                self.metadata_weight = self.metadata_weight / total_weight
+            elif self.use_lexical_view:
+                total = self.semantic_weight + self.lexical_weight
+                self.semantic_weight = self.semantic_weight / total
+                self.lexical_weight = self.lexical_weight / total
+            else:
+                self.semantic_weight = 1.0
+
+        # Validate ranges
+        assert 0 < self.n_neighbors <= 100, "n_neighbors must be between 1 and 100"
+        assert 0 < self.snn_weight <= 1, "snn_weight must be between 0 and 1"
+        assert 0 < self.resolution <= 5, "resolution must be between 0 and 5"
+        assert 0 < self.convergence_threshold <= 1, "convergence_threshold must be between 0 and 1"
+        assert self.n_archetypes >= 2, "n_archetypes must be at least 2"
+
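Note the normalization in `_validate`: even the defaults trigger it. With `use_lexical_view=True` and `use_metadata_view=False`, the declared weights sum to 0.5 + 0.3 = 0.8, so `__post_init__` silently rescales both by 1/0.8. A small illustration, with values following directly from the code above:

    from tritopic.config import TriTopicConfig  # module path per this diff

    cfg = TriTopicConfig()       # semantic_weight=0.5, lexical_weight=0.3, metadata view off
    print(cfg.semantic_weight)   # 0.625  (0.5 / 0.8)
    print(cfg.lexical_weight)    # 0.375  (0.3 / 0.8)

The `metadata_weight` of 0.2 is left untouched in this branch; it is simply ignored while `use_metadata_view` is False.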
+    def get_embedding_model_for_language(self, detected_language: str = None) -> str:
+        """
+        Get the appropriate embedding model based on language settings.
+
+        Parameters
+        ----------
+        detected_language : str, optional
+            The detected language code if language="auto"
+
+        Returns
+        -------
+        str
+            The embedding model name to use
+        """
+        if self.embedding_model != "auto":
+            return self.embedding_model
+
+        lang = detected_language or self.language
+
+        # If multilingual mode is explicitly enabled
+        if self.multilingual:
+            return "paraphrase-multilingual-mpnet-base-v2"
+
+        # Language-specific model selection
+        model_map = {
+            "en": "all-MiniLM-L6-v2",
+            "zh": "BAAI/bge-base-zh-v1.5",
+            "ja": "paraphrase-multilingual-MiniLM-L12-v2",
+            "ko": "paraphrase-multilingual-MiniLM-L12-v2",
+        }
+
+        # Default to multilingual for non-English
+        if lang in model_map:
+            return model_map[lang]
+        elif lang != "en" and lang != "auto":
+            return "paraphrase-multilingual-MiniLM-L12-v2"
+        else:
+            return "all-MiniLM-L6-v2"
+
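The selection logic resolves in three tiers: an explicitly named model always wins, then the `multilingual` flag, then the per-language map with a multilingual fallback for anything non-English. Tracing the branches above:

    cfg = TriTopicConfig(language="zh")
    print(cfg.get_embedding_model_for_language())  # "BAAI/bge-base-zh-v1.5" (model_map hit)

    cfg = TriTopicConfig(language="fr")            # not in model_map, not "en"/"auto"
    print(cfg.get_embedding_model_for_language())  # "paraphrase-multilingual-MiniLM-L12-v2"

    cfg = TriTopicConfig()                         # language="auto", no detection result passed
    print(cfg.get_embedding_model_for_language())  # "all-MiniLM-L6-v2" (English default)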
+    def to_dict(self) -> dict:
+        """Convert config to dictionary."""
+        return {
+            k: v for k, v in self.__dict__.items()
+            if not k.startswith('_')
+        }
+
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> "TriTopicConfig":
+        """Create config from dictionary."""
+        return cls(**config_dict)
+
+
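`to_dict`/`from_dict` give a straightforward persistence path. Every field is a plain scalar except the `ngram_range` tuple, which JSON serializes as a list, so a JSON round trip should restore it before rebuilding. A sketch under those assumptions:

    import json

    cfg = TriTopicConfig(language="de", n_neighbors=20)
    restored = TriTopicConfig.from_dict(cfg.to_dict())  # direct in-memory round trip

    payload = json.dumps(cfg.to_dict())                 # tuple becomes a JSON array
    data = json.loads(payload)
    data["ngram_range"] = tuple(data["ngram_range"])    # restore the tuple before from_dict
    restored = TriTopicConfig.from_dict(data)           # __post_init__ re-validates on load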
+# Predefined configurations for common use cases
+CONFIGS = {
+    "default": TriTopicConfig(),
+
+    "fast": TriTopicConfig(
+        embedding_model="all-MiniLM-L6-v2",
+        n_neighbors=10,
+        n_consensus_runs=5,
+        use_iterative_refinement=False,
+        representative_method="centroid",
+    ),
+
+    "quality": TriTopicConfig(
+        embedding_model="BAAI/bge-base-en-v1.5",
+        n_neighbors=20,
+        n_consensus_runs=20,
+        max_iterations=10,
+        representative_method="hybrid",
+        n_archetypes=5,
+    ),
+
+    "multilingual": TriTopicConfig(
+        multilingual=True,
+        embedding_model="paraphrase-multilingual-mpnet-base-v2",
+        semantic_weight=0.6,
+        lexical_weight=0.2,
+        metadata_weight=0.2,
+    ),
+
+    "multilingual_quality": TriTopicConfig(
+        multilingual=True,
+        embedding_model="BAAI/bge-m3",
+        n_neighbors=20,
+        n_consensus_runs=15,
+        semantic_weight=0.6,
+        lexical_weight=0.2,
+        representative_method="hybrid",
+    ),
+
+    "chinese": TriTopicConfig(
+        language="zh",
+        embedding_model="BAAI/bge-base-zh-v1.5",
+        tokenizer="jieba",
+        ngram_range=(1, 2),
+    ),
+
+    "german": TriTopicConfig(
+        language="de",
+        embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
+    ),
+}
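Two observations on the presets. First, `get_config` is exported from this module (see `__init__.py` above) but its body is not part of this diff; presumably it looks a name up in `CONFIGS`. Since the presets are module-level instances, mutating one mutates it for every later caller; a defensive copy avoids that. A sketch, assuming `get_config("fast")` returns the `CONFIGS["fast"]` entry:

    import dataclasses
    from tritopic import get_config

    cfg = get_config("fast")                       # assumed to return the shared CONFIGS["fast"]
    cfg = dataclasses.replace(cfg, verbose=False)  # fresh copy with overrides; preset stays intact

Second, the weight normalization in `_validate` also applies here: the "multilingual" preset declares 0.6/0.2/0.2 while `use_metadata_view` stays False, so it is constructed with `semantic_weight=0.75` and `lexical_weight=0.25`, not the declared values.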
tritopic/core/__init__.py CHANGED
@@ -1,17 +0,0 @@
-"""Core components for TriTopic."""
-
-from tritopic.core.model import TriTopic, TriTopicConfig, TopicInfo
-from tritopic.core.graph_builder import GraphBuilder
-from tritopic.core.clustering import ConsensusLeiden
-from tritopic.core.embeddings import EmbeddingEngine
-from tritopic.core.keywords import KeywordExtractor
-
-__all__ = [
-    "TriTopic",
-    "TriTopicConfig",
-    "TopicInfo",
-    "GraphBuilder",
-    "ConsensusLeiden",
-    "EmbeddingEngine",
-    "KeywordExtractor",
-]
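Taken together with the `__init__.py` changes above, this removal means every `tritopic.core.*` import path from 0.1.0 is gone in 1.0.0, and `TopicInfo`, `GraphBuilder`, `ConsensusLeiden`, `EmbeddingEngine`, `KeywordExtractor`, and `TopicVisualizer` no longer appear anywhere in the public surface of this diff. Downstream code pinned to the old paths needs roughly this change (the 1.0.0 lines use only names the diff confirms):

    # 0.1.0
    # from tritopic.core.model import TriTopic, TriTopicConfig
    # from tritopic.core.clustering import ConsensusLeiden   # no 1.0.0 equivalent shown in this diff

    # 1.0.0
    from tritopic import TriTopic, TriTopicConfig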