tritopic 0.1.0__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tritopic might be problematic. Click here for more details.
- {tritopic-0.1.0/tritopic.egg-info → tritopic-1.0.0}/PKG-INFO +92 -48
- {tritopic-0.1.0 → tritopic-1.0.0}/README.md +65 -30
- {tritopic-0.1.0 → tritopic-1.0.0}/pyproject.toml +39 -20
- tritopic-1.0.0/tritopic/__init__.py +36 -0
- tritopic-1.0.0/tritopic/config.py +305 -0
- tritopic-1.0.0/tritopic/core/__init__.py +0 -0
- tritopic-1.0.0/tritopic/core/clustering.py +317 -0
- tritopic-1.0.0/tritopic/core/embeddings.py +216 -0
- tritopic-1.0.0/tritopic/core/graph.py +435 -0
- tritopic-1.0.0/tritopic/core/keywords.py +301 -0
- tritopic-1.0.0/tritopic/core/refinement.py +231 -0
- tritopic-1.0.0/tritopic/core/representatives.py +560 -0
- tritopic-1.0.0/tritopic/labeling.py +313 -0
- tritopic-1.0.0/tritopic/model.py +718 -0
- tritopic-1.0.0/tritopic/multilingual/__init__.py +38 -0
- tritopic-1.0.0/tritopic/multilingual/detection.py +208 -0
- tritopic-1.0.0/tritopic/multilingual/stopwords.py +467 -0
- tritopic-1.0.0/tritopic/multilingual/tokenizers.py +275 -0
- tritopic-1.0.0/tritopic/visualization.py +371 -0
- {tritopic-0.1.0 → tritopic-1.0.0/tritopic.egg-info}/PKG-INFO +92 -48
- {tritopic-0.1.0 → tritopic-1.0.0}/tritopic.egg-info/SOURCES.txt +11 -9
- tritopic-1.0.0/tritopic.egg-info/requires.txt +45 -0
- tritopic-0.1.0/LICENSE +0 -21
- tritopic-0.1.0/tritopic/__init__.py +0 -46
- tritopic-0.1.0/tritopic/core/__init__.py +0 -17
- tritopic-0.1.0/tritopic/core/clustering.py +0 -331
- tritopic-0.1.0/tritopic/core/embeddings.py +0 -222
- tritopic-0.1.0/tritopic/core/graph_builder.py +0 -493
- tritopic-0.1.0/tritopic/core/keywords.py +0 -337
- tritopic-0.1.0/tritopic/core/model.py +0 -810
- tritopic-0.1.0/tritopic/labeling/__init__.py +0 -5
- tritopic-0.1.0/tritopic/labeling/llm_labeler.py +0 -279
- tritopic-0.1.0/tritopic/utils/__init__.py +0 -13
- tritopic-0.1.0/tritopic/utils/metrics.py +0 -254
- tritopic-0.1.0/tritopic/visualization/__init__.py +0 -5
- tritopic-0.1.0/tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0/tritopic.egg-info/requires.txt +0 -30
- {tritopic-0.1.0 → tritopic-1.0.0}/setup.cfg +0 -0
- {tritopic-0.1.0 → tritopic-1.0.0}/tritopic.egg-info/dependency_links.txt +0 -0
- {tritopic-0.1.0 → tritopic-1.0.0}/tritopic.egg-info/top_level.txt +0 -0
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tritopic
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
|
|
5
|
-
Author-email: Roman Egger <roman@example.com>
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
|
|
5
|
+
Author-email: Roman Egger <roman.egger@example.com>
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/roman-egger/tritopic
|
|
8
8
|
Project-URL: Documentation, https://tritopic.readthedocs.io
|
|
9
9
|
Project-URL: Repository, https://github.com/roman-egger/tritopic
|
|
10
|
-
|
|
10
|
+
Project-URL: Issues, https://github.com/roman-egger/tritopic/issues
|
|
11
|
+
Keywords: topic-modeling,nlp,machine-learning,bertopic,clustering,text-analysis,multilingual
|
|
11
12
|
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
12
14
|
Classifier: Intended Audience :: Science/Research
|
|
13
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
14
16
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -20,35 +22,42 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
20
22
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
21
23
|
Requires-Python: >=3.9
|
|
22
24
|
Description-Content-Type: text/markdown
|
|
23
|
-
License-File: LICENSE
|
|
24
25
|
Requires-Dist: numpy>=1.21.0
|
|
25
|
-
Requires-Dist: pandas>=1.3.0
|
|
26
26
|
Requires-Dist: scipy>=1.7.0
|
|
27
27
|
Requires-Dist: scikit-learn>=1.0.0
|
|
28
|
+
Requires-Dist: pandas>=1.3.0
|
|
28
29
|
Requires-Dist: sentence-transformers>=2.2.0
|
|
29
30
|
Requires-Dist: leidenalg>=0.9.0
|
|
30
|
-
Requires-Dist: igraph>=0.10.0
|
|
31
|
-
Requires-Dist: umap-learn>=0.5.0
|
|
32
|
-
Requires-Dist: hdbscan>=0.8.0
|
|
33
|
-
Requires-Dist: plotly>=5.0.0
|
|
31
|
+
Requires-Dist: python-igraph>=0.10.0
|
|
34
32
|
Requires-Dist: tqdm>=4.60.0
|
|
35
|
-
Requires-Dist: rank-bm25>=0.2.0
|
|
36
|
-
Requires-Dist: keybert>=0.7.0
|
|
37
33
|
Provides-Extra: llm
|
|
38
34
|
Requires-Dist: anthropic>=0.18.0; extra == "llm"
|
|
39
35
|
Requires-Dist: openai>=1.0.0; extra == "llm"
|
|
40
|
-
Provides-Extra:
|
|
41
|
-
Requires-Dist:
|
|
42
|
-
Requires-Dist:
|
|
43
|
-
|
|
44
|
-
Requires-Dist:
|
|
36
|
+
Provides-Extra: multilingual
|
|
37
|
+
Requires-Dist: langdetect>=1.0.9; extra == "multilingual"
|
|
38
|
+
Requires-Dist: jieba>=0.42.1; extra == "multilingual"
|
|
39
|
+
Provides-Extra: japanese
|
|
40
|
+
Requires-Dist: fugashi>=1.2.0; extra == "japanese"
|
|
41
|
+
Requires-Dist: unidic-lite>=1.0.8; extra == "japanese"
|
|
42
|
+
Provides-Extra: korean
|
|
43
|
+
Requires-Dist: konlpy>=0.6.0; extra == "korean"
|
|
44
|
+
Provides-Extra: thai
|
|
45
|
+
Requires-Dist: pythainlp>=4.0.0; extra == "thai"
|
|
46
|
+
Provides-Extra: visualization
|
|
47
|
+
Requires-Dist: plotly>=5.0.0; extra == "visualization"
|
|
48
|
+
Requires-Dist: matplotlib>=3.4.0; extra == "visualization"
|
|
49
|
+
Requires-Dist: umap-learn>=0.5.0; extra == "visualization"
|
|
50
|
+
Provides-Extra: evaluation
|
|
51
|
+
Requires-Dist: gensim>=4.0.0; extra == "evaluation"
|
|
52
|
+
Provides-Extra: all
|
|
53
|
+
Requires-Dist: tritopic[evaluation,llm,multilingual,visualization]; extra == "all"
|
|
45
54
|
Provides-Extra: dev
|
|
46
55
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
47
56
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
48
57
|
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
49
58
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
50
59
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
51
|
-
|
|
60
|
+
Requires-Dist: sphinx>=6.0.0; extra == "dev"
|
|
52
61
|
|
|
53
62
|
# 🔺 TriTopic
|
|
54
63
|
|
|
@@ -68,6 +77,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
|
|
|
68
77
|
| **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
|
|
69
78
|
| **Leiden + Consensus** | Dramatically more stable than single-run clustering |
|
|
70
79
|
| **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
|
|
80
|
+
| **Multilingual Support** | 60+ languages with auto language detection |
|
|
81
|
+
| **Archetype Representatives** | Rich document selection beyond simple centroids |
|
|
71
82
|
| **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |
|
|
72
83
|
|
|
73
84
|
## 📦 Installation
|
|
@@ -79,8 +90,11 @@ pip install tritopic
|
|
|
79
90
|
# With LLM labeling support
|
|
80
91
|
pip install tritopic[llm]
|
|
81
92
|
|
|
93
|
+
# With multilingual support
|
|
94
|
+
pip install tritopic[multilingual]
|
|
95
|
+
|
|
82
96
|
# Full installation (all features)
|
|
83
|
-
pip install tritopic[
|
|
97
|
+
pip install tritopic[all]
|
|
84
98
|
```
|
|
85
99
|
|
|
86
100
|
### From source (development)
|
|
@@ -171,6 +185,23 @@ model.generate_labels(labeler)
|
|
|
171
185
|
print(model.get_topic_info())
|
|
172
186
|
```
|
|
173
187
|
|
|
188
|
+
### Multilingual Support
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
from tritopic import TriTopic
|
|
192
|
+
|
|
193
|
+
# Auto-detect language and select appropriate model
|
|
194
|
+
model = TriTopic(
|
|
195
|
+
language="auto", # Auto-detect language
|
|
196
|
+
multilingual=False, # Use language-specific model
|
|
197
|
+
verbose=True
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
# Works with Chinese, German, Japanese, etc.
|
|
201
|
+
chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
|
|
202
|
+
topics = model.fit_transform(chinese_docs)
|
|
203
|
+
```
|
|
204
|
+
|
|
174
205
|
### With Metadata
|
|
175
206
|
|
|
176
207
|
```python
|
|
@@ -202,13 +233,18 @@ from tritopic import TriTopic, TriTopicConfig
|
|
|
202
233
|
|
|
203
234
|
config = TriTopicConfig(
|
|
204
235
|
# Embedding settings
|
|
205
|
-
embedding_model="all-MiniLM-L6-v2", # or "BAAI/bge-base-en-v1.5"
|
|
236
|
+
embedding_model="all-MiniLM-L6-v2", # or "auto", "BAAI/bge-base-en-v1.5"
|
|
206
237
|
embedding_batch_size=32,
|
|
207
238
|
|
|
239
|
+
# Language settings
|
|
240
|
+
language="auto", # or "en", "de", "zh", etc.
|
|
241
|
+
multilingual=False, # Force multilingual model
|
|
242
|
+
language_detection_sample=100,
|
|
243
|
+
|
|
208
244
|
# Graph construction
|
|
209
245
|
n_neighbors=15,
|
|
210
246
|
metric="cosine",
|
|
211
|
-
graph_type="hybrid",
|
|
247
|
+
graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
|
|
212
248
|
snn_weight=0.5,
|
|
213
249
|
|
|
214
250
|
# Multi-view fusion weights
|
|
@@ -227,11 +263,17 @@ config = TriTopicConfig(
|
|
|
227
263
|
use_iterative_refinement=True,
|
|
228
264
|
max_iterations=5,
|
|
229
265
|
convergence_threshold=0.95,
|
|
266
|
+
refinement_strength=0.15,
|
|
230
267
|
|
|
231
268
|
# Keywords
|
|
232
269
|
n_keywords=10,
|
|
270
|
+
keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
|
|
271
|
+
|
|
272
|
+
# Representatives (with archetype support)
|
|
233
273
|
n_representative_docs=5,
|
|
234
|
-
|
|
274
|
+
representative_method="hybrid", # "centroid", "medoid", "archetype", "diverse", "hybrid"
|
|
275
|
+
n_archetypes=4,
|
|
276
|
+
archetype_method="furthest_sum",
|
|
235
277
|
|
|
236
278
|
# Misc
|
|
237
279
|
outlier_threshold=0.1,
|
|
@@ -242,16 +284,25 @@ config = TriTopicConfig(
|
|
|
242
284
|
model = TriTopic(config=config)
|
|
243
285
|
```
|
|
244
286
|
|
|
245
|
-
###
|
|
287
|
+
### Pre-defined Configurations
|
|
246
288
|
|
|
247
289
|
```python
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
)
|
|
290
|
+
from tritopic import TriTopic, get_config
|
|
291
|
+
|
|
292
|
+
# Fast processing (less accurate)
|
|
293
|
+
model = TriTopic(config=get_config("fast"))
|
|
294
|
+
|
|
295
|
+
# High quality (slower)
|
|
296
|
+
model = TriTopic(config=get_config("quality"))
|
|
297
|
+
|
|
298
|
+
# Multilingual corpus
|
|
299
|
+
model = TriTopic(config=get_config("multilingual"))
|
|
300
|
+
|
|
301
|
+
# Chinese text
|
|
302
|
+
model = TriTopic(config=get_config("chinese"))
|
|
303
|
+
|
|
304
|
+
# German text
|
|
305
|
+
model = TriTopic(config=get_config("german"))
|
|
255
306
|
```
|
|
256
307
|
|
|
257
308
|
## 📊 Evaluation
|
|
@@ -285,20 +336,6 @@ model = TriTopic()
|
|
|
285
336
|
topics = model.fit_transform(documents, embeddings=embeddings)
|
|
286
337
|
```
|
|
287
338
|
|
|
288
|
-
### Find Optimal Resolution
|
|
289
|
-
|
|
290
|
-
```python
|
|
291
|
-
from tritopic.core.clustering import ConsensusLeiden
|
|
292
|
-
|
|
293
|
-
clusterer = ConsensusLeiden()
|
|
294
|
-
optimal_res = clusterer.find_optimal_resolution(
|
|
295
|
-
graph=model.graph_,
|
|
296
|
-
resolution_range=(0.5, 2.0),
|
|
297
|
-
target_n_topics=15, # Optional: target number
|
|
298
|
-
)
|
|
299
|
-
print(f"Optimal resolution: {optimal_res}")
|
|
300
|
-
```
|
|
301
|
-
|
|
302
339
|
### Transform New Documents
|
|
303
340
|
|
|
304
341
|
```python
|
|
@@ -327,7 +364,8 @@ model = TriTopic.load("my_topic_model.pkl")
|
|
|
327
364
|
| Views | Embeddings only | Semantic + Lexical + Metadata |
|
|
328
365
|
| Refinement | None | Iterative embedding refinement |
|
|
329
366
|
| Stability | Low (varies by run) | High (consensus clustering) |
|
|
330
|
-
|
|
|
367
|
+
| Languages | Limited | 60+ with auto-detection |
|
|
368
|
+
| Representatives | Centroid only | Archetypes, medoids, diverse |
|
|
331
369
|
|
|
332
370
|
### Benchmark Results
|
|
333
371
|
|
|
@@ -339,7 +377,7 @@ On 20 Newsgroups dataset (n=18,846):
|
|
|
339
377
|
| Diversity | 0.834 | **0.891** | +7% |
|
|
340
378
|
| Stability (ARI) | 0.721 | **0.934** | +30% |
|
|
341
379
|
|
|
342
|
-
##
|
|
380
|
+
## 🗂️ Architecture
|
|
343
381
|
|
|
344
382
|
```
|
|
345
383
|
Documents
|
|
@@ -347,10 +385,10 @@ Documents
|
|
|
347
385
|
├─── Embedding Engine ──────────────┐
|
|
348
386
|
│ (Sentence-BERT/BGE/Instructor) │
|
|
349
387
|
│ │
|
|
350
|
-
├─── Lexical Matrix
|
|
388
|
+
├─── Lexical Matrix ────────────────┼─── Multi-View
|
|
351
389
|
│ (TF-IDF/BM25) │ Graph Builder
|
|
352
390
|
│ │ │
|
|
353
|
-
└─── Metadata Graph
|
|
391
|
+
└─── Metadata Graph ────────────────┘ │
|
|
354
392
|
(Optional) │
|
|
355
393
|
▼
|
|
356
394
|
┌─────────────────────┐
|
|
@@ -369,6 +407,12 @@ Documents
|
|
|
369
407
|
└──────────┬──────────┘
|
|
370
408
|
│
|
|
371
409
|
┌──────────▼──────────┐
|
|
410
|
+
│ Representative │
|
|
411
|
+
│ Selection │
|
|
412
|
+
│ (Archetype/Hybrid) │
|
|
413
|
+
└──────────┬──────────┘
|
|
414
|
+
│
|
|
415
|
+
┌──────────▼──────────┐
|
|
372
416
|
│ LLM Labeling │
|
|
373
417
|
│ (Claude/GPT-4) │
|
|
374
418
|
└─────────────────────┘
|
|
@@ -16,6 +16,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
|
|
|
16
16
|
| **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
|
|
17
17
|
| **Leiden + Consensus** | Dramatically more stable than single-run clustering |
|
|
18
18
|
| **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
|
|
19
|
+
| **Multilingual Support** | 60+ languages with auto language detection |
|
|
20
|
+
| **Archetype Representatives** | Rich document selection beyond simple centroids |
|
|
19
21
|
| **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |
|
|
20
22
|
|
|
21
23
|
## 📦 Installation
|
|
@@ -27,8 +29,11 @@ pip install tritopic
|
|
|
27
29
|
# With LLM labeling support
|
|
28
30
|
pip install tritopic[llm]
|
|
29
31
|
|
|
32
|
+
# With multilingual support
|
|
33
|
+
pip install tritopic[multilingual]
|
|
34
|
+
|
|
30
35
|
# Full installation (all features)
|
|
31
|
-
pip install tritopic[
|
|
36
|
+
pip install tritopic[all]
|
|
32
37
|
```
|
|
33
38
|
|
|
34
39
|
### From source (development)
|
|
@@ -119,6 +124,23 @@ model.generate_labels(labeler)
|
|
|
119
124
|
print(model.get_topic_info())
|
|
120
125
|
```
|
|
121
126
|
|
|
127
|
+
### Multilingual Support
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from tritopic import TriTopic
|
|
131
|
+
|
|
132
|
+
# Auto-detect language and select appropriate model
|
|
133
|
+
model = TriTopic(
|
|
134
|
+
language="auto", # Auto-detect language
|
|
135
|
+
multilingual=False, # Use language-specific model
|
|
136
|
+
verbose=True
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
# Works with Chinese, German, Japanese, etc.
|
|
140
|
+
chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
|
|
141
|
+
topics = model.fit_transform(chinese_docs)
|
|
142
|
+
```
|
|
143
|
+
|
|
122
144
|
### With Metadata
|
|
123
145
|
|
|
124
146
|
```python
|
|
@@ -150,13 +172,18 @@ from tritopic import TriTopic, TriTopicConfig
|
|
|
150
172
|
|
|
151
173
|
config = TriTopicConfig(
|
|
152
174
|
# Embedding settings
|
|
153
|
-
embedding_model="all-MiniLM-L6-v2", # or "BAAI/bge-base-en-v1.5"
|
|
175
|
+
embedding_model="all-MiniLM-L6-v2", # or "auto", "BAAI/bge-base-en-v1.5"
|
|
154
176
|
embedding_batch_size=32,
|
|
155
177
|
|
|
178
|
+
# Language settings
|
|
179
|
+
language="auto", # or "en", "de", "zh", etc.
|
|
180
|
+
multilingual=False, # Force multilingual model
|
|
181
|
+
language_detection_sample=100,
|
|
182
|
+
|
|
156
183
|
# Graph construction
|
|
157
184
|
n_neighbors=15,
|
|
158
185
|
metric="cosine",
|
|
159
|
-
graph_type="hybrid",
|
|
186
|
+
graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
|
|
160
187
|
snn_weight=0.5,
|
|
161
188
|
|
|
162
189
|
# Multi-view fusion weights
|
|
@@ -175,11 +202,17 @@ config = TriTopicConfig(
|
|
|
175
202
|
use_iterative_refinement=True,
|
|
176
203
|
max_iterations=5,
|
|
177
204
|
convergence_threshold=0.95,
|
|
205
|
+
refinement_strength=0.15,
|
|
178
206
|
|
|
179
207
|
# Keywords
|
|
180
208
|
n_keywords=10,
|
|
209
|
+
keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
|
|
210
|
+
|
|
211
|
+
# Representatives (with archetype support)
|
|
181
212
|
n_representative_docs=5,
|
|
182
|
-
|
|
213
|
+
representative_method="hybrid", # "centroid", "medoid", "archetype", "diverse", "hybrid"
|
|
214
|
+
n_archetypes=4,
|
|
215
|
+
archetype_method="furthest_sum",
|
|
183
216
|
|
|
184
217
|
# Misc
|
|
185
218
|
outlier_threshold=0.1,
|
|
@@ -190,16 +223,25 @@ config = TriTopicConfig(
|
|
|
190
223
|
model = TriTopic(config=config)
|
|
191
224
|
```
|
|
192
225
|
|
|
193
|
-
###
|
|
226
|
+
### Pre-defined Configurations
|
|
194
227
|
|
|
195
228
|
```python
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
)
|
|
229
|
+
from tritopic import TriTopic, get_config
|
|
230
|
+
|
|
231
|
+
# Fast processing (less accurate)
|
|
232
|
+
model = TriTopic(config=get_config("fast"))
|
|
233
|
+
|
|
234
|
+
# High quality (slower)
|
|
235
|
+
model = TriTopic(config=get_config("quality"))
|
|
236
|
+
|
|
237
|
+
# Multilingual corpus
|
|
238
|
+
model = TriTopic(config=get_config("multilingual"))
|
|
239
|
+
|
|
240
|
+
# Chinese text
|
|
241
|
+
model = TriTopic(config=get_config("chinese"))
|
|
242
|
+
|
|
243
|
+
# German text
|
|
244
|
+
model = TriTopic(config=get_config("german"))
|
|
203
245
|
```
|
|
204
246
|
|
|
205
247
|
## 📊 Evaluation
|
|
@@ -233,20 +275,6 @@ model = TriTopic()
|
|
|
233
275
|
topics = model.fit_transform(documents, embeddings=embeddings)
|
|
234
276
|
```
|
|
235
277
|
|
|
236
|
-
### Find Optimal Resolution
|
|
237
|
-
|
|
238
|
-
```python
|
|
239
|
-
from tritopic.core.clustering import ConsensusLeiden
|
|
240
|
-
|
|
241
|
-
clusterer = ConsensusLeiden()
|
|
242
|
-
optimal_res = clusterer.find_optimal_resolution(
|
|
243
|
-
graph=model.graph_,
|
|
244
|
-
resolution_range=(0.5, 2.0),
|
|
245
|
-
target_n_topics=15, # Optional: target number
|
|
246
|
-
)
|
|
247
|
-
print(f"Optimal resolution: {optimal_res}")
|
|
248
|
-
```
|
|
249
|
-
|
|
250
278
|
### Transform New Documents
|
|
251
279
|
|
|
252
280
|
```python
|
|
@@ -275,7 +303,8 @@ model = TriTopic.load("my_topic_model.pkl")
|
|
|
275
303
|
| Views | Embeddings only | Semantic + Lexical + Metadata |
|
|
276
304
|
| Refinement | None | Iterative embedding refinement |
|
|
277
305
|
| Stability | Low (varies by run) | High (consensus clustering) |
|
|
278
|
-
|
|
|
306
|
+
| Languages | Limited | 60+ with auto-detection |
|
|
307
|
+
| Representatives | Centroid only | Archetypes, medoids, diverse |
|
|
279
308
|
|
|
280
309
|
### Benchmark Results
|
|
281
310
|
|
|
@@ -287,7 +316,7 @@ On 20 Newsgroups dataset (n=18,846):
|
|
|
287
316
|
| Diversity | 0.834 | **0.891** | +7% |
|
|
288
317
|
| Stability (ARI) | 0.721 | **0.934** | +30% |
|
|
289
318
|
|
|
290
|
-
##
|
|
319
|
+
## 🗂️ Architecture
|
|
291
320
|
|
|
292
321
|
```
|
|
293
322
|
Documents
|
|
@@ -295,10 +324,10 @@ Documents
|
|
|
295
324
|
├─── Embedding Engine ──────────────┐
|
|
296
325
|
│ (Sentence-BERT/BGE/Instructor) │
|
|
297
326
|
│ │
|
|
298
|
-
├─── Lexical Matrix
|
|
327
|
+
├─── Lexical Matrix ────────────────┼─── Multi-View
|
|
299
328
|
│ (TF-IDF/BM25) │ Graph Builder
|
|
300
329
|
│ │ │
|
|
301
|
-
└─── Metadata Graph
|
|
330
|
+
└─── Metadata Graph ────────────────┘ │
|
|
302
331
|
(Optional) │
|
|
303
332
|
▼
|
|
304
333
|
┌─────────────────────┐
|
|
@@ -317,6 +346,12 @@ Documents
|
|
|
317
346
|
└──────────┬──────────┘
|
|
318
347
|
│
|
|
319
348
|
┌──────────▼──────────┐
|
|
349
|
+
│ Representative │
|
|
350
|
+
│ Selection │
|
|
351
|
+
│ (Archetype/Hybrid) │
|
|
352
|
+
└──────────┬──────────┘
|
|
353
|
+
│
|
|
354
|
+
┌──────────▼──────────┐
|
|
320
355
|
│ LLM Labeling │
|
|
321
356
|
│ (Claude/GPT-4) │
|
|
322
357
|
└─────────────────────┘
|
|
@@ -4,19 +4,20 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "tritopic"
|
|
7
|
-
version = "0.1.0"
|
|
8
|
-
description = "Tri-Modal Graph Topic Modeling with Iterative Refinement
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Tri-Modal Graph Topic Modeling with Iterative Refinement"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
11
11
|
authors = [
|
|
12
|
-
{name = "Roman Egger", email = "roman@example.com"}
|
|
12
|
+
{name = "Roman Egger", email = "roman.egger@example.com"}
|
|
13
13
|
]
|
|
14
14
|
keywords = [
|
|
15
|
-
"topic-modeling", "nlp", "machine-learning", "
|
|
16
|
-
"
|
|
15
|
+
"topic-modeling", "nlp", "machine-learning", "bertopic",
|
|
16
|
+
"clustering", "text-analysis", "multilingual"
|
|
17
17
|
]
|
|
18
18
|
classifiers = [
|
|
19
19
|
"Development Status :: 4 - Beta",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
20
21
|
"Intended Audience :: Science/Research",
|
|
21
22
|
"License :: OSI Approved :: MIT License",
|
|
22
23
|
"Programming Language :: Python :: 3",
|
|
@@ -30,18 +31,13 @@ classifiers = [
|
|
|
30
31
|
requires-python = ">=3.9"
|
|
31
32
|
dependencies = [
|
|
32
33
|
"numpy>=1.21.0",
|
|
33
|
-
"pandas>=1.3.0",
|
|
34
34
|
"scipy>=1.7.0",
|
|
35
35
|
"scikit-learn>=1.0.0",
|
|
36
|
+
"pandas>=1.3.0",
|
|
36
37
|
"sentence-transformers>=2.2.0",
|
|
37
38
|
"leidenalg>=0.9.0",
|
|
38
|
-
"igraph>=0.10.0",
|
|
39
|
-
"umap-learn>=0.5.0",
|
|
40
|
-
"hdbscan>=0.8.0",
|
|
41
|
-
"plotly>=5.0.0",
|
|
39
|
+
"python-igraph>=0.10.0",
|
|
42
40
|
"tqdm>=4.60.0",
|
|
43
|
-
"rank-bm25>=0.2.0",
|
|
44
|
-
"keybert>=0.7.0",
|
|
45
41
|
]
|
|
46
42
|
|
|
47
43
|
[project.optional-dependencies]
|
|
@@ -49,11 +45,30 @@ llm = [
|
|
|
49
45
|
"anthropic>=0.18.0",
|
|
50
46
|
"openai>=1.0.0",
|
|
51
47
|
]
|
|
52
|
-
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
|
|
56
|
-
|
|
48
|
+
multilingual = [
|
|
49
|
+
"langdetect>=1.0.9",
|
|
50
|
+
"jieba>=0.42.1",
|
|
51
|
+
]
|
|
52
|
+
japanese = [
|
|
53
|
+
"fugashi>=1.2.0",
|
|
54
|
+
"unidic-lite>=1.0.8",
|
|
55
|
+
]
|
|
56
|
+
korean = [
|
|
57
|
+
"konlpy>=0.6.0",
|
|
58
|
+
]
|
|
59
|
+
thai = [
|
|
60
|
+
"pythainlp>=4.0.0",
|
|
61
|
+
]
|
|
62
|
+
visualization = [
|
|
63
|
+
"plotly>=5.0.0",
|
|
64
|
+
"matplotlib>=3.4.0",
|
|
65
|
+
"umap-learn>=0.5.0",
|
|
66
|
+
]
|
|
67
|
+
evaluation = [
|
|
68
|
+
"gensim>=4.0.0",
|
|
69
|
+
]
|
|
70
|
+
all = [
|
|
71
|
+
"tritopic[llm,multilingual,visualization,evaluation]",
|
|
57
72
|
]
|
|
58
73
|
dev = [
|
|
59
74
|
"pytest>=7.0.0",
|
|
@@ -61,12 +76,14 @@ dev = [
|
|
|
61
76
|
"black>=23.0.0",
|
|
62
77
|
"ruff>=0.1.0",
|
|
63
78
|
"mypy>=1.0.0",
|
|
79
|
+
"sphinx>=6.0.0",
|
|
64
80
|
]
|
|
65
81
|
|
|
66
82
|
[project.urls]
|
|
67
83
|
Homepage = "https://github.com/roman-egger/tritopic"
|
|
68
84
|
Documentation = "https://tritopic.readthedocs.io"
|
|
69
85
|
Repository = "https://github.com/roman-egger/tritopic"
|
|
86
|
+
Issues = "https://github.com/roman-egger/tritopic/issues"
|
|
70
87
|
|
|
71
88
|
[tool.setuptools.packages.find]
|
|
72
89
|
where = ["."]
|
|
@@ -74,13 +91,15 @@ include = ["tritopic*"]
|
|
|
74
91
|
|
|
75
92
|
[tool.black]
|
|
76
93
|
line-length = 100
|
|
77
|
-
target-version = ['py39', 'py310', 'py311']
|
|
94
|
+
target-version = ['py39', 'py310', 'py311', 'py312']
|
|
78
95
|
|
|
79
96
|
[tool.ruff]
|
|
80
97
|
line-length = 100
|
|
81
|
-
select = ["E", "F", "W", "I", "N"]
|
|
98
|
+
select = ["E", "F", "W", "I", "N", "D", "UP"]
|
|
99
|
+
ignore = ["D100", "D104"]
|
|
82
100
|
|
|
83
101
|
[tool.mypy]
|
|
84
|
-
python_version = "3.
|
|
102
|
+
python_version = "3.9"
|
|
85
103
|
warn_return_any = true
|
|
86
104
|
warn_unused_configs = true
|
|
105
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TriTopic: Tri-Modal Graph Topic Modeling with Iterative Refinement
|
|
3
|
+
|
|
4
|
+
A state-of-the-art topic modeling library that consistently outperforms
|
|
5
|
+
BERTopic and traditional approaches.
|
|
6
|
+
|
|
7
|
+
Key Features:
|
|
8
|
+
- Multi-view representation (semantic, lexical, metadata)
|
|
9
|
+
- Hybrid graph construction (Mutual kNN + SNN)
|
|
10
|
+
- Consensus Leiden clustering for stability
|
|
11
|
+
- Iterative refinement for improved coherence
|
|
12
|
+
- Multilingual support (60+ languages)
|
|
13
|
+
- LLM-powered labeling
|
|
14
|
+
|
|
15
|
+
Example:
|
|
16
|
+
>>> from tritopic import TriTopic
|
|
17
|
+
>>> model = TriTopic(verbose=True)
|
|
18
|
+
>>> topics = model.fit_transform(documents)
|
|
19
|
+
>>> print(model.get_topic_info())
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
__version__ = "1.0.0"
|
|
23
|
+
__author__ = "Roman Egger"
|
|
24
|
+
|
|
25
|
+
from .model import TriTopic, Topic
|
|
26
|
+
from .config import TriTopicConfig, get_config
|
|
27
|
+
from .labeling import LLMLabeler, KeywordLabeler
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"TriTopic",
|
|
31
|
+
"Topic",
|
|
32
|
+
"TriTopicConfig",
|
|
33
|
+
"get_config",
|
|
34
|
+
"LLMLabeler",
|
|
35
|
+
"KeywordLabeler",
|
|
36
|
+
]
|