tritopic 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tritopic might be problematic. Click here for more details.
- tritopic/__init__.py +22 -32
- tritopic/config.py +289 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/METADATA +91 -51
- tritopic-1.1.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,14 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tritopic
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
|
|
5
|
-
Author-email: Roman Egger <roman@
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
|
|
5
|
+
Author-email: Roman Egger <roman.egger@smartvisions.at>
|
|
6
6
|
License: MIT
|
|
7
|
-
|
|
8
|
-
Project-URL: Documentation, https://tritopic.readthedocs.io
|
|
9
|
-
Project-URL: Repository, https://github.com/roman-egger/tritopic
|
|
10
|
-
Keywords: topic-modeling,nlp,machine-learning,graph-clustering,leiden,embeddings,text-analysis,bertopic-alternative
|
|
7
|
+
Keywords: topic-modeling,nlp,machine-learning,bertopic,clustering,text-analysis,multilingual
|
|
11
8
|
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
12
10
|
Classifier: Intended Audience :: Science/Research
|
|
13
11
|
Classifier: License :: OSI Approved :: MIT License
|
|
14
12
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -20,35 +18,42 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
20
18
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
21
19
|
Requires-Python: >=3.9
|
|
22
20
|
Description-Content-Type: text/markdown
|
|
23
|
-
License-File: LICENSE
|
|
24
21
|
Requires-Dist: numpy>=1.21.0
|
|
25
|
-
Requires-Dist: pandas>=1.3.0
|
|
26
22
|
Requires-Dist: scipy>=1.7.0
|
|
27
23
|
Requires-Dist: scikit-learn>=1.0.0
|
|
24
|
+
Requires-Dist: pandas>=1.3.0
|
|
28
25
|
Requires-Dist: sentence-transformers>=2.2.0
|
|
29
26
|
Requires-Dist: leidenalg>=0.9.0
|
|
30
|
-
Requires-Dist: igraph>=0.10.0
|
|
31
|
-
Requires-Dist: umap-learn>=0.5.0
|
|
32
|
-
Requires-Dist: hdbscan>=0.8.0
|
|
33
|
-
Requires-Dist: plotly>=5.0.0
|
|
27
|
+
Requires-Dist: python-igraph>=0.10.0
|
|
34
28
|
Requires-Dist: tqdm>=4.60.0
|
|
35
|
-
Requires-Dist: rank-bm25>=0.2.0
|
|
36
|
-
Requires-Dist: keybert>=0.7.0
|
|
37
29
|
Provides-Extra: llm
|
|
38
30
|
Requires-Dist: anthropic>=0.18.0; extra == "llm"
|
|
39
31
|
Requires-Dist: openai>=1.0.0; extra == "llm"
|
|
40
|
-
Provides-Extra:
|
|
41
|
-
Requires-Dist:
|
|
42
|
-
Requires-Dist:
|
|
43
|
-
|
|
44
|
-
Requires-Dist:
|
|
32
|
+
Provides-Extra: multilingual
|
|
33
|
+
Requires-Dist: langdetect>=1.0.9; extra == "multilingual"
|
|
34
|
+
Requires-Dist: jieba>=0.42.1; extra == "multilingual"
|
|
35
|
+
Provides-Extra: japanese
|
|
36
|
+
Requires-Dist: fugashi>=1.2.0; extra == "japanese"
|
|
37
|
+
Requires-Dist: unidic-lite>=1.0.8; extra == "japanese"
|
|
38
|
+
Provides-Extra: korean
|
|
39
|
+
Requires-Dist: konlpy>=0.6.0; extra == "korean"
|
|
40
|
+
Provides-Extra: thai
|
|
41
|
+
Requires-Dist: pythainlp>=4.0.0; extra == "thai"
|
|
42
|
+
Provides-Extra: visualization
|
|
43
|
+
Requires-Dist: plotly>=5.0.0; extra == "visualization"
|
|
44
|
+
Requires-Dist: matplotlib>=3.4.0; extra == "visualization"
|
|
45
|
+
Requires-Dist: umap-learn>=0.5.0; extra == "visualization"
|
|
46
|
+
Provides-Extra: evaluation
|
|
47
|
+
Requires-Dist: gensim>=4.0.0; extra == "evaluation"
|
|
48
|
+
Provides-Extra: all
|
|
49
|
+
Requires-Dist: tritopic[evaluation,llm,multilingual,visualization]; extra == "all"
|
|
45
50
|
Provides-Extra: dev
|
|
46
51
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
47
52
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
48
53
|
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
49
54
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
50
55
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
51
|
-
|
|
56
|
+
Requires-Dist: sphinx>=6.0.0; extra == "dev"
|
|
52
57
|
|
|
53
58
|
# 🔺 TriTopic
|
|
54
59
|
|
|
@@ -68,6 +73,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
|
|
|
68
73
|
| **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
|
|
69
74
|
| **Leiden + Consensus** | Dramatically more stable than single-run clustering |
|
|
70
75
|
| **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
|
|
76
|
+
| **Multilingual Support** | 60+ languages with auto language detection |
|
|
77
|
+
| **Archetype Representatives** | Rich document selection beyond simple centroids |
|
|
71
78
|
| **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |
|
|
72
79
|
|
|
73
80
|
## 📦 Installation
|
|
@@ -79,8 +86,11 @@ pip install tritopic
|
|
|
79
86
|
# With LLM labeling support
|
|
80
87
|
pip install tritopic[llm]
|
|
81
88
|
|
|
89
|
+
# With multilingual support
|
|
90
|
+
pip install tritopic[multilingual]
|
|
91
|
+
|
|
82
92
|
# Full installation (all features)
|
|
83
|
-
pip install tritopic[
|
|
93
|
+
pip install tritopic[all]
|
|
84
94
|
```
|
|
85
95
|
|
|
86
96
|
### From source (development)
|
|
@@ -171,6 +181,23 @@ model.generate_labels(labeler)
|
|
|
171
181
|
print(model.get_topic_info())
|
|
172
182
|
```
|
|
173
183
|
|
|
184
|
+
### Multilingual Support
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from tritopic import TriTopic
|
|
188
|
+
|
|
189
|
+
# Auto-detect language and select appropriate model
|
|
190
|
+
model = TriTopic(
|
|
191
|
+
language="auto", # Auto-detect language
|
|
192
|
+
multilingual=False, # Use language-specific model
|
|
193
|
+
verbose=True
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
# Works with Chinese, German, Japanese, etc.
|
|
197
|
+
chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
|
|
198
|
+
topics = model.fit_transform(chinese_docs)
|
|
199
|
+
```
|
|
200
|
+
|
|
174
201
|
### With Metadata
|
|
175
202
|
|
|
176
203
|
```python
|
|
@@ -202,13 +229,18 @@ from tritopic import TriTopic, TriTopicConfig
|
|
|
202
229
|
|
|
203
230
|
config = TriTopicConfig(
|
|
204
231
|
# Embedding settings
|
|
205
|
-
embedding_model="all-MiniLM-L6-v2", # or "BAAI/bge-base-en-v1.5"
|
|
232
|
+
embedding_model="all-MiniLM-L6-v2", # or "auto", "BAAI/bge-base-en-v1.5"
|
|
206
233
|
embedding_batch_size=32,
|
|
207
234
|
|
|
235
|
+
# Language settings
|
|
236
|
+
language="auto", # or "en", "de", "zh", etc.
|
|
237
|
+
multilingual=False, # Set to True to force a multilingual model
|
|
238
|
+
language_detection_sample=100,
|
|
239
|
+
|
|
208
240
|
# Graph construction
|
|
209
241
|
n_neighbors=15,
|
|
210
242
|
metric="cosine",
|
|
211
|
-
graph_type="hybrid",
|
|
243
|
+
graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
|
|
212
244
|
snn_weight=0.5,
|
|
213
245
|
|
|
214
246
|
# Multi-view fusion weights
|
|
@@ -227,11 +259,17 @@ config = TriTopicConfig(
|
|
|
227
259
|
use_iterative_refinement=True,
|
|
228
260
|
max_iterations=5,
|
|
229
261
|
convergence_threshold=0.95,
|
|
262
|
+
refinement_strength=0.15,
|
|
230
263
|
|
|
231
264
|
# Keywords
|
|
232
265
|
n_keywords=10,
|
|
266
|
+
keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
|
|
267
|
+
|
|
268
|
+
# Representatives (with archetype support)
|
|
233
269
|
n_representative_docs=5,
|
|
234
|
-
|
|
270
|
+
representative_method="hybrid", # "centroid", "medoid", "archetype", "diverse", "hybrid"
|
|
271
|
+
n_archetypes=4,
|
|
272
|
+
archetype_method="furthest_sum",
|
|
235
273
|
|
|
236
274
|
# Misc
|
|
237
275
|
outlier_threshold=0.1,
|
|
@@ -242,16 +280,25 @@ config = TriTopicConfig(
|
|
|
242
280
|
model = TriTopic(config=config)
|
|
243
281
|
```
|
|
244
282
|
|
|
245
|
-
###
|
|
283
|
+
### Pre-defined Configurations
|
|
246
284
|
|
|
247
285
|
```python
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
)
|
|
286
|
+
from tritopic import TriTopic, get_config
|
|
287
|
+
|
|
288
|
+
# Fast processing (less accurate)
|
|
289
|
+
model = TriTopic(config=get_config("fast"))
|
|
290
|
+
|
|
291
|
+
# High quality (slower)
|
|
292
|
+
model = TriTopic(config=get_config("quality"))
|
|
293
|
+
|
|
294
|
+
# Multilingual corpus
|
|
295
|
+
model = TriTopic(config=get_config("multilingual"))
|
|
296
|
+
|
|
297
|
+
# Chinese text
|
|
298
|
+
model = TriTopic(config=get_config("chinese"))
|
|
299
|
+
|
|
300
|
+
# German text
|
|
301
|
+
model = TriTopic(config=get_config("german"))
|
|
255
302
|
```
|
|
256
303
|
|
|
257
304
|
## 📊 Evaluation
|
|
@@ -285,20 +332,6 @@ model = TriTopic()
|
|
|
285
332
|
topics = model.fit_transform(documents, embeddings=embeddings)
|
|
286
333
|
```
|
|
287
334
|
|
|
288
|
-
### Find Optimal Resolution
|
|
289
|
-
|
|
290
|
-
```python
|
|
291
|
-
from tritopic.core.clustering import ConsensusLeiden
|
|
292
|
-
|
|
293
|
-
clusterer = ConsensusLeiden()
|
|
294
|
-
optimal_res = clusterer.find_optimal_resolution(
|
|
295
|
-
graph=model.graph_,
|
|
296
|
-
resolution_range=(0.5, 2.0),
|
|
297
|
-
target_n_topics=15, # Optional: target number
|
|
298
|
-
)
|
|
299
|
-
print(f"Optimal resolution: {optimal_res}")
|
|
300
|
-
```
|
|
301
|
-
|
|
302
335
|
### Transform New Documents
|
|
303
336
|
|
|
304
337
|
```python
|
|
@@ -327,7 +360,8 @@ model = TriTopic.load("my_topic_model.pkl")
|
|
|
327
360
|
| Views | Embeddings only | Semantic + Lexical + Metadata |
|
|
328
361
|
| Refinement | None | Iterative embedding refinement |
|
|
329
362
|
| Stability | Low (varies by run) | High (consensus clustering) |
|
|
330
|
-
|
|
|
363
|
+
| Languages | Limited | 60+ with auto-detection |
|
|
364
|
+
| Representatives | Centroid only | Archetypes, medoids, diverse |
|
|
331
365
|
|
|
332
366
|
### Benchmark Results
|
|
333
367
|
|
|
@@ -339,7 +373,7 @@ On 20 Newsgroups dataset (n=18,846):
|
|
|
339
373
|
| Diversity | 0.834 | **0.891** | +7% |
|
|
340
374
|
| Stability (ARI) | 0.721 | **0.934** | +30% |
|
|
341
375
|
|
|
342
|
-
##
|
|
376
|
+
## 🗂️ Architecture
|
|
343
377
|
|
|
344
378
|
```
|
|
345
379
|
Documents
|
|
@@ -347,10 +381,10 @@ Documents
|
|
|
347
381
|
├─── Embedding Engine ──────────────┐
|
|
348
382
|
│ (Sentence-BERT/BGE/Instructor) │
|
|
349
383
|
│ │
|
|
350
|
-
├─── Lexical Matrix
|
|
384
|
+
├─── Lexical Matrix ────────────────┼─── Multi-View
|
|
351
385
|
│ (TF-IDF/BM25) │ Graph Builder
|
|
352
386
|
│ │ │
|
|
353
|
-
└─── Metadata Graph
|
|
387
|
+
└─── Metadata Graph ────────────────┘ │
|
|
354
388
|
(Optional) │
|
|
355
389
|
▼
|
|
356
390
|
┌─────────────────────┐
|
|
@@ -369,6 +403,12 @@ Documents
|
|
|
369
403
|
└──────────┬──────────┘
|
|
370
404
|
│
|
|
371
405
|
┌──────────▼──────────┐
|
|
406
|
+
│ Representative │
|
|
407
|
+
│ Selection │
|
|
408
|
+
│ (Archetype/Hybrid) │
|
|
409
|
+
└──────────┬──────────┘
|
|
410
|
+
│
|
|
411
|
+
┌──────────▼──────────┐
|
|
372
412
|
│ LLM Labeling │
|
|
373
413
|
│ (Claude/GPT-4) │
|
|
374
414
|
└─────────────────────┘
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
tritopic/__init__.py,sha256=BaHbardg5BW9zykYOtYG1ZM1nGwvfVt7DV7NJ7tp4l8,936
|
|
2
|
+
tritopic/config.py,sha256=vL47vU5KAYD1iCzH3cRMFUO1w1NSibmjIuAHNsBLu5c,10614
|
|
3
|
+
tritopic/labeling.py,sha256=SJsvOXRl-q8f3qtk1S66FGozTJsW8bwNnAKGkAklmVQ,8883
|
|
4
|
+
tritopic/model.py,sha256=mzptfvqG_Q81OcS6kiYd7u2uU2AKjxpDYKo9u1EfpH4,25015
|
|
5
|
+
tritopic/visualization.py,sha256=MCiIgIoTzFoQ7GG9WjfSZlV2j1BBGzZwxRddmvmh1OY,9841
|
|
6
|
+
tritopic/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
tritopic/core/clustering.py,sha256=mZoU8SkfSLWjFbdJYcBfjIJ75uQJVH1guMPoVTZOmnM,9461
|
|
8
|
+
tritopic/core/embeddings.py,sha256=ouqW9YQKSn8MtIt0DShhFB14QIhCnqryyz27Ilwg6sM,6707
|
|
9
|
+
tritopic/core/graph.py,sha256=a949-6N9ZH8Jd7hFt6fDdB4K7r1A2qs16eZfYmJFKHM,13605
|
|
10
|
+
tritopic/core/keywords.py,sha256=AnHY7QFGlGsSRfcsss6EpUIGD91ybo3MsWs5Ritb9cM,9667
|
|
11
|
+
tritopic/core/refinement.py,sha256=7e6K-EuqZ4ttqNkhsvOeHlDC1ZjP9TA0_8mdyy5kTEw,7695
|
|
12
|
+
tritopic/core/representatives.py,sha256=hqpnNMxhqyZxjM7CaGV5M1RxR3B358tMaGTKWn8jWOo,19154
|
|
13
|
+
tritopic/multilingual/__init__.py,sha256=EagOqVqMDNKX7AfEAQfVgbR92f2vBy1KSM5O88AEt20,699
|
|
14
|
+
tritopic/multilingual/detection.py,sha256=xeZqNp4l-fRII5s2S4EMzBdJPf3Xgt6e1a3Od2hc2q4,5700
|
|
15
|
+
tritopic/multilingual/stopwords.py,sha256=viMM1pb4VpDEmDpGpx_8sDfumXfrVXKfUULyOZXFFYU,29942
|
|
16
|
+
tritopic/multilingual/tokenizers.py,sha256=seTCzRiUOqO0UbAqA3nn8V8EoVYQ1wiwqcH8lafRCxM,9954
|
|
17
|
+
tritopic-1.1.0.dist-info/METADATA,sha256=nIWD3zUMOQR9efdUFo8zUjM0JVJGgrzgZVDyLbbjJ7I,13922
|
|
18
|
+
tritopic-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
19
|
+
tritopic-1.1.0.dist-info/top_level.txt,sha256=9PASbqQyi0-wa7E2Hl3Z0u1ae7MwLcfgFliFE1ioFBA,9
|
|
20
|
+
tritopic-1.1.0.dist-info/RECORD,,
|