tritopic 0.1.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {tritopic-0.1.0/tritopic.egg-info → tritopic-1.1.0}/PKG-INFO +91 -51
  2. {tritopic-0.1.0 → tritopic-1.1.0}/README.md +65 -30
  3. {tritopic-0.1.0 → tritopic-1.1.0}/pyproject.toml +38 -24
  4. tritopic-1.1.0/tritopic/__init__.py +36 -0
  5. tritopic-1.1.0/tritopic/config.py +289 -0
  6. tritopic-1.1.0/tritopic/core/__init__.py +0 -0
  7. tritopic-1.1.0/tritopic/core/clustering.py +317 -0
  8. tritopic-1.1.0/tritopic/core/embeddings.py +216 -0
  9. tritopic-1.1.0/tritopic/core/graph.py +435 -0
  10. tritopic-1.1.0/tritopic/core/keywords.py +301 -0
  11. tritopic-1.1.0/tritopic/core/refinement.py +231 -0
  12. tritopic-1.1.0/tritopic/core/representatives.py +560 -0
  13. tritopic-1.1.0/tritopic/labeling.py +313 -0
  14. tritopic-1.1.0/tritopic/model.py +718 -0
  15. tritopic-1.1.0/tritopic/multilingual/__init__.py +38 -0
  16. tritopic-1.1.0/tritopic/multilingual/detection.py +208 -0
  17. tritopic-1.1.0/tritopic/multilingual/stopwords.py +467 -0
  18. tritopic-1.1.0/tritopic/multilingual/tokenizers.py +275 -0
  19. tritopic-1.1.0/tritopic/visualization.py +371 -0
  20. {tritopic-0.1.0 → tritopic-1.1.0/tritopic.egg-info}/PKG-INFO +91 -51
  21. {tritopic-0.1.0 → tritopic-1.1.0}/tritopic.egg-info/SOURCES.txt +11 -9
  22. tritopic-1.1.0/tritopic.egg-info/requires.txt +45 -0
  23. tritopic-0.1.0/LICENSE +0 -21
  24. tritopic-0.1.0/tritopic/__init__.py +0 -46
  25. tritopic-0.1.0/tritopic/core/__init__.py +0 -17
  26. tritopic-0.1.0/tritopic/core/clustering.py +0 -331
  27. tritopic-0.1.0/tritopic/core/embeddings.py +0 -222
  28. tritopic-0.1.0/tritopic/core/graph_builder.py +0 -493
  29. tritopic-0.1.0/tritopic/core/keywords.py +0 -337
  30. tritopic-0.1.0/tritopic/core/model.py +0 -810
  31. tritopic-0.1.0/tritopic/labeling/__init__.py +0 -5
  32. tritopic-0.1.0/tritopic/labeling/llm_labeler.py +0 -279
  33. tritopic-0.1.0/tritopic/utils/__init__.py +0 -13
  34. tritopic-0.1.0/tritopic/utils/metrics.py +0 -254
  35. tritopic-0.1.0/tritopic/visualization/__init__.py +0 -5
  36. tritopic-0.1.0/tritopic/visualization/plotter.py +0 -523
  37. tritopic-0.1.0/tritopic.egg-info/requires.txt +0 -30
  38. {tritopic-0.1.0 → tritopic-1.1.0}/setup.cfg +0 -0
  39. {tritopic-0.1.0 → tritopic-1.1.0}/tritopic.egg-info/dependency_links.txt +0 -0
  40. {tritopic-0.1.0 → tritopic-1.1.0}/tritopic.egg-info/top_level.txt +0 -0
{tritopic-0.1.0/tritopic.egg-info → tritopic-1.1.0}/PKG-INFO

@@ -1,14 +1,12 @@
  Metadata-Version: 2.4
  Name: tritopic
- Version: 0.1.0
- Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement - A state-of-the-art topic modeling library
- Author-email: Roman Egger <roman@example.com>
+ Version: 1.1.0
+ Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
+ Author-email: Roman Egger <roman.egger@smartvisions.at>
  License: MIT
- Project-URL: Homepage, https://github.com/roman-egger/tritopic
- Project-URL: Documentation, https://tritopic.readthedocs.io
- Project-URL: Repository, https://github.com/roman-egger/tritopic
- Keywords: topic-modeling,nlp,machine-learning,graph-clustering,leiden,embeddings,text-analysis,bertopic-alternative
+ Keywords: topic-modeling,nlp,machine-learning,bertopic,clustering,text-analysis,multilingual
  Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
  Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3
@@ -20,35 +18,42 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Text Processing :: Linguistic
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
- License-File: LICENSE
  Requires-Dist: numpy>=1.21.0
- Requires-Dist: pandas>=1.3.0
  Requires-Dist: scipy>=1.7.0
  Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: pandas>=1.3.0
  Requires-Dist: sentence-transformers>=2.2.0
  Requires-Dist: leidenalg>=0.9.0
- Requires-Dist: igraph>=0.10.0
- Requires-Dist: umap-learn>=0.5.0
- Requires-Dist: hdbscan>=0.8.0
- Requires-Dist: plotly>=5.0.0
+ Requires-Dist: python-igraph>=0.10.0
  Requires-Dist: tqdm>=4.60.0
- Requires-Dist: rank-bm25>=0.2.0
- Requires-Dist: keybert>=0.7.0
  Provides-Extra: llm
  Requires-Dist: anthropic>=0.18.0; extra == "llm"
  Requires-Dist: openai>=1.0.0; extra == "llm"
- Provides-Extra: full
- Requires-Dist: anthropic>=0.18.0; extra == "full"
- Requires-Dist: openai>=1.0.0; extra == "full"
- Requires-Dist: pacmap>=0.6.0; extra == "full"
- Requires-Dist: datamapplot>=0.1.0; extra == "full"
+ Provides-Extra: multilingual
+ Requires-Dist: langdetect>=1.0.9; extra == "multilingual"
+ Requires-Dist: jieba>=0.42.1; extra == "multilingual"
+ Provides-Extra: japanese
+ Requires-Dist: fugashi>=1.2.0; extra == "japanese"
+ Requires-Dist: unidic-lite>=1.0.8; extra == "japanese"
+ Provides-Extra: korean
+ Requires-Dist: konlpy>=0.6.0; extra == "korean"
+ Provides-Extra: thai
+ Requires-Dist: pythainlp>=4.0.0; extra == "thai"
+ Provides-Extra: visualization
+ Requires-Dist: plotly>=5.0.0; extra == "visualization"
+ Requires-Dist: matplotlib>=3.4.0; extra == "visualization"
+ Requires-Dist: umap-learn>=0.5.0; extra == "visualization"
+ Provides-Extra: evaluation
+ Requires-Dist: gensim>=4.0.0; extra == "evaluation"
+ Provides-Extra: all
+ Requires-Dist: tritopic[evaluation,llm,multilingual,visualization]; extra == "all"
  Provides-Extra: dev
  Requires-Dist: pytest>=7.0.0; extra == "dev"
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
  Requires-Dist: black>=23.0.0; extra == "dev"
  Requires-Dist: ruff>=0.1.0; extra == "dev"
  Requires-Dist: mypy>=1.0.0; extra == "dev"
- Dynamic: license-file
+ Requires-Dist: sphinx>=6.0.0; extra == "dev"

  # 🔺 TriTopic

@@ -68,6 +73,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
  | **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
  | **Leiden + Consensus** | Dramatically more stable than single-run clustering |
  | **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
+ | **Multilingual Support** | 60+ languages with auto language detection |
+ | **Archetype Representatives** | Rich document selection beyond simple centroids |
  | **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |

  ## 📦 Installation
@@ -79,8 +86,11 @@ pip install tritopic
  # With LLM labeling support
  pip install tritopic[llm]

+ # With multilingual support
+ pip install tritopic[multilingual]
+
  # Full installation (all features)
- pip install tritopic[full]
+ pip install tritopic[all]
  ```

  ### From source (development)
@@ -171,6 +181,23 @@ model.generate_labels(labeler)
  print(model.get_topic_info())
  ```

+ ### Multilingual Support
+
+ ```python
+ from tritopic import TriTopic
+
+ # Auto-detect language and select appropriate model
+ model = TriTopic(
+ language="auto", # Auto-detect language
+ multilingual=False, # Use language-specific model
+ verbose=True
+ )
+
+ # Works with Chinese, German, Japanese, etc.
+ chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
+ topics = model.fit_transform(chinese_docs)
+ ```
+
  ### With Metadata

  ```python
@@ -202,13 +229,18 @@ from tritopic import TriTopic, TriTopicConfig

  config = TriTopicConfig(
  # Embedding settings
- embedding_model="all-MiniLM-L6-v2", # or "BAAI/bge-base-en-v1.5"
+ embedding_model="all-MiniLM-L6-v2", # or "auto", "BAAI/bge-base-en-v1.5"
  embedding_batch_size=32,

+ # Language settings
+ language="auto", # or "en", "de", "zh", etc.
+ multilingual=False, # Force multilingual model
+ language_detection_sample=100,
+
  # Graph construction
  n_neighbors=15,
  metric="cosine",
- graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
+ graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
  snn_weight=0.5,

  # Multi-view fusion weights
@@ -227,11 +259,17 @@ config = TriTopicConfig(
  use_iterative_refinement=True,
  max_iterations=5,
  convergence_threshold=0.95,
+ refinement_strength=0.15,

  # Keywords
  n_keywords=10,
+ keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
+
+ # Representatives (with archetype support)
  n_representative_docs=5,
- keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
+ representative_method="hybrid", # "centroid", "medoid", "archetype", "diverse", "hybrid"
+ n_archetypes=4,
+ archetype_method="furthest_sum",

  # Misc
  outlier_threshold=0.1,
@@ -242,16 +280,25 @@ config = TriTopicConfig(
  model = TriTopic(config=config)
  ```

- ### Quick Parameter Override
+ ### Pre-defined Configurations

  ```python
- # Override just what you need
- model = TriTopic(
- embedding_model="BAAI/bge-base-en-v1.5",
- n_neighbors=20,
- use_iterative_refinement=True,
- verbose=True,
- )
+ from tritopic import TriTopic, get_config
+
+ # Fast processing (less accurate)
+ model = TriTopic(config=get_config("fast"))
+
+ # High quality (slower)
+ model = TriTopic(config=get_config("quality"))
+
+ # Multilingual corpus
+ model = TriTopic(config=get_config("multilingual"))
+
+ # Chinese text
+ model = TriTopic(config=get_config("chinese"))
+
+ # German text
+ model = TriTopic(config=get_config("german"))
  ```

  ## 📊 Evaluation
@@ -285,20 +332,6 @@ model = TriTopic()
  topics = model.fit_transform(documents, embeddings=embeddings)
  ```

- ### Find Optimal Resolution
-
- ```python
- from tritopic.core.clustering import ConsensusLeiden
-
- clusterer = ConsensusLeiden()
- optimal_res = clusterer.find_optimal_resolution(
- graph=model.graph_,
- resolution_range=(0.5, 2.0),
- target_n_topics=15, # Optional: target number
- )
- print(f"Optimal resolution: {optimal_res}")
- ```
-
  ### Transform New Documents

  ```python
@@ -327,7 +360,8 @@ model = TriTopic.load("my_topic_model.pkl")
  | Views | Embeddings only | Semantic + Lexical + Metadata |
  | Refinement | None | Iterative embedding refinement |
  | Stability | Low (varies by run) | High (consensus clustering) |
- | Outlier Handling | HDBSCAN built-in | Configurable threshold |
+ | Languages | Limited | 60+ with auto-detection |
+ | Representatives | Centroid only | Archetypes, medoids, diverse |

  ### Benchmark Results

@@ -339,7 +373,7 @@ On 20 Newsgroups dataset (n=18,846):
  | Diversity | 0.834 | **0.891** | +7% |
  | Stability (ARI) | 0.721 | **0.934** | +30% |

- ## 🏗️ Architecture
+ ## 🗂️ Architecture

  ```
  Documents
@@ -347,10 +381,10 @@ Documents
  ├─── Embedding Engine ──────────────┐
  │ (Sentence-BERT/BGE/Instructor) │
  │ │
- ├─── Lexical Matrix ───────────────┼─── Multi-View
+ ├─── Lexical Matrix ────────────────┼─── Multi-View
  │ (TF-IDF/BM25) │ Graph Builder
  │ │ │
- └─── Metadata Graph ───────────────┘
+ └─── Metadata Graph ────────────────┘
  (Optional) │

  ┌─────────────────────┐
@@ -369,6 +403,12 @@ Documents
  └──────────┬──────────┘

  ┌──────────▼──────────┐
+ │ Representative │
+ │ Selection │
+ │ (Archetype/Hybrid) │
+ └──────────┬──────────┘
+
+ ┌──────────▼──────────┐
  │ LLM Labeling │
  │ (Claude/GPT-4) │
  └─────────────────────┘
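The updated README and architecture diagram above describe a pipeline of embeddings → multi-view graph → consensus Leiden clustering → iterative refinement → representative selection → labeling, but the diff shows it only in fragments. As a reading aid, here is a minimal end-to-end sketch assembled strictly from the public calls the README itself demonstrates (`TriTopic`, `fit_transform`, `get_topic_info`); the sample documents are illustrative placeholders, not part of the package.

```python
# Minimal usage sketch based only on calls shown in the README diff above.
from tritopic import TriTopic

# Illustrative placeholder corpus (not from the package).
documents = [
    "Transformers dominate modern NLP benchmarks.",
    "Graph-based clustering groups related documents together.",
    "Central banks adjusted interest rates again this quarter.",
]

model = TriTopic(verbose=True)           # default configuration
topics = model.fit_transform(documents)  # one topic id per document
print(model.get_topic_info())            # summary of the discovered topics
```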
{tritopic-0.1.0 → tritopic-1.1.0}/README.md

@@ -16,6 +16,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
  | **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
  | **Leiden + Consensus** | Dramatically more stable than single-run clustering |
  | **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
+ | **Multilingual Support** | 60+ languages with auto language detection |
+ | **Archetype Representatives** | Rich document selection beyond simple centroids |
  | **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |

  ## 📦 Installation
@@ -27,8 +29,11 @@ pip install tritopic
  # With LLM labeling support
  pip install tritopic[llm]

+ # With multilingual support
+ pip install tritopic[multilingual]
+
  # Full installation (all features)
- pip install tritopic[full]
+ pip install tritopic[all]
  ```

  ### From source (development)
@@ -119,6 +124,23 @@ model.generate_labels(labeler)
  print(model.get_topic_info())
  ```

+ ### Multilingual Support
+
+ ```python
+ from tritopic import TriTopic
+
+ # Auto-detect language and select appropriate model
+ model = TriTopic(
+ language="auto", # Auto-detect language
+ multilingual=False, # Use language-specific model
+ verbose=True
+ )
+
+ # Works with Chinese, German, Japanese, etc.
+ chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
+ topics = model.fit_transform(chinese_docs)
+ ```
+
  ### With Metadata

  ```python
@@ -150,13 +172,18 @@ from tritopic import TriTopic, TriTopicConfig

  config = TriTopicConfig(
  # Embedding settings
- embedding_model="all-MiniLM-L6-v2", # or "BAAI/bge-base-en-v1.5"
+ embedding_model="all-MiniLM-L6-v2", # or "auto", "BAAI/bge-base-en-v1.5"
  embedding_batch_size=32,

+ # Language settings
+ language="auto", # or "en", "de", "zh", etc.
+ multilingual=False, # Force multilingual model
+ language_detection_sample=100,
+
  # Graph construction
  n_neighbors=15,
  metric="cosine",
- graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
+ graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
  snn_weight=0.5,

  # Multi-view fusion weights
@@ -175,11 +202,17 @@ config = TriTopicConfig(
  use_iterative_refinement=True,
  max_iterations=5,
  convergence_threshold=0.95,
+ refinement_strength=0.15,

  # Keywords
  n_keywords=10,
+ keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
+
+ # Representatives (with archetype support)
  n_representative_docs=5,
- keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
+ representative_method="hybrid", # "centroid", "medoid", "archetype", "diverse", "hybrid"
+ n_archetypes=4,
+ archetype_method="furthest_sum",

  # Misc
  outlier_threshold=0.1,
@@ -190,16 +223,25 @@ config = TriTopicConfig(
  model = TriTopic(config=config)
  ```

- ### Quick Parameter Override
+ ### Pre-defined Configurations

  ```python
- # Override just what you need
- model = TriTopic(
- embedding_model="BAAI/bge-base-en-v1.5",
- n_neighbors=20,
- use_iterative_refinement=True,
- verbose=True,
- )
+ from tritopic import TriTopic, get_config
+
+ # Fast processing (less accurate)
+ model = TriTopic(config=get_config("fast"))
+
+ # High quality (slower)
+ model = TriTopic(config=get_config("quality"))
+
+ # Multilingual corpus
+ model = TriTopic(config=get_config("multilingual"))
+
+ # Chinese text
+ model = TriTopic(config=get_config("chinese"))
+
+ # German text
+ model = TriTopic(config=get_config("german"))
  ```

  ## 📊 Evaluation
@@ -233,20 +275,6 @@ model = TriTopic()
  topics = model.fit_transform(documents, embeddings=embeddings)
  ```

- ### Find Optimal Resolution
-
- ```python
- from tritopic.core.clustering import ConsensusLeiden
-
- clusterer = ConsensusLeiden()
- optimal_res = clusterer.find_optimal_resolution(
- graph=model.graph_,
- resolution_range=(0.5, 2.0),
- target_n_topics=15, # Optional: target number
- )
- print(f"Optimal resolution: {optimal_res}")
- ```
-
  ### Transform New Documents

  ```python
@@ -275,7 +303,8 @@ model = TriTopic.load("my_topic_model.pkl")
  | Views | Embeddings only | Semantic + Lexical + Metadata |
  | Refinement | None | Iterative embedding refinement |
  | Stability | Low (varies by run) | High (consensus clustering) |
- | Outlier Handling | HDBSCAN built-in | Configurable threshold |
+ | Languages | Limited | 60+ with auto-detection |
+ | Representatives | Centroid only | Archetypes, medoids, diverse |

  ### Benchmark Results

@@ -287,7 +316,7 @@ On 20 Newsgroups dataset (n=18,846):
  | Diversity | 0.834 | **0.891** | +7% |
  | Stability (ARI) | 0.721 | **0.934** | +30% |

- ## 🏗️ Architecture
+ ## 🗂️ Architecture

  ```
  Documents
@@ -295,10 +324,10 @@ Documents
  ├─── Embedding Engine ──────────────┐
  │ (Sentence-BERT/BGE/Instructor) │
  │ │
- ├─── Lexical Matrix ───────────────┼─── Multi-View
+ ├─── Lexical Matrix ────────────────┼─── Multi-View
  │ (TF-IDF/BM25) │ Graph Builder
  │ │ │
- └─── Metadata Graph ───────────────┘
+ └─── Metadata Graph ────────────────┘
  (Optional) │

  ┌─────────────────────┐
@@ -317,6 +346,12 @@ Documents
  └──────────┬──────────┘

  ┌──────────▼──────────┐
+ │ Representative │
+ │ Selection │
+ │ (Archetype/Hybrid) │
+ └──────────┬──────────┘
+
+ ┌──────────▼──────────┐
  │ LLM Labeling │
  │ (Claude/GPT-4) │
  └─────────────────────┘
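The feature table in both README copies above names "Mutual kNN + SNN" graph construction without showing it. The sketch below is a generic illustration of that idea, not tritopic's implementation (which, per the file list, lives in tritopic/core/graph.py): an edge is kept only when two documents appear in each other's k-nearest-neighbor lists, and it is weighted by the fraction of neighbors the two documents share.

```python
# Generic mutual-kNN + shared-nearest-neighbor (SNN) sketch; illustrative only.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def mutual_knn_snn(embeddings: np.ndarray, k: int = 15):
    """Return a boolean mutual-kNN mask and SNN edge weights (dense, for clarity)."""
    nn = NearestNeighbors(n_neighbors=k + 1, metric="cosine").fit(embeddings)
    _, idx = nn.kneighbors(embeddings)
    neighbors = [set(row[1:]) for row in idx]  # drop each point's self-neighbor

    n = len(embeddings)
    mutual = np.zeros((n, n), dtype=bool)
    snn = np.zeros((n, n))
    for i in range(n):
        for j in neighbors[i]:
            if i in neighbors[j]:  # keep the edge only if the relation is mutual
                mutual[i, j] = True
                snn[i, j] = len(neighbors[i] & neighbors[j]) / k  # shared-neighbor weight
    return mutual, snn
```

Dense n×n matrices keep the sketch short; a production implementation would use sparse structures.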
{tritopic-0.1.0 → tritopic-1.1.0}/pyproject.toml

@@ -4,19 +4,20 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "tritopic"
- version = "0.1.0"
- description = "Tri-Modal Graph Topic Modeling with Iterative Refinement - A state-of-the-art topic modeling library"
+ version = "1.1.0"
+ description = "Tri-Modal Graph Topic Modeling with Iterative Refinement"
  readme = "README.md"
  license = {text = "MIT"}
  authors = [
- {name = "Roman Egger", email = "roman@example.com"}
+ {name = "Roman Egger", email = "roman.egger@smartvisions.at"}
  ]
  keywords = [
- "topic-modeling", "nlp", "machine-learning", "graph-clustering",
- "leiden", "embeddings", "text-analysis", "bertopic-alternative"
+ "topic-modeling", "nlp", "machine-learning", "bertopic",
+ "clustering", "text-analysis", "multilingual"
  ]
  classifiers = [
  "Development Status :: 4 - Beta",
+ "Intended Audience :: Developers",
  "Intended Audience :: Science/Research",
  "License :: OSI Approved :: MIT License",
  "Programming Language :: Python :: 3",
@@ -30,18 +31,13 @@ classifiers = [
  requires-python = ">=3.9"
  dependencies = [
  "numpy>=1.21.0",
- "pandas>=1.3.0",
  "scipy>=1.7.0",
  "scikit-learn>=1.0.0",
+ "pandas>=1.3.0",
  "sentence-transformers>=2.2.0",
  "leidenalg>=0.9.0",
- "igraph>=0.10.0",
- "umap-learn>=0.5.0",
- "hdbscan>=0.8.0",
- "plotly>=5.0.0",
+ "python-igraph>=0.10.0",
  "tqdm>=4.60.0",
- "rank-bm25>=0.2.0",
- "keybert>=0.7.0",
  ]

  [project.optional-dependencies]
@@ -49,11 +45,30 @@ llm = [
  "anthropic>=0.18.0",
  "openai>=1.0.0",
  ]
- full = [
- "anthropic>=0.18.0",
- "openai>=1.0.0",
- "pacmap>=0.6.0",
- "datamapplot>=0.1.0",
+ multilingual = [
+ "langdetect>=1.0.9",
+ "jieba>=0.42.1",
+ ]
+ japanese = [
+ "fugashi>=1.2.0",
+ "unidic-lite>=1.0.8",
+ ]
+ korean = [
+ "konlpy>=0.6.0",
+ ]
+ thai = [
+ "pythainlp>=4.0.0",
+ ]
+ visualization = [
+ "plotly>=5.0.0",
+ "matplotlib>=3.4.0",
+ "umap-learn>=0.5.0",
+ ]
+ evaluation = [
+ "gensim>=4.0.0",
+ ]
+ all = [
+ "tritopic[llm,multilingual,visualization,evaluation]",
  ]
  dev = [
  "pytest>=7.0.0",
@@ -61,12 +76,9 @@ dev = [
  "black>=23.0.0",
  "ruff>=0.1.0",
  "mypy>=1.0.0",
+ "sphinx>=6.0.0",
  ]

- [project.urls]
- Homepage = "https://github.com/roman-egger/tritopic"
- Documentation = "https://tritopic.readthedocs.io"
- Repository = "https://github.com/roman-egger/tritopic"

  [tool.setuptools.packages.find]
  where = ["."]
@@ -74,13 +86,15 @@ include = ["tritopic*"]

  [tool.black]
  line-length = 100
- target-version = ['py39', 'py310', 'py311']
+ target-version = ['py39', 'py310', 'py311', 'py312']

  [tool.ruff]
  line-length = 100
- select = ["E", "F", "W", "I", "N"]
+ select = ["E", "F", "W", "I", "N", "D", "UP"]
+ ignore = ["D100", "D104"]

  [tool.mypy]
- python_version = "3.10"
+ python_version = "3.9"
  warn_return_any = true
  warn_unused_configs = true
+ ignore_missing_imports = true
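The single `full` extra from 0.1.0 is replaced above by finer-grained groups (`multilingual`, `japanese`, `korean`, `thai`, `visualization`, `evaluation`, plus a self-referencing `all`). A hypothetical helper, not part of tritopic, for checking which of those optional backends are importable in the current environment might look like this; module names are taken from the dependency lists above (`umap-learn` imports as `umap`).

```python
# Hypothetical environment check for the optional-dependency groups declared above.
import importlib.util

EXTRAS = {
    "llm": ["anthropic", "openai"],
    "multilingual": ["langdetect", "jieba"],
    "japanese": ["fugashi"],
    "korean": ["konlpy"],
    "thai": ["pythainlp"],
    "visualization": ["plotly", "matplotlib", "umap"],
    "evaluation": ["gensim"],
}

for extra, modules in EXTRAS.items():
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    print(f"{extra}: {'ok' if not missing else 'missing ' + ', '.join(missing)}")
```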
tritopic-1.1.0/tritopic/__init__.py

@@ -0,0 +1,36 @@
+ """
+ TriTopic: Tri-Modal Graph Topic Modeling with Iterative Refinement
+
+ A state-of-the-art topic modeling library that consistently outperforms
+ BERTopic and traditional approaches.
+
+ Key Features:
+ - Multi-view representation (semantic, lexical, metadata)
+ - Hybrid graph construction (Mutual kNN + SNN)
+ - Consensus Leiden clustering for stability
+ - Iterative refinement for improved coherence
+ - Multilingual support (60+ languages)
+ - LLM-powered labeling
+
+ Example:
+ >>> from tritopic import TriTopic
+ >>> model = TriTopic(verbose=True)
+ >>> topics = model.fit_transform(documents)
+ >>> print(model.get_topic_info())
+ """
+
+ __version__ = "1.0.0"
+ __author__ = "Roman Egger"
+
+ from .model import TriTopic, Topic
+ from .config import TriTopicConfig, get_config
+ from .labeling import LLMLabeler, KeywordLabeler
+
+ __all__ = [
+ "TriTopic",
+ "Topic",
+ "TriTopicConfig",
+ "get_config",
+ "LLMLabeler",
+ "KeywordLabeler",
+ ]
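The new top-level `__init__.py` exposes the full public surface (`TriTopic`, `Topic`, `TriTopicConfig`, `get_config`, `LLMLabeler`, `KeywordLabeler`) directly from the `tritopic` namespace. A short sketch of how these exports combine with the `get_config` presets documented in the README diff ("fast", "quality", "multilingual", "chinese", "german"); the corpus is a placeholder and the presets' exact settings are not shown in this diff.

```python
# Sketch only: exports from tritopic/__init__.py plus a README-documented preset.
from tritopic import TriTopic, get_config

docs = ["placeholder document one", "placeholder document two"]  # illustrative

config = get_config("fast")        # preset named in the README; settings assumed
model = TriTopic(config=config)
topics = model.fit_transform(docs)
print(model.get_topic_info())
```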