tritopic 0.1.0.tar.gz → 1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tritopic might be problematic.

Files changed (40)
  1. {tritopic-0.1.0/tritopic.egg-info → tritopic-1.0.0}/PKG-INFO +92 -48
  2. {tritopic-0.1.0 → tritopic-1.0.0}/README.md +65 -30
  3. {tritopic-0.1.0 → tritopic-1.0.0}/pyproject.toml +39 -20
  4. tritopic-1.0.0/tritopic/__init__.py +36 -0
  5. tritopic-1.0.0/tritopic/config.py +305 -0
  6. tritopic-1.0.0/tritopic/core/__init__.py +0 -0
  7. tritopic-1.0.0/tritopic/core/clustering.py +317 -0
  8. tritopic-1.0.0/tritopic/core/embeddings.py +216 -0
  9. tritopic-1.0.0/tritopic/core/graph.py +435 -0
  10. tritopic-1.0.0/tritopic/core/keywords.py +301 -0
  11. tritopic-1.0.0/tritopic/core/refinement.py +231 -0
  12. tritopic-1.0.0/tritopic/core/representatives.py +560 -0
  13. tritopic-1.0.0/tritopic/labeling.py +313 -0
  14. tritopic-1.0.0/tritopic/model.py +718 -0
  15. tritopic-1.0.0/tritopic/multilingual/__init__.py +38 -0
  16. tritopic-1.0.0/tritopic/multilingual/detection.py +208 -0
  17. tritopic-1.0.0/tritopic/multilingual/stopwords.py +467 -0
  18. tritopic-1.0.0/tritopic/multilingual/tokenizers.py +275 -0
  19. tritopic-1.0.0/tritopic/visualization.py +371 -0
  20. {tritopic-0.1.0 → tritopic-1.0.0/tritopic.egg-info}/PKG-INFO +92 -48
  21. {tritopic-0.1.0 → tritopic-1.0.0}/tritopic.egg-info/SOURCES.txt +11 -9
  22. tritopic-1.0.0/tritopic.egg-info/requires.txt +45 -0
  23. tritopic-0.1.0/LICENSE +0 -21
  24. tritopic-0.1.0/tritopic/__init__.py +0 -46
  25. tritopic-0.1.0/tritopic/core/__init__.py +0 -17
  26. tritopic-0.1.0/tritopic/core/clustering.py +0 -331
  27. tritopic-0.1.0/tritopic/core/embeddings.py +0 -222
  28. tritopic-0.1.0/tritopic/core/graph_builder.py +0 -493
  29. tritopic-0.1.0/tritopic/core/keywords.py +0 -337
  30. tritopic-0.1.0/tritopic/core/model.py +0 -810
  31. tritopic-0.1.0/tritopic/labeling/__init__.py +0 -5
  32. tritopic-0.1.0/tritopic/labeling/llm_labeler.py +0 -279
  33. tritopic-0.1.0/tritopic/utils/__init__.py +0 -13
  34. tritopic-0.1.0/tritopic/utils/metrics.py +0 -254
  35. tritopic-0.1.0/tritopic/visualization/__init__.py +0 -5
  36. tritopic-0.1.0/tritopic/visualization/plotter.py +0 -523
  37. tritopic-0.1.0/tritopic.egg-info/requires.txt +0 -30
  38. {tritopic-0.1.0 → tritopic-1.0.0}/setup.cfg +0 -0
  39. {tritopic-0.1.0 → tritopic-1.0.0}/tritopic.egg-info/dependency_links.txt +0 -0
  40. {tritopic-0.1.0 → tritopic-1.0.0}/tritopic.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,14 +1,16 @@
  Metadata-Version: 2.4
  Name: tritopic
- Version: 0.1.0
- Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement - A state-of-the-art topic modeling library
- Author-email: Roman Egger <roman@example.com>
+ Version: 1.0.0
+ Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
+ Author-email: Roman Egger <roman.egger@example.com>
  License: MIT
  Project-URL: Homepage, https://github.com/roman-egger/tritopic
  Project-URL: Documentation, https://tritopic.readthedocs.io
  Project-URL: Repository, https://github.com/roman-egger/tritopic
- Keywords: topic-modeling,nlp,machine-learning,graph-clustering,leiden,embeddings,text-analysis,bertopic-alternative
+ Project-URL: Issues, https://github.com/roman-egger/tritopic/issues
+ Keywords: topic-modeling,nlp,machine-learning,bertopic,clustering,text-analysis,multilingual
  Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
  Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3
@@ -20,35 +22,42 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Text Processing :: Linguistic
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
- License-File: LICENSE
  Requires-Dist: numpy>=1.21.0
- Requires-Dist: pandas>=1.3.0
  Requires-Dist: scipy>=1.7.0
  Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: pandas>=1.3.0
  Requires-Dist: sentence-transformers>=2.2.0
  Requires-Dist: leidenalg>=0.9.0
- Requires-Dist: igraph>=0.10.0
- Requires-Dist: umap-learn>=0.5.0
- Requires-Dist: hdbscan>=0.8.0
- Requires-Dist: plotly>=5.0.0
+ Requires-Dist: python-igraph>=0.10.0
  Requires-Dist: tqdm>=4.60.0
- Requires-Dist: rank-bm25>=0.2.0
- Requires-Dist: keybert>=0.7.0
  Provides-Extra: llm
  Requires-Dist: anthropic>=0.18.0; extra == "llm"
  Requires-Dist: openai>=1.0.0; extra == "llm"
- Provides-Extra: full
- Requires-Dist: anthropic>=0.18.0; extra == "full"
- Requires-Dist: openai>=1.0.0; extra == "full"
- Requires-Dist: pacmap>=0.6.0; extra == "full"
- Requires-Dist: datamapplot>=0.1.0; extra == "full"
+ Provides-Extra: multilingual
+ Requires-Dist: langdetect>=1.0.9; extra == "multilingual"
+ Requires-Dist: jieba>=0.42.1; extra == "multilingual"
+ Provides-Extra: japanese
+ Requires-Dist: fugashi>=1.2.0; extra == "japanese"
+ Requires-Dist: unidic-lite>=1.0.8; extra == "japanese"
+ Provides-Extra: korean
+ Requires-Dist: konlpy>=0.6.0; extra == "korean"
+ Provides-Extra: thai
+ Requires-Dist: pythainlp>=4.0.0; extra == "thai"
+ Provides-Extra: visualization
+ Requires-Dist: plotly>=5.0.0; extra == "visualization"
+ Requires-Dist: matplotlib>=3.4.0; extra == "visualization"
+ Requires-Dist: umap-learn>=0.5.0; extra == "visualization"
+ Provides-Extra: evaluation
+ Requires-Dist: gensim>=4.0.0; extra == "evaluation"
+ Provides-Extra: all
+ Requires-Dist: tritopic[evaluation,llm,multilingual,visualization]; extra == "all"
  Provides-Extra: dev
  Requires-Dist: pytest>=7.0.0; extra == "dev"
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
  Requires-Dist: black>=23.0.0; extra == "dev"
  Requires-Dist: ruff>=0.1.0; extra == "dev"
  Requires-Dist: mypy>=1.0.0; extra == "dev"
- Dynamic: license-file
+ Requires-Dist: sphinx>=6.0.0; extra == "dev"
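The single `full` extra from 0.1.0 is replaced above by per-feature extras plus a self-referential `all` extra. If you want to confirm how these markers look once 1.0.0 is installed, a standard-library check (illustrative only) is:

```python
# Illustrative only: inspect the extras declared in the metadata above using
# the standard library. Requires tritopic 1.0.0 to be installed locally.
from importlib.metadata import requires, version

print(version("tritopic"))  # expected: 1.0.0
for req in requires("tritopic") or []:
    # Core dependencies print bare; optional ones carry markers such as
    # 'langdetect>=1.0.9; extra == "multilingual"'.
    print(req)
```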

  # 🔺 TriTopic

@@ -68,6 +77,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
  | **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
  | **Leiden + Consensus** | Dramatically more stable than single-run clustering |
  | **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
+ | **Multilingual Support** | 60+ languages with auto language detection |
+ | **Archetype Representatives** | Rich document selection beyond simple centroids |
  | **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |

  ## 📦 Installation
@@ -79,8 +90,11 @@ pip install tritopic
  # With LLM labeling support
  pip install tritopic[llm]

+ # With multilingual support
+ pip install tritopic[multilingual]
+
  # Full installation (all features)
- pip install tritopic[full]
+ pip install tritopic[all]
  ```

  ### From source (development)
@@ -171,6 +185,23 @@ model.generate_labels(labeler)
  print(model.get_topic_info())
  ```

+ ### Multilingual Support
+
+ ```python
+ from tritopic import TriTopic
+
+ # Auto-detect language and select appropriate model
+ model = TriTopic(
+     language="auto",      # Auto-detect language
+     multilingual=False,   # Use language-specific model
+     verbose=True
+ )
+
+ # Works with Chinese, German, Japanese, etc.
+ chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
+ topics = model.fit_transform(chinese_docs)
+ ```
+
  ### With Metadata

  ```python
@@ -202,13 +233,18 @@ from tritopic import TriTopic, TriTopicConfig

  config = TriTopicConfig(
      # Embedding settings
-     embedding_model="all-MiniLM-L6-v2",  # or "BAAI/bge-base-en-v1.5"
+     embedding_model="all-MiniLM-L6-v2",  # or "auto", "BAAI/bge-base-en-v1.5"
      embedding_batch_size=32,

+     # Language settings
+     language="auto",                # or "en", "de", "zh", etc.
+     multilingual=False,             # Force multilingual model
+     language_detection_sample=100,
+
      # Graph construction
      n_neighbors=15,
      metric="cosine",
-     graph_type="hybrid",  # "knn", "mutual_knn", "snn", "hybrid"
+     graph_type="hybrid",  # "knn", "mutual_knn", "snn", "hybrid"
      snn_weight=0.5,

      # Multi-view fusion weights
@@ -227,11 +263,17 @@ config = TriTopicConfig(
      use_iterative_refinement=True,
      max_iterations=5,
      convergence_threshold=0.95,
+     refinement_strength=0.15,

      # Keywords
      n_keywords=10,
+     keyword_method="ctfidf",  # "ctfidf", "bm25", "keybert"
+
+     # Representatives (with archetype support)
      n_representative_docs=5,
-     keyword_method="ctfidf",  # "ctfidf", "bm25", "keybert"
+     representative_method="hybrid",  # "centroid", "medoid", "archetype", "diverse", "hybrid"
+     n_archetypes=4,
+     archetype_method="furthest_sum",

      # Misc
      outlier_threshold=0.1,
@@ -242,16 +284,25 @@ config = TriTopicConfig(
  model = TriTopic(config=config)
  ```

- ### Quick Parameter Override
+ ### Pre-defined Configurations

  ```python
- # Override just what you need
- model = TriTopic(
-     embedding_model="BAAI/bge-base-en-v1.5",
-     n_neighbors=20,
-     use_iterative_refinement=True,
-     verbose=True,
- )
+ from tritopic import TriTopic, get_config
+
+ # Fast processing (less accurate)
+ model = TriTopic(config=get_config("fast"))
+
+ # High quality (slower)
+ model = TriTopic(config=get_config("quality"))
+
+ # Multilingual corpus
+ model = TriTopic(config=get_config("multilingual"))
+
+ # Chinese text
+ model = TriTopic(config=get_config("chinese"))
+
+ # German text
+ model = TriTopic(config=get_config("german"))
  ```
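The presets above can presumably be combined with the explicit settings from the previous block; the following is a hypothetical sketch that assumes `get_config` returns a `TriTopicConfig` whose fields can be reassigned, which this diff does not show:

```python
# Hypothetical: start from a preset and override individual fields.
# Assumes get_config() returns a TriTopicConfig whose attributes can be
# reassigned; field names are taken from the configuration example above.
from tritopic import TriTopic, get_config

config = get_config("quality")
config.n_neighbors = 20   # denser graph than the example default of 15
config.n_keywords = 15    # more keywords per topic than the example default of 10
model = TriTopic(config=config)
```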

  ## 📊 Evaluation
@@ -285,20 +336,6 @@ model = TriTopic()
  topics = model.fit_transform(documents, embeddings=embeddings)
  ```

- ### Find Optimal Resolution
-
- ```python
- from tritopic.core.clustering import ConsensusLeiden
-
- clusterer = ConsensusLeiden()
- optimal_res = clusterer.find_optimal_resolution(
-     graph=model.graph_,
-     resolution_range=(0.5, 2.0),
-     target_n_topics=15,  # Optional: target number
- )
- print(f"Optimal resolution: {optimal_res}")
- ```
-
  ### Transform New Documents

  ```python
@@ -327,7 +364,8 @@ model = TriTopic.load("my_topic_model.pkl")
  | Views | Embeddings only | Semantic + Lexical + Metadata |
  | Refinement | None | Iterative embedding refinement |
  | Stability | Low (varies by run) | High (consensus clustering) |
- | Outlier Handling | HDBSCAN built-in | Configurable threshold |
+ | Languages | Limited | 60+ with auto-detection |
+ | Representatives | Centroid only | Archetypes, medoids, diverse |

  ### Benchmark Results

@@ -339,7 +377,7 @@ On 20 Newsgroups dataset (n=18,846):
  | Diversity | 0.834 | **0.891** | +7% |
  | Stability (ARI) | 0.721 | **0.934** | +30% |

- ## 🏗️ Architecture
+ ## 🗂️ Architecture

  ```
  Documents
@@ -347,10 +385,10 @@ Documents
  ├─── Embedding Engine ──────────────┐
  │ (Sentence-BERT/BGE/Instructor) │
  │ │
- ├─── Lexical Matrix ───────────────┼─── Multi-View
+ ├─── Lexical Matrix ────────────────┼─── Multi-View
  │ (TF-IDF/BM25) │ Graph Builder
  │ │ │
- └─── Metadata Graph ───────────────┘
+ └─── Metadata Graph ────────────────┘
  (Optional) │

  ┌─────────────────────┐
@@ -369,6 +407,12 @@ Documents
  └──────────┬──────────┘

  ┌──────────▼──────────┐
+ │ Representative │
+ │ Selection │
+ │ (Archetype/Hybrid) │
+ └──────────┬──────────┘
+
+ ┌──────────▼──────────┐
  │ LLM Labeling │
  │ (Claude/GPT-4) │
  └─────────────────────┘
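Reading the diagram top to bottom, a minimal end-to-end run looks roughly like the sketch below; it only uses calls that appear elsewhere in this README (`fit_transform`, `get_topic_info`, `TriTopic.load`) and the documents are made up:

```python
# Minimal end-to-end sketch of the pipeline drawn above; illustrative only.
from tritopic import TriTopic

documents = [
    "Machine learning is transforming medical diagnostics.",
    "Deep neural networks reach superhuman performance on benchmarks.",
    "Central banks weigh rate cuts as inflation slows.",
]

model = TriTopic(verbose=True)           # embeddings -> graph -> Leiden -> refinement
topics = model.fit_transform(documents)  # precomputed embeddings=... is also accepted
print(model.get_topic_info())            # summary table of the discovered topics

# A fitted model can be restored later, as shown in the README:
# model = TriTopic.load("my_topic_model.pkl")
```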
README.md

@@ -16,6 +16,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
  | **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
  | **Leiden + Consensus** | Dramatically more stable than single-run clustering |
  | **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
+ | **Multilingual Support** | 60+ languages with auto language detection |
+ | **Archetype Representatives** | Rich document selection beyond simple centroids |
  | **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |

  ## 📦 Installation
@@ -27,8 +29,11 @@ pip install tritopic
  # With LLM labeling support
  pip install tritopic[llm]

+ # With multilingual support
+ pip install tritopic[multilingual]
+
  # Full installation (all features)
- pip install tritopic[full]
+ pip install tritopic[all]
  ```

  ### From source (development)
@@ -119,6 +124,23 @@ model.generate_labels(labeler)
  print(model.get_topic_info())
  ```

+ ### Multilingual Support
+
+ ```python
+ from tritopic import TriTopic
+
+ # Auto-detect language and select appropriate model
+ model = TriTopic(
+     language="auto",      # Auto-detect language
+     multilingual=False,   # Use language-specific model
+     verbose=True
+ )
+
+ # Works with Chinese, German, Japanese, etc.
+ chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
+ topics = model.fit_transform(chinese_docs)
+ ```
+
  ### With Metadata

  ```python
@@ -150,13 +172,18 @@ from tritopic import TriTopic, TriTopicConfig

  config = TriTopicConfig(
      # Embedding settings
-     embedding_model="all-MiniLM-L6-v2",  # or "BAAI/bge-base-en-v1.5"
+     embedding_model="all-MiniLM-L6-v2",  # or "auto", "BAAI/bge-base-en-v1.5"
      embedding_batch_size=32,

+     # Language settings
+     language="auto",                # or "en", "de", "zh", etc.
+     multilingual=False,             # Force multilingual model
+     language_detection_sample=100,
+
      # Graph construction
      n_neighbors=15,
      metric="cosine",
-     graph_type="hybrid",  # "knn", "mutual_knn", "snn", "hybrid"
+     graph_type="hybrid",  # "knn", "mutual_knn", "snn", "hybrid"
      snn_weight=0.5,

      # Multi-view fusion weights
@@ -175,11 +202,17 @@ config = TriTopicConfig(
      use_iterative_refinement=True,
      max_iterations=5,
      convergence_threshold=0.95,
+     refinement_strength=0.15,

      # Keywords
      n_keywords=10,
+     keyword_method="ctfidf",  # "ctfidf", "bm25", "keybert"
+
+     # Representatives (with archetype support)
      n_representative_docs=5,
-     keyword_method="ctfidf",  # "ctfidf", "bm25", "keybert"
+     representative_method="hybrid",  # "centroid", "medoid", "archetype", "diverse", "hybrid"
+     n_archetypes=4,
+     archetype_method="furthest_sum",

      # Misc
      outlier_threshold=0.1,
@@ -190,16 +223,25 @@ config = TriTopicConfig(
  model = TriTopic(config=config)
  ```

- ### Quick Parameter Override
+ ### Pre-defined Configurations

  ```python
- # Override just what you need
- model = TriTopic(
-     embedding_model="BAAI/bge-base-en-v1.5",
-     n_neighbors=20,
-     use_iterative_refinement=True,
-     verbose=True,
- )
+ from tritopic import TriTopic, get_config
+
+ # Fast processing (less accurate)
+ model = TriTopic(config=get_config("fast"))
+
+ # High quality (slower)
+ model = TriTopic(config=get_config("quality"))
+
+ # Multilingual corpus
+ model = TriTopic(config=get_config("multilingual"))
+
+ # Chinese text
+ model = TriTopic(config=get_config("chinese"))
+
+ # German text
+ model = TriTopic(config=get_config("german"))
  ```

  ## 📊 Evaluation
@@ -233,20 +275,6 @@ model = TriTopic()
  topics = model.fit_transform(documents, embeddings=embeddings)
  ```

- ### Find Optimal Resolution
-
- ```python
- from tritopic.core.clustering import ConsensusLeiden
-
- clusterer = ConsensusLeiden()
- optimal_res = clusterer.find_optimal_resolution(
-     graph=model.graph_,
-     resolution_range=(0.5, 2.0),
-     target_n_topics=15,  # Optional: target number
- )
- print(f"Optimal resolution: {optimal_res}")
- ```
-
  ### Transform New Documents

  ```python
@@ -275,7 +303,8 @@ model = TriTopic.load("my_topic_model.pkl")
  | Views | Embeddings only | Semantic + Lexical + Metadata |
  | Refinement | None | Iterative embedding refinement |
  | Stability | Low (varies by run) | High (consensus clustering) |
- | Outlier Handling | HDBSCAN built-in | Configurable threshold |
+ | Languages | Limited | 60+ with auto-detection |
+ | Representatives | Centroid only | Archetypes, medoids, diverse |

  ### Benchmark Results

@@ -287,7 +316,7 @@ On 20 Newsgroups dataset (n=18,846):
  | Diversity | 0.834 | **0.891** | +7% |
  | Stability (ARI) | 0.721 | **0.934** | +30% |

- ## 🏗️ Architecture
+ ## 🗂️ Architecture

  ```
  Documents
@@ -295,10 +324,10 @@ Documents
  ├─── Embedding Engine ──────────────┐
  │ (Sentence-BERT/BGE/Instructor) │
  │ │
- ├─── Lexical Matrix ───────────────┼─── Multi-View
+ ├─── Lexical Matrix ────────────────┼─── Multi-View
  │ (TF-IDF/BM25) │ Graph Builder
  │ │ │
- └─── Metadata Graph ───────────────┘
+ └─── Metadata Graph ────────────────┘
  (Optional) │

  ┌─────────────────────┐
@@ -317,6 +346,12 @@ Documents
  └──────────┬──────────┘

  ┌──────────▼──────────┐
+ │ Representative │
+ │ Selection │
+ │ (Archetype/Hybrid) │
+ └──────────┬──────────┘
+
+ ┌──────────▼──────────┐
  │ LLM Labeling │
  │ (Claude/GPT-4) │
  └─────────────────────┘
pyproject.toml

@@ -4,19 +4,20 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "tritopic"
- version = "0.1.0"
- description = "Tri-Modal Graph Topic Modeling with Iterative Refinement - A state-of-the-art topic modeling library"
+ version = "1.0.0"
+ description = "Tri-Modal Graph Topic Modeling with Iterative Refinement"
  readme = "README.md"
  license = {text = "MIT"}
  authors = [
-     {name = "Roman Egger", email = "roman@example.com"}
+     {name = "Roman Egger", email = "roman.egger@example.com"}
  ]
  keywords = [
-     "topic-modeling", "nlp", "machine-learning", "graph-clustering",
-     "leiden", "embeddings", "text-analysis", "bertopic-alternative"
+     "topic-modeling", "nlp", "machine-learning", "bertopic",
+     "clustering", "text-analysis", "multilingual"
  ]
  classifiers = [
      "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
      "Intended Audience :: Science/Research",
      "License :: OSI Approved :: MIT License",
      "Programming Language :: Python :: 3",
@@ -30,18 +31,13 @@ classifiers = [
  requires-python = ">=3.9"
  dependencies = [
      "numpy>=1.21.0",
-     "pandas>=1.3.0",
      "scipy>=1.7.0",
      "scikit-learn>=1.0.0",
+     "pandas>=1.3.0",
      "sentence-transformers>=2.2.0",
      "leidenalg>=0.9.0",
-     "igraph>=0.10.0",
-     "umap-learn>=0.5.0",
-     "hdbscan>=0.8.0",
-     "plotly>=5.0.0",
+     "python-igraph>=0.10.0",
      "tqdm>=4.60.0",
-     "rank-bm25>=0.2.0",
-     "keybert>=0.7.0",
  ]

  [project.optional-dependencies]
@@ -49,11 +45,30 @@ llm = [
      "anthropic>=0.18.0",
      "openai>=1.0.0",
  ]
- full = [
-     "anthropic>=0.18.0",
-     "openai>=1.0.0",
-     "pacmap>=0.6.0",
-     "datamapplot>=0.1.0",
+ multilingual = [
+     "langdetect>=1.0.9",
+     "jieba>=0.42.1",
+ ]
+ japanese = [
+     "fugashi>=1.2.0",
+     "unidic-lite>=1.0.8",
+ ]
+ korean = [
+     "konlpy>=0.6.0",
+ ]
+ thai = [
+     "pythainlp>=4.0.0",
+ ]
+ visualization = [
+     "plotly>=5.0.0",
+     "matplotlib>=3.4.0",
+     "umap-learn>=0.5.0",
+ ]
+ evaluation = [
+     "gensim>=4.0.0",
+ ]
+ all = [
+     "tritopic[llm,multilingual,visualization,evaluation]",
  ]
  dev = [
      "pytest>=7.0.0",
@@ -61,12 +76,14 @@ dev = [
      "black>=23.0.0",
      "ruff>=0.1.0",
      "mypy>=1.0.0",
+     "sphinx>=6.0.0",
  ]

  [project.urls]
  Homepage = "https://github.com/roman-egger/tritopic"
  Documentation = "https://tritopic.readthedocs.io"
  Repository = "https://github.com/roman-egger/tritopic"
+ Issues = "https://github.com/roman-egger/tritopic/issues"

  [tool.setuptools.packages.find]
  where = ["."]
@@ -74,13 +91,15 @@ include = ["tritopic*"]
  [tool.black]
  line-length = 100
- target-version = ['py39', 'py310', 'py311']
+ target-version = ['py39', 'py310', 'py311', 'py312']

  [tool.ruff]
  line-length = 100
- select = ["E", "F", "W", "I", "N"]
+ select = ["E", "F", "W", "I", "N", "D", "UP"]
+ ignore = ["D100", "D104"]

  [tool.mypy]
- python_version = "3.10"
+ python_version = "3.9"
  warn_return_any = true
  warn_unused_configs = true
+ ignore_missing_imports = true
tritopic-1.0.0/tritopic/__init__.py (new file)

@@ -0,0 +1,36 @@
+ """
+ TriTopic: Tri-Modal Graph Topic Modeling with Iterative Refinement
+
+ A state-of-the-art topic modeling library that consistently outperforms
+ BERTopic and traditional approaches.
+
+ Key Features:
+     - Multi-view representation (semantic, lexical, metadata)
+     - Hybrid graph construction (Mutual kNN + SNN)
+     - Consensus Leiden clustering for stability
+     - Iterative refinement for improved coherence
+     - Multilingual support (60+ languages)
+     - LLM-powered labeling
+
+ Example:
+     >>> from tritopic import TriTopic
+     >>> model = TriTopic(verbose=True)
+     >>> topics = model.fit_transform(documents)
+     >>> print(model.get_topic_info())
+ """
+
+ __version__ = "1.0.0"
+ __author__ = "Roman Egger"
+
+ from .model import TriTopic, Topic
+ from .config import TriTopicConfig, get_config
+ from .labeling import LLMLabeler, KeywordLabeler
+
+ __all__ = [
+     "TriTopic",
+     "Topic",
+     "TriTopicConfig",
+     "get_config",
+     "LLMLabeler",
+     "KeywordLabeler",
+ ]
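A quick smoke test against the public API declared in this new `__init__.py` (assumes tritopic 1.0.0 is installed):

```python
# Smoke test: checks only names defined in the __init__.py above.
import tritopic

assert tritopic.__version__ == "1.0.0"
print(sorted(tritopic.__all__))
# ['KeywordLabeler', 'LLMLabeler', 'Topic', 'TriTopic', 'TriTopicConfig', 'get_config']
```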