tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tritopic
3
- Version: 0.1.0
4
- Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement - A state-of-the-art topic modeling library
5
- Author-email: Roman Egger <roman@example.com>
3
+ Version: 1.0.0
4
+ Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
5
+ Author-email: Roman Egger <roman.egger@example.com>
6
6
  License: MIT
7
7
  Project-URL: Homepage, https://github.com/roman-egger/tritopic
8
8
  Project-URL: Documentation, https://tritopic.readthedocs.io
9
9
  Project-URL: Repository, https://github.com/roman-egger/tritopic
10
- Keywords: topic-modeling,nlp,machine-learning,graph-clustering,leiden,embeddings,text-analysis,bertopic-alternative
10
+ Project-URL: Issues, https://github.com/roman-egger/tritopic/issues
11
+ Keywords: topic-modeling,nlp,machine-learning,bertopic,clustering,text-analysis,multilingual
11
12
  Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
12
14
  Classifier: Intended Audience :: Science/Research
13
15
  Classifier: License :: OSI Approved :: MIT License
14
16
  Classifier: Programming Language :: Python :: 3
@@ -20,35 +22,42 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
22
  Classifier: Topic :: Text Processing :: Linguistic
21
23
  Requires-Python: >=3.9
22
24
  Description-Content-Type: text/markdown
23
- License-File: LICENSE
24
25
  Requires-Dist: numpy>=1.21.0
25
- Requires-Dist: pandas>=1.3.0
26
26
  Requires-Dist: scipy>=1.7.0
27
27
  Requires-Dist: scikit-learn>=1.0.0
28
+ Requires-Dist: pandas>=1.3.0
28
29
  Requires-Dist: sentence-transformers>=2.2.0
29
30
  Requires-Dist: leidenalg>=0.9.0
30
- Requires-Dist: igraph>=0.10.0
31
- Requires-Dist: umap-learn>=0.5.0
32
- Requires-Dist: hdbscan>=0.8.0
33
- Requires-Dist: plotly>=5.0.0
31
+ Requires-Dist: python-igraph>=0.10.0
34
32
  Requires-Dist: tqdm>=4.60.0
35
- Requires-Dist: rank-bm25>=0.2.0
36
- Requires-Dist: keybert>=0.7.0
37
33
  Provides-Extra: llm
38
34
  Requires-Dist: anthropic>=0.18.0; extra == "llm"
39
35
  Requires-Dist: openai>=1.0.0; extra == "llm"
40
- Provides-Extra: full
41
- Requires-Dist: anthropic>=0.18.0; extra == "full"
42
- Requires-Dist: openai>=1.0.0; extra == "full"
43
- Requires-Dist: pacmap>=0.6.0; extra == "full"
44
- Requires-Dist: datamapplot>=0.1.0; extra == "full"
36
+ Provides-Extra: multilingual
37
+ Requires-Dist: langdetect>=1.0.9; extra == "multilingual"
38
+ Requires-Dist: jieba>=0.42.1; extra == "multilingual"
39
+ Provides-Extra: japanese
40
+ Requires-Dist: fugashi>=1.2.0; extra == "japanese"
41
+ Requires-Dist: unidic-lite>=1.0.8; extra == "japanese"
42
+ Provides-Extra: korean
43
+ Requires-Dist: konlpy>=0.6.0; extra == "korean"
44
+ Provides-Extra: thai
45
+ Requires-Dist: pythainlp>=4.0.0; extra == "thai"
46
+ Provides-Extra: visualization
47
+ Requires-Dist: plotly>=5.0.0; extra == "visualization"
48
+ Requires-Dist: matplotlib>=3.4.0; extra == "visualization"
49
+ Requires-Dist: umap-learn>=0.5.0; extra == "visualization"
50
+ Provides-Extra: evaluation
51
+ Requires-Dist: gensim>=4.0.0; extra == "evaluation"
52
+ Provides-Extra: all
53
+ Requires-Dist: tritopic[evaluation,llm,multilingual,visualization]; extra == "all"
45
54
  Provides-Extra: dev
46
55
  Requires-Dist: pytest>=7.0.0; extra == "dev"
47
56
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
48
57
  Requires-Dist: black>=23.0.0; extra == "dev"
49
58
  Requires-Dist: ruff>=0.1.0; extra == "dev"
50
59
  Requires-Dist: mypy>=1.0.0; extra == "dev"
51
- Dynamic: license-file
60
+ Requires-Dist: sphinx>=6.0.0; extra == "dev"
52
61
 
53
62
  # 🔺 TriTopic
54
63
 
@@ -68,6 +77,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
68
77
  | **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
69
78
  | **Leiden + Consensus** | Dramatically more stable than single-run clustering |
70
79
  | **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
80
+ | **Multilingual Support** | 60+ languages with automatic language detection |
81
+ | **Archetype Representatives** | Rich document selection beyond simple centroids |
71
82
  | **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |
72
83
 
73
84
  ## 📦 Installation
@@ -79,8 +90,11 @@ pip install tritopic
79
90
  # With LLM labeling support
80
91
  pip install tritopic[llm]
81
92
 
93
+ # With multilingual support
94
+ pip install tritopic[multilingual]
95
+
82
96
  # Full installation (all features)
83
- pip install tritopic[full]
97
+ pip install tritopic[all]
84
98
  ```
85
99
 
86
100
  ### From source (development)
@@ -171,6 +185,23 @@ model.generate_labels(labeler)
171
185
  print(model.get_topic_info())
172
186
  ```
173
187
 
188
+ ### Multilingual Support
189
+
190
+ ```python
191
+ from tritopic import TriTopic
192
+
193
+ # Auto-detect the language and select the appropriate model
194
+ model = TriTopic(
195
+ language="auto", # Auto-detect language
196
+ multilingual=False, # Use language-specific model
197
+ verbose=True
198
+ )
199
+
200
+ # Works with Chinese, German, Japanese, etc.
201
+ chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
202
+ topics = model.fit_transform(chinese_docs)
203
+ ```
204
+
174
205
  ### With Metadata
175
206
 
176
207
  ```python
@@ -202,13 +233,18 @@ from tritopic import TriTopic, TriTopicConfig
202
233
 
203
234
  config = TriTopicConfig(
204
235
  # Embedding settings
205
- embedding_model="all-MiniLM-L6-v2", # or "BAAI/bge-base-en-v1.5"
236
+ embedding_model="all-MiniLM-L6-v2", # or "auto", "BAAI/bge-base-en-v1.5"
206
237
  embedding_batch_size=32,
207
238
 
239
+ # Language settings
240
+ language="auto", # or "en", "de", "zh", etc.
241
+ multilingual=False, # Set to True to force the multilingual model
242
+ language_detection_sample=100,
243
+
208
244
  # Graph construction
209
245
  n_neighbors=15,
210
246
  metric="cosine",
211
- graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
247
+ graph_type="hybrid", # "knn", "mutual_knn", "snn", "hybrid"
212
248
  snn_weight=0.5,
213
249
 
214
250
  # Multi-view fusion weights
@@ -227,11 +263,17 @@ config = TriTopicConfig(
227
263
  use_iterative_refinement=True,
228
264
  max_iterations=5,
229
265
  convergence_threshold=0.95,
266
+ refinement_strength=0.15,
230
267
 
231
268
  # Keywords
232
269
  n_keywords=10,
270
+ keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
271
+
272
+ # Representatives (with archetype support)
233
273
  n_representative_docs=5,
234
- keyword_method="ctfidf", # "ctfidf", "bm25", "keybert"
274
+ representative_method="hybrid", # "centroid", "medoid", "archetype", "diverse", "hybrid"
275
+ n_archetypes=4,
276
+ archetype_method="furthest_sum",
235
277
 
236
278
  # Misc
237
279
  outlier_threshold=0.1,
@@ -242,16 +284,25 @@ config = TriTopicConfig(
242
284
  model = TriTopic(config=config)
243
285
  ```
244
286
 
245
- ### Quick Parameter Override
287
+ ### Pre-defined Configurations
246
288
 
247
289
  ```python
248
- # Override just what you need
249
- model = TriTopic(
250
- embedding_model="BAAI/bge-base-en-v1.5",
251
- n_neighbors=20,
252
- use_iterative_refinement=True,
253
- verbose=True,
254
- )
290
+ from tritopic import TriTopic, get_config
291
+
292
+ # Fast processing (less accurate)
293
+ model = TriTopic(config=get_config("fast"))
294
+
295
+ # High quality (slower)
296
+ model = TriTopic(config=get_config("quality"))
297
+
298
+ # Multilingual corpus
299
+ model = TriTopic(config=get_config("multilingual"))
300
+
301
+ # Chinese text
302
+ model = TriTopic(config=get_config("chinese"))
303
+
304
+ # German text
305
+ model = TriTopic(config=get_config("german"))
255
306
  ```
256
307
 
257
308
  ## 📊 Evaluation
@@ -285,20 +336,6 @@ model = TriTopic()
285
336
  topics = model.fit_transform(documents, embeddings=embeddings)
286
337
  ```
287
338
 
288
- ### Find Optimal Resolution
289
-
290
- ```python
291
- from tritopic.core.clustering import ConsensusLeiden
292
-
293
- clusterer = ConsensusLeiden()
294
- optimal_res = clusterer.find_optimal_resolution(
295
- graph=model.graph_,
296
- resolution_range=(0.5, 2.0),
297
- target_n_topics=15, # Optional: target number
298
- )
299
- print(f"Optimal resolution: {optimal_res}")
300
- ```
301
-
302
339
  ### Transform New Documents
303
340
 
304
341
  ```python
@@ -327,7 +364,8 @@ model = TriTopic.load("my_topic_model.pkl")
327
364
  | Views | Embeddings only | Semantic + Lexical + Metadata |
328
365
  | Refinement | None | Iterative embedding refinement |
329
366
  | Stability | Low (varies by run) | High (consensus clustering) |
330
- | Outlier Handling | HDBSCAN built-in | Configurable threshold |
367
+ | Languages | Limited | 60+ with auto-detection |
368
+ | Representatives | Centroid only | Archetypes, medoids, diverse |
331
369
 
332
370
  ### Benchmark Results
333
371
 
@@ -339,7 +377,7 @@ On 20 Newsgroups dataset (n=18,846):
339
377
  | Diversity | 0.834 | **0.891** | +7% |
340
378
  | Stability (ARI) | 0.721 | **0.934** | +30% |
341
379
 
342
- ## 🏗️ Architecture
380
+ ## 🗂️ Architecture
343
381
 
344
382
  ```
345
383
  Documents
@@ -347,10 +385,10 @@ Documents
347
385
  ├─── Embedding Engine ──────────────┐
348
386
  │ (Sentence-BERT/BGE/Instructor) │
349
387
  │ │
350
- ├─── Lexical Matrix ───────────────┼─── Multi-View
388
+ ├─── Lexical Matrix ────────────────┼─── Multi-View
351
389
  │ (TF-IDF/BM25) │ Graph Builder
352
390
  │ │ │
353
- └─── Metadata Graph ───────────────┘
391
+ └─── Metadata Graph ────────────────┘
354
392
  (Optional) │
355
393
 
356
394
  ┌─────────────────────┐
@@ -369,6 +407,12 @@ Documents
369
407
  └──────────┬──────────┘
370
408
 
371
409
  ┌──────────▼──────────┐
410
+ │ Representative │
411
+ │ Selection │
412
+ │ (Archetype/Hybrid) │
413
+ └──────────┬──────────┘
414
+
415
+ ┌──────────▼──────────┐
372
416
  │ LLM Labeling │
373
417
  │ (Claude/GPT-4) │
374
418
  └─────────────────────┘
@@ -0,0 +1,20 @@
1
+ tritopic/__init__.py,sha256=BaHbardg5BW9zykYOtYG1ZM1nGwvfVt7DV7NJ7tp4l8,936
2
+ tritopic/config.py,sha256=bsornL0etlRxQyMa6-Yx7tgXqVR1b8OZPpXM62cibhI,10120
3
+ tritopic/labeling.py,sha256=SJsvOXRl-q8f3qtk1S66FGozTJsW8bwNnAKGkAklmVQ,8883
4
+ tritopic/model.py,sha256=mzptfvqG_Q81OcS6kiYd7u2uU2AKjxpDYKo9u1EfpH4,25015
5
+ tritopic/visualization.py,sha256=MCiIgIoTzFoQ7GG9WjfSZlV2j1BBGzZwxRddmvmh1OY,9841
6
+ tritopic/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ tritopic/core/clustering.py,sha256=mZoU8SkfSLWjFbdJYcBfjIJ75uQJVH1guMPoVTZOmnM,9461
8
+ tritopic/core/embeddings.py,sha256=ouqW9YQKSn8MtIt0DShhFB14QIhCnqryyz27Ilwg6sM,6707
9
+ tritopic/core/graph.py,sha256=a949-6N9ZH8Jd7hFt6fDdB4K7r1A2qs16eZfYmJFKHM,13605
10
+ tritopic/core/keywords.py,sha256=AnHY7QFGlGsSRfcsss6EpUIGD91ybo3MsWs5Ritb9cM,9667
11
+ tritopic/core/refinement.py,sha256=7e6K-EuqZ4ttqNkhsvOeHlDC1ZjP9TA0_8mdyy5kTEw,7695
12
+ tritopic/core/representatives.py,sha256=hqpnNMxhqyZxjM7CaGV5M1RxR3B358tMaGTKWn8jWOo,19154
13
+ tritopic/multilingual/__init__.py,sha256=EagOqVqMDNKX7AfEAQfVgbR92f2vBy1KSM5O88AEt20,699
14
+ tritopic/multilingual/detection.py,sha256=xeZqNp4l-fRII5s2S4EMzBdJPf3Xgt6e1a3Od2hc2q4,5700
15
+ tritopic/multilingual/stopwords.py,sha256=viMM1pb4VpDEmDpGpx_8sDfumXfrVXKfUULyOZXFFYU,29942
16
+ tritopic/multilingual/tokenizers.py,sha256=seTCzRiUOqO0UbAqA3nn8V8EoVYQ1wiwqcH8lafRCxM,9954
17
+ tritopic-1.0.0.dist-info/METADATA,sha256=kwoHBkE7i3m59h5i5QA10IsfBpJ5_rqI1u_SKXHFjQU,14178
18
+ tritopic-1.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
19
+ tritopic-1.0.0.dist-info/top_level.txt,sha256=9PASbqQyi0-wa7E2Hl3Z0u1ae7MwLcfgFliFE1ioFBA,9
20
+ tritopic-1.0.0.dist-info/RECORD,,