PyPI - tritopic - Versions diffs - 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

tritopic 0.1.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tritopic might be problematic. Click here for more details.

Files changed (30)

tritopic/__init__.py +22 -32
tritopic/config.py +289 -0
tritopic/core/__init__.py +0 -17
tritopic/core/clustering.py +229 -243
tritopic/core/embeddings.py +151 -157
tritopic/core/graph.py +435 -0
tritopic/core/keywords.py +213 -249
tritopic/core/refinement.py +231 -0
tritopic/core/representatives.py +560 -0
tritopic/labeling.py +313 -0
tritopic/model.py +718 -0
tritopic/multilingual/__init__.py +38 -0
tritopic/multilingual/detection.py +208 -0
tritopic/multilingual/stopwords.py +467 -0
tritopic/multilingual/tokenizers.py +275 -0
tritopic/visualization.py +371 -0
{tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/METADATA +91 -51
tritopic-1.1.0.dist-info/RECORD +20 -0
tritopic/core/graph_builder.py +0 -493
tritopic/core/model.py +0 -810
tritopic/labeling/__init__.py +0 -5
tritopic/labeling/llm_labeler.py +0 -279
tritopic/utils/__init__.py +0 -13
tritopic/utils/metrics.py +0 -254
tritopic/visualization/__init__.py +0 -5
tritopic/visualization/plotter.py +0 -523
tritopic-0.1.0.dist-info/RECORD +0 -18
tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
{tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/WHEEL +0 -0
{tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/top_level.txt +0 -0

{tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,14 +1,12 @@
 Metadata-Version: 2.4
 Name: tritopic
-Version: 0.1.0
-Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement - A state-of-the-art topic modeling library
-Author-email: Roman Egger <roman@example.com>
+Version: 1.1.0
+Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
+Author-email: Roman Egger <roman.egger@smartvisions.at>
 License: MIT
-Project-URL: Homepage, https://github.com/roman-egger/tritopic
-Project-URL: Documentation, https://tritopic.readthedocs.io
-Project-URL: Repository, https://github.com/roman-egger/tritopic
-Keywords: topic-modeling,nlp,machine-learning,graph-clustering,leiden,embeddings,text-analysis,bertopic-alternative
+Keywords: topic-modeling,nlp,machine-learning,bertopic,clustering,text-analysis,multilingual
 Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
@@ -20,35 +18,42 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
-License-File: LICENSE
 Requires-Dist: numpy>=1.21.0
-Requires-Dist: pandas>=1.3.0
 Requires-Dist: scipy>=1.7.0
 Requires-Dist: scikit-learn>=1.0.0
+Requires-Dist: pandas>=1.3.0
 Requires-Dist: sentence-transformers>=2.2.0
 Requires-Dist: leidenalg>=0.9.0
-Requires-Dist: igraph>=0.10.0
-Requires-Dist: umap-learn>=0.5.0
-Requires-Dist: hdbscan>=0.8.0
-Requires-Dist: plotly>=5.0.0
+Requires-Dist: python-igraph>=0.10.0
 Requires-Dist: tqdm>=4.60.0
-Requires-Dist: rank-bm25>=0.2.0
-Requires-Dist: keybert>=0.7.0
 Provides-Extra: llm
 Requires-Dist: anthropic>=0.18.0; extra == "llm"
 Requires-Dist: openai>=1.0.0; extra == "llm"
-Provides-Extra: full
-Requires-Dist: anthropic>=0.18.0; extra == "full"
-Requires-Dist: openai>=1.0.0; extra == "full"
-Requires-Dist: pacmap>=0.6.0; extra == "full"
-Requires-Dist: datamapplot>=0.1.0; extra == "full"
+Provides-Extra: multilingual
+Requires-Dist: langdetect>=1.0.9; extra == "multilingual"
+Requires-Dist: jieba>=0.42.1; extra == "multilingual"
+Provides-Extra: japanese
+Requires-Dist: fugashi>=1.2.0; extra == "japanese"
+Requires-Dist: unidic-lite>=1.0.8; extra == "japanese"
+Provides-Extra: korean
+Requires-Dist: konlpy>=0.6.0; extra == "korean"
+Provides-Extra: thai
+Requires-Dist: pythainlp>=4.0.0; extra == "thai"
+Provides-Extra: visualization
+Requires-Dist: plotly>=5.0.0; extra == "visualization"
+Requires-Dist: matplotlib>=3.4.0; extra == "visualization"
+Requires-Dist: umap-learn>=0.5.0; extra == "visualization"
+Provides-Extra: evaluation
+Requires-Dist: gensim>=4.0.0; extra == "evaluation"
+Provides-Extra: all
+Requires-Dist: tritopic[evaluation,llm,multilingual,visualization]; extra == "all"
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
 Requires-Dist: black>=23.0.0; extra == "dev"
 Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
-Dynamic: license-file
+Requires-Dist: sphinx>=6.0.0; extra == "dev"
 # 🔺 TriTopic
@@ -68,6 +73,8 @@ A state-of-the-art topic modeling library that consistently outperforms BERTopic
 | **Mutual kNN + SNN** | Eliminates noise bridges between unrelated documents |
 | **Leiden + Consensus** | Dramatically more stable than single-run clustering |
 | **Iterative Refinement** | Topics improve embeddings, embeddings improve topics |
+| **Multilingual Support** | 60+ languages with auto language detection |
+| **Archetype Representatives** | Rich document selection beyond simple centroids |
 | **LLM-Powered Labels** | Human-readable topic names via Claude or GPT-4 |
 ## 📦 Installation
@@ -79,8 +86,11 @@ pip install tritopic
 # With LLM labeling support
 pip install tritopic[llm]
+# With multilingual support
+pip install tritopic[multilingual]
 # Full installation (all features)
-pip install tritopic[full]
+pip install tritopic[all]
 ```
 ### From source (development)
@@ -171,6 +181,23 @@ model.generate_labels(labeler)
 print(model.get_topic_info())
 ```
+### Multilingual Support
+```python
+from tritopic import TriTopic
+# Auto-detect language and select appropriate model
+model = TriTopic(
+    language="auto",          # Auto-detect language
+    multilingual=False,       # Use language-specific model
+    verbose=True
+)
+# Works with Chinese, German, Japanese, etc.
+chinese_docs = ["机器学习正在改变医疗诊断", "深度神经网络取得超人类表现", ...]
+topics = model.fit_transform(chinese_docs)
+```
 ### With Metadata
 ```python
@@ -202,13 +229,18 @@ from tritopic import TriTopic, TriTopicConfig
 config = TriTopicConfig(
     # Embedding settings
-    embedding_model="all-MiniLM-L6-v2",  # or "BAAI/bge-base-en-v1.5"
+    embedding_model="all-MiniLM-L6-v2",  # or "auto", "BAAI/bge-base-en-v1.5"
     embedding_batch_size=32,
+    # Language settings
+    language="auto",           # or "en", "de", "zh", etc.
+    multilingual=False,        # Force multilingual model
+    language_detection_sample=100,
     # Graph construction
     n_neighbors=15,
     metric="cosine",
-    graph_type="hybrid",  # "knn", "mutual_knn", "snn", "hybrid"
+    graph_type="hybrid",       # "knn", "mutual_knn", "snn", "hybrid"
     snn_weight=0.5,
     # Multi-view fusion weights
@@ -227,11 +259,17 @@ config = TriTopicConfig(
     use_iterative_refinement=True,
     max_iterations=5,
     convergence_threshold=0.95,
+    refinement_strength=0.15,
     # Keywords
     n_keywords=10,
+    keyword_method="ctfidf",   # "ctfidf", "bm25", "keybert"
+    # Representatives (with archetype support)
     n_representative_docs=5,
-    keyword_method="ctfidf",  # "ctfidf", "bm25", "keybert"
+    representative_method="hybrid",  # "centroid", "medoid", "archetype", "diverse", "hybrid"
+    n_archetypes=4,
+    archetype_method="furthest_sum",
     # Misc
     outlier_threshold=0.1,
@@ -242,16 +280,25 @@ config = TriTopicConfig(
 model = TriTopic(config=config)
 ```
-### Quick Parameter Override
+### Pre-defined Configurations
 ```python
-# Override just what you need
-model = TriTopic(
-    embedding_model="BAAI/bge-base-en-v1.5",
-    n_neighbors=20,
-    use_iterative_refinement=True,
-    verbose=True,
-)
+from tritopic import TriTopic, get_config
+# Fast processing (less accurate)
+model = TriTopic(config=get_config("fast"))
+# High quality (slower)
+model = TriTopic(config=get_config("quality"))
+# Multilingual corpus
+model = TriTopic(config=get_config("multilingual"))
+# Chinese text
+model = TriTopic(config=get_config("chinese"))
+# German text
+model = TriTopic(config=get_config("german"))
 ```
 ## 📊 Evaluation
@@ -285,20 +332,6 @@ model = TriTopic()
 topics = model.fit_transform(documents, embeddings=embeddings)
 ```
-### Find Optimal Resolution
-```python
-from tritopic.core.clustering import ConsensusLeiden
-clusterer = ConsensusLeiden()
-optimal_res = clusterer.find_optimal_resolution(
-    graph=model.graph_,
-    resolution_range=(0.5, 2.0),
-    target_n_topics=15,  # Optional: target number
-)
-print(f"Optimal resolution: {optimal_res}")
-```
 ### Transform New Documents
 ```python
@@ -327,7 +360,8 @@ model = TriTopic.load("my_topic_model.pkl")
 | Views | Embeddings only | Semantic + Lexical + Metadata |
 | Refinement | None | Iterative embedding refinement |
 | Stability | Low (varies by run) | High (consensus clustering) |
-| Outlier Handling | HDBSCAN built-in | Configurable threshold |
+| Languages | Limited | 60+ with auto-detection |
+| Representatives | Centroid only | Archetypes, medoids, diverse |
 ### Benchmark Results
@@ -339,7 +373,7 @@ On 20 Newsgroups dataset (n=18,846):
 | Diversity | 0.834 | **0.891** | +7% |
 | Stability (ARI) | 0.721 | **0.934** | +30% |
-## 🏗️ Architecture
+## 🗂️ Architecture
 ```
 Documents
@@ -347,10 +381,10 @@ Documents
     ├─── Embedding Engine ──────────────┐
     │    (Sentence-BERT/BGE/Instructor) │
     │                                   │
-    ├─── Lexical Matrix ───────────────┼─── Multi-View
+    ├─── Lexical Matrix ────────────────┼─── Multi-View
     │    (TF-IDF/BM25)                  │    Graph Builder
     │                                   │         │
-    └─── Metadata Graph ───────────────┘         │
+    └─── Metadata Graph ────────────────┘         │
          (Optional)                              │
                                                  ▼
                                     ┌─────────────────────┐
@@ -369,6 +403,12 @@ Documents
                                     └──────────┬──────────┘
                                                │
                                     ┌──────────▼──────────┐
+                                    │  Representative      │
+                                    │  Selection           │
+                                    │  (Archetype/Hybrid)  │
+                                    └──────────┬──────────┘
+                                               │
+                                    ┌──────────▼──────────┐
                                     │   LLM Labeling       │
                                     │  (Claude/GPT-4)      │
                                     └─────────────────────┘

tritopic-1.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,20 @@
+tritopic/__init__.py,sha256=BaHbardg5BW9zykYOtYG1ZM1nGwvfVt7DV7NJ7tp4l8,936
+tritopic/config.py,sha256=vL47vU5KAYD1iCzH3cRMFUO1w1NSibmjIuAHNsBLu5c,10614
+tritopic/labeling.py,sha256=SJsvOXRl-q8f3qtk1S66FGozTJsW8bwNnAKGkAklmVQ,8883
+tritopic/model.py,sha256=mzptfvqG_Q81OcS6kiYd7u2uU2AKjxpDYKo9u1EfpH4,25015
+tritopic/visualization.py,sha256=MCiIgIoTzFoQ7GG9WjfSZlV2j1BBGzZwxRddmvmh1OY,9841
+tritopic/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tritopic/core/clustering.py,sha256=mZoU8SkfSLWjFbdJYcBfjIJ75uQJVH1guMPoVTZOmnM,9461
+tritopic/core/embeddings.py,sha256=ouqW9YQKSn8MtIt0DShhFB14QIhCnqryyz27Ilwg6sM,6707
+tritopic/core/graph.py,sha256=a949-6N9ZH8Jd7hFt6fDdB4K7r1A2qs16eZfYmJFKHM,13605
+tritopic/core/keywords.py,sha256=AnHY7QFGlGsSRfcsss6EpUIGD91ybo3MsWs5Ritb9cM,9667
+tritopic/core/refinement.py,sha256=7e6K-EuqZ4ttqNkhsvOeHlDC1ZjP9TA0_8mdyy5kTEw,7695
+tritopic/core/representatives.py,sha256=hqpnNMxhqyZxjM7CaGV5M1RxR3B358tMaGTKWn8jWOo,19154
+tritopic/multilingual/__init__.py,sha256=EagOqVqMDNKX7AfEAQfVgbR92f2vBy1KSM5O88AEt20,699
+tritopic/multilingual/detection.py,sha256=xeZqNp4l-fRII5s2S4EMzBdJPf3Xgt6e1a3Od2hc2q4,5700
+tritopic/multilingual/stopwords.py,sha256=viMM1pb4VpDEmDpGpx_8sDfumXfrVXKfUULyOZXFFYU,29942
+tritopic/multilingual/tokenizers.py,sha256=seTCzRiUOqO0UbAqA3nn8V8EoVYQ1wiwqcH8lafRCxM,9954
+tritopic-1.1.0.dist-info/METADATA,sha256=nIWD3zUMOQR9efdUFo8zUjM0JVJGgrzgZVDyLbbjJ7I,13922
+tritopic-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+tritopic-1.1.0.dist-info/top_level.txt,sha256=9PASbqQyi0-wa7E2Hl3Z0u1ae7MwLcfgFliFE1ioFBA,9
+tritopic-1.1.0.dist-info/RECORD,,

tritopic 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

Potentially problematic release.

tritopic 0.1.0-py3-none-any.whl → 1.1.0-py3-none-any.whl