tritopic-1.1.0-py3-none-any.whl → tritopic-1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tritopic might be problematic; see the registry's advisory page for details.

tritopic/config.py CHANGED
@@ -1,289 +1,305 @@
1
1
  """
2
2
  TriTopic Configuration Module
3
3
 
4
- This module provides configuration classes and utilities for TriTopic.
4
+ Defines all configuration parameters for the TriTopic model.
5
5
  """
6
6
 
7
7
  from dataclasses import dataclass, field
8
- from typing import Optional, List, Literal
8
+ from typing import Optional, List, Literal, Union
9
9
 
10
10
 
11
11
  @dataclass
12
12
  class TriTopicConfig:
13
13
  """
14
- Configuration class for TriTopic model.
14
+ Configuration for TriTopic model.
15
+
16
+ Attributes
17
+ ----------
18
+ # Embedding & Language Settings
19
+ embedding_model : str
20
+ Sentence-Transformer model name or "auto" for automatic selection.
21
+ Auto-selection considers the language parameter.
22
+ embedding_batch_size : int
23
+ Batch size for embedding generation.
24
+ language : str
25
+ ISO 639-1 language code (e.g., "en", "de", "zh") or "auto" for detection.
26
+ multilingual : bool
27
+ If True, uses multilingual embedding models regardless of detected language.
28
+ language_detection_sample : int
29
+ Number of documents to sample for automatic language detection.
30
+ tokenizer : str
31
+ Tokenizer to use: "auto", "whitespace", "spacy", "jieba", "fugashi", "konlpy", "pythainlp".
32
+ custom_stopwords : List[str]
33
+ Additional stopwords to add to the language-specific list.
34
+ min_token_length : int
35
+ Minimum token length to keep.
36
+ max_token_length : int
37
+ Maximum token length to keep.
15
38
 
16
- All hyperparameters can be set here and passed to TriTopic.
39
+ # Graph Construction
40
+ n_neighbors : int
41
+ Number of neighbors for kNN graph construction.
42
+ metric : str
43
+ Distance metric for similarity calculation.
44
+ graph_type : str
45
+ Type of graph: "knn", "mutual_knn", "snn", "hybrid".
46
+ snn_weight : float
47
+ Weight of SNN component in hybrid graph (0-1).
17
48
 
18
- Example:
19
- config = TriTopicConfig(
20
- embedding_model="all-mpnet-base-v2",
21
- n_neighbors=20,
22
- use_iterative_refinement=True
23
- )
24
- model = TriTopic(config=config)
25
- """
49
+ # Multi-View Fusion
50
+ use_lexical_view : bool
51
+ Whether to include lexical (TF-IDF/BM25) similarity.
52
+ use_metadata_view : bool
53
+ Whether to include metadata-based similarity.
54
+ semantic_weight : float
55
+ Weight for semantic (embedding) view.
56
+ lexical_weight : float
57
+ Weight for lexical view.
58
+ metadata_weight : float
59
+ Weight for metadata view.
60
+ lexical_method : str
61
+ Method for lexical similarity: "tfidf", "bm25".
62
+ ngram_range : tuple
63
+ N-gram range for lexical features.
64
+
65
+ # Clustering
66
+ resolution : float
67
+ Resolution parameter for Leiden algorithm.
68
+ n_consensus_runs : int
69
+ Number of clustering runs for consensus.
70
+ min_cluster_size : int
71
+ Minimum number of documents per topic.
26
72
 
27
- # ==========================================================================
28
- # Embedding Settings
29
- # ==========================================================================
30
- embedding_model: str = "all-MiniLM-L6-v2"
31
- """Sentence-Transformer model name or path.
32
- Options: "all-MiniLM-L6-v2", "all-mpnet-base-v2", "BAAI/bge-base-en-v1.5", etc."""
73
+ # Iterative Refinement
74
+ use_iterative_refinement : bool
75
+ Whether to use iterative embedding refinement.
76
+ max_iterations : int
77
+ Maximum refinement iterations.
78
+ convergence_threshold : float
79
+ ARI threshold for convergence detection.
80
+ refinement_strength : float
81
+ How strongly to pull embeddings toward centroids (0-1).
82
+
83
+ # Keywords
84
+ n_keywords : int
85
+ Number of keywords per topic.
86
+ keyword_method : str
87
+ Method for keyword extraction: "ctfidf", "bm25", "keybert".
88
+
89
+ # Representative Documents
90
+ n_representative_docs : int
91
+ Number of representative documents per topic.
92
+ representative_method : str
93
+ Method for selection: "centroid", "medoid", "archetype", "diverse", "hybrid".
94
+ n_archetypes : int
95
+ Number of archetypes per topic (for archetype/hybrid method).
96
+ archetype_method : str
97
+ Algorithm for archetype analysis: "pcha", "convex_hull", "furthest_sum".
33
98
 
34
- embedding_batch_size: int = 32
35
- """Batch size for embedding generation. Reduce if GPU OOM."""
99
+ # Outlier Handling
100
+ outlier_threshold : float
101
+ Threshold for outlier detection (0-1).
102
+ reassign_outliers : bool
103
+ Whether to try reassigning outliers to nearest topic.
36
104
 
37
- # ==========================================================================
38
- # Graph Construction
39
- # ==========================================================================
40
- n_neighbors: int = 15
41
- """Number of neighbors for kNN graph construction."""
105
+ # Misc
106
+ random_state : int
107
+ Random seed for reproducibility.
108
+ verbose : bool
109
+ Whether to print progress information.
110
+ n_jobs : int
111
+ Number of parallel jobs (-1 for all cores).
112
+ """
42
113
 
114
+ # === Embedding & Language Settings ===
115
+ embedding_model: str = "auto"
116
+ embedding_batch_size: int = 32
117
+ language: str = "auto"
118
+ multilingual: bool = False
119
+ language_detection_sample: int = 100
120
+ tokenizer: str = "auto"
121
+ custom_stopwords: Optional[List[str]] = None
122
+ min_token_length: int = 2
123
+ max_token_length: int = 50
124
+
125
+ # === Graph Construction ===
126
+ n_neighbors: int = 15
43
127
  metric: str = "cosine"
44
- """Distance metric: "cosine", "euclidean", "manhattan"."""
45
-
46
128
  graph_type: Literal["knn", "mutual_knn", "snn", "hybrid"] = "hybrid"
47
- """Graph type:
48
- - "knn": Standard k-nearest neighbors
49
- - "mutual_knn": Only bidirectional connections
50
- - "snn": Shared nearest neighbors
51
- - "hybrid": Combination of mutual_knn + snn (recommended)
52
- """
53
-
54
129
  snn_weight: float = 0.5
55
- """Weight of SNN component in hybrid graph (0.0 to 1.0)."""
56
130
 
57
- # ==========================================================================
58
- # Multi-View Fusion
59
- # ==========================================================================
131
+ # === Multi-View Fusion ===
60
132
  use_lexical_view: bool = True
61
- """Include lexical (TF-IDF) similarity in graph construction."""
62
-
63
133
  use_metadata_view: bool = False
64
- """Include metadata similarity in graph construction."""
65
-
66
134
  semantic_weight: float = 0.5
67
- """Weight for semantic (embedding) similarity."""
68
-
69
135
  lexical_weight: float = 0.3
70
- """Weight for lexical (TF-IDF) similarity."""
71
-
72
136
  metadata_weight: float = 0.2
73
- """Weight for metadata similarity."""
137
+ lexical_method: Literal["tfidf", "bm25"] = "tfidf"
138
+ ngram_range: tuple = (1, 2)
74
139
 
75
- # ==========================================================================
76
- # Clustering (Leiden + Consensus)
77
- # ==========================================================================
140
+ # === Clustering ===
78
141
  resolution: float = 1.0
79
- """Leiden resolution parameter. Higher = more topics, lower = fewer topics."""
80
-
81
142
  n_consensus_runs: int = 10
82
- """Number of clustering runs for consensus clustering."""
83
-
84
143
  min_cluster_size: int = 5
85
- """Minimum documents per topic. Smaller clusters become outliers."""
86
144
 
87
- # ==========================================================================
88
- # Iterative Refinement
89
- # ==========================================================================
145
+ # === Iterative Refinement ===
90
146
  use_iterative_refinement: bool = True
91
- """Enable iterative embedding refinement based on discovered topics."""
92
-
93
147
  max_iterations: int = 5
94
- """Maximum number of refinement iterations."""
95
-
96
148
  convergence_threshold: float = 0.95
97
- """ARI threshold for convergence (0.0 to 1.0)."""
98
-
99
- refinement_strength: float = 0.1
100
- """How strongly to pull embeddings toward topic centroids (0.0 to 1.0)."""
149
+ refinement_strength: float = 0.15
101
150
 
102
- # ==========================================================================
103
- # Keyword Extraction
104
- # ==========================================================================
151
+ # === Keywords ===
105
152
  n_keywords: int = 10
106
- """Number of keywords to extract per topic."""
153
+ keyword_method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf"
107
154
 
155
+ # === Representative Documents ===
108
156
  n_representative_docs: int = 5
109
- """Number of representative documents per topic."""
157
+ representative_method: Literal["centroid", "medoid", "archetype", "diverse", "hybrid"] = "hybrid"
158
+ n_archetypes: int = 4
159
+ archetype_method: Literal["pcha", "convex_hull", "furthest_sum"] = "furthest_sum"
110
160
 
111
- keyword_method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf"
112
- """Keyword extraction method:
113
- - "ctfidf": Class-based TF-IDF (fast, good quality)
114
- - "bm25": BM25 scoring
115
- - "keybert": KeyBERT extraction (slower, embedding-based)
116
- """
117
-
118
- # ==========================================================================
119
- # Outlier Handling
120
- # ==========================================================================
161
+ # === Outlier Handling ===
121
162
  outlier_threshold: float = 0.1
122
- """Threshold for outlier detection (documents below this similarity)."""
123
-
124
- reduce_outliers: bool = False
125
- """Whether to reassign outliers to nearest topics."""
126
-
127
- # ==========================================================================
128
- # Dimensionality Reduction (for visualization)
129
- # ==========================================================================
130
- umap_n_neighbors: int = 15
131
- """UMAP n_neighbors for visualization."""
132
-
133
- umap_n_components: int = 2
134
- """UMAP dimensions for visualization."""
163
+ reassign_outliers: bool = False
135
164
 
136
- umap_min_dist: float = 0.1
137
- """UMAP min_dist parameter."""
138
-
139
- # ==========================================================================
140
- # Misc
141
- # ==========================================================================
165
+ # === Misc ===
142
166
  random_state: Optional[int] = 42
143
- """Random seed for reproducibility."""
144
-
145
167
  verbose: bool = True
146
- """Print progress information."""
147
-
148
168
  n_jobs: int = -1
149
- """Number of parallel jobs (-1 = all cores)."""
150
169
 
151
170
  def __post_init__(self):
152
171
  """Validate configuration after initialization."""
153
- # Validate weights
154
- if self.use_lexical_view and not self.use_metadata_view:
155
- total = self.semantic_weight + self.lexical_weight
156
- if abs(total - 1.0) > 0.01:
157
- # Normalize weights
158
- self.semantic_weight = self.semantic_weight / total
159
- self.lexical_weight = self.lexical_weight / total
160
- elif self.use_lexical_view and self.use_metadata_view:
161
- total = self.semantic_weight + self.lexical_weight + self.metadata_weight
162
- if abs(total - 1.0) > 0.01:
163
- # Normalize weights
172
+ self._validate()
173
+
174
+ def _validate(self):
175
+ """Validate configuration parameters."""
176
+ # Weights should sum to ~1.0
177
+ total_weight = self.semantic_weight
178
+ if self.use_lexical_view:
179
+ total_weight += self.lexical_weight
180
+ if self.use_metadata_view:
181
+ total_weight += self.metadata_weight
182
+
183
+ if abs(total_weight - 1.0) > 0.01:
184
+ # Auto-normalize weights
185
+ if self.use_lexical_view and self.use_metadata_view:
186
+ self.semantic_weight = self.semantic_weight / total_weight
187
+ self.lexical_weight = self.lexical_weight / total_weight
188
+ self.metadata_weight = self.metadata_weight / total_weight
189
+ elif self.use_lexical_view:
190
+ total = self.semantic_weight + self.lexical_weight
164
191
  self.semantic_weight = self.semantic_weight / total
165
192
  self.lexical_weight = self.lexical_weight / total
166
- self.metadata_weight = self.metadata_weight / total
193
+ else:
194
+ self.semantic_weight = 1.0
167
195
 
168
196
  # Validate ranges
169
197
  assert 0 < self.n_neighbors <= 100, "n_neighbors must be between 1 and 100"
170
- assert 0 < self.n_consensus_runs <= 50, "n_consensus_runs must be between 1 and 50"
171
- assert 0 <= self.snn_weight <= 1, "snn_weight must be between 0 and 1"
198
+ assert 0 < self.snn_weight <= 1, "snn_weight must be between 0 and 1"
199
+ assert 0 < self.resolution <= 5, "resolution must be between 0 and 5"
172
200
  assert 0 < self.convergence_threshold <= 1, "convergence_threshold must be between 0 and 1"
173
- assert self.min_cluster_size >= 2, "min_cluster_size must be at least 2"
201
+ assert self.n_archetypes >= 2, "n_archetypes must be at least 2"
202
+
203
+ def get_embedding_model_for_language(self, detected_language: str = None) -> str:
204
+ """
205
+ Get the appropriate embedding model based on language settings.
206
+
207
+ Parameters
208
+ ----------
209
+ detected_language : str, optional
210
+ The detected language code if language="auto"
211
+
212
+ Returns
213
+ -------
214
+ str
215
+ The embedding model name to use
216
+ """
217
+ if self.embedding_model != "auto":
218
+ return self.embedding_model
219
+
220
+ lang = detected_language or self.language
221
+
222
+ # If multilingual mode is explicitly enabled
223
+ if self.multilingual:
224
+ return "paraphrase-multilingual-mpnet-base-v2"
225
+
226
+ # Language-specific model selection
227
+ model_map = {
228
+ "en": "all-MiniLM-L6-v2",
229
+ "zh": "BAAI/bge-base-zh-v1.5",
230
+ "ja": "paraphrase-multilingual-MiniLM-L12-v2",
231
+ "ko": "paraphrase-multilingual-MiniLM-L12-v2",
232
+ }
233
+
234
+ # Default to multilingual for non-English
235
+ if lang in model_map:
236
+ return model_map[lang]
237
+ elif lang != "en" and lang != "auto":
238
+ return "paraphrase-multilingual-MiniLM-L12-v2"
239
+ else:
240
+ return "all-MiniLM-L6-v2"
174
241
 
175
242
  def to_dict(self) -> dict:
176
243
  """Convert config to dictionary."""
177
244
  return {
178
- 'embedding_model': self.embedding_model,
179
- 'embedding_batch_size': self.embedding_batch_size,
180
- 'n_neighbors': self.n_neighbors,
181
- 'metric': self.metric,
182
- 'graph_type': self.graph_type,
183
- 'snn_weight': self.snn_weight,
184
- 'use_lexical_view': self.use_lexical_view,
185
- 'use_metadata_view': self.use_metadata_view,
186
- 'semantic_weight': self.semantic_weight,
187
- 'lexical_weight': self.lexical_weight,
188
- 'metadata_weight': self.metadata_weight,
189
- 'resolution': self.resolution,
190
- 'n_consensus_runs': self.n_consensus_runs,
191
- 'min_cluster_size': self.min_cluster_size,
192
- 'use_iterative_refinement': self.use_iterative_refinement,
193
- 'max_iterations': self.max_iterations,
194
- 'convergence_threshold': self.convergence_threshold,
195
- 'refinement_strength': self.refinement_strength,
196
- 'n_keywords': self.n_keywords,
197
- 'n_representative_docs': self.n_representative_docs,
198
- 'keyword_method': self.keyword_method,
199
- 'outlier_threshold': self.outlier_threshold,
200
- 'reduce_outliers': self.reduce_outliers,
201
- 'random_state': self.random_state,
202
- 'verbose': self.verbose,
203
- 'n_jobs': self.n_jobs,
245
+ k: v for k, v in self.__dict__.items()
246
+ if not k.startswith('_')
204
247
  }
205
248
 
206
249
  @classmethod
207
250
  def from_dict(cls, config_dict: dict) -> "TriTopicConfig":
208
251
  """Create config from dictionary."""
209
- return cls(**{k: v for k, v in config_dict.items() if hasattr(cls, k)})
252
+ return cls(**config_dict)
210
253
 
211
254
 
212
- def get_config(**kwargs) -> TriTopicConfig:
213
- """
214
- Helper function to create a TriTopicConfig with custom parameters.
215
-
216
- All parameters are optional and default to TriTopicConfig defaults.
217
-
218
- Example:
219
- config = get_config(
220
- embedding_model="all-mpnet-base-v2",
221
- n_neighbors=20,
222
- use_iterative_refinement=True
223
- )
255
+ # Predefined configurations for common use cases
256
+ CONFIGS = {
257
+ "default": TriTopicConfig(),
224
258
 
225
- Args:
226
- **kwargs: Any TriTopicConfig parameter
227
-
228
- Returns:
229
- TriTopicConfig instance
230
- """
231
- return TriTopicConfig(**kwargs)
232
-
233
-
234
- # Preset configurations for common use cases
235
- PRESETS = {
236
259
  "fast": TriTopicConfig(
237
260
  embedding_model="all-MiniLM-L6-v2",
238
261
  n_neighbors=10,
239
262
  n_consensus_runs=5,
240
263
  use_iterative_refinement=False,
241
- keyword_method="ctfidf",
242
- ),
243
- "balanced": TriTopicConfig(
244
- embedding_model="all-MiniLM-L6-v2",
245
- n_neighbors=15,
246
- n_consensus_runs=10,
247
- use_iterative_refinement=True,
248
- max_iterations=3,
249
- keyword_method="ctfidf",
264
+ representative_method="centroid",
250
265
  ),
266
+
251
267
  "quality": TriTopicConfig(
252
- embedding_model="all-mpnet-base-v2",
253
- n_neighbors=20,
254
- n_consensus_runs=15,
255
- use_iterative_refinement=True,
256
- max_iterations=5,
257
- keyword_method="ctfidf",
258
- ),
259
- "research": TriTopicConfig(
260
268
  embedding_model="BAAI/bge-base-en-v1.5",
261
269
  n_neighbors=20,
262
270
  n_consensus_runs=20,
263
- use_iterative_refinement=True,
264
271
  max_iterations=10,
265
- convergence_threshold=0.98,
266
- keyword_method="ctfidf",
272
+ representative_method="hybrid",
273
+ n_archetypes=5,
267
274
  ),
268
- }
269
-
270
-
271
- def get_preset(name: str) -> TriTopicConfig:
272
- """
273
- Get a preset configuration.
274
275
 
275
- Available presets:
276
- - "fast": Quick results, lower quality
277
- - "balanced": Good balance of speed and quality (default)
278
- - "quality": Higher quality, slower
279
- - "research": Maximum quality for research/publication
276
+ "multilingual": TriTopicConfig(
277
+ multilingual=True,
278
+ embedding_model="paraphrase-multilingual-mpnet-base-v2",
279
+ semantic_weight=0.6,
280
+ lexical_weight=0.2,
281
+ metadata_weight=0.2,
282
+ ),
280
283
 
281
- Args:
282
- name: Preset name
283
-
284
- Returns:
285
- TriTopicConfig instance
286
- """
287
- if name not in PRESETS:
288
- raise ValueError(f"Unknown preset '{name}'. Available: {list(PRESETS.keys())}")
289
- return PRESETS[name]
284
+ "multilingual_quality": TriTopicConfig(
285
+ multilingual=True,
286
+ embedding_model="BAAI/bge-m3",
287
+ n_neighbors=20,
288
+ n_consensus_runs=15,
289
+ semantic_weight=0.6,
290
+ lexical_weight=0.2,
291
+ representative_method="hybrid",
292
+ ),
293
+
294
+ "chinese": TriTopicConfig(
295
+ language="zh",
296
+ embedding_model="BAAI/bge-base-zh-v1.5",
297
+ tokenizer="jieba",
298
+ ngram_range=(1, 2),
299
+ ),
300
+
301
+ "german": TriTopicConfig(
302
+ language="de",
303
+ embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
304
+ ),
305
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tritopic
3
- Version: 1.1.0
3
+ Version: 1.1.2
4
4
  Summary: Tri-Modal Graph Topic Modeling with Iterative Refinement
5
5
  Author-email: Roman Egger <roman.egger@smartvisions.at>
6
6
  License: MIT
@@ -1,5 +1,5 @@
1
1
  tritopic/__init__.py,sha256=BaHbardg5BW9zykYOtYG1ZM1nGwvfVt7DV7NJ7tp4l8,936
2
- tritopic/config.py,sha256=vL47vU5KAYD1iCzH3cRMFUO1w1NSibmjIuAHNsBLu5c,10614
2
+ tritopic/config.py,sha256=bsornL0etlRxQyMa6-Yx7tgXqVR1b8OZPpXM62cibhI,10120
3
3
  tritopic/labeling.py,sha256=SJsvOXRl-q8f3qtk1S66FGozTJsW8bwNnAKGkAklmVQ,8883
4
4
  tritopic/model.py,sha256=mzptfvqG_Q81OcS6kiYd7u2uU2AKjxpDYKo9u1EfpH4,25015
5
5
  tritopic/visualization.py,sha256=MCiIgIoTzFoQ7GG9WjfSZlV2j1BBGzZwxRddmvmh1OY,9841
@@ -14,7 +14,7 @@ tritopic/multilingual/__init__.py,sha256=EagOqVqMDNKX7AfEAQfVgbR92f2vBy1KSM5O88A
14
14
  tritopic/multilingual/detection.py,sha256=xeZqNp4l-fRII5s2S4EMzBdJPf3Xgt6e1a3Od2hc2q4,5700
15
15
  tritopic/multilingual/stopwords.py,sha256=viMM1pb4VpDEmDpGpx_8sDfumXfrVXKfUULyOZXFFYU,29942
16
16
  tritopic/multilingual/tokenizers.py,sha256=seTCzRiUOqO0UbAqA3nn8V8EoVYQ1wiwqcH8lafRCxM,9954
17
- tritopic-1.1.0.dist-info/METADATA,sha256=nIWD3zUMOQR9efdUFo8zUjM0JVJGgrzgZVDyLbbjJ7I,13922
18
- tritopic-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
19
- tritopic-1.1.0.dist-info/top_level.txt,sha256=9PASbqQyi0-wa7E2Hl3Z0u1ae7MwLcfgFliFE1ioFBA,9
20
- tritopic-1.1.0.dist-info/RECORD,,
17
+ tritopic-1.1.2.dist-info/METADATA,sha256=730Y7lueQ4nGWQeu2187uEpS03aaLLXcLLlZTODi668,13922
18
+ tritopic-1.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
19
+ tritopic-1.1.2.dist-info/top_level.txt,sha256=9PASbqQyi0-wa7E2Hl3Z0u1ae7MwLcfgFliFE1ioFBA,9
20
+ tritopic-1.1.2.dist-info/RECORD,,