tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tritopic/model.py ADDED
@@ -0,0 +1,718 @@
+ """
+ TriTopic: Tri-Modal Graph Topic Modeling with Iterative Refinement
+
+ Main model class that orchestrates all components.
+ """
+
+ from __future__ import annotations
+
+ import pickle
+ import warnings
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import numpy as np
+ import pandas as pd
+ from scipy import sparse
+ from tqdm import tqdm
+
+ from .config import TriTopicConfig, get_config
+ from .core.embeddings import EmbeddingEngine
+ from .core.graph import GraphBuilder, MultiViewGraphBuilder
+ from .core.clustering import ConsensusLeiden
+ from .core.refinement import IterativeRefinement
+ from .core.keywords import KeywordExtractor
+ from .core.representatives import RepresentativeSelector
+
+
+ @dataclass
+ class Topic:
+     """Represents a single topic with its metadata."""
+     topic_id: int
+     size: int
+     keywords: List[str]
+     keyword_scores: List[float]
+     representative_docs: List[int]
+     representative_texts: List[str]
+     centroid: Optional[np.ndarray] = None
+     label: Optional[str] = None
+
+     def __repr__(self) -> str:
+         kw_str = ", ".join(self.keywords[:5])
+         label_str = f" ({self.label})" if self.label else ""
+         return f"Topic {self.topic_id}{label_str}: [{kw_str}...] (n={self.size})"
+
+
+ class TriTopic:
+     """
+     Tri-Modal Graph Topic Modeling with Iterative Refinement.
+
+     A graph-based topic modeling approach that combines:
+     - Multi-view representation (semantic, lexical, metadata)
+     - Hybrid graph construction (Mutual kNN + SNN)
+     - Consensus Leiden clustering for stability
+     - Iterative refinement for improved coherence
+     - LLM-powered labeling (optional)
+
+     Parameters
+     ----------
+     config : TriTopicConfig, str, or None
+         Configuration object or name of a preset config.
+         If None, uses the default config.
+     **kwargs
+         Override any config parameter directly.
+
+     Examples
+     --------
+     >>> from tritopic import TriTopic
+     >>> model = TriTopic(verbose=True)
+     >>> topics = model.fit_transform(documents)
+     >>> print(model.get_topic_info())
+
+     >>> # With custom config
+     >>> model = TriTopic(
+     ...     embedding_model="BAAI/bge-base-en-v1.5",
+     ...     n_neighbors=20,
+     ...     use_iterative_refinement=True
+     ... )
+     """
+
+     def __init__(
+         self,
+         config: Optional[Union[TriTopicConfig, str]] = None,
+         **kwargs
+     ):
+         # Load config
+         if config is None:
+             self.config = TriTopicConfig()
+         elif isinstance(config, str):
+             self.config = get_config(config)
+         else:
+             self.config = config
+
+         # Override with kwargs
+         for key, value in kwargs.items():
+             if hasattr(self.config, key):
+                 setattr(self.config, key, value)
+             else:
+                 warnings.warn(f"Unknown config parameter: {key}")
+
+         # Initialize components (lazy loading)
+         self._embedding_engine: Optional[EmbeddingEngine] = None
+         self._graph_builder: Optional[MultiViewGraphBuilder] = None
+         self._clusterer: Optional[ConsensusLeiden] = None
+         self._refiner: Optional[IterativeRefinement] = None
+         self._keyword_extractor: Optional[KeywordExtractor] = None
+         self._representative_selector: Optional[RepresentativeSelector] = None
+
+         # Fitted attributes
+         self.documents_: Optional[List[str]] = None
+         self.embeddings_: Optional[np.ndarray] = None
+         self.graph_: Optional[sparse.csr_matrix] = None
+         self.topic_labels_: Optional[np.ndarray] = None
+         self.topics_: Optional[List[Topic]] = None
+         self.n_topics_: int = 0
+         self.outlier_count_: int = 0
+
+         # Language detection results
+         self.detected_language_: Optional[str] = None
+         self.is_multilingual_: bool = False
+
+     def _log(self, message: str, level: int = 1) -> None:
+         """Print message if verbose mode is enabled."""
+         if self.config.verbose:
+             indent = " " * (level - 1)
+             print(f"{indent}{message}")
+
+     def _initialize_components(self, documents: List[str]) -> None:
+         """Initialize all components based on config and detected language."""
+         # Detect language if auto
+         if self.config.language == "auto":
+             self._detect_language(documents)
+         else:
+             self.detected_language_ = self.config.language
+
+         # Get appropriate embedding model
+         embedding_model = self.config.embedding_model
+         if embedding_model == "auto":
+             embedding_model = self.config.get_embedding_model_for_language(
+                 self.detected_language_ or "en"
+             )
+             self._log(f"Auto-selected embedding model: {embedding_model}", 2)
+
+         # Initialize embedding engine
+         self._embedding_engine = EmbeddingEngine(
+             model_name=embedding_model,
+             batch_size=self.config.embedding_batch_size,
+             device=self.config.device
+         )
+
+         # Initialize graph builder
+         self._graph_builder = MultiViewGraphBuilder(
+             n_neighbors=self.config.n_neighbors,
+             metric=self.config.metric,
+             graph_type=self.config.graph_type,
+             snn_weight=self.config.snn_weight,
+             semantic_weight=self.config.semantic_weight,
+             lexical_weight=self.config.lexical_weight,
+             metadata_weight=self.config.metadata_weight,
+             use_lexical=self.config.use_lexical_view,
+             use_metadata=self.config.use_metadata_view,
+             lexical_method=self.config.lexical_method,
+             ngram_range=self.config.ngram_range
+         )
+
+         # Initialize clusterer
+         self._clusterer = ConsensusLeiden(
+             resolution=self.config.resolution,
+             n_runs=self.config.n_consensus_runs,
+             min_cluster_size=self.config.min_cluster_size,
+             random_state=self.config.random_state
+         )
+
+         # Initialize refiner
+         if self.config.use_iterative_refinement:
+             self._refiner = IterativeRefinement(
+                 max_iterations=self.config.max_iterations,
+                 convergence_threshold=self.config.convergence_threshold,
+                 refinement_strength=self.config.refinement_strength
+             )
+
+         # Initialize keyword extractor
+         self._keyword_extractor = KeywordExtractor(
+             method=self.config.keyword_method,
+             n_keywords=self.config.n_keywords,
+             language=self.detected_language_ or "en",
+             ngram_range=self.config.ngram_range
+         )
+
+         # Initialize representative selector
+         self._representative_selector = RepresentativeSelector(
+             method=self.config.representative_method,
+             n_representatives=self.config.n_representative_docs,
+             n_archetypes=self.config.n_archetypes,
+             archetype_method=self.config.archetype_method
+         )
+
+     def _detect_language(self, documents: List[str]) -> None:
+         """Detect the dominant language of the corpus."""
+         try:
+             from .multilingual.detection import detect_corpus_language
+
+             result = detect_corpus_language(
+                 documents,
+                 sample_size=self.config.language_detection_sample
+             )
+             self.detected_language_ = result["dominant_language"]
+             self.is_multilingual_ = result["is_multilingual"]
+
+             self._log(f"Detected language: {self.detected_language_} "
+                       f"(confidence: {result['confidence']:.2f})", 2)
+             if self.is_multilingual_:
+                 self._log("Corpus appears multilingual", 2)
+
+         except ImportError:
+             self._log("Language detection not available, defaulting to English", 2)
+             self.detected_language_ = "en"
+
+     def fit(
+         self,
+         documents: List[str],
+         embeddings: Optional[np.ndarray] = None,
+         metadata: Optional[pd.DataFrame] = None
+     ) -> "TriTopic":
+         """
+         Fit the topic model on documents.
+
+         Parameters
+         ----------
+         documents : List[str]
+             List of documents to model.
+         embeddings : np.ndarray, optional
+             Pre-computed embeddings. If None, embeddings are generated.
+         metadata : pd.DataFrame, optional
+             Document metadata for multi-view fusion.
+
+         Returns
+         -------
+         self
+             Fitted model.
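+
+         Examples
+         --------
+         >>> # Illustrative sketch: ``docs`` and ``precomputed`` are placeholder
+         >>> # names for your own corpus and a (n_docs, dim) embedding array.
+         >>> model = TriTopic()
+         >>> _ = model.fit(docs, embeddings=precomputed)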
+ """
242
+ n_docs = len(documents)
243
+ self.documents_ = documents
244
+
245
+ self._log(f"🚀 TriTopic: Fitting model on {n_docs} documents")
246
+ self._log(f"Config: {self.config.graph_type} graph, "
247
+ f"{'iterative' if self.config.use_iterative_refinement else 'single-pass'} mode", 1)
248
+
249
+ # Initialize components
250
+ self._initialize_components(documents)
251
+
252
+ # Step 1: Generate embeddings
253
+ if embeddings is not None:
254
+ self._log("→ Using provided embeddings", 1)
255
+ self.embeddings_ = embeddings
256
+ else:
257
+ model_name = self._embedding_engine.model_name
258
+ self._log(f"→ Generating embeddings ({model_name})...", 1)
259
+ self.embeddings_ = self._embedding_engine.encode(
260
+ documents,
261
+ show_progress=self.config.verbose
262
+ )
263
+
264
+ # Step 2: Build lexical similarity (if enabled)
265
+ if self.config.use_lexical_view:
266
+ self._log("→ Building lexical similarity matrix...", 1)
267
+
268
+ # Step 3: Build graph
269
+ self._log("→ Constructing multi-view graph...", 1)
270
+ self.graph_ = self._graph_builder.build(
271
+ embeddings=self.embeddings_,
272
+ documents=documents,
273
+ metadata=metadata
274
+ )
275
+
276
+ # Step 4: Clustering (with optional refinement)
277
+ if self.config.use_iterative_refinement:
278
+ self._log(f"→ Starting iterative refinement (max {self.config.max_iterations} iterations)...", 1)
279
+
280
+ def graph_builder_fn(emb):
281
+ return self._graph_builder.build(
282
+ embeddings=emb,
283
+ documents=documents,
284
+ metadata=metadata
285
+ )
286
+
287
+ def cluster_fn(g):
288
+ return self._clusterer.fit_predict(g)
289
+
290
+ self.topic_labels_, self.embeddings_, iterations = self._refiner.refine(
291
+ embeddings=self.embeddings_,
292
+ initial_labels=self._clusterer.fit_predict(self.graph_),
293
+ graph_builder_fn=graph_builder_fn,
294
+ cluster_fn=cluster_fn,
295
+ verbose=self.config.verbose
296
+ )
297
+
298
+ # Rebuild final graph with refined embeddings
299
+ self.graph_ = graph_builder_fn(self.embeddings_)
300
+
301
+ else:
302
+ self._log("→ Consensus clustering...", 1)
303
+ self.topic_labels_ = self._clusterer.fit_predict(self.graph_)
304
+
305
+ # Step 5: Extract keywords and representatives
306
+ self._log("→ Extracting keywords and representative documents...", 1)
307
+ self._extract_topic_info(documents)
308
+
309
+ # Summary
310
+ self.n_topics_ = len([t for t in self.topics_ if t.topic_id >= 0])
311
+ self.outlier_count_ = sum(1 for l in self.topic_labels_ if l < 0)
312
+ outlier_pct = 100 * self.outlier_count_ / n_docs
313
+
314
+ self._log("")
315
+ self._log(f"✅ Fitting complete!")
316
+ self._log(f" Found {self.n_topics_} topics")
317
+ self._log(f" {self.outlier_count_} outlier documents ({outlier_pct:.1f}%)")
318
+
319
+ return self
320
+
321
+ def fit_transform(
322
+ self,
323
+ documents: List[str],
324
+ embeddings: Optional[np.ndarray] = None,
325
+ metadata: Optional[pd.DataFrame] = None
326
+ ) -> np.ndarray:
327
+ """
328
+ Fit the model and return topic assignments.
329
+
330
+ Parameters
331
+ ----------
332
+ documents : List[str]
333
+ List of documents to model.
334
+ embeddings : np.ndarray, optional
335
+ Pre-computed embeddings.
336
+ metadata : pd.DataFrame, optional
337
+ Document metadata.
338
+
339
+ Returns
340
+ -------
341
+ np.ndarray
342
+ Topic assignments for each document (-1 for outliers).
343
+ """
344
+ self.fit(documents, embeddings, metadata)
345
+ return self.topic_labels_
346
+
347
+ def transform(self, documents: List[str]) -> np.ndarray:
348
+ """
349
+ Assign topics to new documents.
350
+
351
+ Parameters
352
+ ----------
353
+ documents : List[str]
354
+ New documents to classify.
355
+
356
+ Returns
357
+ -------
358
+ np.ndarray
359
+ Topic assignments for each document.
360
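+
+         Examples
+         --------
+         >>> # Illustrative; requires a fitted model.
+         >>> new_labels = model.transform(["a brand-new document"])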
+ """
361
+ if self.topics_ is None:
362
+ raise ValueError("Model not fitted. Call fit() first.")
363
+
364
+ # Encode new documents
365
+ new_embeddings = self._embedding_engine.encode(documents)
366
+
367
+ # Find nearest topic centroid for each document
368
+ topic_centroids = np.array([
369
+ t.centroid for t in self.topics_
370
+ if t.topic_id >= 0 and t.centroid is not None
371
+ ])
372
+ topic_ids = [t.topic_id for t in self.topics_ if t.topic_id >= 0]
373
+
374
+ # Compute distances to centroids
375
+ from sklearn.metrics.pairwise import cosine_distances
376
+ distances = cosine_distances(new_embeddings, topic_centroids)
377
+
378
+ # Assign to nearest topic (with outlier threshold)
379
+ assignments = []
380
+ for i, doc_distances in enumerate(distances):
381
+ min_idx = np.argmin(doc_distances)
382
+ min_dist = doc_distances[min_idx]
383
+
384
+ if min_dist > self.config.outlier_threshold:
385
+ assignments.append(-1)
386
+ else:
387
+ assignments.append(topic_ids[min_idx])
388
+
389
+ return np.array(assignments)
390
+
391
+ def _extract_topic_info(self, documents: List[str]) -> None:
392
+ """Extract keywords and representatives for each topic."""
393
+ unique_labels = sorted(set(self.topic_labels_))
394
+ self.topics_ = []
395
+
396
+ for topic_id in unique_labels:
397
+ # Get documents in this topic
398
+ mask = self.topic_labels_ == topic_id
399
+ topic_docs = [documents[i] for i, m in enumerate(mask) if m]
400
+ topic_embeddings = self.embeddings_[mask]
401
+ topic_indices = np.where(mask)[0]
402
+
403
+ # Extract keywords
404
+ if topic_id >= 0:
405
+ keywords, scores = self._keyword_extractor.extract(
406
+ topic_docs,
407
+ all_documents=documents
408
+ )
409
+ else:
410
+ keywords, scores = ["[outlier]"], [0.0]
411
+
412
+ # Get representative documents
413
+ if topic_id >= 0 and len(topic_docs) > 0:
414
+ reps = self._representative_selector.select(
415
+ embeddings=topic_embeddings,
416
+ documents=topic_docs,
417
+ keywords=keywords[:5],
418
+ global_indices=topic_indices
419
+ )
420
+ rep_indices = [r.doc_id for r in reps.representatives]
421
+ rep_texts = [r.text[:200] for r in reps.representatives]
422
+ else:
423
+ rep_indices = list(topic_indices[:self.config.n_representative_docs])
424
+ rep_texts = topic_docs[:self.config.n_representative_docs]
425
+
426
+ # Compute centroid
427
+ centroid = topic_embeddings.mean(axis=0) if len(topic_embeddings) > 0 else None
428
+
429
+ topic = Topic(
430
+ topic_id=topic_id,
431
+ size=len(topic_docs),
432
+ keywords=keywords,
433
+ keyword_scores=scores,
434
+ representative_docs=rep_indices,
435
+ representative_texts=rep_texts,
436
+ centroid=centroid
437
+ )
438
+ self.topics_.append(topic)
439
+
440
+ def get_topic_info(self) -> pd.DataFrame:
441
+ """
442
+ Get summary information about all topics.
443
+
444
+ Returns
445
+ -------
446
+ pd.DataFrame
447
+ DataFrame with topic information.
448
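+
+         Examples
+         --------
+         >>> # Illustrative: the column layout of the returned frame.
+         >>> list(model.get_topic_info().columns)
+         ['Topic', 'Size', 'Label', 'Keywords', 'Representative']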
+ """
449
+ if self.topics_ is None:
450
+ raise ValueError("Model not fitted. Call fit() first.")
451
+
452
+ data = []
453
+ for topic in self.topics_:
454
+ data.append({
455
+ "Topic": topic.topic_id,
456
+ "Size": topic.size,
457
+ "Label": topic.label or "",
458
+ "Keywords": ", ".join(topic.keywords[:5]),
459
+ "Representative": topic.representative_texts[0][:100] + "..."
460
+ if topic.representative_texts else ""
461
+ })
462
+
463
+ return pd.DataFrame(data)
464
+
465
+ def get_topic(self, topic_id: int) -> Optional[Topic]:
466
+ """
467
+ Get detailed information about a specific topic.
468
+
469
+ Parameters
470
+ ----------
471
+ topic_id : int
472
+ Topic ID to retrieve.
473
+
474
+ Returns
475
+ -------
476
+ Topic or None
477
+ Topic object if found.
478
+ """
479
+ if self.topics_ is None:
480
+ return None
481
+
482
+ for topic in self.topics_:
483
+ if topic.topic_id == topic_id:
484
+ return topic
485
+ return None
486
+
487
+ def get_document_topics(
488
+ self,
489
+ doc_indices: Optional[List[int]] = None
490
+ ) -> pd.DataFrame:
491
+ """
492
+ Get topic assignments for documents.
493
+
494
+ Parameters
495
+ ----------
496
+ doc_indices : List[int], optional
497
+ Specific document indices. If None, returns all.
498
+
499
+ Returns
500
+ -------
501
+ pd.DataFrame
502
+ DataFrame with document-topic assignments.
503
+ """
504
+ if self.topic_labels_ is None:
505
+ raise ValueError("Model not fitted.")
506
+
507
+ if doc_indices is None:
508
+ doc_indices = list(range(len(self.topic_labels_)))
509
+
510
+ data = []
511
+ for idx in doc_indices:
512
+ topic_id = self.topic_labels_[idx]
513
+ topic = self.get_topic(topic_id)
514
+ data.append({
515
+ "Document": idx,
516
+ "Topic": topic_id,
517
+ "Topic_Label": topic.label if topic else "",
518
+ "Text_Preview": self.documents_[idx][:100] + "..."
519
+ })
520
+
521
+ return pd.DataFrame(data)
522
+
523
+ def generate_labels(self, labeler: Any) -> None:
524
+ """
525
+ Generate human-readable labels for topics using an LLM.
526
+
527
+ Parameters
528
+ ----------
529
+ labeler : LLMLabeler
530
+ Labeler instance configured with LLM provider.
531
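+
+         Examples
+         --------
+         >>> # Illustrative; any object exposing
+         >>> # generate_label(keywords=..., representative_docs=...) will do.
+         >>> model.generate_labels(my_labeler)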
+ """
532
+ if self.topics_ is None:
533
+ raise ValueError("Model not fitted.")
534
+
535
+ for topic in self.topics_:
536
+ if topic.topic_id < 0:
537
+ topic.label = "Outliers"
538
+ continue
539
+
540
+ label = labeler.generate_label(
541
+ keywords=topic.keywords,
542
+ representative_docs=topic.representative_texts
543
+ )
544
+ topic.label = label
545
+
546
+ def evaluate(self) -> Dict[str, float]:
547
+ """
548
+ Evaluate the topic model quality.
549
+
550
+ Returns
551
+ -------
552
+ Dict[str, float]
553
+ Dictionary with evaluation metrics.
554
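+
+         Notes
+         -----
+         ``diversity`` is the fraction of unique keywords across all topics:
+         e.g. 3 topics x 10 keywords with 24 distinct terms gives
+         24 / 30 = 0.8.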
+ """
555
+ if self.topics_ is None:
556
+ raise ValueError("Model not fitted.")
557
+
558
+ metrics = {}
559
+
560
+ # Number of topics
561
+ metrics["n_topics"] = self.n_topics_
562
+
563
+ # Outlier ratio
564
+ metrics["outlier_ratio"] = self.outlier_count_ / len(self.topic_labels_)
565
+
566
+ # Topic diversity
567
+ all_keywords = []
568
+ for topic in self.topics_:
569
+ if topic.topic_id >= 0:
570
+ all_keywords.extend(topic.keywords[:10])
571
+
572
+ unique_keywords = len(set(all_keywords))
573
+ total_keywords = len(all_keywords)
574
+ metrics["diversity"] = unique_keywords / total_keywords if total_keywords > 0 else 0
575
+
576
+ # Coherence (simplified NPMI)
577
+ try:
578
+ from .core.keywords import compute_coherence
579
+ coherence_scores = []
580
+ for topic in self.topics_:
581
+ if topic.topic_id >= 0:
582
+ score = compute_coherence(topic.keywords[:10], self.documents_)
583
+ coherence_scores.append(score)
584
+
585
+ metrics["coherence_mean"] = np.mean(coherence_scores) if coherence_scores else 0
586
+ metrics["coherence_std"] = np.std(coherence_scores) if coherence_scores else 0
587
+ except:
588
+ metrics["coherence_mean"] = 0
589
+ metrics["coherence_std"] = 0
590
+
591
+ # Clustering stability
592
+ if hasattr(self._clusterer, "stability_score_"):
593
+ metrics["stability"] = self._clusterer.stability_score_
594
+
595
+ return metrics
596
+
597
+ def visualize(
598
+ self,
599
+ method: str = "umap",
600
+ **kwargs
601
+ ) -> Any:
602
+ """
603
+ Visualize the topic model.
604
+
605
+ Parameters
606
+ ----------
607
+ method : str
608
+ Visualization method: "umap", "tsne", or "pca".
609
+ **kwargs
610
+ Additional arguments for visualization.
611
+
612
+ Returns
613
+ -------
614
+ plotly.graph_objects.Figure
615
+ Interactive visualization.
616
+ """
617
+ try:
618
+ from .visualization import create_topic_visualization
619
+ return create_topic_visualization(
620
+ embeddings=self.embeddings_,
621
+ labels=self.topic_labels_,
622
+ topics=self.topics_,
623
+ documents=self.documents_,
624
+ method=method,
625
+ **kwargs
626
+ )
627
+ except ImportError:
628
+ raise ImportError(
629
+ "Visualization requires plotly. Install with: pip install tritopic[visualization]"
630
+ )
631
+
632
+ def visualize_topics(self, **kwargs) -> Any:
633
+ """Visualize topic keywords as bar charts."""
634
+ try:
635
+ from .visualization import create_topic_barchart
636
+ return create_topic_barchart(self.topics_, **kwargs)
637
+ except ImportError:
638
+ raise ImportError("Visualization requires plotly.")
639
+
640
+ def visualize_hierarchy(self, **kwargs) -> Any:
641
+ """Visualize topic hierarchy as dendrogram."""
642
+ try:
643
+ from .visualization import create_topic_hierarchy
644
+ return create_topic_hierarchy(
645
+ embeddings=self.embeddings_,
646
+ labels=self.topic_labels_,
647
+ topics=self.topics_,
648
+ **kwargs
649
+ )
650
+ except ImportError:
651
+ raise ImportError("Visualization requires plotly.")
652
+
653
+ def save(self, path: Union[str, Path]) -> None:
654
+ """
655
+ Save the model to disk.
656
+
657
+ Parameters
658
+ ----------
659
+ path : str or Path
660
+ File path for saving.
661
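+
+         Examples
+         --------
+         >>> # Illustrative round trip.
+         >>> model.save("tritopic_model.pkl")
+         >>> restored = TriTopic.load("tritopic_model.pkl")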
+ """
662
+ path = Path(path)
663
+
664
+ # Prepare state dict (excluding non-picklable objects)
665
+ state = {
666
+ "config": self.config,
667
+ "documents_": self.documents_,
668
+ "embeddings_": self.embeddings_,
669
+ "topic_labels_": self.topic_labels_,
670
+ "topics_": self.topics_,
671
+ "n_topics_": self.n_topics_,
672
+ "outlier_count_": self.outlier_count_,
673
+ "detected_language_": self.detected_language_,
674
+ "is_multilingual_": self.is_multilingual_,
675
+ }
676
+
677
+ with open(path, "wb") as f:
678
+ pickle.dump(state, f)
679
+
680
+ @classmethod
681
+ def load(cls, path: Union[str, Path]) -> "TriTopic":
682
+ """
683
+ Load a model from disk.
684
+
685
+ Parameters
686
+ ----------
687
+ path : str or Path
688
+ File path to load from.
689
+
690
+ Returns
691
+ -------
692
+ TriTopic
693
+ Loaded model.
694
+ """
695
+ path = Path(path)
696
+
697
+ with open(path, "rb") as f:
698
+ state = pickle.load(f)
699
+
700
+ model = cls(config=state["config"])
701
+ model.documents_ = state["documents_"]
702
+ model.embeddings_ = state["embeddings_"]
703
+ model.topic_labels_ = state["topic_labels_"]
704
+ model.topics_ = state["topics_"]
705
+ model.n_topics_ = state["n_topics_"]
706
+ model.outlier_count_ = state["outlier_count_"]
707
+ model.detected_language_ = state.get("detected_language_")
708
+ model.is_multilingual_ = state.get("is_multilingual_", False)
709
+
710
+ # Re-initialize components for transform()
711
+ model._initialize_components(model.documents_)
712
+
713
+ return model
714
+
715
+ def __repr__(self) -> str:
716
+ if self.topics_ is None:
717
+ return "TriTopic(not fitted)"
718
+ return f"TriTopic(n_topics={self.n_topics_}, n_docs={len(self.documents_)})"