tritopic 0.1.0-py3-none-any.whl → 1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tritopic/__init__.py +22 -32
- tritopic/config.py +305 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/METADATA +92 -48
- tritopic-1.0.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/top_level.txt +0 -0
tritopic/core/clustering.py
CHANGED
@@ -1,331 +1,317 @@
 """
-============================
+Clustering Module
 
-- Leiden algorithm (better than Louvain)
-- Consensus clustering for stability
-- Resolution parameter tuning
+Implements Consensus Leiden clustering for stable topic discovery.
 """
 
-from typing import Any
+from typing import List, Optional, Tuple
 import numpy as np
-from collections import Counter
+from scipy import sparse
+import warnings
 
 
 class ConsensusLeiden:
     """
-    Runs multiple Leiden clusterings with different seeds and combines
-    results using consensus clustering. This dramatically improves
-    reproducibility and reduces sensitivity to random initialization.
+    Consensus clustering using Leiden algorithm.
 
-    resolution : float
-        Resolution parameter for Leiden. Higher = more clusters. Default: 1.0
-    n_runs : int
-        Number of consensus runs. Default: 10
-    random_state : int
-        Random seed for reproducibility. Default: 42
-    consensus_threshold : float
-        Minimum agreement ratio for consensus. Default: 0.5
+    Runs Leiden multiple times and builds a co-assignment matrix
+    to find stable cluster assignments.
     """
 
     def __init__(
         self,
         resolution: float = 1.0,
         n_runs: int = 10,
+        min_cluster_size: int = 5,
+        random_state: Optional[int] = 42,
     ):
+        """
+        Initialize the consensus clustering.
+
+        Parameters
+        ----------
+        resolution : float
+            Resolution parameter for Leiden
+        n_runs : int
+            Number of clustering runs
+        min_cluster_size : int
+            Minimum cluster size
+        random_state : int, optional
+            Random seed for reproducibility
+        """
         self.resolution = resolution
         self.n_runs = n_runs
+        self.min_cluster_size = min_cluster_size
         self.random_state = random_state
-        self.consensus_threshold = consensus_threshold
 
+        self._check_dependencies()
+
+    def _check_dependencies(self):
+        """Check if required packages are installed."""
+        try:
+            import leidenalg
+            import igraph
+        except ImportError:
+            raise ImportError(
+                "leidenalg and python-igraph are required for clustering. "
+                "Install with: pip install leidenalg python-igraph"
+            )
 
     def fit_predict(
         self,
-        min_cluster_size: int = 5,
-        resolution: float | None = None,
+        graph: sparse.csr_matrix,
    ) -> np.ndarray:
         """
+        Fit the consensus clustering and predict labels.
 
         Parameters
         ----------
-        min_cluster_size : int
-            Minimum cluster size. Smaller clusters become outliers.
-        resolution : float, optional
-            Override default resolution.
+        graph : sparse.csr_matrix
+            Adjacency matrix of the document graph
 
         Returns
         -------
+        np.ndarray
+            Cluster labels for each document
         """
-        import leidenalg
+        import leidenalg
+        import igraph as ig
+
+        n = graph.shape[0]
 
+        # Convert sparse matrix to igraph
+        sources, targets = graph.nonzero()
+        weights = np.array(graph[sources, targets]).flatten()
 
+        # Create igraph graph
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()
+
+        # Run Leiden multiple times
+        partitions = []
 
         for run in range(self.n_runs):
-            seed = self.random_state + run
+            seed = None if self.random_state is None else self.random_state + run
 
-                resolution_parameter=res,
+            partition = leidenalg.find_partition(
+                g,
+                leidenalg.RBConfigurationVertexPartition,
+                weights='weight',
+                resolution_parameter=self.resolution,
                 seed=seed,
             )
 
-            # Convert to labels
             labels = np.array(partition.membership)
+            partitions.append(labels)
 
+        # Build co-assignment matrix
+        co_assignment = self._build_co_assignment_matrix(partitions)
 
+        # Final clustering on co-assignment matrix
+        final_labels = self._final_clustering(co_assignment, g)
 
+        # Apply minimum cluster size constraint
+        final_labels = self._apply_min_cluster_size(final_labels)
 
+        return final_labels
 
+    def _build_co_assignment_matrix(
+        self,
+        partitions: List[np.ndarray],
+    ) -> np.ndarray:
         """
+        Build co-assignment matrix from multiple partitions.
 
+        C[i,j] = fraction of runs where i and j are in the same cluster
         """
-        # Fallback to most common partition
-        best_labels = partitions[0]
-        return best_labels
+        n = len(partitions[0])
+        co_assignment = np.zeros((n, n))
+
+        for labels in partitions:
+            # Documents with same label get co-assignment
+            for label in np.unique(labels):
+                mask = labels == label
+                indices = np.where(mask)[0]
+                for i in indices:
+                    for j in indices:
+                        co_assignment[i, j] += 1
+
+        # Normalize by number of runs
+        co_assignment /= len(partitions)
+
+        return co_assignment
+
+    def _final_clustering(
+        self,
+        co_assignment: np.ndarray,
+        original_graph: "igraph.Graph",
+    ) -> np.ndarray:
+        """
+        Perform final clustering on the co-assignment matrix.
+        """
+        import leidenalg
+        import igraph as ig
+
+        n = co_assignment.shape[0]
+
+        # Threshold the co-assignment matrix
+        # Keep only edges where co-assignment > 0.5 (majority of runs)
+        threshold = 0.5
+        adjacency = np.where(co_assignment > threshold, co_assignment, 0)
+
+        # Create graph from co-assignment
+        sources, targets = np.where(adjacency > 0)
+        weights = adjacency[sources, targets]
+
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()
+
+        # Final Leiden run
+        partition = leidenalg.find_partition(
+            g,
+            leidenalg.RBConfigurationVertexPartition,
+            weights='weight',
+            resolution_parameter=self.resolution,
+            seed=self.random_state,
+        )
+
+        return np.array(partition.membership)
 
-    def
+    def _apply_min_cluster_size(
         self,
         labels: np.ndarray,
-        min_size: int,
     ) -> np.ndarray:
-        """
+        """
+        Apply minimum cluster size constraint.
 
-            size = np.sum(labels == cluster_id)
-            if size < min_size:
-                result[labels == cluster_id] = -1
+        Small clusters are marked as outliers (-1).
+        """
+        unique_labels, counts = np.unique(labels, return_counts=True)
 
-        label_map = {old: new for new, old in enumerate(unique_labels)}
-        label_map[-1] = -1
+        # Find small clusters
+        small_clusters = unique_labels[counts < self.min_cluster_size]
 
+        # Mark as outliers
+        result = labels.copy()
+        for small_label in small_clusters:
+            result[labels == small_label] = -1
+
+        # Renumber remaining clusters from 0
+        if len(np.unique(result[result >= 0])) > 0:
+            unique_valid = np.unique(result[result >= 0])
+            label_map = {old: new for new, old in enumerate(unique_valid)}
+
+            for old, new in label_map.items():
+                result[labels == old] = new
 
         return result
 
-    def _compute_stability(self) -> float:
-        """Compute stability score as average pairwise ARI."""
-        if len(self._all_partitions) < 2:
-            return 1.0
-
-        ari_scores = []
-        for i in range(len(self._all_partitions)):
-            for j in range(i + 1, len(self._all_partitions)):
-                ari = adjusted_rand_score(
-                    self._all_partitions[i],
-                    self._all_partitions[j]
-                )
-                ari_scores.append(ari)
-
-        return float(np.mean(ari_scores))
-
     def find_optimal_resolution(
         self,
+        graph: sparse.csr_matrix,
+        resolution_range: Tuple[float, float] = (0.1, 2.0),
         n_steps: int = 10,
+        target_n_topics: Optional[int] = None,
     ) -> float:
         """
         Find optimal resolution parameter.
 
         Parameters
         ----------
+        graph : sparse.csr_matrix
+            Document graph
         resolution_range : tuple
-            Range of resolutions to search
+            Range of resolutions to search
         n_steps : int
-            Number of
+            Number of steps in search
         target_n_topics : int, optional
+            Target number of topics
 
         Returns
         -------
+        float
+            Optimal resolution
         """
-        import leidenalg as la
         resolutions = np.linspace(resolution_range[0], resolution_range[1], n_steps)
+        best_resolution = self.resolution
+        best_score = float('-inf')
 
         for res in resolutions:
-                la.RBConfigurationVertexPartition,
-                weights="weight",
-                resolution_parameter=res,
-                seed=self.random_state,
-            )
+            self.resolution = res
+            labels = self.fit_predict(graph)
 
-            modularity = partition.modularity
+            n_topics = len(np.unique(labels[labels >= 0]))
 
-        # Find highest modularity
-        best = max(results, key=lambda x: x["modularity"])
-
-        return best["resolution"]
-
-
-class HDBSCANClusterer:
-    """
-    Alternative clustering using HDBSCAN.
-
-    Useful for datasets with varying density or many outliers.
-    """
-
-    def __init__(
-        self,
-        min_cluster_size: int = 10,
-        min_samples: int = 5,
-        metric: str = "euclidean",
-    ):
-        self.min_cluster_size = min_cluster_size
-        self.min_samples = min_samples
-        self.metric = metric
+            if target_n_topics is not None:
+                # Score based on closeness to target
+                score = -abs(n_topics - target_n_topics)
+            else:
+                # Score based on modularity (higher is better)
+                score = self._compute_modularity(graph, labels)
+
+            if score > best_score:
+                best_score = score
+                best_resolution = res
 
+        self.resolution = best_resolution
+        return best_resolution
 
+    def _compute_modularity(
         self,
+        graph: sparse.csr_matrix,
+        labels: np.ndarray,
+    ) -> float:
+        """Compute modularity of a partition."""
+        import igraph as ig
 
-            Document embeddings (optionally reduced with UMAP first).
-
-        Returns
-        -------
-        labels : np.ndarray
-            Cluster assignments. -1 for outliers.
-        """
-        import hdbscan
+        n = graph.shape[0]
+        sources, targets = graph.nonzero()
+        weights = np.array(graph[sources, targets]).flatten()
 
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()
 
+        # Filter out outliers
+        valid_labels = labels.copy()
+        valid_labels[labels < 0] = 0  # Temporarily assign to cluster 0
 
+        return g.modularity(valid_labels.tolist(), weights='weight')
+
+
+def compute_clustering_stability(
+    labels1: np.ndarray,
+    labels2: np.ndarray,
+) -> float:
+    """
+    Compute stability between two label assignments using Adjusted Rand Index.
+
+    Parameters
+    ----------
+    labels1 : np.ndarray
+        First label assignment
+    labels2 : np.ndarray
+        Second label assignment
+
+    Returns
+    -------
+    float
+        Adjusted Rand Index (0-1, higher is more stable)
+    """
+    from sklearn.metrics import adjusted_rand_score
+
+    # Filter out outliers from both
+    mask = (labels1 >= 0) & (labels2 >= 0)
+
+    if mask.sum() < 2:
+        return 0.0
+
+    return adjusted_rand_score(labels1[mask], labels2[mask])