tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tritopic/__init__.py +22 -32
- tritopic/config.py +305 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/METADATA +92 -48
- tritopic-1.0.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/top_level.txt +0 -0
tritopic/core/keywords.py
CHANGED
@@ -1,337 +1,301 @@
 """
-Keyword Extraction
-================================
+Keyword Extraction Module
 
-
-- c-TF-IDF (class-based TF-IDF
-- BM25
+Extracts topic keywords using various methods:
+- c-TF-IDF (class-based TF-IDF)
+- BM25
 - KeyBERT (embedding-based)
 """
 
-from
-
-from typing import Literal
-from collections import Counter
-
+from typing import List, Dict, Set, Optional, Literal
 import numpy as np
-from
+from collections import Counter
 
 
 class KeywordExtractor:
     """
-
-
-    Supports multiple extraction methods for flexibility.
-
-    Parameters
-    ----------
-    method : str
-        Extraction method: "ctfidf", "bm25", or "keybert"
-    n_keywords : int
-        Number of keywords to extract per topic. Default: 10
-    ngram_range : tuple
-        N-gram range for keyword extraction. Default: (1, 2)
+    Extracts keywords for topics using various methods.
    """
 
     def __init__(
         self,
         method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf",
         n_keywords: int = 10,
-        ngram_range: tuple
+        ngram_range: tuple = (1, 2),
         min_df: int = 2,
-
+        stopwords: Set[str] = None,
     ):
+        """
+        Initialize the keyword extractor.
+
+        Parameters
+        ----------
+        method : str
+            Extraction method
+        n_keywords : int
+            Number of keywords per topic
+        ngram_range : tuple
+            N-gram range
+        min_df : int
+            Minimum document frequency
+        stopwords : set
+            Stopwords to exclude
+        """
         self.method = method
         self.n_keywords = n_keywords
         self.ngram_range = ngram_range
         self.min_df = min_df
-        self.
-
-        self._vectorizer = None
-        self._vocabulary = None
+        self.stopwords = stopwords or set()
 
     def extract(
         self,
-
-
-
-
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+        embeddings: np.ndarray = None,
+    ) -> Dict[int, List[str]]:
         """
-        Extract keywords
+        Extract keywords for each topic.
 
         Parameters
         ----------
-
-
-
-
-
-
+        documents : List[str]
+            Original documents
+        labels : np.ndarray
+            Topic labels
+        tokenized_documents : List[List[str]], optional
+            Pre-tokenized documents
+        embeddings : np.ndarray, optional
+            Document embeddings (for KeyBERT)
 
         Returns
         -------
-
-
-        scores : list[float]
-            Keyword scores.
+        Dict[int, List[str]]
+            Keywords per topic
         """
-        n = n_keywords or self.n_keywords
-
         if self.method == "ctfidf":
-            return self._extract_ctfidf(
+            return self._extract_ctfidf(documents, labels, tokenized_documents)
         elif self.method == "bm25":
-            return self._extract_bm25(
+            return self._extract_bm25(documents, labels, tokenized_documents)
         elif self.method == "keybert":
-            return self._extract_keybert(
+            return self._extract_keybert(documents, labels, embeddings)
         else:
             raise ValueError(f"Unknown method: {self.method}")
 
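The reworked extract() above dispatches on self.method and returns a plain mapping from topic id to keyword list, rather than the old per-call keyword/score pair. As a reviewing aid, a minimal usage sketch based only on the signatures shown in this diff; the corpus, labels, and stopword set below are hypothetical:

import numpy as np
from tritopic.core.keywords import KeywordExtractor

# Hypothetical toy corpus: two topics, three documents each
docs = [
    "neural networks for image classification",
    "deep learning improves vision models",
    "convolutional networks excel at image recognition",
    "stock markets fell on inflation fears",
    "the central bank raised interest rates again",
    "bond yields climb as markets react",
]
labels = np.array([0, 0, 0, 1, 1, 1])

extractor = KeywordExtractor(
    method="ctfidf",
    n_keywords=5,
    stopwords={"the", "for", "at", "on", "as"},  # 0.1.0 hard-coded stop_words="english" instead
)
topic_keywords = extractor.extract(docs, labels)  # Dict[int, List[str]], one entry per topic

Documents labelled -1 are skipped by every method, since each one iterates only over labels >= 0. The diff continues with the c-TF-IDF path: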
     def _extract_ctfidf(
         self,
-
-
-
-    ) ->
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+    ) -> Dict[int, List[str]]:
         """
-        Extract keywords using
+        Extract keywords using c-TF-IDF.
 
-        c-TF-IDF treats
-
-        are distinctive for the topic.
+        c-TF-IDF treats each topic as a single "document" by
+        concatenating all documents in the topic.
         """
-
-        if self._vectorizer is None:
-            self._vectorizer = CountVectorizer(
-                ngram_range=self.ngram_range,
-                stop_words="english",
-                min_df=self.min_df,
-                max_df=self.max_df,
-            )
-            self._vectorizer.fit(all_docs)
-            self._vocabulary = self._vectorizer.get_feature_names_out()
+        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 
-
-        topic_text = " ".join(topic_docs)
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-        #
-
+        # Create topic documents (concatenate all docs in each topic)
+        topic_docs = []
+        topic_ids = []
 
-
-
+        for topic_id in unique_topics:
+            mask = labels == topic_id
+            topic_text = ' '.join(documents[i] for i, m in enumerate(mask) if m)
+            topic_docs.append(topic_text)
+            topic_ids.append(topic_id)
+
+        # Vectorize
+        vectorizer = CountVectorizer(
+            ngram_range=self.ngram_range,
+            min_df=1,  # We're using topic-level docs
+            stop_words=list(self.stopwords) if self.stopwords else None,
+        )
 
-
-
-        idf = np.log(len(all_docs) / (1 + doc_freq))
+        count_matrix = vectorizer.fit_transform(topic_docs)
+        feature_names = vectorizer.get_feature_names_out()
 
-        #
-
-
+        # Apply TF-IDF transformation
+        tfidf = TfidfTransformer()
+        tfidf_matrix = tfidf.fit_transform(count_matrix)
 
-        #
-
+        # Extract top keywords per topic
+        keywords = {}
 
-
-
+        for i, topic_id in enumerate(topic_ids):
+            scores = tfidf_matrix[i].toarray().flatten()
+            top_indices = np.argsort(scores)[::-1]
+
+            topic_keywords = []
+            for idx in top_indices:
+                word = feature_names[idx]
+                if len(topic_keywords) >= self.n_keywords:
+                    break
+                if word.lower() not in self.stopwords:
+                    topic_keywords.append(word)
+
+            keywords[topic_id] = topic_keywords
 
-        return keywords
+        return keywords
 
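As its docstring says, the c-TF-IDF path above treats each topic as one concatenated pseudo-document: the CountVectorizer/TfidfTransformer pair is fitted on one row per topic, so terms concentrated in a single topic tend to outscore terms shared across topics. A standalone sketch of just that transformation, on two hypothetical topic documents:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Each string stands in for all documents of one topic, concatenated
topic_docs = [
    "neural networks deep learning image classification image networks",
    "stock markets inflation interest rates bond yields markets",
]

counts = CountVectorizer(ngram_range=(1, 2)).fit_transform(topic_docs)
tfidf_matrix = TfidfTransformer().fit_transform(counts)  # rows = topics, columns = n-grams

The BM25 path follows: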
     def _extract_bm25(
         self,
-
-
-
-    ) ->
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+    ) -> Dict[int, List[str]]:
         """
         Extract keywords using BM25 scoring.
-
-        BM25 is more robust to document length variations than TF-IDF.
         """
-
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-        # Tokenize
-
-
-        import re
-        tokens = re.findall(r'\b\w+\b', text.lower())
-        # Remove stopwords
-        stopwords = {
-            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
-            'for', 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were',
-            'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
-            'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall',
-            'this', 'that', 'these', 'those', 'it', 'its', 'as', 'if', 'then',
-        }
-        return [t for t in tokens if t not in stopwords and len(t) > 2]
+        # Tokenize if not provided
+        if tokenized_documents is None:
+            tokenized_documents = [doc.lower().split() for doc in documents]
 
-        #
-
-
+        # Filter stopwords
+        tokenized_documents = [
+            [w for w in tokens if w not in self.stopwords]
+            for tokens in tokenized_documents
+        ]
 
-
-        topic_vocab = Counter()
-        for tokens in tokenized_topic:
-            topic_vocab.update(tokens)
+        keywords = {}
 
-
-
-
-
-        word_scores = {}
-        for word, freq in topic_vocab.items():
-            # Use word as query
-            scores = bm25.get_scores([word])
+        for topic_id in unique_topics:
+            mask = labels == topic_id
+            topic_tokens = [tokenized_documents[i] for i, m in enumerate(mask) if m]
+            other_tokens = [tokenized_documents[i] for i, m in enumerate(mask) if not m]
 
-        #
-
-
-
-
-
-
-
-
-
-
-
-
+            # Count term frequencies in topic
+            topic_tf = Counter()
+            for tokens in topic_tokens:
+                topic_tf.update(tokens)
+
+            # Count document frequencies across all docs
+            all_tokens = tokenized_documents
+            df = Counter()
+            for tokens in all_tokens:
+                df.update(set(tokens))
+
+            # Compute BM25-like scores
+            N = len(all_tokens)
+            avgdl = np.mean([len(t) for t in all_tokens])
+            k1, b = 1.5, 0.75
+
+            scores = {}
+            topic_len = sum(len(t) for t in topic_tokens)
+
+            for term, freq in topic_tf.items():
+                if df[term] < self.min_df:
+                    continue
+
+                idf = np.log((N - df[term] + 0.5) / (df[term] + 0.5) + 1)
+                tf_normalized = freq * (k1 + 1) / (freq + k1 * (1 - b + b * topic_len / avgdl))
+                scores[term] = idf * tf_normalized
+
+            # Get top keywords
+            sorted_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+            keywords[topic_id] = [term for term, _ in sorted_terms[:self.n_keywords]]
 
-        return keywords
+        return keywords
 
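The BM25 method above replaces the old per-word bm25.get_scores() querying with an explicit Okapi-style formula using k1 = 1.5 and b = 0.75, and it skips terms whose overall document frequency falls below min_df. A worked example of the arithmetic with made-up counts, mirroring the expressions added above:

import numpy as np

# Hypothetical counts for one candidate term
N = 100          # total number of documents
df_term = 10     # documents containing the term
freq = 25        # occurrences of the term within the topic
topic_len = 400  # total tokens in the topic
avgdl = 80       # average document length in tokens
k1, b = 1.5, 0.75

idf = np.log((N - df_term + 0.5) / (df_term + 0.5) + 1)                    # ~2.26
tf_norm = freq * (k1 + 1) / (freq + k1 * (1 - b + b * topic_len / avgdl))  # ~2.02
score = idf * tf_norm                                                      # ~4.56

The KeyBERT path, with its new import fallback, follows: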
     def _extract_keybert(
         self,
-
-
-
+        documents: List[str],
+        labels: np.ndarray,
+        embeddings: np.ndarray = None,
+    ) -> Dict[int, List[str]]:
         """
         Extract keywords using KeyBERT (embedding-based).
-
-        KeyBERT finds keywords by comparing candidate embeddings
-        to the document embedding.
         """
-
+        try:
+            from keybert import KeyBERT
+        except ImportError:
+            # Fall back to c-TF-IDF
+            import warnings
+            warnings.warn("KeyBERT not installed, falling back to c-TF-IDF")
+            return self._extract_ctfidf(documents, labels, None)
 
-
-        topic_text = " ".join(topic_docs)
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
         # Initialize KeyBERT
         kw_model = KeyBERT()
 
-
-        keywords_with_scores = kw_model.extract_keywords(
-            topic_text,
-            keyphrase_ngram_range=self.ngram_range,
-            stop_words="english",
-            top_n=n_keywords,
-            use_mmr=True,  # Maximal Marginal Relevance for diversity
-            diversity=0.5,
-        )
-
-        keywords = [kw for kw, score in keywords_with_scores]
-        scores = [float(score) for kw, score in keywords_with_scores]
-
-        return keywords, scores
-
-    def extract_all_topics(
-        self,
-        documents: list[str],
-        labels: np.ndarray,
-        n_keywords: int | None = None,
-    ) -> dict[int, tuple[list[str], list[float]]]:
-        """
-        Extract keywords for all topics at once.
-
-        Parameters
-        ----------
-        documents : list[str]
-            All documents.
-        labels : np.ndarray
-            Topic assignments.
-        n_keywords : int, optional
-            Override default n_keywords.
-
-        Returns
-        -------
-        topic_keywords : dict
-            Mapping from topic_id to (keywords, scores).
-        """
-        result = {}
+        keywords = {}
 
-        for topic_id in
-            if topic_id == -1:
-                continue
-
+        for topic_id in unique_topics:
             mask = labels == topic_id
-
+            topic_text = ' '.join(documents[i] for i, m in enumerate(mask) if m)
 
-
-
+            # Extract keywords
+            kw_results = kw_model.extract_keywords(
+                topic_text,
+                keyphrase_ngram_range=self.ngram_range,
+                stop_words=list(self.stopwords) if self.stopwords else None,
+                top_n=self.n_keywords,
+            )
+
+            keywords[topic_id] = [kw for kw, _ in kw_results]
 
-        return
+        return keywords
 
 
-
-
-
+def compute_keyword_scores(
+    keywords: Dict[int, List[str]],
+    documents: List[str],
+    labels: np.ndarray,
+) -> Dict[int, List[tuple]]:
     """
+    Compute scores for keywords based on their discriminative power.
 
-
-
-
-
-
-    ):
-        self.method = method
-        self.n_keyphrases = n_keyphrases
-        self.max_ngram = max_ngram
-
-    def extract(self, text: str) -> list[tuple[str, float]]:
-        """Extract keyphrases from text."""
-        if self.method == "yake":
-            return self._extract_yake(text)
-        else:
-            raise ValueError(f"Unknown method: {self.method}")
+    Returns keywords with their scores.
+    """
+    # Get document frequencies per topic
+    topic_dfs = {}
+    unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-
-
-
-            import yake
-        except ImportError:
-            # Fallback to simple extraction
-            return self._simple_extract(text)
-
-        kw_extractor = yake.KeywordExtractor(
-            lan="en",
-            n=self.max_ngram,
-            dedupLim=0.7,
-            top=self.n_keyphrases,
-            features=None,
-        )
+    for topic_id in unique_topics:
+        mask = labels == topic_id
+        topic_docs = [documents[i].lower() for i, m in enumerate(mask) if m]
 
-
+        df = Counter()
+        for doc in topic_docs:
+            words = set(doc.split())
+            df.update(words)
 
-
-        # Invert for consistency
-        max_score = max(s for _, s in keywords) if keywords else 1
-        return [(kw, 1 - s/max_score) for kw, s in keywords]
+        topic_dfs[topic_id] = df
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Compute scores
+    scored_keywords = {}
+
+    for topic_id, topic_keywords in keywords.items():
+        scored = []
+        topic_df = topic_dfs[topic_id]
+        n_topic_docs = sum(labels == topic_id)
+
+        for keyword in topic_keywords:
+            # TF in topic
+            tf = topic_df.get(keyword.lower(), 0) / n_topic_docs
+
+            # DF across other topics (for IDF-like scoring)
+            other_df = sum(
+                topic_dfs[t].get(keyword.lower(), 0)
+                for t in unique_topics if t != topic_id
+            )
+            n_other_docs = sum(labels != topic_id)
+
+            if n_other_docs > 0:
+                other_ratio = other_df / n_other_docs
+                # Discriminative score
+                score = tf / (other_ratio + 0.1)
+            else:
+                score = tf
+
+            scored.append((keyword, round(score, 4)))
 
-
-
+        scored_keywords[topic_id] = scored
+
+    return scored_keywords