tritopic 0.1.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,222 +1,216 @@
 """
-Embedding Engine for TriTopic
-==============================
+Embedding Engine Module
 
-Handles document embedding with support for multiple models:
-- Sentence-BERT models (default)
-- Instructor models (task-specific)
-- BGE models (multilingual)
+Handles document embedding generation with multilingual model selection.
 """
 
-from __future__ import annotations
-
-from typing import Any, Literal
-
+from typing import List, Optional, Union
 import numpy as np
-from tqdm import tqdm
+import warnings
 
 
 class EmbeddingEngine:
     """
-    Generate document embeddings using transformer models.
-
-    Supports various embedding models optimized for different use cases.
+    Generates document embeddings using Sentence Transformers.
 
-    Parameters
-    ----------
-    model_name : str
-        Name of the sentence-transformers model. Popular choices:
-        - "all-MiniLM-L6-v2": Fast, good quality (default)
-        - "all-mpnet-base-v2": Higher quality, slower
-        - "BAAI/bge-base-en-v1.5": State-of-the-art
-        - "BAAI/bge-m3": Multilingual
-        - "hkunlp/instructor-large": Task-specific (use with instruction)
-    batch_size : int
-        Batch size for encoding. Default: 32
-    device : str or None
-        Device to use ("cuda", "cpu", or None for auto).
-    show_progress : bool
-        Show progress bar. Default: True
+    Supports automatic model selection based on language configuration.
     """
 
+    # Model recommendations by language
+    LANGUAGE_MODELS = {
+        'en': 'all-MiniLM-L6-v2',
+        'zh': 'BAAI/bge-base-zh-v1.5',
+        'multilingual': 'paraphrase-multilingual-mpnet-base-v2',
+        'multilingual_small': 'paraphrase-multilingual-MiniLM-L12-v2',
+        'multilingual_best': 'BAAI/bge-m3',
+    }
+
     def __init__(
         self,
-        model_name: str = "all-MiniLM-L6-v2",
+        model_name: str = "auto",
+        language: str = "en",
+        multilingual: bool = False,
         batch_size: int = 32,
-        device: str | None = None,
+        device: Optional[str] = None,
         show_progress: bool = True,
     ):
+        """
+        Initialize the embedding engine.
+
+        Parameters
+        ----------
+        model_name : str
+            Model name or "auto" for automatic selection
+        language : str
+            Language code for model selection
+        multilingual : bool
+            Force multilingual model
+        batch_size : int
+            Batch size for encoding
+        device : str, optional
+            Device to use ('cuda', 'cpu', or None for auto)
+        show_progress : bool
+            Show progress bar during encoding
+        """
         self.model_name = model_name
+        self.language = language
+        self.multilingual = multilingual
         self.batch_size = batch_size
         self.device = device
         self.show_progress = show_progress
 
         self._model = None
-        self._is_instructor = "instructor" in model_name.lower()
+        self._resolved_model_name = None
+
+    def _resolve_model_name(self) -> str:
+        """Resolve the model name based on configuration."""
+        if self.model_name != "auto":
+            return self.model_name
+
+        if self.multilingual:
+            return self.LANGUAGE_MODELS['multilingual']
+
+        lang = self.language.lower()
+
+        if lang == 'en':
+            return self.LANGUAGE_MODELS['en']
+        elif lang == 'zh':
+            return self.LANGUAGE_MODELS['zh']
+        elif lang in ['ja', 'ko', 'th', 'vi', 'ar', 'he', 'hi']:
+            # Asian and Middle Eastern languages need multilingual
+            return self.LANGUAGE_MODELS['multilingual_small']
+        elif lang in ['de', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'sv', 'da', 'no', 'fi']:
+            # European languages
+            return self.LANGUAGE_MODELS['multilingual_small']
+        else:
+            # Default to multilingual for unknown languages
+            return self.LANGUAGE_MODELS['multilingual_small']
 
     def _load_model(self):
-        """Lazy load the embedding model."""
+        """Load the sentence transformer model."""
         if self._model is not None:
             return
 
-        from sentence_transformers import SentenceTransformer
-
-        self._model = SentenceTransformer(
-            self.model_name,
-            device=self.device,
-        )
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for embedding generation. "
+                "Install with: pip install sentence-transformers"
+            )
+
+        self._resolved_model_name = self._resolve_model_name()
+
+        try:
+            self._model = SentenceTransformer(
+                self._resolved_model_name,
+                device=self.device
+            )
+        except Exception as e:
+            # Fall back to a known working model
+            warnings.warn(
+                f"Could not load model '{self._resolved_model_name}': {e}. "
+                f"Falling back to 'all-MiniLM-L6-v2'"
+            )
+            self._resolved_model_name = 'all-MiniLM-L6-v2'
+            self._model = SentenceTransformer(
+                self._resolved_model_name,
+                device=self.device
+            )
 
     def encode(
         self,
-        documents: list[str],
-        instruction: str | None = None,
+        documents: List[str],
         normalize: bool = True,
     ) -> np.ndarray:
         """
-        Encode documents to embeddings.
+        Encode documents into embeddings.
 
         Parameters
         ----------
-        documents : list[str]
-            List of document texts.
-        instruction : str, optional
-            Instruction for Instructor models (e.g., "Represent the topic of this document:").
+        documents : List[str]
+            List of documents to encode
         normalize : bool
-            Whether to L2-normalize embeddings. Default: True
+            Whether to L2-normalize embeddings
 
         Returns
         -------
-        embeddings : np.ndarray
-            Document embeddings of shape (n_docs, embedding_dim).
+        np.ndarray
+            Document embeddings of shape (n_documents, embedding_dim)
         """
         self._load_model()
 
-        # Handle instructor models
-        if self._is_instructor and instruction:
-            documents = [[instruction, doc] for doc in documents]
-
-        # Encode in batches
         embeddings = self._model.encode(
             documents,
             batch_size=self.batch_size,
             show_progress_bar=self.show_progress,
-            normalize_embeddings=normalize,
             convert_to_numpy=True,
+            normalize_embeddings=normalize,
         )
 
         return embeddings
 
-    def encode_with_pooling(
-        self,
-        documents: list[str],
-        pooling: Literal["mean", "max", "cls"] = "mean",
-    ) -> np.ndarray:
-        """
-        Encode with custom pooling strategy.
-
-        Parameters
-        ----------
-        documents : list[str]
-            Document texts.
-        pooling : str
-            Pooling strategy: "mean", "max", or "cls".
-
-        Returns
-        -------
-        embeddings : np.ndarray
-            Pooled embeddings.
-        """
-        # For now, use default pooling from model
-        # Custom pooling would require access to token-level embeddings
-        return self.encode(documents)
-
     @property
     def embedding_dim(self) -> int:
-        """Get embedding dimension."""
+        """Get the embedding dimension."""
         self._load_model()
         return self._model.get_sentence_embedding_dimension()
 
-    def similarity(
-        self,
-        embeddings1: np.ndarray,
-        embeddings2: np.ndarray | None = None,
-    ) -> np.ndarray:
-        """
-        Compute cosine similarity between embeddings.
-
-        Parameters
-        ----------
-        embeddings1 : np.ndarray
-            First set of embeddings.
-        embeddings2 : np.ndarray, optional
-            Second set. If None, compute pairwise similarity of embeddings1.
-
-        Returns
-        -------
-        similarity : np.ndarray
-            Similarity matrix.
-        """
-        from sklearn.metrics.pairwise import cosine_similarity
-
-        if embeddings2 is None:
-            return cosine_similarity(embeddings1)
-        return cosine_similarity(embeddings1, embeddings2)
+    @property
+    def model_info(self) -> dict:
+        """Get information about the loaded model."""
+        self._load_model()
+        return {
+            'model_name': self._resolved_model_name,
+            'embedding_dim': self.embedding_dim,
+            'language': self.language,
+            'multilingual': self.multilingual,
+        }
 
 
-class MultiModelEmbedding:
+def compute_similarity_matrix(
+    embeddings: np.ndarray,
+    metric: str = "cosine"
+) -> np.ndarray:
     """
-    Combine embeddings from multiple models.
+    Compute pairwise similarity matrix from embeddings.
 
-    Useful for ensemble approaches where different models capture
-    different aspects of document semantics.
+    Parameters
+    ----------
+    embeddings : np.ndarray
+        Document embeddings of shape (n_documents, embedding_dim)
+    metric : str
+        Similarity metric: "cosine", "euclidean", "dot"
+
+    Returns
+    -------
+    np.ndarray
+        Similarity matrix of shape (n_documents, n_documents)
     """
+    if metric == "cosine":
+        # For normalized embeddings, cosine similarity = dot product
+        # Ensure normalization
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+        norms[norms == 0] = 1  # Avoid division by zero
+        normalized = embeddings / norms
+        similarity = np.dot(normalized, normalized.T)
+
+    elif metric == "dot":
+        similarity = np.dot(embeddings, embeddings.T)
+
+    elif metric == "euclidean":
+        # Convert Euclidean distance to similarity
+        from scipy.spatial.distance import cdist
+        distances = cdist(embeddings, embeddings, metric='euclidean')
+        similarity = 1 / (1 + distances)
+
+    else:
+        raise ValueError(f"Unknown metric: {metric}")
 
-    def __init__(
-        self,
-        model_names: list[str],
-        weights: list[float] | None = None,
-        batch_size: int = 32,
-    ):
-        self.model_names = model_names
-        self.weights = weights or [1.0 / len(model_names)] * len(model_names)
-        self.batch_size = batch_size
-
-        self._engines = [
-            EmbeddingEngine(name, batch_size=batch_size)
-            for name in model_names
-        ]
+    # Ensure diagonal is 1 (self-similarity)
+    np.fill_diagonal(similarity, 1.0)
 
-    def encode(
-        self,
-        documents: list[str],
-        normalize: bool = True,
-    ) -> np.ndarray:
-        """
-        Encode using all models and combine.
-
-        Parameters
-        ----------
-        documents : list[str]
-            Document texts.
-        normalize : bool
-            Normalize final embeddings.
-
-        Returns
-        -------
-        embeddings : np.ndarray
-            Combined embeddings (concatenated).
-        """
-        all_embeddings = []
-
-        for engine, weight in zip(self._engines, self.weights):
-            emb = engine.encode(documents, normalize=True)
-            all_embeddings.append(emb * weight)
-
-        # Concatenate
-        combined = np.hstack(all_embeddings)
-
-        if normalize:
-            norms = np.linalg.norm(combined, axis=1, keepdims=True)
-            combined = combined / (norms + 1e-10)
-
-        return combined
+    # Clip to [0, 1] range
+    similarity = np.clip(similarity, 0, 1)
+
+    return similarity
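
For orientation, a minimal usage sketch of the 1.1.0 API shown above. The engine variable, the sample documents, and the printed result are illustrative; the module's import path is not shown in this diff, so the sketch assumes EmbeddingEngine is already in scope.

    # Hypothetical usage of EmbeddingEngine as defined in the new version.
    engine = EmbeddingEngine(model_name="auto", language="de")

    docs = ["A short test document.", "Another document."]
    embeddings = engine.encode(docs)  # ndarray of shape (2, embedding_dim)

    # Per _resolve_model_name(), language="de" resolves to the
    # 'multilingual_small' entry: 'paraphrase-multilingual-MiniLM-L12-v2'.
    print(engine.model_info)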
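
And a small numeric check of the new compute_similarity_matrix helper (the input vectors are illustrative; the output follows from the cosine branch above):

    import numpy as np

    vecs = np.array([
        [1.0, 0.0],
        [0.0, 1.0],
        [1.0, 1.0],
    ])
    sim = compute_similarity_matrix(vecs, metric="cosine")

    # Rows are L2-normalized first, so the orthogonal pair scores 0.0;
    # np.fill_diagonal forces self-similarity to 1.0, and np.clip keeps
    # every value in [0, 1].
    print(np.round(sim, 3))
    # approximately:
    # [[1.     0.     0.707]
    #  [0.     1.     0.707]
    #  [0.707  0.707  1.   ]]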