tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

@@ -0,0 +1,275 @@
+ """
+ Multilingual Tokenizers Module
+
+ Provides language-specific tokenization for various languages including CJK.
+ """
+
+ from typing import List, Callable, Optional
+ import re
+ import warnings
+
+
+ class TokenizerFactory:
+     """Factory for creating language-specific tokenizers."""
+
+     _tokenizers = {}
+
+     @classmethod
+     def get_tokenizer(cls, language: str, tokenizer_type: str = "auto") -> Callable[[str], List[str]]:
+         """
+         Get the appropriate tokenizer for a language.
+
+         Parameters
+         ----------
+         language : str
+             ISO 639-1 language code
+         tokenizer_type : str
+             Type of tokenizer: "auto", "whitespace", "spacy", "jieba", "fugashi", "konlpy", "pythainlp"
+
+         Returns
+         -------
+         Callable[[str], List[str]]
+             A tokenizer function that takes text and returns tokens
+         """
+         if tokenizer_type != "auto":
+             return cls._get_specific_tokenizer(tokenizer_type, language)
+
+         # Auto-select based on language
+         if language in ['zh', 'zh-cn', 'zh-tw']:
+             return cls._get_jieba_tokenizer()
+         elif language == 'ja':
+             return cls._get_japanese_tokenizer()
+         elif language == 'ko':
+             return cls._get_korean_tokenizer()
+         elif language == 'th':
+             return cls._get_thai_tokenizer()
+         else:
+             return cls._get_whitespace_tokenizer(language)
+
+     @classmethod
+     def _get_specific_tokenizer(cls, tokenizer_type: str, language: str) -> Callable[[str], List[str]]:
+         """Get a specific tokenizer by name."""
+         tokenizer_map = {
+             "whitespace": lambda: cls._get_whitespace_tokenizer(language),
+             "jieba": cls._get_jieba_tokenizer,
+             "fugashi": cls._get_japanese_tokenizer,
+             "mecab": cls._get_japanese_tokenizer,
+             "konlpy": cls._get_korean_tokenizer,
+             "pythainlp": cls._get_thai_tokenizer,
+             "spacy": lambda: cls._get_spacy_tokenizer(language),
+         }
+
+         if tokenizer_type in tokenizer_map:
+             return tokenizer_map[tokenizer_type]()
+         else:
+             warnings.warn(f"Unknown tokenizer '{tokenizer_type}', falling back to whitespace")
+             return cls._get_whitespace_tokenizer(language)
+
+     @classmethod
+     def _get_whitespace_tokenizer(cls, language: str) -> Callable[[str], List[str]]:
+         """Get a simple whitespace-based tokenizer with language-aware preprocessing."""
+
+         def tokenize(text: str) -> List[str]:
+             # Lowercase
+             text = text.lower()
+             # Remove punctuation but keep apostrophes for contractions
+             text = re.sub(r"[^\w\s'-]", " ", text)
+             # Split on whitespace
+             tokens = text.split()
+             # Remove tokens that are just punctuation or numbers
+             tokens = [t for t in tokens if re.search(r'[a-zA-ZäöüßàâçéèêëîïôûùüÿñæœÄÖÜ]', t)]
+             return tokens
+
+         return tokenize
+
+     @classmethod
+     def _get_jieba_tokenizer(cls) -> Callable[[str], List[str]]:
+         """Get Chinese tokenizer using jieba."""
+         try:
+             import jieba
+             jieba.setLogLevel(jieba.logging.INFO)  # Reduce verbosity
+
+             def tokenize(text: str) -> List[str]:
+                 # Use jieba's cut function for word segmentation
+                 tokens = list(jieba.cut(text, cut_all=False))
+                 # Filter out whitespace and punctuation
+                 tokens = [t.strip() for t in tokens if t.strip() and not re.match(r'^[\s\W]+$', t)]
+                 return tokens
+
+             return tokenize
+
+         except ImportError:
+             warnings.warn(
+                 "jieba not installed. Install with 'pip install jieba' for Chinese tokenization. "
+                 "Falling back to character-based tokenization."
+             )
+             return cls._get_character_tokenizer()
+
+     @classmethod
+     def _get_japanese_tokenizer(cls) -> Callable[[str], List[str]]:
+         """Get Japanese tokenizer using fugashi (MeCab)."""
+         try:
+             import fugashi
+             tagger = fugashi.Tagger()
+
+             def tokenize(text: str) -> List[str]:
+                 tokens = []
+                 for word in tagger(text):
+                     surface = word.surface
+                     # Filter out punctuation and whitespace
+                     if surface.strip() and not re.match(r'^[\s\W]+$', surface):
+                         tokens.append(surface)
+                 return tokens
+
+             return tokenize
+
+         except ImportError:
+             warnings.warn(
+                 "fugashi not installed. Install with 'pip install fugashi unidic-lite' for Japanese tokenization. "
+                 "Falling back to character-based tokenization."
+             )
+             return cls._get_character_tokenizer()
+
+     @classmethod
+     def _get_korean_tokenizer(cls) -> Callable[[str], List[str]]:
+         """Get Korean tokenizer using KoNLPy."""
+         try:
+             from konlpy.tag import Okt
+             okt = Okt()
+
+             def tokenize(text: str) -> List[str]:
+                 # Use morphological analysis
+                 tokens = okt.morphs(text)
+                 # Filter out punctuation
+                 tokens = [t for t in tokens if t.strip() and not re.match(r'^[\s\W]+$', t)]
+                 return tokens
+
+             return tokenize
+
+         except ImportError:
+             warnings.warn(
+                 "konlpy not installed. Install with 'pip install konlpy' for Korean tokenization. "
+                 "Note: KoNLPy may require Java. Falling back to character-based tokenization."
+             )
+             return cls._get_character_tokenizer()
+
+     @classmethod
+     def _get_thai_tokenizer(cls) -> Callable[[str], List[str]]:
+         """Get Thai tokenizer using pythainlp."""
+         try:
+             from pythainlp.tokenize import word_tokenize
+
+             def tokenize(text: str) -> List[str]:
+                 tokens = word_tokenize(text, engine='newmm')
+                 # Filter out whitespace and punctuation
+                 tokens = [t.strip() for t in tokens if t.strip() and not re.match(r'^[\s\W]+$', t)]
+                 return tokens
+
+             return tokenize
+
+         except ImportError:
+             warnings.warn(
+                 "pythainlp not installed. Install with 'pip install pythainlp' for Thai tokenization. "
+                 "Falling back to whitespace tokenization."
+             )
+             return cls._get_whitespace_tokenizer('th')
+
+     @classmethod
+     def _get_spacy_tokenizer(cls, language: str) -> Callable[[str], List[str]]:
+         """Get spaCy tokenizer for specified language."""
+         try:
+             import spacy
+
+             # Map language codes to spaCy model names
+             model_map = {
+                 'en': 'en_core_web_sm',
+                 'de': 'de_core_news_sm',
+                 'fr': 'fr_core_news_sm',
+                 'es': 'es_core_news_sm',
+                 'it': 'it_core_news_sm',
+                 'pt': 'pt_core_news_sm',
+                 'nl': 'nl_core_news_sm',
+                 'pl': 'pl_core_news_sm',
+                 'ru': 'ru_core_news_sm',
+                 'zh': 'zh_core_web_sm',
+                 'ja': 'ja_core_news_sm',
+             }
+
+             model_name = model_map.get(language, 'en_core_web_sm')
+
+             try:
+                 nlp = spacy.load(model_name)
+             except OSError:
+                 warnings.warn(f"spaCy model '{model_name}' not found. Falling back to whitespace tokenization.")
+                 return cls._get_whitespace_tokenizer(language)
+
+             def tokenize(text: str) -> List[str]:
+                 doc = nlp(text)
+                 tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_space]
+                 return tokens
+
+             return tokenize
+
+         except ImportError:
+             warnings.warn("spaCy not installed. Falling back to whitespace tokenization.")
+             return cls._get_whitespace_tokenizer(language)
+
+     @classmethod
+     def _get_character_tokenizer(cls) -> Callable[[str], List[str]]:
+         """Fallback character-based tokenizer for CJK without proper tokenizer."""
+
+         def tokenize(text: str) -> List[str]:
+             # For CJK, use n-gram characters
+             tokens = []
+             # Remove whitespace and punctuation
+             text = re.sub(r'[\s\W]+', '', text)
+             # Create character bigrams
+             for i in range(len(text) - 1):
+                 tokens.append(text[i:i+2])
+             return tokens
+
+         return tokenize
+
+
+ def tokenize_documents(
+     documents: List[str],
+     language: str,
+     tokenizer_type: str = "auto",
+     min_length: int = 2,
+     max_length: int = 50
+ ) -> List[List[str]]:
+     """
+     Tokenize a list of documents.
+
+     Parameters
+     ----------
+     documents : List[str]
+         List of documents to tokenize
+     language : str
+         ISO 639-1 language code
+     tokenizer_type : str
+         Type of tokenizer to use
+     min_length : int
+         Minimum token length to keep
+     max_length : int
+         Maximum token length to keep
+
+     Returns
+     -------
+     List[List[str]]
+         List of tokenized documents
+     """
+     tokenizer = TokenizerFactory.get_tokenizer(language, tokenizer_type)
+
+     tokenized = []
+     for doc in documents:
+         if not isinstance(doc, str):
+             tokenized.append([])
+             continue
+
+         tokens = tokenizer(doc)
+         # Filter by length
+         tokens = [t for t in tokens if min_length <= len(t) <= max_length]
+         tokenized.append(tokens)
+
+     return tokenized
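
For context, a minimal usage sketch of the tokenizers module added above. It assumes the file is importable as tritopic.tokenizers (the module path is not shown in this diff); the optional CJK back ends may be absent, in which case the code falls back as documented above.

    from tritopic.tokenizers import TokenizerFactory, tokenize_documents  # assumed module path

    # Auto-selection: jieba for Chinese, fugashi for Japanese, KoNLPy for Korean,
    # pythainlp for Thai, whitespace splitting otherwise. Missing back ends fall
    # back to character bigrams (CJK) or whitespace tokenization.
    docs = [
        "Topic models group similar documents together.",
        "主题模型将相似的文档分组。",
    ]
    print(tokenize_documents(docs[:1], language="en"))
    print(tokenize_documents(docs[1:], language="zh"))

    # A specific tokenizer can also be requested by name.
    tokenize = TokenizerFactory.get_tokenizer("de", tokenizer_type="whitespace")
    print(tokenize("Die Katze sitzt auf der Matte."))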
@@ -0,0 +1,371 @@
+ """
+ Visualization functions for TriTopic.
+
+ Provides interactive visualizations using Plotly.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Optional, TYPE_CHECKING
+
+ import numpy as np
+
+ if TYPE_CHECKING:
+     from .model import Topic
+
+
+ def create_topic_visualization(
+     embeddings: np.ndarray,
+     labels: np.ndarray,
+     topics: List["Topic"],
+     documents: List[str],
+     method: str = "umap",
+     width: int = 900,
+     height: int = 700,
+     point_size: int = 5,
+     **kwargs
+ ) -> Any:
+     """
+     Create a 2D visualization of documents colored by topic.
+
+     Parameters
+     ----------
+     embeddings : np.ndarray
+         Document embeddings.
+     labels : np.ndarray
+         Topic assignments.
+     topics : List[Topic]
+         Topic objects with labels.
+     documents : List[str]
+         Original documents for hover text.
+     method : str
+         Reduction method: "umap", "tsne", or "pca".
+     width, height : int
+         Figure dimensions.
+     point_size : int
+         Size of scatter points.
+
+     Returns
+     -------
+     plotly.graph_objects.Figure
+         Interactive scatter plot.
+     """
+     try:
+         import plotly.express as px
+         import plotly.graph_objects as go
+     except ImportError:
+         raise ImportError("Plotly required. Install with: pip install plotly")
+
+     # Reduce dimensions
+     coords = _reduce_dimensions(embeddings, method, **kwargs)
+
+     # Create topic labels for legend
+     topic_map = {t.topic_id: t.label or f"Topic {t.topic_id}" for t in topics}
+     topic_names = [topic_map.get(l, "Outlier") for l in labels]
+
+     # Create hover text
+     hover_texts = [doc[:200] + "..." if len(doc) > 200 else doc for doc in documents]
+
+     # Create figure
+     fig = go.Figure()
+
+     # Add scatter for each topic
+     unique_labels = sorted(set(labels))
+     colors = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel
+
+     for i, topic_id in enumerate(unique_labels):
+         mask = labels == topic_id
+         topic_name = topic_map.get(topic_id, "Outlier")
+         color = colors[i % len(colors)] if topic_id >= 0 else "lightgray"
+
+         fig.add_trace(go.Scatter(
+             x=coords[mask, 0],
+             y=coords[mask, 1],
+             mode="markers",
+             name=topic_name,
+             marker=dict(
+                 size=point_size,
+                 color=color,
+                 opacity=0.7 if topic_id >= 0 else 0.3
+             ),
+             text=[hover_texts[j] for j in np.where(mask)[0]],
+             hovertemplate="<b>%{text}</b><extra></extra>"
+         ))
+
+     fig.update_layout(
+         title="TriTopic Document Map",
+         xaxis_title=f"{method.upper()} 1",
+         yaxis_title=f"{method.upper()} 2",
+         width=width,
+         height=height,
+         template="plotly_white",
+         legend=dict(
+             yanchor="top",
+             y=0.99,
+             xanchor="left",
+             x=1.02
+         )
+     )
+
+     return fig
+
+
+ def create_topic_barchart(
+     topics: List["Topic"],
+     n_keywords: int = 10,
+     width: int = 800,
+     height: Optional[int] = None
+ ) -> Any:
+     """
+     Create horizontal bar charts showing top keywords per topic.
+
+     Parameters
+     ----------
+     topics : List[Topic]
+         Topic objects with keywords.
+     n_keywords : int
+         Number of keywords to show.
+     width : int
+         Figure width.
+     height : int, optional
+         Figure height. Auto-calculated if None.
+
+     Returns
+     -------
+     plotly.graph_objects.Figure
+         Bar chart figure.
+     """
+     try:
+         import plotly.graph_objects as go
+         from plotly.subplots import make_subplots
+     except ImportError:
+         raise ImportError("Plotly required.")
+
+     # Filter out outliers
+     valid_topics = [t for t in topics if t.topic_id >= 0]
+     n_topics = len(valid_topics)
+
+     if n_topics == 0:
+         raise ValueError("No valid topics to visualize")
+
+     # Calculate layout
+     n_cols = min(3, n_topics)
+     n_rows = (n_topics + n_cols - 1) // n_cols
+
+     if height is None:
+         height = n_rows * 250
+
+     # Create subplots
+     subplot_titles = [
+         t.label or f"Topic {t.topic_id}" for t in valid_topics
+     ]
+
+     fig = make_subplots(
+         rows=n_rows,
+         cols=n_cols,
+         subplot_titles=subplot_titles,
+         horizontal_spacing=0.1,
+         vertical_spacing=0.15
+     )
+
+     # Add bar for each topic
+     for i, topic in enumerate(valid_topics):
+         row = i // n_cols + 1
+         col = i % n_cols + 1
+
+         keywords = topic.keywords[:n_keywords][::-1]  # Reverse for horizontal bars
+         scores = topic.keyword_scores[:n_keywords][::-1]
+
+         fig.add_trace(
+             go.Bar(
+                 x=scores,
+                 y=keywords,
+                 orientation='h',
+                 marker_color='steelblue',
+                 showlegend=False
+             ),
+             row=row,
+             col=col
+         )
+
+     fig.update_layout(
+         title="Topic Keywords",
+         width=width,
+         height=height,
+         template="plotly_white"
+     )
+
+     return fig
+
+
+ def create_topic_hierarchy(
+     embeddings: np.ndarray,
+     labels: np.ndarray,
+     topics: List["Topic"],
+     method: str = "ward",
+     **kwargs
+ ) -> Any:
+     """
+     Create a hierarchical clustering dendrogram of topics.
+
+     Parameters
+     ----------
+     embeddings : np.ndarray
+         Document embeddings.
+     labels : np.ndarray
+         Topic assignments.
+     topics : List[Topic]
+         Topic objects.
+     method : str
+         Linkage method for hierarchical clustering.
+
+     Returns
+     -------
+     plotly.graph_objects.Figure
+         Dendrogram figure.
+     """
+     try:
+         import plotly.figure_factory as ff
+     except ImportError:
+         raise ImportError("Plotly required.")
+
+     from scipy.cluster.hierarchy import linkage
+     from scipy.spatial.distance import pdist
+
+     # Compute topic centroids
+     valid_topics = [t for t in topics if t.topic_id >= 0 and t.centroid is not None]
+
+     if len(valid_topics) < 2:
+         raise ValueError("Need at least 2 topics for hierarchy")
+
+     centroids = np.array([t.centroid for t in valid_topics])
+     topic_labels = [t.label or f"Topic {t.topic_id}" for t in valid_topics]
+
+     # Compute linkage
+     distances = pdist(centroids, metric='cosine')
+     Z = linkage(distances, method=method)
+
+     # Create dendrogram
+     fig = ff.create_dendrogram(
+         centroids,
+         orientation='left',
+         labels=topic_labels,
+         linkagefun=lambda x: Z
+     )
+
+     fig.update_layout(
+         title="Topic Hierarchy",
+         width=800,
+         height=max(400, len(valid_topics) * 30),
+         template="plotly_white"
+     )
+
+     return fig
+
+
+ def create_topic_heatmap(
+     topics: List["Topic"],
+     documents: List[str],
+     labels: np.ndarray,
+     n_top_keywords: int = 20
+ ) -> Any:
+     """
+     Create a heatmap showing keyword importance across topics.
+
+     Parameters
+     ----------
+     topics : List[Topic]
+         Topic objects.
+     documents : List[str]
+         Original documents.
+     labels : np.ndarray
+         Topic assignments.
+     n_top_keywords : int
+         Number of keywords to include.
+
+     Returns
+     -------
+     plotly.graph_objects.Figure
+         Heatmap figure.
+     """
+     try:
+         import plotly.graph_objects as go
+     except ImportError:
+         raise ImportError("Plotly required.")
+
+     # Collect all unique keywords
+     valid_topics = [t for t in topics if t.topic_id >= 0]
+     all_keywords = set()
+     for topic in valid_topics:
+         all_keywords.update(topic.keywords[:n_top_keywords])
+
+     all_keywords = sorted(all_keywords)
+
+     # Build matrix
+     matrix = np.zeros((len(valid_topics), len(all_keywords)))
+
+     for i, topic in enumerate(valid_topics):
+         for j, kw in enumerate(all_keywords):
+             if kw in topic.keywords:
+                 idx = topic.keywords.index(kw)
+                 if idx < len(topic.keyword_scores):
+                     matrix[i, j] = topic.keyword_scores[idx]
+
+     # Create heatmap
+     topic_names = [t.label or f"Topic {t.topic_id}" for t in valid_topics]
+
+     fig = go.Figure(data=go.Heatmap(
+         z=matrix,
+         x=all_keywords,
+         y=topic_names,
+         colorscale='Blues'
+     ))
+
+     fig.update_layout(
+         title="Keyword Importance by Topic",
+         xaxis_title="Keywords",
+         yaxis_title="Topics",
+         width=max(800, len(all_keywords) * 20),
+         height=max(400, len(valid_topics) * 30),
+         template="plotly_white"
+     )
+
+     return fig
+
+
+ def _reduce_dimensions(
+     embeddings: np.ndarray,
+     method: str = "umap",
+     n_components: int = 2,
+     **kwargs
+ ) -> np.ndarray:
+     """Reduce embedding dimensions for visualization."""
+
+     if method.lower() == "umap":
+         try:
+             from umap import UMAP
+             reducer = UMAP(
+                 n_components=n_components,
+                 n_neighbors=kwargs.get("n_neighbors", 15),
+                 min_dist=kwargs.get("min_dist", 0.1),
+                 metric=kwargs.get("metric", "cosine"),
+                 random_state=kwargs.get("random_state", 42)
+             )
+             return reducer.fit_transform(embeddings)
+         except ImportError:
+             print("UMAP not available, falling back to PCA")
+             method = "pca"
+
+     if method.lower() == "tsne":
+         from sklearn.manifold import TSNE
+         reducer = TSNE(
+             n_components=n_components,
+             perplexity=kwargs.get("perplexity", 30),
+             random_state=kwargs.get("random_state", 42)
+         )
+         return reducer.fit_transform(embeddings)
+
+     # Default: PCA
+     from sklearn.decomposition import PCA
+     reducer = PCA(n_components=n_components)
+     return reducer.fit_transform(embeddings)
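
Similarly, a hedged sketch of how the visualization helpers above might be called. The SimpleNamespace objects are hypothetical stand-ins providing only the Topic attributes these functions read (topic_id, label, keywords, keyword_scores); real Topic objects come from a fitted TriTopic model, and create_topic_hierarchy would additionally need centroid vectors. The module path tritopic.visualization is likewise an assumption.

    import numpy as np
    from types import SimpleNamespace
    from tritopic.visualization import create_topic_visualization, create_topic_barchart  # assumed path

    # Hypothetical stand-ins for Topic objects from a fitted model.
    topics = [
        SimpleNamespace(topic_id=0, label="sports", keywords=["game", "team", "score"],
                        keyword_scores=[0.9, 0.8, 0.7]),
        SimpleNamespace(topic_id=1, label="finance", keywords=["market", "stock", "bank"],
                        keyword_scores=[0.9, 0.7, 0.6]),
    ]
    embeddings = np.random.rand(6, 384)     # toy document embeddings
    labels = np.array([0, 0, 1, 1, 0, -1])  # -1 marks outlier documents
    documents = [f"document {i}" for i in range(6)]

    # "pca" avoids the optional UMAP dependency; plotly is still required.
    fig = create_topic_visualization(embeddings, labels, topics, documents, method="pca")
    fig.show()

    create_topic_barchart(topics, n_keywords=3).show()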