tritopic 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of tritopic might be problematic.
- tritopic/__init__.py +22 -32
- tritopic/config.py +289 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/METADATA +91 -51
- tritopic-1.1.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/top_level.txt +0 -0
tritopic/core/embeddings.py
CHANGED
@@ -1,222 +1,216 @@
 """
-Embedding Engine
-==============================
+Embedding Engine Module
 
-Handles document embedding with
-- Sentence-BERT models (default)
-- Instructor models (task-specific)
-- BGE models (multilingual)
+Handles document embedding generation with multilingual model selection.
 """
 
-from
-
-from typing import Any, Literal
-
+from typing import List, Optional, Union
 import numpy as np
-
+import warnings
 
 
 class EmbeddingEngine:
     """
-
-
-    Supports various embedding models optimized for different use cases.
+    Generates document embeddings using Sentence Transformers.
 
-
-    ----------
-    model_name : str
-        Name of the sentence-transformers model. Popular choices:
-        - "all-MiniLM-L6-v2": Fast, good quality (default)
-        - "all-mpnet-base-v2": Higher quality, slower
-        - "BAAI/bge-base-en-v1.5": State-of-the-art
-        - "BAAI/bge-m3": Multilingual
-        - "hkunlp/instructor-large": Task-specific (use with instruction)
-    batch_size : int
-        Batch size for encoding. Default: 32
-    device : str or None
-        Device to use ("cuda", "cpu", or None for auto).
-    show_progress : bool
-        Show progress bar. Default: True
+    Supports automatic model selection based on language configuration.
     """
 
+    # Model recommendations by language
+    LANGUAGE_MODELS = {
+        'en': 'all-MiniLM-L6-v2',
+        'zh': 'BAAI/bge-base-zh-v1.5',
+        'multilingual': 'paraphrase-multilingual-mpnet-base-v2',
+        'multilingual_small': 'paraphrase-multilingual-MiniLM-L12-v2',
+        'multilingual_best': 'BAAI/bge-m3',
+    }
+
     def __init__(
         self,
-        model_name: str = "
+        model_name: str = "auto",
+        language: str = "en",
+        multilingual: bool = False,
         batch_size: int = 32,
-        device: str
+        device: Optional[str] = None,
         show_progress: bool = True,
     ):
+        """
+        Initialize the embedding engine.
+
+        Parameters
+        ----------
+        model_name : str
+            Model name or "auto" for automatic selection
+        language : str
+            Language code for model selection
+        multilingual : bool
+            Force multilingual model
+        batch_size : int
+            Batch size for encoding
+        device : str, optional
+            Device to use ('cuda', 'cpu', or None for auto)
+        show_progress : bool
+            Show progress bar during encoding
+        """
         self.model_name = model_name
+        self.language = language
+        self.multilingual = multilingual
         self.batch_size = batch_size
         self.device = device
         self.show_progress = show_progress
 
         self._model = None
-        self.
+        self._resolved_model_name = None
+
+    def _resolve_model_name(self) -> str:
+        """Resolve the model name based on configuration."""
+        if self.model_name != "auto":
+            return self.model_name
+
+        if self.multilingual:
+            return self.LANGUAGE_MODELS['multilingual']
+
+        lang = self.language.lower()
+
+        if lang == 'en':
+            return self.LANGUAGE_MODELS['en']
+        elif lang == 'zh':
+            return self.LANGUAGE_MODELS['zh']
+        elif lang in ['ja', 'ko', 'th', 'vi', 'ar', 'he', 'hi']:
+            # Asian and Middle Eastern languages need multilingual
+            return self.LANGUAGE_MODELS['multilingual_small']
+        elif lang in ['de', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'sv', 'da', 'no', 'fi']:
+            # European languages
+            return self.LANGUAGE_MODELS['multilingual_small']
+        else:
+            # Default to multilingual for unknown languages
+            return self.LANGUAGE_MODELS['multilingual_small']
 
     def _load_model(self):
-        """
+        """Load the sentence transformer model."""
         if self._model is not None:
             return
 
-
-
-
-
-
-
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for embedding generation. "
+                "Install with: pip install sentence-transformers"
+            )
+
+        self._resolved_model_name = self._resolve_model_name()
+
+        try:
+            self._model = SentenceTransformer(
+                self._resolved_model_name,
+                device=self.device
+            )
+        except Exception as e:
+            # Fall back to a known working model
+            warnings.warn(
+                f"Could not load model '{self._resolved_model_name}': {e}. "
+                f"Falling back to 'all-MiniLM-L6-v2'"
+            )
+            self._resolved_model_name = 'all-MiniLM-L6-v2'
+            self._model = SentenceTransformer(
+                self._resolved_model_name,
+                device=self.device
+            )
 
     def encode(
         self,
-        documents:
-        instruction: str | None = None,
+        documents: List[str],
         normalize: bool = True,
     ) -> np.ndarray:
         """
-        Encode documents
+        Encode documents into embeddings.
 
         Parameters
         ----------
-        documents :
-            List of
-        instruction : str, optional
-            Instruction for Instructor models (e.g., "Represent the topic of this document:").
+        documents : List[str]
+            List of documents to encode
         normalize : bool
-            Whether to L2-normalize embeddings
+            Whether to L2-normalize embeddings
 
         Returns
        -------
-
-            Document embeddings of shape (
+        np.ndarray
+            Document embeddings of shape (n_documents, embedding_dim)
         """
         self._load_model()
 
-        # Handle instructor models
-        if self._is_instructor and instruction:
-            documents = [[instruction, doc] for doc in documents]
-
-        # Encode in batches
         embeddings = self._model.encode(
             documents,
             batch_size=self.batch_size,
             show_progress_bar=self.show_progress,
-            normalize_embeddings=normalize,
             convert_to_numpy=True,
+            normalize_embeddings=normalize,
         )
 
         return embeddings
 
-    def encode_with_pooling(
-        self,
-        documents: list[str],
-        pooling: Literal["mean", "max", "cls"] = "mean",
-    ) -> np.ndarray:
-        """
-        Encode with custom pooling strategy.
-
-        Parameters
-        ----------
-        documents : list[str]
-            Document texts.
-        pooling : str
-            Pooling strategy: "mean", "max", or "cls".
-
-        Returns
-        -------
-        embeddings : np.ndarray
-            Pooled embeddings.
-        """
-        # For now, use default pooling from model
-        # Custom pooling would require access to token-level embeddings
-        return self.encode(documents)
-
     @property
     def embedding_dim(self) -> int:
-        """Get embedding dimension."""
+        """Get the embedding dimension."""
         self._load_model()
         return self._model.get_sentence_embedding_dimension()
 
-
-
-
-
-
-
-
-
-
-
-        embeddings1 : np.ndarray
-            First set of embeddings.
-        embeddings2 : np.ndarray, optional
-            Second set. If None, compute pairwise similarity of embeddings1.
-
-        Returns
-        -------
-        similarity : np.ndarray
-            Similarity matrix.
-        """
-        from sklearn.metrics.pairwise import cosine_similarity
-
-        if embeddings2 is None:
-            return cosine_similarity(embeddings1)
-        return cosine_similarity(embeddings1, embeddings2)
+    @property
+    def model_info(self) -> dict:
+        """Get information about the loaded model."""
+        self._load_model()
+        return {
+            'model_name': self._resolved_model_name,
+            'embedding_dim': self.embedding_dim,
+            'language': self.language,
+            'multilingual': self.multilingual,
+        }
 
 
-
+def compute_similarity_matrix(
+    embeddings: np.ndarray,
+    metric: str = "cosine"
+) -> np.ndarray:
     """
-
+    Compute pairwise similarity matrix from embeddings.
 
-
-
+    Parameters
+    ----------
+    embeddings : np.ndarray
+        Document embeddings of shape (n_documents, embedding_dim)
+    metric : str
+        Similarity metric: "cosine", "euclidean", "dot"
+
+    Returns
+    -------
+    np.ndarray
+        Similarity matrix of shape (n_documents, n_documents)
     """
+    if metric == "cosine":
+        # For normalized embeddings, cosine similarity = dot product
+        # Ensure normalization
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+        norms[norms == 0] = 1  # Avoid division by zero
+        normalized = embeddings / norms
+        similarity = np.dot(normalized, normalized.T)
+
+    elif metric == "dot":
+        similarity = np.dot(embeddings, embeddings.T)
+
+    elif metric == "euclidean":
+        # Convert Euclidean distance to similarity
+        from scipy.spatial.distance import cdist
+        distances = cdist(embeddings, embeddings, metric='euclidean')
+        similarity = 1 / (1 + distances)
+
+    else:
+        raise ValueError(f"Unknown metric: {metric}")
 
-
-
-        model_names: list[str],
-        weights: list[float] | None = None,
-        batch_size: int = 32,
-    ):
-        self.model_names = model_names
-        self.weights = weights or [1.0 / len(model_names)] * len(model_names)
-        self.batch_size = batch_size
-
-        self._engines = [
-            EmbeddingEngine(name, batch_size=batch_size)
-            for name in model_names
-        ]
+    # Ensure diagonal is 1 (self-similarity)
+    np.fill_diagonal(similarity, 1.0)
 
-
-
-
-
-    ) -> np.ndarray:
-        """
-        Encode using all models and combine.
-
-        Parameters
-        ----------
-        documents : list[str]
-            Document texts.
-        normalize : bool
-            Normalize final embeddings.
-
-        Returns
-        -------
-        embeddings : np.ndarray
-            Combined embeddings (concatenated).
-        """
-        all_embeddings = []
-
-        for engine, weight in zip(self._engines, self.weights):
-            emb = engine.encode(documents, normalize=True)
-            all_embeddings.append(emb * weight)
-
-        # Concatenate
-        combined = np.hstack(all_embeddings)
-
-        if normalize:
-            norms = np.linalg.norm(combined, axis=1, keepdims=True)
-            combined = combined / (norms + 1e-10)
-
-        return combined
+    # Clip to [0, 1] range
+    similarity = np.clip(similarity, 0, 1)
+
+    return similarity
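For orientation, below is a minimal usage sketch of the 1.1.0 embeddings API exactly as it appears in the diff above (EmbeddingEngine with "auto" model resolution, encode(), the model_info property, and the module-level compute_similarity_matrix helper). It assumes the 1.1.0 wheel and sentence-transformers are installed; the sample documents and variable names are illustrative only and not part of the package.

from tritopic.core.embeddings import EmbeddingEngine, compute_similarity_matrix

# Hypothetical sample documents (not from the package)
docs = [
    "Transformers have reshaped natural language processing.",
    "Die Transformer-Architektur hat NLP stark verändert.",
    "Stock markets rallied after the rate announcement.",
]

# multilingual=True makes _resolve_model_name() pick the 'multilingual' entry of
# LANGUAGE_MODELS; with model_name="auto" and language="en" it would pick
# 'all-MiniLM-L6-v2' instead.
engine = EmbeddingEngine(model_name="auto", multilingual=True, show_progress=False)

embeddings = engine.encode(docs, normalize=True)  # shape (3, embedding_dim), L2-normalized
print(engine.model_info)  # resolved model name, embedding_dim, language, multilingual

# Module-level helper added in 1.1.0: pairwise similarities with the diagonal
# forced to 1 and values clipped to [0, 1].
similarity = compute_similarity_matrix(embeddings, metric="cosine")
print(similarity.shape)  # (3, 3)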