tritopic 0.1.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,222 +1,216 @@
 """
-Embedding Engine for TriTopic
-==============================
+Embedding Engine Module
 
-Handles document embedding with support for multiple models:
-- Sentence-BERT models (default)
-- Instructor models (task-specific)
-- BGE models (multilingual)
+Handles document embedding generation with multilingual model selection.
 """
 
-from __future__ import annotations
-
-from typing import Any, Literal
-
+from typing import List, Optional, Union
 import numpy as np
-from tqdm import tqdm
+import warnings
 
 
 class EmbeddingEngine:
     """
-    Generate document embeddings using transformer models.
-
-    Supports various embedding models optimized for different use cases.
+    Generates document embeddings using Sentence Transformers.
 
-    Parameters
-    ----------
-    model_name : str
-        Name of the sentence-transformers model. Popular choices:
-        - "all-MiniLM-L6-v2": Fast, good quality (default)
-        - "all-mpnet-base-v2": Higher quality, slower
-        - "BAAI/bge-base-en-v1.5": State-of-the-art
-        - "BAAI/bge-m3": Multilingual
-        - "hkunlp/instructor-large": Task-specific (use with instruction)
-    batch_size : int
-        Batch size for encoding. Default: 32
-    device : str or None
-        Device to use ("cuda", "cpu", or None for auto).
-    show_progress : bool
-        Show progress bar. Default: True
+    Supports automatic model selection based on language configuration.
     """
 
+    # Model recommendations by language
+    LANGUAGE_MODELS = {
+        'en': 'all-MiniLM-L6-v2',
+        'zh': 'BAAI/bge-base-zh-v1.5',
+        'multilingual': 'paraphrase-multilingual-mpnet-base-v2',
+        'multilingual_small': 'paraphrase-multilingual-MiniLM-L12-v2',
+        'multilingual_best': 'BAAI/bge-m3',
+    }
+
     def __init__(
         self,
-        model_name: str = "all-MiniLM-L6-v2",
+        model_name: str = "auto",
+        language: str = "en",
+        multilingual: bool = False,
         batch_size: int = 32,
-        device: str | None = None,
+        device: Optional[str] = None,
         show_progress: bool = True,
     ):
+        """
+        Initialize the embedding engine.
+
+        Parameters
+        ----------
+        model_name : str
+            Model name or "auto" for automatic selection
+        language : str
+            Language code for model selection
+        multilingual : bool
+            Force multilingual model
+        batch_size : int
+            Batch size for encoding
+        device : str, optional
+            Device to use ('cuda', 'cpu', or None for auto)
+        show_progress : bool
+            Show progress bar during encoding
+        """
         self.model_name = model_name
+        self.language = language
+        self.multilingual = multilingual
         self.batch_size = batch_size
         self.device = device
         self.show_progress = show_progress
 
         self._model = None
-        self._is_instructor = "instructor" in model_name.lower()
+        self._resolved_model_name = None
+
+    def _resolve_model_name(self) -> str:
+        """Resolve the model name based on configuration."""
+        if self.model_name != "auto":
+            return self.model_name
+
+        if self.multilingual:
+            return self.LANGUAGE_MODELS['multilingual']
+
+        lang = self.language.lower()
+
+        if lang == 'en':
+            return self.LANGUAGE_MODELS['en']
+        elif lang == 'zh':
+            return self.LANGUAGE_MODELS['zh']
+        elif lang in ['ja', 'ko', 'th', 'vi', 'ar', 'he', 'hi']:
+            # Asian and Middle Eastern languages need multilingual
+            return self.LANGUAGE_MODELS['multilingual_small']
+        elif lang in ['de', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'sv', 'da', 'no', 'fi']:
+            # European languages
+            return self.LANGUAGE_MODELS['multilingual_small']
+        else:
+            # Default to multilingual for unknown languages
+            return self.LANGUAGE_MODELS['multilingual_small']
 
     def _load_model(self):
-        """Lazy load the embedding model."""
+        """Load the sentence transformer model."""
         if self._model is not None:
             return
 
-        from sentence_transformers import SentenceTransformer
-
-        self._model = SentenceTransformer(
-            self.model_name,
-            device=self.device,
-        )
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for embedding generation. "
+                "Install with: pip install sentence-transformers"
+            )
+
+        self._resolved_model_name = self._resolve_model_name()
+
+        try:
+            self._model = SentenceTransformer(
+                self._resolved_model_name,
+                device=self.device
+            )
+        except Exception as e:
+            # Fall back to a known working model
+            warnings.warn(
+                f"Could not load model '{self._resolved_model_name}': {e}. "
+                f"Falling back to 'all-MiniLM-L6-v2'"
+            )
+            self._resolved_model_name = 'all-MiniLM-L6-v2'
+            self._model = SentenceTransformer(
+                self._resolved_model_name,
+                device=self.device
+            )
 
     def encode(
         self,
-        documents: list[str],
-        instruction: str | None = None,
+        documents: List[str],
         normalize: bool = True,
     ) -> np.ndarray:
         """
-        Encode documents to embeddings.
+        Encode documents into embeddings.
 
         Parameters
         ----------
-        documents : list[str]
-            List of document texts.
-        instruction : str, optional
-            Instruction for Instructor models (e.g., "Represent the topic of this document:").
+        documents : List[str]
+            List of documents to encode
         normalize : bool
-            Whether to L2-normalize embeddings. Default: True
+            Whether to L2-normalize embeddings
 
         Returns
         -------
-        embeddings : np.ndarray
-            Document embeddings of shape (n_docs, embedding_dim).
+        np.ndarray
+            Document embeddings of shape (n_documents, embedding_dim)
         """
         self._load_model()
 
-        # Handle instructor models
-        if self._is_instructor and instruction:
-            documents = [[instruction, doc] for doc in documents]
-
-        # Encode in batches
         embeddings = self._model.encode(
             documents,
             batch_size=self.batch_size,
             show_progress_bar=self.show_progress,
-            normalize_embeddings=normalize,
             convert_to_numpy=True,
+            normalize_embeddings=normalize,
         )
 
         return embeddings
 
-    def encode_with_pooling(
-        self,
-        documents: list[str],
-        pooling: Literal["mean", "max", "cls"] = "mean",
-    ) -> np.ndarray:
-        """
-        Encode with custom pooling strategy.
-
-        Parameters
-        ----------
-        documents : list[str]
-            Document texts.
-        pooling : str
-            Pooling strategy: "mean", "max", or "cls".
-
-        Returns
-        -------
-        embeddings : np.ndarray
-            Pooled embeddings.
-        """
-        # For now, use default pooling from model
-        # Custom pooling would require access to token-level embeddings
-        return self.encode(documents)
-
     @property
     def embedding_dim(self) -> int:
-        """Get embedding dimension."""
+        """Get the embedding dimension."""
         self._load_model()
         return self._model.get_sentence_embedding_dimension()
 
-    def similarity(
-        self,
-        embeddings1: np.ndarray,
-        embeddings2: np.ndarray | None = None,
-    ) -> np.ndarray:
-        """
-        Compute cosine similarity between embeddings.
-
-        Parameters
-        ----------
-        embeddings1 : np.ndarray
-            First set of embeddings.
-        embeddings2 : np.ndarray, optional
-            Second set. If None, compute pairwise similarity of embeddings1.
-
-        Returns
-        -------
-        similarity : np.ndarray
-            Similarity matrix.
-        """
-        from sklearn.metrics.pairwise import cosine_similarity
-
-        if embeddings2 is None:
-            return cosine_similarity(embeddings1)
-        return cosine_similarity(embeddings1, embeddings2)
+    @property
+    def model_info(self) -> dict:
+        """Get information about the loaded model."""
+        self._load_model()
+        return {
+            'model_name': self._resolved_model_name,
+            'embedding_dim': self.embedding_dim,
+            'language': self.language,
+            'multilingual': self.multilingual,
+        }
 
 
-class MultiModelEmbedding:
+def compute_similarity_matrix(
+    embeddings: np.ndarray,
+    metric: str = "cosine"
+) -> np.ndarray:
     """
-    Combine embeddings from multiple models.
+    Compute pairwise similarity matrix from embeddings.
 
-    Useful for ensemble approaches where different models capture
-    different aspects of document semantics.
+    Parameters
+    ----------
+    embeddings : np.ndarray
+        Document embeddings of shape (n_documents, embedding_dim)
+    metric : str
+        Similarity metric: "cosine", "euclidean", "dot"
+
+    Returns
+    -------
+    np.ndarray
+        Similarity matrix of shape (n_documents, n_documents)
     """
+    if metric == "cosine":
+        # For normalized embeddings, cosine similarity = dot product
+        # Ensure normalization
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+        norms[norms == 0] = 1  # Avoid division by zero
+        normalized = embeddings / norms
+        similarity = np.dot(normalized, normalized.T)
+
+    elif metric == "dot":
+        similarity = np.dot(embeddings, embeddings.T)
+
+    elif metric == "euclidean":
+        # Convert Euclidean distance to similarity
+        from scipy.spatial.distance import cdist
+        distances = cdist(embeddings, embeddings, metric='euclidean')
+        similarity = 1 / (1 + distances)
+
+    else:
+        raise ValueError(f"Unknown metric: {metric}")
 
-    def __init__(
-        self,
-        model_names: list[str],
-        weights: list[float] | None = None,
-        batch_size: int = 32,
-    ):
-        self.model_names = model_names
-        self.weights = weights or [1.0 / len(model_names)] * len(model_names)
-        self.batch_size = batch_size
-
-        self._engines = [
-            EmbeddingEngine(name, batch_size=batch_size)
-            for name in model_names
-        ]
+    # Ensure diagonal is 1 (self-similarity)
+    np.fill_diagonal(similarity, 1.0)
 
-    def encode(
-        self,
-        documents: list[str],
-        normalize: bool = True,
-    ) -> np.ndarray:
-        """
-        Encode using all models and combine.
-
-        Parameters
-        ----------
-        documents : list[str]
-            Document texts.
-        normalize : bool
-            Normalize final embeddings.
-
-        Returns
-        -------
-        embeddings : np.ndarray
-            Combined embeddings (concatenated).
-        """
-        all_embeddings = []
-
-        for engine, weight in zip(self._engines, self.weights):
-            emb = engine.encode(documents, normalize=True)
-            all_embeddings.append(emb * weight)
-
-        # Concatenate
-        combined = np.hstack(all_embeddings)
-
-        if normalize:
-            norms = np.linalg.norm(combined, axis=1, keepdims=True)
-            combined = combined / (norms + 1e-10)
-
-        return combined
+    # Clip to [0, 1] range
+    similarity = np.clip(similarity, 0, 1)
+
+    return similarity
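
For orientation, a minimal usage sketch of the 1.1.0 API shown above. The engine variable, the sample documents, and the printed result are illustrative; the module's import path is not shown in this diff, so the sketch assumes EmbeddingEngine is already in scope.

    # Hypothetical usage of EmbeddingEngine as defined in the new version.
    engine = EmbeddingEngine(model_name="auto", language="de")

    docs = ["A short test document.", "Another document."]
    embeddings = engine.encode(docs)  # ndarray of shape (2, embedding_dim)

    # Per _resolve_model_name(), language="de" resolves to the
    # 'multilingual_small' entry: 'paraphrase-multilingual-MiniLM-L12-v2'.
    print(engine.model_info)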
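
And a small numeric check of the new compute_similarity_matrix helper (the input vectors are illustrative; the output follows from the cosine branch above):

    import numpy as np

    vecs = np.array([
        [1.0, 0.0],
        [0.0, 1.0],
        [1.0, 1.0],
    ])
    sim = compute_similarity_matrix(vecs, metric="cosine")

    # Rows are L2-normalized first, so the orthogonal pair scores 0.0;
    # np.fill_diagonal forces self-similarity to 1.0, and np.clip keeps
    # every value in [0, 1].
    print(np.round(sim, 3))
    # approximately:
    # [[1.     0.     0.707]
    #  [0.     1.     0.707]
    #  [0.707  0.707  1.   ]]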