flock-core 0.5.21__py3-none-any.whl → 0.5.22__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.

Note: this version of flock-core has been flagged as potentially problematic.

@@ -0,0 +1,235 @@
+ """Embedding service for semantic matching.
+
+ This module provides a singleton service for generating and caching embeddings
+ using sentence-transformers.
+ """
+
+ import logging
+ from collections import OrderedDict
+
+ import numpy as np
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class LRUCache:
+     """Simple LRU cache with size limit."""
+
+     def __init__(self, max_size: int = 10000):
+         """Initialize LRU cache.
+
+         Args:
+             max_size: Maximum number of entries
+         """
+         self.max_size = max_size
+         self._cache: OrderedDict[str, np.ndarray] = OrderedDict()
+
+     def get(self, key: str) -> np.ndarray | None:
+         """Get value and mark as recently used."""
+         if key not in self._cache:
+             return None
+         # Move to end (most recent)
+         self._cache.move_to_end(key)
+         return self._cache[key]
+
+     def put(self, key: str, value: np.ndarray) -> None:
+         """Put value and evict LRU if needed."""
+         if key in self._cache:
+             # Update and move to end
+             self._cache.move_to_end(key)
+         self._cache[key] = value
+
+         # Evict oldest if over limit
+         if len(self._cache) > self.max_size:
+             self._cache.popitem(last=False)  # Remove oldest (first item)
+
+     def __contains__(self, key: str) -> bool:
+         """Check if key exists in cache."""
+         return key in self._cache
+
+     def __len__(self) -> int:
+         """Get cache size."""
+         return len(self._cache)
+
+
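To make the eviction order above concrete, here is a small usage sketch (illustrative only, not part of the diff): with max_size=2, reading a key refreshes it, so the untouched key is the one evicted.

import numpy as np

cache = LRUCache(max_size=2)
cache.put("a", np.zeros(3))
cache.put("b", np.ones(3))
cache.get("a")                   # refreshes "a": it is now most recently used
cache.put("c", np.full(3, 2.0))  # over the limit: evicts "b", the least recently used
assert "a" in cache and "c" in cache and "b" not in cache
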
+ class EmbeddingService:
+     """Singleton service for text embeddings using sentence-transformers.
+
+     This class manages the lifecycle of the embedding model and provides
+     efficient caching of embeddings.
+     """
+
+     _instance = None
+
+     def __init__(self, cache_size: int = 10000):
+         """Private constructor - use get_instance() instead.
+
+         Args:
+             cache_size: Maximum number of embeddings to cache
+         """
+         self._model = None
+         self._cache = LRUCache(max_size=cache_size)
+         self._cache_size = cache_size
+         self._hits = 0
+         self._misses = 0
+
+     @staticmethod
+     def get_instance(cache_size: int = 10000):
+         """Get or create the singleton EmbeddingService instance.
+
+         Args:
+             cache_size: Maximum number of embeddings to cache (default: 10000)
+
+         Returns:
+             EmbeddingService: The singleton instance
+         """
+         if EmbeddingService._instance is None:
+             EmbeddingService._instance = EmbeddingService(cache_size=cache_size)
+         return EmbeddingService._instance
+
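One behavior of get_instance worth flagging: cache_size is only honored on the very first call; once the singleton exists, later calls return it unchanged. A minimal sketch:

svc = EmbeddingService.get_instance(cache_size=500)    # first call creates the instance
same = EmbeddingService.get_instance(cache_size=9999)  # later argument is silently ignored
assert svc is same
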
+     def _load_model(self):
+         """Lazy load the sentence-transformers model."""
+         if self._model is None:
+             from sentence_transformers import SentenceTransformer
+
+             logger.info("Loading sentence-transformers model: all-MiniLM-L6-v2")
+             self._model = SentenceTransformer("all-MiniLM-L6-v2")
+             logger.info("Model loaded successfully")
+
+     def embed(self, text: str) -> np.ndarray:
+         """Generate embedding for a single text.
+
+         Args:
+             text: The text to embed
+
+         Returns:
+             np.ndarray: 384-dimensional embedding vector
+
+         Raises:
+             ValueError: If text is empty
+         """
+         if not text or not text.strip():
+             raise ValueError("Cannot embed empty text")
+
+         # Check cache first
+         cached = self._cache.get(text)
+         if cached is not None:
+             self._hits += 1
+             return cached
+
+         # Cache miss - generate embedding
+         self._misses += 1
+         self._load_model()
+
+         # Generate embedding
+         embedding = self._model.encode(
+             text, convert_to_numpy=True, show_progress_bar=False
+         )
+
+         # Ensure it's a float32 numpy array and flatten to 1D
+         if not isinstance(embedding, np.ndarray):
+             embedding = np.array(embedding, dtype=np.float32)
+
+         # Flatten to 1D if needed (model might return (1, 384) for single text)
+         if embedding.ndim > 1:
+             embedding = embedding.flatten()
+
+         # Store in cache
+         self._cache.put(text, embedding)
+
+         return embedding
+
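A minimal end-to-end sketch of the caching path above, assuming sentence-transformers is installed (the first call triggers the lazy model load and download):

svc = EmbeddingService.get_instance()
v1 = svc.embed("hello world")  # miss: loads the model and computes the vector
v2 = svc.embed("hello world")  # hit: served straight from the LRU cache
assert v1 is v2 and v1.shape == (384,)
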
+     def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
+         """Generate embeddings for multiple texts efficiently.
+
+         Args:
+             texts: List of texts to embed
+
+         Returns:
+             list[np.ndarray]: List of embedding vectors
+         """
+         if not texts:
+             return []
+
+         # Separate cached and uncached
+         results = [None] * len(texts)
+         to_encode = []
+         to_encode_indices = []
+
+         for i, text in enumerate(texts):
+             cached = self._cache.get(text)
+             if cached is not None:
+                 results[i] = cached
+                 self._hits += 1
+             else:
+                 to_encode.append(text)
+                 to_encode_indices.append(i)
+                 self._misses += 1
+
+         # Batch encode uncached texts
+         if to_encode:
+             self._load_model()
+             embeddings = self._model.encode(
+                 to_encode, convert_to_numpy=True, show_progress_bar=False
+             )
+
+             # Store in cache and results
+             for i, (text, embedding) in enumerate(
+                 zip(to_encode, embeddings, strict=False)
+             ):
+                 if not isinstance(embedding, np.ndarray):
+                     embedding = np.array(embedding, dtype=np.float32)
+                 # Flatten to 1D if needed
+                 if embedding.ndim > 1:
+                     embedding = embedding.flatten()
+                 self._cache.put(text, embedding)
+                 results[to_encode_indices[i]] = embedding
+
+         return results  # type: ignore
+
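A sketch of the mixed path in embed_batch: anything already cached is reused, and only the remainder goes through a single batched encode call.

svc = EmbeddingService.get_instance()
svc.embed("alpha")  # warm the cache with one entry
vecs = svc.embed_batch(["alpha", "beta", "gamma"])  # "alpha" hits; the other two are encoded together
assert len(vecs) == 3 and all(v.shape == (384,) for v in vecs)
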
+     def similarity(self, text1: str, text2: str) -> float:
+         """Compute semantic similarity between two texts.
+
+         Uses cosine similarity between embeddings.
+
+         Args:
+             text1: First text
+             text2: Second text
+
+         Returns:
+             float: Similarity score between 0 and 1
+         """
+         emb1 = self.embed(text1)
+         emb2 = self.embed(text2)
+
+         # Compute cosine similarity
+         dot_product = np.dot(emb1, emb2)
+         norm1 = np.linalg.norm(emb1)
+         norm2 = np.linalg.norm(emb2)
+
+         if norm1 == 0 or norm2 == 0:
+             return 0.0
+
+         similarity = dot_product / (norm1 * norm2)
+
+         # Clamp to [0, 1] and handle floating point errors
+         return float(max(0.0, min(1.0, similarity)))
+
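As a concrete check of the cosine formula used above, with toy vectors rather than model embeddings (purely illustrative). Note that the max(0.0, ...) clamp also maps a genuinely negative cosine to 0, not just floating point noise above 1.0.

import numpy as np

e1 = np.array([1.0, 0.0])
e2 = np.array([1.0, 1.0])
cos = np.dot(e1, e2) / (np.linalg.norm(e1) * np.linalg.norm(e2))
print(round(float(cos), 4))  # 0.7071, the cosine of 45 degrees
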
+     def get_cache_stats(self) -> dict:
+         """Get cache hit/miss statistics.
+
+         Returns:
+             dict: Statistics including hits, misses, and hit rate
+         """
+         total = self._hits + self._misses
+         hit_rate = self._hits / total if total > 0 else 0.0
+
+         return {
+             "hits": self._hits,
+             "misses": self._misses,
+             "total": total,
+             "hit_rate": hit_rate,
+             "cache_size": len(self._cache),
+             "cache_limit": self._cache_size,
+         }
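
Tying the counters together: in a fresh process, embedding the same text twice yields one miss and one hit, so get_cache_stats would report the following (assuming the default cache_size of 10000):

svc = EmbeddingService.get_instance()
svc.embed("query")  # miss
svc.embed("query")  # hit
print(svc.get_cache_stats())
# {'hits': 1, 'misses': 1, 'total': 2, 'hit_rate': 0.5, 'cache_size': 1, 'cache_limit': 10000}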