embed-client 1.0.1.1__tar.gz → 2.0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/PKG-INFO +1 -1
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client/async_client.py +69 -6
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client/example_async_usage_ru.py +11 -3
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client.egg-info/PKG-INFO +1 -1
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/pyproject.toml +1 -1
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/tests/test_async_client.py +39 -3
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/README.md +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client/__init__.py +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client/example_async_usage.py +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client.egg-info/SOURCES.txt +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client.egg-info/dependency_links.txt +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client.egg-info/requires.txt +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/embed_client.egg-info/top_level.txt +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/setup.cfg +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/tests/test_async_client_real.py +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/tests/test_async_client_stress.py +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/tests/test_async_client_stress_new.py +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/tests/test_async_client_stress_updated.py +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/tests/test_example_async_usage.py +0 -0
- {embed_client-1.0.1.1 → embed_client-2.0.0.0}/tests/test_example_async_usage_ru.py +0 -0
@@ -48,7 +48,7 @@ class EmbeddingServiceAsyncClient:
|
|
48
48
|
|
49
49
|
Supports both old and new API formats:
|
50
50
|
- Old format: {"result": {"success": true, "data": {"embeddings": [...]}}}
|
51
|
-
- New format: {"result": {"success": true, "data": [{"body": "text", "embedding": [...], "chunks": [...]}]}}
|
51
|
+
- New format: {"result": {"success": true, "data": {"embeddings": [...], "results": [{"body": "text", "embedding": [...], "tokens": [...], "bm25_tokens": [...]}]}}}
|
52
52
|
|
53
53
|
Args:
|
54
54
|
base_url (str): Base URL of the embedding service (e.g., "http://localhost").
|
@@ -172,13 +172,33 @@ class EmbeddingServiceAsyncClient:
|
|
172
172
|
result: API response dictionary
|
173
173
|
|
174
174
|
Returns:
|
175
|
-
List of dictionaries with 'body', 'embedding', and 'chunks' fields
|
175
|
+
List of dictionaries with 'body', 'embedding', 'tokens', and 'bm25_tokens' fields
|
176
176
|
|
177
177
|
Raises:
|
178
178
|
ValueError: If data cannot be extracted or the response is in the old format
|
179
179
|
"""
|
180
180
|
if "result" in result and isinstance(result["result"], dict):
|
181
181
|
res = result["result"]
|
182
|
+
if "data" in res and isinstance(res["data"], dict) and "results" in res["data"]:
|
183
|
+
# New format: result.data.results[]
|
184
|
+
results = res["data"]["results"]
|
185
|
+
if isinstance(results, list):
|
186
|
+
# Validate that all items have required fields
|
187
|
+
for i, item in enumerate(results):
|
188
|
+
if not isinstance(item, dict):
|
189
|
+
raise ValueError(f"Item {i} is not a dictionary: {item}")
|
190
|
+
if "body" not in item:
|
191
|
+
raise ValueError(f"Item {i} missing 'body' field: {item}")
|
192
|
+
if "embedding" not in item:
|
193
|
+
raise ValueError(f"Item {i} missing 'embedding' field: {item}")
|
194
|
+
if "tokens" not in item:
|
195
|
+
raise ValueError(f"Item {i} missing 'tokens' field: {item}")
|
196
|
+
if "bm25_tokens" not in item:
|
197
|
+
raise ValueError(f"Item {i} missing 'bm25_tokens' field: {item}")
|
198
|
+
|
199
|
+
return results
|
200
|
+
|
201
|
+
# Legacy support for old format: result.data[]
|
182
202
|
if "data" in res and isinstance(res["data"], list):
|
183
203
|
# Validate that all items have required fields
|
184
204
|
for i, item in enumerate(res["data"]):
|
@@ -188,8 +208,9 @@ class EmbeddingServiceAsyncClient:
|
|
188
208
|
raise ValueError(f"Item {i} missing 'body' field: {item}")
|
189
209
|
if "embedding" not in item:
|
190
210
|
raise ValueError(f"Item {i} missing 'embedding' field: {item}")
|
191
|
-
|
192
|
-
|
211
|
+
# Old format had 'chunks' instead of 'tokens'
|
212
|
+
if "chunks" not in item and "tokens" not in item:
|
213
|
+
raise ValueError(f"Item {i} missing 'chunks' or 'tokens' field: {item}")
|
193
214
|
|
194
215
|
return res["data"]
|
195
216
|
|
@@ -214,18 +235,60 @@ class EmbeddingServiceAsyncClient:
|
|
214
235
|
def extract_chunks(self, result: Dict[str, Any]) -> List[List[str]]:
|
215
236
|
"""
|
216
237
|
Extract text chunks from API response (new format only).
|
238
|
+
Note: This method now extracts 'tokens' instead of 'chunks' for compatibility.
|
217
239
|
|
218
240
|
Args:
|
219
241
|
result: API response dictionary
|
220
242
|
|
221
243
|
Returns:
|
222
|
-
List of
|
244
|
+
List of token lists for each text
|
223
245
|
|
224
246
|
Raises:
|
225
247
|
ValueError: If chunks cannot be extracted or the response is in the old format
|
226
248
|
"""
|
227
249
|
data = self.extract_embedding_data(result)
|
228
|
-
|
250
|
+
chunks = []
|
251
|
+
for item in data:
|
252
|
+
# New format uses 'tokens', old format used 'chunks'
|
253
|
+
if "tokens" in item:
|
254
|
+
chunks.append(item["tokens"])
|
255
|
+
elif "chunks" in item:
|
256
|
+
chunks.append(item["chunks"])
|
257
|
+
else:
|
258
|
+
raise ValueError(f"Item missing both 'tokens' and 'chunks' fields: {item}")
|
259
|
+
return chunks
|
260
|
+
|
261
|
+
def extract_tokens(self, result: Dict[str, Any]) -> List[List[str]]:
|
262
|
+
"""
|
263
|
+
Extract tokens from API response (new format only).
|
264
|
+
|
265
|
+
Args:
|
266
|
+
result: API response dictionary
|
267
|
+
|
268
|
+
Returns:
|
269
|
+
List of token lists for each text
|
270
|
+
|
271
|
+
Raises:
|
272
|
+
ValueError: If tokens cannot be extracted or the response is in the old format
|
273
|
+
"""
|
274
|
+
data = self.extract_embedding_data(result)
|
275
|
+
return [item["tokens"] for item in data]
|
276
|
+
|
277
|
+
def extract_bm25_tokens(self, result: Dict[str, Any]) -> List[List[str]]:
|
278
|
+
"""
|
279
|
+
Extract BM25 tokens from API response (new format only).
|
280
|
+
|
281
|
+
Args:
|
282
|
+
result: API response dictionary
|
283
|
+
|
284
|
+
Returns:
|
285
|
+
List of BM25 token lists for each text
|
286
|
+
|
287
|
+
Raises:
|
288
|
+
ValueError: If BM25 tokens cannot be extracted or the response is in the old format
|
289
|
+
"""
|
290
|
+
data = self.extract_embedding_data(result)
|
291
|
+
return [item["bm25_tokens"] for item in data]
|
229
292
|
|
230
293
|
async def __aenter__(self):
|
231
294
|
try:
|
@@ -88,9 +88,17 @@ async def main():
|
|
88
88
|
print("\nAdditional data from new format:")
|
89
89
|
for i, data in enumerate(embedding_data):
|
90
90
|
print(f" Text: {data['body']!r}")
|
91
|
-
print(f"
|
92
|
-
|
93
|
-
|
91
|
+
print(f" Tokens: {data['tokens']}")
|
92
|
+
print(f" BM25 tokens: {data['bm25_tokens']}")
|
93
|
+
|
94
|
+
# Extract tokens and BM25 tokens separately
|
95
|
+
tokens = client.extract_tokens(result)
|
96
|
+
bm25_tokens = client.extract_bm25_tokens(result)
|
97
|
+
print(f"\nExtracted tokens: {tokens}")
|
98
|
+
print(f"Extracted BM25 tokens: {bm25_tokens}")
|
99
|
+
|
100
|
+
except ValueError as e:
|
101
|
+
print(f"(Old format detected - no additional data available): {e}")
|
94
102
|
|
95
103
|
except EmbeddingServiceAPIError as e:
|
96
104
|
print("[API error]", e.error)
|
@@ -810,13 +810,49 @@ async def test_extract_chunks():
|
|
810
810
|
result = {
|
811
811
|
"result": {
|
812
812
|
"data": [
|
813
|
-
{"body": "text1", "embedding": [0.1, 0.2], "
|
814
|
-
{"body": "text2", "embedding": [0.3, 0.4], "
|
813
|
+
{"body": "text1", "embedding": [0.1, 0.2], "tokens": ["text1"]},
|
814
|
+
{"body": "text2", "embedding": [0.3, 0.4], "tokens": ["text2"]}
|
815
815
|
]
|
816
816
|
}
|
817
817
|
}
|
818
818
|
chunks = client.extract_chunks(result)
|
819
|
-
assert chunks == [["
|
819
|
+
assert chunks == [["text1"], ["text2"]]
|
820
|
+
|
821
|
+
@pytest.mark.asyncio
|
822
|
+
async def test_extract_tokens():
|
823
|
+
"""Test extracting tokens from new format responses."""
|
824
|
+
client = EmbeddingServiceAsyncClient(base_url=BASE_URL, port=PORT)
|
825
|
+
|
826
|
+
result = {
|
827
|
+
"result": {
|
828
|
+
"data": {
|
829
|
+
"results": [
|
830
|
+
{"body": "text1", "embedding": [0.1, 0.2], "tokens": ["text1"], "bm25_tokens": ["text1"]},
|
831
|
+
{"body": "text2", "embedding": [0.3, 0.4], "tokens": ["text2"], "bm25_tokens": ["text2"]}
|
832
|
+
]
|
833
|
+
}
|
834
|
+
}
|
835
|
+
}
|
836
|
+
tokens = client.extract_tokens(result)
|
837
|
+
assert tokens == [["text1"], ["text2"]]
|
838
|
+
|
839
|
+
@pytest.mark.asyncio
|
840
|
+
async def test_extract_bm25_tokens():
|
841
|
+
"""Test extracting BM25 tokens from new format responses."""
|
842
|
+
client = EmbeddingServiceAsyncClient(base_url=BASE_URL, port=PORT)
|
843
|
+
|
844
|
+
result = {
|
845
|
+
"result": {
|
846
|
+
"data": {
|
847
|
+
"results": [
|
848
|
+
{"body": "text1", "embedding": [0.1, 0.2], "tokens": ["text1"], "bm25_tokens": ["text1"]},
|
849
|
+
{"body": "text2", "embedding": [0.3, 0.4], "tokens": ["text2"], "bm25_tokens": ["text2"]}
|
850
|
+
]
|
851
|
+
}
|
852
|
+
}
|
853
|
+
}
|
854
|
+
bm25_tokens = client.extract_bm25_tokens(result)
|
855
|
+
assert bm25_tokens == [["text1"], ["text2"]]
|
820
856
|
|
821
857
|
@pytest.mark.asyncio
|
822
858
|
async def test_close_without_open():
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|