@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. package/README.md +3 -3
  2. package/bin/cli.js +1 -1
  3. package/bin/commands/config.js +1 -1
  4. package/dist/index.cjs +1 -1
  5. package/dist/index.js +1 -1
  6. package/package.json +2 -2
  7. package/packages/doctor/src/checks/local-memory.js +2 -2
  8. package/packages/memory/README.md +2 -2
  9. package/packages/memory/openclaw-plugin/README.md +2 -2
  10. package/packages/memory/openclaw-plugin/openclaw.plugin.json +1 -1
  11. package/packages/memory/src/server.js +2 -2
  12. package/packages/memory-engine-v2/.env.example +30 -0
  13. package/packages/memory-engine-v2/README.md +125 -0
  14. package/packages/memory-engine-v2/compat/Dockerfile +11 -0
  15. package/packages/memory-engine-v2/compat/requirements.txt +6 -0
  16. package/packages/memory-engine-v2/compat/server.py +1047 -0
  17. package/packages/memory-engine-v2/docker-compose.aws.yml +78 -0
  18. package/packages/memory-engine-v2/docker-compose.yml +206 -0
  19. package/packages/memory-engine-v2/extractor-async/Dockerfile +14 -0
  20. package/packages/memory-engine-v2/extractor-async/confidence.py +62 -0
  21. package/packages/memory-engine-v2/extractor-async/noise_filter.py +144 -0
  22. package/packages/memory-engine-v2/extractor-async/requirements.txt +2 -0
  23. package/packages/memory-engine-v2/extractor-async/test_confidence.py +76 -0
  24. package/packages/memory-engine-v2/extractor-async/test_noise_filter.py +177 -0
  25. package/packages/memory-engine-v2/extractor-async/worker.py +797 -0
  26. package/packages/memory-engine-v2/extractor-sync/Dockerfile +11 -0
  27. package/packages/memory-engine-v2/extractor-sync/requirements.txt +4 -0
  28. package/packages/memory-engine-v2/extractor-sync/server.py +424 -0
  29. package/packages/memory-engine-v2/org-model/migrations/001_init.sql +390 -0
  30. package/packages/memory-engine-v2/tests/e2e_smoke.py +356 -0
  31. package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py +758 -0
  32. package/packages/memory-engine/.env.example +0 -13
  33. package/packages/memory-engine/MIGRATION.md +0 -219
  34. package/packages/memory-engine/README.md +0 -145
  35. package/packages/memory-engine/bench/README.md +0 -99
  36. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +0 -1115
  37. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +0 -819
  38. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +0 -1278
  39. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +0 -1018
  40. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +0 -1038
  41. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +0 -961
  42. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +0 -1115
  43. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +0 -819
  44. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +0 -1278
  45. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +0 -1018
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +0 -1038
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +0 -937
  48. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +0 -1115
  49. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +0 -819
  50. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +0 -1278
  51. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +0 -1018
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +0 -1038
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +0 -961
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +0 -1115
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +0 -819
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +0 -1278
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +0 -1018
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +0 -1038
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +0 -883
  60. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +0 -1115
  61. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +0 -819
  62. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +0 -1278
  63. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +0 -1018
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +0 -1038
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +0 -937
  66. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +0 -1115
  67. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +0 -1115
  68. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +0 -819
  69. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +0 -542
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +0 -1278
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +0 -894
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +0 -1018
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +0 -680
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +0 -1038
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +0 -693
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +0 -961
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +0 -727
  78. package/packages/memory-engine/compat/Dockerfile +0 -22
  79. package/packages/memory-engine/compat/server.py +0 -1255
  80. package/packages/memory-engine/docker-compose.test.yml +0 -59
  81. package/packages/memory-engine/docker-compose.yml +0 -255
  82. package/packages/memory-engine/engine/README.md +0 -52
  83. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +0 -1543
  84. package/packages/memory-engine/engine/l5-comms-layer.py +0 -663
  85. package/packages/memory-engine/engine/l6-document-store.py +0 -1018
  86. package/packages/memory-engine/engine/services/_shared/__init__.py +0 -1
  87. package/packages/memory-engine/engine/services/_shared/embed_provider.py +0 -562
  88. package/packages/memory-engine/engine/services/l2/Dockerfile +0 -50
  89. package/packages/memory-engine/engine/services/l2/init_databases.py +0 -81
  90. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +0 -2721
  91. package/packages/memory-engine/engine/services/l5/Dockerfile +0 -11
  92. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +0 -808
  93. package/packages/memory-engine/engine/services/l6/Dockerfile +0 -30
  94. package/packages/memory-engine/engine/services/l6/l6-document-store.py +0 -1221
  95. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +0 -28
  96. package/packages/memory-engine/engine/services/nv-embed/server.py +0 -152
  97. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  98. package/packages/memory-engine/pme_memory/__main__.py +0 -129
  99. package/packages/memory-engine/pme_memory/artifacts.py +0 -95
  100. package/packages/memory-engine/pme_memory/embed.py +0 -74
  101. package/packages/memory-engine/pme_memory/health.py +0 -36
  102. package/packages/memory-engine/pme_memory/hygiene.py +0 -159
  103. package/packages/memory-engine/pme_memory/indexer.py +0 -200
  104. package/packages/memory-engine/pme_memory/needs.py +0 -55
  105. package/packages/memory-engine/pme_memory/provenance.py +0 -80
  106. package/packages/memory-engine/pme_memory/scoring.py +0 -168
  107. package/packages/memory-engine/pme_memory/search.py +0 -52
  108. package/packages/memory-engine/pme_memory/store.py +0 -86
  109. package/packages/memory-engine/pme_memory/synthesis.py +0 -114
  110. package/packages/memory-engine/pyproject.toml +0 -65
  111. package/packages/memory-engine/scripts/kg-extractor.py +0 -557
  112. package/packages/memory-engine/scripts/kg-preflexor-v2.py +0 -738
  113. package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +0 -128
  114. package/packages/memory-engine/tests/e2e_arena.sh +0 -259
  115. package/packages/memory-engine/tests/embed_stub/Dockerfile +0 -13
  116. package/packages/memory-engine/tests/embed_stub/server.py +0 -80
  117. package/packages/memory-engine/tests/test_aggregate.py +0 -333
  118. package/packages/memory-engine/tests/test_api_contract.sh +0 -57
  119. package/packages/memory-engine/tests/test_arena_safety.py +0 -232
  120. package/packages/memory-engine/tests/test_channel_stat_reader.py +0 -437
  121. package/packages/memory-engine/tests/test_channel_stat_rollups.py +0 -308
  122. package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +0 -48
  123. package/packages/memory-engine/tests/test_embed_provider.py +0 -693
  124. package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +0 -280
  125. package/packages/memory-engine/tests/test_l3_arena_isolation.py +0 -412
  126. package/packages/memory-engine/tests/test_l6_module_load.py +0 -84
  127. package/packages/memory-engine/tests/test_people_list_reader.py +0 -432
@@ -1,693 +0,0 @@
1
- """Unit tests for engine/services/_shared/embed_provider.py.
2
-
3
- Run with:
4
- cd packages/memory-engine
5
- python -m pytest tests/test_embed_provider.py -v
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- import sys
11
- from pathlib import Path
12
-
13
- # Make the engine/services tree importable for tests without packaging it.
14
- ROOT = Path(__file__).parent.parent / "engine" / "services"
15
- sys.path.insert(0, str(ROOT))
16
-
17
- import json # noqa: E402
18
-
19
- import httpx # noqa: E402
20
- import pytest # noqa: E402
21
-
22
- from _shared.embed_provider import ( # noqa: E402
23
- PROVIDERS,
24
- EmbedAuthError,
25
- EmbedClient,
26
- EmbedHTTPError,
27
- EmbedProvider,
28
- resolve_provider,
29
- )
30
-
31
-
32
- # ----------------------------------------------------------------------
33
- # Helpers — stub httpx so we can assert the request shape.
34
- # ----------------------------------------------------------------------
35
-
36
- class _FakeResponse:
37
- def __init__(self, status_code: int, payload: dict | str = ""):
38
- self.status_code = status_code
39
- if isinstance(payload, dict):
40
- self._json = payload
41
- self.text = json.dumps(payload)
42
- else:
43
- self._json = None
44
- self.text = payload
45
-
46
- @property
47
- def is_success(self) -> bool:
48
- return 200 <= self.status_code < 300
49
-
50
- def json(self) -> dict:
51
- if self._json is None:
52
- raise ValueError("not json")
53
- return self._json
54
-
55
-
56
- class _Recorder:
57
- """Records every httpx.post call and returns canned responses keyed by URL."""
58
-
59
- def __init__(self):
60
- self.calls: list[dict] = []
61
- self.responses: dict[str, _FakeResponse] = {}
62
-
63
- def respond(self, url: str, response: _FakeResponse) -> None:
64
- self.responses[url] = response
65
-
66
- def __call__(self, url, *, json, headers, timeout):
67
- self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
68
- if url in self.responses:
69
- return self.responses[url]
70
- # default: 401 to flush out unmatched URLs
71
- return _FakeResponse(401, "no stub for this url")
72
-
73
-
74
@pytest.fixture
def recorder(monkeypatch):
    """Install a fresh ``_Recorder`` as ``httpx.post`` and hand it to the test."""
    fake_post = _Recorder()
    monkeypatch.setattr(httpx, "post", fake_post)
    return fake_post
79
-
80
-
81
- # ----------------------------------------------------------------------
82
- # Provider resolution
83
- # ----------------------------------------------------------------------
84
-
85
def test_resolve_built_in_providers():
    """Each built-in provider name resolves to a provider carrying that name."""
    built_in_names = ("openai", "pentatonic-gateway", "cohere")
    for expected_name in built_in_names:
        assert resolve_provider(expected_name).name == expected_name
89
-
90
-
91
def test_resolve_unknown_provider_raises():
    """An unrecognised provider name is rejected with ValueError."""
    unknown_name = "not-a-provider"
    with pytest.raises(ValueError):
        resolve_provider(unknown_name)
94
-
95
-
96
def test_resolve_custom_provider_from_env(monkeypatch):
    """A "custom" provider takes its settings from the ``L4_EMBED_*`` variables."""
    overrides = {
        "L4_EMBED_AUTH_HEADER": "X-Custom-Auth",
        "L4_EMBED_AUTH_FORMAT": "Token {key}",
        "L4_EMBED_PATH_DEFAULT": "/embed",
        "L4_EMBED_BODY_SHAPE": "cohere",
        "L4_EMBED_RESPONSE_SHAPE": "cohere",
    }
    for var_name, var_value in overrides.items():
        monkeypatch.setenv(var_name, var_value)

    custom = resolve_provider("custom", env_prefix="L4_")

    assert custom.auth_header == "X-Custom-Auth"
    assert custom.auth_format == "Token {key}"
    assert custom.path_default == "/embed"
    # The "cohere" body shape yields a Cohere-style "texts" field.
    expected_body = {"texts": ["hi"], "model": "model-x", "input_type": "search_document"}
    assert custom.body_builder(["hi"], "model-x") == expected_body
109
-
110
-
111
- # ----------------------------------------------------------------------
112
- # Request shape
113
- # ----------------------------------------------------------------------
114
-
115
def test_openai_provider_request_shape(recorder):
    """The openai provider posts ``input``/``model`` with a Bearer auth header."""
    endpoint = "https://gw/v1/embeddings"
    recorder.respond(endpoint, _FakeResponse(200, {"data": [{"embedding": [0.1, 0.2]}]}))
    client = EmbedClient(
        url=endpoint,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
    )

    vectors = client.embed_batch(["hello"])

    assert vectors == [[0.1, 0.2]]
    sent = recorder.calls[0]
    assert sent["url"] == endpoint
    assert sent["json"] == {"input": ["hello"], "model": "m"}
    assert sent["headers"] == {"Authorization": "Bearer k"}
132
-
133
-
134
def test_pentatonic_provider_request_shape(recorder):
    """The pentatonic-gateway provider authenticates with an ``X-API-Key`` header."""
    endpoint = "https://lambda-gateway.pentatonic.com/v1/embed"
    recorder.respond(endpoint, _FakeResponse(200, {"data": [{"embedding": [1.0, 2.0]}]}))
    client = EmbedClient(
        url=endpoint,
        api_key="secret",
        model="nv-embed-v2",
        provider=PROVIDERS["pentatonic-gateway"],
    )

    vectors = client.embed_batch(["t1"])

    assert vectors == [[1.0, 2.0]]
    sent = recorder.calls[0]
    assert sent["url"] == endpoint
    assert sent["json"] == {"input": ["t1"], "model": "nv-embed-v2"}
    assert sent["headers"] == {"X-API-Key": "secret"}
151
-
152
-
153
def test_pentatonic_response_parser_handles_both_shapes(recorder):
    """The pentatonic-gateway parser accepts both response shapes.

    The gateway has historically answered with {"data":[...]} on some
    endpoints and {"embeddings":[...]} on others, so either must parse.
    """
    parse = PROVIDERS["pentatonic-gateway"].response_parser
    data_shaped = {"data": [{"embedding": [1.0]}]}
    embeddings_shaped = {"embeddings": [[1.0]]}
    assert parse(data_shaped) == [[1.0]]
    assert parse(embeddings_shaped) == [[1.0]]
159
-
160
-
161
def test_cohere_provider_request_shape(recorder):
    """The cohere provider posts ``texts``/``model``/``input_type`` with Bearer auth."""
    endpoint = "https://api.cohere.ai/v1/embed"
    recorder.respond(endpoint, _FakeResponse(200, {"embeddings": [[3.0, 4.0]]}))
    client = EmbedClient(
        url=endpoint,
        api_key="cohere-key",
        model="embed-english-v3.0",
        provider=PROVIDERS["cohere"],
    )

    vectors = client.embed_batch(["hi"])

    assert vectors == [[3.0, 4.0]]
    sent = recorder.calls[0]
    expected_body = {
        "texts": ["hi"],
        "model": "embed-english-v3.0",
        "input_type": "search_document",
    }
    assert sent["json"] == expected_body
    assert sent["headers"] == {"Authorization": "Bearer cohere-key"}
181
-
182
-
183
- # ----------------------------------------------------------------------
184
- # Auto-detect
185
- # ----------------------------------------------------------------------
186
-
187
def test_autodetect_on_401_falls_back_to_pentatonic(recorder):
    """Auto-detect recovers when the configured provider is the wrong one.

    The client is configured as ``openai`` although the URL and key really
    belong to the Pentatonic Gateway: the first request gets a 401, the
    client then probes the pentatonic shape, and that probe succeeds.
    """
    configured_url = "https://lambda-gateway.pentatonic.com/v1/embeddings"
    gateway_url = "https://lambda-gateway.pentatonic.com/v1/embed"
    recorder.respond(
        configured_url,
        _FakeResponse(401, '{"error":"Invalid or missing API key"}'),
    )
    recorder.respond(gateway_url, _FakeResponse(200, {"data": [{"embedding": [9.0]}]}))
    client = EmbedClient(
        url=configured_url,
        api_key="k",
        model="nv-embed-v2",
        provider=PROVIDERS["openai"],
    )

    vectors = client.embed_batch(["x"])

    assert vectors == [[9.0]]
    assert client.active_provider == "pentatonic-gateway"
    # Request 0 carries the configured (openai) auth header, request 1 the
    # pentatonic one.
    attempts = recorder.calls
    assert attempts[0]["headers"] == {"Authorization": "Bearer k"}
    assert attempts[1]["headers"] == {"X-API-Key": "k"}
211
-
212
-
213
def test_autodetect_caches_after_first_success(recorder):
    """The provider picked by auto-detect is reused on later calls, which go
    straight to it instead of repeating the request that got the 401."""
    rejected_url = "https://gw/v1/embeddings"
    accepted_url = "https://gw/v1/embed"
    recorder.respond(rejected_url, _FakeResponse(401, "wrong scheme"))
    recorder.respond(accepted_url, _FakeResponse(200, {"data": [{"embedding": [1.0]}]}))
    client = EmbedClient(
        url=rejected_url,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
    )

    client.embed_batch(["a"])  # this call triggers detection
    calls_after_detection = len(recorder.calls)
    client.embed_batch(["b"])  # expected to hit /v1/embed directly

    assert len(recorder.calls) == calls_after_detection + 1
    assert recorder.calls[-1]["url"] == accepted_url
235
-
236
-
237
def test_autodetect_disabled_raises(recorder):
    """With ``autodetect=False`` a 401 surfaces as EmbedAuthError without probing."""
    endpoint = "https://gw/v1/embeddings"
    recorder.respond(endpoint, _FakeResponse(401, "no auth"))
    client = EmbedClient(
        url=endpoint,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
        autodetect=False,
    )

    with pytest.raises(EmbedAuthError):
        client.embed_batch(["x"])

    # Exactly one request was made, i.e. no other provider was probed.
    assert len(recorder.calls) == 1
250
-
251
-
252
def test_autodetect_all_fail_raises(recorder):
    """EmbedAuthError is raised when every candidate also answers 401."""
    configured_url = "https://gw/v1/embeddings"
    for unauthorized_url in (configured_url, "https://gw/v1/embed"):
        recorder.respond(unauthorized_url, _FakeResponse(401, "x"))
    client = EmbedClient(
        url=configured_url,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
    )

    with pytest.raises(EmbedAuthError):
        client.embed_batch(["x"])
264
-
265
-
266
- # ----------------------------------------------------------------------
267
- # Error handling
268
- # ----------------------------------------------------------------------
269
-
270
def test_non_401_http_error_does_not_trigger_autodetect(recorder):
    """A 503 is reported as EmbedHTTPError after a single request (no probing)."""
    endpoint = "https://gw/v1/embeddings"
    recorder.respond(endpoint, _FakeResponse(503, "upstream down"))
    # max_retries=0 keeps this test about auto-detect only: with retries
    # enabled (the default) a 503 goes down the retry path, which the retry
    # tests exercise separately.
    client = EmbedClient(
        url=endpoint,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
        max_retries=0,
    )

    with pytest.raises(EmbedHTTPError) as caught:
        client.embed_batch(["x"])

    assert caught.value.status == 503
    assert len(recorder.calls) == 1
289
-
290
-
291
def test_empty_input_returns_empty(recorder):
    """An empty batch yields an empty result and sends no request."""
    client = EmbedClient(
        url="https://gw/v1/embeddings",
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
    )

    vectors = client.embed_batch([])

    assert vectors == []
    assert recorder.calls == []
300
-
301
-
302
- # ----------------------------------------------------------------------
303
- # from_env construction
304
- # ----------------------------------------------------------------------
305
-
306
def test_from_env_reads_layer_prefix(monkeypatch, recorder):
    """``from_env(prefix="L4_")`` builds the client from the ``L4_*`` variables."""
    endpoint = "https://lambda-gateway.pentatonic.com/v1/embed"
    layer_env = {
        "L4_NV_EMBED_URL": endpoint,
        "L4_EMBED_API_KEY": "real-key",
        "L4_EMBED_MODEL": "nv-embed-v2",
        "L4_EMBED_PROVIDER": "pentatonic-gateway",
    }
    for var_name, var_value in layer_env.items():
        monkeypatch.setenv(var_name, var_value)
    recorder.respond(endpoint, _FakeResponse(200, {"data": [{"embedding": [42.0]}]}))

    client = EmbedClient.from_env(prefix="L4_")
    vectors = client.embed_batch(["t"])

    assert vectors == [[42.0]]
    assert client.active_provider == "pentatonic-gateway"
    assert recorder.calls[0]["headers"] == {"X-API-Key": "real-key"}
320
-
321
-
322
def test_from_env_default_provider_is_openai(monkeypatch):
    """Without an ``L5_EMBED_PROVIDER`` setting the client reports ``openai``."""
    for var_name, var_value in (
        ("L5_NV_EMBED_URL", "https://gw/v1/embeddings"),
        ("L5_EMBED_API_KEY", "k"),
    ):
        monkeypatch.setenv(var_name, var_value)

    client = EmbedClient.from_env(prefix="L5_")

    assert client.active_provider == "openai"
327
-
328
-
329
def test_from_env_autodetect_opt_out(monkeypatch, recorder):
    """``L4_EMBED_AUTODETECT=false`` turns probing off for a ``from_env`` client."""
    endpoint = "https://gw/v1/embeddings"
    for var_name, var_value in (
        ("L4_NV_EMBED_URL", endpoint),
        ("L4_EMBED_API_KEY", "k"),
        ("L4_EMBED_AUTODETECT", "false"),
    ):
        monkeypatch.setenv(var_name, var_value)
    recorder.respond(endpoint, _FakeResponse(401, "x"))

    client = EmbedClient.from_env(prefix="L4_")
    with pytest.raises(EmbedAuthError):
        client.embed_batch(["x"])

    # A single recorded request means no fallback provider was tried.
    assert len(recorder.calls) == 1
338
-
339
-
340
- # ----------------------------------------------------------------------
341
- # URL handling
342
- # ----------------------------------------------------------------------
343
-
344
def test_url_without_path_gets_provider_default(recorder):
    """When the operator supplies only a base URL, the provider's
    ``path_default`` is appended to it."""
    base_url = "https://lambda-gateway.pentatonic.com"
    full_url = "https://lambda-gateway.pentatonic.com/v1/embed"
    recorder.respond(full_url, _FakeResponse(200, {"data": [{"embedding": [0.0]}]}))
    client = EmbedClient(
        url=base_url,
        api_key="k",
        model="m",
        provider=PROVIDERS["pentatonic-gateway"],
    )

    client.embed_batch(["x"])

    assert recorder.calls[0]["url"] == full_url
359
-
360
-
361
- # ----------------------------------------------------------------------
362
- # Chunking — work around the Pentatonic AI Gateway's per-call cap of 5
363
- # texts. Above the cap the gateway 502s; without chunking the layer's
364
- # /index-batch handler raises, the compat shim swallows it, and vector
365
- # writes silently drop. Chunking splits the request into chunks of
366
- # `max_batch` so each call stays within the gateway's limit.
367
- # ----------------------------------------------------------------------
368
-
369
-
370
class _PentatonicEchoStub:
    """``httpx.post`` replacement that answers with one embedding per input text.

    Every returned vector holds the running index of its input counted across
    all calls, so tests can check that order survives chunking.
    """

    def __init__(self):
        self.calls: list[dict] = []
        self._offset = 0  # number of inputs seen so far, across calls

    def __call__(self, url, *, json, headers, timeout):
        """Log the request and reply 200 with ``{"embeddings": [...]}``."""
        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
        batch_size = len(json.get("input") or [])
        first_index = self._offset
        self._offset = first_index + batch_size
        vectors = [[float(index)] for index in range(first_index, self._offset)]
        return _FakeResponse(200, {"embeddings": vectors})
385
-
386
-
387
def test_chunking_below_max_batch_makes_one_call(monkeypatch):
    """A batch no larger than ``max_batch`` goes out as a single request."""
    echo = _PentatonicEchoStub()
    monkeypatch.setattr(httpx, "post", echo)
    client = EmbedClient(
        url="https://lambda-gateway.pentatonic.com/v1/embed",
        api_key="k",
        model="m",
        provider=PROVIDERS["pentatonic-gateway"],
        max_batch=5,
    )
    texts = [f"t{i}" for i in range(5)]

    vectors = client.embed_batch(texts)

    assert len(vectors) == 5
    assert len(echo.calls) == 1
    assert len(echo.calls[0]["json"]["input"]) == 5
401
-
402
-
403
def test_chunking_above_max_batch_splits_into_calls(monkeypatch):
    """A batch larger than ``max_batch`` is sent as ceil(N / max_batch) posts
    whose results are concatenated in input order, so the caller can't tell."""
    echo = _PentatonicEchoStub()
    monkeypatch.setattr(httpx, "post", echo)
    client = EmbedClient(
        url="https://lambda-gateway.pentatonic.com/v1/embed",
        api_key="k",
        model="m",
        provider=PROVIDERS["pentatonic-gateway"],
        max_batch=5,
    )
    texts = [f"t{i}" for i in range(12)]

    vectors = client.embed_batch(texts)

    # 12 texts → chunks of [5, 5, 2] → 3 calls
    assert len(echo.calls) == 3
    chunk_sizes = [len(c["json"]["input"]) for c in echo.calls]
    assert chunk_sizes == [5, 5, 2]
    # The stub returns one vector per input, each encoding that input's
    # index across chunks, so equality here proves order was preserved.
    assert len(vectors) == 12
    assert vectors == [[float(i)] for i in range(12)]
422
-
423
-
424
def test_chunking_disabled_with_max_batch_zero(monkeypatch):
    """``max_batch=0`` disables chunking: the old behaviour of one big call."""
    echo = _PentatonicEchoStub()
    monkeypatch.setattr(httpx, "post", echo)
    client = EmbedClient(
        url="https://lambda-gateway.pentatonic.com/v1/embed",
        api_key="k",
        model="m",
        provider=PROVIDERS["pentatonic-gateway"],
        max_batch=0,
    )
    texts = [f"t{i}" for i in range(20)]

    client.embed_batch(texts)

    assert len(echo.calls) == 1
    assert len(echo.calls[0]["json"]["input"]) == 20
437
-
438
-
439
def test_chunking_propagates_first_error(recorder):
    """A failing chunk (e.g. a gateway 502) makes the whole call raise.

    This matches the un-chunked semantics. No partial vector list is
    returned, because the caller's downstream ``for r, emb, txt in zip(...)``
    loop would silently drop the failed records.
    """
    endpoint = "https://lambda-gateway.pentatonic.com/v1/embed"
    # The gateway answers 502 on every call (simulates the real bug).
    recorder.respond(endpoint, _FakeResponse(502, "<html>...bad gateway...</html>"))
    client = EmbedClient(
        url=endpoint,
        api_key="k",
        model="m",
        provider=PROVIDERS["pentatonic-gateway"],
        max_batch=5,
    )
    texts = [f"t{i}" for i in range(8)]

    with pytest.raises(EmbedHTTPError) as caught:
        client.embed_batch(texts)

    assert caught.value.status == 502
458
-
459
-
460
class _OpenAIEchoStub:
    """OpenAI-shaped ``httpx.post`` replacement.

    Replies with one ``[0.0]`` embedding per input, wrapped as
    {data: [{embedding: [...]}]}.
    """

    def __init__(self):
        self.calls: list[dict] = []

    def __call__(self, url, *, json, headers, timeout):
        """Log the request and reply 200 with one entry per input text."""
        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
        batch_size = len(json.get("input") or [])
        entries = [{"embedding": [0.0]} for _ in range(batch_size)]
        return _FakeResponse(200, {"data": entries})
471
-
472
-
473
def test_from_env_reads_max_batch(monkeypatch):
    """{prefix}EMBED_MAX_BATCH overrides the default of 5."""
    for var_name, var_value in (
        ("L4_NV_EMBED_URL", "https://gw/v1/embeddings"),
        ("L4_EMBED_API_KEY", "k"),
        ("L4_EMBED_MAX_BATCH", "3"),
    ):
        monkeypatch.setenv(var_name, var_value)
    echo = _OpenAIEchoStub()
    monkeypatch.setattr(httpx, "post", echo)

    client = EmbedClient.from_env(prefix="L4_")
    client.embed_batch([f"t{i}" for i in range(7)])

    # 7 with chunk=3 → [3, 3, 1] → 3 calls
    assert len(echo.calls) == 3
    chunk_sizes = [len(c["json"]["input"]) for c in echo.calls]
    assert chunk_sizes == [3, 3, 1]
485
-
486
-
487
def test_from_env_default_max_batch_is_five(monkeypatch):
    """With no EMBED_MAX_BATCH set, batches are chunked by 5, which matches
    the observed Pentatonic Gateway cap."""
    for var_name, var_value in (
        ("L4_NV_EMBED_URL", "https://gw/v1/embeddings"),
        ("L4_EMBED_API_KEY", "k"),
    ):
        monkeypatch.setenv(var_name, var_value)
    echo = _OpenAIEchoStub()
    monkeypatch.setattr(httpx, "post", echo)

    client = EmbedClient.from_env(prefix="L4_")
    client.embed_batch([f"t{i}" for i in range(10)])

    # 10 with default chunk=5 → [5, 5] → 2 calls
    assert len(echo.calls) == 2
497
-
498
-
499
- # ----------------------------------------------------------------------
500
- # Retry-with-jitter on transient gateway saturation (502/503/504/429)
501
- # ----------------------------------------------------------------------
502
- #
503
- # These tests exercise the retry path added 2026-05-15. Motivation:
504
- # the Pentatonic AI Gateway has a K≈10 concurrency cap and 502s under
505
- # saturation; without retry, a single 502 cascades through the engine's
506
- # per-layer fallback path and amplifies load instead of damping it.
507
- # See the prod incident note on EmbedClient.__init__ for context.
508
-
509
-
510
class _SequencedRecorder:
    """Stand-in for ``httpx.post`` that replays a scripted list of responses.

    Retry tests need "first call fails, a later call succeeds", so each URL
    gets its own ordered list of responses. Every call is logged in
    ``calls``. Responses are handed out front-to-back; the final response is
    never removed, so it is returned again on every further call (which lets
    "every attempt fails" tests queue a single response). A URL with nothing
    queued gets a 401 response with the body "no responses queued".
    """

    def __init__(self):
        # One {"url", "json"} record per call, in call order.
        self.calls: list[dict] = []
        # Remaining scripted responses, keyed by request URL.
        self.queues: dict[str, list[_FakeResponse]] = {}

    def queue(self, url: str, responses: list[_FakeResponse]) -> None:
        """Replace the script for *url* with a copy of *responses*."""
        self.queues[url] = list(responses)

    def __call__(self, url, *, json, headers, timeout):
        """Log the call and return the next scripted response for *url*."""
        self.calls.append({"url": url, "json": json})
        pending = self.queues.get(url)
        if not pending:
            return _FakeResponse(401, "no responses queued")
        if len(pending) == 1:
            # Keep the tail in place so it repeats on every further call.
            return pending[0]
        return pending.pop(0)
535
-
536
-
537
@pytest.fixture
def sequenced(monkeypatch):
    """Provide a ``_SequencedRecorder`` installed in place of ``httpx.post``.

    ``time.sleep`` is replaced with a no-op as well, so backoff delays cost no
    wall-clock time; only the actual sleeping is skipped.
    """
    recorder = _SequencedRecorder()
    monkeypatch.setattr(httpx, "post", recorder)
    monkeypatch.setattr("time.sleep", lambda _seconds: None)
    return recorder
547
-
548
-
549
def test_retries_on_502_and_succeeds(sequenced):
    """A 502 followed by a 200 yields the embedding after exactly two calls."""
    endpoint = "https://gw/v1/embeddings"
    bad_gateway = _FakeResponse(502, "bad gateway")
    success = _FakeResponse(200, {"data": [{"embedding": [0.1, 0.2]}]})
    sequenced.queue(endpoint, [bad_gateway, success])

    embedder = EmbedClient(
        url=endpoint,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
        max_retries=3,
    )
    vectors = embedder.embed_batch(["hello"])

    assert vectors == [[0.1, 0.2]]
    # One failed attempt plus one successful retry.
    assert len(sequenced.calls) == 2
568
-
569
-
570
def test_retries_on_503_504_429(sequenced):
    """503, 504 and 429 each behave alike: one failure, then a successful retry."""
    endpoint = "https://gw/v1/embeddings"
    for code in (503, 504, 429):
        # The recorder is shared across iterations; reset its log so the
        # call count below is per status code.
        sequenced.calls.clear()
        transient = _FakeResponse(code, "transient")
        recovered = _FakeResponse(200, {"data": [{"embedding": [0.0]}]})
        sequenced.queue(endpoint, [transient, recovered])

        embedder = EmbedClient(
            url=endpoint,
            api_key="k",
            model="m",
            provider=PROVIDERS["openai"],
            max_retries=3,
        )
        vectors = embedder.embed_batch(["x"])

        assert vectors == [[0.0]], f"retry failed for status {code}"
        assert len(sequenced.calls) == 2, f"wrong call count for status {code}"
591
-
592
-
593
def test_does_not_retry_on_500(sequenced):
    """A 500 is treated as a server bug, not saturation: it fails on the first call."""
    endpoint = "https://gw/v1/embeddings"
    sequenced.queue(endpoint, [_FakeResponse(500, "internal server error")])
    embedder = EmbedClient(
        url=endpoint,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
        max_retries=3,
    )

    with pytest.raises(EmbedHTTPError) as raised:
        embedder.embed_batch(["x"])

    assert raised.value.status == 500
    # A single recorded call means the retry path was never entered.
    assert len(sequenced.calls) == 1
611
-
612
-
613
def test_does_not_retry_on_400(sequenced):
    """A 400 signals caller error (unlike 401-autodetect / 429) and is not retried."""
    endpoint = "https://gw/v1/embeddings"
    sequenced.queue(endpoint, [_FakeResponse(400, "bad request")])
    embedder = EmbedClient(
        url=endpoint,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
        max_retries=3,
    )

    with pytest.raises(EmbedHTTPError) as raised:
        embedder.embed_batch(["x"])

    assert raised.value.status == 400
    assert len(sequenced.calls) == 1
630
-
631
-
632
def test_max_retries_exhausted_raises(sequenced):
    """A 502 that never clears raises once max_retries+1 attempts are spent."""
    endpoint = "https://gw/v1/embeddings"
    retry_budget = 3
    # A single queued response is replayed on every call by the recorder.
    sequenced.queue(endpoint, [_FakeResponse(502, "still down")])
    embedder = EmbedClient(
        url=endpoint,
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
        max_retries=retry_budget,
    )

    with pytest.raises(EmbedHTTPError) as raised:
        embedder.embed_batch(["x"])

    assert raised.value.status == 502
    # The original attempt plus every permitted retry: 1 + 3 = 4 calls.
    assert len(sequenced.calls) == retry_budget + 1
650
-
651
-
652
def test_max_retries_zero_disables_retry(sequenced):
    """Explicit opt-out preserves pre-fix behaviour for callers that
    handle their own retry.

    With ``max_retries=0`` a 502 must surface after a single attempt.
    """
    sequenced.queue(
        "https://gw/v1/embeddings",
        [_FakeResponse(502, "down")],
    )
    client = EmbedClient(
        url="https://gw/v1/embeddings",
        api_key="k",
        model="m",
        provider=PROVIDERS["openai"],
        max_retries=0,
    )
    with pytest.raises(EmbedHTTPError) as exc:
        client.embed_batch(["x"])
    # Check the surfaced status as the sibling failure tests do, so an
    # unrelated EmbedHTTPError cannot satisfy this test.
    assert exc.value.status == 502
    # Exactly one attempt — retries are disabled.
    assert len(sequenced.calls) == 1
669
-
670
-
671
def test_from_env_reads_retry_config(monkeypatch):
    """{prefix}EMBED_MAX_RETRIES, EMBED_RETRY_BASE_DELAY and
    EMBED_RETRY_MAX_DELAY take precedence over the built-in defaults."""
    environment = {
        "L4_NV_EMBED_URL": "https://gw/v1/embeddings",
        "L4_EMBED_API_KEY": "k",
        "L4_EMBED_MAX_RETRIES": "5",
        "L4_EMBED_RETRY_BASE_DELAY": "0.25",
        "L4_EMBED_RETRY_MAX_DELAY": "2.5",
    }
    for name, value in environment.items():
        monkeypatch.setenv(name, value)

    embedder = EmbedClient.from_env(prefix="L4_")

    assert embedder._max_retries == 5
    assert embedder._retry_base_delay == 0.25
    assert embedder._retry_max_delay == 2.5
683
-
684
-
685
def test_from_env_default_retry_config(monkeypatch):
    """Defaults: 3 retries, 100ms base, 1s cap — tuned for K≈10
    gateway under burst load.

    Only the URL and API key are set; the retry settings must come from the
    client's built-in defaults.
    """
    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
    # This test is about the *defaults*: remove any override inherited from
    # the surrounding shell or CI environment so it cannot mask them.
    for override in (
        "L4_EMBED_MAX_RETRIES",
        "L4_EMBED_RETRY_BASE_DELAY",
        "L4_EMBED_RETRY_MAX_DELAY",
    ):
        monkeypatch.delenv(override, raising=False)

    client = EmbedClient.from_env(prefix="L4_")

    assert client._max_retries == 3
    assert client._retry_base_delay == 0.1
    assert client._retry_max_delay == 1.0