@geravant/sinain 1.12.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/.env.example +4 -2
  2. package/config-shared.js +1 -0
  3. package/package.json +4 -1
  4. package/sinain-agent/run.sh +36 -4
  5. package/sinain-core/package-lock.json +963 -0
  6. package/sinain-core/package.json +1 -0
  7. package/sinain-core/src/buffers/feed-buffer.ts +34 -0
  8. package/sinain-core/src/embedding/service.ts +66 -0
  9. package/sinain-core/src/index.ts +65 -17
  10. package/sinain-core/src/learning/local-curation.ts +137 -7
  11. package/sinain-core/src/server.ts +31 -0
  12. package/sinain-memory/README.md +105 -0
  13. package/sinain-memory/embed_client.py +117 -0
  14. package/sinain-memory/graph_query.py +269 -18
  15. package/sinain-memory/knowledge_integrator.py +551 -74
  16. package/sinain-memory/memory-config.json +1 -1
  17. package/sinain-memory/session_distiller.py +43 -19
  18. package/sinain-memory/triplestore.py +60 -0
  19. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  20. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  21. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  22. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  23. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  24. package/sinain-memory/eval/__init__.py +0 -0
  25. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/assertions.py +0 -267
  27. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  31. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  32. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  33. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  34. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  35. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  38. package/sinain-memory/eval/benchmarks/config.py +0 -23
  39. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  40. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  41. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  42. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  43. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  44. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  45. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  46. package/sinain-memory/eval/benchmarks/query.py +0 -172
  47. package/sinain-memory/eval/benchmarks/report.py +0 -87
  48. package/sinain-memory/eval/benchmarks/runner.py +0 -276
  49. package/sinain-memory/eval/judges/__init__.py +0 -0
  50. package/sinain-memory/eval/judges/base_judge.py +0 -61
  51. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  52. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  53. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  54. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  55. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  56. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  57. package/sinain-memory/eval/schemas.py +0 -247
  58. package/sinain-memory/tests/__init__.py +0 -0
  59. package/sinain-memory/tests/conftest.py +0 -189
  60. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  61. package/sinain-memory/tests/test_embedder.py +0 -210
  62. package/sinain-memory/tests/test_extract_json.py +0 -124
  63. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  64. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  65. package/sinain-memory/tests/test_module_management.py +0 -458
  66. package/sinain-memory/tests/test_parsers.py +0 -96
  67. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  68. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  69. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  70. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  71. package/sinain-memory/tests/test_triplestore.py +0 -248
@@ -1,96 +0,0 @@
1
- """Tests for common.py parser functions: parse_module_stack, parse_mining_index, parse_effectiveness."""
2
-
3
- from common import parse_module_stack, parse_mining_index, parse_effectiveness
4
-
5
-
6
- class TestParseModuleStack:
7
- def test_standard_stack(self):
8
- text = "<!-- module-stack: react-native-dev(85), ocr-pipeline(70) -->\nplaybook body"
9
- result = parse_module_stack(text)
10
- assert len(result) == 2
11
- assert result[0] == {"id": "react-native-dev", "priority": 85}
12
- assert result[1] == {"id": "ocr-pipeline", "priority": 70}
13
-
14
- def test_sorted_by_priority_desc(self):
15
- text = "<!-- module-stack: low(10), high(90), mid(50) -->"
16
- result = parse_module_stack(text)
17
- assert result[0]["id"] == "high"
18
- assert result[1]["id"] == "mid"
19
- assert result[2]["id"] == "low"
20
-
21
- def test_single_module(self):
22
- text = "<!-- module-stack: only-one(42) -->"
23
- result = parse_module_stack(text)
24
- assert len(result) == 1
25
- assert result[0] == {"id": "only-one", "priority": 42}
26
-
27
- def test_no_priority_parentheses(self):
28
- text = "<!-- module-stack: bare-module -->"
29
- result = parse_module_stack(text)
30
- assert len(result) == 1
31
- assert result[0] == {"id": "bare-module", "priority": 0}
32
-
33
- def test_absent_comment(self):
34
- text = "Just a regular playbook with no module-stack comment"
35
- assert parse_module_stack(text) == []
36
-
37
- def test_empty_stack(self):
38
- text = "<!-- module-stack: -->"
39
- assert parse_module_stack(text) == []
40
-
41
-
42
- class TestParseMiningIndex:
43
- def test_standard_index(self):
44
- text = "<!-- mining-index: 2026-02-21,2026-02-20,2026-02-19 -->"
45
- result = parse_mining_index(text)
46
- assert result == ["2026-02-21", "2026-02-20", "2026-02-19"]
47
-
48
- def test_single_date(self):
49
- text = "<!-- mining-index: 2026-02-21 -->"
50
- result = parse_mining_index(text)
51
- assert result == ["2026-02-21"]
52
-
53
- def test_empty_index(self):
54
- text = "<!-- mining-index: -->"
55
- result = parse_mining_index(text)
56
- assert result == []
57
-
58
- def test_absent_comment(self):
59
- text = "No mining index here"
60
- assert parse_mining_index(text) == []
61
-
62
- def test_extra_whitespace(self):
63
- text = "<!-- mining-index: 2026-02-21 , 2026-02-20 -->"
64
- result = parse_mining_index(text)
65
- assert result == ["2026-02-21", "2026-02-20"]
66
-
67
-
68
- class TestParseEffectiveness:
69
- def test_standard_metrics(self):
70
- text = "<!-- effectiveness: outputs=8,positive=5,negative=1,neutral=2,rate=0.63,updated=2026-02-21 -->"
71
- result = parse_effectiveness(text)
72
- assert result is not None
73
- assert result["outputs"] == 8
74
- assert result["positive"] == 5
75
- assert result["rate"] == 0.63
76
- assert result["updated"] == "2026-02-21"
77
-
78
- def test_absent_comment(self):
79
- assert parse_effectiveness("No effectiveness comment") is None
80
-
81
- def test_integer_conversion(self):
82
- text = "<!-- effectiveness: outputs=10 -->"
83
- result = parse_effectiveness(text)
84
- assert result["outputs"] == 10
85
- assert isinstance(result["outputs"], int)
86
-
87
- def test_float_conversion(self):
88
- text = "<!-- effectiveness: rate=0.75 -->"
89
- result = parse_effectiveness(text)
90
- assert result["rate"] == 0.75
91
- assert isinstance(result["rate"], float)
92
-
93
- def test_string_value(self):
94
- text = "<!-- effectiveness: updated=2026-02-21 -->"
95
- result = parse_effectiveness(text)
96
- assert result["updated"] == "2026-02-21"
@@ -1,430 +0,0 @@
1
- """Tests for tick_evaluator.py + eval/schemas.py + eval/assertions.py + common retry."""
2
-
3
- import json
4
- from pathlib import Path
5
- from unittest.mock import patch, call
6
-
7
- from common import LLMError, call_llm_with_fallback
8
- from eval.schemas import validate, SCHEMA_REGISTRY
9
- from eval.assertions import (
10
- assert_playbook_under_limit,
11
- assert_curator_respected_directive,
12
- assert_no_repeat_action,
13
- assert_signal_confidence_threshold,
14
- assert_insight_char_limit,
15
- assert_skip_reason_specific,
16
- assert_miner_references_sources,
17
- assert_playbook_header_footer_intact,
18
- assert_schema_valid,
19
- run_tick_assertions,
20
- )
21
- from tick_evaluator import validate_tick_schemas, load_eval_config
22
-
23
-
24
- # ═══════════════════════════════════════════════════════════════════════════
25
- # Schema validation tests
26
- # ═══════════════════════════════════════════════════════════════════════════
27
-
28
- class TestSchemaValidation:
29
- def test_valid_signal_analyzer(self):
30
- data = {"signals": [{"description": "signal1", "priority": "high"}], "recommendedAction": None, "idle": False}
31
- errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
32
- assert errors == []
33
-
34
- def test_valid_signal_with_action(self):
35
- data = {
36
- "signals": [{"description": "sig", "priority": "medium"}],
37
- "recommendedAction": {"action": "sessions_spawn", "task": "debug", "confidence": 0.8},
38
- "idle": False,
39
- }
40
- errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
41
- assert errors == []
42
-
43
- def test_valid_signal_with_null_task(self):
44
- data = {
45
- "signals": [{"description": "idle detected", "priority": "low"}],
46
- "recommendedAction": {"action": "skip", "task": None, "confidence": 0.9},
47
- "idle": True,
48
- }
49
- errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
50
- assert errors == []
51
-
52
- def test_invalid_signal_missing_required(self):
53
- data = {"signals": [{"description": "sig", "priority": "high"}]} # missing idle, recommendedAction
54
- errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
55
- assert any("idle" in e for e in errors)
56
-
57
- def test_valid_feedback_analyzer(self):
58
- data = {
59
- "feedbackScores": {"avg": 0.5, "high": ["a"], "low": ["b"]},
60
- "effectiveness": {"outputs": 10, "positive": 7, "negative": 1, "neutral": 2, "rate": 0.7},
61
- "curateDirective": "normal",
62
- "interpretation": "Good patterns",
63
- }
64
- errors = validate(data, SCHEMA_REGISTRY["feedback_analyzer"])
65
- assert errors == []
66
-
67
- def test_invalid_directive_value(self):
68
- data = {
69
- "feedbackScores": {"avg": 0.5},
70
- "effectiveness": {"outputs": 0, "positive": 0, "negative": 0, "neutral": 0, "rate": 0.0},
71
- "curateDirective": "invalid_value",
72
- }
73
- errors = validate(data, SCHEMA_REGISTRY["feedback_analyzer"])
74
- assert any("curateDirective" in e or "not in" in e for e in errors)
75
-
76
- def test_valid_memory_miner(self):
77
- data = {
78
- "findings": "Found patterns",
79
- "newPatterns": ["pattern1"],
80
- "minedSources": ["2026-02-21.md"],
81
- }
82
- errors = validate(data, SCHEMA_REGISTRY["memory_miner"])
83
- assert errors == []
84
-
85
- def test_valid_playbook_curator(self):
86
- data = {
87
- "changes": {"added": ["new"], "pruned": ["old"], "promoted": []},
88
- "staleItemActions": [],
89
- "playbookLines": 25,
90
- }
91
- errors = validate(data, SCHEMA_REGISTRY["playbook_curator"])
92
- assert errors == []
93
-
94
- def test_valid_insight_skip(self):
95
- data = {"skip": True, "skipReason": "No new patterns since last analysis"}
96
- errors = validate(data, SCHEMA_REGISTRY["insight_synthesizer"])
97
- assert errors == []
98
-
99
- def test_valid_insight_output(self):
100
- data = {
101
- "skip": False,
102
- "suggestion": "Try frame batching",
103
- "insight": "Evening correlates with exploration",
104
- "totalChars": 55,
105
- }
106
- errors = validate(data, SCHEMA_REGISTRY["insight_synthesizer"])
107
- assert errors == []
108
-
109
- def test_valid_insight_without_skip(self):
110
- data = {
111
- "suggestion": "Try frame batching",
112
- "insight": "Evening correlates with exploration",
113
- "totalChars": 55,
114
- }
115
- errors = validate(data, SCHEMA_REGISTRY["insight_synthesizer"])
116
- assert errors == []
117
-
118
- def test_confidence_out_of_range(self):
119
- data = {
120
- "signals": [],
121
- "recommendedAction": {"action": "sessions_spawn", "task": "x", "confidence": 1.5},
122
- "idle": False,
123
- }
124
- errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
125
- # oneOf fails because the object variant rejects confidence > 1.0
126
- assert len(errors) > 0
127
- assert any("oneOf" in e or "maximum" in e for e in errors)
128
-
129
- def test_negative_playbook_lines(self):
130
- data = {
131
- "changes": {"added": [], "pruned": [], "promoted": []},
132
- "playbookLines": -1,
133
- }
134
- errors = validate(data, SCHEMA_REGISTRY["playbook_curator"])
135
- assert any("minimum" in e for e in errors)
136
-
137
-
138
- # ═══════════════════════════════════════════════════════════════════════════
139
- # Assertion tests
140
- # ═══════════════════════════════════════════════════════════════════════════
141
-
142
- class TestPlaybookUnderLimit:
143
- def test_under_limit(self):
144
- r = assert_playbook_under_limit({"playbookLines": 30})
145
- assert r["passed"] is True
146
-
147
- def test_at_limit(self):
148
- r = assert_playbook_under_limit({"playbookLines": 50})
149
- assert r["passed"] is True
150
-
151
- def test_over_limit(self):
152
- r = assert_playbook_under_limit({"playbookLines": 55})
153
- assert r["passed"] is False
154
-
155
- def test_custom_limit(self):
156
- r = assert_playbook_under_limit({"playbookLines": 80}, limit=100)
157
- assert r["passed"] is True
158
-
159
-
160
- class TestCuratorRespectedDirective:
161
- def test_aggressive_prune_with_pruning(self):
162
- r = assert_curator_respected_directive(
163
- {"changes": {"added": [], "pruned": ["x"], "promoted": []}},
164
- "aggressive_prune",
165
- )
166
- assert r["passed"] is True
167
-
168
- def test_aggressive_prune_without_pruning(self):
169
- r = assert_curator_respected_directive(
170
- {"changes": {"added": ["new"], "pruned": [], "promoted": []}},
171
- "aggressive_prune",
172
- )
173
- assert r["passed"] is False
174
-
175
- def test_stability_conservative(self):
176
- r = assert_curator_respected_directive(
177
- {"changes": {"added": ["a"], "pruned": [], "promoted": []}},
178
- "stability",
179
- )
180
- assert r["passed"] is True
181
-
182
- def test_stability_too_aggressive(self):
183
- r = assert_curator_respected_directive(
184
- {"changes": {"added": [], "pruned": ["a", "b", "c", "d"], "promoted": []}},
185
- "stability",
186
- )
187
- assert r["passed"] is False
188
-
189
- def test_normal_anything_goes(self):
190
- r = assert_curator_respected_directive(
191
- {"changes": {"added": ["a", "b"], "pruned": ["c"], "promoted": []}},
192
- "normal",
193
- )
194
- assert r["passed"] is True
195
-
196
-
197
- class TestNoRepeatAction:
198
- def test_no_action(self):
199
- r = assert_no_repeat_action({"recommendedAction": None}, [])
200
- assert r["passed"] is True
201
-
202
- def test_skip_action(self):
203
- r = assert_no_repeat_action({"recommendedAction": {"action": "skip"}}, [])
204
- assert r["passed"] is True
205
-
206
- def test_distinct_action(self):
207
- recent = [{"actionsConsidered": [{"chosen": True, "reason": "Research Flutter overlays"}]}]
208
- r = assert_no_repeat_action(
209
- {"recommendedAction": {"action": "sessions_spawn", "task": "Debug OCR backpressure"}},
210
- recent,
211
- )
212
- assert r["passed"] is True
213
-
214
- def test_repeated_action(self):
215
- recent = [{"actionsConsidered": [{"chosen": True, "reason": "Debug OCR backpressure issue"}]}]
216
- r = assert_no_repeat_action(
217
- {"recommendedAction": {"action": "sessions_spawn", "task": "Debug OCR backpressure"}},
218
- recent,
219
- )
220
- assert r["passed"] is False
221
-
222
-
223
- class TestSignalConfidenceThreshold:
224
- def test_above_threshold(self):
225
- r = assert_signal_confidence_threshold(
226
- {"recommendedAction": {"action": "sessions_spawn", "confidence": 0.8}}
227
- )
228
- assert r["passed"] is True
229
-
230
- def test_below_threshold(self):
231
- r = assert_signal_confidence_threshold(
232
- {"recommendedAction": {"action": "sessions_spawn", "confidence": 0.3}}
233
- )
234
- assert r["passed"] is False
235
-
236
- def test_no_confidence(self):
237
- r = assert_signal_confidence_threshold(
238
- {"recommendedAction": {"action": "sessions_spawn"}}
239
- )
240
- assert r["passed"] is False
241
-
242
- def test_no_action(self):
243
- r = assert_signal_confidence_threshold({"recommendedAction": None})
244
- assert r["passed"] is True
245
-
246
-
247
- class TestInsightCharLimit:
248
- def test_under_limit(self):
249
- r = assert_insight_char_limit({"suggestion": "short", "insight": "also short"})
250
- assert r["passed"] is True
251
-
252
- def test_over_limit(self):
253
- r = assert_insight_char_limit({"suggestion": "x" * 300, "insight": "y" * 300})
254
- assert r["passed"] is False
255
-
256
- def test_skipped_output(self):
257
- r = assert_insight_char_limit({"skip": True})
258
- assert r["passed"] is True
259
-
260
-
261
- class TestSkipReasonSpecific:
262
- def test_specific_reason(self):
263
- r = assert_skip_reason_specific({
264
- "skip": True,
265
- "skipReason": "No new patterns detected in playbook since last tick; mining-index shows 2026-02-21 was already processed"
266
- })
267
- assert r["passed"] is True
268
-
269
- def test_generic_reason(self):
270
- r = assert_skip_reason_specific({"skip": True, "skipReason": "no new data"})
271
- assert r["passed"] is False
272
-
273
- def test_no_reason(self):
274
- r = assert_skip_reason_specific({"skip": True})
275
- assert r["passed"] is False
276
-
277
- def test_not_skipped(self):
278
- r = assert_skip_reason_specific({"skip": False})
279
- assert r["passed"] is True
280
-
281
-
282
- class TestMinerReferencesSources:
283
- def test_valid_sources(self):
284
- r = assert_miner_references_sources(
285
- {"minedSources": ["2026-02-21.md"]},
286
- ["2026-02-21.md", "2026-02-20.md"],
287
- )
288
- assert r["passed"] is True
289
-
290
- def test_unknown_sources(self):
291
- r = assert_miner_references_sources(
292
- {"minedSources": ["2026-02-21.md", "fake-file.md"]},
293
- ["2026-02-21.md"],
294
- )
295
- assert r["passed"] is False
296
-
297
- def test_no_sources(self):
298
- r = assert_miner_references_sources({"minedSources": []}, [])
299
- assert r["passed"] is True
300
-
301
-
302
- class TestPlaybookHeaderFooterIntact:
303
- def test_both_present(self):
304
- text = "<!-- mining-index: 2026-02-21 -->\nbody\n<!-- effectiveness: rate=0.5 -->"
305
- r = assert_playbook_header_footer_intact(text)
306
- assert r["passed"] is True
307
-
308
- def test_missing_header(self):
309
- r = assert_playbook_header_footer_intact("body\n<!-- effectiveness: rate=0.5 -->")
310
- assert r["passed"] is False
311
- assert "mining-index" in r["detail"]
312
-
313
- def test_missing_footer(self):
314
- r = assert_playbook_header_footer_intact("<!-- mining-index: 2026-02-21 -->\nbody")
315
- assert r["passed"] is False
316
- assert "effectiveness" in r["detail"]
317
-
318
-
319
- # ═══════════════════════════════════════════════════════════════════════════
320
- # Integration: validate_tick_schemas
321
- # ═══════════════════════════════════════════════════════════════════════════
322
-
323
- class TestValidateTickSchemas:
324
- def test_full_log_entry(self, sample_log_entry):
325
- result = validate_tick_schemas(sample_log_entry)
326
- assert result["total"] > 0
327
- assert result["valid"] == result["total"]
328
- assert result["failures"] == []
329
-
330
- def test_empty_log_entry(self):
331
- result = validate_tick_schemas({})
332
- assert result["total"] == 0
333
-
334
-
335
- # ═══════════════════════════════════════════════════════════════════════════
336
- # Integration: run_tick_assertions
337
- # ═══════════════════════════════════════════════════════════════════════════
338
-
339
- class TestRunTickAssertions:
340
- def test_all_pass_on_good_entry(self, sample_log_entry):
341
- playbook = "<!-- mining-index: 2026-02-21 -->\nbody\n<!-- effectiveness: rate=0.5 -->"
342
- results = run_tick_assertions(sample_log_entry, [], playbook, ["2026-02-21.md"])
343
- assert all(r["passed"] for r in results), [r for r in results if not r["passed"]]
344
-
345
- def test_detects_failures(self):
346
- bad_entry = {
347
- "signals": [],
348
- "recommendedAction": {"action": "sessions_spawn", "task": "x", "confidence": 0.2},
349
- "playbookChanges": {
350
- "changes": {"added": ["a", "b"], "pruned": [], "promoted": []},
351
- "playbookLines": 55,
352
- },
353
- "curateDirective": "aggressive_prune",
354
- "output": {"skip": True, "skipReason": "no new data"},
355
- }
356
- results = run_tick_assertions(bad_entry, [], "", [])
357
- failed = [r for r in results if not r["passed"]]
358
- failed_names = {r["name"] for r in failed}
359
- assert "signal_confidence_threshold" in failed_names
360
- assert "playbook_under_limit" in failed_names
361
- assert "curator_respected_directive" in failed_names
362
- assert "skip_reason_specific" in failed_names
363
-
364
-
365
- # ═══════════════════════════════════════════════════════════════════════════
366
- # Config
367
- # ═══════════════════════════════════════════════════════════════════════════
368
-
369
- class TestLoadEvalConfig:
370
- def test_defaults(self, tmp_path):
371
- memory_dir = str(tmp_path)
372
- config = load_eval_config(memory_dir)
373
- assert config["level"] == "mechanical"
374
- assert config["sampleRate"] == 0.2
375
-
376
- def test_runtime_override(self, tmp_path):
377
- override = {"level": "full", "changedAt": "2026-02-28T10:00:00Z"}
378
- (tmp_path / "eval-config.json").write_text(json.dumps(override))
379
- config = load_eval_config(str(tmp_path))
380
- assert config["level"] == "full"
381
-
382
-
383
- # ═══════════════════════════════════════════════════════════════════════════
384
- # call_llm_with_fallback retry logic
385
- # ═══════════════════════════════════════════════════════════════════════════
386
-
387
- class TestCallLlmWithFallback:
388
- @patch("common.time.sleep")
389
- @patch("common.call_llm")
390
- def test_succeeds_on_retry(self, mock_call, mock_sleep):
391
- """First attempt fails with LLMError, second succeeds."""
392
- mock_call.side_effect = [
393
- LLMError("timeout"),
394
- '{"changes": {"added": ["x"], "pruned": [], "promoted": []}}',
395
- ]
396
- result = call_llm_with_fallback("sys", "usr", script="playbook_curator", json_mode=True)
397
- assert '"added": ["x"]' in result
398
- assert mock_call.call_count == 2
399
- mock_sleep.assert_called_once_with(1) # 2^0 = 1s backoff
400
-
401
- @patch("common.time.sleep")
402
- @patch("common.call_llm")
403
- def test_raises_after_all_retries_exhausted(self, mock_call, mock_sleep):
404
- """Both attempts fail → raises the last LLMError."""
405
- mock_call.side_effect = LLMError("persistent failure")
406
- import pytest
407
- with pytest.raises(LLMError, match="persistent failure"):
408
- call_llm_with_fallback("sys", "usr", retries=1)
409
- assert mock_call.call_count == 2
410
-
411
- @patch("common.call_llm")
412
- def test_succeeds_first_try_no_retry(self, mock_call):
413
- """If first attempt succeeds, no retry or sleep happens."""
414
- mock_call.return_value = '{"ok": true}'
415
- result = call_llm_with_fallback("sys", "usr")
416
- assert result == '{"ok": true}'
417
- assert mock_call.call_count == 1
418
-
419
- @patch("common.time.sleep")
420
- @patch("common.call_llm")
421
- def test_exponential_backoff(self, mock_call, mock_sleep):
422
- """With retries=2, backoff should be 1s then 2s."""
423
- mock_call.side_effect = [
424
- LLMError("fail1"),
425
- LLMError("fail2"),
426
- "success",
427
- ]
428
- result = call_llm_with_fallback("sys", "usr", retries=2)
429
- assert result == "success"
430
- assert mock_sleep.call_args_list == [call(1), call(2)]