@geravant/sinain 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +183 -0
- package/index.ts +2096 -0
- package/install.js +155 -0
- package/openclaw.plugin.json +59 -0
- package/package.json +21 -0
- package/sinain-memory/common.py +403 -0
- package/sinain-memory/demo_knowledge_transfer.sh +85 -0
- package/sinain-memory/embedder.py +268 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/assertions.py +288 -0
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +61 -0
- package/sinain-memory/eval/judges/curation_judge.py +46 -0
- package/sinain-memory/eval/judges/insight_judge.py +48 -0
- package/sinain-memory/eval/judges/mining_judge.py +42 -0
- package/sinain-memory/eval/judges/signal_judge.py +45 -0
- package/sinain-memory/eval/schemas.py +247 -0
- package/sinain-memory/eval_delta.py +109 -0
- package/sinain-memory/eval_reporter.py +642 -0
- package/sinain-memory/feedback_analyzer.py +221 -0
- package/sinain-memory/git_backup.sh +19 -0
- package/sinain-memory/insight_synthesizer.py +181 -0
- package/sinain-memory/memory/2026-03-01.md +11 -0
- package/sinain-memory/memory/playbook-archive/sinain-playbook-2026-03-01-1418.md +15 -0
- package/sinain-memory/memory/playbook-logs/2026-03-01.jsonl +1 -0
- package/sinain-memory/memory/sinain-playbook.md +21 -0
- package/sinain-memory/memory-config.json +39 -0
- package/sinain-memory/memory_miner.py +183 -0
- package/sinain-memory/module_manager.py +695 -0
- package/sinain-memory/playbook_curator.py +225 -0
- package/sinain-memory/requirements.txt +3 -0
- package/sinain-memory/signal_analyzer.py +141 -0
- package/sinain-memory/test_local.py +402 -0
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +189 -0
- package/sinain-memory/tests/test_curator_helpers.py +94 -0
- package/sinain-memory/tests/test_embedder.py +210 -0
- package/sinain-memory/tests/test_extract_json.py +124 -0
- package/sinain-memory/tests/test_feedback_computation.py +121 -0
- package/sinain-memory/tests/test_miner_helpers.py +71 -0
- package/sinain-memory/tests/test_module_management.py +458 -0
- package/sinain-memory/tests/test_parsers.py +96 -0
- package/sinain-memory/tests/test_tick_evaluator.py +430 -0
- package/sinain-memory/tests/test_triple_extractor.py +255 -0
- package/sinain-memory/tests/test_triple_ingest.py +191 -0
- package/sinain-memory/tests/test_triple_migrate.py +138 -0
- package/sinain-memory/tests/test_triplestore.py +248 -0
- package/sinain-memory/tick_evaluator.py +392 -0
- package/sinain-memory/triple_extractor.py +402 -0
- package/sinain-memory/triple_ingest.py +290 -0
- package/sinain-memory/triple_migrate.py +275 -0
- package/sinain-memory/triple_query.py +184 -0
- package/sinain-memory/triplestore.py +498 -0
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""Tests for tick_evaluator.py + eval/schemas.py + eval/assertions.py + common retry."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from unittest.mock import patch, call
|
|
6
|
+
|
|
7
|
+
from common import LLMError, call_llm_with_fallback
|
|
8
|
+
from eval.schemas import validate, SCHEMA_REGISTRY
|
|
9
|
+
from eval.assertions import (
|
|
10
|
+
assert_playbook_under_limit,
|
|
11
|
+
assert_curator_respected_directive,
|
|
12
|
+
assert_no_repeat_action,
|
|
13
|
+
assert_signal_confidence_threshold,
|
|
14
|
+
assert_insight_char_limit,
|
|
15
|
+
assert_skip_reason_specific,
|
|
16
|
+
assert_miner_references_sources,
|
|
17
|
+
assert_playbook_header_footer_intact,
|
|
18
|
+
assert_schema_valid,
|
|
19
|
+
run_tick_assertions,
|
|
20
|
+
)
|
|
21
|
+
from tick_evaluator import validate_tick_schemas, load_eval_config
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
25
|
+
# Schema validation tests
|
|
26
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
27
|
+
|
|
28
|
+
class TestSchemaValidation:
    """Exercises validate() against each entry in SCHEMA_REGISTRY.

    Convention: validate() returns a list of error strings; an empty list
    means the payload conforms to the named schema.
    """

    def test_valid_signal_analyzer(self):
        # Minimal conforming payload: one signal, no action, not idle.
        data = {"signals": [{"description": "signal1", "priority": "high"}], "recommendedAction": None, "idle": False}
        errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
        assert errors == []

    def test_valid_signal_with_action(self):
        # recommendedAction as a full object (action/task/confidence) is valid.
        data = {
            "signals": [{"description": "sig", "priority": "medium"}],
            "recommendedAction": {"action": "sessions_spawn", "task": "debug", "confidence": 0.8},
            "idle": False,
        }
        errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
        assert errors == []

    def test_valid_signal_with_null_task(self):
        # task may be None inside a recommendedAction (e.g. for "skip").
        data = {
            "signals": [{"description": "idle detected", "priority": "low"}],
            "recommendedAction": {"action": "skip", "task": None, "confidence": 0.9},
            "idle": True,
        }
        errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
        assert errors == []

    def test_invalid_signal_missing_required(self):
        data = {"signals": [{"description": "sig", "priority": "high"}]}  # missing idle, recommendedAction
        errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
        # At least the missing "idle" key should be reported by name.
        assert any("idle" in e for e in errors)

    def test_valid_feedback_analyzer(self):
        data = {
            "feedbackScores": {"avg": 0.5, "high": ["a"], "low": ["b"]},
            "effectiveness": {"outputs": 10, "positive": 7, "negative": 1, "neutral": 2, "rate": 0.7},
            "curateDirective": "normal",
            "interpretation": "Good patterns",
        }
        errors = validate(data, SCHEMA_REGISTRY["feedback_analyzer"])
        assert errors == []

    def test_invalid_directive_value(self):
        # curateDirective is an enum; an unknown value must be rejected.
        data = {
            "feedbackScores": {"avg": 0.5},
            "effectiveness": {"outputs": 0, "positive": 0, "negative": 0, "neutral": 0, "rate": 0.0},
            "curateDirective": "invalid_value",
        }
        errors = validate(data, SCHEMA_REGISTRY["feedback_analyzer"])
        assert any("curateDirective" in e or "not in" in e for e in errors)

    def test_valid_memory_miner(self):
        data = {
            "findings": "Found patterns",
            "newPatterns": ["pattern1"],
            "minedSources": ["2026-02-21.md"],
        }
        errors = validate(data, SCHEMA_REGISTRY["memory_miner"])
        assert errors == []

    def test_valid_playbook_curator(self):
        data = {
            "changes": {"added": ["new"], "pruned": ["old"], "promoted": []},
            "staleItemActions": [],
            "playbookLines": 25,
        }
        errors = validate(data, SCHEMA_REGISTRY["playbook_curator"])
        assert errors == []

    def test_valid_insight_skip(self):
        # skip=True with a skipReason is a complete insight_synthesizer output.
        data = {"skip": True, "skipReason": "No new patterns since last analysis"}
        errors = validate(data, SCHEMA_REGISTRY["insight_synthesizer"])
        assert errors == []

    def test_valid_insight_output(self):
        data = {
            "skip": False,
            "suggestion": "Try frame batching",
            "insight": "Evening correlates with exploration",
            "totalChars": 55,
        }
        errors = validate(data, SCHEMA_REGISTRY["insight_synthesizer"])
        assert errors == []

    def test_valid_insight_without_skip(self):
        # "skip" key is optional when a real suggestion/insight is produced.
        data = {
            "suggestion": "Try frame batching",
            "insight": "Evening correlates with exploration",
            "totalChars": 55,
        }
        errors = validate(data, SCHEMA_REGISTRY["insight_synthesizer"])
        assert errors == []

    def test_confidence_out_of_range(self):
        data = {
            "signals": [],
            "recommendedAction": {"action": "sessions_spawn", "task": "x", "confidence": 1.5},
            "idle": False,
        }
        errors = validate(data, SCHEMA_REGISTRY["signal_analyzer"])
        # oneOf fails because the object variant rejects confidence > 1.0
        assert len(errors) > 0
        assert any("oneOf" in e or "maximum" in e for e in errors)

    def test_negative_playbook_lines(self):
        # playbookLines has a lower bound; -1 must produce a "minimum" error.
        data = {
            "changes": {"added": [], "pruned": [], "promoted": []},
            "playbookLines": -1,
        }
        errors = validate(data, SCHEMA_REGISTRY["playbook_curator"])
        assert any("minimum" in e for e in errors)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
139
|
+
# Assertion tests
|
|
140
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
141
|
+
|
|
142
|
+
class TestPlaybookUnderLimit:
    """assert_playbook_under_limit passes at or below the line limit, fails above it.

    The default limit appears to be 50 lines (50 passes, 55 fails); a custom
    limit can be supplied via the keyword argument.
    """

    def test_under_limit(self):
        outcome = assert_playbook_under_limit({"playbookLines": 30})
        assert outcome["passed"] is True

    def test_at_limit(self):
        # Boundary value: exactly at the limit still counts as passing.
        outcome = assert_playbook_under_limit({"playbookLines": 50})
        assert outcome["passed"] is True

    def test_over_limit(self):
        outcome = assert_playbook_under_limit({"playbookLines": 55})
        assert outcome["passed"] is False

    def test_custom_limit(self):
        # A larger explicit limit lets an otherwise-failing count pass.
        outcome = assert_playbook_under_limit({"playbookLines": 80}, limit=100)
        assert outcome["passed"] is True
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class TestCuratorRespectedDirective:
    """assert_curator_respected_directive: curation changes must match the directive.

    Covered directives: "aggressive_prune" (must prune something),
    "stability" (must not prune heavily), "normal" (no constraint asserted).
    """

    def test_aggressive_prune_with_pruning(self):
        r = assert_curator_respected_directive(
            {"changes": {"added": [], "pruned": ["x"], "promoted": []}},
            "aggressive_prune",
        )
        assert r["passed"] is True

    def test_aggressive_prune_without_pruning(self):
        # Adding without pruning violates an aggressive_prune directive.
        r = assert_curator_respected_directive(
            {"changes": {"added": ["new"], "pruned": [], "promoted": []}},
            "aggressive_prune",
        )
        assert r["passed"] is False

    def test_stability_conservative(self):
        r = assert_curator_respected_directive(
            {"changes": {"added": ["a"], "pruned": [], "promoted": []}},
            "stability",
        )
        assert r["passed"] is True

    def test_stability_too_aggressive(self):
        # Four prunes under "stability" is treated as too aggressive.
        r = assert_curator_respected_directive(
            {"changes": {"added": [], "pruned": ["a", "b", "c", "d"], "promoted": []}},
            "stability",
        )
        assert r["passed"] is False

    def test_normal_anything_goes(self):
        r = assert_curator_respected_directive(
            {"changes": {"added": ["a", "b"], "pruned": ["c"], "promoted": []}},
            "normal",
        )
        assert r["passed"] is True
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class TestNoRepeatAction:
    """assert_no_repeat_action: a recommended task must not duplicate recent history.

    The second argument is a list of recent log entries whose
    "actionsConsidered" items (with chosen=True) are compared against the
    new task; similarity appears to be text-based (see test_repeated_action).
    """

    def test_no_action(self):
        # No recommendation at all trivially passes.
        r = assert_no_repeat_action({"recommendedAction": None}, [])
        assert r["passed"] is True

    def test_skip_action(self):
        # An explicit "skip" is never counted as a repeat.
        r = assert_no_repeat_action({"recommendedAction": {"action": "skip"}}, [])
        assert r["passed"] is True

    def test_distinct_action(self):
        recent = [{"actionsConsidered": [{"chosen": True, "reason": "Research Flutter overlays"}]}]
        r = assert_no_repeat_action(
            {"recommendedAction": {"action": "sessions_spawn", "task": "Debug OCR backpressure"}},
            recent,
        )
        assert r["passed"] is True

    def test_repeated_action(self):
        # Near-identical wording ("Debug OCR backpressure...") is flagged as a repeat.
        recent = [{"actionsConsidered": [{"chosen": True, "reason": "Debug OCR backpressure issue"}]}]
        r = assert_no_repeat_action(
            {"recommendedAction": {"action": "sessions_spawn", "task": "Debug OCR backpressure"}},
            recent,
        )
        assert r["passed"] is False
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class TestSignalConfidenceThreshold:
    """assert_signal_confidence_threshold: actions need sufficient confidence.

    0.8 passes and 0.3 fails, so the default threshold lies between them;
    a missing confidence on a real action fails, while no action passes.
    """

    def test_above_threshold(self):
        r = assert_signal_confidence_threshold(
            {"recommendedAction": {"action": "sessions_spawn", "confidence": 0.8}}
        )
        assert r["passed"] is True

    def test_below_threshold(self):
        r = assert_signal_confidence_threshold(
            {"recommendedAction": {"action": "sessions_spawn", "confidence": 0.3}}
        )
        assert r["passed"] is False

    def test_no_confidence(self):
        # Omitting confidence on a spawn action counts as failing the gate.
        r = assert_signal_confidence_threshold(
            {"recommendedAction": {"action": "sessions_spawn"}}
        )
        assert r["passed"] is False

    def test_no_action(self):
        # No recommendation means nothing to gate — passes.
        r = assert_signal_confidence_threshold({"recommendedAction": None})
        assert r["passed"] is True
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class TestInsightCharLimit:
    """assert_insight_char_limit: suggestion/insight text length is capped.

    Short strings pass, 300+300 characters fail, and a skipped output
    (skip=True, no text) is exempt from the limit.
    """

    def test_under_limit(self):
        outcome = assert_insight_char_limit({"suggestion": "short", "insight": "also short"})
        assert outcome["passed"] is True

    def test_over_limit(self):
        oversized = {"suggestion": "x" * 300, "insight": "y" * 300}
        outcome = assert_insight_char_limit(oversized)
        assert outcome["passed"] is False

    def test_skipped_output(self):
        # A skipped tick carries no insight text, so the limit does not apply.
        outcome = assert_insight_char_limit({"skip": True})
        assert outcome["passed"] is True
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
class TestSkipReasonSpecific:
    """assert_skip_reason_specific: skips must carry a detailed justification.

    A long, concrete reason passes; a terse generic one ("no new data"),
    or no reason at all, fails. Non-skipped entries are exempt.
    """

    def test_specific_reason(self):
        r = assert_skip_reason_specific({
            "skip": True,
            "skipReason": "No new patterns detected in playbook since last tick; mining-index shows 2026-02-21 was already processed"
        })
        assert r["passed"] is True

    def test_generic_reason(self):
        r = assert_skip_reason_specific({"skip": True, "skipReason": "no new data"})
        assert r["passed"] is False

    def test_no_reason(self):
        # skip=True with no skipReason at all must fail.
        r = assert_skip_reason_specific({"skip": True})
        assert r["passed"] is False

    def test_not_skipped(self):
        # Nothing to justify when the tick was not skipped.
        r = assert_skip_reason_specific({"skip": False})
        assert r["passed"] is True
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
class TestMinerReferencesSources:
    """assert_miner_references_sources: minedSources must be a subset of known files.

    Second argument is the list of files that actually exist; any claimed
    source outside that list fails the assertion.
    """

    def test_valid_sources(self):
        r = assert_miner_references_sources(
            {"minedSources": ["2026-02-21.md"]},
            ["2026-02-21.md", "2026-02-20.md"],
        )
        assert r["passed"] is True

    def test_unknown_sources(self):
        # "fake-file.md" is not among the available files → fail.
        r = assert_miner_references_sources(
            {"minedSources": ["2026-02-21.md", "fake-file.md"]},
            ["2026-02-21.md"],
        )
        assert r["passed"] is False

    def test_no_sources(self):
        # Empty claims against an empty universe is vacuously valid.
        r = assert_miner_references_sources({"minedSources": []}, [])
        assert r["passed"] is True
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
class TestPlaybookHeaderFooterIntact:
    """assert_playbook_header_footer_intact: the curated playbook text must keep
    its HTML-comment header (mining-index) and footer (effectiveness) markers.

    On failure, r["detail"] names the missing marker.
    """

    def test_both_present(self):
        text = "<!-- mining-index: 2026-02-21 -->\nbody\n<!-- effectiveness: rate=0.5 -->"
        r = assert_playbook_header_footer_intact(text)
        assert r["passed"] is True

    def test_missing_header(self):
        r = assert_playbook_header_footer_intact("body\n<!-- effectiveness: rate=0.5 -->")
        assert r["passed"] is False
        assert "mining-index" in r["detail"]

    def test_missing_footer(self):
        r = assert_playbook_header_footer_intact("<!-- mining-index: 2026-02-21 -->\nbody")
        assert r["passed"] is False
        assert "effectiveness" in r["detail"]
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
320
|
+
# Integration: validate_tick_schemas
|
|
321
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
322
|
+
|
|
323
|
+
class TestValidateTickSchemas:
    """Integration tests for validate_tick_schemas over a whole log entry.

    The returned dict exposes "total", "valid", and "failures".
    sample_log_entry is a fixture (presumably from tests/conftest.py — not
    visible here) providing a fully valid entry.
    """

    def test_full_log_entry(self, sample_log_entry):
        result = validate_tick_schemas(sample_log_entry)
        assert result["total"] > 0
        # Every embedded payload in the fixture must validate cleanly.
        assert result["valid"] == result["total"]
        assert result["failures"] == []

    def test_empty_log_entry(self):
        # An empty entry has nothing to validate: total == 0, no crash.
        result = validate_tick_schemas({})
        assert result["total"] == 0
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
336
|
+
# Integration: run_tick_assertions
|
|
337
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
338
|
+
|
|
339
|
+
class TestRunTickAssertions:
    """Integration tests for run_tick_assertions, which fans out to the
    individual assert_* checks and returns a list of result dicts with
    "name" and "passed" keys.
    """

    def test_all_pass_on_good_entry(self, sample_log_entry):
        # Playbook text with intact header/footer so that check passes too.
        playbook = "<!-- mining-index: 2026-02-21 -->\nbody\n<!-- effectiveness: rate=0.5 -->"
        results = run_tick_assertions(sample_log_entry, [], playbook, ["2026-02-21.md"])
        # The failure message lists exactly which assertions did not pass.
        assert all(r["passed"] for r in results), [r for r in results if not r["passed"]]

    def test_detects_failures(self):
        # Entry engineered to trip four distinct assertions at once:
        # low confidence, over-limit playbook, ignored prune directive,
        # and a generic skip reason.
        bad_entry = {
            "signals": [],
            "recommendedAction": {"action": "sessions_spawn", "task": "x", "confidence": 0.2},
            "playbookChanges": {
                "changes": {"added": ["a", "b"], "pruned": [], "promoted": []},
                "playbookLines": 55,
            },
            "curateDirective": "aggressive_prune",
            "output": {"skip": True, "skipReason": "no new data"},
        }
        results = run_tick_assertions(bad_entry, [], "", [])
        failed = [r for r in results if not r["passed"]]
        failed_names = {r["name"] for r in failed}
        assert "signal_confidence_threshold" in failed_names
        assert "playbook_under_limit" in failed_names
        assert "curator_respected_directive" in failed_names
        assert "skip_reason_specific" in failed_names
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
366
|
+
# Config
|
|
367
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
368
|
+
|
|
369
|
+
class TestLoadEvalConfig:
    """load_eval_config: defaults plus runtime override via eval-config.json
    in the memory directory.
    """

    def test_defaults(self, tmp_path):
        # Empty directory → built-in defaults.
        memory_dir = str(tmp_path)
        config = load_eval_config(memory_dir)
        assert config["level"] == "mechanical"
        assert config["sampleRate"] == 0.2

    def test_runtime_override(self, tmp_path):
        # An eval-config.json file in the memory dir overrides the default level.
        override = {"level": "full", "changedAt": "2026-02-28T10:00:00Z"}
        (tmp_path / "eval-config.json").write_text(json.dumps(override))
        config = load_eval_config(str(tmp_path))
        assert config["level"] == "full"
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
384
|
+
# call_llm_with_fallback retry logic
|
|
385
|
+
# ═══════════════════════════════════════════════════════════════════════════
|
|
386
|
+
|
|
387
|
+
class TestCallLlmWithFallback:
    """Retry behaviour of common.call_llm_with_fallback.

    common.call_llm and common.time.sleep are patched; note that with stacked
    @patch decorators the bottom-most decorator supplies the first mock
    argument, so the parameter order (mock_call, mock_sleep) matches
    (@patch("common.call_llm"), @patch("common.time.sleep")) bottom-up.
    """

    @patch("common.time.sleep")
    @patch("common.call_llm")
    def test_succeeds_on_retry(self, mock_call, mock_sleep):
        """First attempt fails with LLMError, second succeeds."""
        mock_call.side_effect = [
            LLMError("timeout"),
            '{"changes": {"added": ["x"], "pruned": [], "promoted": []}}',
        ]
        result = call_llm_with_fallback("sys", "usr", script="playbook_curator", json_mode=True)
        assert '"added": ["x"]' in result
        assert mock_call.call_count == 2
        mock_sleep.assert_called_once_with(1)  # 2^0 = 1s backoff

    @patch("common.time.sleep")
    @patch("common.call_llm")
    def test_raises_after_all_retries_exhausted(self, mock_call, mock_sleep):
        """Both attempts fail → raises the last LLMError."""
        # A single exception as side_effect is re-raised on every call.
        mock_call.side_effect = LLMError("persistent failure")
        import pytest  # NOTE(review): local import — file otherwise has no top-level pytest import
        with pytest.raises(LLMError, match="persistent failure"):
            call_llm_with_fallback("sys", "usr", retries=1)
        # retries=1 means one initial attempt plus one retry.
        assert mock_call.call_count == 2

    @patch("common.call_llm")
    def test_succeeds_first_try_no_retry(self, mock_call):
        """If first attempt succeeds, no retry or sleep happens."""
        mock_call.return_value = '{"ok": true}'
        result = call_llm_with_fallback("sys", "usr")
        assert result == '{"ok": true}'
        assert mock_call.call_count == 1

    @patch("common.time.sleep")
    @patch("common.call_llm")
    def test_exponential_backoff(self, mock_call, mock_sleep):
        """With retries=2, backoff should be 1s then 2s."""
        mock_call.side_effect = [
            LLMError("fail1"),
            LLMError("fail2"),
            "success",
        ]
        result = call_llm_with_fallback("sys", "usr", retries=2)
        assert result == "success"
        # Exponential backoff: 2^0 then 2^1 seconds.
        assert mock_sleep.call_args_list == [call(1), call(2)]
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""Tests for triple_extractor.py — 3-tier extraction."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from triplestore import TripleStore
|
|
5
|
+
from triple_extractor import TripleExtractor, Triple, _make_slug
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.fixture
def store(tmp_path):
    """Fresh TripleStore backed by a temp SQLite-style file; closed on teardown."""
    s = TripleStore(str(tmp_path / "test.db"))
    yield s
    # Teardown: release the underlying store handle after the test finishes.
    s.close()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@pytest.fixture
def extractor(store):
    """TripleExtractor wired to the per-test `store` fixture."""
    return TripleExtractor(store)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@pytest.fixture
def store_with_vocab(store):
    """Store pre-populated with concept vocabulary.

    Seeds three concept entities (ocr, flutter, react-native) inside a
    single transaction so vocabulary-cache lookups have something to match.
    """
    tx = store.begin_tx("setup")
    store.assert_triple(tx, "concept:ocr", "name", "OCR")
    store.assert_triple(tx, "concept:flutter", "name", "Flutter")
    store.assert_triple(tx, "concept:react-native", "name", "React Native")
    return store
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ----- Slug generation -----
|
|
31
|
+
|
|
32
|
+
class TestMakeSlug:
    """_make_slug: lowercase hyphenated slugs, trimmed, capped at 80 chars."""

    def test_basic(self):
        assert _make_slug("Frame Batching") == "frame-batching"

    def test_special_chars(self):
        # Punctuation collapses to hyphens; nothing is dropped silently.
        assert _make_slug("OCR pipeline (v2.0)") == "ocr-pipeline-v2-0"

    def test_leading_trailing(self):
        # Surrounding whitespace and hyphens are stripped.
        assert _make_slug(" --hello-- ") == "hello"

    def test_max_length(self):
        long = "a" * 100
        assert len(_make_slug(long)) <= 80

    def test_empty(self):
        # Inputs with no slug-able characters yield the empty string.
        assert _make_slug("") == ""
        assert _make_slug("---") == ""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ----- Tier 1: Signal extraction -----
|
|
52
|
+
|
|
53
|
+
class TestExtractSignal:
    """Tier-1 extraction from a signal-analyzer log entry.

    extract_signal(data, ts) returns Triple objects; the signal entity id is
    "signal:<ts>", playbook additions become "pattern:*" entities, and
    detected concepts become "concept:*" entities with "related_to" refs.
    """

    def test_basic_signal(self, extractor):
        data = {
            "sessionSummary": "Debugging OCR pipeline",
            "idle": False,
            "signals": [
                {"description": "OCR pipeline backpressure", "priority": "high"}
            ],
            "recommendedAction": {"action": "sessions_spawn", "task": "Debug", "confidence": 0.8},
            "output": {"suggestion": "Try frame batching", "insight": "Evening pattern"},
        }
        triples = extractor.extract_signal(data, "2026-03-01T10:00:00Z")

        # Check entity creation
        entity_attrs = {t.attribute for t in triples if t.entity_id == "signal:2026-03-01T10:00:00Z"}
        assert "summary" in entity_attrs
        assert "description" in entity_attrs
        assert "priority" in entity_attrs
        assert "action" in entity_attrs
        assert "suggestion" in entity_attrs

    def test_signal_with_playbook_changes(self, extractor):
        # Added playbook lines are materialized as pattern:* entities with text.
        data = {
            "signals": [],
            "playbookChanges": {
                "changes": {"added": ["Use frame batching for OCR"], "pruned": [], "promoted": []}
            },
        }
        triples = extractor.extract_signal(data, "2026-03-01")
        pattern_triples = [t for t in triples if t.entity_id.startswith("pattern:")]
        assert len(pattern_triples) > 0
        assert any(t.attribute == "text" and "frame batching" in t.value.lower() for t in pattern_triples)

    def test_signal_empty(self, extractor):
        # Degenerate input must still return a list (possibly empty), not raise.
        triples = extractor.extract_signal({"signals": []}, "2026-03-01")
        assert isinstance(triples, list)

    def test_signal_concepts_extracted(self, extractor):
        data = {"signals": [{"description": "OCR Pipeline stall detected", "priority": "high"}]}
        triples = extractor.extract_signal(data, "2026-03-01")
        concept_triples = [t for t in triples if t.entity_id.startswith("concept:")]
        assert len(concept_triples) > 0  # "OCR" or "Pipeline" should be extracted

    def test_signal_concept_refs(self, extractor):
        # Concepts are linked back to the signal via "related_to" ref triples.
        data = {"signals": [{"description": "React Native bridge crash", "priority": "high"}]}
        triples = extractor.extract_signal(data, "2026-03-01")
        ref_triples = [t for t in triples if t.value_type == "ref" and t.attribute == "related_to"]
        assert len(ref_triples) > 0
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ----- Tier 1: Session extraction -----
|
|
104
|
+
|
|
105
|
+
class TestExtractSession:
    """Tier-1 extraction from a session log entry.

    extract_session produces a "session:*" entity (summary, duration_ms)
    plus "tool:*" entities linked by "used_tool" ref triples.
    """

    def test_basic_session(self, extractor):
        data = {
            "ts": "2026-03-01T09:00:00Z",
            "summary": "Implemented OCR batch processing",
            "toolsUsed": ["Read", "Edit", "Bash"],
            "durationMs": 120000,
        }
        triples = extractor.extract_session(data)

        # Check session entity
        session_triples = [t for t in triples if t.entity_id.startswith("session:")]
        assert any(t.attribute == "summary" for t in session_triples)
        assert any(t.attribute == "duration_ms" for t in session_triples)

        # Check tool refs
        tool_triples = [t for t in triples if t.entity_id.startswith("tool:")]
        assert len(tool_triples) >= 3

    def test_session_tool_refs(self, extractor):
        # One tool in, exactly one "used_tool" ref out.
        data = {
            "ts": "2026-03-01",
            "toolsUsed": ["Bash"],
        }
        triples = extractor.extract_session(data)
        ref_triples = [t for t in triples if t.value_type == "ref" and t.attribute == "used_tool"]
        assert len(ref_triples) == 1
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ----- Tier 1: Mining extraction -----
|
|
135
|
+
|
|
136
|
+
class TestExtractMining:
    """Tier-1 extraction from memory-miner output.

    newPatterns / preferences / contradictions each become pattern entities
    with "text" triples; the latter two are tagged via a "pattern_type"
    attribute ("preference" / "contradiction").
    """

    def test_new_patterns(self, extractor):
        data = {
            "newPatterns": ["Frame dropping improves OCR accuracy", "Use batch processing"],
            "preferences": ["User prefers minimal configs"],
            "contradictions": [],
        }
        triples = extractor.extract_mining(data)
        pattern_triples = [t for t in triples if t.attribute == "text"]
        assert len(pattern_triples) >= 2

    def test_mining_preferences(self, extractor):
        data = {"newPatterns": [], "preferences": ["User likes concise output"]}
        triples = extractor.extract_mining(data)
        pref_triples = [t for t in triples if t.attribute == "pattern_type" and t.value == "preference"]
        assert len(pref_triples) == 1

    def test_mining_contradictions(self, extractor):
        data = {"newPatterns": [], "contradictions": ["Playbook says X but observation shows Y"]}
        triples = extractor.extract_mining(data)
        contra = [t for t in triples if t.attribute == "pattern_type" and t.value == "contradiction"]
        assert len(contra) == 1

    def test_mining_empty(self, extractor):
        # All-empty input yields no triples at all.
        triples = extractor.extract_mining({"newPatterns": [], "preferences": [], "contradictions": []})
        assert triples == []
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ----- Tier 2: Playbook extraction (regex) -----
|
|
165
|
+
|
|
166
|
+
class TestExtractPlaybook:
    """Tier-2 (regex-based) extraction from raw playbook markdown.

    Bullet lines become "text" triples; a trailing "(score: N)" becomes a
    "score" triple; HTML comments and "[since: ...]" metadata lines are
    skipped; every extracted pattern is tagged source=playbook.
    """

    def test_patterns_with_scores(self, extractor):
        text = (
            "## Established Patterns\n"
            "- OCR pipeline stalls when queue depth > 10 (score: 0.8)\n"
            "- Use frame batching for throughput (score: 0.6)\n"
            "- Spawn research agent for new frameworks\n"
        )
        triples = extractor.extract_playbook(text)
        text_triples = [t for t in triples if t.attribute == "text"]
        assert len(text_triples) >= 3

        # Only the two bullets carrying "(score: ...)" produce score triples.
        score_triples = [t for t in triples if t.attribute == "score"]
        assert len(score_triples) == 2
        assert any(t.value == "0.8" for t in score_triples)

    def test_skips_comments_and_metadata(self, extractor):
        text = "- <!-- mining-index: 2026-02-21 -->\n- [since: 2026-02-18] stale entry\n- Real pattern here\n"
        triples = extractor.extract_playbook(text)
        text_triples = [t for t in triples if t.attribute == "text"]
        # Should only get "Real pattern here"
        assert any("Real pattern" in t.value for t in text_triples)
        assert not any("mining-index" in t.value for t in text_triples)

    def test_playbook_source_tagged(self, extractor):
        text = "- Use batch processing\n"
        triples = extractor.extract_playbook(text)
        source_triples = [t for t in triples if t.attribute == "source"]
        assert any(t.value == "playbook" for t in source_triples)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ----- Tier 2: Module extraction -----
|
|
198
|
+
|
|
199
|
+
class TestExtractModule:
    """Tier-2 extraction of a knowledge module (manifest + patterns text).

    Produces a "module:<slug>" entity carrying the manifest fields, and
    pattern entities linked back via "belongs_to" triples.
    """

    def test_module_manifest_and_patterns(self, extractor):
        manifest = {
            "name": "React Native Dev",
            "description": "RN development patterns",
            "version": "1.0.0",
        }
        patterns_text = "## Established\n- Use Hermes engine for Android\n- Enable Fast Refresh\n"
        triples = extractor.extract_module("react-native-dev", manifest, patterns_text)

        module_triples = [t for t in triples if t.entity_id == "module:react-native-dev"]
        assert any(t.attribute == "name" and t.value == "React Native Dev" for t in module_triples)
        assert any(t.attribute == "description" for t in module_triples)

        # Patterns should link back to module
        belongs_triples = [t for t in triples if t.attribute == "belongs_to" and t.value == "module:react-native-dev"]
        assert len(belongs_triples) >= 2
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# ----- Concept extraction -----
|
|
219
|
+
|
|
220
|
+
class TestExtractConcepts:
    """Concept extraction from free text: vocabulary-cache matches first,
    then regex heuristics (capitalized phrases, acronyms, hyphenated
    technical terms); an LLM fallback exists but is not exercised here.
    """

    def test_vocab_cache_match(self, store_with_vocab):
        # Seeded vocabulary ("OCR") is matched even in lowercase-ish prose.
        extractor = TripleExtractor(store_with_vocab)
        triples = extractor.extract_concepts("Working on OCR pipeline improvements")
        concept_ids = {t.entity_id for t in triples}
        assert "concept:ocr" in concept_ids

    def test_regex_capitalized_phrases(self, extractor):
        # Multi-word capitalized phrases get captured (including leading caps)
        triples = extractor.extract_concepts("Debugging React Native bridge issues")
        concept_ids = {t.entity_id for t in triples}
        # "Debugging React Native" is captured as one phrase since all words are capitalized
        assert any("react-native" in cid for cid in concept_ids)

    def test_regex_acronyms(self, extractor):
        triples = extractor.extract_concepts("The API uses JSON over HTTP")
        concept_ids = {t.entity_id for t in triples}
        assert any("api" in cid for cid in concept_ids)
        assert any("json" in cid for cid in concept_ids)
        assert any("http" in cid for cid in concept_ids)

    def test_regex_technical_terms(self, extractor):
        # Hyphenated lowercase terms are treated as concepts verbatim.
        triples = extractor.extract_concepts("Check the frame-batching pipeline and error-handling logic")
        concept_ids = {t.entity_id for t in triples}
        assert "concept:frame-batching" in concept_ids
        assert "concept:error-handling" in concept_ids

    def test_short_text_no_llm(self, extractor):
        # Short text should not trigger LLM fallback
        triples = extractor.extract_concepts("Hello")
        # Should return empty or just regex matches, no LLM call
        assert isinstance(triples, list)

    def test_empty_text(self, extractor):
        triples = extractor.extract_concepts("")
        assert triples == []
|