kc-beta 0.7.5 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -0
- package/package.json +3 -2
- package/src/agent/engine.js +390 -100
- package/src/agent/pipelines/_advance-hints.js +92 -0
- package/src/agent/pipelines/_milestone-derive.js +247 -13
- package/src/agent/pipelines/skill-authoring.js +30 -1
- package/src/agent/tools/agent-tool.js +2 -2
- package/src/agent/tools/consult-skill.js +15 -0
- package/src/agent/tools/dashboard-render.js +48 -1
- package/src/agent/tools/document-parse.js +31 -2
- package/src/agent/tools/phase-advance.js +17 -13
- package/src/agent/tools/release.js +250 -7
- package/src/agent/tools/sandbox-exec.js +65 -8
- package/src/agent/tools/worker-llm-call.js +95 -15
- package/src/agent/workspace.js +25 -4
- package/src/cli/components.js +4 -1
- package/src/cli/index.js +97 -1
- package/src/config.js +19 -2
- package/src/marathon/driver.js +217 -0
- package/src/marathon/prompts.js +93 -0
- package/template/.env.template +16 -0
- package/template/skills/en/bootstrap-workspace/SKILL.md +14 -0
- package/template/skills/en/quality-control/SKILL.md +9 -0
- package/template/skills/en/skill-authoring/SKILL.md +39 -0
- package/template/skills/en/skill-to-workflow/SKILL.md +53 -0
- package/template/skills/en/work-decomposition/SKILL.md +34 -0
- package/template/skills/phase_skills.yaml +5 -0
- package/template/skills/zh/bootstrap-workspace/SKILL.md +14 -0
- package/template/skills/zh/compliance-judgment/SKILL.md +37 -37
- package/template/skills/zh/document-chunking/SKILL.md +21 -14
- package/template/skills/zh/document-parsing/SKILL.md +65 -65
- package/template/skills/zh/entity-extraction/SKILL.md +68 -68
- package/template/skills/zh/quality-control/SKILL.md +9 -0
- package/template/skills/zh/skill-authoring/SKILL.md +39 -0
- package/template/skills/zh/skill-creator/SKILL.md +204 -200
- package/template/skills/zh/skill-to-workflow/SKILL.md +53 -0
- package/template/skills/zh/tree-processing/SKILL.md +67 -63
- package/template/skills/zh/work-decomposition/SKILL.md +34 -0
- package/template/workflows/common/llm_client.py +168 -0
- package/template/workflows/common/utils.py +132 -0
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
// v0.8 P0-E: prescriptive refusal hints for phase_advance gate failures.
|
|
2
|
+
//
|
|
3
|
+
// 资管 + 贷款 v0.7.5 audits both observed the force-bypass pattern:
|
|
4
|
+
// engine refuses phase_advance with `engineCounts: workflowsTested: 0/14`,
|
|
5
|
+
// agent does ~3 min of cleanup, then forces past anyway. Cleanup happens
|
|
6
|
+
// (signal IS being consumed) but force always wins because the descriptive
|
|
7
|
+
// "exit criteria not met" hint doesn't tell the agent WHAT to write.
|
|
8
|
+
//
|
|
9
|
+
// v0.8 P0-E replaces the descriptive hint with a prescriptive one. The
|
|
10
|
+
// hint text below derives from the same artifact paths + filename patterns
|
|
11
|
+
// that _milestone-derive.js walks, so the agent's instructions match what
|
|
12
|
+
// the engine will check next turn.
|
|
13
|
+
//
|
|
14
|
+
// Design contract (matches v0.8 design doc Q20 user lean):
|
|
15
|
+
// - Single shared helper here; engine.js + phase-advance.js both call it.
|
|
16
|
+
// - Each hint is one or two concrete sentences naming a path, a filename
|
|
17
|
+
// pattern, and a script to run (where applicable).
|
|
18
|
+
// - Hint output is plain text, suitable to drop into a tool result.
|
|
19
|
+
//
|
|
20
|
+
// To extend: edit the per-phase hint generators below. Keep the artifact
|
|
21
|
+
// paths in sync with the corresponding derive function in _milestone-derive.js.
|
|
22
|
+
|
|
23
|
+
/**
 * Build a prescriptive refusal hint for a phase_advance gate failure.
 *
 * The hint body is looked up in HINTS_BY_PHASE by phase name. When the phase
 * has no entry, a generic fallback sentence is used instead. If a telemetry
 * summary line is supplied it is prepended as an "Engine telemetry:" header.
 *
 * @param {string} fromPhase — the phase the agent is trying to leave
 * @param {object} engineCounts — raw engine counts (or null). Not read by
 *   this function; kept in the signature so existing callers need no change.
 * @param {string} [engineCountsLine] — formatted summary string from _buildEngineCountsBlock
 * @returns {string} a multi-line hint suitable for the LLM tool result
 */
export function getPrescriptiveHint(fromPhase, engineCounts, engineCountsLine = "") {
  const header = engineCountsLine
    ? `Engine telemetry: ${engineCountsLine}\n\n`
    : "";

  // Own-property lookup only: a phase name such as "constructor" or
  // "toString" must not resolve to an inherited Object.prototype member,
  // which would otherwise be stringified into the hint text.
  const hasEntry = Object.prototype.hasOwnProperty.call(HINTS_BY_PHASE, fromPhase);
  const hint = hasEntry ? HINTS_BY_PHASE[fromPhase] : undefined;

  if (typeof hint !== "string" || hint === "") {
    return header + "Check the system prompt's phase state block for missing milestones. The engine derives milestones from filesystem facts.";
  }
  return header + hint;
}
|
|
42
|
+
|
|
43
|
+
// Per-phase prescriptive hint text, keyed by the phase the agent is trying
// to leave. Each value is plain text meant to be dropped into a tool result.
// Frozen so the shared table cannot be mutated at runtime.
const HINTS_BY_PHASE = Object.freeze({
  bootstrap:
    "To advance to rule_extraction:\n" +
    " • Verify <workspace>/source_docs/ contains the regulation file(s) you're extracting rules from.\n" +
    " • Verify <workspace>/samples/ contains at least one sample document for testing.\n" +
    " • Ensure AGENT.md exists at workspace root with project context filled in.\n" +
    "Engine reads filesystem facts; no need to call any 'mark bootstrap complete' tool — just produce the artifacts.",

  rule_extraction:
    "To advance to skill_authoring:\n" +
    " • For each rule in the source regulation, write an entry to rules/catalog.json with {id, source_ref, falsifiability_statement, applicable_sections}.\n" +
    " • Use rule_catalog tool (operation: 'write') for catalog entries; engine derives `rulesExtracted` from this file.\n" +
    " • For chunk traceability: each catalog entry should reference its source chunk via applicable_sections.\n" +
    " • Write rule_skills/coverage_report.md or rules/coverage_report.md to mark coverageAudited=true (a per-rule × per-section table).",

  skill_authoring:
    "To advance to skill_testing:\n" +
    " • For each rule_id in rules/catalog.json, create rule_skills/<rule_id>/SKILL.md (uppercase! engine path-match is case-sensitive on Linux).\n" +
    " • Each SKILL.md needs frontmatter (id, name, description) + a body describing verification logic.\n" +
    " • Pair each SKILL.md with rule_skills/<rule_id>/check.py — substantive logic, NOT a 'return NOT_APPLICABLE' stub. If logic lives in workflows/, check.py must import + call the workflow.\n" +
    " • For grouped skills covering multiple rules, frontmatter MUST include `source_rules: [R001, R005, ...]` so engine credits each rule_id.\n" +
    " • Engine counts `rulesCovered` from rule_skills/ walk; aim for catalog.json's full rule list.",

  skill_testing:
    "To advance to distillation:\n" +
    " • For each rule_id, write test results to output/results/skill_test_round<N>.json or output/results/<rule_id>_<sample>.json.\n" +
    " • Each test result needs `verdict` (PASS/FAIL/NOT_APPLICABLE) plus per-rule accuracy.\n" +
    " • Engine counts `skillsTested` from these files. Aim for ≥1 result per rule, with ≥90% accuracy on labeled samples.\n" +
    " • If a rule consistently fails, iterate the SKILL.md + check.py before advancing (this is the evolution-loop pattern).",

  distillation:
    "To advance to production_qc:\n" +
    " • For each rule_id, write workflows/<rule_id>/workflow_v1.py (regex-only or hybrid regex+worker_llm).\n" +
    " • Each workflow.py needs a `verify(document_text, config)` function returning {verdict, evidence, confidence, ...}.\n" +
    " • Engine counts `workflowsCreated` from workflows/<rule_id>/workflow_v*.py walk.\n" +
    " • Run scripts/v1_regression.py (or equivalent) to populate output/results/v1_regression.json — engine counts `workflowsTested` from this.\n" +
    " • For grouped workflows (one workflow covering multiple rules), declare `source_rules: [...]` in workflow's docstring or sidecar config.",

  production_qc:
    "To advance to finalization:\n" +
    " • Write output/results/production_qc_results.json (preferred shape: {results: {<rule_id>: {<doc_id>: {verdict, evidence, confidence}}}}).\n" +
    // The derive code keeps the LARGEST declared documents_reviewed value and
    // then takes the max against the distinct-document set; it never sums.
    " • OR write output/qc/review_<batch>.json with `documents_reviewed: N` — engine uses the largest declared N across files (values are NOT summed), or the number of distinct reviewed documents it finds if that is higher.\n" +
    " • Engine counts `batchesProcessed` and `documentsReviewed`. Each batch should cover the full doc set OR a meaningful sample.\n" +
    " • If accuracy is below threshold, run evolution-loop on the failing rules before advancing.",

  finalization:
    "(Finalization is the terminal phase — no forward advance.)",
});
|
|
91
|
+
|
|
92
|
+
export default getPrescriptiveHint;
|
|
@@ -80,6 +80,22 @@ function readJsonSafe(p) {
|
|
|
80
80
|
try { return JSON.parse(fs.readFileSync(p, "utf-8")); } catch { return null; }
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
+
// v0.8 P1-A: resolve each candidate relative path against the workspace
// directory, in the order given, and return the absolute path of the first
// one that fileExists() accepts. Returns null when no candidate matches.
//
// Exists because agents write the same artifact to different places
// (资管 v0.7.5 wrote rule_skills/coverage_report.md; 贷款 v0.7.5 wrote
// output/coverage_report.md or similar). Keeping the lookup in one helper
// means each derive function only supplies its list of candidates.
function findFileAcrossKnownPaths(workspaceCwd, relPaths) {
  const firstHit = relPaths.find((candidate) =>
    fileExists(path.join(workspaceCwd, candidate)),
  );
  return firstHit === undefined ? null : path.join(workspaceCwd, firstHit);
}
|
|
98
|
+
|
|
83
99
|
// Best-effort UTF-8 read: returns the file's text, or "" when the read
// throws for any reason (missing file, permissions, path is a directory).
function readFileSafe(p) {
  let text = "";
  try {
    text = fs.readFileSync(p, "utf-8");
  } catch {
    // Deliberately swallowed — callers treat unreadable as empty.
  }
  return text;
}
|
|
@@ -197,15 +213,18 @@ export function deriveRuleExtractionMilestones(workspace) {
|
|
|
197
213
|
}
|
|
198
214
|
}
|
|
199
215
|
|
|
200
|
-
// coverageAudited: presence of
|
|
201
|
-
//
|
|
202
|
-
// because agents pick different conventions; the spirit is "did the
|
|
216
|
+
// coverageAudited: presence of any coverage audit/report doc. Loose
|
|
217
|
+
// criterion — agents pick different conventions; the spirit is "did the
|
|
203
218
|
// agent produce a coverage doc" not "did they put it in this exact file".
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
219
|
+
// v0.8 P1-A: use the same findFileAcrossKnownPaths helper as finalization.
|
|
220
|
+
const coverageAudited = !!findFileAcrossKnownPaths(cwd, [
|
|
221
|
+
path.join("rules", "coverage_audit.md"),
|
|
222
|
+
path.join("rules", "coverage_audit.json"),
|
|
223
|
+
path.join("rules", "coverage_report.md"),
|
|
224
|
+
path.join("output", "coverage_report.md"),
|
|
225
|
+
path.join("rule_skills", "coverage_report.md"), // v0.8 P1-A
|
|
226
|
+
path.join("output", "qc", "coverage_report.md"),
|
|
227
|
+
]);
|
|
209
228
|
|
|
210
229
|
return {
|
|
211
230
|
rulesExtracted,
|
|
@@ -314,13 +333,88 @@ export function deriveSkillAuthoringMilestones(workspace) {
|
|
|
314
333
|
}
|
|
315
334
|
}
|
|
316
335
|
|
|
336
|
+
// v0.8 P2-F (item 22): count stub-shaped check.py files. Pairs with
|
|
337
|
+
// v0.8 P2-A teaching about the inverse-stub anti-pattern. Surfaces
|
|
338
|
+
// a ratio that downstream code (skill-authoring exitCriteriaMet)
|
|
339
|
+
// can choose to enforce via env flag.
|
|
340
|
+
const checkPyAudit = _auditCheckPyShapes(skillsDir);
|
|
341
|
+
|
|
317
342
|
return {
|
|
318
343
|
skillsAuthored,
|
|
319
344
|
skillsWithScripts,
|
|
320
345
|
ruleIdsCovered: [...ruleIdsCovered],
|
|
346
|
+
checkPyTotal: checkPyAudit.total,
|
|
347
|
+
checkPyStubCount: checkPyAudit.stubFiles.length,
|
|
348
|
+
checkPyStubFiles: checkPyAudit.stubFiles,
|
|
349
|
+
checkPyStubRatio: checkPyAudit.total > 0
|
|
350
|
+
? +(checkPyAudit.stubFiles.length / checkPyAudit.total).toFixed(3)
|
|
351
|
+
: 0,
|
|
321
352
|
};
|
|
322
353
|
}
|
|
323
354
|
|
|
355
|
+
// v0.8 P2-F: audit the check scripts under the skills directory.
//
// For every child directory of `skillsDir` whose name does not start with
// "__", collect the scripts reported by findCheckScripts() and test each one
// with _isCheckPyStubShaped(). Returns {total, stubFiles}:
//   - total:     number of check scripts seen
//   - stubFiles: paths (relative to skillsDir) of the scripts flagged as stubs
// A missing skills directory yields {total: 0, stubFiles: []}.
function _auditCheckPyShapes(skillsDir) {
  const audit = { total: 0, stubFiles: [] };
  if (!dirExists(skillsDir)) {
    return { total: audit.total, stubFiles: audit.stubFiles };
  }

  for (const { name: skillName } of listChildDirs(skillsDir)) {
    const isInternalDir = skillName.startsWith("__");
    if (isInternalDir) continue;

    const skillRoot = path.join(skillsDir, skillName);
    for (const checkScript of findCheckScripts(skillRoot)) {
      audit.total += 1;
      if (!_isCheckPyStubShaped(checkScript)) continue;
      audit.stubFiles.push(path.relative(skillsDir, checkScript));
    }
  }

  return { total: audit.total, stubFiles: audit.stubFiles };
}
|
|
383
|
+
|
|
384
|
+
// Decide whether a Python check script looks like a stub.
//
// Returns true only when ALL of the following hold:
//   1. the file is readable,
//   2. it does not import from a `workflows` package/module,
//   3. it contains a stub-shaped dict return — `"verdict": "NOT_APPLICABLE"`,
//      `"pass": None`, or `"method": "stub"`,
//   4. it contains no `"verdict": "PASS" | "FAIL" | "WARNING"` literal and no
//      reference to `make_result`.
// Unreadable files return false (they are not flagged). Line count is
// deliberately not part of the test: a long scaffold whose only return path
// is NOT_APPLICABLE is still a stub by behavior.
function _isCheckPyStubShaped(scriptPath) {
  let content;
  try {
    content = fs.readFileSync(scriptPath, "utf-8");
  } catch {
    return false;
  }

  // Substantive signal: the script imports a workflow (direct delegation).
  // Accepts every import spelling that names the workflows package:
  //   from workflows import x          (package-level; previously missed)
  //   from workflows.r001 import x
  //   import workflows                 (bare; previously missed)
  //   import workflows.r001
  // and allows leading indentation so function-scope imports also count.
  const importsWorkflow =
    /from\s+workflows[.\w]*\s+import|^\s*import\s+workflows\b/m.test(content);
  if (importsWorkflow) return false;

  // Stub return patterns, matched inside a `return { ... }` dict literal.
  const stubReturnPatterns = [
    /return\s+\{[^}]*["']verdict["']\s*:\s*["']NOT_APPLICABLE["']/m,
    /return\s+\{[^}]*["']pass["']\s*:\s*None/m,
    /return\s+\{[^}]*["']method["']\s*:\s*["']stub["']/m,
  ];
  const hasStubReturn = stubReturnPatterns.some((re) => re.test(content));
  if (!hasStubReturn) return false;

  // Any other verdict literal (or the make_result helper) means the file
  // branches on real outcomes even if one path returns a stub value.
  const hasOtherVerdict =
    /["']verdict["']\s*:\s*["'](?:PASS|FAIL|WARNING)["']/m.test(content) ||
    /\bmake_result\b/.test(content);

  return !hasOtherVerdict;
}
|
|
417
|
+
|
|
324
418
|
// ───────────────────────────────────────────────────────────────────
|
|
325
419
|
// skill_testing
|
|
326
420
|
// ───────────────────────────────────────────────────────────────────
|
|
@@ -613,10 +707,45 @@ export function deriveProductionQcMilestones(workspace) {
|
|
|
613
707
|
}
|
|
614
708
|
}
|
|
615
709
|
|
|
710
|
+
// v0.8 P1-A: per-doc QC review files at output/qc/reviews/doc_*.json
|
|
711
|
+
// (贷款 v0.7.5 shape). Each file is a single review object with
|
|
712
|
+
// {review_id, document, verdict}. Engine previously skipped these
|
|
713
|
+
// because they don't match the batch heuristic, causing
|
|
714
|
+
// `documents_reviewed: 0` despite 16 docs on disk.
|
|
715
|
+
const perDocReviewsDir = path.join(outputDir, "qc", "reviews");
|
|
716
|
+
if (dirExists(perDocReviewsDir)) {
|
|
717
|
+
for (const e of listChildFiles(perDocReviewsDir)) {
|
|
718
|
+
if (!e.name.endsWith(".json")) continue;
|
|
719
|
+
const data = readJsonSafe(path.join(perDocReviewsDir, e.name));
|
|
720
|
+
if (!data || typeof data !== "object" || !data.verdict) continue;
|
|
721
|
+
// Document identifier: prefer explicit fields, fall back to filename
|
|
722
|
+
const docKey = data.document || data.doc || data.file || data.path || e.name.replace(/\.json$/, "");
|
|
723
|
+
documentsReviewedSet.add(String(docKey));
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
// v0.8 P1-A: also read numeric `documents_reviewed: N` from any
|
|
728
|
+
// top-level batch file (贷款 review_001.json declares 16 directly).
|
|
729
|
+
// We use this only when the doc set is smaller than the claim — agents
|
|
730
|
+
// sometimes write summary batches without enumerating individual docs.
|
|
731
|
+
let declaredDocCount = 0;
|
|
732
|
+
for (const dir of candidateDirs) {
|
|
733
|
+
if (!dirExists(dir)) continue;
|
|
734
|
+
for (const e of listChildFiles(dir)) {
|
|
735
|
+
if (!e.name.endsWith(".json")) continue;
|
|
736
|
+
const data = readJsonSafe(path.join(dir, e.name));
|
|
737
|
+
if (!data || typeof data !== "object") continue;
|
|
738
|
+
const n = Number(data.documents_reviewed);
|
|
739
|
+
if (Number.isFinite(n) && n > declaredDocCount) declaredDocCount = n;
|
|
740
|
+
}
|
|
741
|
+
}
|
|
742
|
+
const documentsReviewed = Math.max(documentsReviewedSet.size, declaredDocCount);
|
|
743
|
+
|
|
616
744
|
return {
|
|
617
745
|
batchesProcessed,
|
|
618
|
-
documentsReviewed
|
|
746
|
+
documentsReviewed,
|
|
619
747
|
documentsReviewedKeys: [...documentsReviewedSet], // for describeState detail
|
|
748
|
+
documentsReviewedDeclared: declaredDocCount > documentsReviewedSet.size ? declaredDocCount : 0,
|
|
620
749
|
};
|
|
621
750
|
}
|
|
622
751
|
|
|
@@ -658,10 +787,18 @@ export function deriveFinalizationMilestones(workspace) {
|
|
|
658
787
|
}
|
|
659
788
|
}
|
|
660
789
|
|
|
661
|
-
// coverageReportWritten:
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
790
|
+
// coverageReportWritten: accept multiple known agent-write locations.
|
|
791
|
+
// v0.8 P1-A: added rule_skills/coverage_report.md (资管 v0.7.5 wrote here)
|
|
792
|
+
// and coverage_audit.md variants (贷款 v0.7.5 wrote rules/coverage_audit.md).
|
|
793
|
+
// The "coverage doc" concept covers both report-style + audit-style files.
|
|
794
|
+
const coverageReportWritten = !!findFileAcrossKnownPaths(cwd, [
|
|
795
|
+
path.join("rules", "coverage_report.md"),
|
|
796
|
+
path.join("rules", "coverage_audit.md"), // 贷款 v0.7.5
|
|
797
|
+
path.join("rules", "coverage_audit.json"),
|
|
798
|
+
path.join("output", "coverage_report.md"),
|
|
799
|
+
path.join("rule_skills", "coverage_report.md"), // 资管 v0.7.5
|
|
800
|
+
path.join("output", "qc", "coverage_report.md"), // future-proofing
|
|
801
|
+
]);
|
|
665
802
|
|
|
666
803
|
// finalDashboardWritten: at least one dashboards/*.html that is NOT a
|
|
667
804
|
// duplicate of any other. DS + GLM both shipped byte-identical
|
|
@@ -694,11 +831,108 @@ export function deriveFinalizationMilestones(workspace) {
|
|
|
694
831
|
}
|
|
695
832
|
}
|
|
696
833
|
|
|
834
|
+
// v0.8 P0-D: stale-release detection. SOFT gate — surfaces a warning,
|
|
835
|
+
// doesn't refuse phase advance. 资管 audit § 9.1 finding 11 found both
|
|
836
|
+
// release bundles snapped BEFORE the user's "更激进 worker LLM" prompt
|
|
837
|
+
// drove 14 hybrid workflow_v2.py builds, but neither was re-released.
|
|
838
|
+
// We detect by comparing the most-recent release manifest's created_at
|
|
839
|
+
// against the mtimes of workflows/ and rule_skills/.
|
|
840
|
+
const staleRelease = _detectStaleRelease(cwd);
|
|
841
|
+
|
|
697
842
|
return {
|
|
698
843
|
readmeWritten,
|
|
699
844
|
coverageReportWritten,
|
|
700
845
|
finalDashboardWritten,
|
|
701
846
|
dashboardDuplicatesDetected,
|
|
847
|
+
releaseIsStale: staleRelease.isStale,
|
|
848
|
+
staleReleaseDetail: staleRelease.detail,
|
|
849
|
+
};
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
// v0.8 P0-D: detect whether workflows/ or rule_skills/ contain files
// modified after the most-recent release manifest was written.
//
// Returns one of:
//   {isStale: false, detail: null}
//       no output/releases dir, no readable manifest, or nothing newer
//   {isStale: false, detail: {acceptedAt: <slug>}}
//       newer files exist but <release>/.accept_stale_release is present
//   {isStale: true, detail: {releasePath, releaseSlug, releaseCreatedAt,
//                            newerFiles, totalNewerCount, hint}}
//
// `newerFiles` lists at most 10 entries ({path, mtime}) to bound report
// size; `totalNewerCount` is the full number of newer files found, so it can
// exceed `newerFiles.length`.
//
// This function only reports; it does not itself block anything.
function _detectStaleRelease(cwd) {
  const MAX_REPORTED_FILES = 10;
  const releasesRoot = path.join(cwd, "output", "releases");
  if (!dirExists(releasesRoot)) return { isStale: false, detail: null };

  // Find the most-recent release manifest. The manifest's `created_at` is
  // used when it parses to a valid date; otherwise the manifest file's
  // mtime is the fallback.
  let latestRelease = null; // {path, createdAt: Date, slug}
  for (const e of listChildDirs(releasesRoot)) {
    const manifestPath = path.join(releasesRoot, e.name, "manifest.json");
    try {
      const stat = fs.statSync(manifestPath);
      if (!stat.isFile()) continue;
      let createdAt = stat.mtime;
      try {
        const m = JSON.parse(fs.readFileSync(manifestPath, "utf-8"));
        if (m?.created_at) {
          const parsed = new Date(m.created_at);
          if (!Number.isNaN(parsed.getTime())) createdAt = parsed;
        }
      } catch { /* malformed manifest: fall back to mtime */ }
      if (!latestRelease || createdAt > latestRelease.createdAt) {
        latestRelease = { path: manifestPath, createdAt, slug: e.name };
      }
    } catch { /* release dir without a manifest: skip */ }
  }

  if (!latestRelease) return { isStale: false, detail: null };

  // Walk workflows/ and rule_skills/ for files newer than the release.
  // Every match is counted, but only the first MAX_REPORTED_FILES are kept
  // in the report.
  const newerFiles = [];
  let totalNewerCount = 0;
  const cutoff = latestRelease.createdAt.getTime();
  const SCAN_DIRS = ["workflows", "rule_skills"];
  for (const sub of SCAN_DIRS) {
    const root = path.join(cwd, sub);
    if (!dirExists(root)) continue;
    const stack = [root];
    while (stack.length) {
      const d = stack.pop();
      let entries;
      try { entries = fs.readdirSync(d, { withFileTypes: true }); } catch { continue; }
      for (const ent of entries) {
        if (ent.name.startsWith(".") || ent.name === "__pycache__" || ent.name === "node_modules") continue;
        const p = path.join(d, ent.name);
        if (ent.isDirectory()) { stack.push(p); continue; }
        if (!ent.isFile()) continue;
        // Only workflow_v*.py, check.py and SKILL.md/skill.md count as
        // release-relevant; test artifacts and .json files are ignored.
        if (!/(workflow_v\d+\.py|check\.py|SKILL\.md|skill\.md)$/.test(ent.name)) continue;
        try {
          const st = fs.statSync(p);
          if (st.mtimeMs > cutoff) {
            totalNewerCount += 1;
            if (newerFiles.length < MAX_REPORTED_FILES) {
              newerFiles.push({
                path: path.relative(cwd, p),
                mtime: new Date(st.mtimeMs).toISOString(),
              });
            }
          }
        } catch { /* file vanished or unreadable: skip */ }
      }
    }
  }

  if (totalNewerCount === 0) return { isStale: false, detail: null };

  // An .accept_stale_release marker inside the latest release directory
  // suppresses the warning.
  const acceptPath = path.join(cwd, "output", "releases", latestRelease.slug, ".accept_stale_release");
  if (fileExists(acceptPath)) return { isStale: false, detail: { acceptedAt: latestRelease.slug } };

  return {
    isStale: true,
    detail: {
      releasePath: path.relative(cwd, latestRelease.path),
      releaseSlug: latestRelease.slug,
      releaseCreatedAt: latestRelease.createdAt.toISOString(),
      newerFiles,
      totalNewerCount,
      hint: "Workspace artifacts modified after release was built. Either re-run the release tool or write .accept_stale_release into the release dir to acknowledge.",
    },
  };
}
|
|
704
938
|
|
|
@@ -59,6 +59,10 @@ export class SkillAuthoringPipeline extends Pipeline {
|
|
|
59
59
|
this.skillsAuthored = [...m.skillsAuthored];
|
|
60
60
|
this.skillsWithScripts = [...m.skillsWithScripts];
|
|
61
61
|
this.ruleIdsCovered = new Set(m.ruleIdsCovered);
|
|
62
|
+
// v0.8 P2-F (item 22): stub-shape audit for check.py files.
|
|
63
|
+
this._checkPyStubRatio = m.checkPyStubRatio || 0;
|
|
64
|
+
this._checkPyStubFiles = m.checkPyStubFiles || [];
|
|
65
|
+
this._checkPyTotal = m.checkPyTotal || 0;
|
|
62
66
|
}
|
|
63
67
|
|
|
64
68
|
// v0.7.0 A1: ruleId extraction moved to _milestone-derive.js
|
|
@@ -228,7 +232,32 @@ export class SkillAuthoringPipeline extends Pipeline {
|
|
|
228
232
|
this._validationFailures = v.failures;
|
|
229
233
|
this._validationSkipped = v.skipped;
|
|
230
234
|
if (!v.ok) return false;
|
|
231
|
-
|
|
235
|
+
if (this.skillsWithScripts.length < Math.max(1, this.skillsAuthored.length * 0.5)) {
|
|
236
|
+
return false;
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
// v0.8 P2-F (item 22): optional enforcement of check.py substantiveness.
|
|
240
|
+
// SOFT-by-default — the stub ratio is always computed (visible in
|
|
241
|
+
// describeState / events) but only blocks phase advance if
|
|
242
|
+
// KC_ENFORCE_CHECK_PY_SUBSTANTIVE=1 is set. Default-off because
|
|
243
|
+
// the heuristic may over-fire on legitimate scaffolds; v0.8 ships
|
|
244
|
+
// the detection + reporting, v0.8.x revisits enforcement after audit
|
|
245
|
+
// data shows whether the signal is reliable.
|
|
246
|
+
const enforce = process.env.KC_ENFORCE_CHECK_PY_SUBSTANTIVE === "1";
|
|
247
|
+
if (enforce && this._checkPyTotal > 0 && this._checkPyStubRatio > 0.5) {
|
|
248
|
+
this._validationFailures = this._validationFailures || [];
|
|
249
|
+
this._validationFailures.push({
|
|
250
|
+
file: "<check_py_substantiveness>",
|
|
251
|
+
reason:
|
|
252
|
+
`${this._checkPyStubCount || this._checkPyStubFiles.length}/${this._checkPyTotal} check.py files are stub-shaped ` +
|
|
253
|
+
`(return NOT_APPLICABLE / pass:null with no workflow import + ≤20 lines). ` +
|
|
254
|
+
`Examples: ${this._checkPyStubFiles.slice(0, 3).join(", ")}${this._checkPyStubFiles.length > 3 ? "..." : ""}. ` +
|
|
255
|
+
`See skill-authoring SKILL.md anti-pattern section. ` +
|
|
256
|
+
`Set KC_ENFORCE_CHECK_PY_SUBSTANTIVE=0 to bypass this gate if intentional.`,
|
|
257
|
+
});
|
|
258
|
+
return false;
|
|
259
|
+
}
|
|
260
|
+
return true;
|
|
232
261
|
}
|
|
233
262
|
|
|
234
263
|
/**
|
|
@@ -382,8 +382,8 @@ export class AgentTool extends BaseTool {
|
|
|
382
382
|
* B8: List currently-running sub-agents. Called by engine's phase-advance
|
|
383
383
|
* path to emit a `stale_subagents` pipeline event — the main agent's next
|
|
384
384
|
* turn sees the list and decides whether to kill each. Soft signal, not
|
|
385
|
-
* an automated kill
|
|
386
|
-
*
|
|
385
|
+
* an automated kill: coupling the subagent lifecycle to phase advance
|
|
386
|
+
* would amplify blast radius if a transition happened unexpectedly.
|
|
387
387
|
*/
|
|
388
388
|
getRunningTaskIds() {
|
|
389
389
|
return Array.from(this._runningTasks.keys());
|
|
@@ -63,6 +63,21 @@ export class ConsultSkillTool extends BaseTool {
|
|
|
63
63
|
const name = (input?.name || "").trim();
|
|
64
64
|
if (!name) return new ToolResult("name required (e.g. consult_skill({name: 'work-decomposition'}))", true);
|
|
65
65
|
|
|
66
|
+
// v0.8 P0-A: defensive null-check. v0.7.5 shipped with an init-order bug
|
|
67
|
+
// where ConsultSkillTool received undefined skillLoader and threw
|
|
68
|
+
// "Cannot read properties of undefined (reading 'getPhaseSkillSet')"
|
|
69
|
+
// on every invocation (资管 audit § 9.1, 5/5 failure rate). The init-order
|
|
70
|
+
// fix is in engine.js:238; this guard prevents an uncaught exception if
|
|
71
|
+
// the bug recurs from any future constructor reorder.
|
|
72
|
+
if (!this._skillLoader || typeof this._skillLoader.getPhaseSkillSet !== "function") {
|
|
73
|
+
return new ToolResult(
|
|
74
|
+
"consult_skill is misconfigured: skillLoader unavailable. This is an engine-side bug — " +
|
|
75
|
+
"surface to the developer user. The agent should fall back to reading skill bodies " +
|
|
76
|
+
"directly from <workspace>/skills/<name>/SKILL.md or the system prompt's always-loaded section.",
|
|
77
|
+
true,
|
|
78
|
+
);
|
|
79
|
+
}
|
|
80
|
+
|
|
66
81
|
const phase = this._getCurrentPhase ? this._getCurrentPhase() : null;
|
|
67
82
|
const { alwaysLoaded, available } = this._skillLoader.getPhaseSkillSet(phase);
|
|
68
83
|
|
|
@@ -81,11 +81,57 @@ export class DashboardRenderTool extends BaseTool {
|
|
|
81
81
|
metrics.evolution_iterations = fs.readdirSync(evoDir).filter((f) => f.endsWith(".json")).length;
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
+
// v0.8 P1-G: QC counter now reads from multiple known agent-write
|
|
85
|
+
// locations + counts per-doc reviews. Pre-v0.8 read only output/qc/*.json
|
|
86
|
+
// top-level; 资管 v0.7.5 wrote output/results/production_qc_results.json
|
|
87
|
+
// so the dashboard showed `QC Batches: 0` despite 126 pairs of data.
|
|
88
|
+
let qcBatches = 0;
|
|
89
|
+
let qcDocsReviewed = 0;
|
|
90
|
+
|
|
91
|
+
// (a) Top-level batch files in output/qc/ (贷款 v0.7.5 shape)
|
|
84
92
|
const qcDir = path.join(ws, "output", "qc");
|
|
85
93
|
if (fs.existsSync(qcDir)) {
|
|
86
|
-
|
|
94
|
+
for (const f of fs.readdirSync(qcDir).filter((f) => f.endsWith(".json"))) {
|
|
95
|
+
qcBatches++;
|
|
96
|
+
try {
|
|
97
|
+
const data = JSON.parse(fs.readFileSync(path.join(qcDir, f), "utf-8"));
|
|
98
|
+
const n = Number(data?.documents_reviewed);
|
|
99
|
+
if (Number.isFinite(n) && n > qcDocsReviewed) qcDocsReviewed = n;
|
|
100
|
+
} catch { /* skip malformed */ }
|
|
101
|
+
}
|
|
87
102
|
}
|
|
88
103
|
|
|
104
|
+
// (b) Per-doc reviews at output/qc/reviews/ (贷款 detail shape)
|
|
105
|
+
const reviewsDir = path.join(ws, "output", "qc", "reviews");
|
|
106
|
+
if (fs.existsSync(reviewsDir)) {
|
|
107
|
+
const reviewFiles = fs.readdirSync(reviewsDir).filter((f) => f.endsWith(".json"));
|
|
108
|
+
qcDocsReviewed = Math.max(qcDocsReviewed, reviewFiles.length);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// (c) production_qc_results.json shape (资管 v0.7.5)
|
|
112
|
+
const productionQc = path.join(ws, "output", "results", "production_qc_results.json");
|
|
113
|
+
if (fs.existsSync(productionQc)) {
|
|
114
|
+
qcBatches++;
|
|
115
|
+
try {
|
|
116
|
+
const data = JSON.parse(fs.readFileSync(productionQc, "utf-8"));
|
|
117
|
+
const totalDocs = Number(data?.total_docs);
|
|
118
|
+
if (Number.isFinite(totalDocs)) qcDocsReviewed = Math.max(qcDocsReviewed, totalDocs);
|
|
119
|
+
// Otherwise, dedup doc keys from nested results
|
|
120
|
+
if (!Number.isFinite(totalDocs) && data?.results && typeof data.results === "object") {
|
|
121
|
+
const docSet = new Set();
|
|
122
|
+
for (const docs of Object.values(data.results)) {
|
|
123
|
+
if (docs && typeof docs === "object") {
|
|
124
|
+
for (const k of Object.keys(docs)) docSet.add(k);
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
if (docSet.size > 0) qcDocsReviewed = Math.max(qcDocsReviewed, docSet.size);
|
|
128
|
+
}
|
|
129
|
+
} catch { /* skip */ }
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
metrics.qc_batches = qcBatches;
|
|
133
|
+
metrics.qc_docs_reviewed = qcDocsReviewed;
|
|
134
|
+
|
|
89
135
|
return metrics;
|
|
90
136
|
}
|
|
91
137
|
|
|
@@ -126,6 +172,7 @@ th { color: #737373; font-size: 0.85em; }
|
|
|
126
172
|
<div class="metric"><span class="value">${total}</span><br><span class="label">Results</span></div>
|
|
127
173
|
<div class="metric"><span class="value">${metrics.evolution_iterations}</span><br><span class="label">Evolution Cycles</span></div>
|
|
128
174
|
<div class="metric"><span class="value">${metrics.qc_batches}</span><br><span class="label">QC Batches</span></div>
|
|
175
|
+
<div class="metric"><span class="value">${metrics.qc_docs_reviewed || 0}</span><br><span class="label">Docs Reviewed</span></div>
|
|
129
176
|
</div>
|
|
130
177
|
<h2>Confidence Distribution</h2>
|
|
131
178
|
<div class="card">
|
|
@@ -12,14 +12,43 @@ const MIN_CHARS_PER_PAGE = 50;
|
|
|
12
12
|
* Level 3: OCR models via SiliconFlow — fallback via vision models
|
|
13
13
|
*/
|
|
14
14
|
export class DocumentParseTool extends BaseTool {
|
|
15
|
-
|
|
15
|
+
/**
|
|
16
|
+
* @param {object} workspace
|
|
17
|
+
* @param {object} opts
|
|
18
|
+
* @param {string} [opts.mineruApiUrl]
|
|
19
|
+
* @param {string} [opts.mineruApiKey]
|
|
20
|
+
* @param {string} [opts.llmApiKey]
|
|
21
|
+
* @param {string} [opts.llmBaseUrl]
|
|
22
|
+
* @param {string} [opts.ocrModel] — static fallback (legacy)
|
|
23
|
+
* @param {() => string} [opts.getOcrModel] — v0.8.1 P9-B: live-read
|
|
24
|
+
* callback. If provided, takes precedence over `ocrModel`. The
|
|
25
|
+
* constructor used to capture vlmTier1 once at engine startup, but
|
|
26
|
+
* workspace_env_overlay (P1-B) fires AFTER tool construction in
|
|
27
|
+
* some flows (e.g. agent edits .env mid-run, OR overlay applies on
|
|
28
|
+
* a subagent's engine but parent already cached the gc default).
|
|
29
|
+
* E2E #11 资管 v0.8 audit found document_parse errors quoting
|
|
30
|
+
* Qwen3-VL-235B-A22B-Instruct (gc default) even though .env set
|
|
31
|
+
* OCR_MODEL_TIER1=zai-org/GLM-4.6V — the overlay applied 5 min
|
|
32
|
+
* after first failed call. Live-read fixes the race.
|
|
33
|
+
*/
|
|
34
|
+
constructor(workspace, { mineruApiUrl, mineruApiKey, llmApiKey, llmBaseUrl, ocrModel, getOcrModel } = {}) {
|
|
16
35
|
super();
|
|
17
36
|
this._workspace = workspace;
|
|
18
37
|
this._mineruApiUrl = mineruApiUrl || "";
|
|
19
38
|
this._mineruApiKey = mineruApiKey || "";
|
|
20
39
|
this._vlmApiKey = llmApiKey || "";
|
|
21
40
|
this._vlmBaseUrl = (llmBaseUrl || "").replace(/\/+$/, "");
|
|
22
|
-
this.
|
|
41
|
+
this._ocrModelStatic = ocrModel || "";
|
|
42
|
+
this._getOcrModel = typeof getOcrModel === "function" ? getOcrModel : null;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/** Read ocrModel live (P9-B) or fall back to the static value captured at construction. */
|
|
46
|
+
get _ocrModel() {
|
|
47
|
+
if (this._getOcrModel) {
|
|
48
|
+
try { return this._getOcrModel() || this._ocrModelStatic; }
|
|
49
|
+
catch { return this._ocrModelStatic; }
|
|
50
|
+
}
|
|
51
|
+
return this._ocrModelStatic;
|
|
23
52
|
}
|
|
24
53
|
|
|
25
54
|
get name() { return "document_parse"; }
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { BaseTool, ToolResult } from "./base.js";
|
|
2
2
|
import { Phase } from "../pipelines/index.js";
|
|
3
|
+
import { getPrescriptiveHint } from "../pipelines/_advance-hints.js";
|
|
3
4
|
|
|
4
5
|
const VALID_PHASES = new Set(Object.values(Phase));
|
|
5
6
|
|
|
@@ -72,12 +73,12 @@ export class PhaseAdvanceTool extends BaseTool {
|
|
|
72
73
|
|
|
73
74
|
const beforePhase = this._getCurrentPhase();
|
|
74
75
|
// H1: short-circuit the "already in target" case with an informational
|
|
75
|
-
// message —
|
|
76
|
-
//
|
|
77
|
-
//
|
|
76
|
+
// message — agent was trying to advance correctly, engine was already
|
|
77
|
+
// there (from a prior pipeline_event-driven advance or an earlier
|
|
78
|
+
// explicit call). Treat as success, not refusal.
|
|
78
79
|
if (beforePhase && beforePhase === to) {
|
|
79
80
|
return new ToolResult(
|
|
80
|
-
`Already in phase ${to} (engine
|
|
81
|
+
`Already in phase ${to} (engine was already there from a prior advance). Proceed with phase-appropriate work.`,
|
|
81
82
|
);
|
|
82
83
|
}
|
|
83
84
|
|
|
@@ -126,18 +127,21 @@ export class PhaseAdvanceTool extends BaseTool {
|
|
|
126
127
|
// exactly which milestones the gate is reading and can satisfy them.
|
|
127
128
|
// E2E #6 v070 showed the generic "check /status" hint wasn't concrete
|
|
128
129
|
// enough — agents forced through. Naming the gap inline reduces that.
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
130
|
+
// v0.8 P0-E: prescriptive refusal hint — name the artifacts the agent
|
|
131
|
+
// needs to produce, derived from the same paths _milestone-derive.js
|
|
132
|
+
// walks. Replaces the v0.7.x descriptive "check /status" message that
|
|
133
|
+
// 资管 + 贷款 v0.7.5 audits showed agents force-bypassing.
|
|
134
|
+
const prescriptive = getPrescriptiveHint(
|
|
135
|
+
beforePhase,
|
|
136
|
+
advanceResult?.engineCounts,
|
|
137
|
+
advanceResult?.engineCounts || "",
|
|
138
|
+
);
|
|
132
139
|
|
|
133
140
|
return new ToolResult(
|
|
134
141
|
`Did not advance to ${to} (currently in ${beforePhase || "?"}). ` +
|
|
135
|
-
`Likely cause: source-phase exit criteria not met
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
`the engine derives milestones from filesystem facts (rule_skills/<id>/SKILL.md, check.py, ` +
|
|
139
|
-
`workflows/<id>/*.py, output/results/*.json, etc.). ` +
|
|
140
|
-
`If the transition is non-adjacent or this phase truly is done despite the gate, ` +
|
|
142
|
+
`Likely cause: source-phase exit criteria not met.\n\n` +
|
|
143
|
+
prescriptive +
|
|
144
|
+
`\n\nIf the transition is non-adjacent or this phase truly is done despite the gate, ` +
|
|
141
145
|
`re-call with the documented schema flag. The engine logged the precise reason in ` +
|
|
142
146
|
`events.jsonl as 'phase_advance_refused'.`,
|
|
143
147
|
false,
|