@hongmaple0820/scale-engine 0.18.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/README.en.md +310 -237
  2. package/README.md +255 -63
  3. package/dist/api/cli.js +2656 -1258
  4. package/dist/api/cli.js.map +1 -1
  5. package/dist/api/doctor.d.ts +4 -1
  6. package/dist/api/doctor.js +85 -1
  7. package/dist/api/doctor.js.map +1 -1
  8. package/dist/api/quickstart.d.ts +3 -0
  9. package/dist/api/quickstart.js +9 -4
  10. package/dist/api/quickstart.js.map +1 -1
  11. package/dist/cli/phaseCommands.js +7 -0
  12. package/dist/cli/phaseCommands.js.map +1 -1
  13. package/dist/codegraph/CodeIntelligence.d.ts +135 -0
  14. package/dist/codegraph/CodeIntelligence.js +460 -0
  15. package/dist/codegraph/CodeIntelligence.js.map +1 -0
  16. package/dist/context/ContextBudget.d.ts +90 -0
  17. package/dist/context/ContextBudget.js +322 -0
  18. package/dist/context/ContextBudget.js.map +1 -0
  19. package/dist/eval/WorkflowEval.d.ts +161 -0
  20. package/dist/eval/WorkflowEval.js +379 -0
  21. package/dist/eval/WorkflowEval.js.map +1 -0
  22. package/dist/governance/GovernanceRoi.d.ts +25 -0
  23. package/dist/governance/GovernanceRoi.js +70 -0
  24. package/dist/governance/GovernanceRoi.js.map +1 -0
  25. package/dist/governance/ProgressiveGovernance.d.ts +22 -0
  26. package/dist/governance/ProgressiveGovernance.js +159 -0
  27. package/dist/governance/ProgressiveGovernance.js.map +1 -0
  28. package/dist/index.d.ts +2 -0
  29. package/dist/index.js +4 -0
  30. package/dist/index.js.map +1 -1
  31. package/dist/memory/MemoryBrain.d.ts +135 -0
  32. package/dist/memory/MemoryBrain.js +635 -0
  33. package/dist/memory/MemoryBrain.js.map +1 -0
  34. package/dist/memory/MemoryFabric.d.ts +118 -0
  35. package/dist/memory/MemoryFabric.js +281 -0
  36. package/dist/memory/MemoryFabric.js.map +1 -0
  37. package/dist/memory/MemoryLearning.d.ts +61 -0
  38. package/dist/memory/MemoryLearning.js +203 -0
  39. package/dist/memory/MemoryLearning.js.map +1 -0
  40. package/dist/memory/index.d.ts +3 -0
  41. package/dist/memory/index.js +4 -0
  42. package/dist/memory/index.js.map +1 -0
  43. package/dist/output/GovernanceDashboard.d.ts +57 -0
  44. package/dist/output/GovernanceDashboard.js +250 -0
  45. package/dist/output/GovernanceDashboard.js.map +1 -0
  46. package/dist/output/HTMLArtifactLayer.js +31 -31
  47. package/dist/output/index.d.ts +2 -0
  48. package/dist/output/index.js +1 -0
  49. package/dist/output/index.js.map +1 -1
  50. package/dist/prompts/VibeTemplateGallery.js +121 -121
  51. package/dist/runtime/FinalReportGuard.d.ts +16 -0
  52. package/dist/runtime/FinalReportGuard.js +14 -0
  53. package/dist/runtime/FinalReportGuard.js.map +1 -0
  54. package/dist/runtime/RuntimeDoctor.d.ts +23 -0
  55. package/dist/runtime/RuntimeDoctor.js +151 -0
  56. package/dist/runtime/RuntimeDoctor.js.map +1 -0
  57. package/dist/runtime/RuntimeEvidenceLedger.d.ts +50 -0
  58. package/dist/runtime/RuntimeEvidenceLedger.js +89 -0
  59. package/dist/runtime/RuntimeEvidenceLedger.js.map +1 -0
  60. package/dist/runtime/SessionLedger.d.ts +53 -0
  61. package/dist/runtime/SessionLedger.js +104 -0
  62. package/dist/runtime/SessionLedger.js.map +1 -0
  63. package/dist/runtime/index.d.ts +4 -0
  64. package/dist/runtime/index.js +5 -0
  65. package/dist/runtime/index.js.map +1 -0
  66. package/dist/skills/SkillRadar.d.ts +83 -0
  67. package/dist/skills/SkillRadar.js +384 -0
  68. package/dist/skills/SkillRadar.js.map +1 -0
  69. package/dist/workflow/EngineeringStandards.js +69 -66
  70. package/dist/workflow/EngineeringStandards.js.map +1 -1
  71. package/dist/workflow/GovernanceTemplatePacks.js +126 -126
  72. package/dist/workflow/GovernanceTemplates.d.ts +1 -1
  73. package/dist/workflow/GovernanceTemplates.js +500 -229
  74. package/dist/workflow/GovernanceTemplates.js.map +1 -1
  75. package/dist/workflow/ResourceGovernance.js +27 -18
  76. package/dist/workflow/ResourceGovernance.js.map +1 -1
  77. package/dist/workflow/VerificationCommands.d.ts +11 -0
  78. package/dist/workflow/VerificationCommands.js +2 -0
  79. package/dist/workflow/VerificationCommands.js.map +1 -1
  80. package/dist/workflow/VerificationProfile.d.ts +2 -1
  81. package/dist/workflow/VerificationProfile.js +3 -0
  82. package/dist/workflow/VerificationProfile.js.map +1 -1
  83. package/dist/workflow/WorkflowArtifactWriter.js +2 -1
  84. package/dist/workflow/WorkflowArtifactWriter.js.map +1 -1
  85. package/dist/workflow/WorkflowEngine.js +4 -1
  86. package/dist/workflow/WorkflowEngine.js.map +1 -1
  87. package/dist/workflow/WorkspaceSafety.d.ts +9 -0
  88. package/dist/workflow/WorkspaceSafety.js +49 -0
  89. package/dist/workflow/WorkspaceSafety.js.map +1 -0
  90. package/dist/workflow/gates/GateSystem.d.ts +12 -1
  91. package/dist/workflow/gates/GateSystem.js +106 -0
  92. package/dist/workflow/gates/GateSystem.js.map +1 -1
  93. package/dist/workflow/types.d.ts +1 -1
  94. package/docs/CODE_INTELLIGENCE.md +138 -0
  95. package/docs/CONTEXT_BUDGET.md +87 -0
  96. package/docs/GOVERNANCE_DASHBOARD.md +69 -0
  97. package/docs/MEMORY_BRAIN.md +104 -0
  98. package/docs/MEMORY_FABRIC.md +107 -0
  99. package/docs/README.md +76 -0
  100. package/docs/RUNTIME_EVIDENCE.md +101 -0
  101. package/docs/SKILL_RADAR.md +115 -0
  102. package/docs/WORKFLOW_EVAL.md +151 -0
  103. package/docs/start/README.md +42 -0
  104. package/docs/start/agent-governance-demo.md +107 -0
  105. package/docs/start/quickstart.md +127 -0
  106. package/examples/demo-projects/agent-governance-demo/README.md +37 -0
  107. package/examples/demo-projects/agent-governance-demo/package.json +16 -0
  108. package/examples/demo-projects/agent-governance-demo/src/oauth-state.ts +39 -0
  109. package/examples/demo-projects/agent-governance-demo/tests/oauth-state.test.ts +52 -0
  110. package/package.json +14 -3
@@ -0,0 +1,104 @@
1
+ # Memory Brain
2
+
3
+ Memory Brain is SCALE's project-scoped long-term memory layer. It is separate from Memory Fabric:
4
+
5
+ - Memory Fabric builds a compact context pack for the current task.
6
+ - Memory Brain stores reviewed project knowledge with evidence, confidence, scope, and contradiction checks.
7
+
8
+ The first version is local-first and uses SQLite:
9
+
10
+ ```text
11
+ .scale/memory/brain.sqlite
12
+ .scale/memory/brain-manifest.json
13
+ ```
14
+
15
+ ## Commands
16
+
17
+ ```bash
18
+ scale memory ingest --from evidence --task-id <task-id>
19
+ scale memory ingest --from candidate --candidate-id <candidate-id>
20
+ scale memory ingest --from failure --failure-id <failure-replay-id>
21
+ scale memory query "OAuth callback state design"
22
+ scale memory contradictions
23
+ scale memory dream
24
+ scale memory promote <memory-node-id-or-candidate-id>
25
+ scale memory export --output .scale/memory/export.jsonl
26
+ scale memory import .scale/memory/export.jsonl
27
+ ```
28
+
29
+ ## Node Contract
30
+
31
+ ```ts
32
+ interface MemoryNode {
33
+ id: string
34
+ type: 'fact' | 'decision' | 'incident' | 'relation' | 'contradiction'
35
+ title: string
36
+ summary: string
37
+ entities: string[]
38
+ source: 'runtime-evidence' | 'task-artifact' | 'docs' | 'git' | 'manual'
39
+ evidencePaths: string[]
40
+ confidence: number
41
+ scope: 'project' | 'workspace' | 'global-candidate'
42
+ status: 'candidate' | 'active' | 'stale' | 'rejected'
43
+ createdAt: string
44
+ updatedAt: string
45
+ lastVerifiedAt?: string
46
+ }
47
+ ```
48
+
49
+ ## Evidence Rule
50
+
51
+ Active memory must have at least one evidence path. SCALE blocks promotion of any memory node that has no evidence path.
52
+
53
+ Runtime evidence and learning candidates are ingested as `candidate` records first. `scale memory promote` is the explicit boundary where reviewed memory becomes active.
54
+
55
+ Failure replay records can also be ingested as `incident` candidates:
56
+
57
+ ```bash
58
+ scale eval run --suite workflow-baseline
59
+ scale eval failures --since 30d
60
+ scale memory ingest --from failure --failure-id <failure-replay-id>
61
+ scale memory promote <memory-node-id>
62
+ ```
63
+
64
+ This connects Eval Harness failures to long-term memory without automatically rewriting project standards. A failure becomes active memory only after promotion and only if the replay artifact is present as evidence.
65
+
66
+ ## Scope Rule
67
+
68
+ Project memory stays project-scoped by default. `global-candidate` is allowed for export and review, but it cannot be activated inside a project brain. This prevents one project's temporary truth from becoming a global rule.
69
+
70
+ ## Contradiction Rule
71
+
72
+ `scale memory contradictions` reports conflicts instead of resolving them automatically. Examples:
73
+
74
+ - one memory says a provider is enabled, another says it is disabled
75
+ - one memory says a route exists, another says it is missing
76
+ - one memory says an operation is allowed, another says it is blocked
77
+
78
+ The command exits non-zero when active contradictions exist.
79
+
80
+ ## Dream Maintenance
81
+
82
+ `scale memory dream` is a maintenance pass. It reports:
83
+
84
+ - promotion candidates
85
+ - stale active memories
86
+ - duplicate groups
87
+ - contradictions
88
+ - suggested docs to update
89
+ - active memories missing evidence
90
+
91
+ It does not auto-promote standards, rewrite docs, or delete memories.
92
+
93
+ ## Resource Lifecycle
94
+
95
+ Memory Brain files under `.scale/memory/` are local runtime state by default. Commit only curated exports, documented decisions, or task artifacts that were intentionally reviewed.
96
+
97
+ Recommended flow:
98
+
99
+ ```text
100
+ runtime evidence -> memory settle -> memory ingest -> memory promote -> docs/standards update when stable
101
+ eval failure replay -> memory ingest --from failure -> memory promote -> workflow rule update when stable
102
+ ```
103
+
104
+ This keeps memory useful without turning every session observation into permanent project truth.
@@ -0,0 +1,107 @@
1
+ # Memory Fabric
2
+
3
+ Memory Fabric 是 SCALE 用来降低长会话 token 消耗、提升 Agent 记忆质量的上下文压缩层。它不会把所有历史文档都塞回提示词,而是按任务范围生成一个可审计的 context pack。
4
+
5
+ 它聚合四类信息:
6
+
7
+ - Runtime Evidence:真实运行过的命令、工具、浏览器、skill、MCP 和人工验证证据。
8
+ - Session Events:当前会话的阶段、工具使用和证据写入事件。
9
+ - Knowledge Recall:从项目知识库召回已验证经验、规则和历史教训。
10
+ - Project Graph:检测 `graphify-out/GRAPH_REPORT.md` 或 `.scale/graph/manifest.json`,只引用图谱状态和摘要,不把大型图谱全文塞进上下文。
11
+
12
+ ## 基本命令
13
+
14
+ 生成上下文包:
15
+
16
+ ```bash
17
+ scale memory pack \
18
+ --task-id 2026-05-18-runtime-evidence \
19
+ --session-id 2026-05-18-runtime-evidence \
20
+ --task "继续实现 runtime evidence 与最终交付检查" \
21
+ --level M \
22
+ --files src/runtime,src/api/cli.ts \
23
+ --budget 4000
24
+ ```
25
+
26
+ 输出 JSON,便于其他 Agent、CLI 或评审工具读取:
27
+
28
+ ```bash
29
+ scale memory pack \
30
+ --task "修复 OAuth callback state 过期处理" \
31
+ --level M \
32
+ --budget 4000 \
33
+ --json
34
+ ```
35
+
36
+ 检查上下文预算:
37
+
38
+ ```bash
39
+ scale memory doctor \
40
+ --task "跨模块权限重构" \
41
+ --level L \
42
+ --budget 3000
43
+ ```
44
+
45
+ 把完成任务后的运行证据沉淀成学习候选:
46
+
47
+ ```bash
48
+ scale memory settle \
49
+ --task-id 2026-05-18-runtime-evidence \
50
+ --session-id 2026-05-18-runtime-evidence \
51
+ --task "继续实现 runtime evidence 与最终交付检查" \
52
+ --level M \
53
+ --budget 4000
54
+ ```
55
+
56
+ `settle` 会写入:
57
+
58
+ ```text
59
+ .scale/memory/learning-candidates/<candidate-id>.json
60
+ .scale/memory/learning-candidates/<candidate-id>.md
61
+ ```
62
+
63
+ 这些文件是本地运行时学习候选,默认不应该直接提交到 Git。它们的作用是让人类或评审 Agent 判断“这条经验是否值得进入长期知识库、工程规范或模块文档”。
64
+
65
+ ## 预算策略
66
+
67
+ Memory Fabric 使用估算 token 预算控制上下文规模。优先级从高到低:
68
+
69
+ 1. Runtime Evidence:失败证据和通过证据优先保留。
70
+ 2. Session Events:最近会话事件优先保留。
71
+ 3. Knowledge Recall:按任务描述和文件范围召回 Top K 知识。
72
+ 4. Project Graph:只保留图谱报告路径和短摘要。
73
+
74
+ 当预算不足时,低优先级 section 会被标记为 omitted,并写入原因。这样 Agent 能知道哪些上下文被刻意裁剪,而不是误以为项目没有相关信息。
75
+
76
+ ## 与知识库和自我进化的关系
77
+
78
+ Memory Fabric 不替代知识库。它是知识库、运行证据和图谱之间的读取层:
79
+
80
+ - Runtime Evidence 记录“这次实际做过什么”。
81
+ - Knowledge Base 记录“长期可复用的经验和规则”。
82
+ - Graphify 或项目图谱记录“模块之间的结构关系”。
83
+ - Memory Fabric 在每次任务开始、恢复、评审或发版前,生成本次最相关的上下文包。
84
+
85
+ 任务完成后,应该把真正稳定的经验沉淀到知识库或长期维护文档中;`.scale/events/` 和 `.scale/evidence/` 仍然是本地运行时产物,不应默认提交到 Git。
86
+
87
+ 新的推荐闭环是:
88
+
89
+ ```text
90
+ runtime evidence -> memory pack -> memory settle -> 人审 -> knowledge/docs/rules
91
+ ```
92
+
93
+ 也就是说,Memory Fabric 先把证据和上下文压缩成候选,不会自动把一次会话里的判断升级成长期规则。存在失败证据时,候选会标记为 `resolve-failures-first`,避免把未闭环问题沉淀成“经验”。
94
+
95
+ ## 推荐使用场景
96
+
97
+ - 长会话恢复前:先生成 context pack,避免重复读大量文档。
98
+ - 多 Agent 协作前:把 context pack 交给审查 Agent 或测试 Agent。
99
+ - 发版前:用 runtime evidence 和 session events 检查是否存在未闭环失败。
100
+ - 任务结束后:用 `memory settle` 生成学习候选,再决定是否进入知识库、模块文档或工程规范。
101
+ - 大型项目治理:结合 service matrix、resource governance 和 engineering standards,生成任务相关而不是全仓库噪声上下文。
102
+
103
+ ## 当前边界
104
+
105
+ - 当前版本不内置向量数据库;如果项目配置了 SQLite knowledge base,会使用现有召回接口。
106
+ - 当前版本只检测 Graphify 产物是否存在并生成摘要,不主动运行 Graphify。
107
+ - HTML 可视化报告适合后续加在 context pack 之上;Memory Fabric 的核心产物先保持 JSON/Markdown,方便 diff、测试和 CLI 集成。
package/docs/README.md ADDED
@@ -0,0 +1,76 @@
1
+ # SCALE Engine 文档地图
2
+
3
+ 这个目录同时包含用户指南、治理能力说明、架构参考、历史规划和推广素材。新用户应优先阅读入门入口和当前治理能力文档,历史规划仅作为背景材料。
4
+
5
+ ## 新用户入口
6
+
7
+ | 文档 | 说明 |
8
+ | --- | --- |
9
+ | [start/README.md](start/README.md) | 入门路径总览 |
10
+ | [start/quickstart.md](start/quickstart.md) | 3 分钟快速开始 |
11
+ | [start/agent-governance-demo.md](start/agent-governance-demo.md) | 官方 demo walkthrough |
12
+ | [../README.md](../README.md) | 项目主页和能力总览 |
13
+
14
+ ## 当前治理能力
15
+
16
+ | 文档 | 说明 |
17
+ | --- | --- |
18
+ | [RESOURCE_GOVERNANCE.md](RESOURCE_GOVERNANCE.md) | 文档、报告、媒体、脚本、临时产物的生命周期治理 |
19
+ | [ENGINEERING_STANDARDS.md](ENGINEERING_STANDARDS.md) | 日志、安全、ORM、框架、测试、部署等工程规范 |
20
+ | [TOOL_ORCHESTRATION.md](TOOL_ORCHESTRATION.md) | skills、MCP、CLI、浏览器、桌面自动化的编排策略 |
21
+ | [RUNTIME_EVIDENCE.md](RUNTIME_EVIDENCE.md) | 会话 ledger、运行时证据和最终交付检查 |
22
+ | [MEMORY_FABRIC.md](MEMORY_FABRIC.md) | Runtime evidence、session events、knowledge recall 和 graph status 的预算化上下文包 |
23
+ | [MEMORY_BRAIN.md](MEMORY_BRAIN.md) | 证据驱动的长期记忆、矛盾检测、dream 整理和 failure replay 沉淀 |
24
+ | [CONTEXT_BUDGET.md](CONTEXT_BUDGET.md) | Context Budget、Progressive Governance、Lazy Loading 和 Governance ROI |
25
+ | [CODE_INTELLIGENCE.md](CODE_INTELLIGENCE.md) | CodeGraph、Graphify 和显式 fallback 的代码智能与探索 ROI |
26
+ | [WORKFLOW_EVAL.md](WORKFLOW_EVAL.md) | Workflow Eval、pass@k 指标、Failure Replay 和改进候选 |
27
+ | [SKILL_RADAR.md](SKILL_RADAR.md) | Skill Radar、能力置信度、证据要求和供应链安全检查 |
28
+ | [GOVERNANCE_DASHBOARD.md](GOVERNANCE_DASHBOARD.md) | Runtime、eval、memory、resource、HTML artifact 的统一治理面板 |
29
+ | [RELEASE_READINESS.md](RELEASE_READINESS.md) | 发版前质量门槛、官方 demo 和真实项目落地验收 |
30
+ | [SKILL-REPOSITORY.md](SKILL-REPOSITORY.md) | 受治理 skill repository 和安装安全策略 |
31
+ | [VIBE-TEMPLATES.md](VIBE-TEMPLATES.md) | 可复制的 Vibe Coding 提示词模板 |
32
+ | [LEADERSHIP-PRESETS.md](LEADERSHIP-PRESETS.md) | CEO、CTO、PM、Architect 等内置领导者角色预设 |
33
+
34
+ ## 架构与参考
35
+
36
+ | 文档 | 说明 |
37
+ | --- | --- |
38
+ | [00-OVERVIEW.md](00-OVERVIEW.md) | 系统概览 |
39
+ | [01-ARCHITECTURE.md](01-ARCHITECTURE.md) | 架构设计 |
40
+ | [02-DATA-MODEL.md](02-DATA-MODEL.md) | 数据模型 |
41
+ | [03-CORE-MODULES.md](03-CORE-MODULES.md) | 核心模块 |
42
+ | [04-INTEGRATION.md](04-INTEGRATION.md) | 平台与集成 |
43
+ | [06-DECISIONS.md](06-DECISIONS.md) | 架构决策记录 |
44
+
45
+ ## 历史规划和过程记录
46
+
47
+ 这些文档是历史上下文,不一定代表当前产品入口:
48
+
49
+ | 文档 | 说明 |
50
+ | --- | --- |
51
+ | [05-ROADMAP.md](05-ROADMAP.md) | 路线图 |
52
+ | [OPTIMIZATION_PLAN.md](OPTIMIZATION_PLAN.md) | 历史优化计划 |
53
+ | [WEEK1-2-REPORT.md](WEEK1-2-REPORT.md) | 阶段报告 |
54
+ | [TASK_GUARD_SUMMARY.md](TASK_GUARD_SUMMARY.md) | Task Guard 总结 |
55
+ | [TASK_GUARD_WORKFLOW_DEMO.md](TASK_GUARD_WORKFLOW_DEMO.md) | 早期 workflow demo |
56
+ | [plans/2026-05-19-agent-engineering-os-upgrade-plan.md](plans/2026-05-19-agent-engineering-os-upgrade-plan.md) | Agent Engineering OS 升级审核稿:Context Budget、CodeGraph、Memory Brain、Skill Radar、HTML Artifact 和 Eval Harness |
57
+ | [plans/](plans/) | 规划方案和技术方案归档 |
58
+ | [superpowers/](superpowers/) | 外部方法论对照和计划归档 |
59
+
60
+ ## 推广和素材
61
+
62
+ | 文档 | 说明 |
63
+ | --- | --- |
64
+ | [promote-article-v2.md](promote-article-v2.md) | 推广文章草稿 v2 |
65
+ | [promote-article-v2.html](promote-article-v2.html) | 推广文章 HTML v2 |
66
+ | [promote-article-v3.md](promote-article-v3.md) | 推广文章草稿 v3 |
67
+ | [promote-article-v3.html](promote-article-v3.html) | 推广文章 HTML v3 |
68
+ | [imgs/](imgs/) | 社群二维码和推广图片 |
69
+
70
+ ## 维护规则
71
+
72
+ - 面向新用户的文档优先放在 `docs/start/`。
73
+ - 当前可执行能力放在根 README 和当前治理能力文档中。
74
+ - 历史规划不要混入新手教程,避免用户把旧计划当成当前事实。
75
+ - 如果 CLI 行为变化,必须同步更新 `README.md`、`docs/start/quickstart.md` 和相关 reference 文档。
76
+ - 如果新增 governance pack,必须同时更新 `README.md`、`docs/start/README.md` 和对应测试。
@@ -0,0 +1,101 @@
1
+ # Runtime Evidence
2
+
3
+ Runtime Evidence 是 SCALE 用来记录 Agent 实际做过什么的运行时证据层。它的目标很直接:没有真实命令、工具、浏览器、skill 或人工验证证据时,Agent 不能声称任务已经完成。
4
+
5
+ 它和现有证据层的关系:
6
+
7
+ - Gate evidence:回答 build、lint、test、security、review 等门禁是否通过。
8
+ - Tool evidence:回答必需的 skill、MCP、浏览器、桌面自动化或 CLI 工具是否执行过。
9
+ - Runtime evidence:回答当前会话是否具备可信的最终交付证据。
10
+
11
+ ## 存储位置
12
+
13
+ Runtime 数据写入 SCALE 已忽略的本地运行时目录:
14
+
15
+ ```text
16
+ .scale/
17
+ ├── events/
18
+ │ ├── current-session.json
19
+ │ └── sessions/<session-id>.jsonl
20
+ └── evidence/
21
+ └── runtime/<evidence-id>.json
22
+ ```
23
+
24
+ 这些文件默认是本地运行时产物,不应该提交到 Git。需要长期保留时,应把摘要沉淀到任务 summary、ADR、README 或模块文档中,而不是直接提交原始日志。
25
+
26
+ ## 基本流程
27
+
28
+ 启动会话:
29
+
30
+ ```bash
31
+ scale runtime start \
32
+ --session-id 2026-05-18-runtime-evidence \
33
+ --task-id 2026-05-18-runtime-evidence \
34
+ --level M \
35
+ --agent codex
36
+ ```
37
+
38
+ 在真实命令、门禁、浏览器验证、skill 执行、MCP 调用或人工检查之后记录证据:
39
+
40
+ ```bash
41
+ scale runtime record \
42
+ --title "build" \
43
+ --kind command \
44
+ --status passed \
45
+ --command "npm run build" \
46
+ --exit-code 0 \
47
+ --summary "TypeScript build passed"
48
+ ```
49
+
50
+ 检查是否允许最终交付:
51
+
52
+ ```bash
53
+ scale runtime final-check \
54
+ --task-id 2026-05-18-runtime-evidence \
55
+ --session-id 2026-05-18-runtime-evidence \
56
+ --level M
57
+ ```
58
+
59
+ 检查运行时健康状态:
60
+
61
+ ```bash
62
+ scale runtime doctor --level M
63
+ scale doctor
64
+ ```
65
+
66
+ ## 完成规则
67
+
68
+ M、L、CRITICAL 任务在最终交付前必须满足:
69
+
70
+ - 当前 task/session 范围内至少有一条 `passed` runtime evidence。
71
+ - 当前 task/session 范围内不能存在 `failed` runtime evidence。
72
+
73
+ S 级任务可以保持轻量,但一旦存在失败证据,仍然不能声称完成。
74
+
75
+ ## 脱敏规则
76
+
77
+ Runtime evidence 复用 tool evidence 的脱敏模型。写入 JSON 前会处理命令、摘要、artifact 路径和 metadata 中的敏感字段:
78
+
79
+ - password
80
+ - token
81
+ - secret
82
+ - authorization
83
+ - cookie
84
+ - credential
85
+ - api key
86
+ - private key
87
+
88
+ 这样可以保留有用证据,同时避免把 token、cookie、密钥等内容写进运行时文件。
89
+
90
+ ## 推荐使用场景
91
+
92
+ 适合记录 runtime evidence 的场景:
93
+
94
+ - 最终交付检查。
95
+ - 长会话或多阶段任务。
96
+ - 跨 Agent 或外部 CLI review。
97
+ - 浏览器、桌面自动化、MCP、skill 验证。
98
+ - 发版前 preflight。
99
+ - 需要进入后续学习闭环的失败、修复和重试记录。
100
+
101
+ 不要用 runtime evidence 替代长期维护文档。Runtime evidence 是“操作证明”,PRD、ADR、架构文档、README、模块文档才是长期项目契约。
@@ -0,0 +1,115 @@
1
+ # Skill Radar
2
+
3
+ Skill Radar is the active capability selection layer for SCALE. It does not auto-install or blindly run skills. It scores relevant skills, MCP servers, browser tools, desktop automation, and external CLIs against the current task, then returns:
4
+
5
+ - why the capability matches
6
+ - confidence score
7
+ - safety level
8
+ - required evidence
9
+ - fallback path
10
+ - supply-chain checks before installation or promotion
11
+
12
+ The goal is to make agents actively use useful tools without turning the project into an unsafe bundle of prompts and tools.
13
+
14
+ ## Commands
15
+
16
+ ```bash
17
+ scale skill radar --task "Design upload UI and run browser E2E checks" --files src/pages/upload.tsx
18
+ scale skill radar --task "Automate WPS desktop workflow with CUA" --json
19
+ scale skill radar --task "Review release PR" --phase review --level L --output docs/worklog/tasks/release/skill-radar.md
20
+ scale skill doctor --supply-chain
21
+ scale skill doctor --supply-chain --json
22
+ ```
23
+
24
+ ## Safety Levels
25
+
26
+ | Level | Meaning | Default action |
27
+ | --- | --- | --- |
28
+ | `trusted` | Official or low-risk capability with policy enabled | May be recommended when confidence is high |
29
+ | `review-required` | Third-party or ecosystem capability | Require source, license, scripts, and revision review |
30
+ | `restricted` | Browser, desktop, or external execution boundary | Require explicit evidence and side-effect boundaries |
31
+ | `blocked` | Disabled by policy or failed safety review | Do not run; use fallback |
32
+
33
+ ## Confidence
34
+
35
+ Skill Radar combines:
36
+
37
+ - task keywords and workflow phase
38
+ - changed file patterns
39
+ - local skill installation
40
+ - tool availability
41
+ - trust level
42
+ - policy status
43
+ - frontend/package evidence
44
+ - safety penalties
45
+
46
+ The score is not a promise that the tool will work. It is a routing signal. Any recommendation still needs real evidence before the agent can claim success.
47
+
48
+ ## Default Domains
49
+
50
+ | Domain | Typical triggers | Recommended capability types |
51
+ | --- | --- | --- |
52
+ | `ui` | UI, UX, frontend, component, visual, layout | design skills, visual review, screenshot evidence |
53
+ | `browserAutomation` | browser, E2E, Playwright, Chrome, DevTools | web access, browser automation, DevTools evidence |
54
+ | `desktopAutomation` | desktop, GUI, WPS, WeChat, CUA | disabled by default; manual operator fallback |
55
+ | `externalCli` | Codex, Gemini, OpenCode, external agent CLI | disabled by default; dry-run and output evidence |
56
+ | `review` | PR, merge, release, code review | reviewer skills, severity findings |
57
+ | `docs` | docs, README, ADR, governance asset | doc impact and source-of-truth evidence |
58
+ | `discovery` | skill, MCP, tool, capability discovery | find-skills plus safety review |
59
+
60
+ ## Evidence Contract
61
+
62
+ Each recommendation carries required evidence. Examples:
63
+
64
+ - UI work: `ui-spec`, `design-rationale`, `screenshot`, `visual-review`
65
+ - Browser work: `browser-evidence`, `console-summary`, `network-summary`, `scenario-result`
66
+ - Desktop work: `operator-boundary`, `desktop-screenshot`, `affected-app`
67
+ - External CLI work: `cli-version-check`, `command`, `exit-code`, `output-summary`
68
+ - Review work: `review-report`, `finding-list`, `severity`
69
+
70
+ If evidence is missing, the final delivery should list the capability as unverified rather than claiming it was used successfully.
71
+
72
+ ## Supply-Chain Doctor
73
+
74
+ `scale skill doctor --supply-chain` reviews known skill sources and install commands for:
75
+
76
+ - HTTPS source requirement
77
+ - `curl | bash`, `wget | sh`, `Invoke-Expression`, and `iex` blocking
78
+ - destructive install patterns
79
+ - npm/npx lifecycle script review
80
+ - required source, license, and revision checks
81
+
82
+ This is intentionally conservative. Third-party skills should start at the `review-required` safety level and be promoted only after inspection.
83
+
84
+ ## Policy Integration
85
+
86
+ Skill Radar reads `.scale/tools.json` through the Tool Policy layer. Defaults:
87
+
88
+ - UI and browser capabilities are enabled but evidence-required.
89
+ - Desktop CUA is disabled by default.
90
+ - External agent CLIs are disabled by default.
91
+ - Browser tools require captured evidence and should stay in approved domains.
92
+
93
+ Use Tool Policy to enable a restricted capability deliberately rather than relying on an agent's assumption.
94
+
95
+ ## Fallback Rule
96
+
97
+ Every recommendation must include a fallback. This prevents tool theater:
98
+
99
+ ```text
100
+ If the capability is missing, unsafe, low-confidence, or policy-blocked,
101
+ the agent must use the fallback and record why the capability was not used.
102
+ ```
103
+
104
+ ## Artifact Lifecycle
105
+
106
+ Skill Radar reports can be written into task artifacts:
107
+
108
+ ```bash
109
+ scale skill radar \
110
+ --task "Refactor upload page and verify browser flow" \
111
+ --files src/pages/upload.tsx \
112
+ --output docs/worklog/tasks/2026-05-19-upload-refactor/skill-radar.md
113
+ ```
114
+
115
+ Keep the report when it is evidence for an M/L/CRITICAL task. Do not commit transient local detection output unless it is part of the reviewed task artifact set.
@@ -0,0 +1,151 @@
1
+ # Workflow Eval Harness
2
+
3
+ - Status: implemented baseline
4
+ - Since: v0.22 development branch
5
+
6
+ Workflow Eval Harness 用来证明工作流是否真的提升了 Agent 的工程交付质量,而不是只依赖主观感觉。它会运行轻量 eval suite,记录 pass@k、修复迭代、工具调用、token 估算、人类纠偏次数,并在失败时保留 Failure Replay。
7
+
8
+ ## Commands
9
+
10
+ 初始化默认基线套件:
11
+
12
+ ```bash
13
+ scale eval init
14
+ scale eval init --suite workflow-baseline --json
15
+ ```
16
+
17
+ 运行套件:
18
+
19
+ ```bash
20
+ scale eval run --suite workflow-baseline
21
+ scale eval run --suite workflow-baseline --json
22
+ ```
23
+
24
+ 对比两次运行:
25
+
26
+ ```bash
27
+ scale eval compare --baseline <run-id> --candidate <run-id>
28
+ scale eval compare --baseline <run-id> --candidate <run-id> --json
29
+ ```
30
+
31
+ 生成 Markdown 报告:
32
+
33
+ ```bash
34
+ scale eval report --run <run-id>
35
+ scale eval report --run <run-id> --output docs/worklog/eval-report.md
36
+ ```
37
+
38
+ 查看和提升失败重放:
39
+
40
+ ```bash
41
+ scale eval failures --since 30d
42
+ scale eval replay <failure-id>
43
+ scale eval replay --task-id <task-id>
44
+ scale eval promote-failure <failure-id>
45
+ ```
46
+
47
+ ## Failure Replay To Memory
48
+
49
+ A Failure Replay record starts out as local eval evidence. When a failure pattern is useful for future work, ingest it into Memory Brain as an `incident` candidate:
50
+
51
+ ```bash
52
+ scale memory ingest --from failure --failure-id <failure-id>
53
+ scale memory query "missing verification evidence"
54
+ scale memory promote <memory-node-id>
55
+ ```
56
+
57
+ Ingesting a failure does not auto-change standards or hooks. It only makes the failure queryable and evidence-backed, so that lessons from repeated mistakes can be promoted deliberately after review.
58
+
59
+ ## Storage
60
+
61
+ ```text
62
+ .scale/evals/
63
+ ├── suites/
64
+ ├── runs/
65
+ ├── failures/
66
+ └── improvements/
67
+ ```
68
+
69
+ These files are local runtime evidence by default. Commit only curated summaries or intentional benchmark fixtures.
70
+
71
+ ## Suite Shape
72
+
73
+ ```json
74
+ {
75
+ "version": "1.0",
76
+ "id": "workflow-baseline",
77
+ "name": "SCALE workflow baseline",
78
+ "cases": [
79
+ {
80
+ "id": "governance-command-smoke",
81
+ "type": "bugfix",
82
+ "title": "Command evidence smoke",
83
+ "task": "Verify that a local command can produce concrete eval evidence.",
84
+ "phase": "verify",
85
+ "successCriteria": ["command exits 0"],
86
+ "attempts": [
87
+ {
88
+ "id": "attempt-1",
89
+ "command": "node -e \"console.log('scale-eval-ok')\"",
90
+ "expectedExitCode": 0,
91
+ "outputContains": "scale-eval-ok"
92
+ }
93
+ ]
94
+ }
95
+ ]
96
+ }
97
+ ```
98
+
99
+ ## Metrics
100
+
101
+ | Metric | Meaning |
102
+ | --- | --- |
103
+ | `passAt1Rate` | 一次完整尝试就通过的比例 |
104
+ | `passAt3Rate` | 三次以内通过的比例 |
105
+ | `averageFixIterations` | 首次失败后的平均修复循环 |
106
+ | `totalToolCalls` | eval attempts 数量,可近似衡量工具调用成本 |
107
+ | `estimatedTokens` | task 与输出摘要的估算 token 成本 |
108
+ | `humanCorrections` | 人类纠偏次数 |
109
+ | `failureReplayCount` | 失败重放记录数量 |
110
+
111
+ ## Failure Replay
112
+
113
+ 失败不只记录最终失败状态,还会保存:
114
+
115
+ - task and success criteria
116
+ - phase
117
+ - wrong turn
118
+ - evidence
119
+ - correction
120
+ - prevention
121
+ - replay command
122
+ - redaction status
123
+
124
+ Failure category 当前包括:
125
+
126
+ - `wrong-exploration-path`
127
+ - `hallucinated-project-fact`
128
+ - `missing-codegraph-or-graph-fallback`
129
+ - `over-broad-context-load`
130
+ - `bad-skill-recommendation`
131
+ - `missing-verification-evidence`
132
+ - `failed-security-or-resource-gate`
133
+ - `human-correction-after-agent-confidence`
134
+ - `command-failure`
135
+ - `unknown`
136
+
137
+ `scale eval promote-failure` 会把失败重放提升为 improvement candidate,但不会自动修改项目规范。是否进入长期标准仍需要人工或后续 review 确认。
138
+
139
+ ## Governance Use
140
+
141
+ - v0.22 的默认 suite 是轻量 smoke baseline,用来验证 eval 管线可运行。
142
+ - 真实项目应逐步增加 bugfix、feature、security、frontend、release、resource 类型案例。
143
+ - Failure Replay 应与 Resource Governance 配合:默认本地保留,只有总结、基准或明确要长期维护的案例才提交。
144
+ - Workflow Eval 的数据可以进入后续 Governance ROI,用来判断某个治理模块是否真的减少 rework、tool calls、token 或人类纠偏。
145
+
146
+ ## Policy
147
+
148
+ - 不允许用 eval 通过率替代真实项目验证。
149
+ - 失败记录中的命令输出会做基础脱敏,但仍应避免把敏感原始日志写入 suite。
150
+ - 低成本 smoke suite 可以频繁运行;重型项目 suite 应按需运行。
151
+ - 没有 eval 证据时,不应宣称工作流能力已经提升。