kc-beta 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import fs from "node:fs";
2
2
  import path from "node:path";
3
3
  import { spawn } from "node:child_process";
4
4
  import { BaseTool, ToolResult } from "./base.js";
5
+ import { normalizeWorkflowResult } from "./_workflow-result-schema.js";
5
6
 
6
7
  /**
7
8
  * Execute a distilled workflow script against a document.
@@ -9,12 +10,33 @@ import { BaseTool, ToolResult } from "./base.js";
9
10
  * result and trace ID automatically. Saves structured result to output/results/.
10
11
  */
11
12
  export class WorkflowRunTool extends BaseTool {
12
- constructor(workspace, versionManager, confidenceScorer, { timeout = 120 } = {}) {
13
+ /**
14
+ * @param {Workspace} workspace
15
+ * @param {VersionManager} versionManager
16
+ * @param {ConfidenceScorer} confidenceScorer
17
+ * @param {object} [opts]
18
+ * @param {number} [opts.timeout=120]
19
+ * @param {(phase: string, key: string, value: any) => boolean} [opts.recordMilestone]
20
+ * v0.6.1 A6: callback for engine-emitted milestone updates. Called on
21
+ * successful workflow execution so the distillation/production_qc gates
22
+ * see real telemetry, not just filesystem scans of canonical paths.
23
+ * @param {() => string} [opts.getCurrentPhase]
24
+ * v0.6.1 A6: returns the engine's current phase. Used to gate
25
+ * production_qc-specific milestone bumps (documentsReviewed) so
26
+ * distillation-phase calls don't accidentally credit QC.
27
+ */
28
+ constructor(workspace, versionManager, confidenceScorer, {
29
+ timeout = 120,
30
+ recordMilestone = null,
31
+ getCurrentPhase = null,
32
+ } = {}) {
13
33
  super();
14
34
  this._workspace = workspace;
15
35
  this._versionMgr = versionManager;
16
36
  this._confidence = confidenceScorer;
17
37
  this._timeout = timeout;
38
+ this._recordMilestone = recordMilestone;
39
+ this._getCurrentPhase = getCurrentPhase;
18
40
  }
19
41
 
20
42
  get name() { return "workflow_run"; }
@@ -68,14 +90,18 @@ export class WorkflowRunTool extends BaseTool {
68
90
  return new ToolResult(e.message, true);
69
91
  }
70
92
 
71
- // Parse output
72
- let resultData;
93
+ // Parse output (last stdout line as JSON)
94
+ let parsed;
73
95
  try {
74
96
  const lines = output.trim().split("\n");
75
- resultData = JSON.parse(lines[lines.length - 1]);
97
+ parsed = JSON.parse(lines[lines.length - 1]);
76
98
  } catch {
77
- resultData = { raw_output: output.slice(0, 5000) };
99
+ parsed = { raw_output: output.slice(0, 5000) };
78
100
  }
101
+ // v0.6.2 I1: normalize to canonical dict shape — strips Python
102
+ // dataclass repr() keys, classifies ERROR results, ensures rule_id
103
+ // and verdict are present.
104
+ const resultData = normalizeWorkflowResult(parsed, ruleId, output);
79
105
 
80
106
  // Attach confidence score
81
107
  const extractedValue = String(resultData.extracted_value || resultData.value || "");
@@ -97,6 +123,18 @@ export class WorkflowRunTool extends BaseTool {
97
123
  const resultFile = path.join(resultsDir, `${ruleId}_${path.parse(docResolved).name}.json`);
98
124
  fs.writeFileSync(resultFile, JSON.stringify(resultData, null, 2), "utf-8");
99
125
 
126
+ // v0.6.1 A6: emit milestone signals so phase gates see this run.
127
+ // Wrapped in try/catch so milestone emission can never break a workflow.
128
+ try {
129
+ this._recordMilestone?.("distillation", "workflowsTested",
130
+ { id: ruleId, value: { confidence, traceId: resultData.trace_id } });
131
+ this._recordMilestone?.("distillation", "workflowsPassing", ruleId);
132
+ const phase = this._getCurrentPhase?.();
133
+ if (phase === "production_qc") {
134
+ this._recordMilestone?.("production_qc", "documentsReviewed", 1);
135
+ }
136
+ } catch { /* never let milestone emission break workflow execution */ }
137
+
100
138
  return new ToolResult(JSON.stringify(resultData, null, 2));
101
139
  }
102
140
 
@@ -240,6 +240,19 @@ export class Workspace {
240
240
  return traceId;
241
241
  }
242
242
 
243
+ /**
244
+ * v0.6.2 J3: Synchronous lock mirror of `withFileLock`, for callers
245
+ * that can't go async (SessionState.save). Locks a sibling
246
+ * `<relPath>.lock` file via O_CREAT|O_EXCL, with 5s timeout and 30s
247
+ * stale-takeover. On failure to acquire, runs fn anyway — better to
248
+ * lose serialization than deadlock a save call. Use sparingly; prefer
249
+ * `withFileLock` (async) for all paths that allow it.
250
+ */
251
+ withSyncFileLock(relPath, fn, { timeoutMs = 5_000, staleMs = 30_000 } = {}) {
252
+ const lockPath = path.join(this.path, `${relPath}.lock`);
253
+ return this._withSyncLockAtPath(lockPath, fn, { timeoutMs, staleMs });
254
+ }
255
+
243
256
  /**
244
257
  * B5: Synchronous gitops lock. Mirror of withFileLock but sync to fit
245
258
  * autoCommit's existing call signature. Times out and proceeds anyway
@@ -247,6 +260,16 @@ export class Workspace {
247
260
  */
248
261
  _withGitSyncLock(fn, { timeoutMs = 5_000, staleMs = 30_000 } = {}) {
249
262
  const lockPath = path.join(this.path, ".git", "kc-commit.lock");
263
+ return this._withSyncLockAtPath(lockPath, fn, { timeoutMs, staleMs });
264
+ }
265
+
266
+ /**
267
+ * Shared sync-lock implementation. Used by `_withGitSyncLock` (B5) and
268
+ * `withSyncFileLock` (J3 / v0.6.2). Same semantics: O_CREAT|O_EXCL on
269
+ * a sibling `.lock` file, busy-spin retry with stale takeover, run fn
270
+ * anyway on timeout.
271
+ */
272
+ _withSyncLockAtPath(lockPath, fn, { timeoutMs = 5_000, staleMs = 30_000 } = {}) {
250
273
  const start = Date.now();
251
274
  let acquired = false;
252
275
  while (Date.now() - start < timeoutMs) {
@@ -123,6 +123,38 @@
123
123
  }
124
124
  },
125
125
 
126
+ "deepseek": {
127
+ "_comment": "DeepSeek v4 family — flagship pro + cheap flash. Native 1M context but KC caps to 200K.",
128
+ "conductor": "deepseek-v4-pro",
129
+ "llm": {
130
+ "tier1": "deepseek-v4-pro",
131
+ "tier2": "deepseek-v4-pro",
132
+ "tier3": "deepseek-v4-flash",
133
+ "tier4": "deepseek-v4-flash"
134
+ },
135
+ "vlm": {
136
+ "tier1": "",
137
+ "tier2": "",
138
+ "tier3": ""
139
+ }
140
+ },
141
+
142
+ "xiaomi": {
143
+ "_comment": "Xiaomi MiMo coding plan — flagship Pro + standard + multimodal Omni. Native 1M context but KC caps to 200K. TTS variants excluded (no KC use case).",
144
+ "conductor": "MiMo-V2.5-Pro",
145
+ "llm": {
146
+ "tier1": "MiMo-V2.5-Pro",
147
+ "tier2": "MiMo-V2.5",
148
+ "tier3": "MiMo-V2-Pro",
149
+ "tier4": "MiMo-V2-Pro"
150
+ },
151
+ "vlm": {
152
+ "tier1": "MiMo-V2-Omni",
153
+ "tier2": "MiMo-V2-Omni",
154
+ "tier3": ""
155
+ }
156
+ },
157
+
126
158
  "openrouter": {
127
159
  "conductor": "anthropic/claude-sonnet-4-20250514",
128
160
  "llm": {
package/src/providers.js CHANGED
@@ -211,6 +211,51 @@ const PROVIDERS = [
211
211
  zh: "MiniMax",
212
212
  },
213
213
  },
214
+ {
215
+ id: "deepseek",
216
+ name: "DeepSeek",
217
+ baseUrl: "https://api.deepseek.com",
218
+ authType: "bearer",
219
+ apiFormat: "openai",
220
+ modelsEndpoint: "/models",
221
+ contextLimit: 200000, // KC cap — DeepSeek v4 is native 1M; we cap to 200K
222
+ defaultModel: getTierConfig("deepseek").conductor || "deepseek-v4-pro",
223
+ defaultTiers: getTierConfig("deepseek").llm,
224
+ defaultVlm: getTierConfig("deepseek").vlm,
225
+ curatedModels: [
226
+ { id: "deepseek-v4-pro", ownedBy: "deepseek" },
227
+ { id: "deepseek-v4-flash", ownedBy: "deepseek" },
228
+ ],
229
+ labels: {
230
+ en: "DeepSeek (v4 family)",
231
+ zh: "DeepSeek(v4 系列)",
232
+ },
233
+ },
234
+ {
235
+ id: "xiaomi",
236
+ name: "Xiaomi MiMo",
237
+ baseUrl: "https://token-plan-cn.xiaomimimo.com/v1",
238
+ authType: "bearer",
239
+ apiFormat: "openai",
240
+ modelsEndpoint: null, // Xiaomi coding-plan endpoint, no /models — use curated list
241
+ supportsCodingPlanKey: true,
242
+ contextLimit: 200000, // KC cap — MiMo V2.5 is native 1M
243
+ defaultModel: getTierConfig("xiaomi").conductor || "MiMo-V2.5-Pro",
244
+ defaultTiers: getTierConfig("xiaomi").llm,
245
+ defaultVlm: getTierConfig("xiaomi").vlm,
246
+ curatedModels: [
247
+ { id: "MiMo-V2.5-Pro", ownedBy: "xiaomi" },
248
+ { id: "MiMo-V2.5", ownedBy: "xiaomi" },
249
+ { id: "MiMo-V2-Pro", ownedBy: "xiaomi" },
250
+ { id: "MiMo-V2-Omni", ownedBy: "xiaomi" }, // multimodal
251
+ // TTS variants (MiMo-V2.5-TTS, *-VoiceClone, *-VoiceDesign, MiMo-V2-TTS)
252
+ // intentionally excluded — KC has no TTS use case.
253
+ ],
254
+ labels: {
255
+ en: "Xiaomi MiMo (V2.5 family, coding plan)",
256
+ zh: "小米 MiMo(V2.5 系列,编程计划)",
257
+ },
258
+ },
214
259
  {
215
260
  id: "openrouter",
216
261
  name: "OpenRouter",
@@ -27,6 +27,25 @@ rule-skills/
27
27
 
28
28
  Not every rule needs all of these. A simple threshold check might only need SKILL.md and a script. A complex semantic rule might need detailed references and many samples. Start minimal, add as needed during testing.
29
29
 
30
+ ## Granularity: 1 rule = 1 skill directory (default)
31
+
32
+ Default to **one rule per skill directory**. Group rules into the same file ONLY when they meet BOTH of the following conditions:
33
+
34
+ 1. They share the same evidence (same section / same table / same field) — so locating one locates all.
35
+ 2. They fail together — when one fails, the others almost always fail too (e.g., siblings in a required-fields list where the table itself is missing).
36
+
37
+ When grouping, name the file with the explicit range so downstream consumers (workflow-run, dashboards, finalization) can parse rule coverage by filename:
38
+ - ✅ `check_r013_r017.py` (R013, R014, R015, R016, R017 — same disclosure table, fail together)
39
+ - ❌ `check_r001_r050_r078.py` (different chapters, even if topically related — keep separate)
40
+
41
+ ### Anti-pattern: the unified runner
42
+
43
+ If you find yourself writing a single `unified_qc.py` (or `batch_runner.py`, or `master_check.py`) that handles all 110 rules in one Python file, **stop**. That means your per-rule skills are wrong, not that the architecture is wrong. Fix the skills.
44
+
45
+ E2E #4 demonstrated the cost: an agent wrote `unified_qc.py` to bypass 110 individual skills it didn't trust. The result was 1,150 errors out of 6,930 production checks (16.6%) and a phase counter stuck in `production_qc` while the real work was happening in `skill_authoring`. The unified runner felt productive locally but was a mistake globally.
46
+
47
+ If individual skills aren't running cleanly, the right response is to identify which ones break and fix them, not consolidate. The whole pipeline (extraction → skill_testing → distillation → production_qc) assumes one rule = one verifiable artifact.
48
+
30
49
  ## Writing SKILL.md
31
50
 
32
51
  ### Frontmatter
@@ -27,6 +27,25 @@ rule-skills/
27
27
 
28
28
  Not every rule needs all of these. A simple threshold check might only need SKILL.md and a script. A complex semantic rule might need detailed references and many samples. Start minimal, add as needed during testing.
29
29
 
30
+ ## 颗粒度:默认 1 条规则 = 1 个技能目录
31
+
32
+ 默认**每条规则一个独立技能目录**。仅当同时满足以下两个条件时,才能把多条规则合并到同一个文件:
33
+
34
+ 1. 共享同一证据(同一章节 / 同一表格 / 同一字段)——找到一条就找到了全部。
35
+ 2. 一同成败——一条失败,其他几乎必然失败(例如必填字段表中的同辈规则,表本身缺失则全部失败)。
36
+
37
+ 合并时,用显式范围命名文件,让下游消费者(workflow-run、dashboards、finalization)可以从文件名解析规则覆盖范围:
38
+ - ✅ `check_r013_r017.py`(R013、R014、R015、R016、R017——同一披露表格,一同失败)
39
+ - ❌ `check_r001_r050_r078.py`(不同章节,即使主题相关,也应分开)
40
+
41
+ ### 反模式:统一运行器(unified runner)
42
+
43
+ 如果你发现自己在写一个 `unified_qc.py`(或 `batch_runner.py`、`master_check.py`)把全部 110 条规则塞进一个 Python 文件里,**停下来**。这说明你的单条规则技能写错了,不是架构错了。请修复单条技能。
44
+
45
+ E2E #4 给出了代价:智能体写了一个 `unified_qc.py` 绕过它不信任的 110 个独立技能。结果是 6,930 条生产检查里出了 1,150 个错误(16.6%),阶段计数器卡在 `production_qc`,而真实工作还在 `skill_authoring` 里进行。统一运行器在局部看起来很高效,全局上是个错误。
46
+
47
+ 如果某些独立技能跑不通,正确的应对是定位并修复出问题的那几条,而不是合并所有技能。整个流水线(extraction → skill_testing → distillation → production_qc)的前提就是「一条规则 = 一个可独立验证的产物」。
48
+
30
49
  ## Writing SKILL.md
31
50
 
32
51
  ### Frontmatter