kc-beta 0.1.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/bin/kc-beta.js +14 -2
  2. package/package.json +1 -1
  3. package/src/agent/context-window.js +151 -0
  4. package/src/agent/context.js +8 -4
  5. package/src/agent/engine.js +261 -8
  6. package/src/agent/event-log.js +111 -0
  7. package/src/agent/llm-client.js +352 -59
  8. package/src/agent/pipelines/base.js +6 -0
  9. package/src/agent/pipelines/distillation.js +18 -0
  10. package/src/agent/pipelines/extraction.js +21 -0
  11. package/src/agent/pipelines/initializer.js +75 -14
  12. package/src/agent/pipelines/production-qc.js +19 -0
  13. package/src/agent/pipelines/skill-authoring.js +14 -0
  14. package/src/agent/pipelines/skill-testing.js +20 -0
  15. package/src/agent/retry.js +83 -0
  16. package/src/agent/session-state.js +79 -0
  17. package/src/agent/skill-loader.js +13 -1
  18. package/src/agent/token-counter.js +62 -0
  19. package/src/agent/tools/document-parse.js +104 -21
  20. package/src/agent/tools/document-search.js +24 -8
  21. package/src/agent/tools/sandbox-exec.js +16 -5
  22. package/src/agent/tools/web-search.js +107 -0
  23. package/src/agent/tools/worker-llm-call.js +14 -5
  24. package/src/agent/tools/workspace-file.js +47 -20
  25. package/src/agent/workspace.js +24 -1
  26. package/src/cli/components.js +24 -5
  27. package/src/cli/config.js +340 -0
  28. package/src/cli/index.js +113 -11
  29. package/src/cli/onboard.js +216 -53
  30. package/src/config.js +63 -10
  31. package/src/model-tiers.json +153 -0
  32. package/src/providers.js +367 -0
  33. package/template/AGENT.md +20 -0
  34. package/template/skills/en/meta/compliance-judgment/SKILL.md +10 -42
  35. package/template/skills/en/meta/document-chunking/SKILL.md +32 -0
  36. package/template/skills/en/meta/document-parsing/SKILL.md +11 -18
  37. package/template/skills/en/meta/entity-extraction/SKILL.md +13 -28
  38. package/template/skills/en/meta/tree-processing/SKILL.md +19 -1
  39. package/template/skills/en/meta-meta/auto-model-selection/SKILL.md +53 -0
  40. package/template/skills/en/meta-meta/pdf-review-dashboard/SKILL.md +57 -0
  41. package/template/skills/en/meta-meta/pdf-review-dashboard/scripts/generate_review.js +262 -0
  42. package/template/skills/en/meta-meta/rule-extraction/SKILL.md +24 -1
  43. package/template/skills/en/meta-meta/skill-authoring/SKILL.md +6 -0
  44. package/template/skills/en/meta-meta/skill-to-workflow/SKILL.md +4 -0
  45. package/template/skills/zh/meta/compliance-judgment/SKILL.md +41 -262
  46. package/template/skills/zh/meta/document-chunking/SKILL.md +32 -0
  47. package/template/skills/zh/meta/document-parsing/SKILL.md +65 -132
  48. package/template/skills/zh/meta/entity-extraction/SKILL.md +68 -230
  49. package/template/skills/zh/meta/tree-processing/SKILL.md +82 -194
  50. package/template/skills/zh/meta-meta/auto-model-selection/SKILL.md +51 -0
  51. package/template/skills/zh/meta-meta/pdf-review-dashboard/SKILL.md +55 -0
  52. package/template/skills/zh/meta-meta/pdf-review-dashboard/scripts/generate_review.js +262 -0
  53. package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +79 -164
  54. package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +64 -185
  55. package/template/skills/zh/meta-meta/skill-to-workflow/SKILL.md +95 -216
@@ -0,0 +1,367 @@
1
+ /**
2
+ * Provider registry for Multi-LLM support.
3
+ * Centralizes provider metadata, default models, and model classification.
4
+ *
5
+ * Model tier assignments (LLM + VLM) are loaded from model-tiers.json
6
+ * so they can be updated without touching code.
7
+ */
8
+
9
+ import { readFileSync } from "node:fs";
10
+ import { fileURLToPath } from "node:url";
11
+ import { dirname, join } from "node:path";
12
+
13
+ const __filename = fileURLToPath(import.meta.url);
14
+ const __dirname = dirname(__filename);
15
+
16
/**
 * True for non-null, non-array objects — the only JSON shape usable as a
 * string-keyed lookup table.
 * @param {unknown} value
 * @returns {boolean}
 */
function isPlainObject(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Per-provider tier assignments, read once at module load from the
 * model-tiers.json file that sits next to this module.
 * @type {Record<string, {conductor: string, llm: Record<string,string>, vlm: Record<string,string>}>}
 */
let MODEL_TIERS = {};
try {
  const parsed = JSON.parse(
    readFileSync(join(__dirname, "model-tiers.json"), "utf-8")
  );
  // Valid JSON may still be null, an array or a scalar; indexing those by
  // provider id would throw or return garbage, so only accept an object.
  if (isPlainObject(parsed)) MODEL_TIERS = parsed;
} catch {
  // Deliberate best-effort: if the file is missing or unparsable, every
  // provider falls back to its hard-coded default model and empty tiers.
  MODEL_TIERS = {};
}

/**
 * Helper: get tier config for a provider, with fallbacks.
 *
 * Always returns an object carrying `conductor`, `llm` and `vlm`, even when
 * the provider has no entry or its entry is only partially filled in. Any
 * additional fields present on the entry are passed through unchanged.
 *
 * @param {string} providerId
 * @returns {{conductor: string, llm: Record<string,string>, vlm: Record<string,string>}}
 */
function getTierConfig(providerId) {
  // Own-property check so ids such as "constructor" or "__proto__" cannot
  // resolve to members inherited from Object.prototype.
  const entry = Object.prototype.hasOwnProperty.call(MODEL_TIERS, providerId)
    ? MODEL_TIERS[providerId]
    : undefined;

  if (!isPlainObject(entry)) {
    return { conductor: "", llm: {}, vlm: {} };
  }

  return {
    ...entry,
    conductor: typeof entry.conductor === "string" ? entry.conductor : "",
    llm: isPlainObject(entry.llm) ? entry.llm : {},
    vlm: isPlainObject(entry.vlm) ? entry.vlm : {},
  };
}
30
+
31
+ const PROVIDERS = [
32
+ {
33
+ id: "siliconflow",
34
+ name: "SiliconFlow",
35
+ baseUrl: "https://api.siliconflow.cn/v1",
36
+ authType: "bearer",
37
+ apiFormat: "openai",
38
+ modelsEndpoint: "/models",
39
+ defaultModel: getTierConfig("siliconflow").conductor || "glm-5",
40
+ defaultTiers: getTierConfig("siliconflow").llm,
41
+ defaultVlm: getTierConfig("siliconflow").vlm,
42
+ labels: {
43
+ en: "SiliconFlow (recommended for China)",
44
+ zh: "SiliconFlow(国内推荐)",
45
+ },
46
+ },
47
+ {
48
+ id: "aliyun",
49
+ name: "Aliyun",
50
+ // baseUrl is the regular (compatible-mode) API; codingPlanUrl below is the separate coding plan endpoint
51
+ baseUrl: "https://dashscope.aliyuncs.com/compatible-mode/v1",
52
+ codingPlanUrl: "https://coding.dashscope.aliyuncs.com/v1",
53
+ authType: "bearer",
54
+ apiFormat: "openai",
55
+ modelsEndpoint: null, // Aliyun coding plan doesn't support /models
56
+ supportsCodingPlanKey: true,
57
+ defaultModel: getTierConfig("aliyun").conductor || "qwen3.6-plus",
58
+ defaultTiers: getTierConfig("aliyun").llm,
59
+ defaultVlm: getTierConfig("aliyun").vlm,
60
+ // Curated model list (coding plan doesn't have /models endpoint)
61
+ curatedModels: [
62
+ { id: "qwen3.6-plus", ownedBy: "qwen" },
63
+ { id: "qwen3.5-plus", ownedBy: "qwen" },
64
+ { id: "qwen3-max-2026-01-23", ownedBy: "qwen" },
65
+ { id: "qwen3-coder-next", ownedBy: "qwen" },
66
+ { id: "qwen3-coder-plus", ownedBy: "qwen" },
67
+ { id: "glm-5", ownedBy: "zhipu" },
68
+ { id: "glm-4.7", ownedBy: "zhipu" },
69
+ { id: "kimi-k2.5", ownedBy: "kimi" },
70
+ { id: "MiniMax-M2.5", ownedBy: "minimax" },
71
+ ],
72
+ labels: {
73
+ en: "Aliyun Bailian",
74
+ zh: "阿里云百炼",
75
+ },
76
+ },
77
+ {
78
+ id: "volcanocloud",
79
+ name: "VolcanoCloud",
80
+ baseUrl: "https://ark.cn-beijing.volces.com/api/v3",
81
+ authType: "bearer",
82
+ apiFormat: "openai",
83
+ modelsEndpoint: null, // VolcanoCloud coding plan — use curated list
84
+ defaultModel: getTierConfig("volcanocloud").conductor || "doubao-seed-2-0-pro-260215",
85
+ defaultTiers: getTierConfig("volcanocloud").llm,
86
+ defaultVlm: getTierConfig("volcanocloud").vlm,
87
+ curatedModels: [
88
+ { id: "doubao-seed-2-0-pro-260215", ownedBy: "bytedance" },
89
+ { id: "deepseek-v3-2-251201", ownedBy: "deepseek" },
90
+ { id: "glm-4-7-251222", ownedBy: "zhipu" },
91
+ { id: "doubao-1-5-pro-32k-250115", ownedBy: "bytedance" },
92
+ { id: "doubao-seed-2-0-mini-260215", ownedBy: "bytedance" },
93
+ { id: "doubao-seed-2-0-lite-260215", ownedBy: "bytedance" },
94
+ { id: "doubao-1-5-lite-32k-250115", ownedBy: "bytedance" },
95
+ ],
96
+ labels: {
97
+ en: "VolcanoCloud (ByteDance)",
98
+ zh: "火山云(字节跳动)",
99
+ },
100
+ },
101
+ {
102
+ id: "anthropic",
103
+ name: "Anthropic",
104
+ baseUrl: "https://api.anthropic.com",
105
+ authType: "x-api-key",
106
+ apiFormat: "anthropic",
107
+ modelsEndpoint: null, // Use curated list
108
+ defaultModel: getTierConfig("anthropic").conductor || "claude-sonnet-4-20250514",
109
+ defaultTiers: getTierConfig("anthropic").llm,
110
+ defaultVlm: getTierConfig("anthropic").vlm,
111
+ curatedModels: [
112
+ { id: "claude-opus-4-20250514", ownedBy: "anthropic" },
113
+ { id: "claude-sonnet-4-20250514", ownedBy: "anthropic" },
114
+ { id: "claude-haiku-4-5-20251001", ownedBy: "anthropic" },
115
+ ],
116
+ labels: {
117
+ en: "Anthropic",
118
+ zh: "Anthropic",
119
+ },
120
+ },
121
+ {
122
+ id: "openai",
123
+ name: "OpenAI",
124
+ baseUrl: "https://api.openai.com/v1",
125
+ authType: "bearer",
126
+ apiFormat: "openai",
127
+ modelsEndpoint: "/models",
128
+ defaultModel: getTierConfig("openai").conductor || "gpt-4o",
129
+ defaultTiers: getTierConfig("openai").llm,
130
+ defaultVlm: getTierConfig("openai").vlm,
131
+ labels: {
132
+ en: "OpenAI",
133
+ zh: "OpenAI",
134
+ },
135
+ },
136
+ {
137
+ id: "zhipu",
138
+ name: "Zhipu/GLM",
139
+ baseUrl: "https://open.bigmodel.cn/api/paas/v4",
140
+ authType: "bearer",
141
+ apiFormat: "openai",
142
+ modelsEndpoint: "/models",
143
+ defaultModel: getTierConfig("zhipu").conductor || "glm-4-plus",
144
+ defaultTiers: getTierConfig("zhipu").llm,
145
+ defaultVlm: getTierConfig("zhipu").vlm,
146
+ labels: {
147
+ en: "Zhipu GLM",
148
+ zh: "智谱 GLM",
149
+ },
150
+ },
151
+ {
152
+ id: "minimax",
153
+ name: "MiniMax",
154
+ baseUrl: "https://api.minimax.chat/v1",
155
+ authType: "bearer",
156
+ apiFormat: "openai",
157
+ modelsEndpoint: "/models",
158
+ defaultModel: getTierConfig("minimax").conductor || "MiniMax-M2.5",
159
+ defaultTiers: getTierConfig("minimax").llm,
160
+ defaultVlm: getTierConfig("minimax").vlm,
161
+ labels: {
162
+ en: "MiniMax",
163
+ zh: "MiniMax",
164
+ },
165
+ },
166
+ {
167
+ id: "openrouter",
168
+ name: "OpenRouter",
169
+ baseUrl: "https://openrouter.ai/api/v1",
170
+ authType: "bearer",
171
+ apiFormat: "openai",
172
+ modelsEndpoint: "/models",
173
+ defaultModel: getTierConfig("openrouter").conductor || "anthropic/claude-sonnet-4-20250514",
174
+ defaultTiers: getTierConfig("openrouter").llm,
175
+ defaultVlm: getTierConfig("openrouter").vlm,
176
+ labels: {
177
+ en: "OpenRouter",
178
+ zh: "OpenRouter",
179
+ },
180
+ },
181
+ {
182
+ id: "bedrock",
183
+ name: "Bedrock",
184
+ baseUrl: "",
185
+ authType: "aws-sigv4",
186
+ apiFormat: "anthropic",
187
+ modelsEndpoint: null,
188
+ defaultModel: getTierConfig("bedrock").conductor || "anthropic.claude-sonnet-4-20250514-v1:0",
189
+ defaultTiers: getTierConfig("bedrock").llm,
190
+ defaultVlm: getTierConfig("bedrock").vlm,
191
+ labels: {
192
+ en: "AWS Bedrock (not yet supported)",
193
+ zh: "AWS Bedrock(暂未支持)",
194
+ },
195
+ },
196
+ {
197
+ id: "custom",
198
+ name: "Custom",
199
+ baseUrl: "",
200
+ authType: "bearer",
201
+ apiFormat: "openai",
202
+ modelsEndpoint: "/models",
203
+ defaultModel: getTierConfig("custom").conductor || "",
204
+ defaultTiers: getTierConfig("custom").llm,
205
+ defaultVlm: getTierConfig("custom").vlm,
206
+ labels: {
207
+ en: "Custom (enter base URL)",
208
+ zh: "自定义(输入接口地址)",
209
+ },
210
+ },
211
+ ];
212
+
213
/**
 * Known model capability rankings (partial — used to sort discovered models).
 * Keys are lowercase substrings matched against the lowercased model ID.
 * Higher = more capable.
 * Aligned with kc_reborn providers.py _MODEL_RANKING.
 *
 * Several keys are prefixes of others ("gpt-4o" / "gpt-4o-mini",
 * "glm-4" / "glm-4.7", "kimi-k2" / "kimi-k2.5"); rankModel resolves such
 * overlaps by preferring the longest matching key, so entry order here does
 * not affect the result.
 */
const MODEL_RANKING = {
  // Anthropic
  "claude-opus-4": 100,
  "claude-sonnet-4": 90,
  "claude-haiku-4": 70,
  // OpenAI
  "gpt-4o": 90,
  "gpt-4o-mini": 70,
  "gpt-4-turbo": 85,
  "o1": 95,
  "o3": 95,
  // Qwen (Aliyun Bailian)
  "qwen3.6-plus": 90,
  "qwen3.5-plus": 85,
  "qwen3-max": 88,
  "qwen3-coder-next": 85,
  "qwen3-coder-plus": 80,
  "qwen-plus": 75,
  "qwen-turbo": 60,
  "qwen3.5-397b": 85,
  "qwen3.5-122b": 75,
  "qwen3.5-35b": 65,
  // Zhipu
  "glm-5": 90,
  "glm-4.7": 80,
  "glm-4": 75,
  // Others
  "kimi-k2.5": 85,
  "kimi-k2": 80,
  "minimax-m2": 80,
  "deepseek-v3": 85,
  "deepseek-r1": 90,
  // VolcanoCloud (ByteDance Doubao)
  "doubao-seed-2-0-pro": 90,
  "doubao-seed-2-0-code": 88,
  "doubao-seed-2-0-mini": 75,
  "doubao-seed-2-0-lite": 65,
  "doubao-seed-1-8": 85,
  "doubao-seed-1-6": 80,
  "doubao-1-5-pro": 80,
  "doubao-1-5-lite": 60,
};

/**
 * Estimate model capability rank (0-100) based on known patterns.
 *
 * The model ID is lowercased and compared against every key of
 * MODEL_RANKING; among the keys it contains, the longest one decides the
 * rank. Returning on the first match instead would rank "gpt-4o-mini" as
 * "gpt-4o" (90 rather than 70) because the shorter key is listed first.
 *
 * @param {string} modelId
 * @returns {number} Rank of the most specific matching pattern, or 50 when
 *   nothing matches or modelId is not a non-empty string.
 */
function rankModel(modelId) {
  const UNKNOWN_RANK = 50; // Unknown model: assume mid-tier
  if (typeof modelId !== "string" || modelId === "") return UNKNOWN_RANK;

  const lower = modelId.toLowerCase();
  let bestLength = 0;
  let bestRank = UNKNOWN_RANK;
  for (const [pattern, rank] of Object.entries(MODEL_RANKING)) {
    // Strict ">" keeps the earlier entry when two matching keys tie on length.
    if (pattern.length > bestLength && lower.includes(pattern)) {
      bestLength = pattern.length;
      bestRank = rank;
    }
  }
  return bestRank;
}
273
+
274
/**
 * Case-insensitive patterns identifying models that are not chat models
 * (embeddings, speech, image generation, moderation, reranking). A model ID
 * matching any of them is dropped from discovery results.
 */
const EXCLUDE_PATTERNS = [
  "embed",
  "tts",
  "whisper",
  "dall-e",
  "audio",
  "image",
  "moderation",
  "rerank",
].map((keyword) => new RegExp(keyword, "i"));
281
+
282
/**
 * Expose the provider registry.
 *
 * The module-level PROVIDERS array itself is returned, not a copy.
 *
 * @returns {Array} All provider definitions
 */
export function getProviders() {
  return PROVIDERS;
}
286
+
287
/**
 * Look up a single provider definition by its identifier.
 *
 * @param {string} id - Provider ID
 * @returns {object|undefined} The first registry entry whose `id` strictly
 *   equals the argument, or undefined when there is none.
 */
export function getProviderById(id) {
  for (const candidate of PROVIDERS) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return undefined;
}
294
+
295
/**
 * Build the list of display labels shown in the onboard menu.
 *
 * For each provider the label for the requested language is used; if that is
 * missing or empty the English label is used, and failing that the
 * provider's name.
 *
 * @param {string} lang - "en" or "zh"
 * @returns {Array<{id: string, label: string}>}
 */
export function getProviderLabels(lang) {
  const menuEntries = [];
  for (const { id, labels, name } of PROVIDERS) {
    const label = labels[lang] || labels.en || name;
    menuEntries.push({ id, label });
  }
  return menuEntries;
}
306
+
307
/**
 * Classify a list of discovered models into tier assignments.
 * Uses the same ranking-based approach as kc_reborn's propose_tiers().
 *
 * Each entry is identified by `id`, falling back to `name` when `id` is
 * missing. Entries without a usable string identifier, and entries matching
 * EXCLUDE_PATTERNS (non-chat models), are skipped. Every remaining model is
 * ranked once with rankModel and the list is sorted from most to least
 * capable.
 *
 * Tier thresholds: rank >= 85 → tier1, >= 70 → tier2, >= 55 → tier3,
 * everything else → tier4. Each tier keeps only its top few models
 * (3, 3, 2, 2) and is returned as a ", "-joined string.
 *
 * @param {Array<{id: string, name?: string}>} models - Models from /models endpoint or curated list
 * @returns {{ conductor: string, tiers: {tier1: string, tier2: string, tier3: string, tier4: string}, unclassified: string[] }}
 *   `conductor` is the highest-ranked model ("" when none qualify);
 *   `unclassified` lists the models that received the default rank of 50,
 *   i.e. those no known pattern matched.
 */
export function classifyModels(models) {
  const candidates = Array.isArray(models) ? models : [];

  // Resolve identifier and rank once per model so that filtering, sorting and
  // bucketing all agree on the same values.
  const ranked = [];
  for (const model of candidates) {
    const id = model?.id || model?.name;
    if (typeof id !== "string" || id === "") continue;
    // Filter out non-chat models
    if (EXCLUDE_PATTERNS.some((re) => re.test(id))) continue;
    ranked.push({ id, rank: rankModel(id) });
  }

  // Sort by capability, best first (Array.prototype.sort is stable, so equal
  // ranks keep their input order).
  ranked.sort((a, b) => b.rank - a.rank);

  // Select conductor (highest ranked)
  const conductor = ranked[0]?.id || "";

  // Distribute across tiers by rank
  const tierBuckets = { tier1: [], tier2: [], tier3: [], tier4: [] };
  for (const { id, rank } of ranked) {
    if (rank >= 85) tierBuckets.tier1.push(id);
    else if (rank >= 70) tierBuckets.tier2.push(id);
    else if (rank >= 55) tierBuckets.tier3.push(id);
    else tierBuckets.tier4.push(id);
  }

  const tiers = {
    tier1: tierBuckets.tier1.slice(0, 3).join(", "),
    tier2: tierBuckets.tier2.slice(0, 3).join(", "),
    tier3: tierBuckets.tier3.slice(0, 2).join(", "),
    tier4: tierBuckets.tier4.slice(0, 2).join(", "),
  };

  // 50 is the rank rankModel returns when no known pattern matched.
  const unclassified = ranked.filter(({ rank }) => rank === 50).map(({ id }) => id);

  return { conductor, tiers, unclassified };
}
349
+
350
/**
 * Return the hand-maintained model list of a provider, for providers that
 * don't support the /models endpoint.
 *
 * @param {string} providerId
 * @returns {Array<{id: string, ownedBy: string}>|null} The provider's
 *   `curatedModels`, or null when the provider is unknown or has none.
 */
export function getCuratedModels(providerId) {
  const match = getProviderById(providerId);
  if (!match) {
    return null;
  }
  return match.curatedModels || null;
}
359
+
360
/**
 * Public accessor for a provider's model tier config (sourced from
 * model-tiers.json). Delegates to the module-private getTierConfig.
 *
 * @param {string} providerId
 * @returns {{ conductor: string, llm: Record<string,string>, vlm: Record<string,string> }}
 */
export function getModelTierConfig(providerId) {
  const tierConfig = getTierConfig(providerId);
  return tierConfig;
}
@@ -0,0 +1,20 @@
1
+ # AGENT.md — Project Context
2
+
3
+ This file is your per-project memory. Update it as you learn about the project.
4
+ The content here is injected into your system prompt on every turn.
5
+
6
+ ## Project
7
+
8
+ <!-- What domain? What regulations? What documents? Fill this in during bootstrap. -->
9
+
10
+ ## Decisions
11
+
12
+ <!-- Key decisions made with the developer user. Rule granularity, accuracy targets, model choices, scope boundaries. -->
13
+
14
+ ## Domain Notes
15
+
16
+ <!-- Terminology, document formats, naming conventions, edge cases specific to this domain. -->
17
+
18
+ ## User Preferences
19
+
20
+ <!-- How the developer user prefers to communicate. Reporting format, language, level of detail. -->
@@ -9,53 +9,17 @@ Judgment is the moment of truth. You have the extracted entity. You have the rul
9
9
 
10
10
  ## The Judgment Spectrum
11
11
 
12
- Rules fall on a spectrum from fully deterministic to fully semantic:
12
+ Rules range from trivially deterministic to deeply semantic. Pick the right tool for each rule.
13
13
 
14
- ### Deterministic Judgments (Use Python)
14
+ **Deterministic** — threshold checks, format validation, date arithmetic, cross-field consistency. Pure Python: free, instant, deterministic.
15
15
 
16
- Rules with clear, computable criteria:
16
+ **Semantic** — adequacy, completeness, consistency, compliance with templates, detecting misleading or suggestive language, assessing whether a description is fair and balanced. These require language understanding — use worker LLM.
17
17
 
18
- - **Threshold checks**: "The capital adequacy ratio must be >= 8%."
19
- ```python
20
- result = "pass" if extracted_ratio >= 8.0 else "fail"
21
- ```
22
- - **Format validation**: "The loan number must match pattern XX-YYYY-ZZZZZZ."
23
- ```python
24
- result = "pass" if re.match(r"[A-Z]{2}-\d{4}-\d{6}", loan_number) else "fail"
25
- ```
26
- - **Date arithmetic**: "The contract must be signed within 30 days of application."
27
- ```python
28
- result = "pass" if (sign_date - app_date).days <= 30 else "fail"
29
- ```
30
- - **Cross-field consistency**: "The total must equal the sum of line items."
31
- ```python
32
- result = "pass" if abs(total - sum(items)) < 0.01 else "fail"
33
- ```
18
+ Many real compliance rules require semantic judgment. "The risk disclosure must adequately describe the key risks" cannot be checked with regex or Python. "The contract description must not be misleading or suggestive" requires deep language understanding. Use worker LLM for these without hesitation.
34
19
 
35
- These are best implemented as pure Python. They are free, instant, and deterministic. When possible, prefer this form.
20
+ Some rules combine both: extract a number (deterministic), compare to threshold (deterministic), then assess the explanation if borderline (semantic). The mix depends on the rule.
36
21
 
37
- ### Semantic Judgments (Use LLM)
38
-
39
- Rules requiring language understanding:
40
-
41
- - **Adequacy**: "The risk disclosure must adequately describe the key risks."
42
- - **Completeness**: "The management discussion must address financial performance, strategic outlook, and market conditions."
43
- - **Consistency**: "The executive summary must be consistent with the detailed findings."
44
- - **Compliance with template**: "The report must follow the format specified in Regulation Appendix A."
45
-
46
- For these, design an LLM prompt:
47
- 1. Provide the rule text (what constitutes compliance).
48
- 2. Provide the extracted content (what the document says).
49
- 3. Ask for a structured verdict: pass/fail, reasoning, and comment.
50
- 4. Ask the model to be conservative — flag as fail only when clearly non-compliant. When truly ambiguous, use a "partial" or "uncertain" result rather than a hard fail.
51
-
52
- ### Hybrid Judgments (Most Common)
53
-
54
- Most rules combine deterministic and semantic elements:
55
- - Extract the number (regex) → compare to threshold (Python) → if borderline, assess the explanation (LLM).
56
- - Check that a section exists (deterministic) → check that it covers required topics (semantic).
57
-
58
- Design the pipeline to run cheap steps first. Only invoke the LLM when the deterministic check is insufficient.
22
+ The right method is whatever achieves accuracy at lowest cost. Simple threshold checks don't need LLM. Semantic assessments don't benefit from Python. Most projects will have a mix — let the nature of each rule determine the method.
59
23
 
60
24
  ## Output Format
61
25
 
@@ -80,6 +44,10 @@ For each rule × document combination:
80
44
  - **error**: Something went wrong during extraction or judgment (parsing failure, API error). Needs investigation.
81
45
  - **uncertain**: The judgment is ambiguous. May need human review.
82
46
 
47
+ **Design exit criteria first:** Before writing judgment logic for a rule, define the exit conditions: what constitutes pass, what constitutes fail, what triggers escalation to human, how to handle empty/missing values, what value ranges are valid. Explicit exit criteria prevent ambiguous or inconsistent judgment.
48
+
49
+ **Prompt design:** Design prompts for what you want, not against what you don't want. "Don't include reasoning" is less reliable than extracting the verdict from structured output in postprocessing. Use output filtering instead of prompt negation.
50
+
83
51
  **Comments:**
84
52
  - Required only when result is `fail`. Skip for `pass` unless the developer user specifically requests pass comments.
85
53
  - Be concise and factual: "Capital adequacy ratio is 7.2%, below the regulatory minimum of 8.0%."
@@ -0,0 +1,32 @@
1
+ ---
2
+ name: document-chunking
3
+ description: >
4
+ Fast, cheap chunking for processing batches of sample and input documents.
5
+ Use when you need to split documents into manageable pieces for initial observation,
6
+ data sensibility checks, or feeding to extraction workflows. Not for production
7
+ verification chunking — for that, use tree-processing to design a tailored chunking script.
8
+ ---
9
+
10
+ # Document Chunking
11
+
12
+ Split documents into pieces for downstream processing. This is the fast, cheap version — for batch processing of samples and inputs, not for precision verification workflows.
13
+
14
+ ## Methods
15
+
16
+ **Page-level splits** — simplest. Each page is a chunk. Works for most document processing where you need to iterate over content.
17
+
18
+ **Fixed-size chunks** — split by character/token count with overlap. Good for search and initial observation. Typical: 2000-4000 chars with 200 char overlap.
19
+
20
+ **Header-based splits** — detect section headers and split at boundaries. Preserves semantic units. Use regex patterns for the document's header convention.
21
+
22
+ ## When to Use What
23
+
24
+ Pick the simplest method that serves the task:
25
+ - Batch document observation → page-level
26
+ - Full-text search index → fixed-size with overlap
27
+ - Section-level extraction → header-based
28
+ - Table of contents available → parse TOC for structure
29
+
30
+ ## Relationship to tree-processing
31
+
32
+ This skill is for quick, cheap chunking during exploration and batch processing. When you need production-grade chunking for verification workflows — where the chunking mechanism must be precise, consistent, and coded as a script — use `tree-processing` instead.
@@ -12,28 +12,21 @@ Parsing is the foundation. If the text is wrong, everything downstream is wrong.
12
12
  Start with the simplest parser. Escalate only when necessary. This is not about saving money — it is about producing the most reliable output. Simple parsers have fewer failure modes.
13
13
 
14
14
  ### Level 1: Direct Text Extraction
15
- - Tool: pymupdf (PyMuPDF) or similar PDF text extraction.
15
+ - Tool: pdfjs-dist or similar PDF text extraction.
16
16
  - When: Well-formed digital PDFs with embedded text. This covers most modern business documents.
17
17
  - Output: Raw text with basic structure preserved (paragraphs, basic formatting).
18
18
  - Limitations: Tables may come out as messy text. Charts and images are invisible. Scanned PDFs produce nothing.
19
19
 
20
- ### Level 2: Layout-Aware Extraction
21
- - Tool: pdfplumber or similar layout-aware parser.
22
- - When: Level 1 produces messy table output, or when preserving spatial layout matters (forms, multi-column documents).
23
- - Output: Text with table detection and cell-level extraction.
24
- - Limitations: Still text-based. Cannot handle scanned content.
25
-
26
- ### Level 3: OCR
27
- - Tool: Vision-capable models from OCR_MODEL_TIER in `.env` (PaddleOCR-VL, GLM-4.6V, etc.).
28
- - When: Scanned PDFs, image-based PDFs, or PDFs where Level 1-2 produce garbled/incomplete text.
29
- - Output: Recognized text from images.
30
- - Limitations: Slower, costs API calls, may introduce OCR errors.
31
-
32
- ### Level 4: Vision Model Interpretation
33
- - Tool: High-capability vision models (OCR_MODEL_TIER1).
34
- - When: Complex tables that text extraction cannot parse correctly, charts that need data point extraction, mixed text-and-image layouts.
35
- - Output: Structured interpretation of visual content (table as markdown, chart data as JSON).
36
- - Limitations: Expensive, slow. Reserve for when the visual content genuinely needs interpretation.
20
+ ### Level 2: Provider VLM (Vision Language Model)
21
+ - Tool: VLM models from configured provider (VLM_TIER3 for cheap OCR, VLM_TIER1 for complex interpretation).
22
+ - When: Level 1 produces garbled/incomplete text, scanned PDFs, image-based PDFs.
23
+ - Output: Recognized text from page images, or structured interpretation (table as markdown, chart data as JSON).
24
+ - Calling a provider VLM is more convenient and reliable than deploying local OCR. Use the cheapest VLM tier first; escalate to a more capable tier for complex tables/charts.
25
+
26
+ ### Level 3: MinerU API or Local Tools (Optional)
27
+ - Tool: MinerU API, pdfplumber, or locally deployed OCR if configured.
28
+ - When: Provider VLM is unavailable or too expensive for batch processing.
29
+ - These are optional fallbacks. Most users will use Level 1 + Level 2.
37
30
 
38
31
  ## Quality Detection
39
32
 
@@ -33,40 +33,21 @@ The value could be anywhere, or the rule applies to the document as a whole.
33
33
 
34
34
  ## Method Selection
35
35
 
36
- ### Regex / Python (Cost: zero, Speed: instant)
36
+ Extraction method selection is a cost-accuracy search. The goal is finding the cheapest method that meets the accuracy threshold. Regex is the smallest, cheapest "model" — zero cost, instant, deterministic. Worker LLM is more capable but costs tokens and time. Any search strategy is valid: try the cheapest first and escalate, try the most capable first and downgrade, bisect, or jump directly to a known-good method based on past experience in AGENT.md.
37
37
 
38
- Use when the entity has a predictable format:
38
+ ### Available Methods
39
39
 
40
- - **Dates**: `\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?` or specific patterns for the document type.
41
- - **Monetary amounts**: `[\d,]+\.?\d*\s*(元|万元|亿元|USD|RMB)` or similar.
42
- - **Percentages**: `\d+\.?\d*\s*%`
43
- - **Identifiers**: Loan numbers, registration codes, ID numbers — usually fixed formats.
44
- - **Specific phrases**: "I hereby agree" or "本人同意" — exact string matching.
40
+ **Regex / Python** — Cost: zero. Speed: instant. Deterministic.
41
+ Works well for: dates, monetary amounts, percentages, identifiers, fixed phrases, any value with a predictable format.
45
42
 
46
- Build and test the regex on sample documents. A good regex is better than a good LLM prompt for structured values — it is faster, deterministic, and free.
43
+ **Worker LLM** — Cost: API tokens. Speed: seconds. Semantic understanding.
44
+ Works well for: contextual interpretation, conditional values, semantic matching, ambiguous structures, suggestive or misleading language detection, table interpretation, anything requiring understanding rather than pattern matching.
47
45
 
48
- ### LLM Extraction (Cost: API call, Speed: seconds)
46
+ Many real verification tasks require semantic understanding — "is this description misleading?", "does this clause adequately disclose risk?", "is this guarantor's business description consistent with their stated industry?" — regex cannot handle these. Use worker LLM without hesitation for such tasks.
49
47
 
50
- Use when the entity requires understanding:
48
+ ### The Search
51
49
 
52
- - **Named entities in context**: "the guarantor's main business" requires understanding who the guarantor is and which text describes their business.
53
- - **Conditional values**: "the interest rate, including any adjustments" — requires understanding what constitutes an adjustment.
54
- - **Semantic matching**: "adequate risk disclosure" — requires judgment about what text constitutes risk disclosure.
55
- - **Table interpretation**: When a table's structure is not uniform and regex cannot reliably extract cells.
56
-
57
- Design the LLM prompt to:
58
- 1. Include the narrowed context (from tree processing).
59
- 2. Specify exactly what to extract.
60
- 3. Define the output format (JSON with named fields).
61
- 4. Provide one example if the extraction is non-obvious.
62
-
63
- ### Hybrid Approach
64
-
65
- Often the best strategy:
66
- 1. Use regex to extract candidates (fast, catches obvious matches).
67
- 2. If regex finds a confident match, use it.
68
- 3. If regex fails or is uncertain, fall back to LLM extraction.
69
- 4. Use LLM to validate regex results when confidence matters.
50
+ If a method's results fall below the accuracy threshold, try a different method or a more capable model. If regex works and meets accuracy — keep it, it's free. If regex produces results below threshold, escalate to worker LLM. If a cheap worker LLM isn't accurate enough, try a more capable tier. Record what works for each extraction type in AGENT.md for future reference.
70
51
 
71
52
  ## Schema Design
72
53
 
@@ -118,6 +99,10 @@ Every extraction should carry a confidence estimate:
118
99
 
119
100
  These are starting points. Calibrate based on actual accuracy (see `confidence-system`).
120
101
 
102
+ ## Prompt Design: Ask For What You Want
103
+
104
+ Design prompts for what you want, not against what you don't want. "Don't include explanations" in a prompt is less reliable than stripping non-JSON text from the output in postprocessing. If you need to tell the LLM not to do something, use output filtering instead of prompt negation.
105
+
121
106
  ## Fitting Worker LLM Context
122
107
 
123
108
  When designing extraction for worker LLM workflows:
@@ -1,12 +1,30 @@
1
1
  ---
2
2
  name: tree-processing
3
- description: Build hierarchical document trees and navigate to specific chapters or sections required by verification rules. Use when a rule targets a specific part of a document (e.g., "Chapter 3 must contain..."), when documents are too long for a single LLM context window, or when you need to find where a specific entity lives within a large document. Implements the "onion peeler" approach for chunking. Also use for documents over 100 pages where full-document processing is impractical.
3
+ description: >
4
+ Design production-grade document chunking mechanisms for verification workflows. Use when
5
+ building the chunking step of a workflow that will run repeatedly on many documents.
6
+ The approach: observe sample documents, find structural patterns, write a chunking script
7
+ in code; that script then runs in production. Also use for navigating large documents via
8
+ hierarchical structure when a rule targets a specific section.
9
+ For quick, cheap batch chunking during exploration, use document-chunking instead.
4
10
  ---
5
11
 
6
12
  # Tree Processing
7
13
 
8
14
  Most verification rules do not need the entire document. They need a specific section, a specific table, a specific disclosure. The tree is your map for navigating large documents efficiently.
9
15
 
16
+ ## Production Chunking Methodology
17
+
18
+ For verification workflows that process many documents, the chunking mechanism must be precise, consistent, and fast. The approach:
19
+
20
+ 1. **Observe**: Read 3-5 sample documents. Note their structure — headers, numbering, section patterns.
21
+ 2. **Find patterns**: Identify what's consistent (header format, numbering convention, TOC structure).
22
+ 3. **Write code**: Design a chunking script (regex-based splitter, header detector, TOC parser) that captures the pattern.
23
+ 4. **Test**: Run the script on samples. Verify it produces correct, consistent chunks.
24
+ 5. **Deploy**: The script runs in production workflows. It's deterministic, free, and fast.
25
+
26
+ This is different from `document-chunking` (quick, cheap splits for exploration). Production chunking is a one-time design effort that pays off across all documents of the same type.
27
+
10
28
  ## Why Trees
11
29
 
12
30
  Two reasons: