npm - kc-beta - Versions diffs - 0.7.3 → 0.8.1 - Mend

kc-beta 0.7.3 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109) hide show

package/template/workflows/common/utils.py ADDED Viewed

@@ -0,0 +1,132 @@
+"""KC workflow helpers (v0.8.1 P10-B).
+Common utilities for distilled workflows. Provider-agnostic, no
+external dependencies. Reusable across rule check.py / workflow.py
+files so that per-rule scripts stay focused on rule-specific logic.
+Currently:
+  strip_annotations(text)     — drop reviewer-annotation footers
+                                from sample documents so per-rule
+                                check.py regex doesn't false-positive
+                                on the annotation itself
+  detect_report_type(text)    — light-touch report-type classifier
+                                (年报 / 半年报 / 季报 / 月报 / 周报 / 其他)
+                                used by rules that gate on report type
+  make_result(rule_id, verdict, evidence, confidence, **kwargs)
+                              — standardized result dict factory
+"""
+import re
# Annotation prefixes that mark reviewer-added footers in sample docs.
# These should be stripped before keyword/regex matching so per-rule
# check.py doesn't match the annotation as if it were document content.
#
# Added based on E2E #11 贷款 v0.8 audit § 9: 4/14 spot-checks
# false-positive PASS because samples contain `预期命中点: ...年化利率`
# footers that the rule's keyword regex matches.
_ANNOTATION_PREFIXES = (
    "预期命中点",
    "预期结果",
    "预期判定",
    "预期验证",
    "标注",
    "审核标注",
    "Expected",
    "expected",
    "EXPECTED",
    "Annotation",
    "annotation",
)


def strip_annotations(text, extra_prefixes=None):
    """Remove reviewer-annotation footers from document text.

    A line opens an annotation block when it starts (after optional
    whitespace) with one of the recognized prefixes followed by a
    half-width ``:`` or full-width ``：`` colon. That line is dropped,
    together with every following continuation line, i.e. every
    non-blank line that begins with a space, a tab, ``-``, ``*`` or
    ``·``. The block ends at:

    * a blank line, which is dropped as well, or
    * the first non-blank line that is not a continuation line, which
      is kept (unless it opens a new annotation block itself).

    Args:
        text: Document text. Falsy values (``None``, ``""``) are
            returned unchanged.
        extra_prefixes: Optional project-specific annotation labels.
            Either an iterable of strings or a single string, which is
            treated as one label. Empty labels are ignored.

    Returns:
        The cleaned text, with lines joined by ``"\\n"``. The input is
        never mutated.
    """
    if not text:
        return text

    prefixes = tuple(_ANNOTATION_PREFIXES)
    if extra_prefixes:
        # A bare string would otherwise be exploded into one prefix per
        # character by tuple().
        if isinstance(extra_prefixes, str):
            extra_prefixes = (extra_prefixes,)
        # An empty label would turn the alternation into "match
        # anything", flagging every line that starts with a colon.
        prefixes += tuple(p for p in extra_prefixes if p)

    # `<prefix>` + optional spaces + colon. The character class must
    # contain BOTH the ASCII colon and the full-width colon (U+FF1A);
    # Chinese sample documents use the latter.
    alternation = "|".join(re.escape(p) for p in prefixes)
    anno_start = re.compile(rf"^\s*(?:{alternation})\s*[:：]")
    continuation_marks = (" ", "\t", "-", "*", "·")

    kept = []
    in_anno_block = False
    for line in text.split("\n"):
        if anno_start.match(line):
            in_anno_block = True
            continue
        if in_anno_block:
            if line.strip() and line.startswith(continuation_marks):
                # Indented / bulleted line: still part of the annotation.
                continue
            in_anno_block = False
            if not line.strip():
                # The blank line that terminates the block is dropped too.
                continue
        kept.append(line)
    return "\n".join(kept)
# Ordered (label, pattern) pairs; the first pattern that matches wins.
#
# "半年报" / "半年度报告" literally contain "年报" / "年度报告", so the
# annual pattern uses a negative lookbehind to ignore occurrences that
# are preceded by "半". Without it every semi-annual report would be
# classified as an annual report and the "半年报" entry could never match
# Chinese text.
_REPORT_TYPE_PATTERNS = [
    ("年报", re.compile(r"(?<!半)年报|(?<!半)年度报告|annual report", re.IGNORECASE)),
    ("半年报", re.compile(r"半年报|半年度报告|interim report", re.IGNORECASE)),
    ("季报", re.compile(r"季报|季度报告|quarterly report", re.IGNORECASE)),
    ("月报", re.compile(r"月报|月度报告|monthly report", re.IGNORECASE)),
    ("周报", re.compile(r"周报|周度报告|weekly report", re.IGNORECASE)),
]


def detect_report_type(text):
    """Classify a document by report type with a light-touch keyword scan.

    Only the first 2000 characters are scanned (report-type identifiers
    usually appear in the title or on the cover page). Patterns are
    tried in the order of ``_REPORT_TYPE_PATTERNS`` and the first one
    that matches anywhere in that window decides the result, so the
    list order is a priority order, not a position order.

    Used by rules that gate on report type (e.g. R02-06/R02-08 are
    NOT_APPLICABLE for 季报).

    Args:
        text: Document text; falsy values yield "其他".

    Returns:
        One of "年报", "半年报", "季报", "月报", "周报", "其他".
    """
    if not text:
        return "其他"
    head = text[:2000]
    for kind, pattern in _REPORT_TYPE_PATTERNS:
        if pattern.search(head):
            return kind
    return "其他"
def make_result(rule_id, verdict, evidence, confidence=0.7, **kwargs):
    """Assemble the standardized dict returned by a rule check.

    Args:
        rule_id: Identifier of the rule that produced the result.
        verdict: "PASS" / "FAIL" / "WARNING" / "NOT_APPLICABLE".
        evidence: String explaining the verdict.
        confidence: Optional score (0.0-1.0); defaults to 0.7. The
            value is stored as given, no range check is performed.
        **kwargs: Any extra fields the rule needs (model_used,
            llm_calls, llm_tokens, comment, etc.). They are appended
            after the four core fields.

    Returns:
        A new dict with the keys "rule_id", "verdict", "evidence" and
        "confidence", followed by the extra fields.
    """
    core_fields = (
        ("rule_id", rule_id),
        ("verdict", verdict),
        ("evidence", evidence),
        ("confidence", confidence),
    )
    # Core fields first so their key order is stable; extras follow.
    return {**dict(core_fields), **kwargs}
+__all__ = ["strip_annotations", "detect_report_type", "make_result"]

package/template/CLAUDE.md DELETED Viewed

@@ -1,150 +0,0 @@
-# KC Reborn — Document Verification Workspace
-## What This Workspace Is
-You are a coding agent tasked with building a document verification app for the developer user's specific business scenario. The meta skills in `skills/` encode the methodology of experienced verification system architects and business analysts. You bring the intelligence and judgment to apply this methodology to the specific case at hand.
-Your goal: build a verification system that starts with you doing the work, then gradually distills your capability into cheap, fast workflows powered by worker LLMs. You are the ground truth. The workflows you create are the deliverables.
-## Roles
-- **Developer user**: The human you serve. They are a domain expert (e.g., tech lead at a bank's loan department). They provide the rules, the documents, and the business context. Discuss decisions with them.
-- **You (the coding agent)**: You are both the Builder (creating skills and workflows) and the Observer (judging quality). You do the verification first, prove it works, then teach smaller models to replicate your results.
-- **Worker LLMs**: The performers. Models configured in `.env` (TIER1 through TIER4) that will execute the workflows you build. Your job is to find the smallest model that works for each task.
-## Workspace Layout
-```
-Rules/       — Regulation documents, compliance notes from the developer user
-Samples/     — Sample documents for testing (your training set)
-Input/       — Production document batches awaiting verification
-Output/      — Verification results
-skills/      — Meta skills encoding verification methodology
-.env         — Configuration: API keys, model tiers, thresholds, language
-```
-Note: KC's session workspace under `~/.kc_agent/workspaces/<sessionId>/`
-uses lowercase counterparts (`rules/`, `samples/`, `input/`, `output/`,
-`logs/`, `workflows/`, `rule_skills/`) — these are runtime-internal and
-separate from this project's user-facing folders above. The asymmetry
-is intentional: title-case for human-facing project dirs, lowercase for
-KC's working state.
-## Your Mission
-Follow this lifecycle. Each step references the skill(s) to consult:
-1. **Bootstrap** → Read `bootstrap-workspace`. Understand the business scenario, read Rules/, scan Samples/, configure .env with the developer user.
-2. **Extract Rules** → Read `rule-extraction`. Decompose regulation documents into atomic, testable verification rules.
-3. **Decompose Tasks** → Read `task-decomposition`. For each rule, break the verification into sub-tasks and assign the optimal method (rule, code, LLM, or manual) to each.
-4. **Map Rule Relationships** → Read `rule-graph`. Identify shared entities, dependencies, and conflicts between rules. Each rule stays independently executable.
-5. **Write Rule Skills** → Read `skill-authoring`. Write each rule into a skill folder. Before writing extraction logic for a new document type, consult `data-sensibility` to observe the data first.
-6. **Test Skills** → Apply each skill to Samples/. Use `evolution-loop` to diagnose failures and iterate. Continue until accuracy meets SKILL_ACCURACY threshold in .env.
-7. **Distill to Workflows** → Read `skill-to-workflow`. Convert proven skills into Python code + worker LLM prompts. Test workflows against your own results as ground truth. Iterate until WORKFLOW_ACCURACY is met.
-8. **Production QC** → Read `quality-control` and `confidence-system`. Run workflows on Input/. Sample and review results based on confidence scores. For multi-document cases, read `cross-document-verification`. Use `evolution-loop` when quality drops.
-9. **Stabilize** → Gradually reduce monitoring as workflows prove reliable. Only intervene when rules change or quality drops.
-10. **Report** → Read `dashboard-reporting`. Generate HTML dashboards so the developer user can see results, progress, and issues. Ensure dashboards include feedback collection mechanisms for users.
-Throughout: use `version-control` to track all changes. Use `corner-case-management` to handle edge cases without polluting workflows. Use `task-decomposition` and `rule-graph` to inform optimization decisions.
-## Core Principles
-- **Minimum viable model**: Always use the smallest, cheapest, fastest model that meets the accuracy threshold. Start simple, escalate only when necessary.
-- **JIT structure**: Do not design schemas or formats prematurely. Define them when needed, keep them consistent once defined.
-- **OTF evolution**: The system you build today may look completely different tomorrow. Embrace change.
-- **Skills before workflows**: Prove each rule works as a skill (you executing it) before distilling into code + worker LLM prompts.
-- **Log everything**: Every test iteration, every evolution decision, every version change. Both JSON (machine-readable) and plain text (human-readable).
-## How to Read Skills
-Skills use progressive disclosure:
-1. **Frontmatter** (name + description) — always visible, ~100 words. Tells you WHEN to use the skill.
-2. **SKILL.md body** — read when the skill is relevant. Under 500 lines. Conveys methodology, not recipes.
-3. **references/** — read on demand for detailed technical reference.
-4. **scripts/** — executable code you can run or adapt.
-5. **assets/** — data files, templates, examples.
-Skills convey philosophy and decision frameworks. Adapt them to the specific business case. Do not follow them rigidly.
-## Communication with Developer User
-- **Proactively discuss**: rule granularity, accuracy thresholds, model selection, edge cases.
-- **Report progress**: after each testing round, share results and next steps.
-- **Escalate**: when you cannot resolve an issue after iterating, surface it with evidence.
-- **Ask**: the developer user is a domain expert. When in doubt about a rule's intent, ask.
----
-# KC Reborn — 文档核查工作区
-## 这是什么
-你是一个编程智能体，负责为开发者用户的具体业务场景构建文档核查应用。`skills/` 中的元技能编码了资深核查系统架构师和业务分析师的方法论。你负责运用智慧和判断力，将这些方法论应用到具体场景中。
-你的目标：构建一个核查系统，先由你亲自执行核查工作，然后逐步将你的能力蒸馏为由 Worker LLM（执行模型）驱动的低成本、高速度的工作流。你是基准真值。你创建的工作流是最终交付物。
-## 角色定义
-- **开发者用户**：你服务的人。他们是领域专家（如银行信贷部门的技术负责人）。他们提供规则、文档和业务背景。与他们讨论决策。
-- **你（编程智能体）**：你既是构建者（创建技能和工作流），也是观察者（评判质量）。你先执行核查，证明方法可行，再教小模型复现你的结果。
-- **Worker LLM**：执行者。在 `.env` 中配置的模型（TIER1到TIER4），将执行你构建的工作流。你的任务是为每项工作找到能胜任的最小模型。
-## 工作区结构
-```
-Rules/       — 法规文件、开发者用户的合规注释
-Samples/     — 用于测试的样本文件（你的训练集）
-Input/       — 等待核查的生产批次文件
-Output/      — 核查结果
-skills/      — 编码核查方法论的元技能
-.env         — 配置：API密钥、模型层级、阈值、语言
-```
-注：KC 在 `~/.kc_agent/workspaces/<sessionId>/` 下的会话工作区使用
-小写对应目录（`rules/`、`samples/`、`input/`、`output/`、`logs/`、
-`workflows/`、`rule_skills/`）—— 这些是运行时内部目录，与本项目上面
-那些用户可见的目录是分开的。这种大小写不对称是有意的：项目里给人看
-的目录用首字母大写；KC 自己的工作状态用小写。
-## 你的使命
-遵循以下生命周期。每一步标注了需要参考的技能：
-1. **初始化** → 阅读 `bootstrap-workspace`。理解业务场景，阅读 Rules/，浏览 Samples/，与开发者用户配置 .env。
-2. **提取规则** → 阅读 `rule-extraction`。将法规文件分解为原子级、可测试的核查规则。
-3. **任务分解** → 阅读 `task-decomposition`。对每条规则，将核查过程拆解为子任务，为每个子任务分配最优方法（规则、代码、LLM 或人工）。
-4. **构建规则图谱** → 阅读 `rule-graph`。识别规则间的共享实体、依赖关系和潜在冲突。每条规则保持独立可执行。
-5. **编写规则技能** → 阅读 `skill-authoring`。将每条规则写入技能文件夹。编写新文档类型的提取逻辑前，先阅读 `data-sensibility` 观察数据。
-6. **测试技能** → 在 Samples/ 上应用每个技能。使用 `evolution-loop` 诊断失败并迭代。直到准确率达到 .env 中的 SKILL_ACCURACY 阈值。
-7. **蒸馏为工作流** → 阅读 `skill-to-workflow`。将验证过的技能转化为 Python 代码 + Worker LLM 提示词。用你自己的结果作为基准测试工作流。迭代直到达到 WORKFLOW_ACCURACY。
-8. **生产质控** → 阅读 `quality-control` 和 `confidence-system`。在 Input/ 上运行工作流。根据置信度分数抽样审查结果。涉及多文档案件时，阅读 `cross-document-verification`。质量下降时使用 `evolution-loop`。
-9. **稳定运行** → 随着工作流稳定，逐步降低监控频率。仅在规则变更或质量下降时介入。
-10. **报告** → 阅读 `dashboard-reporting`。生成 HTML 仪表板，让开发者用户直观地看到结果、进度和问题。确保仪表板内置用户反馈收集机制。
-全程使用 `version-control` 跟踪所有变更。使用 `corner-case-management` 处理边缘案例，不要污染主工作流。使用 `task-decomposition` 和 `rule-graph` 指导优化决策。
-## 核心原则
-- **最小可用模型**：始终使用能达到准确率阈值的最小、最便宜、最快的模型。从简单开始，必要时才升级。
-- **即时结构（JIT）**：不要过早设计数据结构或格式。需要时定义，定义后保持一致。
-- **即时演进（OTF）**：你今天构建的系统明天可能面目全非。拥抱变化。
-- **先技能后工作流**：先证明每条规则作为技能（你执行）可行，再蒸馏为代码 + Worker LLM 提示词。
-- **记录一切**：每次测试迭代、每个演进决策、每次版本变更。同时保存 JSON（机器可读）和纯文本（人类可读）。
-## 如何阅读技能
-技能采用渐进式披露：
-1. **前置元数据**（名称 + 描述）— 始终可见，约100字。告诉你何时使用该技能。
-2. **SKILL.md 正文** — 技能相关时阅读。500行以内。传达方法论，而非配方。
-3. **references/** — 按需阅读，获取详细技术参考。
-4. **scripts/** — 可执行代码，可直接运行或修改。
-5. **assets/** — 数据文件、模板、示例。
-技能传达的是理念和决策框架。请根据具体业务场景灵活运用，不要机械照搬。
-## 与开发者用户的沟通
-- **主动讨论**：规则粒度、准确率阈值、模型选择、边缘案例。
-- **汇报进度**：每轮测试后，分享结果和下一步计划。
-- **升级问题**：迭代后仍无法解决的问题，附带证据提交给开发者用户。
-- **多问**：开发者用户是领域专家。对规则意图有疑问时，问他们。

package/template/skills/en/meta/compliance-judgment/SKILL.md DELETED Viewed

@@ -1,82 +0,0 @@
----
-name: compliance-judgment
-description: Determine whether extracted entities comply with verification rules. Use after entity extraction to make the pass/fail judgment for each rule on each document. Covers translating natural language rules into executable logic, choosing between Python calculation and LLM semantic judgment, and producing actionable comments on failures. Also use when designing the judgment step of a workflow or when a rule's judgment logic needs debugging.
----
-# Compliance Judgment
-Judgment is the moment of truth. You have the extracted entity. You have the rule. Do they comply? The answer must be clear, correct, and — when the answer is no — accompanied by a concise, actionable comment.
-## The Judgment Spectrum
-Rules range from trivially deterministic to deeply semantic. Pick the right tool for each rule.
-**Deterministic** — threshold checks, format validation, date arithmetic, cross-field consistency. Pure Python: free, instant, deterministic.
-**Semantic** — adequacy, completeness, consistency, compliance with templates, detecting misleading or suggestive language, assessing whether a description is fair and balanced. These require language understanding — use worker LLM.
-Many real compliance rules require semantic judgment. "The risk disclosure must adequately describe the key risks" cannot be checked with regex or Python. "The contract description must not be misleading or suggestive" requires deep language understanding. Use worker LLM for these without hesitation.
-Some rules combine both: extract a number (deterministic), compare to threshold (deterministic), then assess the explanation if borderline (semantic). The mix depends on the rule.
-The right method is whatever achieves accuracy at lowest cost. Simple threshold checks don't need LLM. Semantic assessments don't benefit from Python. Most projects will have a mix — let the nature of each rule determine the method.
-## Output Format
-For each rule × document combination:
-```json
-{
-  "rule_id": "R001",
-  "document": "report_2024_q1.pdf",
-  "result": "pass | fail | missing | error | uncertain",
-  "extracted_value": "12.5%",
-  "expected": ">= 8.0%",
-  "comment": "",
-  "confidence": 0.95
-}
-```
-**Result values:**
-- **pass**: Entity complies with the rule.
-- **fail**: Entity does not comply. Comment is required.
-- **missing**: The entity could not be found in the document. This is different from fail — the information is absent, not non-compliant.
-- **error**: Something went wrong during extraction or judgment (parsing failure, API error). Needs investigation.
-- **uncertain**: The judgment is ambiguous. May need human review.
-**Design exit criteria first:** Before writing judgment logic for a rule, define the exit conditions: what constitutes pass, what constitutes fail, what triggers escalation to human, how to handle empty/missing values, what value ranges are valid. Explicit exit criteria prevent ambiguous or inconsistent judgment.
-**Prompt design:** Design prompts for what you want, not against what you don't want. "Don't include reasoning" is less reliable than extracting the verdict from structured output in postprocessing. Use output filtering instead of prompt negation.
-**Comments:**
-- Required only when result is `fail`. Skip for `pass` unless the developer user specifically requests pass comments.
-- Be concise and factual: "Capital adequacy ratio is 7.2%, below the regulatory minimum of 8.0%."
-- Do not editorialize: not "This is a serious violation that could result in penalties." Just state the facts.
-- Include the extracted value and the expected value/condition for context.
-### Lightweight Annotation Markup
-For human review, token-efficient logging, and clean diff comparisons, results can also be expressed in compact text markup:
-```
-[PASS] capital_adequacy <- 12.5% (>= 8.0%) | conf:0.95 | src:p3-s2
-[FAIL] sign_date_gap <- 75d (<= 30d) | conf:0.90 | src:p1-s4 | note:Signing overdue by 45 days
-[MISSING] collateral_value | conf:0.60 | note:Collateral valuation not found in document
-```
-This format is losslessly convertible to and from the JSON format above. Use it when presenting results to the developer user for quick review, logging to evolution iteration summaries where token economy matters, or computing diffs between verification runs. See `references/output-format.md` for the full specification and conversion rules.
-## Judgment Ordering
-Some rules depend on the results of other rules:
-- Rule B might only apply if Rule A passes. "If the borrower is a new customer (Rule A), then additional documentation is required (Rule B)."
-- Rule C might use a value computed by Rule A. "The risk-weighted capital ratio (Rule A) determines the required reserve level (Rule C)."
-Map these dependencies in the rule catalog. Execute rules in dependency order. Pass upstream results as context to downstream rules.
-## Handling Edge Cases
-- **Null extraction**: The entity was not found. Default to `missing`, not `fail`. A missing value is an extraction problem, not a compliance problem.
-- **Multiple values**: The document contains the entity in multiple places with different values. Flag as `uncertain`. Report all found values.
-- **Conditional rules**: "If the loan exceeds 1M, then collateral is required." Check the condition before applying the rule. If the condition is not met, the rule does not apply — result is `pass` (or `not_applicable` if you add that category).
-- **Negative results**: Some rules check for absence. "The document must NOT contain guarantees to related parties." Searching for absence is harder than searching for presence. Be thorough in the search, then be confident in the negative.

package/template/skills/en/meta/document-chunking/SKILL.md DELETED Viewed

@@ -1,32 +0,0 @@
----
-name: document-chunking
-description: >
-  Fast, cheap chunking for processing batches of sample and input documents.
-  Use when you need to split documents into manageable pieces for initial observation,
-  data sensibility checks, or feeding to extraction workflows. Not for production
-  verification chunking — for that, use tree-processing to design a tailored chunking script.
----
-# Document Chunking
-Split documents into pieces for downstream processing. This is the fast, cheap version — for batch processing of samples and inputs, not for precision verification workflows.
-## Methods
-**Page-level splits** — simplest. Each page is a chunk. Works for most document processing where you need to iterate over content.
-**Fixed-size chunks** — split by character/token count with overlap. Good for search and initial observation. Typical: 2000-4000 chars with 200 char overlap.
-**Header-based splits** — detect section headers and split at boundaries. Preserves semantic units. Use regex patterns for the document's header convention.
-## When to Use What
-Pick the simplest method that serves the task:
-- Batch document observation → page-level
-- Full-text search index → fixed-size with overlap
-- Section-level extraction → header-based
-- Table of contents available → parse TOC for structure
-## Relationship to tree-processing
-This skill is for quick, cheap chunking during exploration and batch processing. When you need production-grade chunking for verification workflows — where the chunking mechanism must be precise, consistent, and coded as a script — use `tree-processing` instead.

package/template/skills/en/meta/entity-extraction/SKILL.md DELETED Viewed

@@ -1,120 +0,0 @@
----
-name: entity-extraction
-description: Extract specific entities, values, and text segments from documents as required by verification rules. Use after tree processing has located the relevant section, when a rule needs a specific number, date, name, amount, clause, or any domain-specific entity extracted. Covers extraction method selection (regex vs LLM), schema design, postprocessing, and confidence annotation. Also use when designing the extraction step of a workflow for worker LLMs.
----
-# Entity Extraction
-An entity is the thing you need to check. A number, a date, a name, a clause, a percentage, a statement. The rule says what to check; extraction is how you get the value to check it against.
-## Extraction Type Taxonomy
-Different extraction scenarios call for different approaches:
-### Single Entity from Single Section
-The simplest case. One rule needs one value from one place.
-- Example: "Extract the capital adequacy ratio from the Key Metrics table."
-- Approach: Locate the section, apply regex or LLM extraction.
-### Multiple Entities from Single Section
-One rule needs several related values from the same place.
-- Example: "Extract the borrower's name, loan amount, interest rate, and maturity date from the loan agreement summary."
-- Approach: Design a single extraction call that returns all values. More efficient than multiple calls.
-### Single Entity from Multiple Sections
-One value is scattered across multiple places, or needs cross-referencing.
-- Example: "Extract the total collateral value, which may be listed in the collateral section or in Appendix A."
-- Approach: Collect content from all relevant sections, then extract. Note which source the value came from.
-### Entity from Full Document
-The value could be anywhere, or the rule applies to the document as a whole.
-- Example: "Check whether the document contains a valid signature page."
-- Approach: For the coding agent, scan the full document. For worker LLM workflows, design a two-pass approach: first pass identifies the location, second pass extracts the value.
-## Method Selection
-Extraction method selection is a cost-accuracy search. The goal is finding the cheapest method that meets the accuracy threshold. Regex is the smallest, cheapest "model" — zero cost, instant, deterministic. Worker LLM is more capable but costs tokens and time. Any search strategy is valid: try the cheapest first and escalate, try the most capable first and downgrade, bisect, or jump directly to a known-good method based on past experience in AGENT.md.
-### Available Methods
-**Regex / Python** — Cost: zero. Speed: instant. Deterministic.
-Works well for: dates, monetary amounts, percentages, identifiers, fixed phrases, any value with a predictable format.
-**Worker LLM** — Cost: API tokens. Speed: seconds. Semantic understanding.
-Works well for: contextual interpretation, conditional values, semantic matching, ambiguous structures, suggestive or misleading language detection, table interpretation, anything requiring understanding rather than pattern matching.
-Many real verification tasks require semantic understanding — "is this description misleading?", "does this clause adequately disclose risk?", "is this guarantor's business description consistent with their stated industry?" — regex cannot handle these. Use worker LLM without hesitation for such tasks.
-### The Search
-If a method's results fall below the accuracy threshold, try a different method or a more capable model. If regex works and meets accuracy — keep it, it's free. If regex produces results below threshold, escalate to worker LLM. If a cheap worker LLM isn't accurate enough, try a more capable tier. Record what works for each extraction type in AGENT.md for future reference.
-## Project Glossary
-The project glossary (built and maintained by `rule-extraction`, stored at `rules/glossary.json`) is a useful resource when designing extraction. It records canonical names and known aliases for entities that appear across rules. Reading it before extracting helps keep entity names schema-aligned and avoids parallel labels for the same thing.
-Whether the glossary becomes more than a naming convention — for instance, driving cheap pattern matching for entities with stable surface forms — is a per-project judgment. Apply the same cost-accuracy logic as elsewhere: whatever method meets the accuracy threshold for the task at hand.
-## Schema Design
-Define the expected output for each extraction. Keep it simple and JIT:
-```json
-{
-  "entity_name": "capital_adequacy_ratio",
-  "value": 12.5,
-  "unit": "%",
-  "raw_text": "资本充足率为12.5%",
-  "source_location": "Chapter 2, Table 1, Row 3",
-  "confidence": 0.95,
-  "extraction_method": "regex"
-}
-```
-The schema should capture:
-- **value**: The extracted value, normalized.
-- **unit**: If applicable (%, 元, days, etc.).
-- **raw_text**: The original text fragment where the value was found. This is evidence for the judgment step.
-- **source_location**: Where in the document the value was found.
-- **confidence**: How sure you are (see `confidence-system`).
-- **extraction_method**: What extracted it (regex, LLM-TIER2, etc.).
-Do not over-engineer the schema. Add fields as needed during testing.
-## Postprocessing
-Raw extracted values often need normalization:
-- **Chinese numerals → digits**: 一百二十万 → 1200000
-- **Date standardization**: 2024年3月15日 → 2024-03-15
-- **Unit conversion**: 万元 → multiply by 10000 if comparing to a threshold in 元.
-- **Whitespace and noise removal**: Strip extra spaces, line breaks, formatting artifacts.
-- **Percentage normalization**: 0.125 → 12.5% or vice versa, depending on what the rule expects.
-Build postprocessing as Python functions in the rule skill's `scripts/` directory. They are deterministic and reusable.
-## Confidence Annotation
-Every extraction should carry a confidence estimate:
-- **Regex match, validated format**: 0.90-0.95
-- **LLM extraction, high certainty**: 0.80-0.85
-- **LLM extraction, some ambiguity**: 0.60-0.75
-- **Fallback or inferred value**: 0.40-0.60
-- **No value found**: 0.0 (flag as MISSING)
-These are starting points. Calibrate based on actual accuracy (see `confidence-system`).
-## Prompt Design: Ask For What You Want
-Design prompts for what you want, not against what you don't want. "Don't include explanations" in a prompt is less reliable than stripping non-JSON text from the output in postprocessing. If you need to tell the LLM not to do something, use output filtering instead of prompt negation.
-## Fitting Worker LLM Context
-When designing extraction for worker LLM workflows:
-1. Calculate the prompt size: system prompt + instructions + examples + output format = N tokens.
-2. Available context for document content = model's context window - N.
-3. If the section exceeds available context, narrow further via tree processing.
-4. Always leave room for the model's response.
-5. Test with the actual model to verify the context fits — token counts from the coding agent may differ from the worker LLM's tokenizer.

package/template/skills/zh/meta/document-parsing/SKILL.md DELETED Viewed

@@ -1,101 +0,0 @@
----
-name: document-parsing
-description: Parse source documents into machine-readable text with maximum fidelity. Use when processing any document in Samples/ or Input/ for the first time, when parsed text quality is poor, or when tables and charts need special handling. Covers multi-level parser selection from simple text extraction to OCR and vision models. Also use when a verification rule fails due to parsing issues (garbled text, missing tables, mangled layouts) and the parser needs to be upgraded for that document type.
----
-# Document Parsing
-Parsing is the foundation. If the text is wrong, everything downstream is wrong. But parsing is also a cost center — do not use expensive vision models when simple text extraction works.
-## The Minimum Viable Parser Principle
-Start with the simplest parser. Escalate only when necessary. This is not about saving money — it is about producing the most reliable output. Simple parsers have fewer failure modes.
-### Level 1: Direct Text Extraction
-- Tool: pdfjs-dist or similar PDF text extraction.
-- When: Well-formed digital PDFs with embedded text. This covers most modern business documents.
-- Output: Raw text with basic structure preserved (paragraphs, basic formatting).
-- Limitations: Tables may come out as messy text. Charts and images are invisible. Scanned PDFs produce nothing.
-### Level 2: Provider VLM (Vision Language Model)
-- Tool: VLM models from configured provider (VLM_TIER3 for cheap OCR, VLM_TIER1 for complex interpretation).
-- When: Level 1 produces garbled/incomplete text, scanned PDFs, image-based PDFs.
-- Output: Recognized text from page images, or structured interpretation (table as markdown, chart data as JSON).
-- Calling a provider VLM is more convenient and reliable than deploying local OCR. Use the cheapest VLM tier first; escalate to a more capable tier for complex tables/charts.
-### Level 3: MineRU API or Local Tools (Optional)
-- Tool: MineRU API, pdfplumber, or locally deployed OCR — if configured.
-- When: Provider VLM is unavailable or too expensive for batch processing.
-- These are optional fallbacks. Most users will use Level 1 + Level 2.
-## Quality Detection
-How to know when to escalate:
-- **Low character count**: The document has pages but extracted text is very short. Likely a scanned PDF.
-- **Garbled text**: Unusual character sequences, encoding errors, or meaningless text patterns.
-- **Missing expected sections**: The table of contents mentions Chapter 5 but no Chapter 5 text was extracted.
-- **Table artifacts**: Columns of numbers without alignment, cell content mixed with headers, or table borders appearing as characters.
-- **Missing numbers in financial tables**: If a financial document's key metrics are not in the extracted text, the tables were probably not parsed.
-Write a quick quality check after parsing and before proceeding. If quality is insufficient, escalate to the next parser level.
-### Parse Quality Score
-Compute a quality score (0.0 to 1.0) from weighted heuristics to make escalation decisions systematic rather than ad-hoc. A recommended starting framework:
-- **Character density** (weight ~0.3): actual character count / expected characters for the document's page count. A 10-page PDF that yields only 200 characters likely failed.
-- **Garble ratio** (weight ~0.2): fraction of characters that are common CJK/Latin vs control characters, unusual sequences, or encoding artifacts.
-- **Section completeness** (weight ~0.3): if the document has a table of contents, what fraction of TOC entries have matching content in the extracted text?
-- **Table integrity** (weight ~0.2): for financial documents, are key numeric values that should appear in tables actually present in the extracted text?
-**Escalation thresholds** (recommended defaults — adjust freely):
-- Score >= 0.7: accept this parser level, proceed to downstream processing.
-- Score 0.4-0.7: escalate to the next parser level, re-parse, re-score.
-- Score < 0.4: skip the cheapest option and go directly to a more capable Level 2 VLM tier, or to Level 3 tools if configured, depending on document characteristics.
-**Lock-in**: once a parser level produces an acceptable score for a document type, record that level. Do not re-evaluate unless a downstream verification failure is traced back to a parsing issue.
-These weights, thresholds, and the scoring approach itself are starting points. The coding agent should design whatever quality assessment works for the specific document types at hand — a simple pass/fail heuristic may be sufficient for some scenarios; a more nuanced scoring function may be needed for others. The important pattern is: **measure quality → compare to threshold → decide whether to escalate**.
-This follows the same tier-transition pattern as model tier selection in `skill-to-workflow`: a quality/accuracy score drives the decision to stay, escalate, or skip tiers.
-## Table Handling
-Tables are critical in financial documents (balance sheets, ratio tables, compliance metrics). They deserve special attention:
-1. **Detection**: Identify table regions. Look for grid patterns, consistent column spacing, or explicit table markers.
-2. **Extraction**: Extract cell-by-cell content. Preserve the row-column relationship.
-3. **Reconstruction**: Convert to a structured format (markdown table, JSON array of rows, or CSV).
-4. **Validation**: Spot-check that key values in the reconstructed table match what is visible in the document.
-When the standard parser fails on tables, try the vision model approach: send the table image (cropped from the PDF page) to a vision model and ask it to produce a markdown table.
-## Chart Handling
-Charts (bar charts, line charts, pie charts) occasionally contain data needed for verification:
-- Extract the chart image from the document.
-- Send to a vision model with a prompt: "Extract the data points, labels, and values from this chart. Return as a JSON array."
-- Validate the extracted data against any nearby text or table that might contain the same numbers.
-This is expensive. Only do it when a verification rule specifically requires data from a chart and that data is not available in text elsewhere in the document.
-## Output Format
-Parsed documents should be saved as clean markdown:
-- Preserve the document's heading hierarchy (# Chapter, ## Section, ### Subsection).
-- Preserve lists, numbered or bulleted.
-- Convert tables to markdown table format.
-- Note page boundaries if relevant (some rules reference specific pages).
-- Strip noise: headers, footers, page numbers, watermarks (unless a rule specifically checks for them).
-Save parsed output alongside the original document for reuse across rules.
-## Caching
-Parsing is expensive (especially Level 2-3). Cache parsed output:
-- Store the parsed markdown alongside the original file.
-- Track which parser level produced it.
-- Re-parse only when: the original file changes, a rule requires higher-quality parsing than what is cached, or a verification failure is traced back to a parsing issue.