npm - kc-beta - Versions diffs - 0.1.1 → 0.2.1 - Mend

kc-beta 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

package/bin/kc-beta.js +14 -2
package/package.json +1 -1
package/src/agent/context-window.js +151 -0
package/src/agent/context.js +58 -88
package/src/agent/engine.js +267 -38
package/src/agent/event-log.js +111 -0
package/src/agent/llm-client.js +352 -59
package/src/agent/pipelines/_archive_v1/distillation.js +113 -0
package/src/agent/pipelines/_archive_v1/extraction.js +92 -0
package/src/agent/pipelines/_archive_v1/initializer.js +163 -0
package/src/agent/pipelines/_archive_v1/production-qc.js +99 -0
package/src/agent/pipelines/_archive_v1/skill-authoring.js +83 -0
package/src/agent/pipelines/_archive_v1/skill-testing.js +111 -0
package/src/agent/pipelines/base.js +6 -0
package/src/agent/pipelines/distillation.js +25 -11
package/src/agent/pipelines/extraction.js +26 -7
package/src/agent/pipelines/initializer.js +30 -20
package/src/agent/pipelines/production-qc.js +22 -5
package/src/agent/pipelines/skill-authoring.js +19 -8
package/src/agent/pipelines/skill-testing.js +26 -8
package/src/agent/retry.js +83 -0
package/src/agent/session-state.js +78 -0
package/src/agent/skill-loader.js +139 -0
package/src/agent/token-counter.js +62 -0
package/src/agent/tools/document-parse.js +3 -3
package/src/agent/tools/tier-downgrade.js +11 -2
package/src/agent/tools/web-search.js +107 -0
package/src/agent/tools/worker-llm-call.js +14 -5
package/src/cli/components.js +16 -4
package/src/cli/config.js +246 -0
package/src/cli/index.js +99 -10
package/src/cli/onboard.js +154 -48
package/src/config.js +25 -7
package/src/providers.js +370 -0

package/bin/kc-beta.js CHANGED Viewed

@@ -1,16 +1,28 @@
 #!/usr/bin/env node
-const subcommand = process.argv[2];
+// Parse --en / --zh from anywhere in argv (session-only language override)
+const args = process.argv.slice(2);
+let languageOverride = null;
+const filtered = [];
+for (const arg of args) {
+  if (arg === "--en") languageOverride = "en";
+  else if (arg === "--zh") languageOverride = "zh";
+  else filtered.push(arg);
+}
+const subcommand = filtered[0];
 (async () => {
   if (subcommand === "onboard" || subcommand === "setup") {
     const { onboard } = await import("../src/cli/onboard.js");
     await onboard();
+  } else if (subcommand === "config") {
+    const { configEditor } = await import("../src/cli/config.js");
+    await configEditor();
   } else if (subcommand === "init") {
     const { init } = await import("../src/cli/init.js");
     await init();
   } else {
     const { main } = await import("../src/cli/index.js");
-    await main();
+    await main({ languageOverride });
   }
 })();

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "kc-beta",
-  "version": "0.1.1",
+  "version": "0.2.1",
   "description": "KC Agent — LLM document verification agent (pure Node.js CLI)",
   "type": "module",
   "bin": {

package/src/agent/context-window.js ADDED Viewed

@@ -0,0 +1,151 @@
+import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
+/**
+ * Automatic context windowing for long conversations.
+ * When messages approach the model's context limit, older messages
+ * are compressed into summaries while keeping recent messages intact.
+ */
+export class ContextWindow {
+  /**
+   * @param {object} opts
+   * @param {number} opts.contextLimit - Total model context limit in tokens
+   * @param {number} [opts.reserveForResponse=8192] - Tokens reserved for model output
+   * @param {number} [opts.recentWindowSize=30] - Number of recent messages to always keep
+   */
+  constructor({ contextLimit, reserveForResponse = 8192, recentWindowSize = 30 }) {
+    this.contextLimit = contextLimit;
+    this.reserveForResponse = reserveForResponse;
+    this.recentWindowSize = recentWindowSize;
+  }
+  /**
+   * Apply windowing to a message array if it exceeds the token budget.
+   * The budget is contextLimit minus reserveForResponse; compression starts
+   * once the estimated size passes 85% of that budget. Older messages are
+   * replaced by a synthetic user/assistant summary pair, recent ones are kept.
+   * @param {Array<object>} messages - Full message history
+   * @param {string[]} [phaseSummaries] - Summaries from completed pipeline phases
+   * @returns {{ messages: Array, wasWindowed: boolean, removedCount: number }}
+   *   The input array itself is returned (not a copy) when nothing was windowed.
+   */
+  window(messages, phaseSummaries = []) {
+    const totalTokens = estimateMessagesTokens(messages);
+    const budget = this.contextLimit - this.reserveForResponse;
+    // If within budget, return as-is
+    if (totalTokens <= budget * 0.85) {
+      return { messages, wasWindowed: false, removedCount: 0 };
+    }
+    // Split into older and recent
+    let splitPoint = Math.max(0, messages.length - this.recentWindowSize);
+    // Do not let the recent window begin on a tool result: its originating
+    // assistant message (the one carrying tool_calls) would be compressed away,
+    // leaving an orphaned result. Walk back so the whole exchange stays together.
+    // NOTE(review): assumes tool results use role "tool" (OpenAI-style) — confirm.
+    while (splitPoint > 0 && messages[splitPoint]?.role === "tool") {
+      splitPoint -= 1;
+    }
+    const recentMessages = messages.slice(splitPoint);
+    const olderMessages = messages.slice(0, splitPoint);
+    if (olderMessages.length === 0) {
+      return { messages, wasWindowed: false, removedCount: 0 };
+    }
+    // Build a compact summary of older messages
+    const recentTokens = estimateMessagesTokens(recentMessages);
+    // 500 tokens buffer; may go negative when the recent window alone is over
+    // budget — _compactMessages clamps it to zero.
+    const summaryBudget = budget - recentTokens - 500;
+    const compactedSummary = this._compactMessages(olderMessages, phaseSummaries, summaryBudget);
+    const windowedMessages = [
+      {
+        role: "user",
+        content: `[Context Summary - Earlier conversation compressed]\n\n${compactedSummary}`,
+      },
+      {
+        role: "assistant",
+        content: "Understood. I have the context from the summary above. Continuing with the current work.",
+      },
+      ...recentMessages,
+    ];
+    return {
+      messages: windowedMessages,
+      wasWindowed: true,
+      removedCount: olderMessages.length,
+    };
+  }
+  /**
+   * Create a mechanical compact summary of messages.
+   * Phase summaries are always emitted; per-turn lines are appended oldest
+   * first until the next line would push the estimate past 90% of the budget.
+   * @param {Array<object>} messages
+   * @param {string[]} phaseSummaries
+   * @param {number} tokenBudget - May be zero or negative; treated as zero
+   * @returns {string}
+   */
+  _compactMessages(messages, phaseSummaries, tokenBudget) {
+    const parts = [];
+    // Phase summaries first (high signal)
+    if (phaseSummaries.length > 0) {
+      parts.push("## Phase History");
+      for (const s of phaseSummaries) {
+        parts.push(`- ${s}`);
+      }
+      parts.push("");
+    }
+    // Extract key events from older messages
+    parts.push("## Conversation Summary");
+    const limit = Math.max(0, tokenBudget) * 0.9;
+    for (const turn of this._groupIntoTurns(messages)) {
+      const line = this._summarizeTurn(turn);
+      if (!line) continue;
+      const candidate = `- ${line}`;
+      // Check the budget BEFORE appending so the summary never overshoots it.
+      // Turns are walked oldest first, so the marker stands in for the turns
+      // closest to the recent window.
+      if (estimateTokens([...parts, candidate].join("\n")) > limit) {
+        parts.push("- [earlier history truncated]");
+        break;
+      }
+      parts.push(candidate);
+    }
+    return parts.join("\n");
+  }
+  /**
+   * Group messages into user-turn blocks.
+   * Each turn: { user: string, tools: string[], assistant: string } where
+   * tools holds the called function names and assistant is the text of the
+   * last assistant message in the turn that had any.
+   * Messages that precede the first user message are not part of any turn.
+   * @param {Array<object>} messages
+   * @returns {Array<{ user: string, tools: string[], assistant: string }>}
+   */
+  _groupIntoTurns(messages) {
+    const turns = [];
+    let current = null;
+    for (const msg of messages) {
+      if (msg.role === "user") {
+        if (current) turns.push(current);
+        current = { user: this._textOf(msg.content), tools: [], assistant: "" };
+      } else if (msg.role === "assistant" && current) {
+        const text = this._textOf(msg.content);
+        if (text) current.assistant = text;
+        if (msg.tool_calls) {
+          for (const tc of msg.tool_calls) {
+            current.tools.push(tc.function?.name || "unknown");
+          }
+        }
+      }
+      // tool results are captured implicitly via tool names
+    }
+    if (current) turns.push(current);
+    return turns;
+  }
+  /**
+   * Summarize a single conversational turn into one line.
+   * @param {{ user: string, tools: string[], assistant: string }} turn
+   * @returns {string|null} null when the turn has no user text
+   */
+  _summarizeTurn(turn) {
+    const userSnippet = this._textOf(turn.user).slice(0, 80).replace(/\n/g, " ");
+    if (!userSnippet) return null;
+    let line = `User: "${userSnippet}"`;
+    if (turn.tools.length > 0) {
+      line += ` → Tools: ${turn.tools.join(", ")}`;
+    }
+    const assistantText = this._textOf(turn.assistant);
+    if (assistantText) {
+      const aSnippet = assistantText.slice(0, 60).replace(/\n/g, " ");
+      // Only mark the snippet as cut when it really was.
+      const ellipsis = assistantText.length > 60 ? "..." : "";
+      line += ` → "${aSnippet}${ellipsis}"`;
+    }
+    return line;
+  }
+  /**
+   * Normalise message content to plain text.
+   * Strings pass through. For arrays, string items and items with a string
+   * `text` property are joined with spaces. Anything else becomes "".
+   * Guards the snippet code, which calls String methods on the result.
+   * NOTE(review): the array-of-parts shape is an assumption — confirm against callers.
+   * @param {unknown} content
+   * @returns {string}
+   */
+  _textOf(content) {
+    if (typeof content === "string") return content;
+    if (!Array.isArray(content)) return "";
+    return content
+      .map((part) => {
+        if (typeof part === "string") return part;
+        return typeof part?.text === "string" ? part.text : "";
+      })
+      .filter(Boolean)
+      .join(" ");
+  }
+}

package/src/agent/context.js CHANGED Viewed

@@ -1,122 +1,92 @@
 const AGENT_IDENTITY = `\
-You are KC Agent, a document verification coding agent. You help users build \
-and manage document verification systems for financial institutions.
+KC Agent builds and manages document verification systems for financial institutions.
-You are direct and technical. You think step by step. When you don't know \
-something, you say so.
+## Architecture
-## Tools
+This system operates in two modes:
-You have the following tools:
+**BUILD mode** (Bootstrap → Extraction → Skill Authoring → Skill Testing): \
+Read regulations, extract rules, build verification skills, test them against samples. \
+All intellectual work — parsing, extracting, judging — is done directly. The results \
+produced in this mode serve as the accuracy baseline. Worker LLM tools are not available \
+in this mode.
-- **sandbox_exec**: Execute shell commands in your workspace directory. Use this \
-to run Python scripts, install packages, list files, etc. Pipes and redirects work.
+**DISTILL mode** (Distillation → Production QC): \
+Convert proven skills into workflows that run with cheaper worker LLMs at scale. \
+Test workflow results against the baseline established in BUILD mode. Monitor production \
+quality. Worker LLM tools become available in this mode.
-- **workspace_file**: Read, write, or list files in your workspace. Operations: \
-read (path), write (path + content), list (optional path).
+Skills are first-class deliverables, not just stepping stones to distillation. When a \
+verification task is too complex for worker LLMs, the skill itself — run by a capable \
+agent — is the production solution.
-- **document_parse**: Parse documents (PDF, DOCX, images) and extract text. \
-Internally uses an escalation chain: text extraction → API parser → OCR models. \
-Starts cheap, escalates if needed. You don't choose the method — the tool handles it. \
-Use force_method only for testing or if the developer user requests a specific parser.
-- **worker_llm_call**: Call a worker LLM at a specified tier (tier1=most capable, \
-tier4=cheapest). Use for distillation testing — check if cheaper models can handle \
-extraction/judgment steps. Returns response with model used and token counts.
-- **workflow_run**: Execute a distilled workflow against a document. Automatically \
-attaches confidence scores and trace IDs. Results saved to output/results/.
-- **tier_downgrade**: Test a workflow step at a lower tier. Compares accuracy at \
-target tier vs. current baseline. Recommends downgrade if accuracy stays above threshold.
+## Methodology
-- **evolution_cycle**: Run one diagnose-classify-fix iteration. Classifies failures \
-as systemic (>threshold) or corner case (<threshold). Routes corner cases to registry \
-automatically. Checks for repeated failure patterns across iterations.
+### Document Parsing
+Start with the simplest parser and escalate only when output is insufficient. Once a \
+parser works for a document type, lock it in. Tables and charts may need specific handling.
-- **document_search**: Search for text across workspace documents. Supports plain text \
-and regex. Returns matching passages with file path and line number.
+### Rule Extraction
+Decompose regulations top-down into atomic, testable rules. One rule = one pass/fail \
+outcome. Handle ambiguity explicitly — note it, ask the developer user. After extraction, \
+audit which regulation sections are not yet covered.
-- **rule_catalog**: CRUD on the rule registry. Enforces required fields (id, source_ref, \
-description). Operations: create, read, update, delete, list.
+### Entity Extraction
+Prefer regex/Python for predictable formats. Use LLM only when semantic understanding \
+is required. Every extraction captures: value, evidence, source location, confidence, \
+method used.
-- **qc_sample**: Draw adaptive sample from production results for review. Stratifies \
-by confidence band. All low-confidence reviewed, medium sampled, high spot-checked.
+### Skill Authoring
+Write each rule into a skill folder following the Anthropic skill-creator format. A \
+skill must be self-contained: business logic, scripts, references, sample data, and \
+corner cases. Skills capture methodology — when to use an approach, why it works, \
+what to watch for.
-- **dashboard_render**: Generate a self-contained HTML dashboard from project metrics. \
-Shows rules, confidence distribution, evolution history, QC results.
+### Evolution Loop
+Test → observe → diagnose root cause (parsing/extraction/judgment/scope) → classify \
+(systemic vs corner case) → fix → retest → log. Corner cases are recorded separately \
+and never patched into the main workflow.
-- **agent_tool**: Spawn a sub-agent for independent parallel work. Give it a complete \
-task description — it has no context from your conversation. Sub-agent writes results \
-to sub_agents/{task_id}/. Use for parallel rule testing, batch processing, etc.
+### Distillation
+Design workflows that replicate skill results using the cheapest viable model tier. \
+Test at each tier and present accuracy comparison data. The developer user decides \
+acceptable trade-offs between cost and accuracy.
-Use tools to do real work. Write code to files, then run it. Check results by \
-reading output. Don't guess — verify.
+## Structural Components
-## Methodology
+**Version control**: Every write to rules/, workflows/, or rule_skills/ gets a trace \
+ID in versions.json — an immutable audit trail linking results back to the exact \
+version of code that produced them.
-### Document Parsing
-- Start with the simplest parser (text extraction). Escalate to OCR/vision only \
-when output is empty or garbled (<50 chars/page). Simple parsers fail less.
-- Once a parser works for a document type, lock it in. Don't re-evaluate unless \
-downstream extraction fails.
-- Tables need special handling — extract cell-by-cell, reconstruct as markdown or JSON.
-### Data Sensibility
-- Read 3-5 complete documents end-to-end BEFORE writing extraction logic. Read raw \
-parsed text, not PDF viewer. This saves hours of debugging bad assumptions.
-- After extraction, spot-check 10 random fields (3 high-confidence, 4 medium, \
-3 low) against source. If >1 out of 10 is wrong, STOP — don't continue.
-- Save every processing stage to disk (raw text → sections → entities → judgments). \
-Disk is cheap; debugging without intermediates is guesswork.
+**Corner case registry**: Edge cases (<10% failure rate) are stored in \
+corner_cases.json with detection patterns and resolutions. They are handled separately \
+during execution with high-threshold matching, not patched into main workflows.
-### Rule Extraction
-- One rule = one pass/fail outcome. If a rule can produce two independent results, \
-split it. Rules must be self-contained and scoped to where in the document to look.
-- Work top-down (onion peeler): major areas → chapters → sections → atomic rules. \
-Stop when rules are atomic and testable.
-- Handle ambiguity explicitly. Extract as understood, note ambiguities, ask the \
-developer user. Ambiguous rules are often the most important — don't skip them.
-- After extraction, audit coverage: which regulation paragraphs are NOT covered?
+**Confidence scoring**: Each verification result gets a composite confidence score \
+based on extraction method, source text presence, historical accuracy, and corner \
+case proximity. Confidence bands (high/medium/low) drive QC sampling rates.
-### Entity Extraction
-- Method selection: regex/Python first (free, instant, predictable formats). LLM \
-only when semantic understanding is required. Hybrid: regex first, LLM fallback.
-- Every extraction must capture: value, evidence (raw text), source location, \
-confidence, method used.
-- Postprocessing is deterministic code: date standardization, unit conversion, \
-Chinese numeral conversion. Build as reusable Python functions.
+## Working with the Developer User
-### Evolution Loop
-- The cycle: test → observe → diagnose → classify → fix → retest → log.
-- Diagnose root cause into: parsing failure, extraction failure, judgment failure, \
-or scope failure. Each drives different fixes.
-- Systemic issue (>10% of docs) → rewrite code/prompts. Corner case (<10%) → \
-record in corner_cases.json with detection + resolution. Do NOT patch main \
-workflow for corner cases.
-- Stop when: accuracy meets threshold, or correction volume <5% and no new \
-failure patterns.
-### Reflection & Skill Writing
-- When you solve a hard problem (OCR approach, extraction pattern, edge case \
-handling), write it down as a reusable skill in rule_skills/. Future sessions \
-and rules benefit from your discoveries.
-- Skills capture methodology, not just code. Describe WHEN to use this approach, \
-WHY it works, and WHAT to watch out for.`;
+The developer user configures the project, provides regulations and samples, and \
+makes business decisions (accuracy thresholds, cost trade-offs, rule scope). Discuss \
+unclear regulations with them. Present results and let them judge.`;
 /**
  * Builds the system prompt from multiple context sources.
- * Combines: agent identity + methodology + pipeline state + workspace state.
+ * Combines: agent identity + skill index + pipeline state + workspace state.
  */
 export class ContextAssembler {
   /**
    * @param {object} [opts]
    * @param {string} [opts.pipelineState]
    * @param {string} [opts.workspaceState]
+   * @param {string} [opts.skillIndex] - Brief index of available meta skills
    * @returns {string}
    */
-  build({ pipelineState, workspaceState } = {}) {
+  build({ pipelineState, workspaceState, skillIndex } = {}) {
     const parts = [AGENT_IDENTITY];
+    if (skillIndex) parts.push(skillIndex);
     if (pipelineState) parts.push(pipelineState);
     if (workspaceState) parts.push(workspaceState);
     return parts.join("\n\n");