kc-beta 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/LICENSE +81 -0
  2. package/LICENSE-COMMERCIAL.md +125 -0
  3. package/README.md +21 -3
  4. package/package.json +14 -5
  5. package/src/agent/context-window.js +9 -12
  6. package/src/agent/context.js +14 -1
  7. package/src/agent/document-parser.js +169 -0
  8. package/src/agent/engine.js +499 -20
  9. package/src/agent/history/event-history.js +222 -0
  10. package/src/agent/llm-client.js +55 -0
  11. package/src/agent/message-utils.js +63 -0
  12. package/src/agent/pipelines/_milestone-derive.js +511 -0
  13. package/src/agent/pipelines/base.js +21 -0
  14. package/src/agent/pipelines/distillation.js +28 -15
  15. package/src/agent/pipelines/extraction.js +103 -36
  16. package/src/agent/pipelines/finalization.js +178 -11
  17. package/src/agent/pipelines/index.js +6 -1
  18. package/src/agent/pipelines/initializer.js +74 -8
  19. package/src/agent/pipelines/production-qc.js +31 -44
  20. package/src/agent/pipelines/skill-authoring.js +152 -80
  21. package/src/agent/pipelines/skill-testing.js +67 -23
  22. package/src/agent/retry.js +10 -2
  23. package/src/agent/scheduler.js +14 -2
  24. package/src/agent/session-state.js +35 -2
  25. package/src/agent/skill-loader.js +13 -7
  26. package/src/agent/skill-validator.js +163 -0
  27. package/src/agent/task-manager.js +61 -5
  28. package/src/agent/tools/_workflow-result-schema.js +249 -0
  29. package/src/agent/tools/document-chunk.js +21 -9
  30. package/src/agent/tools/phase-advance.js +52 -6
  31. package/src/agent/tools/release.js +51 -9
  32. package/src/agent/tools/rule-catalog.js +11 -1
  33. package/src/agent/tools/workflow-run.js +9 -4
  34. package/src/agent/tools/workspace-file.js +32 -0
  35. package/src/agent/workspace.js +61 -0
  36. package/src/cli/components.js +64 -14
  37. package/src/cli/index.js +62 -3
  38. package/src/cli/meme.js +26 -25
  39. package/src/config.js +65 -22
  40. package/src/model-tiers.json +48 -0
  41. package/src/providers.js +87 -0
  42. package/template/release/v1/README.md.tmpl +108 -0
  43. package/template/release/v1/catalog.json.tmpl +4 -0
  44. package/template/release/v1/kc_runtime/__init__.py +11 -0
  45. package/template/release/v1/kc_runtime/confidence.py +63 -0
  46. package/template/release/v1/kc_runtime/doc_parser.py +127 -0
  47. package/template/release/v1/manifest.json.tmpl +11 -0
  48. package/template/release/v1/render_dashboard.py +117 -0
  49. package/template/release/v1/run.py +212 -0
  50. package/template/release/v1/serve.sh +17 -0
  51. package/template/skills/en/meta-meta/skill-authoring/SKILL.md +19 -0
  52. package/template/skills/en/meta-meta/work-decomposition/SKILL.md +266 -0
  53. package/template/skills/en/skill-creator/SKILL.md +1 -1
  54. package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +19 -0
  55. package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +264 -0
  56. package/template/skills/zh/skill-creator/SKILL.md +1 -1
package/src/config.js CHANGED
@@ -23,8 +23,20 @@ function loadGlobalConfig() {
23
23
  */
24
24
  function loadEnvFile(envPath) {
25
25
  if (!fs.existsSync(envPath)) return {};
26
+ // v0.7.0 H9: defend bootstrap against a .env that exists but isn't
27
+ // readable (permission denied, unexpected directory, encoding error,
28
+ // race with concurrent write). Old code threw and crashed config
29
+ // bootstrap before the CLI was even up — return empty {} on any
30
+ // read failure so the user sees the more actionable
31
+ // "no API key configured" error from loadSettings instead.
32
+ let raw;
33
+ try {
34
+ raw = fs.readFileSync(envPath, "utf-8");
35
+ } catch {
36
+ return {};
37
+ }
26
38
  const env = {};
27
- const lines = fs.readFileSync(envPath, "utf-8").split("\n");
39
+ const lines = raw.split("\n");
28
40
  for (const line of lines) {
29
41
  const trimmed = line.trim();
30
42
  if (!trimmed || trimmed.startsWith("#")) continue;
@@ -51,8 +63,13 @@ export function loadSettings(workspacePath) {
51
63
  const gc = loadGlobalConfig();
52
64
  const env = workspacePath ? loadEnvFile(path.join(workspacePath, ".env")) : {};
53
65
 
66
+ // Session-scoped overrides (process.env). Internal knob for benchmarking
67
+ // — lets a single launch swap conductor/workspace/context without touching
68
+ // ~/.kc_agent/config.json. Not exposed in --help or onboard.
69
+ const penv = process.env;
70
+
54
71
  // Resolve provider metadata for authType/apiFormat defaults
55
- const provider = gc.provider || "siliconflow";
72
+ const provider = penv.KC_PROVIDER || gc.provider || "siliconflow";
56
73
  const providerDef = getProviderById(provider);
57
74
 
58
75
  const settings = {
@@ -61,10 +78,10 @@ export function loadSettings(workspacePath) {
61
78
  authType: gc.auth_type || providerDef?.authType || "bearer",
62
79
  apiFormat: gc.api_format || providerDef?.apiFormat || "openai",
63
80
 
64
- // Conductor LLM (generic keys with legacy fallback)
65
- llmApiKey: env.LLM_API_KEY || env.SILICONFLOW_API_KEY || gc.api_key || "",
66
- llmBaseUrl: env.LLM_BASE_URL || env.SILICONFLOW_BASE_URL || gc.base_url || "https://api.siliconflow.cn/v1",
67
- kcModel: gc.conductor_model || "glm-5",
81
+ // Conductor LLM (process.env wins workspace .env → global config)
82
+ llmApiKey: penv.KC_LLM_API_KEY || env.LLM_API_KEY || env.SILICONFLOW_API_KEY || gc.api_key || "",
83
+ llmBaseUrl: penv.KC_LLM_BASE_URL || env.LLM_BASE_URL || env.SILICONFLOW_BASE_URL || gc.base_url || "https://api.siliconflow.cn/v1",
84
+ kcModel: penv.KC_CONDUCTOR_MODEL || gc.conductor_model || "glm-5",
68
85
  kcMaxTokens: parseInt(env.KC_MAX_TOKENS || gc.kc_max_tokens?.toString() || "65536", 10),
69
86
 
70
87
  // Tier models (from .env or global config tiers)
@@ -78,10 +95,10 @@ export function loadSettings(workspacePath) {
78
95
  vlmTier2: env.VLM_TIER2 || gc.vlm_tiers?.tier2 || "",
79
96
  vlmTier3: env.VLM_TIER3 || gc.vlm_tiers?.tier3 || "",
80
97
 
81
- // Worker LLM — optional, defaults to conductor config
82
- workerProvider: gc.worker_provider || "",
83
- workerApiKey: env.WORKER_API_KEY || gc.worker_api_key || "",
84
- workerBaseUrl: env.WORKER_BASE_URL || gc.worker_base_url || "",
98
+ // Worker LLM — optional, defaults to conductor config (process.env wins)
99
+ workerProvider: penv.KC_WORKER_PROVIDER || gc.worker_provider || "",
100
+ workerApiKey: penv.KC_WORKER_API_KEY || env.WORKER_API_KEY || gc.worker_api_key || "",
101
+ workerBaseUrl: penv.KC_WORKER_BASE_URL || env.WORKER_BASE_URL || gc.worker_base_url || "",
85
102
  workerAuthType: gc.worker_auth_type || "",
86
103
  workerApiFormat: gc.worker_api_format || "",
87
104
 
@@ -89,8 +106,8 @@ export function loadSettings(workspacePath) {
89
106
  mineruApiUrl: env.MINERU_API_URL || "",
90
107
  mineruApiKey: env.MINERU_API_KEY || "",
91
108
 
92
- // Workspace
93
- kcWorkspaceRoot: gc.workspace_root || path.join(os.homedir(), ".kc_agent", "workspaces"),
109
+ // Workspace (process.env wins — for parallel benchmark runs)
110
+ kcWorkspaceRoot: penv.KC_WORKSPACE_ROOT || gc.workspace_root || path.join(os.homedir(), ".kc_agent", "workspaces"),
94
111
  kcExecTimeout: parseInt(env.KC_EXEC_TIMEOUT || "30", 10),
95
112
 
96
113
  // Accuracy thresholds
@@ -110,16 +127,42 @@ export function loadSettings(workspacePath) {
110
127
  tavilyApiKey: env.TAVILY_API_KEY || gc.tavily_api_key || "",
111
128
 
112
129
  // Context management — A2: prefer per-provider cap from providers.js
113
- // over the generic 200000 default. KC_CONTEXT_LIMIT env still wins.
114
- // gc.kc_context_limit (global config) is next. Then provider.contextLimit.
115
- // Then a safe 200000 fallback for unknown/custom providers.
116
- kcContextLimit: parseInt(
117
- env.KC_CONTEXT_LIMIT ||
118
- gc.kc_context_limit?.toString() ||
119
- providerDef?.contextLimit?.toString() ||
120
- "200000",
121
- 10,
122
- ),
130
+ // over the generic 200000 default. process.env.KC_CONTEXT_LIMIT wins
131
+ // (session-scoped override for benchmarking long-context models without
132
+ // editing global config), then workspace .env, then global config, then
133
+ // provider.contextLimit, then a safe 200000 fallback.
134
+ //
135
+ // v0.7.0 E3 (#96): providerContextCap is the deployment hard ceiling
136
+ // (e.g., SiliconFlow's GLM-5.1 caps at 202_752 despite the model's
137
+ // native 1M). Effective contextLimit = min(user-requested,
138
+ // providerContextCap). E2E #5 GLM hit HTTP 413 because user set
139
+ // KC_CONTEXT_LIMIT=400000 but the deployment refused at ~203k.
140
+ // The cap is applied AFTER user-priority resolution so the user
141
+ // can't accidentally bypass it.
142
+ kcContextLimit: (() => {
143
+ const requested = parseInt(
144
+ penv.KC_CONTEXT_LIMIT ||
145
+ env.KC_CONTEXT_LIMIT ||
146
+ gc.kc_context_limit?.toString() ||
147
+ providerDef?.contextLimit?.toString() ||
148
+ "200000",
149
+ 10,
150
+ );
151
+ const cap = providerDef?.providerContextCap;
152
+ if (typeof cap === "number" && cap > 0 && requested > cap) {
153
+ // Surface a one-time warning so users notice the clamp without
154
+ // burying it in events.jsonl.
155
+ // eslint-disable-next-line no-console
156
+ console.warn(
157
+ `[config] KC_CONTEXT_LIMIT=${requested} clamped to ${cap} ` +
158
+ `(provider ${providerDef.id} hardCap). E2E #5 hit HTTP 413 at ` +
159
+ `~203k on SiliconFlow GLM-5.1; cap protects against deployment ` +
160
+ `hard-ceiling rejections.`,
161
+ );
162
+ return cap;
163
+ }
164
+ return requested;
165
+ })(),
123
166
  toolOutputOffloadTokens: parseInt(env.TOOL_OUTPUT_OFFLOAD_TOKENS || gc.tool_output_offload_tokens?.toString() || "2000", 10),
124
167
  toolOutputOffloadErrorTokens: parseInt(env.TOOL_OUTPUT_OFFLOAD_ERROR_TOKENS || gc.tool_output_offload_error_tokens?.toString() || "500", 10),
125
168
  maxMessageTokens: parseInt(env.MAX_MESSAGE_TOKENS || gc.max_message_tokens?.toString() || "60000", 10),
@@ -123,6 +123,54 @@
123
123
  }
124
124
  },
125
125
 
126
+ "deepseek": {
127
+ "_comment": "DeepSeek v4 family — flagship pro + cheap flash. Native 1M context but KC caps to 200K.",
128
+ "conductor": "deepseek-v4-pro",
129
+ "llm": {
130
+ "tier1": "deepseek-v4-pro",
131
+ "tier2": "deepseek-v4-pro",
132
+ "tier3": "deepseek-v4-flash",
133
+ "tier4": "deepseek-v4-flash"
134
+ },
135
+ "vlm": {
136
+ "tier1": "",
137
+ "tier2": "",
138
+ "tier3": ""
139
+ }
140
+ },
141
+
142
+ "tencent": {
143
+ "_comment": "Tencent Hunyuan via Lkeap plan endpoint. hy3-preview is the hidden flagship (not in /models listing but accepts requests). hunyuan-t1 is a thinking model — if used as conductor, ensure v0.6.3.1 reasoning_content roundtrip is in place.",
144
+ "conductor": "hy3-preview",
145
+ "llm": {
146
+ "tier1": "hy3-preview, hunyuan-t1",
147
+ "tier2": "hunyuan-turbos, hunyuan-2.0-thinking",
148
+ "tier3": "hunyuan-2.0-instruct, tc-code-latest",
149
+ "tier4": "tc-code-latest"
150
+ },
151
+ "vlm": {
152
+ "tier1": "",
153
+ "tier2": "",
154
+ "tier3": ""
155
+ }
156
+ },
157
+
158
+ "xiaomi": {
159
+ "_comment": "Xiaomi MiMo coding plan — flagship Pro + standard + multimodal Omni. Native 1M context but KC caps to 200K. TTS variants excluded (no KC use case). Endpoint normalizes IDs to lowercase — must match exactly.",
160
+ "conductor": "mimo-v2.5-pro",
161
+ "llm": {
162
+ "tier1": "mimo-v2.5-pro",
163
+ "tier2": "mimo-v2.5",
164
+ "tier3": "mimo-v2-pro",
165
+ "tier4": "mimo-v2-pro"
166
+ },
167
+ "vlm": {
168
+ "tier1": "mimo-v2-omni",
169
+ "tier2": "mimo-v2-omni",
170
+ "tier3": ""
171
+ }
172
+ },
173
+
126
174
  "openrouter": {
127
175
  "conductor": "anthropic/claude-sonnet-4-20250514",
128
176
  "llm": {
package/src/providers.js CHANGED
@@ -47,6 +47,14 @@ const PROVIDERS = [
47
47
  apiFormat: "openai",
48
48
  modelsEndpoint: "/models",
49
49
  contextLimit: 200000, // GLM-5.1, Kimi-K2.5 — 200K native
50
+ // v0.7.0 E3 (#96): provider hardCap. SiliconFlow's GLM-5.1
51
+ // deployment caps prompts at ~202,752 tokens despite the model's
52
+ // native 1M — E2E #5 GLM hit HTTP 413 at 203,363 tokens with
53
+ // KC_CONTEXT_LIMIT=400000 set. providerContextCap protects against
54
+ // user-set context limits exceeding the deployment hard ceiling.
55
+ // Effective limit becomes min(providerContextCap, modelContextLimit,
56
+ // KC_CONTEXT_LIMIT). When undefined, no provider cap applied.
57
+ providerContextCap: 200000,
50
58
  defaultModel: getTierConfig("siliconflow").conductor || "glm-5",
51
59
  defaultTiers: getTierConfig("siliconflow").llm,
52
60
  defaultVlm: getTierConfig("siliconflow").vlm,
@@ -211,6 +219,85 @@ const PROVIDERS = [
211
219
  zh: "MiniMax",
212
220
  },
213
221
  },
222
+ {
223
+ id: "deepseek",
224
+ name: "DeepSeek",
225
+ baseUrl: "https://api.deepseek.com",
226
+ authType: "bearer",
227
+ apiFormat: "openai",
228
+ modelsEndpoint: "/models",
229
+ contextLimit: 200000, // KC cap — DeepSeek v4 is native 1M; we cap to 200K
230
+ defaultModel: getTierConfig("deepseek").conductor || "deepseek-v4-pro",
231
+ defaultTiers: getTierConfig("deepseek").llm,
232
+ defaultVlm: getTierConfig("deepseek").vlm,
233
+ curatedModels: [
234
+ { id: "deepseek-v4-pro", ownedBy: "deepseek" },
235
+ { id: "deepseek-v4-flash", ownedBy: "deepseek" },
236
+ ],
237
+ labels: {
238
+ en: "DeepSeek (v4 family)",
239
+ zh: "DeepSeek(v4 系列)",
240
+ },
241
+ },
242
+ {
243
+ id: "xiaomi",
244
+ name: "Xiaomi MiMo",
245
+ baseUrl: "https://token-plan-cn.xiaomimimo.com/v1",
246
+ authType: "bearer",
247
+ apiFormat: "openai",
248
+ modelsEndpoint: null, // Xiaomi coding-plan endpoint, no /models — use curated list
249
+ supportsCodingPlanKey: true,
250
+ contextLimit: 200000, // KC cap — MiMo V2.5 is native 1M
251
+ defaultModel: getTierConfig("xiaomi").conductor || "MiMo-V2.5-Pro",
252
+ defaultTiers: getTierConfig("xiaomi").llm,
253
+ defaultVlm: getTierConfig("xiaomi").vlm,
254
+ curatedModels: [
255
+ { id: "MiMo-V2.5-Pro", ownedBy: "xiaomi" },
256
+ { id: "MiMo-V2.5", ownedBy: "xiaomi" },
257
+ { id: "MiMo-V2-Pro", ownedBy: "xiaomi" },
258
+ { id: "MiMo-V2-Omni", ownedBy: "xiaomi" }, // multimodal
259
+ // TTS variants (MiMo-V2.5-TTS, *-VoiceClone, *-VoiceDesign, MiMo-V2-TTS)
260
+ // intentionally excluded — KC has no TTS use case.
261
+ ],
262
+ labels: {
263
+ en: "Xiaomi MiMo (V2.5 family, coding plan)",
264
+ zh: "小米 MiMo(V2.5 系列,编程计划)",
265
+ },
266
+ },
267
+ {
268
+ // Tencent Hunyuan via the Lkeap "plan" coding-token endpoint. The /models
269
+ // endpoint exposes a multi-vendor menu (glm-5.x, kimi-k2.5, minimax,
270
+ // hunyuan-*, tc-code-latest); hy3-preview is a hidden flagship that
271
+ // accepts requests but doesn't appear in /models. Curated list reflects
272
+ // what was advertised + the preview model the user has access to.
273
+ id: "tencent",
274
+ name: "Tencent Hunyuan",
275
+ baseUrl: "https://api.lkeap.cloud.tencent.com/plan/v3",
276
+ authType: "bearer",
277
+ apiFormat: "openai",
278
+ modelsEndpoint: "/models",
279
+ supportsCodingPlanKey: true,
280
+ contextLimit: 200000, // hy3-preview is officially 256K; keep below cap with margin
281
+ defaultModel: getTierConfig("tencent").conductor || "hy3-preview",
282
+ defaultTiers: getTierConfig("tencent").llm,
283
+ defaultVlm: getTierConfig("tencent").vlm,
284
+ curatedModels: [
285
+ { id: "hy3-preview", ownedBy: "tencent" }, // hidden flagship
286
+ { id: "hunyuan-t1", ownedBy: "tencent" }, // thinking model
287
+ { id: "hunyuan-turbos", ownedBy: "tencent" },
288
+ { id: "hunyuan-2.0-thinking", ownedBy: "tencent" },
289
+ { id: "hunyuan-2.0-instruct", ownedBy: "tencent" },
290
+ { id: "tc-code-latest", ownedBy: "tencent" },
291
+ // Multi-vendor pass-throughs on the same plan key:
292
+ { id: "glm-5.1", ownedBy: "system" },
293
+ { id: "kimi-k2.5", ownedBy: "system" },
294
+ { id: "minimax-m2.7", ownedBy: "system" },
295
+ ],
296
+ labels: {
297
+ en: "Tencent Hunyuan (Lkeap plan)",
298
+ zh: "腾讯混元(Lkeap 编程计划)",
299
+ },
300
+ },
214
301
  {
215
302
  id: "openrouter",
216
303
  name: "OpenRouter",
@@ -0,0 +1,108 @@
1
+ # KC Verification Release — v1
2
+
3
+ This bundle is a self-contained verification system produced by KC's
4
+ finalization phase. It runs without KC's CLI installed.
5
+
6
+ ## Project
7
+
8
+ - **Generated by**: KC v{{kc_version}}
9
+ - **Session**: `{{session_id}}`
10
+ - **Generated at**: {{generated_at}}
11
+ - **Rules**: {{rule_count}}
12
+ - **Workflows**: {{workflow_count}}
13
+
14
+ ## What this does
15
+
16
+ {{project_description}}
17
+
18
+ ## How to run
19
+
20
+ ### Prerequisites
21
+
22
+ ```
23
+ python3 >= 3.9
24
+ # Optional native parsers (recommended; falls back to LibreOffice if missing):
25
+ pip install pypdf python-docx
26
+ ```
27
+
28
+ ### Single-document smoke test
29
+
30
+ ```bash
31
+ python3 run.py --doc /path/to/document.pdf
32
+ ```
33
+
34
+ ### Full batch
35
+
36
+ ```bash
37
+ python3 run.py /path/to/input_dir/
38
+ # results land in output/results/<doc_stem>.json
39
+ # summary in output/results/summary.json
40
+ ```
41
+
42
+ ### Filter by rule
43
+
44
+ ```bash
45
+ python3 run.py /path/to/input_dir/ --rules R001,R005,R012
46
+ ```
47
+
48
+ ### Render dashboard
49
+
50
+ ```bash
51
+ python3 render_dashboard.py output/results/ > dashboard.html
52
+ ./serve.sh # http://localhost:8765/dashboard.html
53
+ ```
54
+
55
+ ## Layout
56
+
57
+ ```
58
+ release/v1/
59
+ ├── run.py # entry point
60
+ ├── render_dashboard.py # HTML dashboard renderer
61
+ ├── serve.sh # local http server shim
62
+ ├── manifest.json # populated bundle manifest
63
+ ├── catalog.json # populated rule catalog
64
+ ├── confidence_calibration.json # historical accuracy per rule (for confidence calibration)
65
+ ├── README.md # this file
66
+ ├── kc_runtime/
67
+ │ ├── __init__.py
68
+ │ ├── doc_parser.py # PDF/DOCX/TXT → text
69
+ │ └── confidence.py # calibration helpers
70
+ └── workflows/
71
+ └── <rule_id>/workflow_v1.py
72
+ ```
73
+
74
+ ## Workflow contract
75
+
76
+ Each `workflows/<rule_id>/workflow_v1.py` is a standalone Python script:
77
+
78
+ - Takes a document path on `sys.argv[1]`
79
+ - Emits a single JSON line on stdout containing the verdict
80
+ - Exit code 0 on success, non-zero on workflow-internal error
81
+
82
+ Verdict shape:
83
+
84
+ ```json
85
+ {
86
+ "rule_id": "R001",
87
+ "verdict": "PASS|FAIL|PARTIAL|NOT_APPLICABLE|UNDETERMINED|ERROR",
88
+ "confidence": 0.0,
89
+ "reason": "human-readable explanation",
90
+ "evidence": ["snippet 1", "snippet 2"]
91
+ }
92
+ ```
93
+
94
+ ## Known limitations
95
+
96
+ {{known_limitations}}
97
+
98
+ ## License
99
+
100
+ This bundle is licensed under the same terms as KC itself
101
+ (PolyForm Noncommercial 1.0.0). For commercial use, see KC's
102
+ LICENSE-COMMERCIAL.md.
103
+
104
+ ---
105
+
106
+ *Re-running this bundle on a new document set is the recommended path.
107
+ For methodology changes (new rules, threshold tuning), re-run KC's
108
+ distillation + production_qc phases and re-emit a fresh release.*
@@ -0,0 +1,4 @@
1
+ [
2
+ /* Populated by KC finalization from rules/catalog.json. Each entry: */
3
+ /* { "id": "R001", "title": "...", "description": "...", "source_ref": "..." } */
4
+ ]
@@ -0,0 +1,11 @@
1
+ """KC release runtime — v1.
2
+
3
+ Minimal Python helpers used by run.py to dispatch verification
4
+ workflows. Designed to be drop-in self-contained: stdlib + a handful
5
+ of optional native parsers (pypdf, python-docx) for document
6
+ parsing. Falls back to plaintext + LibreOffice CLI if natives
7
+ unavailable — never crashes the run on a missing dep.
8
+ """
9
+
10
+ __version__ = "1.0.0"
11
+ __all__ = ["doc_parser", "confidence"]
@@ -0,0 +1,63 @@
1
+ """
2
+ Confidence calibration helpers for the release runtime.
3
+
4
+ Workflows return raw verdicts with a self-reported confidence score.
5
+ This module re-weights that score against the historical accuracy
6
+ captured during KC's distillation phase, so users see calibrated
7
+ confidence rather than the agent's prior. Falls back to identity
8
+ when no calibration data is available.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+
14
def calibrate(verdict: dict, historical: dict) -> dict:
    """
    Adjust verdict["confidence"] using historical accuracy for the rule.

    Schema for `historical`:
        {
            "historical_accuracy": {
                "<rule_id>": {"accuracy": float in [0, 1], "n_samples": int},
                ...
            }
        }

    Blend: confidence = raw * (1 - w) + raw * accuracy * w, with
    w = min(0.5, n_samples / 100). Small n_samples keeps w near 0, so
    the workflow's raw self-reported score dominates (avoids
    over-correcting on a weak prior); large n_samples shifts up to half
    the weight onto the historical accuracy, dampening confidence
    whenever accuracy < 1. Verdicts with no rule_id, or rules with no
    calibration entry, are returned unchanged (identity fallback).

    Returns a new dict (the input verdict is not mutated) carrying
    "confidence_raw" (the original score) and
    "confidence_calibrated": True.
    """
    rule_id = verdict.get("rule_id")
    if not rule_id:
        return verdict

    hist = historical.get("historical_accuracy", {}).get(rule_id)
    if not hist:
        return verdict

    accuracy = float(hist.get("accuracy", 1.0))
    n_samples = int(hist.get("n_samples", 0))

    raw = float(verdict.get("confidence", 0.5))

    # Clamp the blend weight to [0, 0.5]: a negative n_samples from a
    # corrupt calibration file must not produce a negative weight and
    # amplify confidence above the raw score.
    weight = min(0.5, max(0.0, n_samples / 100.0))
    calibrated = raw * (1 - weight) + raw * accuracy * weight

    out = dict(verdict)
    out["confidence"] = round(calibrated, 4)
    out["confidence_raw"] = raw
    out["confidence_calibrated"] = True
    return out
55
+
56
+
57
def confidence_band(score: float) -> str:
    """Translate a numeric confidence score into a verbal band.

    >= 0.8 yields "high", >= 0.5 yields "medium", anything lower "low".
    """
    for threshold, band in ((0.8, "high"), (0.5, "medium")):
        if score >= threshold:
            return band
    return "low"
@@ -0,0 +1,127 @@
1
+ """
2
+ Minimal document parser for the release runtime.
3
+
4
+ Strategy: try native Python parsers first (pypdf, python-docx),
5
+ fall back to LibreOffice CLI if natives unavailable AND lo is on
6
+ PATH, finally fall back to UTF-8 plaintext read. Each strategy
7
+ records what it tried via the result dict so workflows can decide
8
+ whether to trust the text.
9
+
10
+ This is a release-time helper — KC's CLI mode uses its own document
11
+ parsing pipeline (src/agent/document-parser.js + LibreOffice).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import shutil
18
+ import subprocess
19
+ from pathlib import Path
20
+
21
+
22
def preflight(doc: Path) -> dict:
    """
    Cheap parseability probe for a document path.

    Returns a small status dict so workflows can skip unreadable files
    up front instead of spending a worker-LLM call on them:
    {"ok": False, "reason": "not_found" | "not_file" | "empty", "path": ...}
    on failure, or {"ok": True, "path": ..., "size_bytes": ...} on success.
    """
    path_str = str(doc)
    if not doc.exists():
        return {"ok": False, "reason": "not_found", "path": path_str}
    if not doc.is_file():
        return {"ok": False, "reason": "not_file", "path": path_str}
    size = doc.stat().st_size
    if size == 0:
        return {"ok": False, "reason": "empty", "path": path_str}
    return {"ok": True, "path": path_str, "size_bytes": size}
35
+
36
+
37
def extract_text(doc: Path) -> dict:
    """
    Pull text out of a document. Returns:
        { "text": "...", "via": "<strategy>", "ok": bool, "error"?: str }

    Strategy order:
      1. Suffix-specific native parser (pypdf for .pdf, python-docx for .docx)
      2. LibreOffice CLI fallback for office formats, if soffice is on PATH
      3. Plain UTF-8 read, then GBK (common in Chinese corpora)

    Non-decoding I/O errors on the plaintext read (missing file,
    permissions) propagate to the caller — preflight() is expected to
    gate those before this is called.
    """
    ext = doc.suffix.lower()

    native_text = None
    strategy = ""
    if ext == ".pdf":
        native_text = _try_pypdf(doc)
        strategy = "pypdf"
    elif ext == ".docx":
        native_text = _try_python_docx(doc)
        strategy = "python-docx"
    if native_text is not None:
        return {"text": native_text, "via": strategy, "ok": True}

    # LibreOffice fallback for office formats the natives couldn't handle.
    if ext in {".pdf", ".doc", ".docx", ".odt", ".rtf"}:
        lo_text = _try_libreoffice(doc)
        if lo_text is not None:
            return {"text": lo_text, "via": "libreoffice", "ok": True}

    # Plaintext fallback (covers .txt, .md, .csv, .json, etc.)
    try:
        return {"text": doc.read_text(encoding="utf-8"), "via": "plaintext_utf8", "ok": True}
    except UnicodeDecodeError:
        try:
            return {"text": doc.read_text(encoding="gbk"), "via": "plaintext_gbk", "ok": True}
        except Exception as exc:
            return {"text": "", "via": "none", "ok": False, "error": str(exc)}
74
+
75
+
76
+ # --- internals ---
77
+
78
+
79
+ def _try_pypdf(doc: Path):
80
+ try:
81
+ import pypdf # type: ignore
82
+ except ImportError:
83
+ return None
84
+ try:
85
+ reader = pypdf.PdfReader(str(doc))
86
+ return "\n".join(page.extract_text() or "" for page in reader.pages)
87
+ except Exception:
88
+ return None
89
+
90
+
91
+ def _try_python_docx(doc: Path):
92
+ try:
93
+ import docx # python-docx
94
+ except ImportError:
95
+ return None
96
+ try:
97
+ d = docx.Document(str(doc))
98
+ parts = [p.text for p in d.paragraphs]
99
+ for table in d.tables:
100
+ for row in table.rows:
101
+ for cell in row.cells:
102
+ if cell.text:
103
+ parts.append(cell.text)
104
+ return "\n".join(parts)
105
+ except Exception:
106
+ return None
107
+
108
+
109
+ def _try_libreoffice(doc: Path):
110
+ soffice = shutil.which("soffice") or shutil.which("libreoffice")
111
+ if not soffice:
112
+ return None
113
+ out_dir = doc.parent / ".kc-lo-out"
114
+ out_dir.mkdir(exist_ok=True)
115
+ try:
116
+ subprocess.run(
117
+ [soffice, "--headless", "--convert-to", "txt", "--outdir", str(out_dir), str(doc)],
118
+ capture_output=True,
119
+ timeout=60,
120
+ check=True,
121
+ )
122
+ txt_path = out_dir / (doc.stem + ".txt")
123
+ if txt_path.exists():
124
+ return txt_path.read_text(encoding="utf-8")
125
+ except Exception:
126
+ return None
127
+ return None
@@ -0,0 +1,11 @@
1
+ {
2
+ "release_version": "v1",
3
+ "kc_version": "{{kc_version}}",
4
+ "generated_at": "{{generated_at}}",
5
+ "session_id": "{{session_id}}",
6
+ "rules_count": {{rule_count}},
7
+ "workflows_count": {{workflow_count}},
8
+ "workflows": {},
9
+ "calibration_source": "confidence_calibration.json",
10
+ "documentation": "README.md"
11
+ }