kc-beta 0.6.2 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/LICENSE +81 -0
  2. package/LICENSE-COMMERCIAL.md +125 -0
  3. package/README.md +21 -3
  4. package/package.json +14 -5
  5. package/src/agent/context-window.js +9 -12
  6. package/src/agent/context.js +14 -1
  7. package/src/agent/document-parser.js +169 -0
  8. package/src/agent/engine.js +382 -19
  9. package/src/agent/history/event-history.js +222 -0
  10. package/src/agent/llm-client.js +55 -0
  11. package/src/agent/message-utils.js +63 -0
  12. package/src/agent/pipelines/_milestone-derive.js +566 -0
  13. package/src/agent/pipelines/base.js +21 -0
  14. package/src/agent/pipelines/distillation.js +28 -15
  15. package/src/agent/pipelines/extraction.js +130 -36
  16. package/src/agent/pipelines/finalization.js +178 -11
  17. package/src/agent/pipelines/index.js +6 -1
  18. package/src/agent/pipelines/initializer.js +74 -8
  19. package/src/agent/pipelines/production-qc.js +31 -44
  20. package/src/agent/pipelines/skill-authoring.js +97 -80
  21. package/src/agent/pipelines/skill-testing.js +106 -23
  22. package/src/agent/retry.js +10 -2
  23. package/src/agent/scheduler.js +14 -2
  24. package/src/agent/session-state.js +18 -1
  25. package/src/agent/skill-loader.js +13 -7
  26. package/src/agent/skill-validator.js +19 -5
  27. package/src/agent/task-manager.js +61 -5
  28. package/src/agent/tools/document-chunk.js +21 -9
  29. package/src/agent/tools/phase-advance.js +37 -5
  30. package/src/agent/tools/release.js +51 -9
  31. package/src/agent/tools/rule-catalog.js +11 -1
  32. package/src/agent/tools/workspace-file.js +32 -0
  33. package/src/agent/workspace.js +39 -1
  34. package/src/cli/components.js +64 -14
  35. package/src/cli/index.js +62 -3
  36. package/src/cli/meme.js +26 -25
  37. package/src/config.js +65 -22
  38. package/src/model-tiers.json +24 -8
  39. package/src/providers.js +42 -0
  40. package/template/release/v1/README.md.tmpl +108 -0
  41. package/template/release/v1/catalog.json.tmpl +4 -0
  42. package/template/release/v1/kc_runtime/__init__.py +11 -0
  43. package/template/release/v1/kc_runtime/confidence.py +63 -0
  44. package/template/release/v1/kc_runtime/doc_parser.py +127 -0
  45. package/template/release/v1/manifest.json.tmpl +11 -0
  46. package/template/release/v1/render_dashboard.py +117 -0
  47. package/template/release/v1/run.py +212 -0
  48. package/template/release/v1/serve.sh +17 -0
  49. package/template/skills/en/meta-meta/work-decomposition/SKILL.md +326 -0
  50. package/template/skills/en/skill-creator/SKILL.md +1 -1
  51. package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +321 -0
  52. package/template/skills/zh/skill-creator/SKILL.md +1 -1
package/LICENSE ADDED
@@ -0,0 +1,81 @@
1
+ # PolyForm Noncommercial License 1.0.0
2
+
3
+ <https://polyformproject.org/licenses/noncommercial/1.0.0>
4
+
5
+ ## Acceptance
6
+
7
+ In order to get any license under these terms, you must agree to them as both strict obligations and conditions to all your licenses.
8
+
9
+ ## Copyright License
10
+
11
+ The licensor grants you a copyright license for the software to do everything you might do with the software that would otherwise infringe the licensor's copyright in it for any permitted purpose. However, you may only distribute the software according to [Distribution License](#distribution-license) and make changes or new works based on the software according to [Changes and New Works License](#changes-and-new-works-license).
12
+
13
+ ## Distribution License
14
+
15
+ The licensor grants you an additional copyright license to distribute copies of the software. Your license to distribute covers distributing the software with changes and new works permitted by [Changes and New Works License](#changes-and-new-works-license).
16
+
17
+ ## Notices
18
+
19
+ You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms or the URL for them above, as well as copies of any plain-text lines beginning with `Required Notice:` that the licensor provided with the software. For example:
20
+
21
+ > Required Notice: Copyright Yoyodyne, Inc. (http://example.com)
22
+
23
+ ## Changes and New Works License
24
+
25
+ The licensor grants you an additional copyright license to make changes and new works based on the software for any permitted purpose.
26
+
27
+ ## Patent License
28
+
29
+ The licensor grants you a patent license for the software that covers patent claims the licensor can license, or becomes able to license, that you would infringe by using the software.
30
+
31
+ ## Noncommercial Purposes
32
+
33
+ Any noncommercial purpose is a permitted purpose.
34
+
35
+ ## Personal Uses
36
+
37
+ Personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, amateur pursuits, or religious observance, without any anticipated commercial application, is use for a permitted purpose.
38
+
39
+ ## Noncommercial Organizations
40
+
41
+ Use by any charitable organization, educational institution, public research organization, public safety or health organization, environmental protection organization, or government institution is use for a permitted purpose regardless of the source of funding or obligations resulting from the funding.
42
+
43
+ ## Fair Use
44
+
45
+ You may have "fair use" rights for the software under the law. These terms do not limit them.
46
+
47
+ ## No Other Rights
48
+
49
+ These terms do not allow you to sublicense or transfer any of your licenses to anyone else, or prevent the licensor from granting licenses to anyone else. These terms do not imply any other licenses.
50
+
51
+ ## Patent Defense
52
+
53
+ If you make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company.
54
+
55
+ ## Violations
56
+
57
+ The first time you are notified in writing that you have violated any of these terms, or done anything with the software not covered by your licenses, your licenses can nonetheless continue if you come into full compliance with these terms, and take practical steps to correct past violations, within 32 days of receiving notice. Otherwise, all your licenses end immediately.
58
+
59
+ ## No Liability
60
+
61
+ ***As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.***
62
+
63
+ ## Definitions
64
+
65
+ The **licensor** is the individual or entity offering these terms, and the **software** is the software the licensor makes available under these terms.
66
+
67
+ **You** refers to the individual or entity agreeing to these terms.
68
+
69
+ **Your company** is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. **Control** means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect.
70
+
71
+ **Your licenses** are all the licenses granted to you for the software under these terms.
72
+
73
+ **Use** means anything you do with the software requiring one of your licenses.
74
+
75
+ ---
76
+
77
+ Required Notice: Copyright (C) 2024-2026 Memium / kitchen-engineer42 (https://github.com/kitchen-engineer42/kc-cli)
78
+
79
+ For commercial use (including but not limited to enterprise production
80
+ deployment, hosting as a paid service, or distribution as a product),
81
+ see LICENSE-COMMERCIAL.md for how to obtain a commercial license.
@@ -0,0 +1,125 @@
1
+ # Commercial License — KC Agent CLI (`kc-beta`)
2
+
3
+ KC is dual-licensed:
4
+
5
+ - **Noncommercial use** is governed by [LICENSE](./LICENSE) — the
6
+ PolyForm Noncommercial License 1.0.0. Personal users, students,
7
+ hobbyists, public-research organizations, charities, and government
8
+ institutions may use, modify, and self-host KC for free under those
9
+ terms.
10
+ - **Commercial use** — including any deployment of KC inside a
11
+ for-profit company's production workflow, hosting KC as a paid
12
+ service, or redistributing KC (modified or unmodified) as part of a
13
+ product or service offering — is **not** covered by the
14
+ noncommercial license and requires a separate commercial license
15
+ from the rights-holder.
16
+
17
+ This file describes how to obtain that commercial license.
18
+
19
+ ---
20
+
21
+ ## What counts as commercial use
22
+
23
+ The PolyForm noncommercial grant covers any **noncommercial purpose**.
24
+ The license body (LICENSE, "Personal Uses" + "Noncommercial
25
+ Organizations" sections) defines noncommercial purposes inclusively:
26
+ research, experiment, testing, personal study, hobby projects,
27
+ charities, educational institutions, public research bodies, public
28
+ safety / health bodies, environmental orgs, and government
29
+ institutions all qualify.
30
+
31
+ Anything outside that envelope is commercial use. Concrete examples
32
+ of activities that **require** a commercial license:
33
+
34
+ - Running KC inside a for-profit company's compliance, legal,
35
+ document-review, or audit workflow that processes business
36
+ documents — even for internal use.
37
+ - Embedding KC (modified or unmodified) into a commercial product or
38
+ SaaS offering.
39
+ - Hosting KC as a paid or freemium service to third parties.
40
+ - Redistributing KC under a different name or as part of another tool
41
+ that is sold or monetized.
42
+
43
+ Concrete examples that **do not** require a commercial license:
44
+
45
+ - A developer evaluating KC at home or on a personal laptop.
46
+ - A graduate student using KC for a thesis on regulatory NLP.
47
+ - A nonprofit law clinic using KC to help with unpaid pro-bono casework.
48
+ - A public university research lab benchmarking LLM document
49
+ verification.
50
+ - A government agency using KC for internal regulatory compliance.
51
+
52
+ If you are unsure whether your use case is noncommercial, contact us
53
+ and we will tell you in plain language. Asking is free.
54
+
55
+ ---
56
+
57
+ ## What "redistribute as a new product" forbids
58
+
59
+ Both the noncommercial license and the commercial license forbid
60
+ publishing KC's source code (or any substantial derivative) under a
61
+ different name as a competing or independent product. The
62
+ noncommercial license restricts derivatives to noncommercial use; the
63
+ commercial license is non-transferable and intended for the licensee,
64
+ not for downstream redistribution as a third-party product.
65
+
66
+ Forks for the licensee's own internal use, with proper attribution
67
+ (`Required Notice:` line preserved per the PolyForm Notices section),
68
+ are acceptable under both licenses. Forks intended to be released as
69
+ a new offering — paid or free — are not.
70
+
71
+ ---
72
+
73
+ ## How to obtain a commercial license
74
+
75
+ Email **heavysal@gmail.com** with:
76
+
77
+ 1. A short description of your intended use case (industry, scale,
78
+ internal vs customer-facing)
79
+ 2. Your organization name and country
80
+ 3. Approximate document volume / number of users (so we can scope
81
+ pricing)
82
+ 4. Whether you need any specific terms (indemnification, source-code
83
+ escrow, on-prem only, etc.)
84
+
85
+ We will respond within 5 business days with either:
86
+
87
+ - A standard commercial license offer (non-negotiable terms, fixed
88
+ per-seat or per-document pricing — fastest path), or
89
+ - A scoped negotiation for unusual cases (large enterprise, regulated
90
+ industry, special compliance requirements)
91
+
92
+ Pricing for v0.7.x onward is set per inquiry. Once we have surveyed
93
+ enough commercial licensees to set transparent rates, we will publish
94
+ a pricing sheet here.
95
+
96
+ ---
97
+
98
+ ## Compliance for licensees
99
+
100
+ A commercial license, once granted, covers the specific entity named
101
+ in the agreement and its wholly-controlled affiliates (per PolyForm's
102
+ "Your company" definition). It does **not** automatically extend to
103
+ parent companies, subsidiaries with different control, or
104
+ post-acquisition successors — those need their own license or a
105
+ written extension.
106
+
107
+ The `Required Notice: Copyright ...` line in LICENSE must remain
108
+ present in any copies of KC the licensee distributes internally.
109
+
110
+ ---
111
+
112
+ ## Reporting violations
113
+
114
+ If you believe KC is being used in violation of either license,
115
+ please email **heavysal@gmail.com** with the relevant details. The
116
+ PolyForm `Violations` section gives violators a 32-day correction
117
+ window from the date of written notice; we follow that process for
118
+ both license tracks.
119
+
120
+ ---
121
+
122
+ *Last updated: 2026-04-29. KC v0.7.0+ is licensed under PolyForm
123
+ Noncommercial 1.0.0 (this dual-license model). KC v0.6.x and earlier
124
+ were licensed under MIT and remain under MIT for those release
125
+ versions.*
package/README.md CHANGED
@@ -245,9 +245,27 @@ Bug reports and PRs welcome at <https://github.com/kitchen-engineer42/kc-cli>.
245
245
 
246
246
  ## License
247
247
 
248
- MIT. Bundled meta-skills under `template/skills/` are proprietary —
249
- distributed via npm but not open-source. See `template/skills/LICENSE` for
250
- terms.
248
+ KC v0.7.0+ is **dual-licensed** under [PolyForm Noncommercial 1.0.0](./LICENSE)
249
+ plus a separate commercial license available on request.
250
+
251
+ - **Personal users, students, hobbyists, public-research orgs,
252
+ charities, and government institutions** — use, modify, and self-host
253
+ KC for free under [LICENSE](./LICENSE) (PolyForm Noncommercial 1.0.0).
254
+ - **Enterprises in production** (for-profit company workflows, hosting
255
+ KC as a paid service, distributing KC inside a commercial product) —
256
+ require a commercial license. See [LICENSE-COMMERCIAL.md](./LICENSE-COMMERCIAL.md)
257
+ for terms and how to contact us.
258
+ - **Redistribution as a competing or independent product** — forbidden
259
+ under both license tracks. Internal forks for licensee use are fine
260
+ with the `Required Notice:` preserved; releasing KC under another
261
+ name as a new offering is not.
262
+
263
+ KC v0.6.x and earlier remain under MIT for those release versions
264
+ (licenses can't be retroactively changed); the v0.7.0 cutover applies
265
+ to all subsequent commits and releases.
266
+
267
+ Bundled meta-skills under `template/skills/` follow the same dual
268
+ license as KC itself.
251
269
 
252
270
  ---
253
271
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "kc-beta",
3
- "version": "0.6.2",
4
- "description": "KC Agent — LLM document verification agent (pure Node.js CLI)",
3
+ "version": "0.7.1",
4
+ "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
5
5
  "type": "module",
6
6
  "bin": {
7
7
  "kc-beta": "bin/kc-beta.js"
@@ -9,10 +9,17 @@
9
9
  "files": [
10
10
  "bin/",
11
11
  "src/",
12
+ "!src/cli/meme.source.js",
12
13
  "template/",
13
14
  "README.md",
14
- "QUICKSTART.md"
15
+ "QUICKSTART.md",
16
+ "LICENSE",
17
+ "LICENSE-COMMERCIAL.md"
15
18
  ],
19
+ "scripts": {
20
+ "build:meme": "node scripts/build-meme.js",
21
+ "prepublishOnly": "node scripts/build-meme.js"
22
+ },
16
23
  "homepage": "https://github.com/kitchen-engineer42/kc-cli",
17
24
  "repository": {
18
25
  "type": "git",
@@ -28,8 +35,10 @@
28
35
  "ink": "^6.0.0",
29
36
  "ink-text-input": "^6.0.0",
30
37
  "ink-spinner": "^5.0.0",
38
+ "mammoth": "^1.6.0",
31
39
  "react": "^19.0.0",
32
- "pdfjs-dist": "^4.0.0"
40
+ "pdfjs-dist": "^4.0.0",
41
+ "word-extractor": "^1.0.4"
33
42
  },
34
43
  "keywords": [
35
44
  "document-verification",
@@ -38,5 +47,5 @@
38
47
  "cli"
39
48
  ],
40
49
  "author": "kitchen-engineer42",
41
- "license": "MIT"
50
+ "license": "SEE LICENSE IN LICENSE"
42
51
  }
@@ -1,4 +1,5 @@
1
1
  import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
2
+ import { findSafeSplitPoint } from "./message-utils.js";
2
3
 
3
4
  /**
4
5
  * Automatic context windowing for long conversations.
@@ -38,18 +39,14 @@ export class ContextWindow {
38
39
  return { messages, wasWindowed: false, removedCount: 0 };
39
40
  }
40
41
 
41
- // Split into older and recent. The recent slice is fed directly to the
42
- // LLM, so it must not begin with an orphan "tool" message — those carry a
43
- // tool_call_id that references an assistant `tool_calls` entry, and if
44
- // that assistant message ended up in the compressed older slice the
45
- // provider rejects the request (OpenAI: "tool messages must follow an
46
- // assistant with tool_calls"; Anthropic: unpaired tool_use/tool_result).
47
- // Walk the split point forward past any leading tool rows so the recent
48
- // window always starts on a turn boundary.
49
- let splitPoint = Math.max(0, messages.length - this.recentWindowSize);
50
- while (splitPoint < messages.length && messages[splitPoint]?.role === "tool") {
51
- splitPoint++;
52
- }
42
+ // Split into older and recent. v0.6.3.1: tool-pair atomicity is a
43
+ // bidirectional invariant: recent[0] must not be a `tool` (orphan,
44
+ // its assistant_with_tool_calls got summarized away) AND older[-1]
45
+ // must not be `assistant_with_tool_calls` (its tool results sit at
46
+ // the start of recent and the older summary corrupts that pairing).
47
+ // Use the shared `findSafeSplitPoint` helper from message-utils.js.
48
+ const desiredSplit = Math.max(0, messages.length - this.recentWindowSize);
49
+ const splitPoint = findSafeSplitPoint(messages, desiredSplit);
53
50
  const recentMessages = messages.slice(splitPoint);
54
51
  const olderMessages = messages.slice(0, splitPoint);
55
52
 
@@ -149,12 +149,25 @@ export class ContextAssembler {
149
149
  * @param {string} [opts.pipelineState]
150
150
  * @param {string} [opts.workspaceState]
151
151
  * @param {string} [opts.skillIndex] - Brief index of available meta skills
152
+ * @param {string} [opts.projectMemory] - v0.7.0 B3: rules/PATTERNS.md
153
+ * content. Capped at ~5 KB by the caller. Surfaced for phases the
154
+ * work-decomposition skill operates in (skill_authoring + skill_testing).
152
155
  * @returns {string}
153
156
  */
154
- build({ agentMd, pipelineState, workspaceState, skillIndex } = {}) {
157
+ build({ agentMd, pipelineState, workspaceState, skillIndex, projectMemory } = {}) {
155
158
  const parts = [AGENT_IDENTITY];
156
159
  if (agentMd) parts.push(agentMd);
157
160
  if (skillIndex) parts.push(skillIndex);
161
+ if (projectMemory) {
162
+ parts.push(
163
+ "## Project memory (rules/PATTERNS.md)\n\n" +
164
+ "Patterns + decisions you've accumulated this session. Treat as " +
165
+ "your prior decisions on this corpus — apply them; update the file " +
166
+ "when you discover something better. The work-decomposition skill " +
167
+ "covers what to write here vs. NOT to write.\n\n" +
168
+ "```markdown\n" + projectMemory.trim() + "\n```",
169
+ );
170
+ }
158
171
  if (pipelineState) parts.push(pipelineState);
159
172
  if (workspaceState) parts.push(workspaceState);
160
173
  return parts.join("\n\n");
@@ -0,0 +1,169 @@
1
+ /**
2
+ * v0.7.0 G (#91): native document parser dispatcher.
3
+ *
4
+ * Centralizes the "given a file path, give me text" operation across
5
+ * formats KC handles. Strategy stack:
6
+ *
7
+ * .pdf → pdfjs-dist (already a hard dep)
8
+ * .docx → mammoth (npm dep, dynamic-imported)
9
+ * .doc → word-extractor (npm dep, dynamic-imported)
10
+ * .txt / .md / .csv / .json → fs.readFileSync UTF-8 (with GBK fallback for CJK)
11
+ * anything → plaintext-utf8 best-effort, then LibreOffice fallback
12
+ *
13
+ * `mammoth` and `word-extractor` are dynamic-imported so the module
14
+ * degrades gracefully when they're not installed: missing dep → fall
15
+ * through to plaintext / LibreOffice. Lets KC ship without forcing
16
+ * users to run `npm install` post-upgrade if they don't touch
17
+ * DOCX/DOC content.
18
+ *
19
+ * The standalone PDF tool `tools/document-parse.js` (which has its
20
+ * own VLM/OCR escalation logic for image-PDFs) keeps its richer
21
+ * pipeline; this module is for the lower-friction "just give me
22
+ * text" path that document-chunk uses.
23
+ */
24
+
25
+ import fs from "node:fs";
26
+ import path from "node:path";
27
+ import { spawnSync } from "node:child_process";
28
+
29
/**
 * Extract plain text from a document, picking a parser by file extension.
 *
 * Order of attempts:
 *   1. The extension's native parser: `.pdf` → pdfjs, `.docx` → mammoth,
 *      `.doc` → word-extractor, `.txt`/`.md`/`.csv`/`.json` → plaintext.
 *   2. A generic plaintext decode, accepted immediately only when the
 *      decoded text looks clean (no U+FFFD replacement characters and no
 *      NUL characters).
 *   3. LibreOffice CLI conversion.
 *   4. The lossy plaintext decode from step 2, when the file was readable.
 *
 * Step 2 used to accept any readable file, which meant a binary document
 * whose native parser failed was returned as garbage "text" and the
 * LibreOffice fallback could never run for a readable file.
 *
 * @param {string} filePath - Path of the document to read.
 * @returns {Promise<{text: string, via: string, ok: boolean, error?: string}>}
 *   `via` names the strategy that produced `text`; when every strategy
 *   fails, `ok` is false, `text` is empty and `error` describes why.
 */
export async function extractText(filePath) {
  const suffix = path.extname(filePath).toLowerCase();

  if (suffix === ".pdf") {
    const text = await _tryPdfjs(filePath);
    if (text !== null) return { text, via: "pdfjs", ok: true };
  }

  if (suffix === ".docx") {
    const text = await _tryMammoth(filePath);
    if (text !== null) return { text, via: "mammoth", ok: true };
  }

  if (suffix === ".doc") {
    const text = await _tryWordExtractor(filePath);
    if (text !== null) return { text, via: "word-extractor", ok: true };
  }

  if (suffix === ".txt" || suffix === ".md" || suffix === ".csv" || suffix === ".json") {
    const text = _tryPlaintext(filePath);
    if (text !== null) return { text, via: "plaintext", ok: true };
  }

  // Generic fallbacks for anything that could not be parsed natively (or
  // whose native library is not installed).
  const plain = _tryPlaintext(filePath);
  const plainLooksBinary =
    plain !== null && (plain.includes("\uFFFD") || plain.includes("\u0000"));
  if (plain !== null && !plainLooksBinary) {
    return { text: plain, via: "plaintext_fallback", ok: true };
  }

  // The plaintext decode is missing or looks like binary data, so give
  // LibreOffice a chance to produce real text before settling for it.
  const lo = _tryLibreOffice(filePath);
  if (lo !== null) return { text: lo, via: "libreoffice_fallback", ok: true };

  // Last resort: keep the previous behavior of returning the lossy decode
  // so callers still receive whatever could be read.
  if (plain !== null) return { text: plain, via: "plaintext_fallback", ok: true };

  return {
    text: "",
    via: "none",
    ok: false,
    error: `no parser available for ${suffix || "(no extension)"}`,
  };
}
71
+
72
+ // --- internals ---
73
+
74
/**
 * Extract the text layer of a PDF with pdfjs-dist.
 *
 * Text items on a page are joined with a single space and pages are joined
 * with a newline. The pdf.js loading task is destroyed once extraction
 * finishes (or fails) so parsed documents do not accumulate in memory
 * across calls.
 *
 * @param {string} filePath - Path of the PDF to read.
 * @returns {Promise<string|null>} The extracted text, or null when the
 *   library cannot be imported, the file cannot be read, or parsing throws.
 */
async function _tryPdfjs(filePath) {
  let loadingTask = null;
  try {
    const pdfjsLib = await import("pdfjs-dist/legacy/build/pdf.mjs");
    const data = new Uint8Array(fs.readFileSync(filePath));
    loadingTask = pdfjsLib.getDocument({ data, useSystemFonts: true });
    const doc = await loadingTask.promise;
    const pageTexts = [];
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const content = await page.getTextContent();
      // Items without a `str` field contribute an empty string.
      pageTexts.push(content.items.map((item) => item.str || "").join(" "));
    }
    return pageTexts.join("\n");
  } catch {
    return null;
  } finally {
    if (loadingTask) {
      // Cleanup is best-effort: a failure here must not change the result.
      try {
        await loadingTask.destroy();
      } catch {
        /* ignore cleanup failure */
      }
    }
  }
}
90
+
91
/**
 * Pull the raw text out of a .docx file via the dynamically imported
 * `mammoth` package.
 *
 * @param {string} filePath - Path of the .docx file.
 * @returns {Promise<string|null>} The extracted text (an empty string when
 *   mammoth yields a falsy value), or null when the import or the
 *   extraction throws.
 */
async function _tryMammoth(filePath) {
  let rawText;
  try {
    const docxLib = await import("mammoth");
    const { value } = await docxLib.extractRawText({ path: filePath });
    rawText = value;
  } catch {
    // Covers both "mammoth is not installed" and "file is unreadable".
    return null;
  }
  return rawText || "";
}
100
+
101
/**
 * Read the body text of a legacy .doc file via the dynamically imported
 * `word-extractor` package.
 *
 * @param {string} filePath - Path of the .doc file.
 * @returns {Promise<string|null>} The document body (an empty string when
 *   the body is falsy), or null when the import or the extraction throws.
 */
async function _tryWordExtractor(filePath) {
  try {
    const wordExtractorModule = await import("word-extractor");
    const Extractor = wordExtractorModule.default;
    const parsedDoc = await new Extractor().extract(filePath);
    const body = parsedDoc.getBody();
    return body ? body : "";
  } catch {
    return null;
  }
}
111
+
112
/**
 * Read a file as text, preferring UTF-8 and falling back to GBK.
 *
 * The UTF-8 decode is returned when it contains no replacement characters.
 * Otherwise a GBK decode is attempted and returned if it is non-empty and
 * free of replacement characters. If neither decode is clean, the UTF-8
 * decode is returned with its replacement characters left in place, so the
 * caller can decide whether to use it.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string|null} The decoded text, or null when reading or decoding
 *   throws.
 */
function _tryPlaintext(filePath) {
  const REPLACEMENT_CHAR = "�";
  const isClean = (decoded) => decoded.indexOf(REPLACEMENT_CHAR) === -1;

  try {
    const bytes = fs.readFileSync(filePath);
    const asUtf8 = bytes.toString("utf-8");
    if (isClean(asUtf8)) {
      return asUtf8;
    }

    // GBK is only tried once UTF-8 has produced replacement characters.
    let asGbk = null;
    try {
      asGbk = new TextDecoder("gbk", { fatal: false }).decode(bytes);
    } catch {
      /* GBK not supported on this Node build */
    }
    if (asGbk && isClean(asGbk)) {
      return asGbk;
    }

    // Neither decode was clean: hand back the lossy UTF-8 text.
    return asUtf8;
  } catch {
    return null;
  }
}
132
+
133
/**
 * Convert a document to text with the LibreOffice CLI (best-effort).
 *
 * The conversion writes `<stem>.txt` into a `.kc-lo-out` scratch directory
 * created next to the input file. Both the converted file and the scratch
 * directory (when empty) are removed before returning, on success and on
 * failure alike, so no artifacts are left beside the user's documents.
 *
 * @param {string} filePath - Path of the document to convert.
 * @returns {string|null} The converted text, or null when no LibreOffice
 *   binary is found, the conversion exits non-zero or times out, the
 *   expected output file is missing, or any step throws.
 */
function _tryLibreOffice(filePath) {
  const lo = _findLibreOffice();
  if (!lo) return null;

  let outDir = null;
  let out = null;
  try {
    outDir = path.join(path.dirname(filePath), ".kc-lo-out");
    const stem = path.basename(filePath, path.extname(filePath));
    out = path.join(outDir, `${stem}.txt`);

    fs.mkdirSync(outDir, { recursive: true });
    const r = spawnSync(
      lo,
      ["--headless", "--convert-to", "txt", "--outdir", outDir, filePath],
      { timeout: 60_000 },
    );
    // A timed-out or signalled process reports a null status, which is
    // treated as a failure here as well.
    if (r.status !== 0) return null;
    if (!fs.existsSync(out)) return null;
    return fs.readFileSync(out, "utf-8");
  } catch {
    return null;
  } finally {
    // Best-effort cleanup. rmdirSync only removes an empty directory, so
    // unrelated files that happen to be in the scratch directory survive.
    if (out) {
      try {
        fs.rmSync(out, { force: true });
      } catch {
        /* ignore */
      }
    }
    if (outDir) {
      try {
        fs.rmdirSync(outDir);
      } catch {
        /* ignore: directory missing or not empty */
      }
    }
  }
}
158
+
159
/**
 * Locate a LibreOffice command-line binary.
 *
 * Probes `soffice` and then `libreoffice` by running each with `--version`
 * (synchronously, with a 5 second timeout) and returns the first command
 * that exits with status 0.
 *
 * @returns {string|null} The working command name, or null if neither
 *   candidate runs successfully.
 */
function _findLibreOffice() {
  const runsCleanly = (binary) => {
    try {
      return spawnSync(binary, ["--version"], { timeout: 5_000 }).status === 0;
    } catch {
      // Treated the same as "not on PATH".
      return false;
    }
  };
  return ["soffice", "libreoffice"].find(runsCleanly) ?? null;
}