kc-beta 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +81 -0
- package/LICENSE-COMMERCIAL.md +125 -0
- package/README.md +21 -3
- package/package.json +14 -5
- package/src/agent/context-window.js +9 -12
- package/src/agent/context.js +14 -1
- package/src/agent/document-parser.js +169 -0
- package/src/agent/engine.js +499 -20
- package/src/agent/history/event-history.js +222 -0
- package/src/agent/llm-client.js +55 -0
- package/src/agent/message-utils.js +63 -0
- package/src/agent/pipelines/_milestone-derive.js +511 -0
- package/src/agent/pipelines/base.js +21 -0
- package/src/agent/pipelines/distillation.js +28 -15
- package/src/agent/pipelines/extraction.js +103 -36
- package/src/agent/pipelines/finalization.js +178 -11
- package/src/agent/pipelines/index.js +6 -1
- package/src/agent/pipelines/initializer.js +74 -8
- package/src/agent/pipelines/production-qc.js +31 -44
- package/src/agent/pipelines/skill-authoring.js +152 -80
- package/src/agent/pipelines/skill-testing.js +67 -23
- package/src/agent/retry.js +10 -2
- package/src/agent/scheduler.js +14 -2
- package/src/agent/session-state.js +35 -2
- package/src/agent/skill-loader.js +13 -7
- package/src/agent/skill-validator.js +163 -0
- package/src/agent/task-manager.js +61 -5
- package/src/agent/tools/_workflow-result-schema.js +249 -0
- package/src/agent/tools/document-chunk.js +21 -9
- package/src/agent/tools/phase-advance.js +52 -6
- package/src/agent/tools/release.js +51 -9
- package/src/agent/tools/rule-catalog.js +11 -1
- package/src/agent/tools/workflow-run.js +9 -4
- package/src/agent/tools/workspace-file.js +32 -0
- package/src/agent/workspace.js +61 -0
- package/src/cli/components.js +64 -14
- package/src/cli/index.js +62 -3
- package/src/cli/meme.js +26 -25
- package/src/config.js +65 -22
- package/src/model-tiers.json +48 -0
- package/src/providers.js +87 -0
- package/template/release/v1/README.md.tmpl +108 -0
- package/template/release/v1/catalog.json.tmpl +4 -0
- package/template/release/v1/kc_runtime/__init__.py +11 -0
- package/template/release/v1/kc_runtime/confidence.py +63 -0
- package/template/release/v1/kc_runtime/doc_parser.py +127 -0
- package/template/release/v1/manifest.json.tmpl +11 -0
- package/template/release/v1/render_dashboard.py +117 -0
- package/template/release/v1/run.py +212 -0
- package/template/release/v1/serve.sh +17 -0
- package/template/skills/en/meta-meta/skill-authoring/SKILL.md +19 -0
- package/template/skills/en/meta-meta/work-decomposition/SKILL.md +266 -0
- package/template/skills/en/skill-creator/SKILL.md +1 -1
- package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +19 -0
- package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +264 -0
- package/template/skills/zh/skill-creator/SKILL.md +1 -1
package/LICENSE
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# PolyForm Noncommercial License 1.0.0
|
|
2
|
+
|
|
3
|
+
<https://polyformproject.org/licenses/noncommercial/1.0.0>
|
|
4
|
+
|
|
5
|
+
## Acceptance
|
|
6
|
+
|
|
7
|
+
In order to get any license under these terms, you must agree to them as both strict obligations and conditions to all your licenses.
|
|
8
|
+
|
|
9
|
+
## Copyright License
|
|
10
|
+
|
|
11
|
+
The licensor grants you a copyright license for the software to do everything you might do with the software that would otherwise infringe the licensor's copyright in it for any permitted purpose. However, you may only distribute the software according to [Distribution License](#distribution-license) and make changes or new works based on the software according to [Changes and New Works License](#changes-and-new-works-license).
|
|
12
|
+
|
|
13
|
+
## Distribution License
|
|
14
|
+
|
|
15
|
+
The licensor grants you an additional copyright license to distribute copies of the software. Your license to distribute covers distributing the software with changes and new works permitted by [Changes and New Works License](#changes-and-new-works-license).
|
|
16
|
+
|
|
17
|
+
## Notices
|
|
18
|
+
|
|
19
|
+
You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms or the URL for them above, as well as copies of any plain-text lines beginning with `Required Notice:` that the licensor provided with the software. For example:
|
|
20
|
+
|
|
21
|
+
> Required Notice: Copyright Yoyodyne, Inc. (http://example.com)
|
|
22
|
+
|
|
23
|
+
## Changes and New Works License
|
|
24
|
+
|
|
25
|
+
The licensor grants you an additional copyright license to make changes and new works based on the software for any permitted purpose.
|
|
26
|
+
|
|
27
|
+
## Patent License
|
|
28
|
+
|
|
29
|
+
The licensor grants you a patent license for the software that covers patent claims the licensor can license, or becomes able to license, that you would infringe by using the software.
|
|
30
|
+
|
|
31
|
+
## Noncommercial Purposes
|
|
32
|
+
|
|
33
|
+
Any noncommercial purpose is a permitted purpose.
|
|
34
|
+
|
|
35
|
+
## Personal Uses
|
|
36
|
+
|
|
37
|
+
Personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, amateur pursuits, or religious observance, without any anticipated commercial application, is use for a permitted purpose.
|
|
38
|
+
|
|
39
|
+
## Noncommercial Organizations
|
|
40
|
+
|
|
41
|
+
Use by any charitable organization, educational institution, public research organization, public safety or health organization, environmental protection organization, or government institution is use for a permitted purpose regardless of the source of funding or obligations resulting from the funding.
|
|
42
|
+
|
|
43
|
+
## Fair Use
|
|
44
|
+
|
|
45
|
+
You may have "fair use" rights for the software under the law. These terms do not limit them.
|
|
46
|
+
|
|
47
|
+
## No Other Rights
|
|
48
|
+
|
|
49
|
+
These terms do not allow you to sublicense or transfer any of your licenses to anyone else, or prevent the licensor from granting licenses to anyone else. These terms do not imply any other licenses.
|
|
50
|
+
|
|
51
|
+
## Patent Defense
|
|
52
|
+
|
|
53
|
+
If you make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company.
|
|
54
|
+
|
|
55
|
+
## Violations
|
|
56
|
+
|
|
57
|
+
The first time you are notified in writing that you have violated any of these terms, or done anything with the software not covered by your licenses, your licenses can nonetheless continue if you come into full compliance with these terms, and take practical steps to correct past violations, within 32 days of receiving notice. Otherwise, all your licenses end immediately.
|
|
58
|
+
|
|
59
|
+
## No Liability
|
|
60
|
+
|
|
61
|
+
***As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.***
|
|
62
|
+
|
|
63
|
+
## Definitions
|
|
64
|
+
|
|
65
|
+
The **licensor** is the individual or entity offering these terms, and the **software** is the software the licensor makes available under these terms.
|
|
66
|
+
|
|
67
|
+
**You** refers to the individual or entity agreeing to these terms.
|
|
68
|
+
|
|
69
|
+
**Your company** is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. **Control** means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect.
|
|
70
|
+
|
|
71
|
+
**Your licenses** are all the licenses granted to you for the software under these terms.
|
|
72
|
+
|
|
73
|
+
**Use** means anything you do with the software requiring one of your licenses.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
Required Notice: Copyright (C) 2024-2026 Memium / kitchen-engineer42 (https://github.com/kitchen-engineer42/kc-cli)
|
|
78
|
+
|
|
79
|
+
For commercial use (including but not limited to enterprise production
|
|
80
|
+
deployment, hosting as a paid service, or distribution as a product),
|
|
81
|
+
see LICENSE-COMMERCIAL.md for how to obtain a commercial license.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# Commercial License — KC Agent CLI (`kc-beta`)
|
|
2
|
+
|
|
3
|
+
KC is dual-licensed:
|
|
4
|
+
|
|
5
|
+
- **Noncommercial use** is governed by [LICENSE](./LICENSE) — the
|
|
6
|
+
PolyForm Noncommercial License 1.0.0. Personal users, students,
|
|
7
|
+
hobbyists, public-research organizations, charities, and government
|
|
8
|
+
institutions may use, modify, and self-host KC for free under those
|
|
9
|
+
terms.
|
|
10
|
+
- **Commercial use** — including any deployment of KC inside a
|
|
11
|
+
for-profit company's production workflow, hosting KC as a paid
|
|
12
|
+
service, or redistributing KC (modified or unmodified) as part of a
|
|
13
|
+
product or service offering — is **not** covered by the
|
|
14
|
+
noncommercial license and requires a separate commercial license
|
|
15
|
+
from the rights-holder.
|
|
16
|
+
|
|
17
|
+
This file describes how to obtain that commercial license.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## What counts as commercial use
|
|
22
|
+
|
|
23
|
+
The PolyForm noncommercial grant covers any **noncommercial purpose**.
|
|
24
|
+
The license body (LICENSE, "Personal Uses" + "Noncommercial
|
|
25
|
+
Organizations" sections) defines noncommercial purposes inclusively:
|
|
26
|
+
research, experiment, testing, personal study, hobby projects,
|
|
27
|
+
charities, educational institutions, public research bodies, public
|
|
28
|
+
safety / health bodies, environmental orgs, and government
|
|
29
|
+
institutions all qualify.
|
|
30
|
+
|
|
31
|
+
Anything outside that envelope is commercial use. Concrete examples
|
|
32
|
+
of activities that **require** a commercial license:
|
|
33
|
+
|
|
34
|
+
- Running KC inside a for-profit company's compliance, legal,
|
|
35
|
+
document-review, or audit workflow that processes business
|
|
36
|
+
documents — even for internal use.
|
|
37
|
+
- Embedding KC (modified or unmodified) into a commercial product or
|
|
38
|
+
SaaS offering.
|
|
39
|
+
- Hosting KC as a paid or freemium service to third parties.
|
|
40
|
+
- Redistributing KC under a different name or as part of another tool
|
|
41
|
+
that is sold or monetized.
|
|
42
|
+
|
|
43
|
+
Concrete examples that **do not** require a commercial license:
|
|
44
|
+
|
|
45
|
+
- A developer evaluating KC at home or on a personal laptop.
|
|
46
|
+
- A graduate student using KC for a thesis on regulatory NLP.
|
|
47
|
+
- A nonprofit law clinic using KC to help unpaid pro-bono casework.
|
|
48
|
+
- A public university research lab benchmarking LLM document
|
|
49
|
+
verification.
|
|
50
|
+
- A government agency using KC for internal regulatory compliance.
|
|
51
|
+
|
|
52
|
+
If you are unsure whether your use case is noncommercial, contact us
|
|
53
|
+
and we will tell you in plain language. Asking is free.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## What "redistribute as a new product" forbids
|
|
58
|
+
|
|
59
|
+
Both the noncommercial license and the commercial license forbid
|
|
60
|
+
publishing KC's source code (or any substantial derivative) under a
|
|
61
|
+
different name as a competing or independent product. The
|
|
62
|
+
noncommercial license restricts derivatives to noncommercial use; the
|
|
63
|
+
commercial license is non-transferable and intended for the licensee,
|
|
64
|
+
not for downstream redistribution as a third-party product.
|
|
65
|
+
|
|
66
|
+
Forks for the licensee's own internal use, with proper attribution
|
|
67
|
+
(`Required Notice:` line preserved per the PolyForm Notices section),
|
|
68
|
+
are acceptable under both licenses. Forks intended to be released as
|
|
69
|
+
a new offering — paid or free — are not.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## How to obtain a commercial license
|
|
74
|
+
|
|
75
|
+
Email **heavysal@gmail.com** with:
|
|
76
|
+
|
|
77
|
+
1. A short description of your intended use case (industry, scale,
|
|
78
|
+
internal vs customer-facing)
|
|
79
|
+
2. Your organization name and country
|
|
80
|
+
3. Approximate document volume / number of users (so we can scope
|
|
81
|
+
pricing)
|
|
82
|
+
4. Whether you need any specific terms (indemnification, source-code
|
|
83
|
+
escrow, on-prem only, etc.)
|
|
84
|
+
|
|
85
|
+
We will respond within 5 business days with either:
|
|
86
|
+
|
|
87
|
+
- A standard commercial license offer (non-negotiable terms, fixed
|
|
88
|
+
per-seat or per-document pricing — fastest path), or
|
|
89
|
+
- A scoped negotiation for unusual cases (large enterprise, regulated
|
|
90
|
+
industry, special compliance requirements)
|
|
91
|
+
|
|
92
|
+
Pricing for v0.7.x onward is set per inquiry. Once we have surveyed
|
|
93
|
+
enough commercial licensees to set transparent rates, we will publish
|
|
94
|
+
a pricing sheet here.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Compliance for licensees
|
|
99
|
+
|
|
100
|
+
A commercial license, once granted, covers the specific entity named
|
|
101
|
+
in the agreement and its wholly-controlled affiliates (per PolyForm's
|
|
102
|
+
"Your company" definition). It does **not** automatically extend to
|
|
103
|
+
parent companies, subsidiaries with different control, or
|
|
104
|
+
post-acquisition successors — those need their own license or a
|
|
105
|
+
written extension.
|
|
106
|
+
|
|
107
|
+
The `Required Notice: Copyright ...` line in LICENSE must remain
|
|
108
|
+
present in any copies of KC the licensee distributes internally.
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Reporting violations
|
|
113
|
+
|
|
114
|
+
If you believe KC is being used in violation of either license,
|
|
115
|
+
please email **heavysal@gmail.com** with the relevant details. The
|
|
116
|
+
PolyForm `Violations` section gives violators a 32-day correction
|
|
117
|
+
window from the date of written notice; we follow that process for
|
|
118
|
+
both license tracks.
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
*Last updated: 2026-04-29. KC v0.7.0+ is licensed under PolyForm
|
|
123
|
+
Noncommercial 1.0.0 (this dual-license model). KC v0.6.x and earlier
|
|
124
|
+
were licensed under MIT and remain under MIT for those release
|
|
125
|
+
versions.*
|
package/README.md
CHANGED
|
@@ -245,9 +245,27 @@ Bug reports and PRs welcome at <https://github.com/kitchen-engineer42/kc-cli>.
|
|
|
245
245
|
|
|
246
246
|
## License
|
|
247
247
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
248
|
+
KC v0.7.0+ is **dual-licensed** under [PolyForm Noncommercial 1.0.0](./LICENSE)
|
|
249
|
+
plus a separate commercial license available on request.
|
|
250
|
+
|
|
251
|
+
- **Personal users, students, hobbyists, public-research orgs,
|
|
252
|
+
charities, and government institutions** — use, modify, and self-host
|
|
253
|
+
KC for free under [LICENSE](./LICENSE) (PolyForm Noncommercial 1.0.0).
|
|
254
|
+
- **Enterprises in production** (for-profit company workflows, hosting
|
|
255
|
+
KC as a paid service, distributing KC inside a commercial product) —
|
|
256
|
+
require a commercial license. See [LICENSE-COMMERCIAL.md](./LICENSE-COMMERCIAL.md)
|
|
257
|
+
for terms and how to contact us.
|
|
258
|
+
- **Redistribution as a competing or independent product** — forbidden
|
|
259
|
+
under both license tracks. Internal forks for licensee use are fine
|
|
260
|
+
with the `Required Notice:` preserved; releasing KC under another
|
|
261
|
+
name as a new offering is not.
|
|
262
|
+
|
|
263
|
+
KC v0.6.x and earlier remain under MIT for those release versions
|
|
264
|
+
(licenses can't be retroactively changed); the v0.7.0 cutover applies
|
|
265
|
+
to all subsequent commits and releases.
|
|
266
|
+
|
|
267
|
+
Bundled meta-skills under `template/skills/` follow the same dual
|
|
268
|
+
license as KC itself.
|
|
251
269
|
|
|
252
270
|
---
|
|
253
271
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "kc-beta",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "KC Agent — LLM document verification agent (pure Node.js CLI)",
|
|
3
|
+
"version": "0.7.0",
|
|
4
|
+
"description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
7
7
|
"kc-beta": "bin/kc-beta.js"
|
|
@@ -9,10 +9,17 @@
|
|
|
9
9
|
"files": [
|
|
10
10
|
"bin/",
|
|
11
11
|
"src/",
|
|
12
|
+
"!src/cli/meme.source.js",
|
|
12
13
|
"template/",
|
|
13
14
|
"README.md",
|
|
14
|
-
"QUICKSTART.md"
|
|
15
|
+
"QUICKSTART.md",
|
|
16
|
+
"LICENSE",
|
|
17
|
+
"LICENSE-COMMERCIAL.md"
|
|
15
18
|
],
|
|
19
|
+
"scripts": {
|
|
20
|
+
"build:meme": "node scripts/build-meme.js",
|
|
21
|
+
"prepublishOnly": "node scripts/build-meme.js"
|
|
22
|
+
},
|
|
16
23
|
"homepage": "https://github.com/kitchen-engineer42/kc-cli",
|
|
17
24
|
"repository": {
|
|
18
25
|
"type": "git",
|
|
@@ -28,8 +35,10 @@
|
|
|
28
35
|
"ink": "^6.0.0",
|
|
29
36
|
"ink-text-input": "^6.0.0",
|
|
30
37
|
"ink-spinner": "^5.0.0",
|
|
38
|
+
"mammoth": "^1.6.0",
|
|
31
39
|
"react": "^19.0.0",
|
|
32
|
-
"pdfjs-dist": "^4.0.0"
|
|
40
|
+
"pdfjs-dist": "^4.0.0",
|
|
41
|
+
"word-extractor": "^1.0.4"
|
|
33
42
|
},
|
|
34
43
|
"keywords": [
|
|
35
44
|
"document-verification",
|
|
@@ -38,5 +47,5 @@
|
|
|
38
47
|
"cli"
|
|
39
48
|
],
|
|
40
49
|
"author": "kitchen-engineer42",
|
|
41
|
-
"license": "
|
|
50
|
+
"license": "SEE LICENSE IN LICENSE"
|
|
42
51
|
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
|
|
2
|
+
import { findSafeSplitPoint } from "./message-utils.js";
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Automatic context windowing for long conversations.
|
|
@@ -38,18 +39,14 @@ export class ContextWindow {
|
|
|
38
39
|
return { messages, wasWindowed: false, removedCount: 0 };
|
|
39
40
|
}
|
|
40
41
|
|
|
41
|
-
// Split into older and recent.
|
|
42
|
-
//
|
|
43
|
-
//
|
|
44
|
-
//
|
|
45
|
-
//
|
|
46
|
-
//
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
let splitPoint = Math.max(0, messages.length - this.recentWindowSize);
|
|
50
|
-
while (splitPoint < messages.length && messages[splitPoint]?.role === "tool") {
|
|
51
|
-
splitPoint++;
|
|
52
|
-
}
|
|
42
|
+
// Split into older and recent. v0.6.3.1: tool-pair atomicity is a
|
|
43
|
+
// bidirectional invariant — recent[0] must not be a `tool` (orphan,
|
|
44
|
+
// its assistant_with_tool_calls got summarized away) AND older[-1]
|
|
45
|
+
// must not be `assistant_with_tool_calls` (its tool results sit at
|
|
46
|
+
// the start of recent and the older summary corrupts that pairing).
|
|
47
|
+
// Use the shared `findSafeSplitPoint` helper from message-utils.js.
|
|
48
|
+
const desiredSplit = Math.max(0, messages.length - this.recentWindowSize);
|
|
49
|
+
const splitPoint = findSafeSplitPoint(messages, desiredSplit);
|
|
53
50
|
const recentMessages = messages.slice(splitPoint);
|
|
54
51
|
const olderMessages = messages.slice(0, splitPoint);
|
|
55
52
|
|
package/src/agent/context.js
CHANGED
|
@@ -149,12 +149,25 @@ export class ContextAssembler {
|
|
|
149
149
|
* @param {string} [opts.pipelineState]
|
|
150
150
|
* @param {string} [opts.workspaceState]
|
|
151
151
|
* @param {string} [opts.skillIndex] - Brief index of available meta skills
|
|
152
|
+
* @param {string} [opts.projectMemory] - v0.7.0 B3: rules/PATTERNS.md
|
|
153
|
+
* content. Capped at ~5 KB by the caller. Surfaced for phases the
|
|
154
|
+
* work-decomposition skill operates in (skill_authoring + skill_testing).
|
|
152
155
|
* @returns {string}
|
|
153
156
|
*/
|
|
154
|
-
build({ agentMd, pipelineState, workspaceState, skillIndex } = {}) {
|
|
157
|
+
build({ agentMd, pipelineState, workspaceState, skillIndex, projectMemory } = {}) {
|
|
155
158
|
const parts = [AGENT_IDENTITY];
|
|
156
159
|
if (agentMd) parts.push(agentMd);
|
|
157
160
|
if (skillIndex) parts.push(skillIndex);
|
|
161
|
+
if (projectMemory) {
|
|
162
|
+
parts.push(
|
|
163
|
+
"## Project memory (rules/PATTERNS.md)\n\n" +
|
|
164
|
+
"Patterns + decisions you've accumulated this session. Treat as " +
|
|
165
|
+
"your prior decisions on this corpus — apply them; update the file " +
|
|
166
|
+
"when you discover something better. The work-decomposition skill " +
|
|
167
|
+
"covers what to write here vs. NOT to write.\n\n" +
|
|
168
|
+
"```markdown\n" + projectMemory.trim() + "\n```",
|
|
169
|
+
);
|
|
170
|
+
}
|
|
158
171
|
if (pipelineState) parts.push(pipelineState);
|
|
159
172
|
if (workspaceState) parts.push(workspaceState);
|
|
160
173
|
return parts.join("\n\n");
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* v0.7.0 G (#91): native document parser dispatcher.
|
|
3
|
+
*
|
|
4
|
+
* Centralizes the "given a file path, give me text" operation across
|
|
5
|
+
* formats KC handles. Strategy stack:
|
|
6
|
+
*
|
|
7
|
+
* .pdf → pdfjs-dist (already a hard dep)
|
|
8
|
+
* .docx → mammoth (npm dep, dynamic-imported)
|
|
9
|
+
* .doc → word-extractor (npm dep, dynamic-imported)
|
|
10
|
+
* .txt / .md → fs.readFileSync UTF-8 (with GBK fallback for CJK)
|
|
11
|
+
* anything → plaintext-utf8 best-effort, then LibreOffice fallback
|
|
12
|
+
*
|
|
13
|
+
* `mammoth` and `word-extractor` are dynamic-imported so the module
|
|
14
|
+
* degrades gracefully when they're not installed: missing dep → fall
|
|
15
|
+
* through to plaintext / LibreOffice. Lets KC ship without forcing
|
|
16
|
+
* users to run `npm install` post-upgrade if they don't touch
|
|
17
|
+
* DOCX/DOC content.
|
|
18
|
+
*
|
|
19
|
+
* The standalone PDF tool `tools/document-parse.js` (which has its
|
|
20
|
+
* own VLM/OCR escalation logic for image-PDFs) keeps its richer
|
|
21
|
+
* pipeline; this module is for the lower-friction "just give me
|
|
22
|
+
* text" path that document-chunk uses.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import fs from "node:fs";
|
|
26
|
+
import path from "node:path";
|
|
27
|
+
import { spawnSync } from "node:child_process";
|
|
28
|
+
|
|
29
|
+
/**
 * Extract plain text from a document, choosing a parser by file extension.
 *
 * Strategy, in order:
 *   1. The format-specific parser for `.pdf`, `.docx` or `.doc`, or the
 *      plaintext reader for `.txt` / `.md` / `.csv` / `.json`.
 *   2. A generic plaintext read of the raw bytes.
 *   3. The LibreOffice CLI.
 *
 * The plaintext reader returns a string for any file it can read, binary
 * files included, so on its own step 2 would always shadow step 3. When the
 * plaintext result looks binary (it contains NUL or U+FFFD characters),
 * LibreOffice is tried first and the raw text is returned only if that
 * conversion fails.
 *
 * @param {string} filePath - Path of the document to read.
 * @returns {Promise<{text: string, via: string, ok: boolean, error?: string}>}
 *   `via` names the strategy that produced `text`; `ok` is false (with
 *   `error` set) only when no strategy produced anything.
 */
export async function extractText(filePath) {
  const suffix = path.extname(filePath).toLowerCase();

  if (suffix === ".pdf") {
    const text = await _tryPdfjs(filePath);
    if (text !== null) return { text, via: "pdfjs", ok: true };
  }

  if (suffix === ".docx") {
    const text = await _tryMammoth(filePath);
    if (text !== null) return { text, via: "mammoth", ok: true };
  }

  if (suffix === ".doc") {
    const text = await _tryWordExtractor(filePath);
    if (text !== null) return { text, via: "word-extractor", ok: true };
  }

  if (suffix === ".txt" || suffix === ".md" || suffix === ".csv" || suffix === ".json") {
    const text = _tryPlaintext(filePath);
    if (text !== null) return { text, via: "plaintext", ok: true };
  }

  // Generic fallbacks for anything we couldn't parse natively (or where
  // the native lib isn't installed).
  const plain = _tryPlaintext(filePath);
  const looksBinary =
    plain !== null && (plain.includes("\u0000") || plain.includes("\uFFFD"));
  if (plain !== null && !looksBinary) {
    return { text: plain, via: "plaintext_fallback", ok: true };
  }

  // Either the file could not be read as text at all, or what came back is
  // almost certainly raw binary: give LibreOffice a chance to convert it.
  const lo = _tryLibreOffice(filePath);
  if (lo !== null) return { text: lo, via: "libreoffice_fallback", ok: true };

  // LibreOffice is missing or failed. Hand back the raw decode, as earlier
  // versions did, so callers still receive whatever text could be read.
  if (plain !== null) return { text: plain, via: "plaintext_fallback", ok: true };

  return {
    text: "",
    via: "none",
    ok: false,
    error: `no parser available for ${suffix || "(no extension)"}`,
  };
}
|
|
71
|
+
|
|
72
|
+
// --- internals ---
|
|
73
|
+
|
|
74
|
+
/**
 * Read a PDF's text content with pdfjs-dist.
 *
 * Pages are visited in order. The strings of each page's text items are
 * joined with single spaces, and the pages are joined with newlines.
 *
 * @param {string} filePath - Path of the PDF file.
 * @returns {Promise<string|null>} The extracted text, or null when anything
 *   in the process throws (import, file read, or parsing).
 */
async function _tryPdfjs(filePath) {
  try {
    const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
    const bytes = new Uint8Array(fs.readFileSync(filePath));
    const loadingTask = pdfjs.getDocument({ data: bytes, useSystemFonts: true });
    const pdf = await loadingTask.promise;

    const pageTexts = [];
    let pageNo = 1;
    while (pageNo <= pdf.numPages) {
      const page = await pdf.getPage(pageNo);
      const { items } = await page.getTextContent();
      // Items without a `str` contribute an empty string.
      pageTexts.push(items.map((item) => item.str || "").join(" "));
      pageNo += 1;
    }
    return pageTexts.join("\n");
  } catch {
    return null;
  }
}
|
|
90
|
+
|
|
91
|
+
/**
 * Read the raw text of a .docx file with mammoth.
 *
 * mammoth is imported dynamically, so a failed import is caught here
 * instead of breaking module load.
 *
 * @param {string} filePath - Path of the .docx file.
 * @returns {Promise<string|null>} The extracted text ("" when mammoth
 *   reports no value), or null when the import or the extraction throws.
 */
async function _tryMammoth(filePath) {
  try {
    const lib = await import("mammoth");
    const { value } = await lib.extractRawText({ path: filePath });
    return value ? value : "";
  } catch {
    // mammoth not installed OR file unreadable
    return null;
  }
}
|
|
100
|
+
|
|
101
|
+
/**
 * Read the body text of a legacy .doc file with word-extractor.
 *
 * word-extractor is imported dynamically, so a failed import is caught
 * here instead of breaking module load.
 *
 * @param {string} filePath - Path of the .doc file.
 * @returns {Promise<string|null>} The document body ("" when the body is
 *   empty), or null when the import or the extraction throws.
 */
async function _tryWordExtractor(filePath) {
  try {
    const mod = await import("word-extractor");
    const WordExtractor = mod.default;
    const parsed = await new WordExtractor().extract(filePath);
    const body = parsed.getBody();
    return body ? body : "";
  } catch {
    return null;
  }
}
|
|
111
|
+
|
|
112
|
+
/**
 * Read a file as text, preferring UTF-8 and falling back to GBK.
 *
 * The bytes are checked with a fatal UTF-8 decoder. Merely looking for
 * U+FFFD in the decoded string would misclassify a valid UTF-8 file that
 * contains a literal replacement character and send it to the GBK decoder,
 * which can return mojibake. The U+FFFD heuristic is kept only for runtimes
 * where a fatal decoder cannot be constructed.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string|null} The decoded text, or null when the file cannot be
 *   read. If the bytes are neither valid UTF-8 nor cleanly decodable as GBK,
 *   the lossy UTF-8 decode (with replacement characters) is returned and the
 *   caller decides whether to use it.
 */
function _tryPlaintext(filePath) {
  try {
    const buf = fs.readFileSync(filePath);
    const utf8 = buf.toString("utf-8");

    let strictDecoder = null;
    try {
      strictDecoder = new TextDecoder("utf-8", { fatal: true });
    } catch { /* fatal decoding not supported on this Node build */ }

    let isUtf8;
    if (strictDecoder !== null) {
      try {
        strictDecoder.decode(buf);
        isUtf8 = true;
      } catch {
        isUtf8 = false; // malformed byte sequence
      }
    } else {
      isUtf8 = !utf8.includes("\uFFFD");
    }
    if (isUtf8) return utf8;

    // GBK fallback (only commonly relevant on Chinese corpora)
    try {
      // Node has TextDecoder("gbk") via ICU on most builds
      const gbk = new TextDecoder("gbk", { fatal: false }).decode(buf);
      if (gbk && !gbk.includes("\uFFFD")) return gbk;
    } catch { /* GBK not supported on this Node build */ }

    // Last resort: return UTF-8 with replacement characters; caller
    // can decide whether to use it.
    return utf8;
  } catch {
    return null;
  }
}
|
|
132
|
+
|
|
133
|
+
/**
 * Convert a document to text with the soffice/libreoffice CLI.
 *
 * Best-effort: returns null on any failure so the caller can fall back to
 * "no parser available". The conversion writes `<stem>.txt` into a
 * `.kc-lo-out` directory next to the input file. The output file is removed
 * after it has been read, and the directory is removed again when this call
 * created it, so nothing is left behind beside the user's documents.
 *
 * @param {string} filePath - Path of the document to convert.
 * @returns {string|null} The converted text, or null when LibreOffice is not
 *   found, the conversion exits non-zero, no output file appears, or any
 *   step throws.
 */
function _tryLibreOffice(filePath) {
  const lo = _findLibreOffice();
  if (!lo) return null;

  const outDir = path.join(path.dirname(filePath), ".kc-lo-out");
  let createdDir = false;
  try {
    // With `recursive: true`, mkdirSync returns undefined when the directory
    // already existed, so we only remove what we created.
    createdDir = fs.mkdirSync(outDir, { recursive: true }) !== undefined;

    const r = spawnSync(
      lo,
      ["--headless", "--convert-to", "txt", "--outdir", outDir, filePath],
      { timeout: 60_000 },
    );
    if (r.status !== 0) return null;

    const stem = path.basename(filePath, path.extname(filePath));
    const out = path.join(outDir, stem + ".txt");
    if (!fs.existsSync(out)) return null;

    try {
      return fs.readFileSync(out, "utf-8");
    } finally {
      // Best-effort cleanup of the conversion output, even if the read threw.
      try { fs.unlinkSync(out); } catch { /* ignore */ }
    }
  } catch {
    return null;
  } finally {
    if (createdDir) {
      // rmdirSync refuses to remove a non-empty directory, so files we did
      // not write are left alone.
      try { fs.rmdirSync(outDir); } catch { /* ignore */ }
    }
  }
}
|
|
158
|
+
|
|
159
|
+
/**
 * Find a LibreOffice executable by probing candidate command names.
 *
 * Each candidate is run synchronously with `--version` and a 5-second
 * timeout; the first one that exits with status 0 wins.
 *
 * @returns {string|null} `"soffice"` or `"libreoffice"`, whichever responds
 *   first in that order, or null when neither does.
 */
function _findLibreOffice() {
  const responds = (cmd) => {
    try {
      return spawnSync(cmd, ["--version"], { timeout: 5_000 }).status === 0;
    } catch {
      // not on PATH
      return false;
    }
  };

  const found = ["soffice", "libreoffice"].find((cmd) => responds(cmd));
  return found === undefined ? null : found;
}
|