llm-kb 0.2.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +322 -60
- package/bin/anthropic-5TIU2EED.js +5515 -0
- package/bin/azure-openai-responses-ZVUVMK3G.js +190 -0
- package/bin/chunk-2WV6TQRI.js +4792 -0
- package/bin/chunk-3YMNGUZZ.js +262 -0
- package/bin/chunk-5PYKQQLA.js +14295 -0
- package/bin/chunk-65KFH7OI.js +31 -0
- package/bin/chunk-DHOXVEIR.js +7261 -0
- package/bin/chunk-EAQYK3U2.js +41 -0
- package/bin/chunk-IFS3OKBN.js +428 -0
- package/bin/chunk-LDHOKBJA.js +86 -0
- package/bin/chunk-SLYBG6ZQ.js +32681 -0
- package/bin/chunk-UEODFF7H.js +17 -0
- package/bin/chunk-XCXTZJGO.js +174 -0
- package/bin/chunk-XFV534WU.js +7056 -0
- package/bin/cli.js +5496 -163
- package/bin/dist-3YH7P2QF.js +1244 -0
- package/bin/google-JFC43EFJ.js +371 -0
- package/bin/google-gemini-cli-K4XNMYDI.js +712 -0
- package/bin/google-vertex-Y42F254G.js +414 -0
- package/bin/indexer-KSYRIVVN.js +10 -0
- package/bin/mistral-ZU2JS5XZ.js +38406 -0
- package/bin/multipart-parser-CO464TZY.js +371 -0
- package/bin/openai-codex-responses-NW2LELBH.js +712 -0
- package/bin/openai-completions-TW3VKTHO.js +662 -0
- package/bin/openai-responses-VGL522MK.js +198 -0
- package/bin/src-Y22OHE3S.js +1408 -0
- package/package.json +16 -6
- package/PHASE2_SPEC.md +0 -274
- package/SPEC.md +0 -275
- package/bin/chunk-MYQ36JJB.js +0 -118
- package/bin/indexer-LSYSZXZX.js +0 -6
- package/plan.md +0 -55
- package/src/cli.ts +0 -132
- package/src/indexer.ts +0 -148
- package/src/pdf.ts +0 -119
- package/src/query.ts +0 -132
- package/src/resolve-kb.ts +0 -19
- package/src/scan.ts +0 -59
- package/src/watcher.ts +0 -84
- package/tsconfig.json +0 -14
package/bin/chunk-MYQ36JJB.js
DELETED
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
// src/indexer.ts
|
|
2
|
-
import {
|
|
3
|
-
createAgentSession,
|
|
4
|
-
createBashTool,
|
|
5
|
-
createReadTool,
|
|
6
|
-
createWriteTool,
|
|
7
|
-
DefaultResourceLoader,
|
|
8
|
-
SessionManager,
|
|
9
|
-
SettingsManager
|
|
10
|
-
} from "@mariozechner/pi-coding-agent";
|
|
11
|
-
import { readdir, readFile } from "fs/promises";
|
|
12
|
-
import { join, dirname } from "path";
|
|
13
|
-
import { fileURLToPath } from "url";
|
|
14
|
-
var __filename = fileURLToPath(import.meta.url);
|
|
15
|
-
var __dirname = dirname(__filename);
|
|
16
|
-
/**
 * Resolve the node_modules directory that buildIndex() publishes through NODE_PATH.
 *
 * NOTE(review): joining path segments never throws, so the first candidate
 * (`<__dirname>/node_modules`) is always returned. The catch branch, the walk
 * up the directory tree and the process.cwd() fallback are never reached.
 * This bundled copy keeps that behavior as-is; an existence check presumably
 * belongs in src/indexer.ts, which this chunk is built from.
 */
function getNodeModulesPath() {
  let current = __dirname;
  let attemptsLeft = 5;
  while (attemptsLeft > 0) {
    const candidate = join(current, "node_modules");
    try {
      return candidate;
    } catch {
      current = dirname(current);
    }
    attemptsLeft -= 1;
  }
  return join(process.cwd(), "node_modules");
}
|
|
28
|
-
/**
 * Build the AGENTS.md text handed to the indexing agent.
 *
 * Only entries of `files` ending in ".md" are listed under "Available parsed
 * sources"; everything else in the document is fixed text.
 *
 * @param sourcesDir Accepted for signature compatibility; not read here.
 * @param files File names found in the sources directory.
 * @returns The complete AGENTS.md content, ending with a newline.
 */
function buildAgentsContent(sourcesDir, files) {
  const listed = [];
  for (const file of files) {
    if (file.endsWith(".md")) {
      listed.push(` - ${file}`);
    }
  }
  const sourceList = listed.join("\n");

  // One array entry per output line; the trailing "" yields the final newline.
  const lines = [
    "# llm-kb Knowledge Base",
    "",
    "## How to access documents",
    "",
    "### PDFs (pre-parsed)",
    "PDFs have been parsed to markdown with bounding boxes.",
    "Read the markdown versions in `.llm-kb/wiki/sources/` instead of the raw PDFs.",
    "",
    "Available parsed sources:",
    sourceList,
    "",
    "### Other file types (Excel, Word, PowerPoint, CSV, images)",
    "You have bash and read tools. These libraries are pre-installed and available:",
    "- **exceljs** \u2014 for .xlsx/.xls files",
    "- **mammoth** \u2014 for .docx files",
    "- **officeparser** \u2014 for .pptx files",
    "- **csv-parse** \u2014 built into Node.js, use fs + split for .csv",
    "",
    "Write a quick Node.js script to extract content when needed.",
    "",
    "## Index file",
    "Write the index to `.llm-kb/wiki/index.md`.",
    "",
    "The index should be a markdown file with:",
    "1. A title and last-updated timestamp",
    "2. A summary table with columns: Source, Type, Pages/Size, Summary, Key Topics",
    "3. Each source gets a one-line summary (read the first ~500 chars of each file to generate it)",
    "4. Total word count across all sources",
    "",
  ];
  return lines.join("\n");
}
|
|
60
|
-
/**
 * Run an agent session that summarises the parsed sources into
 * `.llm-kb/wiki/index.md` and return the generated file's content.
 *
 * @param folder Knowledge-base root; used as the agent's working directory.
 * @param sourcesDir Directory holding the parsed sources; the index is read
 *   back from its parent directory.
 * @param onOutput Optional callback that receives streamed assistant text deltas.
 * @returns The content of the generated index.md.
 * @throws Error when no `.md` source exists or the agent produced no index.md.
 */
async function buildIndex(folder, sourcesDir, onOutput) {
  const files = await readdir(sourcesDir);
  if (!files.some((f) => f.endsWith(".md"))) {
    throw new Error("No source files found to index");
  }

  const agentsContent = buildAgentsContent(sourcesDir, files);

  // Exposed through the environment before the agent session is created.
  process.env.NODE_PATH = getNodeModulesPath();

  // Append the generated AGENTS.md to whatever agent files the loader found.
  const loader = new DefaultResourceLoader({
    cwd: folder,
    agentsFilesOverride: (current) => ({
      agentsFiles: [
        ...current.agentsFiles,
        { path: ".llm-kb/AGENTS.md", content: agentsContent }
      ]
    })
  });
  await loader.reload();

  const { session } = await createAgentSession({
    cwd: folder,
    resourceLoader: loader,
    tools: [
      createReadTool(folder),
      createBashTool(folder),
      createWriteTool(folder)
    ],
    sessionManager: SessionManager.inMemory(),
    settingsManager: SettingsManager.inMemory({
      compaction: { enabled: false }
    })
  });

  // try/finally guarantees the session is disposed even when the prompt (or
  // the subscription) throws; previously it leaked on those paths.
  try {
    if (onOutput) {
      session.subscribe((event) => {
        if (event.type === "message_update" && event.assistantMessageEvent.type === "text_delta") {
          onOutput(event.assistantMessageEvent.delta);
        }
      });
    }

    const prompt = `Read each file in .llm-kb/wiki/sources/ (one at a time, just the first 500 characters of each).
Then write .llm-kb/wiki/index.md with a summary table of all sources.

Include: Source filename, Type (PDF/Excel/Word/etc), Pages (from the JSON if available), a one-line summary, and key topics.
Add a total word count estimate at the bottom.`;
    await session.prompt(prompt);

    const indexPath = join(sourcesDir, "..", "index.md");
    try {
      return await readFile(indexPath, "utf-8");
    } catch {
      // Any read failure is reported as the agent not having written the file.
      throw new Error("Agent did not create index.md");
    }
  } finally {
    session.dispose();
  }
}
|
|
115
|
-
|
|
116
|
-
export {
|
|
117
|
-
buildIndex
|
|
118
|
-
};
|
package/bin/indexer-LSYSZXZX.js
DELETED
package/plan.md
DELETED
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
# llm-kb — Phase 1 Build Plan
|
|
2
|
-
|
|
3
|
-
> Emergent design. Each slice is a thin vertical slice that works end-to-end, is demoable, and informs the next step. Decisions are made at the last responsible moment.
|
|
4
|
-
|
|
5
|
-
## Key Learnings
|
|
6
|
-
- **PDF is the only adapter we build.** Everything else (Excel, Word, PPT, CSV, images) handled dynamically by Pi SDK agent at query time.
|
|
7
|
-
- **`@llamaindex/liteparse`** proven (from parser-study). Extracts text + bounding boxes locally.
|
|
8
|
-
- **Two-output pattern**: `.md` (spatial text) + `.json` (bounding boxes for citations).
|
|
9
|
-
- **OCR off by default.** Most PDFs have native text. Enable via `OCR_SERVER_URL` or `OCR_ENABLED=true`.
|
|
10
|
-
- **Pi SDK `createAgentSession()`** with defaults — no auth/model config needed. Uses Pi's existing auth.
|
|
11
|
-
- **AGENTS.md injected via `agentsFilesOverride`** — user's folder stays clean.
|
|
12
|
-
- **NODE_PATH** set so agent's bash scripts can use bundled libraries (exceljs, mammoth, officeparser).
|
|
13
|
-
- **Config file skipped** — nothing reads it yet. Add when Phase 2/3 needs it.
|
|
14
|
-
|
|
15
|
-
---
|
|
16
|
-
|
|
17
|
-
## Slice 1: "Hello World" CLI ✅
|
|
18
|
-
Commander CLI with `run <folder>`. Scans folder, lists files by extension.
|
|
19
|
-
|
|
20
|
-
## Slice 2: PDF → markdown + bounding boxes ✅
|
|
21
|
-
LiteParse parses PDFs → `.md` + `.json` in `.llm-kb/wiki/sources/`. Tested on 9 real PDFs (1000+ pages).
|
|
22
|
-
|
|
23
|
-
## Slice 3: Scanned PDF handling (OCR) ✅
|
|
24
|
-
LiteParse has Tesseract.js built-in. `ocrEnabled` + `ocrServerUrl` config. OCR off by default. Azure OCR bridge tested on 16 legal PDFs (3000+ pages).
|
|
25
|
-
|
|
26
|
-
## Slice 4: Progress + error handling ✅
|
|
27
|
-
Inline progress. Stderr suppression. Corrupt files are skipped with a warning. An mtime check makes re-runs instant.
|
|
28
|
-
|
|
29
|
-
## Slice 5: Indexer (Pi SDK) ✅
|
|
30
|
-
`createAgentSession` with cwd = user's folder. AGENTS.md injected. Agent reads sources, writes `index.md` with summary table.
|
|
31
|
-
|
|
32
|
-
## Slice 6: File watcher ✅
|
|
33
|
-
chokidar watches folder. New/changed PDFs → parse → re-index. 2s debounce for batch drops.
|
|
34
|
-
|
|
35
|
-
## Slice 7: Config + polish → Skipped
|
|
36
|
-
Config file has no readers yet. Deferred to Phase 2/3. README updated instead.
|
|
37
|
-
|
|
38
|
-
---
|
|
39
|
-
|
|
40
|
-
## Phase 1 Complete ✅
|
|
41
|
-
|
|
42
|
-
**What ships:**
|
|
43
|
-
- `llm-kb run ./folder` — scan, parse PDFs, build index, watch for new files
|
|
44
|
-
- Pre-bundled libraries for agent to handle Excel, Word, PowerPoint at query time
|
|
45
|
-
- OCR via env var (local Tesseract or remote Azure bridge)
|
|
46
|
-
- Auth via Pi SDK (zero config)
|
|
47
|
-
|
|
48
|
-
**Phase 2 complete ✅:**
|
|
49
|
-
- `llm-kb query "question"` — auto-detects KB, streams cited answers
|
|
50
|
-
- `--save` flag — research mode, saves to `outputs/`, re-indexes
|
|
51
|
-
- Query mode is read-only (read tool only). Research mode adds bash + write.
|
|
52
|
-
|
|
53
|
-
**Deferred to Phase 4:**
|
|
54
|
-
- Trace logging (JSON per query: question, filesRead, citations, tokens, duration)
|
|
55
|
-
- Traces are needed for eval, but there is no eval system yet to consume them
|
package/src/cli.ts
DELETED
|
@@ -1,132 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
import { Command } from "commander";
|
|
4
|
-
import { scan, summarize } from "./scan.js";
|
|
5
|
-
import { parsePDF } from "./pdf.js";
|
|
6
|
-
import { buildIndex } from "./indexer.js";
|
|
7
|
-
import { startWatcher } from "./watcher.js";
|
|
8
|
-
import { query } from "./query.js";
|
|
9
|
-
import { resolveKnowledgeBase } from "./resolve-kb.js";
|
|
10
|
-
import { existsSync } from "node:fs";
|
|
11
|
-
import { mkdir } from "node:fs/promises";
|
|
12
|
-
import { resolve, join } from "node:path";
|
|
13
|
-
import chalk from "chalk";
|
|
14
|
-
|
|
15
|
-
// Root command: name, one-line description and the version string.
const program = new Command()
  .name("llm-kb")
  .description("Drop files into a folder. Get a knowledge base you can query.")
  .version("0.2.0");
|
|
21
|
-
|
|
22
|
-
// `llm-kb run <folder>`: scan the folder, parse every PDF into
// .llm-kb/wiki/sources, build the index, then start the file watcher.
program
  .command("run")
  .description("Scan, parse, index, and watch a folder")
  .argument("<folder>", "Path to your documents folder")
  .action(async (folder: string) => {
    // Thrown values are not guaranteed to be Error instances; narrow before
    // reading `.message` so a non-Error throw never prints "undefined".
    const describeError = (err: unknown): string =>
      err instanceof Error ? err.message : String(err);

    console.log(`\n${chalk.bold("llm-kb")} v0.2.0\n`);

    if (!existsSync(folder)) {
      console.error(chalk.red(`Error: Folder not found: ${folder}`));
      process.exit(1);
    }

    console.log(`Scanning ${folder}...`);
    const files = await scan(folder);

    if (files.length === 0) {
      console.log(chalk.yellow(" No supported files found."));
      return;
    }

    const pdfs = files.filter((f) => f.ext === ".pdf");
    console.log(` Found ${chalk.bold(files.length.toString())} files (${summarize(files)})`);
    // Without PDFs the command stops here: nothing is parsed, indexed or watched.
    if (pdfs.length === 0) return;

    // Set up .llm-kb folder structure
    const root = resolve(folder);
    const sourcesDir = join(root, ".llm-kb", "wiki", "sources");
    await mkdir(sourcesDir, { recursive: true });

    // Parse PDFs with inline progress
    let parsed = 0;
    let skipped = 0;
    let failed = 0;
    const errors: { name: string; message: string }[] = [];

    for (const [index, pdf] of pdfs.entries()) {
      const fullPath = join(root, pdf.path);

      // Inline progress: "\r" rewinds so each file overwrites the same line.
      const progress = ` Parsing... ${index + 1}/${pdfs.length} — ${pdf.name}`;
      process.stdout.write(`\r${progress.padEnd(80)}`);

      try {
        const result = await parsePDF(fullPath, sourcesDir);
        if (result.skipped) {
          skipped++;
        } else {
          parsed++;
        }
      } catch (err: unknown) {
        // A failing file is recorded and reported later; the loop continues.
        failed++;
        errors.push({ name: pdf.name, message: describeError(err) });
      }
    }

    // Clear progress line
    process.stdout.write(`\r${"".padEnd(80)}\r`);

    // Summary
    const parts: string[] = [];
    if (parsed > 0) parts.push(chalk.green(`${parsed} parsed`));
    if (skipped > 0) parts.push(chalk.dim(`${skipped} skipped (up to date)`));
    if (failed > 0) parts.push(chalk.red(`${failed} failed`));
    console.log(` ${parts.join(", ")}`);

    // Show errors
    for (const failure of errors) {
      console.log(chalk.red(` ✗ ${failure.name} — ${failure.message}`));
    }

    // Build index; a failure is reported but does not stop the watcher below.
    console.log(`\n Building index...`);
    try {
      await buildIndex(root, sourcesDir);
      console.log(chalk.green(` Index built: .llm-kb/wiki/index.md`));
    } catch (err: unknown) {
      console.error(chalk.red(` Index failed: ${describeError(err)}`));
    }

    console.log(`\n ${chalk.dim("Output:")} ${sourcesDir}`);

    // Start watching for new files
    console.log(chalk.dim(`\n Watching for new files... (Ctrl+C to stop)`));
    startWatcher({ folder: root, sourcesDir });
  });
|
|
109
|
-
|
|
110
|
-
// `llm-kb query <question>`: locate the knowledge base (from --folder or the
// current directory) and run the question against it.
program
  .command("query")
  .description("Ask a question across your knowledge base")
  .argument("<question>", "Your question")
  .option("--folder <path>", "Path to document folder (auto-detects if omitted)")
  .option("--save", "Save the answer to wiki/outputs/ (research mode)")
  .action(async (question: string, options: { folder?: string; save?: boolean }) => {
    const root = resolveKnowledgeBase(options.folder || process.cwd());

    if (!root) {
      console.error(chalk.red("No knowledge base found. Run 'llm-kb run <folder>' first."));
      process.exit(1);
    }

    try {
      await query(root, question, { save: options.save });
    } catch (err: unknown) {
      // Narrow before reading `.message`: a thrown non-Error value would
      // otherwise print "undefined".
      const message = err instanceof Error ? err.message : String(err);
      console.error(chalk.red(message));
      process.exit(1);
    }
  });

// Parse process arguments and dispatch to the matching command.
program.parse();
|
package/src/indexer.ts
DELETED
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
import {
  createAgentSession,
  createBashTool,
  createReadTool,
  createWriteTool,
  DefaultResourceLoader,
  SessionManager,
  SettingsManager,
} from "@mariozechner/pi-coding-agent";
import { existsSync } from "node:fs";
import { readdir, readFile } from "node:fs/promises";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
|
|
13
|
-
|
|
14
|
-
const __filename = fileURLToPath(import.meta.url);
|
|
15
|
-
const __dirname = dirname(__filename);
|
|
16
|
-
|
|
17
|
-
/**
 * Find the node_modules directory for llm-kb's bundled libraries.
 * When running from bin/cli.js, node_modules is at ../node_modules.
 *
 * Checks `<dir>/node_modules` for `startDir` and then for each parent, up to
 * five directories in total, and returns the first one that exists. The
 * previous implementation wrapped a plain `return` in try/catch, which can
 * never throw, so it always returned the first candidate without checking it.
 *
 * @param startDir - Directory to start from; defaults to this module's directory.
 * @returns The first existing node_modules path, or `<cwd>/node_modules` when
 *   none is found within the search limit.
 */
function getNodeModulesPath(startDir: string = __dirname): string {
  const maxLevels = 5;
  let dir = startDir;
  for (let level = 0; level < maxLevels; level++) {
    const candidate = join(dir, "node_modules");
    if (existsSync(candidate)) {
      return candidate;
    }
    const parent = dirname(dir);
    // dirname() of a filesystem root is the root itself: nothing left to climb.
    if (parent === dir) break;
    dir = parent;
  }
  return join(process.cwd(), "node_modules");
}
|
|
34
|
-
|
|
35
|
-
/**
 * Build the AGENTS.md text handed to the indexing agent.
 *
 * Only entries of `files` ending in ".md" are listed under "Available parsed
 * sources"; the rest of the document is fixed text.
 *
 * @param sourcesDir - Accepted for signature compatibility; not read here.
 * @param files - File names found in the sources directory.
 * @returns The complete AGENTS.md content, ending with a newline.
 */
function buildAgentsContent(sourcesDir: string, files: string[]): string {
  const bulletLines: string[] = [];
  for (const file of files) {
    if (file.endsWith(".md")) {
      bulletLines.push(` - ${file}`);
    }
  }
  const sourceList = bulletLines.join("\n");

  // One array entry per output line; the trailing "" yields the final newline.
  const documentLines: string[] = [
    "# llm-kb Knowledge Base",
    "",
    "## How to access documents",
    "",
    "### PDFs (pre-parsed)",
    "PDFs have been parsed to markdown with bounding boxes.",
    "Read the markdown versions in `.llm-kb/wiki/sources/` instead of the raw PDFs.",
    "",
    "Available parsed sources:",
    sourceList,
    "",
    "### Other file types (Excel, Word, PowerPoint, CSV, images)",
    "You have bash and read tools. These libraries are pre-installed and available:",
    "- **exceljs** — for .xlsx/.xls files",
    "- **mammoth** — for .docx files",
    "- **officeparser** — for .pptx files",
    "- **csv-parse** — built into Node.js, use fs + split for .csv",
    "",
    "Write a quick Node.js script to extract content when needed.",
    "",
    "## Index file",
    "Write the index to `.llm-kb/wiki/index.md`.",
    "",
    "The index should be a markdown file with:",
    "1. A title and last-updated timestamp",
    "2. A summary table with columns: Source, Type, Pages/Size, Summary, Key Topics",
    "3. Each source gets a one-line summary (read the first ~500 chars of each file to generate it)",
    "4. Total word count across all sources",
    "",
  ];
  return documentLines.join("\n");
}
|
|
71
|
-
|
|
72
|
-
/**
 * Run an agent session that summarises the parsed sources into
 * `.llm-kb/wiki/index.md` and return the generated file's content.
 *
 * @param folder - Knowledge-base root; used as the agent's working directory.
 * @param sourcesDir - Directory holding the parsed sources; the index is read
 *   back from its parent directory.
 * @param onOutput - Optional callback that receives streamed assistant text deltas.
 * @returns The content of the generated index.md.
 * @throws Error when no `.md` source exists or the agent produced no index.md.
 */
export async function buildIndex(
  folder: string,
  sourcesDir: string,
  onOutput?: (text: string) => void
): Promise<string> {
  // List source files
  const files = await readdir(sourcesDir);
  if (!files.some((f) => f.endsWith(".md"))) {
    throw new Error("No source files found to index");
  }

  // Build AGENTS.md content
  const agentsContent = buildAgentsContent(sourcesDir, files);

  // Set NODE_PATH so agent's bash scripts can use bundled libraries
  process.env.NODE_PATH = getNodeModulesPath();

  // Append the generated AGENTS.md to whatever agent files the loader found.
  const loader = new DefaultResourceLoader({
    cwd: folder,
    agentsFilesOverride: (current) => ({
      agentsFiles: [
        ...current.agentsFiles,
        { path: ".llm-kb/AGENTS.md", content: agentsContent },
      ],
    }),
  });
  await loader.reload();

  const { session } = await createAgentSession({
    cwd: folder,
    resourceLoader: loader,
    tools: [
      createReadTool(folder),
      createBashTool(folder),
      createWriteTool(folder),
    ],
    sessionManager: SessionManager.inMemory(),
    settingsManager: SettingsManager.inMemory({
      compaction: { enabled: false },
    }),
  });

  // try/finally guarantees the session is disposed even when the prompt (or
  // the subscription) throws; previously it leaked on those paths.
  try {
    // Subscribe to streaming output
    if (onOutput) {
      session.subscribe((event) => {
        if (
          event.type === "message_update" &&
          event.assistantMessageEvent.type === "text_delta"
        ) {
          onOutput(event.assistantMessageEvent.delta);
        }
      });
    }

    // Build the prompt
    const prompt = `Read each file in .llm-kb/wiki/sources/ (one at a time, just the first 500 characters of each).
Then write .llm-kb/wiki/index.md with a summary table of all sources.

Include: Source filename, Type (PDF/Excel/Word/etc), Pages (from the JSON if available), a one-line summary, and key topics.
Add a total word count estimate at the bottom.`;

    await session.prompt(prompt);

    // Read the generated index
    const indexPath = join(sourcesDir, "..", "index.md");
    try {
      return await readFile(indexPath, "utf-8");
    } catch {
      // Any read failure is reported as the agent not having written the file.
      throw new Error("Agent did not create index.md");
    }
  } finally {
    session.dispose();
  }
}
|
package/src/pdf.ts
DELETED
|
@@ -1,119 +0,0 @@
|
|
|
1
|
-
import { LiteParse } from "@llamaindex/liteparse";
|
|
2
|
-
import { writeFile, mkdir, stat } from "node:fs/promises";
|
|
3
|
-
import { join, basename } from "node:path";
|
|
4
|
-
import { cpus } from "node:os";
|
|
5
|
-
|
|
6
|
-
/** Outcome of a parsePDF() call for a single PDF file. */
export interface ParsedPDF {
  /** Base name of the PDF with a trailing ".pdf" removed. */
  name: string;
  /** Path of the markdown output file. */
  mdPath: string;
  /** Path of the bounding-box JSON output file. */
  jsonPath: string;
  /** Number of parsed pages; 0 when parsing was skipped. */
  totalPages: number;
  /** Length of the generated markdown; 0 when parsing was skipped. */
  textLength: number;
  /** True when the existing outputs were up to date and nothing was parsed. */
  skipped: boolean;
}
|
|
14
|
-
|
|
15
|
-
/**
 * Decide whether parsing can be skipped for a PDF.
 *
 * Resolves to true only when both output files exist and neither has a
 * modification time older than the PDF's. Resolves to false when an output is
 * older than the PDF or when any of the three files cannot be stat'ed.
 */
async function isUpToDate(
  pdfPath: string,
  mdPath: string,
  jsonPath: string
): Promise<boolean> {
  try {
    const [source, markdownOut, jsonOut] = await Promise.all([
      stat(pdfPath),
      stat(mdPath),
      stat(jsonPath),
    ]);
    const sourceTime = source.mtimeMs;
    const outputIsStale =
      sourceTime > markdownOut.mtimeMs || sourceTime > jsonOut.mtimeMs;
    return !outputIsStale;
  } catch {
    // A missing or unreadable file means the outputs cannot be trusted.
    return false;
  }
}
|
|
35
|
-
|
|
36
|
-
/**
 * Silence process.stderr until the returned function is called.
 *
 * While suppressed, `process.stderr.write` is a stub that discards its input
 * and reports success.
 *
 * @returns A function that reinstalls the writer captured at call time (a
 *   copy of the previous `write` bound to process.stderr).
 */
function suppressStderr(): () => void {
  const stream = process.stderr;
  const savedWrite = stream.write.bind(stream);
  const discard = (() => true) as any;
  stream.write = discard;
  return () => {
    stream.write = savedWrite;
  };
}
|
|
46
|
-
|
|
47
|
-
/**
 * Parse one PDF with LiteParse and write two files into `outputDir`:
 * `<name>.md` (the text of each page) and `<name>.json` (page sizes plus
 * per-item coordinates and font info).
 *
 * Parsing is skipped when isUpToDate() reports the existing outputs as
 * current. OCR is enabled when OCR_SERVER_URL is set or OCR_ENABLED is "true".
 *
 * @param pdfPath - Path of the PDF to parse.
 * @param outputDir - Directory for the outputs; created if missing.
 * @returns Output paths and counts; `skipped` is true when nothing was parsed.
 */
export async function parsePDF(
  pdfPath: string,
  outputDir: string
): Promise<ParsedPDF> {
  // Two-decimal rounding applied to every numeric value written to the JSON.
  const round2 = (value: number): number => Math.round(value * 100) / 100;

  const name = basename(pdfPath, ".pdf");
  await mkdir(outputDir, { recursive: true });

  const mdPath = join(outputDir, `${name}.md`);
  const jsonPath = join(outputDir, `${name}.json`);

  // Nothing to do when both outputs are already current.
  const upToDate = await isUpToDate(pdfPath, mdPath, jsonPath);
  if (upToDate) {
    return { name, mdPath, jsonPath, totalPages: 0, textLength: 0, skipped: true };
  }

  const ocrServerUrl = process.env.OCR_SERVER_URL;
  // A configured OCR server implies OCR; otherwise it needs an explicit opt-in.
  const ocrEnabled = Boolean(ocrServerUrl) || process.env.OCR_ENABLED === "true";

  const parser = new LiteParse({
    ocrEnabled,
    outputFormat: "json",
    numWorkers: cpus().length,
    ...(ocrServerUrl ? { ocrServerUrl } : {}),
  });

  // Keep library warnings off the terminal while parsing, and always restore
  // stderr afterwards, even when parsing fails.
  const restoreStderr = suppressStderr();
  let result;
  try {
    result = await parser.parse(pdfPath, true);
  } finally {
    restoreStderr();
  }

  const pages = result.pages;

  // Markdown: one "# Page N" section per page, separated by horizontal rules.
  const pageSections: string[] = [];
  for (const page of pages as any[]) {
    pageSections.push(`# Page ${page.pageNum}\n\n${page.text}`);
  }
  const markdown = pageSections.join("\n\n---\n\n");

  // Normalises one text item; field names vary (str/text, width/w, height/h).
  const toTextItem = (item: any) => ({
    text: (item.str ?? item.text ?? "").trim(),
    x: round2(item.x),
    y: round2(item.y),
    width: round2(item.width ?? item.w ?? 0),
    height: round2(item.height ?? item.h ?? 0),
    fontName: item.fontName,
    fontSize: item.fontSize ? round2(item.fontSize) : undefined,
  });

  // Bounding-box JSON
  const bboxData = {
    source: basename(pdfPath),
    totalPages: pages.length,
    pages: pages.map((page: any) => ({
      page: page.pageNum,
      width: page.width,
      height: page.height,
      textItems: page.textItems.map(toTextItem),
    })),
  };

  await writeFile(mdPath, markdown);
  await writeFile(jsonPath, JSON.stringify(bboxData, null, 2));

  return {
    name,
    mdPath,
    jsonPath,
    totalPages: pages.length,
    textLength: markdown.length,
    skipped: false,
  };
}
|