llm-kb 0.2.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +322 -60
  2. package/bin/anthropic-5TIU2EED.js +5515 -0
  3. package/bin/azure-openai-responses-ZVUVMK3G.js +190 -0
  4. package/bin/chunk-2WV6TQRI.js +4792 -0
  5. package/bin/chunk-3YMNGUZZ.js +262 -0
  6. package/bin/chunk-5PYKQQLA.js +14295 -0
  7. package/bin/chunk-65KFH7OI.js +31 -0
  8. package/bin/chunk-DHOXVEIR.js +7261 -0
  9. package/bin/chunk-EAQYK3U2.js +41 -0
  10. package/bin/chunk-IFS3OKBN.js +428 -0
  11. package/bin/chunk-LDHOKBJA.js +86 -0
  12. package/bin/chunk-SLYBG6ZQ.js +32681 -0
  13. package/bin/chunk-UEODFF7H.js +17 -0
  14. package/bin/chunk-XCXTZJGO.js +174 -0
  15. package/bin/chunk-XFV534WU.js +7056 -0
  16. package/bin/cli.js +5496 -163
  17. package/bin/dist-3YH7P2QF.js +1244 -0
  18. package/bin/google-JFC43EFJ.js +371 -0
  19. package/bin/google-gemini-cli-K4XNMYDI.js +712 -0
  20. package/bin/google-vertex-Y42F254G.js +414 -0
  21. package/bin/indexer-KSYRIVVN.js +10 -0
  22. package/bin/mistral-ZU2JS5XZ.js +38406 -0
  23. package/bin/multipart-parser-CO464TZY.js +371 -0
  24. package/bin/openai-codex-responses-NW2LELBH.js +712 -0
  25. package/bin/openai-completions-TW3VKTHO.js +662 -0
  26. package/bin/openai-responses-VGL522MK.js +198 -0
  27. package/bin/src-Y22OHE3S.js +1408 -0
  28. package/package.json +16 -6
  29. package/PHASE2_SPEC.md +0 -274
  30. package/SPEC.md +0 -275
  31. package/bin/chunk-MYQ36JJB.js +0 -118
  32. package/bin/indexer-LSYSZXZX.js +0 -6
  33. package/plan.md +0 -55
  34. package/src/cli.ts +0 -132
  35. package/src/indexer.ts +0 -148
  36. package/src/pdf.ts +0 -119
  37. package/src/query.ts +0 -132
  38. package/src/resolve-kb.ts +0 -19
  39. package/src/scan.ts +0 -59
  40. package/src/watcher.ts +0 -84
  41. package/tsconfig.json +0 -14
@@ -1,118 +0,0 @@
1
- // src/indexer.ts
2
- import {
3
- createAgentSession,
4
- createBashTool,
5
- createReadTool,
6
- createWriteTool,
7
- DefaultResourceLoader,
8
- SessionManager,
9
- SettingsManager
10
- } from "@mariozechner/pi-coding-agent";
11
- import { readdir, readFile } from "fs/promises";
12
- import { join, dirname } from "path";
13
- import { fileURLToPath } from "url";
14
- var __filename = fileURLToPath(import.meta.url);
15
- var __dirname = dirname(__filename);
/**
 * Return the `node_modules` path that is exported to the agent through
 * `NODE_PATH`.
 *
 * The path is the `node_modules` directory next to this module. No existence
 * check is performed. An earlier form wrapped a bare `return` in `try/catch`
 * inside a five-step parent walk; nothing in that `try` could throw, so the
 * walk and the `process.cwd()` fallback were unreachable. That dead code is
 * removed here and the returned value is unchanged.
 *
 * @returns {string} `<directory of this module>/node_modules`
 */
function getNodeModulesPath() {
  return join(__dirname, "node_modules");
}
/**
 * Render the AGENTS.md text that is injected into the indexing session.
 *
 * Only entries ending in `.md` are listed under "Available parsed sources".
 *
 * @param {string} sourcesDir Accepted for signature compatibility; not read.
 * @param {string[]} files Directory entries of the parsed-sources folder.
 * @returns {string} Markdown instructions, terminated by a newline.
 */
function buildAgentsContent(sourcesDir, files) {
  const bullets = [];
  for (const entry of files) {
    if (entry.endsWith(".md")) {
      bullets.push(` - ${entry}`);
    }
  }

  const lines = [
    "# llm-kb Knowledge Base",
    "",
    "## How to access documents",
    "",
    "### PDFs (pre-parsed)",
    "PDFs have been parsed to markdown with bounding boxes.",
    "Read the markdown versions in `.llm-kb/wiki/sources/` instead of the raw PDFs.",
    "",
    "Available parsed sources:",
    bullets.join("\n"),
    "",
    "### Other file types (Excel, Word, PowerPoint, CSV, images)",
    "You have bash and read tools. These libraries are pre-installed and available:",
    "- **exceljs** \u2014 for .xlsx/.xls files",
    "- **mammoth** \u2014 for .docx files",
    "- **officeparser** \u2014 for .pptx files",
    "- **csv-parse** \u2014 built into Node.js, use fs + split for .csv",
    "",
    "Write a quick Node.js script to extract content when needed.",
    "",
    "## Index file",
    "Write the index to `.llm-kb/wiki/index.md`.",
    "",
    "The index should be a markdown file with:",
    "1. A title and last-updated timestamp",
    "2. A summary table with columns: Source, Type, Pages/Size, Summary, Key Topics",
    "3. Each source gets a one-line summary (read the first ~500 chars of each file to generate it)",
    "4. Total word count across all sources",
    // Trailing empty entry reproduces the final newline of the document.
    "",
  ];
  return lines.join("\n");
}
/**
 * Ask an agent session to write `.llm-kb/wiki/index.md` and return its text.
 *
 * @param {string} folder Working directory for the session and root of its tools.
 * @param {string} sourcesDir Directory of parsed sources; the index is read
 *   back from its parent directory.
 * @param {(text: string) => void} [onOutput] Receives streamed assistant text deltas.
 * @returns {Promise<string>} Content of the generated `index.md`.
 * @throws {Error} "No source files found to index" when `sourcesDir` has no
 *   `.md` entries; "Agent did not create index.md" when the file cannot be
 *   read after the prompt completes.
 */
async function buildIndex(folder, sourcesDir, onOutput) {
  const files = await readdir(sourcesDir);
  if (!files.some((f) => f.endsWith(".md"))) {
    throw new Error("No source files found to index");
  }

  const agentsContent = buildAgentsContent(sourcesDir, files);

  // Replaces any NODE_PATH already present in this process's environment.
  process.env.NODE_PATH = getNodeModulesPath();

  const loader = new DefaultResourceLoader({
    cwd: folder,
    agentsFilesOverride: (current) => ({
      agentsFiles: [
        ...current.agentsFiles,
        { path: ".llm-kb/AGENTS.md", content: agentsContent }
      ]
    })
  });
  await loader.reload();

  const { session } = await createAgentSession({
    cwd: folder,
    resourceLoader: loader,
    tools: [
      createReadTool(folder),
      createBashTool(folder),
      createWriteTool(folder)
    ],
    sessionManager: SessionManager.inMemory(),
    settingsManager: SettingsManager.inMemory({
      compaction: { enabled: false }
    })
  });

  // Everything that uses the session runs inside try/finally so the session
  // is disposed even when the prompt itself rejects.
  try {
    if (onOutput) {
      session.subscribe((event) => {
        if (event.type === "message_update" && event.assistantMessageEvent.type === "text_delta") {
          onOutput(event.assistantMessageEvent.delta);
        }
      });
    }

    const prompt = `Read each file in .llm-kb/wiki/sources/ (one at a time, just the first 500 characters of each).
Then write .llm-kb/wiki/index.md with a summary table of all sources.

Include: Source filename, Type (PDF/Excel/Word/etc), Pages (from the JSON if available), a one-line summary, and key topics.
Add a total word count estimate at the bottom.`;
    await session.prompt(prompt);

    const indexPath = join(sourcesDir, "..", "index.md");
    try {
      return await readFile(indexPath, "utf-8");
    } catch {
      throw new Error("Agent did not create index.md");
    }
  } finally {
    session.dispose();
  }
}
115
-
116
- export {
117
- buildIndex
118
- };
@@ -1,6 +0,0 @@
1
- import {
2
- buildIndex
3
- } from "./chunk-MYQ36JJB.js";
4
- export {
5
- buildIndex
6
- };
package/plan.md DELETED
@@ -1,55 +0,0 @@
1
- # llm-kb — Phase 1 Build Plan
2
-
3
- > Emergent design. Each slice is a thin vertical slice that works end-to-end, is demoable, and informs the next step. Decisions are made at the last responsible moment.
4
-
5
- ## Key Learnings
6
- - **PDF is the only adapter we build.** Everything else (Excel, Word, PPT, CSV, images) is handled dynamically by the Pi SDK agent at query time.
7
- - **`@llamaindex/liteparse`** proven (from parser-study). Extracts text + bounding boxes locally.
8
- - **Two-output pattern**: `.md` (spatial text) + `.json` (bounding boxes for citations).
9
- - **OCR off by default.** Most PDFs have native text. Enable via `OCR_SERVER_URL` or `OCR_ENABLED=true`.
10
- - **Pi SDK `createAgentSession()`** with defaults — no auth/model config needed. Uses Pi's existing auth.
11
- - **AGENTS.md injected via `agentsFilesOverride`** — user's folder stays clean.
12
- - **NODE_PATH** set so agent's bash scripts can use bundled libraries (exceljs, mammoth, officeparser).
13
- - **Config file skipped** — nothing reads it yet. Add when Phase 2/3 needs it.
14
-
15
- ---
16
-
17
- ## Slice 1: "Hello World" CLI ✅
18
- Commander CLI with `run <folder>`. Scans folder, lists files by extension.
19
-
20
- ## Slice 2: PDF → markdown + bounding boxes ✅
21
- LiteParse parses PDFs → `.md` + `.json` in `.llm-kb/wiki/sources/`. Tested on 9 real PDFs (1000+ pages).
22
-
23
- ## Slice 3: Scanned PDF handling (OCR) ✅
24
- LiteParse has Tesseract.js built-in. `ocrEnabled` + `ocrServerUrl` config. OCR off by default. Azure OCR bridge tested on 16 legal PDFs (3000+ pages).
25
-
26
- ## Slice 4: Progress + error handling ✅
27
- Inline progress. Stderr suppression. Corrupt files are skipped with a warning. Mtime check — re-runs on unchanged PDFs are instant.
28
-
29
- ## Slice 5: Indexer (Pi SDK) ✅
30
- `createAgentSession` with cwd = user's folder. AGENTS.md injected. Agent reads sources, writes `index.md` with summary table.
31
-
32
- ## Slice 6: File watcher ✅
33
- chokidar watches folder. New/changed PDFs → parse → re-index. 2s debounce for batch drops.
34
-
35
- ## Slice 7: Config + polish → Skipped
36
- Config file has no readers yet. Deferred to Phase 2/3. README updated instead.
37
-
38
- ---
39
-
40
- ## Phase 1 Complete ✅
41
-
42
- **What ships:**
43
- - `llm-kb run ./folder` — scan, parse PDFs, build index, watch for new files
44
- - Pre-bundled libraries for agent to handle Excel, Word, PowerPoint at query time
45
- - OCR via env var (local Tesseract or remote Azure bridge)
46
- - Auth via Pi SDK (zero config)
47
-
48
- **Phase 2 complete ✅:**
49
- - `llm-kb query "question"` — auto-detects KB, streams cited answers
50
- - `--save` flag — research mode, saves to `outputs/`, re-indexes
51
- - Query mode is read-only (read tool only). Research mode adds bash + write.
52
-
53
- **Deferred to Phase 4:**
54
- - Trace logging (JSON per query: question, filesRead, citations, tokens, duration)
55
- - Needed for eval, but no eval system yet to consume traces
package/src/cli.ts DELETED
@@ -1,132 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- import { Command } from "commander";
4
- import { scan, summarize } from "./scan.js";
5
- import { parsePDF } from "./pdf.js";
6
- import { buildIndex } from "./indexer.js";
7
- import { startWatcher } from "./watcher.js";
8
- import { query } from "./query.js";
9
- import { resolveKnowledgeBase } from "./resolve-kb.js";
10
- import { existsSync } from "node:fs";
11
- import { mkdir } from "node:fs/promises";
12
- import { resolve, join } from "node:path";
13
- import chalk from "chalk";
14
-
/** Version shown by `--version` and in the `run` banner. */
const VERSION = "0.2.0";

/** Turn an unknown thrown value into a printable message. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

const program = new Command();

program
  .name("llm-kb")
  .description("Drop files into a folder. Get a knowledge base you can query.")
  .version(VERSION);

// `run`: scan the folder, parse every PDF, build the index, then keep watching.
program
  .command("run")
  .description("Scan, parse, index, and watch a folder")
  .argument("<folder>", "Path to your documents folder")
  .action(async (folder: string) => {
    console.log(`\n${chalk.bold("llm-kb")} v${VERSION}\n`);

    if (!existsSync(folder)) {
      console.error(chalk.red(`Error: Folder not found: ${folder}`));
      process.exit(1);
    }

    console.log(`Scanning ${folder}...`);

    const files = await scan(folder);

    if (files.length === 0) {
      console.log(chalk.yellow(" No supported files found."));
      return;
    }

    const pdfs = files.filter((f) => f.ext === ".pdf");
    console.log(` Found ${chalk.bold(files.length.toString())} files (${summarize(files)})`);
    // Nothing below runs without at least one PDF: no parse, no index, no watcher.
    if (pdfs.length === 0) return;

    // Set up .llm-kb folder structure
    const root = resolve(folder);
    const sourcesDir = join(root, ".llm-kb", "wiki", "sources");
    await mkdir(sourcesDir, { recursive: true });

    // Parse PDFs with inline progress
    let parsed = 0;
    let skipped = 0;
    let failed = 0;
    const errors: { name: string; message: string }[] = [];

    for (const [i, pdf] of pdfs.entries()) {
      const fullPath = join(root, pdf.path);

      // Inline progress — overwrite same line
      const progress = ` Parsing... ${i + 1}/${pdfs.length} — ${pdf.name}`;
      process.stdout.write(`\r${progress.padEnd(80)}`);

      try {
        const result = await parsePDF(fullPath, sourcesDir);
        if (result.skipped) {
          skipped++;
        } else {
          parsed++;
        }
      } catch (err: unknown) {
        // One bad file must not stop the batch; it is reported after the loop.
        failed++;
        errors.push({ name: pdf.name, message: errorMessage(err) });
      }
    }

    // Clear progress line
    process.stdout.write(`\r${"".padEnd(80)}\r`);

    // Summary
    const parts: string[] = [];
    if (parsed > 0) parts.push(chalk.green(`${parsed} parsed`));
    if (skipped > 0) parts.push(chalk.dim(`${skipped} skipped (up to date)`));
    if (failed > 0) parts.push(chalk.red(`${failed} failed`));
    console.log(` ${parts.join(", ")}`);

    // Show errors
    for (const err of errors) {
      console.log(chalk.red(` ✗ ${err.name} — ${err.message}`));
    }

    // Build index; a failure is reported but does not prevent watching.
    console.log(`\n Building index...`);
    try {
      await buildIndex(root, sourcesDir);
      console.log(chalk.green(` Index built: .llm-kb/wiki/index.md`));
    } catch (err: unknown) {
      console.error(chalk.red(` Index failed: ${errorMessage(err)}`));
    }

    console.log(`\n ${chalk.dim("Output:")} ${sourcesDir}`);

    // Start watching for new files
    console.log(chalk.dim(`\n Watching for new files... (Ctrl+C to stop)`));
    startWatcher({ folder: root, sourcesDir });
  });

// `query`: answer a question against an existing knowledge base.
program
  .command("query")
  .description("Ask a question across your knowledge base")
  .argument("<question>", "Your question")
  .option("--folder <path>", "Path to document folder (auto-detects if omitted)")
  .option("--save", "Save the answer to wiki/outputs/ (research mode)")
  .action(async (question: string, options: { folder?: string; save?: boolean }) => {
    const root = resolveKnowledgeBase(options.folder || process.cwd());

    if (!root) {
      console.error(chalk.red("No knowledge base found. Run 'llm-kb run <folder>' first."));
      process.exit(1);
    }

    try {
      await query(root, question, { save: options.save });
    } catch (err: unknown) {
      console.error(chalk.red(errorMessage(err)));
      process.exit(1);
    }
  });

// Both actions are async, so parse with parseAsync(): a rejection from a
// handler (for example scan() or mkdir() failing) is reported here instead of
// surfacing as an unhandled promise rejection.
program.parseAsync().catch((err: unknown) => {
  console.error(chalk.red(errorMessage(err)));
  process.exit(1);
});
package/src/indexer.ts DELETED
@@ -1,148 +0,0 @@
import {
  createAgentSession,
  createBashTool,
  createReadTool,
  createWriteTool,
  DefaultResourceLoader,
  SessionManager,
  SettingsManager,
} from "@mariozechner/pi-coding-agent";
import { existsSync } from "node:fs";
import { readdir, readFile } from "node:fs/promises";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
13
-
14
- const __filename = fileURLToPath(import.meta.url);
15
- const __dirname = dirname(__filename);
16
-
/**
 * Walk upward from `startDir` and return the first `node_modules` directory
 * that exists on disk.
 *
 * @param startDir - Directory to start from; it is checked first.
 * @param maxLevels - Maximum number of directories to inspect, including `startDir`.
 * @returns The matching path, or `undefined` when none is found within
 *   `maxLevels` or before the filesystem root is reached.
 */
function findNodeModulesDir(startDir: string, maxLevels: number): string | undefined {
  let dir = startDir;
  for (let level = 0; level < maxLevels; level++) {
    const candidate = join(dir, "node_modules");
    if (existsSync(candidate)) {
      return candidate;
    }
    const parent = dirname(dir);
    // dirname() of a filesystem root is the root itself: nothing left to climb.
    if (parent === dir) {
      break;
    }
    dir = parent;
  }
  return undefined;
}

/**
 * Find the node_modules directory for llm-kb's bundled libraries.
 *
 * Looks in this file's directory and up to four ancestors, returning the first
 * `node_modules` that exists. Falls back to `<cwd>/node_modules` when none is
 * found. The earlier version returned the first candidate unconditionally
 * because its `try` block contained nothing that could throw.
 */
function getNodeModulesPath(): string {
  return findNodeModulesDir(__dirname, 5) ?? join(process.cwd(), "node_modules");
}
34
-
/**
 * Render the AGENTS.md text that is injected into the indexing session.
 *
 * Only entries ending in `.md` are listed under "Available parsed sources".
 *
 * @param sourcesDir - Accepted for signature compatibility; not read.
 * @param files - Directory entries of the parsed-sources folder.
 * @returns Markdown instructions, terminated by a newline.
 */
function buildAgentsContent(sourcesDir: string, files: string[]): string {
  const bullets: string[] = [];
  for (const entry of files) {
    if (entry.endsWith(".md")) {
      bullets.push(` - ${entry}`);
    }
  }

  const lines = [
    "# llm-kb Knowledge Base",
    "",
    "## How to access documents",
    "",
    "### PDFs (pre-parsed)",
    "PDFs have been parsed to markdown with bounding boxes.",
    "Read the markdown versions in `.llm-kb/wiki/sources/` instead of the raw PDFs.",
    "",
    "Available parsed sources:",
    bullets.join("\n"),
    "",
    "### Other file types (Excel, Word, PowerPoint, CSV, images)",
    "You have bash and read tools. These libraries are pre-installed and available:",
    "- **exceljs** — for .xlsx/.xls files",
    "- **mammoth** — for .docx files",
    "- **officeparser** — for .pptx files",
    "- **csv-parse** — built into Node.js, use fs + split for .csv",
    "",
    "Write a quick Node.js script to extract content when needed.",
    "",
    "## Index file",
    "Write the index to `.llm-kb/wiki/index.md`.",
    "",
    "The index should be a markdown file with:",
    "1. A title and last-updated timestamp",
    "2. A summary table with columns: Source, Type, Pages/Size, Summary, Key Topics",
    "3. Each source gets a one-line summary (read the first ~500 chars of each file to generate it)",
    "4. Total word count across all sources",
    // Trailing empty entry reproduces the final newline of the document.
    "",
  ];
  return lines.join("\n");
}
71
-
/**
 * Ask an agent session to write `.llm-kb/wiki/index.md` and return its text.
 *
 * @param folder - Working directory for the session and root of its tools.
 * @param sourcesDir - Directory of parsed sources; the index is read back from
 *   its parent directory.
 * @param onOutput - Optional sink for streamed assistant text deltas.
 * @returns Content of the generated `index.md`.
 * @throws Error "No source files found to index" when `sourcesDir` has no
 *   `.md` entries; "Agent did not create index.md" when the file cannot be
 *   read after the prompt completes.
 */
export async function buildIndex(
  folder: string,
  sourcesDir: string,
  onOutput?: (text: string) => void
): Promise<string> {
  const files = await readdir(sourcesDir);
  if (!files.some((f) => f.endsWith(".md"))) {
    throw new Error("No source files found to index");
  }

  const agentsContent = buildAgentsContent(sourcesDir, files);

  // Set NODE_PATH so the agent's bash scripts can use bundled libraries.
  // This replaces any NODE_PATH already present in this process's environment.
  process.env.NODE_PATH = getNodeModulesPath();

  const loader = new DefaultResourceLoader({
    cwd: folder,
    agentsFilesOverride: (current) => ({
      agentsFiles: [
        ...current.agentsFiles,
        { path: ".llm-kb/AGENTS.md", content: agentsContent },
      ],
    }),
  });
  await loader.reload();

  const { session } = await createAgentSession({
    cwd: folder,
    resourceLoader: loader,
    tools: [
      createReadTool(folder),
      createBashTool(folder),
      createWriteTool(folder),
    ],
    sessionManager: SessionManager.inMemory(),
    settingsManager: SettingsManager.inMemory({
      compaction: { enabled: false },
    }),
  });

  // Everything that uses the session runs inside try/finally so the session
  // is disposed even when the prompt itself rejects.
  try {
    if (onOutput) {
      session.subscribe((event) => {
        if (
          event.type === "message_update" &&
          event.assistantMessageEvent.type === "text_delta"
        ) {
          onOutput(event.assistantMessageEvent.delta);
        }
      });
    }

    const prompt = `Read each file in .llm-kb/wiki/sources/ (one at a time, just the first 500 characters of each).
Then write .llm-kb/wiki/index.md with a summary table of all sources.

Include: Source filename, Type (PDF/Excel/Word/etc), Pages (from the JSON if available), a one-line summary, and key topics.
Add a total word count estimate at the bottom.`;

    await session.prompt(prompt);

    // Read the generated index back from the parent of the sources directory.
    const indexPath = join(sourcesDir, "..", "index.md");
    try {
      return await readFile(indexPath, "utf-8");
    } catch {
      throw new Error("Agent did not create index.md");
    }
  } finally {
    session.dispose();
  }
}
package/src/pdf.ts DELETED
@@ -1,119 +0,0 @@
1
- import { LiteParse } from "@llamaindex/liteparse";
2
- import { writeFile, mkdir, stat } from "node:fs/promises";
3
- import { join, basename } from "node:path";
4
- import { cpus } from "node:os";
5
-
/** Outcome of one `parsePDF` call. */
export interface ParsedPDF {
  /** File name of the PDF with a trailing `.pdf` removed. */
  name: string;
  /** Path of the markdown output inside the output directory. */
  mdPath: string;
  /** Path of the bounding-box JSON output inside the output directory. */
  jsonPath: string;
  /** Number of parsed pages; `0` when parsing was skipped. */
  totalPages: number;
  /** Length of the generated markdown in characters; `0` when parsing was skipped. */
  textLength: number;
  /** `true` when existing outputs were reused and the PDF was not parsed. */
  skipped: boolean;
}
14
-
/**
 * Decide whether parsing can be skipped for a PDF.
 *
 * Resolves to `true` only when both outputs exist and neither is older than
 * the PDF (modification times compared in milliseconds). Any `stat` failure,
 * such as a missing output file, resolves to `false`.
 */
async function isUpToDate(
  pdfPath: string,
  mdPath: string,
  jsonPath: string
): Promise<boolean> {
  try {
    const [source, markdown, boxes] = await Promise.all([
      stat(pdfPath),
      stat(mdPath),
      stat(jsonPath),
    ]);
    // The older of the two outputs decides: it must not predate the source.
    const oldestOutput = Math.min(markdown.mtimeMs, boxes.mtimeMs);
    return source.mtimeMs <= oldestOutput;
  } catch {
    return false;
  }
}
35
-
/**
 * Silence `process.stderr` until the returned function is called.
 *
 * While silenced, `process.stderr.write` discards its arguments and returns
 * `true`. The returned function reinstalls a copy of the previous `write`
 * bound to `process.stderr`.
 *
 * @returns A function that restores stderr output.
 */
function suppressStderr(): () => void {
  const stderr = process.stderr;
  const savedWrite = stderr.write.bind(stderr);
  const discard = () => true;
  stderr.write = discard as typeof stderr.write;

  return function restoreStderr(): void {
    stderr.write = savedWrite;
  };
}
46
-
/** Fields read from one text item of a parsed page. */
interface RawTextItem {
  str?: string;
  text?: string;
  x: number;
  y: number;
  width?: number;
  w?: number;
  height?: number;
  h?: number;
  fontName?: unknown;
  fontSize?: number;
}

/** Fields read from one parsed page. */
interface RawPage {
  pageNum: number;
  text: string;
  width: number;
  height: number;
  textItems: RawTextItem[];
}

/** Round to two decimal places. */
function round2(value: number): number {
  return Math.round(value * 100) / 100;
}

/** Join page texts into one markdown document with a heading per page. */
function renderMarkdown(pages: readonly RawPage[]): string {
  return pages
    .map((p) => `# Page ${p.pageNum}\n\n${p.text}`)
    .join("\n\n---\n\n");
}

/** Build the bounding-box document that is written next to the markdown. */
function buildBoundingBoxData(pdfPath: string, pages: readonly RawPage[]) {
  return {
    source: basename(pdfPath),
    totalPages: pages.length,
    pages: pages.map((p) => ({
      page: p.pageNum,
      width: p.width,
      height: p.height,
      textItems: p.textItems.map((item) => ({
        // Text and size are read from whichever of the two field names is set.
        text: (item.str ?? item.text ?? "").trim(),
        x: round2(item.x),
        y: round2(item.y),
        width: round2(item.width ?? item.w ?? 0),
        height: round2(item.height ?? item.h ?? 0),
        fontName: item.fontName,
        // A missing or zero font size is emitted as undefined (omitted from JSON).
        fontSize: item.fontSize ? round2(item.fontSize) : undefined,
      })),
    })),
  };
}

/**
 * Parse one PDF into `<name>.md` and `<name>.json` inside `outputDir`.
 *
 * Parsing is skipped when both outputs already exist and are not older than
 * the PDF. OCR is enabled when `OCR_SERVER_URL` is set or `OCR_ENABLED` is
 * exactly `"true"`.
 *
 * @param pdfPath - Path of the PDF to parse.
 * @param outputDir - Directory for the outputs; created if missing.
 * @returns Output paths and size figures; `skipped` is `true` when existing
 *   outputs were reused.
 */
export async function parsePDF(
  pdfPath: string,
  outputDir: string
): Promise<ParsedPDF> {
  const name = basename(pdfPath, ".pdf");
  await mkdir(outputDir, { recursive: true });

  const mdPath = join(outputDir, `${name}.md`);
  const jsonPath = join(outputDir, `${name}.json`);

  // Skip if already parsed and source hasn't changed
  if (await isUpToDate(pdfPath, mdPath, jsonPath)) {
    return { name, mdPath, jsonPath, totalPages: 0, textLength: 0, skipped: true };
  }

  const ocrServerUrl = process.env.OCR_SERVER_URL;
  const ocrEnabled = ocrServerUrl ? true : process.env.OCR_ENABLED === "true";

  const parser = new LiteParse({
    ocrEnabled,
    outputFormat: "json",
    numWorkers: cpus().length,
    ...(ocrServerUrl ? { ocrServerUrl } : {}),
  });

  // Suppress noisy Tesseract/PDF.js warnings during parse; stderr is restored
  // even when parsing throws.
  const restore = suppressStderr();
  let pages: RawPage[];
  try {
    const result = await parser.parse(pdfPath, true);
    // Page objects are treated as untyped at this boundary, as before; only
    // the fields listed in RawPage are read from them.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    pages = result.pages as any[];
  } finally {
    restore();
  }

  const markdown = renderMarkdown(pages);
  const bboxData = buildBoundingBoxData(pdfPath, pages);

  await writeFile(mdPath, markdown);
  await writeFile(jsonPath, JSON.stringify(bboxData, null, 2));

  return {
    name,
    mdPath,
    jsonPath,
    totalPages: pages.length,
    textLength: markdown.length,
    skipped: false,
  };
}