pageindex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,81 @@
1
+ {
2
+ "name": "pageindex",
3
+ "version": "1.0.0",
4
+ "description": "Vectorless, reasoning-based RAG for document understanding. Multi-runtime (Node.js, Bun, Deno). PDF and Markdown support with OCR.",
5
+ "module": "dist/index.js",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "type": "module",
9
+ "exports": {
10
+ ".": {
11
+ "types": "./dist/index.d.ts",
12
+ "import": "./dist/index.js",
13
+ "require": "./dist/index.cjs"
14
+ }
15
+ },
16
+ "bin": {
17
+ "pageindex": "dist/cli.js"
18
+ },
19
+ "scripts": {
20
+ "build": "npm run build:esm && npm run build:cjs && npm run build:cli && npm run build:types",
21
+ "build:esm": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --platform=node --external:openai --external:pdf-parse --external:pdf-poppler",
22
+ "build:cjs": "esbuild src/index.ts --bundle --format=cjs --outfile=dist/index.cjs --platform=node --external:openai --external:pdf-parse --external:pdf-poppler",
23
+ "build:cli": "esbuild src/cli.ts --bundle --format=esm --outfile=dist/cli.js --platform=node --external:openai --external:pdf-parse --external:pdf-poppler --banner:js='#!/usr/bin/env node'",
24
+ "build:types": "tsc --declaration --emitDeclarationOnly --outDir dist",
25
+ "typecheck": "tsc --noEmit",
26
+ "test": "bun test",
27
+ "test:node": "node --test tests/*.test.js",
28
+ "benchmark": "bun run benchmarks/benchmark.ts",
29
+ "prepublishOnly": "npm run build"
30
+ },
31
+ "keywords": [
32
+ "pageindex",
33
+ "rag",
34
+ "retrieval",
35
+ "document",
36
+ "pdf",
37
+ "markdown",
38
+ "reasoning",
39
+ "vectorless",
40
+ "llm",
41
+ "openai",
42
+ "tree-index",
43
+ "information-retrieval",
44
+ "ai",
45
+ "ocr",
46
+ "nodejs",
47
+ "bun",
48
+ "deno",
49
+ "multi-runtime"
50
+ ],
51
+ "author": "Antonio Oliveira <antonio@oakoliver.com> (https://oakoliver.com)",
52
+ "license": "MIT",
53
+ "repository": {
54
+ "type": "git",
55
+ "url": "git+https://github.com/oakoliver/pageindex.git"
56
+ },
57
+ "homepage": "https://github.com/oakoliver/pageindex#readme",
58
+ "bugs": {
59
+ "url": "https://github.com/oakoliver/pageindex/issues"
60
+ },
61
+ "engines": {
62
+ "node": ">=18.0.0"
63
+ },
64
+ "files": [
65
+ "dist",
66
+ "src",
67
+ "README.md",
68
+ "LICENSE"
69
+ ],
70
+ "dependencies": {
71
+ "openai": "^6.29.0",
72
+ "pdf-parse": "^2.4.5",
73
+ "pdf-poppler": "^0.2.3"
74
+ },
75
+ "devDependencies": {
76
+ "@types/node": "^20.0.0",
77
+ "@types/pdf-parse": "^1.1.5",
78
+ "esbuild": "^0.27.4",
79
+ "typescript": "^5.0.0"
80
+ }
81
+ }
package/src/cli.ts ADDED
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * pageindex CLI
4
+ * Command-line interface for processing PDFs and Markdown documents
5
+ */
6
+
7
+ import { parseArgs } from "util";
8
+ import { PageIndex } from "./pageindex";
9
+ import { mdToTree } from "./markdown";
10
+ import * as path from "path";
11
+ import * as fs from "fs";
12
+ import * as fsp from "fs/promises";
13
+
14
/**
 * Normalized command-line options produced by `parseCliArgs` and consumed by `main`.
 *
 * Flag names are kebab-case on the command line and camelCase here.
 */
interface CliArgs {
  // --- Input / output ---
  /** Path to the PDF file to process (`--pdf`). */
  pdf?: string;
  /** Path to the Markdown file to process (`--md`). */
  md?: string;
  /** Output file path (`--output`, `-o`). */
  output?: string;

  // --- Model / endpoint selection ---
  /** Model name (`--model`). */
  model: string;
  /** Use LM Studio (`--lmstudio`). */
  lmstudio: boolean;
  /** Use Ollama (`--ollama`). */
  ollama: boolean;
  /** Custom OpenAI-compatible API URL (`--base-url`). */
  baseUrl?: string;

  // --- PDF options ---
  /** Pages to check for a table of contents (`--toc-check-pages`). */
  tocCheckPages: number;
  /** Max pages per node (`--max-pages-per-node`). */
  maxPagesPerNode: number;
  /** Max tokens per node (`--max-tokens-per-node`). */
  maxTokensPerNode: number;

  // --- OCR options ---
  /** Enable OCR mode for scanned PDFs (`--ocr`). */
  ocr: boolean;
  /** OCR model name (`--ocr-model`). */
  ocrModel: string;
  /** OCR prompt flavour (`--ocr-prompt-type`). */
  ocrPromptType: "text" | "formula" | "table";
  /** Image DPI used for OCR (`--image-dpi`). */
  imageDpi: number;

  // --- Markdown options ---
  /** Apply tree thinning (`--thinning`). */
  thinning: boolean;
  /** Min tokens for thinning (`--thinning-threshold`). */
  thinningThreshold: number;
  /** Token threshold for summaries (`--summary-token-threshold`). */
  summaryTokenThreshold: number;

  // --- Output shaping ---
  /** Add node IDs (`--add-node-id` / `--no-node-id`). */
  addNodeId: boolean;
  /** Add node summaries (`--add-node-summary` / `--no-node-summary`). */
  addNodeSummary: boolean;
  /** Add a document description (`--add-doc-description`). */
  addDocDescription: boolean;
  /** Include raw text in the output (`--add-node-text`). */
  addNodeText: boolean;

  /** Show the help message and exit (`--help`, `-h`). */
  help: boolean;
}
39
+
40
/**
 * Print the CLI usage text to stdout.
 *
 * The command is shown as `pageindex`, which is the executable name the
 * package registers under `bin` in package.json.
 */
function printHelp(): void {
  console.log(`
pageindex - Vectorless, reasoning-based RAG for document understanding

USAGE:
  pageindex --pdf <path>          Process a PDF file
  pageindex --md <path>           Process a Markdown file

OPTIONS:
  --pdf <path>                    Path to PDF file
  --md <path>                     Path to Markdown file
  --output, -o <path>             Output file path (default: ./results/<name>_structure.json)

MODEL OPTIONS:
  --model <name>                  Model to use (default: gpt-4o-2024-11-20)
  --lmstudio                      Use LM Studio (localhost:1234)
  --ollama                        Use Ollama (localhost:11434)
  --base-url <url>                Custom OpenAI-compatible API URL

PDF OPTIONS:
  --toc-check-pages <n>           Pages to check for TOC (default: 20)
  --max-pages-per-node <n>        Max pages per node (default: 10)
  --max-tokens-per-node <n>       Max tokens per node (default: 20000)

OCR OPTIONS (for scanned PDFs):
  --ocr                           Enable OCR mode for scanned PDFs
  --ocr-model <name>              OCR model (default: mlx-community/GLM-OCR-bf16)
  --ocr-prompt-type <type>        OCR prompt: text, formula, table (default: text)
  --image-dpi <n>                 Image DPI for OCR (default: 150)

MARKDOWN OPTIONS:
  --thinning                      Apply tree thinning
  --thinning-threshold <n>        Min tokens for thinning (default: 5000)
  --summary-token-threshold <n>   Token threshold for summaries (default: 200)

OUTPUT OPTIONS:
  --add-node-id                   Add node IDs (default: true)
  --no-node-id                    Don't add node IDs
  --add-node-summary              Add node summaries (default: true)
  --no-node-summary               Don't add node summaries
  --add-doc-description           Add document description
  --add-node-text                 Include raw text in output

  --help, -h                      Show this help message

EXAMPLES:
  pageindex --pdf document.pdf
  pageindex --md README.md --add-doc-description
  pageindex --pdf paper.pdf --lmstudio --model llama3
  pageindex --pdf report.pdf --base-url http://localhost:8080/v1
  pageindex --pdf scanned.pdf --ocr --lmstudio --model qwen/qwen3-vl-30b
`);
}
93
+
94
/** Prompt types accepted by `--ocr-prompt-type`. */
const OCR_PROMPT_TYPES = ["text", "formula", "table"] as const;

/**
 * Convert the raw string value of a numeric flag into an integer.
 *
 * @param flag - Flag name as typed on the command line, used in the error message.
 * @param raw - Raw value returned by `parseArgs`; empty or missing means "use the fallback".
 * @param fallback - Value returned when the flag was not supplied.
 * @returns The parsed non-negative integer.
 * @throws Error when the value is not a plain non-negative integer (e.g. `abc`, `12abc`, `-5`).
 */
function parseIntegerOption(flag: string, raw: string | undefined, fallback: number): number {
  if (!raw) {
    return fallback;
  }
  const trimmed = raw.trim();
  // Bare parseInt would turn "abc" into NaN and "12abc" into 12; reject both up front.
  if (!/^\d+$/.test(trimmed)) {
    throw new Error(`Invalid value for ${flag}: expected a non-negative integer, got "${raw}"`);
  }
  return Number.parseInt(trimmed, 10);
}

/**
 * Validate the value of `--ocr-prompt-type` against the supported prompt types.
 *
 * @param raw - Raw flag value; empty or missing falls back to `"text"`.
 * @throws Error when the value is not one of `text`, `formula`, `table`.
 */
function parseOcrPromptType(raw: string | undefined): (typeof OCR_PROMPT_TYPES)[number] {
  const candidate = raw || "text";
  const matched = OCR_PROMPT_TYPES.find((promptType) => promptType === candidate);
  if (matched === undefined) {
    throw new Error(
      `Invalid value for --ocr-prompt-type: expected one of ${OCR_PROMPT_TYPES.join(", ")}, got "${candidate}"`,
    );
  }
  return matched;
}

/**
 * Parse `process.argv` into a normalized {@link CliArgs} object.
 *
 * Numeric flags are declared as strings for `parseArgs` and converted here.
 * The `--no-node-id` / `--no-node-summary` flags take precedence over their
 * `--add-*` counterparts.
 *
 * @returns The normalized CLI options with defaults applied.
 * @throws Error when a numeric flag or `--ocr-prompt-type` has an invalid value
 *   (in addition to whatever `parseArgs` itself throws for unknown options).
 */
function parseCliArgs(): CliArgs {
  const { values } = parseArgs({
    args: process.argv.slice(2),
    options: {
      pdf: { type: "string" },
      md: { type: "string" },
      model: { type: "string", default: "gpt-4o-2024-11-20" },
      "toc-check-pages": { type: "string", default: "20" },
      "max-pages-per-node": { type: "string", default: "10" },
      "max-tokens-per-node": { type: "string", default: "20000" },
      "add-node-id": { type: "boolean", default: true },
      "no-node-id": { type: "boolean", default: false },
      "add-node-summary": { type: "boolean", default: true },
      "no-node-summary": { type: "boolean", default: false },
      "add-doc-description": { type: "boolean", default: false },
      "add-node-text": { type: "boolean", default: false },
      thinning: { type: "boolean", default: false },
      "thinning-threshold": { type: "string", default: "5000" },
      "summary-token-threshold": { type: "string", default: "200" },
      output: { type: "string", short: "o" },
      lmstudio: { type: "boolean", default: false },
      ollama: { type: "boolean", default: false },
      "base-url": { type: "string" },
      // OCR options
      ocr: { type: "boolean", default: false },
      "ocr-model": { type: "string", default: "mlx-community/GLM-OCR-bf16" },
      "ocr-prompt-type": { type: "string", default: "text" },
      "image-dpi": { type: "string", default: "150" },
      help: { type: "boolean", short: "h", default: false },
    },
    allowPositionals: true,
  });

  return {
    pdf: values.pdf,
    md: values.md,
    // `||` (not `??`) so an explicitly empty value still falls back to the default.
    model: values.model || "gpt-4o-2024-11-20",
    tocCheckPages: parseIntegerOption("--toc-check-pages", values["toc-check-pages"], 20),
    maxPagesPerNode: parseIntegerOption("--max-pages-per-node", values["max-pages-per-node"], 10),
    maxTokensPerNode: parseIntegerOption("--max-tokens-per-node", values["max-tokens-per-node"], 20000),
    // The negative flag wins when both forms are given.
    addNodeId: values["no-node-id"] ? false : (values["add-node-id"] ?? true),
    addNodeSummary: values["no-node-summary"] ? false : (values["add-node-summary"] ?? true),
    addDocDescription: values["add-doc-description"] ?? false,
    addNodeText: values["add-node-text"] ?? false,
    thinning: values.thinning ?? false,
    thinningThreshold: parseIntegerOption("--thinning-threshold", values["thinning-threshold"], 5000),
    summaryTokenThreshold: parseIntegerOption(
      "--summary-token-threshold",
      values["summary-token-threshold"],
      200,
    ),
    output: values.output,
    lmstudio: values.lmstudio ?? false,
    ollama: values.ollama ?? false,
    baseUrl: values["base-url"],
    // OCR options
    ocr: values.ocr ?? false,
    ocrModel: values["ocr-model"] || "mlx-community/GLM-OCR-bf16",
    ocrPromptType: parseOcrPromptType(values["ocr-prompt-type"]),
    imageDpi: parseIntegerOption("--image-dpi", values["image-dpi"], 150),
    help: values.help ?? false,
  };
}
153
+
154
/**
 * CLI entry point: validate the arguments, build the tree index for the given
 * PDF or Markdown file, and write it as pretty-printed JSON.
 *
 * Exits with code 0 after printing help, and with code 1 on any validation
 * failure. The directory that will hold the output file is created only after
 * the input has been validated, and it is derived from the final output path,
 * so a custom `--output` in a not-yet-existing directory works and `./results`
 * is not created when it is not used.
 */
async function main(): Promise<void> {
  const args = parseCliArgs();

  if (args.help) {
    printHelp();
    process.exit(0);
  }

  // Validate input
  const inputPath = args.pdf || args.md;
  if (!inputPath) {
    console.error("Error: Either --pdf or --md must be specified");
    console.error("Use --help for usage information");
    process.exit(1);
  }

  if (args.pdf && args.md) {
    console.error("Error: Only one of --pdf or --md can be specified");
    process.exit(1);
  }

  const isPdf = Boolean(args.pdf);
  const lowerCasedInput = inputPath.toLowerCase();

  if (isPdf) {
    if (!lowerCasedInput.endsWith(".pdf")) {
      console.error("Error: PDF file must have .pdf extension");
      process.exit(1);
    }
    if (!fs.existsSync(inputPath)) {
      console.error(`Error: PDF file not found: ${inputPath}`);
      process.exit(1);
    }
  } else {
    if (!lowerCasedInput.endsWith(".md") && !lowerCasedInput.endsWith(".markdown")) {
      console.error("Error: Markdown file must have .md or .markdown extension");
      process.exit(1);
    }
    if (!fs.existsSync(inputPath)) {
      console.error(`Error: Markdown file not found: ${inputPath}`);
      process.exit(1);
    }
  }

  // Determine output path
  const inputName = path.basename(inputPath, path.extname(inputPath));
  const outputPath = args.output || path.join("./results", `${inputName}_structure.json`);

  // Create the directory that will actually receive the file *before* the
  // (potentially long) processing step, so an unwritable location fails fast.
  await fsp.mkdir(path.dirname(outputPath), { recursive: true });

  let result: unknown;

  if (isPdf) {
    console.log(`Processing PDF: ${inputPath}`);
    if (args.ocr) {
      console.log(`[OCR Mode] Using OCR model: ${args.ocrModel}`);
    }

    // Create PageIndex instance
    const pageIndex = new PageIndex({
      model: args.model,
      tocCheckPageNum: args.tocCheckPages,
      maxPageNumEachNode: args.maxPagesPerNode,
      maxTokenNumEachNode: args.maxTokensPerNode,
      addNodeId: args.addNodeId,
      addNodeSummary: args.addNodeSummary,
      addDocDescription: args.addDocDescription,
      addNodeText: args.addNodeText,
      // OCR options
      extractionMode: args.ocr ? "ocr" : "text",
      ocrModel: args.ocrModel,
      ocrPromptType: args.ocrPromptType,
      imageDpi: args.imageDpi,
    });

    // Configure endpoint; --lmstudio takes precedence over --ollama, then --base-url.
    if (args.lmstudio) {
      pageIndex.useLMStudio();
    } else if (args.ollama) {
      pageIndex.useOllama();
    } else if (args.baseUrl) {
      pageIndex.setBaseUrl(args.baseUrl);
    }

    // Process PDF
    result = await pageIndex.fromPdf(inputPath);
  } else {
    console.log(`Processing Markdown: ${inputPath}`);

    // NOTE(review): --lmstudio / --ollama / --base-url are not forwarded on the
    // Markdown path; confirm whether mdToTree accepts an endpoint option.
    result = await mdToTree(inputPath, {
      model: args.model,
      addNodeId: args.addNodeId,
      addNodeSummary: args.addNodeSummary,
      addDocDescription: args.addDocDescription,
      addNodeText: args.addNodeText,
      thinning: args.thinning,
      thinningThreshold: args.thinningThreshold,
      summaryTokenThreshold: args.summaryTokenThreshold,
    });
  }

  console.log("Parsing done, saving to file...");

  // Save results
  await fsp.writeFile(outputPath, JSON.stringify(result, null, 2));
  console.log(`Tree structure saved to: ${outputPath}`);
}
267
+
268
// Run the CLI. Any rejection is reported on stderr and mapped to exit code 1.
main().catch((error: unknown) => {
  // A rejection is not guaranteed to be an Error instance; avoid printing "undefined".
  const message = error instanceof Error ? error.message : String(error);
  console.error("Error:", message);
  process.exit(1);
});
package/src/index.ts ADDED
@@ -0,0 +1,82 @@
1
+ /**
2
+ * pageindex
3
+ * Vectorless, reasoning-based RAG for document understanding (Node.js, Bun, Deno)
4
+ *
5
+ * @author Antonio Oliveira <antonio@oakoliver.com> (https://oakoliver.com)
6
+ * @license MIT
7
+ */
8
+
9
+ // Main API exports
10
+ export {
11
+ PageIndex,
12
+ createPageIndex,
13
+ indexPdf,
14
+ indexPdfWithLMStudio,
15
+ indexPdfWithOcr,
16
+ indexPdfWithLMStudioOcr,
17
+ } from "./pageindex";
18
+
19
+ // Types
20
+ export type {
21
+ PageIndexOptions,
22
+ MarkdownOptions,
23
+ TreeNode,
24
+ PageIndexResult,
25
+ TocItem,
26
+ PageContent,
27
+ TocCheckResult,
28
+ ExtractionMode,
29
+ OcrPromptType,
30
+ } from "./types";
31
+
32
+ // PDF utilities
33
+ export { parsePdf, getPdfName, type PdfInfo, type PdfPage } from "./pdf";
34
+
35
+ // OCR utilities
36
+ export {
37
+ pdfToImages,
38
+ pdfBufferToImages,
39
+ ocrImage,
40
+ ocrImages,
41
+ parsePdfWithOcr,
42
+ getPdfInfo,
43
+ type OcrOptions,
44
+ } from "./ocr";
45
+
46
+ // OpenAI utilities
47
+ export {
48
+ chatGPT,
49
+ chatGPTWithFinishReason,
50
+ chatGPTBatch,
51
+ getLMStudioConfig,
52
+ getOllamaConfig,
53
+ type ClientConfig,
54
+ type ChatOptions,
55
+ type ChatResult,
56
+ } from "./openai";
57
+
58
+ // Tree utilities
59
+ export {
60
+ writeNodeId,
61
+ getNodes,
62
+ structureToList,
63
+ getLeafNodes,
64
+ isLeafNode,
65
+ listToTree,
66
+ postProcessing,
67
+ printToc,
68
+ countTokens,
69
+ extractJson,
70
+ formatStructure,
71
+ } from "./utils";
72
+
73
+ // Markdown processing
74
+ export {
75
+ mdToTree,
76
+ markdownToTree,
77
+ extractNodesFromMarkdown,
78
+ extractNodeTextContent,
79
+ buildTreeFromNodes,
80
+ treeThinningForIndex,
81
+ printTocMd,
82
+ } from "./markdown";