@kreuzberg/node 4.6.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -5
- package/dist/cli.js +2 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +2 -2
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +93 -2
- package/dist/index.d.ts +93 -2
- package/dist/index.js +51 -42
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +45 -42
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +5 -1
- package/dist/types.d.ts +5 -1
- package/dist/types.js.map +1 -1
- package/index.d.ts +213 -12
- package/index.js +58 -52
- package/package.json +11 -11
package/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -42,13 +42,16 @@
|
|
|
42
42
|
|
|
43
43
|
<!-- Project Info -->
|
|
44
44
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
45
|
-
<img src="https://img.shields.io/badge/License-MIT-
|
|
45
|
+
<img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
|
|
46
46
|
</a>
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
|
-
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-
|
|
48
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
|
|
49
|
+
</a>
|
|
50
|
+
<a href="https://docs.kreuzberg.dev/demo.html">
|
|
51
|
+
<img src="https://img.shields.io/badge/%E2%96%B6%EF%B8%8F_Live_Demo-007ec6" alt="Live Demo">
|
|
49
52
|
</a>
|
|
50
53
|
<a href="https://huggingface.co/Kreuzberg">
|
|
51
|
-
<img src="https://img.shields.io/badge/%F0%9F%A4%
|
|
54
|
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97_Hugging_Face-007ec6" alt="Hugging Face">
|
|
52
55
|
</a>
|
|
53
56
|
</div>
|
|
54
57
|
|
|
@@ -61,7 +64,7 @@
|
|
|
61
64
|
</div>
|
|
62
65
|
|
|
63
66
|
|
|
64
|
-
Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
|
67
|
+
Extract text, tables, images, and metadata from 91+ file formats and 248 programming languages including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
|
65
68
|
|
|
66
69
|
|
|
67
70
|
## Installation
|
|
@@ -74,6 +77,7 @@ Install via one of the supported package managers:
|
|
|
74
77
|
|
|
75
78
|
|
|
76
79
|
**npm:**
|
|
80
|
+
|
|
77
81
|
```bash
|
|
78
82
|
npm install @kreuzberg/node
|
|
79
83
|
```
|
|
@@ -82,6 +86,7 @@ npm install @kreuzberg/node
|
|
|
82
86
|
|
|
83
87
|
|
|
84
88
|
**pnpm:**
|
|
89
|
+
|
|
85
90
|
```bash
|
|
86
91
|
pnpm add @kreuzberg/node
|
|
87
92
|
```
|
|
@@ -90,6 +95,7 @@ pnpm add @kreuzberg/node
|
|
|
90
95
|
|
|
91
96
|
|
|
92
97
|
**yarn:**
|
|
98
|
+
|
|
93
99
|
```bash
|
|
94
100
|
yarn add @kreuzberg/node
|
|
95
101
|
```
|
|
@@ -107,6 +113,7 @@ yarn add @kreuzberg/node
|
|
|
107
113
|
### Platform Support
|
|
108
114
|
|
|
109
115
|
Pre-built binaries available for:
|
|
116
|
+
|
|
110
117
|
- macOS (arm64, x64)
|
|
111
118
|
- Linux (x64)
|
|
112
119
|
- Windows (x64)
|
|
@@ -268,12 +275,14 @@ try {
|
|
|
268
275
|
|
|
269
276
|
|
|
270
277
|
**Performance Benefits:**
|
|
278
|
+
|
|
271
279
|
- **Parallel Processing**: Multiple documents extracted simultaneously
|
|
272
280
|
- **CPU Utilization**: Maximizes multi-core CPU usage for large batches
|
|
273
281
|
- **Queue Management**: Automatically distributes work across available workers
|
|
274
282
|
- **Resource Control**: Prevents thread exhaustion with configurable pool size
|
|
275
283
|
|
|
276
284
|
**Best Practices:**
|
|
285
|
+
|
|
277
286
|
- Use worker pools for batches of 10+ documents
|
|
278
287
|
- Set pool size to number of CPU cores (default behavior)
|
|
279
288
|
- Always close pools with `closeWorkerPool()` to prevent resource leaks
|
|
@@ -366,6 +375,19 @@ This binding uses NAPI-RS to provide native Node.js bindings with:
|
|
|
366
375
|
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
367
376
|
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
368
377
|
|
|
378
|
+
#### Code Intelligence (248 Languages)
|
|
379
|
+
|
|
380
|
+
| Feature | Description |
|
|
381
|
+
|---------|-------------|
|
|
382
|
+
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
|
|
383
|
+
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
|
|
384
|
+
| **Symbol Extraction** | Variables, constants, type aliases, properties |
|
|
385
|
+
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
|
|
386
|
+
| **Diagnostics** | Parse errors with line/column positions |
|
|
387
|
+
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
|
|
388
|
+
|
|
389
|
+
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
|
|
390
|
+
|
|
369
391
|
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
370
392
|
|
|
371
393
|
### Key Capabilities
|
|
@@ -387,6 +409,9 @@ This binding uses NAPI-RS to provide native Node.js bindings with:
|
|
|
387
409
|
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
388
410
|
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
389
411
|
- **Language Detection** - Detect and support multiple languages in documents
|
|
412
|
+
|
|
413
|
+
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [248 programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
|
|
414
|
+
|
|
390
415
|
- **Configuration** - Fine-grained control over extraction behavior
|
|
391
416
|
|
|
392
417
|
### Performance Characteristics
|
package/dist/cli.js
CHANGED
|
@@ -44,8 +44,8 @@ function getDirectory() {
|
|
|
44
44
|
return (0, import_node_path.dirname)(__filename);
|
|
45
45
|
}
|
|
46
46
|
try {
|
|
47
|
-
const
|
|
48
|
-
return (0, import_node_path.dirname)((0, import_node_url.fileURLToPath)(
|
|
47
|
+
const getUrl = new Function("return import.meta.url");
|
|
48
|
+
return (0, import_node_path.dirname)((0, import_node_url.fileURLToPath)(getUrl()));
|
|
49
49
|
} catch {
|
|
50
50
|
return process.cwd();
|
|
51
51
|
}
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM
|
|
1
|
+
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM: use Function constructor to avoid static analysis warnings\n\ttry {\n\t\tconst getUrl = new Function(\"return import.meta.url\");\n\t\treturn dirname(fileURLToPath(getUrl()));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 
1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,eAAO,0BAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AACH,UAAM,SAAS,IAAI,SAAS,wBAAwB;AACpD,eAAO,8BAAQ,+BAAc,OAAO,CAAC,CAAC;AAAA,EACvC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
|
package/dist/cli.mjs
CHANGED
|
@@ -17,8 +17,8 @@ function getDirectory() {
|
|
|
17
17
|
return dirname(__filename);
|
|
18
18
|
}
|
|
19
19
|
try {
|
|
20
|
-
const
|
|
21
|
-
return dirname(fileURLToPath(
|
|
20
|
+
const getUrl = new Function("return import.meta.url");
|
|
21
|
+
return dirname(fileURLToPath(getUrl()));
|
|
22
22
|
} catch {
|
|
23
23
|
return process.cwd();
|
|
24
24
|
}
|
package/dist/cli.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM
|
|
1
|
+
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM: use Function constructor to avoid static analysis warnings\n\ttry {\n\t\tconst getUrl = new Function(\"return import.meta.url\");\n\t\treturn dirname(fileURLToPath(getUrl()));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 
1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,WAAO,QAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AACH,UAAM,SAAS,IAAI,SAAS,wBAAwB;AACpD,WAAO,QAAQ,cAAc,OAAO,CAAC,CAAC;AAAA,EACvC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,UAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
|
package/dist/index.d.mts
CHANGED
|
@@ -297,6 +297,97 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
|
|
|
297
297
|
*/
|
|
298
298
|
declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
299
299
|
|
|
300
|
+
/**
|
|
301
|
+
* PDF page rendering functions.
|
|
302
|
+
*
|
|
303
|
+
* Render individual PDF pages or iterate over all pages as PNG images.
|
|
304
|
+
*/
|
|
305
|
+
/**
|
|
306
|
+
* Render a single PDF page to a PNG buffer (synchronous).
|
|
307
|
+
*
|
|
308
|
+
* @param filePath - Path to the PDF file
|
|
309
|
+
* @param pageIndex - Zero-based page index
|
|
310
|
+
* @param options - Optional settings
|
|
311
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
312
|
+
* @returns Buffer containing PNG image data
|
|
313
|
+
*/
|
|
314
|
+
declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
|
|
315
|
+
dpi?: number;
|
|
316
|
+
}): Buffer;
|
|
317
|
+
/**
|
|
318
|
+
* Render a single PDF page to a PNG buffer (asynchronous).
|
|
319
|
+
*
|
|
320
|
+
* @param filePath - Path to the PDF file
|
|
321
|
+
* @param pageIndex - Zero-based page index
|
|
322
|
+
* @param options - Optional settings
|
|
323
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
324
|
+
* @returns Promise resolving to a Buffer containing PNG image data
|
|
325
|
+
*/
|
|
326
|
+
declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
|
|
327
|
+
dpi?: number;
|
|
328
|
+
}): Promise<Buffer>;
|
|
329
|
+
/** A rendered PDF page with its index and PNG data. */
|
|
330
|
+
interface PdfPageResult {
|
|
331
|
+
pageIndex: number;
|
|
332
|
+
data: Buffer;
|
|
333
|
+
}
|
|
334
|
+
/**
|
|
335
|
+
* Collect all PDF pages as PNG images (synchronous).
|
|
336
|
+
*
|
|
337
|
+
* @param filePath - Path to the PDF file
|
|
338
|
+
* @param options - Optional settings
|
|
339
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
340
|
+
* @returns Array of PdfPageResult objects
|
|
341
|
+
*/
|
|
342
|
+
declare function iteratePdfPagesSync(filePath: string, options?: {
|
|
343
|
+
dpi?: number;
|
|
344
|
+
}): PdfPageResult[];
|
|
345
|
+
/**
|
|
346
|
+
* Collect all PDF pages as PNG images (asynchronous).
|
|
347
|
+
*
|
|
348
|
+
* @param filePath - Path to the PDF file
|
|
349
|
+
* @param options - Optional settings
|
|
350
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
351
|
+
* @returns Promise resolving to an array of PdfPageResult objects
|
|
352
|
+
*/
|
|
353
|
+
declare function iteratePdfPages(filePath: string, options?: {
|
|
354
|
+
dpi?: number;
|
|
355
|
+
}): Promise<PdfPageResult[]>;
|
|
356
|
+
/**
|
|
357
|
+
* Get the number of pages in a PDF file.
|
|
358
|
+
*
|
|
359
|
+
* @param filePath - Path to the PDF file
|
|
360
|
+
* @returns Number of pages
|
|
361
|
+
*/
|
|
362
|
+
declare function pdfPageCount(filePath: string): number;
|
|
363
|
+
/**
|
|
364
|
+
* Lazy PDF page iterator. Renders one page at a time via `.next()`.
|
|
365
|
+
* Call `.close()` when done to free native resources.
|
|
366
|
+
*
|
|
367
|
+
* @example
|
|
368
|
+
* ```typescript
|
|
369
|
+
* const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
|
|
370
|
+
* let result;
|
|
371
|
+
* while ((result = iter.next()) !== null) {
|
|
372
|
+
* const { pageIndex, data } = result;
|
|
373
|
+
* // process page...
|
|
374
|
+
* }
|
|
375
|
+
* iter.close();
|
|
376
|
+
* ```
|
|
377
|
+
*/
|
|
378
|
+
declare class PdfPageIterator {
|
|
379
|
+
private inner;
|
|
380
|
+
constructor(filePath: string, options?: {
|
|
381
|
+
dpi?: number;
|
|
382
|
+
});
|
|
383
|
+
/** Advance and return the next page, or null when exhausted. */
|
|
384
|
+
next(): PdfPageResult | null;
|
|
385
|
+
/** Total number of pages in the PDF. */
|
|
386
|
+
pageCount(): number;
|
|
387
|
+
/** Free native resources. Safe to call multiple times. */
|
|
388
|
+
close(): void;
|
|
389
|
+
}
|
|
390
|
+
|
|
300
391
|
/**
|
|
301
392
|
* Single-document extraction APIs.
|
|
302
393
|
*
|
|
@@ -1191,6 +1282,6 @@ declare function __resetBindingForTests(): void;
|
|
|
1191
1282
|
* @module @kreuzberg/node
|
|
1192
1283
|
*/
|
|
1193
1284
|
|
|
1194
|
-
declare const __version__ = "4.
|
|
1285
|
+
declare const __version__ = "4.7.0";
|
|
1195
1286
|
|
|
1196
|
-
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
|
1287
|
+
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.d.ts
CHANGED
|
@@ -297,6 +297,97 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
|
|
|
297
297
|
*/
|
|
298
298
|
declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
299
299
|
|
|
300
|
+
/**
|
|
301
|
+
* PDF page rendering functions.
|
|
302
|
+
*
|
|
303
|
+
* Render individual PDF pages or iterate over all pages as PNG images.
|
|
304
|
+
*/
|
|
305
|
+
/**
|
|
306
|
+
* Render a single PDF page to a PNG buffer (synchronous).
|
|
307
|
+
*
|
|
308
|
+
* @param filePath - Path to the PDF file
|
|
309
|
+
* @param pageIndex - Zero-based page index
|
|
310
|
+
* @param options - Optional settings
|
|
311
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
312
|
+
* @returns Buffer containing PNG image data
|
|
313
|
+
*/
|
|
314
|
+
declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
|
|
315
|
+
dpi?: number;
|
|
316
|
+
}): Buffer;
|
|
317
|
+
/**
|
|
318
|
+
* Render a single PDF page to a PNG buffer (asynchronous).
|
|
319
|
+
*
|
|
320
|
+
* @param filePath - Path to the PDF file
|
|
321
|
+
* @param pageIndex - Zero-based page index
|
|
322
|
+
* @param options - Optional settings
|
|
323
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
324
|
+
* @returns Promise resolving to a Buffer containing PNG image data
|
|
325
|
+
*/
|
|
326
|
+
declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
|
|
327
|
+
dpi?: number;
|
|
328
|
+
}): Promise<Buffer>;
|
|
329
|
+
/** A rendered PDF page with its index and PNG data. */
|
|
330
|
+
interface PdfPageResult {
|
|
331
|
+
pageIndex: number;
|
|
332
|
+
data: Buffer;
|
|
333
|
+
}
|
|
334
|
+
/**
|
|
335
|
+
* Collect all PDF pages as PNG images (synchronous).
|
|
336
|
+
*
|
|
337
|
+
* @param filePath - Path to the PDF file
|
|
338
|
+
* @param options - Optional settings
|
|
339
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
340
|
+
* @returns Array of PdfPageResult objects
|
|
341
|
+
*/
|
|
342
|
+
declare function iteratePdfPagesSync(filePath: string, options?: {
|
|
343
|
+
dpi?: number;
|
|
344
|
+
}): PdfPageResult[];
|
|
345
|
+
/**
|
|
346
|
+
* Collect all PDF pages as PNG images (asynchronous).
|
|
347
|
+
*
|
|
348
|
+
* @param filePath - Path to the PDF file
|
|
349
|
+
* @param options - Optional settings
|
|
350
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
351
|
+
* @returns Promise resolving to an array of PdfPageResult objects
|
|
352
|
+
*/
|
|
353
|
+
declare function iteratePdfPages(filePath: string, options?: {
|
|
354
|
+
dpi?: number;
|
|
355
|
+
}): Promise<PdfPageResult[]>;
|
|
356
|
+
/**
|
|
357
|
+
* Get the number of pages in a PDF file.
|
|
358
|
+
*
|
|
359
|
+
* @param filePath - Path to the PDF file
|
|
360
|
+
* @returns Number of pages
|
|
361
|
+
*/
|
|
362
|
+
declare function pdfPageCount(filePath: string): number;
|
|
363
|
+
/**
|
|
364
|
+
* Lazy PDF page iterator. Renders one page at a time via `.next()`.
|
|
365
|
+
* Call `.close()` when done to free native resources.
|
|
366
|
+
*
|
|
367
|
+
* @example
|
|
368
|
+
* ```typescript
|
|
369
|
+
* const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
|
|
370
|
+
* let result;
|
|
371
|
+
* while ((result = iter.next()) !== null) {
|
|
372
|
+
* const { pageIndex, data } = result;
|
|
373
|
+
* // process page...
|
|
374
|
+
* }
|
|
375
|
+
* iter.close();
|
|
376
|
+
* ```
|
|
377
|
+
*/
|
|
378
|
+
declare class PdfPageIterator {
|
|
379
|
+
private inner;
|
|
380
|
+
constructor(filePath: string, options?: {
|
|
381
|
+
dpi?: number;
|
|
382
|
+
});
|
|
383
|
+
/** Advance and return the next page, or null when exhausted. */
|
|
384
|
+
next(): PdfPageResult | null;
|
|
385
|
+
/** Total number of pages in the PDF. */
|
|
386
|
+
pageCount(): number;
|
|
387
|
+
/** Free native resources. Safe to call multiple times. */
|
|
388
|
+
close(): void;
|
|
389
|
+
}
|
|
390
|
+
|
|
300
391
|
/**
|
|
301
392
|
* Single-document extraction APIs.
|
|
302
393
|
*
|
|
@@ -1191,6 +1282,6 @@ declare function __resetBindingForTests(): void;
|
|
|
1191
1282
|
* @module @kreuzberg/node
|
|
1192
1283
|
*/
|
|
1193
1284
|
|
|
1194
|
-
declare const __version__ = "4.
|
|
1285
|
+
declare const __version__ = "4.7.0";
|
|
1195
1286
|
|
|
1196
|
-
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
|
1287
|
+
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.js
CHANGED
|
@@ -28,6 +28,7 @@ __export(index_exports, {
|
|
|
28
28
|
MissingDependencyError: () => MissingDependencyError,
|
|
29
29
|
OcrError: () => OcrError,
|
|
30
30
|
ParsingError: () => ParsingError,
|
|
31
|
+
PdfPageIterator: () => PdfPageIterator,
|
|
31
32
|
PluginError: () => PluginError,
|
|
32
33
|
ValidationError: () => ValidationError,
|
|
33
34
|
__resetBindingForTests: () => __resetBindingForTests,
|
|
@@ -59,6 +60,8 @@ __export(index_exports, {
|
|
|
59
60
|
getLastErrorCode: () => getLastErrorCode,
|
|
60
61
|
getLastPanicContext: () => getLastPanicContext,
|
|
61
62
|
getWorkerPoolStats: () => getWorkerPoolStats,
|
|
63
|
+
iteratePdfPages: () => iteratePdfPages,
|
|
64
|
+
iteratePdfPagesSync: () => iteratePdfPagesSync,
|
|
62
65
|
listDocumentExtractors: () => listDocumentExtractors,
|
|
63
66
|
listEmbeddingPresets: () => listEmbeddingPresets,
|
|
64
67
|
listOcrBackends: () => listOcrBackends,
|
|
@@ -66,9 +69,12 @@ __export(index_exports, {
|
|
|
66
69
|
listValidators: () => listValidators,
|
|
67
70
|
loadConfigFile: () => loadConfigFile,
|
|
68
71
|
loadConfigFromPath: () => loadConfigFromPath,
|
|
72
|
+
pdfPageCount: () => pdfPageCount,
|
|
69
73
|
registerOcrBackend: () => registerOcrBackend,
|
|
70
74
|
registerPostProcessor: () => registerPostProcessor,
|
|
71
75
|
registerValidator: () => registerValidator,
|
|
76
|
+
renderPdfPage: () => renderPdfPage,
|
|
77
|
+
renderPdfPageSync: () => renderPdfPageSync,
|
|
72
78
|
unregisterDocumentExtractor: () => unregisterDocumentExtractor,
|
|
73
79
|
unregisterOcrBackend: () => unregisterOcrBackend,
|
|
74
80
|
unregisterPostProcessor: () => unregisterPostProcessor,
|
|
@@ -579,6 +585,7 @@ function convertChunk(rawChunk) {
|
|
|
579
585
|
if (!rawChunk || typeof rawChunk !== "object") {
|
|
580
586
|
return {
|
|
581
587
|
content: "",
|
|
588
|
+
chunkType: null,
|
|
582
589
|
metadata: {
|
|
583
590
|
byteStart: 0,
|
|
584
591
|
byteEnd: 0,
|
|
@@ -592,26 +599,17 @@ function convertChunk(rawChunk) {
|
|
|
592
599
|
const chunk = rawChunk;
|
|
593
600
|
const metadata = chunk["metadata"] ?? {};
|
|
594
601
|
return {
|
|
595
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
596
602
|
content: chunk["content"] ?? "",
|
|
597
|
-
|
|
603
|
+
chunkType: chunk["chunk_type"] ?? chunk["chunkType"] ?? null,
|
|
598
604
|
embedding: chunk["embedding"] ?? null,
|
|
599
605
|
metadata: {
|
|
600
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
601
606
|
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
602
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
603
607
|
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
604
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
605
608
|
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
606
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
607
609
|
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
608
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
609
610
|
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
610
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
611
611
|
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
612
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
613
612
|
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
|
|
614
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
615
613
|
headingContext: (() => {
|
|
616
614
|
const hc = metadata["heading_context"] ?? metadata["headingContext"];
|
|
617
615
|
if (!hc) return null;
|
|
@@ -621,9 +619,7 @@ function convertChunk(rawChunk) {
|
|
|
621
619
|
headings: headings.map((h) => {
|
|
622
620
|
const heading = h;
|
|
623
621
|
return {
|
|
624
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
625
622
|
level: heading["level"] ?? 0,
|
|
626
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
627
623
|
text: heading["text"] ?? ""
|
|
628
624
|
};
|
|
629
625
|
})
|
|
@@ -644,22 +640,14 @@ function convertElement(rawElement) {
|
|
|
644
640
|
const element = rawElement;
|
|
645
641
|
const elementMetadata = element["metadata"] ?? {};
|
|
646
642
|
return {
|
|
647
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
648
643
|
elementId: element["element_id"] ?? element["elementId"] ?? "",
|
|
649
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
650
644
|
elementType: element["element_type"] ?? element["elementType"] ?? "narrative_text",
|
|
651
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
652
645
|
text: element["text"] ?? "",
|
|
653
646
|
metadata: {
|
|
654
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
655
647
|
pageNumber: elementMetadata["page_number"] ?? elementMetadata["pageNumber"] ?? null,
|
|
656
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
657
648
|
filename: elementMetadata["filename"] ?? null,
|
|
658
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
659
649
|
coordinates: elementMetadata["coordinates"] ? elementMetadata["coordinates"] : null,
|
|
660
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
661
650
|
elementIndex: elementMetadata["element_index"] ?? elementMetadata["elementIndex"] ?? null,
|
|
662
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
663
651
|
additional: elementMetadata["additional"] ?? {}
|
|
664
652
|
}
|
|
665
653
|
};
|
|
@@ -682,27 +670,16 @@ function convertImage(rawImage) {
|
|
|
682
670
|
}
|
|
683
671
|
const image = rawImage;
|
|
684
672
|
return {
|
|
685
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
686
673
|
data: ensureUint8Array(image["data"]),
|
|
687
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
688
674
|
format: image["format"] ?? "unknown",
|
|
689
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
690
675
|
imageIndex: image["imageIndex"] ?? 0,
|
|
691
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
692
676
|
pageNumber: image["pageNumber"] ?? null,
|
|
693
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
694
677
|
width: image["width"] ?? null,
|
|
695
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
696
678
|
height: image["height"] ?? null,
|
|
697
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
698
679
|
colorspace: image["colorspace"] ?? null,
|
|
699
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
700
680
|
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
701
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
702
681
|
isMask: image["isMask"] ?? false,
|
|
703
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
704
682
|
description: image["description"] ?? null,
|
|
705
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
706
683
|
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
707
684
|
};
|
|
708
685
|
}
|
|
@@ -717,15 +694,10 @@ function convertPageContent(rawPage) {
|
|
|
717
694
|
}
|
|
718
695
|
const page = rawPage;
|
|
719
696
|
return {
|
|
720
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
721
697
|
pageNumber: page["pageNumber"] ?? 0,
|
|
722
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
723
698
|
content: page["content"] ?? "",
|
|
724
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
725
699
|
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
726
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
727
700
|
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
|
|
728
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
729
701
|
isBlank: page["isBlank"] ?? null
|
|
730
702
|
};
|
|
731
703
|
}
|
|
@@ -748,20 +720,15 @@ function convertResult(rawResult) {
|
|
|
748
720
|
const metadata = result["metadata"];
|
|
749
721
|
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
750
722
|
const returnObj = {
|
|
751
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
752
723
|
content: result["content"] ?? "",
|
|
753
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
754
724
|
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
755
725
|
metadata: metadataValue,
|
|
756
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
757
726
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
758
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
759
727
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
760
728
|
chunks: null,
|
|
761
729
|
images: null,
|
|
762
730
|
elements: null,
|
|
763
731
|
pages: null,
|
|
764
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
765
732
|
document: result["document"] ?? null
|
|
766
733
|
};
|
|
767
734
|
const chunksData = result["chunks"];
|
|
@@ -833,6 +800,42 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
|
833
800
|
return rawResults.map(convertResult);
|
|
834
801
|
}
|
|
835
802
|
|
|
803
|
+
// typescript/extraction/render.ts
|
|
804
|
+
function renderPdfPageSync(filePath, pageIndex, options) {
|
|
805
|
+
return getBinding().renderPdfPageSync(filePath, pageIndex, options?.dpi ?? null);
|
|
806
|
+
}
|
|
807
|
+
async function renderPdfPage(filePath, pageIndex, options) {
|
|
808
|
+
return getBinding().renderPdfPage(filePath, pageIndex, options?.dpi ?? null);
|
|
809
|
+
}
|
|
810
|
+
function iteratePdfPagesSync(filePath, options) {
|
|
811
|
+
return getBinding().iteratePdfPagesSync(filePath, options?.dpi ?? null);
|
|
812
|
+
}
|
|
813
|
+
async function iteratePdfPages(filePath, options) {
|
|
814
|
+
return getBinding().iteratePdfPages(filePath, options?.dpi ?? null);
|
|
815
|
+
}
|
|
816
|
+
function pdfPageCount(filePath) {
|
|
817
|
+
return getBinding().pdfPageCount(filePath);
|
|
818
|
+
}
|
|
819
|
+
var PdfPageIterator = class {
|
|
820
|
+
inner;
|
|
821
|
+
constructor(filePath, options) {
|
|
822
|
+
const Ctor = getBinding().JsPdfPageIterator;
|
|
823
|
+
this.inner = new Ctor(filePath, options?.dpi ?? null);
|
|
824
|
+
}
|
|
825
|
+
/** Advance and return the next page, or null when exhausted. */
|
|
826
|
+
next() {
|
|
827
|
+
return this.inner.next();
|
|
828
|
+
}
|
|
829
|
+
/** Total number of pages in the PDF. */
|
|
830
|
+
pageCount() {
|
|
831
|
+
return this.inner.pageCount();
|
|
832
|
+
}
|
|
833
|
+
/** Free native resources. Safe to call multiple times. */
|
|
834
|
+
close() {
|
|
835
|
+
this.inner.close();
|
|
836
|
+
}
|
|
837
|
+
};
|
|
838
|
+
|
|
836
839
|
// typescript/extraction/single.ts
|
|
837
840
|
var import_node_fs = require("fs");
|
|
838
841
|
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
@@ -1231,7 +1234,7 @@ function getEmbeddingPreset(name) {
|
|
|
1231
1234
|
}
|
|
1232
1235
|
|
|
1233
1236
|
// typescript/index.ts
|
|
1234
|
-
var __version__ = "4.6.1";
|
|
1237
|
+
var __version__ = "4.7.0";
|
|
1235
1238
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1236
1239
|
0 && (module.exports = {
|
|
1237
1240
|
CacheError,
|
|
@@ -1242,6 +1245,7 @@ var __version__ = "4.6.1";
|
|
|
1242
1245
|
MissingDependencyError,
|
|
1243
1246
|
OcrError,
|
|
1244
1247
|
ParsingError,
|
|
1248
|
+
PdfPageIterator,
|
|
1245
1249
|
PluginError,
|
|
1246
1250
|
ValidationError,
|
|
1247
1251
|
__resetBindingForTests,
|
|
@@ -1273,6 +1277,8 @@ var __version__ = "4.6.1";
|
|
|
1273
1277
|
getLastErrorCode,
|
|
1274
1278
|
getLastPanicContext,
|
|
1275
1279
|
getWorkerPoolStats,
|
|
1280
|
+
iteratePdfPages,
|
|
1281
|
+
iteratePdfPagesSync,
|
|
1276
1282
|
listDocumentExtractors,
|
|
1277
1283
|
listEmbeddingPresets,
|
|
1278
1284
|
listOcrBackends,
|
|
@@ -1280,9 +1286,12 @@ var __version__ = "4.6.1";
|
|
|
1280
1286
|
listValidators,
|
|
1281
1287
|
loadConfigFile,
|
|
1282
1288
|
loadConfigFromPath,
|
|
1289
|
+
pdfPageCount,
|
|
1283
1290
|
registerOcrBackend,
|
|
1284
1291
|
registerPostProcessor,
|
|
1285
1292
|
registerValidator,
|
|
1293
|
+
renderPdfPage,
|
|
1294
|
+
renderPdfPageSync,
|
|
1286
1295
|
unregisterDocumentExtractor,
|
|
1287
1296
|
unregisterOcrBackend,
|
|
1288
1297
|
unregisterPostProcessor,
|