@kreuzberg/node 4.6.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.1" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -42,13 +42,16 @@
42
42
 
43
43
  <!-- Project Info -->
44
44
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
45
- <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
45
+ <img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
46
46
  </a>
47
47
  <a href="https://docs.kreuzberg.dev">
48
- <img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
48
+ <img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
49
+ </a>
50
+ <a href="https://docs.kreuzberg.dev/demo.html">
51
+ <img src="https://img.shields.io/badge/%E2%96%B6%EF%B8%8F_Live_Demo-007ec6" alt="Live Demo">
49
52
  </a>
50
53
  <a href="https://huggingface.co/Kreuzberg">
51
- <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
54
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97_Hugging_Face-007ec6" alt="Hugging Face">
52
55
  </a>
53
56
  </div>
54
57
 
@@ -61,7 +64,7 @@
61
64
  </div>
62
65
 
63
66
 
64
- Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
67
+ Extract text, tables, images, and metadata from 91+ file formats and 248 programming languages including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
65
68
 
66
69
 
67
70
  ## Installation
@@ -74,6 +77,7 @@ Install via one of the supported package managers:
74
77
 
75
78
 
76
79
  **npm:**
80
+
77
81
  ```bash
78
82
  npm install @kreuzberg/node
79
83
  ```
@@ -82,6 +86,7 @@ npm install @kreuzberg/node
82
86
 
83
87
 
84
88
  **pnpm:**
89
+
85
90
  ```bash
86
91
  pnpm add @kreuzberg/node
87
92
  ```
@@ -90,6 +95,7 @@ pnpm add @kreuzberg/node
90
95
 
91
96
 
92
97
  **yarn:**
98
+
93
99
  ```bash
94
100
  yarn add @kreuzberg/node
95
101
  ```
@@ -107,6 +113,7 @@ yarn add @kreuzberg/node
107
113
  ### Platform Support
108
114
 
109
115
  Pre-built binaries available for:
116
+
110
117
  - macOS (arm64, x64)
111
118
  - Linux (x64)
112
119
  - Windows (x64)
@@ -268,12 +275,14 @@ try {
268
275
 
269
276
 
270
277
  **Performance Benefits:**
278
+
271
279
  - **Parallel Processing**: Multiple documents extracted simultaneously
272
280
  - **CPU Utilization**: Maximizes multi-core CPU usage for large batches
273
281
  - **Queue Management**: Automatically distributes work across available workers
274
282
  - **Resource Control**: Prevents thread exhaustion with configurable pool size
275
283
 
276
284
  **Best Practices:**
285
+
277
286
  - Use worker pools for batches of 10+ documents
278
287
  - Set pool size to number of CPU cores (default behavior)
279
288
  - Always close pools with `closeWorkerPool()` to prevent resource leaks
@@ -366,6 +375,19 @@ This binding uses NAPI-RS to provide native Node.js bindings with:
366
375
  | **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
367
376
  | **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
368
377
 
378
+ #### Code Intelligence (248 Languages)
379
+
380
+ | Feature | Description |
381
+ |---------|-------------|
382
+ | **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
383
+ | **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
384
+ | **Symbol Extraction** | Variables, constants, type aliases, properties |
385
+ | **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
386
+ | **Diagnostics** | Parse errors with line/column positions |
387
+ | **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
388
+
389
+ Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
390
+
369
391
  **[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
370
392
 
371
393
  ### Key Capabilities
@@ -387,6 +409,9 @@ This binding uses NAPI-RS to provide native Node.js bindings with:
387
409
  - **Batch Processing** - Efficiently process multiple documents in parallel
388
410
  - **Memory Efficient** - Stream large files without loading entirely into memory
389
411
  - **Language Detection** - Detect and support multiple languages in documents
412
+
413
+ - **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [248 programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
414
+
390
415
  - **Configuration** - Fine-grained control over extraction behavior
391
416
 
392
417
  ### Performance Characteristics
package/dist/cli.js CHANGED
@@ -44,8 +44,8 @@ function getDirectory() {
44
44
  return (0, import_node_path.dirname)(__filename);
45
45
  }
46
46
  try {
47
- const url = eval("import.meta.url");
48
- return (0, import_node_path.dirname)((0, import_node_url.fileURLToPath)(url));
47
+ const getUrl = new Function("return import.meta.url");
48
+ return (0, import_node_path.dirname)((0, import_node_url.fileURLToPath)(getUrl()));
49
49
  } catch {
50
50
  return process.cwd();
51
51
  }
package/dist/cli.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// biome-ignore lint/security/noGlobalEval: Required to handle both CJS and ESM contexts at runtime\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,eAAO,0BAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAGH,UAAM,MAAM,KAAK,iBAAiB;AAClC,eAAO,8BAAQ,+BAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
1
+ {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM: use Function constructor to avoid static analysis warnings\n\ttry {\n\t\tconst getUrl = new Function(\"return import.meta.url\");\n\t\treturn dirname(fileURLToPath(getUrl()));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,eAAO,0BAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AACH,UAAM,SAAS,IAAI,SAAS,wBAAwB;AACpD,eAAO,8BAAQ,+BAAc,OAAO,CAAC,CAAC;AAAA,EACvC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
package/dist/cli.mjs CHANGED
@@ -17,8 +17,8 @@ function getDirectory() {
17
17
  return dirname(__filename);
18
18
  }
19
19
  try {
20
- const url = eval("import.meta.url");
21
- return dirname(fileURLToPath(url));
20
+ const getUrl = new Function("return import.meta.url");
21
+ return dirname(fileURLToPath(getUrl()));
22
22
  } catch {
23
23
  return process.cwd();
24
24
  }
package/dist/cli.mjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// biome-ignore lint/security/noGlobalEval: Required to handle both CJS and ESM contexts at runtime\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,WAAO,QAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAGH,UAAM,MAAM,KAAK,iBAAiB;AAClC,WAAO,QAAQ,cAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,UAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
1
+ {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM: use Function constructor to avoid static analysis warnings\n\ttry {\n\t\tconst getUrl = new Function(\"return import.meta.url\");\n\t\treturn dirname(fileURLToPath(getUrl()));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,WAAO,QAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AACH,UAAM,SAAS,IAAI,SAAS,wBAAwB;AACpD,WAAO,QAAQ,cAAc,OAAO,CAAC,CAAC;AAAA,EACvC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,UAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
package/dist/index.d.mts CHANGED
@@ -297,6 +297,97 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
297
297
  */
298
298
  declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
299
299
 
300
+ /**
301
+ * PDF page rendering functions.
302
+ *
303
+ * Render individual PDF pages or iterate over all pages as PNG images.
304
+ */
305
+ /**
306
+ * Render a single PDF page to a PNG buffer (synchronous).
307
+ *
308
+ * @param filePath - Path to the PDF file
309
+ * @param pageIndex - Zero-based page index
310
+ * @param options - Optional settings
311
+ * @param options.dpi - DPI for rendering (default 150)
312
+ * @returns Buffer containing PNG image data
313
+ */
314
+ declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
315
+ dpi?: number;
316
+ }): Buffer;
317
+ /**
318
+ * Render a single PDF page to a PNG buffer (asynchronous).
319
+ *
320
+ * @param filePath - Path to the PDF file
321
+ * @param pageIndex - Zero-based page index
322
+ * @param options - Optional settings
323
+ * @param options.dpi - DPI for rendering (default 150)
324
+ * @returns Promise resolving to a Buffer containing PNG image data
325
+ */
326
+ declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
327
+ dpi?: number;
328
+ }): Promise<Buffer>;
329
+ /** A rendered PDF page with its index and PNG data. */
330
+ interface PdfPageResult {
331
+ pageIndex: number;
332
+ data: Buffer;
333
+ }
334
+ /**
335
+ * Collect all PDF pages as PNG images (synchronous).
336
+ *
337
+ * @param filePath - Path to the PDF file
338
+ * @param options - Optional settings
339
+ * @param options.dpi - DPI for rendering (default 150)
340
+ * @returns Array of PdfPageResult objects
341
+ */
342
+ declare function iteratePdfPagesSync(filePath: string, options?: {
343
+ dpi?: number;
344
+ }): PdfPageResult[];
345
+ /**
346
+ * Collect all PDF pages as PNG images (asynchronous).
347
+ *
348
+ * @param filePath - Path to the PDF file
349
+ * @param options - Optional settings
350
+ * @param options.dpi - DPI for rendering (default 150)
351
+ * @returns Promise resolving to an array of PdfPageResult objects
352
+ */
353
+ declare function iteratePdfPages(filePath: string, options?: {
354
+ dpi?: number;
355
+ }): Promise<PdfPageResult[]>;
356
+ /**
357
+ * Get the number of pages in a PDF file.
358
+ *
359
+ * @param filePath - Path to the PDF file
360
+ * @returns Number of pages
361
+ */
362
+ declare function pdfPageCount(filePath: string): number;
363
+ /**
364
+ * Lazy PDF page iterator. Renders one page at a time via `.next()`.
365
+ * Call `.close()` when done to free native resources.
366
+ *
367
+ * @example
368
+ * ```typescript
369
+ * const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
370
+ * let result;
371
+ * while ((result = iter.next()) !== null) {
372
+ * const { pageIndex, data } = result;
373
+ * // process page...
374
+ * }
375
+ * iter.close();
376
+ * ```
377
+ */
378
+ declare class PdfPageIterator {
379
+ private inner;
380
+ constructor(filePath: string, options?: {
381
+ dpi?: number;
382
+ });
383
+ /** Advance and return the next page, or null when exhausted. */
384
+ next(): PdfPageResult | null;
385
+ /** Total number of pages in the PDF. */
386
+ pageCount(): number;
387
+ /** Free native resources. Safe to call multiple times. */
388
+ close(): void;
389
+ }
390
+
300
391
  /**
301
392
  * Single-document extraction APIs.
302
393
  *
@@ -1191,6 +1282,6 @@ declare function __resetBindingForTests(): void;
1191
1282
  * @module @kreuzberg/node
1192
1283
  */
1193
1284
 
1194
- declare const __version__ = "4.6.1";
1285
+ declare const __version__ = "4.7.0";
1195
1286
 
1196
- export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
1287
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.d.ts CHANGED
@@ -297,6 +297,97 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
297
297
  */
298
298
  declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
299
299
 
300
+ /**
301
+ * PDF page rendering functions.
302
+ *
303
+ * Render individual PDF pages or iterate over all pages as PNG images.
304
+ */
305
+ /**
306
+ * Render a single PDF page to a PNG buffer (synchronous).
307
+ *
308
+ * @param filePath - Path to the PDF file
309
+ * @param pageIndex - Zero-based page index
310
+ * @param options - Optional settings
311
+ * @param options.dpi - DPI for rendering (default 150)
312
+ * @returns Buffer containing PNG image data
313
+ */
314
+ declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
315
+ dpi?: number;
316
+ }): Buffer;
317
+ /**
318
+ * Render a single PDF page to a PNG buffer (asynchronous).
319
+ *
320
+ * @param filePath - Path to the PDF file
321
+ * @param pageIndex - Zero-based page index
322
+ * @param options - Optional settings
323
+ * @param options.dpi - DPI for rendering (default 150)
324
+ * @returns Promise resolving to a Buffer containing PNG image data
325
+ */
326
+ declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
327
+ dpi?: number;
328
+ }): Promise<Buffer>;
329
+ /** A rendered PDF page with its index and PNG data. */
330
+ interface PdfPageResult {
331
+ pageIndex: number;
332
+ data: Buffer;
333
+ }
334
+ /**
335
+ * Collect all PDF pages as PNG images (synchronous).
336
+ *
337
+ * @param filePath - Path to the PDF file
338
+ * @param options - Optional settings
339
+ * @param options.dpi - DPI for rendering (default 150)
340
+ * @returns Array of PdfPageResult objects
341
+ */
342
+ declare function iteratePdfPagesSync(filePath: string, options?: {
343
+ dpi?: number;
344
+ }): PdfPageResult[];
345
+ /**
346
+ * Collect all PDF pages as PNG images (asynchronous).
347
+ *
348
+ * @param filePath - Path to the PDF file
349
+ * @param options - Optional settings
350
+ * @param options.dpi - DPI for rendering (default 150)
351
+ * @returns Promise resolving to an array of PdfPageResult objects
352
+ */
353
+ declare function iteratePdfPages(filePath: string, options?: {
354
+ dpi?: number;
355
+ }): Promise<PdfPageResult[]>;
356
+ /**
357
+ * Get the number of pages in a PDF file.
358
+ *
359
+ * @param filePath - Path to the PDF file
360
+ * @returns Number of pages
361
+ */
362
+ declare function pdfPageCount(filePath: string): number;
363
+ /**
364
+ * Lazy PDF page iterator. Renders one page at a time via `.next()`.
365
+ * Call `.close()` when done to free native resources.
366
+ *
367
+ * @example
368
+ * ```typescript
369
+ * const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
370
+ * let result;
371
+ * while ((result = iter.next()) !== null) {
372
+ * const { pageIndex, data } = result;
373
+ * // process page...
374
+ * }
375
+ * iter.close();
376
+ * ```
377
+ */
378
+ declare class PdfPageIterator {
379
+ private inner;
380
+ constructor(filePath: string, options?: {
381
+ dpi?: number;
382
+ });
383
+ /** Advance and return the next page, or null when exhausted. */
384
+ next(): PdfPageResult | null;
385
+ /** Total number of pages in the PDF. */
386
+ pageCount(): number;
387
+ /** Free native resources. Safe to call multiple times. */
388
+ close(): void;
389
+ }
390
+
300
391
  /**
301
392
  * Single-document extraction APIs.
302
393
  *
@@ -1191,6 +1282,6 @@ declare function __resetBindingForTests(): void;
1191
1282
  * @module @kreuzberg/node
1192
1283
  */
1193
1284
 
1194
- declare const __version__ = "4.6.1";
1285
+ declare const __version__ = "4.7.0";
1195
1286
 
1196
- export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
1287
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.js CHANGED
@@ -28,6 +28,7 @@ __export(index_exports, {
28
28
  MissingDependencyError: () => MissingDependencyError,
29
29
  OcrError: () => OcrError,
30
30
  ParsingError: () => ParsingError,
31
+ PdfPageIterator: () => PdfPageIterator,
31
32
  PluginError: () => PluginError,
32
33
  ValidationError: () => ValidationError,
33
34
  __resetBindingForTests: () => __resetBindingForTests,
@@ -59,6 +60,8 @@ __export(index_exports, {
59
60
  getLastErrorCode: () => getLastErrorCode,
60
61
  getLastPanicContext: () => getLastPanicContext,
61
62
  getWorkerPoolStats: () => getWorkerPoolStats,
63
+ iteratePdfPages: () => iteratePdfPages,
64
+ iteratePdfPagesSync: () => iteratePdfPagesSync,
62
65
  listDocumentExtractors: () => listDocumentExtractors,
63
66
  listEmbeddingPresets: () => listEmbeddingPresets,
64
67
  listOcrBackends: () => listOcrBackends,
@@ -66,9 +69,12 @@ __export(index_exports, {
66
69
  listValidators: () => listValidators,
67
70
  loadConfigFile: () => loadConfigFile,
68
71
  loadConfigFromPath: () => loadConfigFromPath,
72
+ pdfPageCount: () => pdfPageCount,
69
73
  registerOcrBackend: () => registerOcrBackend,
70
74
  registerPostProcessor: () => registerPostProcessor,
71
75
  registerValidator: () => registerValidator,
76
+ renderPdfPage: () => renderPdfPage,
77
+ renderPdfPageSync: () => renderPdfPageSync,
72
78
  unregisterDocumentExtractor: () => unregisterDocumentExtractor,
73
79
  unregisterOcrBackend: () => unregisterOcrBackend,
74
80
  unregisterPostProcessor: () => unregisterPostProcessor,
@@ -579,6 +585,7 @@ function convertChunk(rawChunk) {
579
585
  if (!rawChunk || typeof rawChunk !== "object") {
580
586
  return {
581
587
  content: "",
588
+ chunkType: null,
582
589
  metadata: {
583
590
  byteStart: 0,
584
591
  byteEnd: 0,
@@ -592,26 +599,17 @@ function convertChunk(rawChunk) {
592
599
  const chunk = rawChunk;
593
600
  const metadata = chunk["metadata"] ?? {};
594
601
  return {
595
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
596
602
  content: chunk["content"] ?? "",
597
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
603
+ chunkType: chunk["chunk_type"] ?? chunk["chunkType"] ?? null,
598
604
  embedding: chunk["embedding"] ?? null,
599
605
  metadata: {
600
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
601
606
  byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
602
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
603
607
  byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
604
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
605
608
  tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
606
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
607
609
  chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
608
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
609
610
  totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
610
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
611
611
  firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
612
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
613
612
  lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
614
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
615
613
  headingContext: (() => {
616
614
  const hc = metadata["heading_context"] ?? metadata["headingContext"];
617
615
  if (!hc) return null;
@@ -621,9 +619,7 @@ function convertChunk(rawChunk) {
621
619
  headings: headings.map((h) => {
622
620
  const heading = h;
623
621
  return {
624
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
625
622
  level: heading["level"] ?? 0,
626
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
627
623
  text: heading["text"] ?? ""
628
624
  };
629
625
  })
@@ -644,22 +640,14 @@ function convertElement(rawElement) {
644
640
  const element = rawElement;
645
641
  const elementMetadata = element["metadata"] ?? {};
646
642
  return {
647
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
648
643
  elementId: element["element_id"] ?? element["elementId"] ?? "",
649
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
650
644
  elementType: element["element_type"] ?? element["elementType"] ?? "narrative_text",
651
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
652
645
  text: element["text"] ?? "",
653
646
  metadata: {
654
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
655
647
  pageNumber: elementMetadata["page_number"] ?? elementMetadata["pageNumber"] ?? null,
656
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
657
648
  filename: elementMetadata["filename"] ?? null,
658
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
659
649
  coordinates: elementMetadata["coordinates"] ? elementMetadata["coordinates"] : null,
660
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
661
650
  elementIndex: elementMetadata["element_index"] ?? elementMetadata["elementIndex"] ?? null,
662
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
663
651
  additional: elementMetadata["additional"] ?? {}
664
652
  }
665
653
  };
@@ -682,27 +670,16 @@ function convertImage(rawImage) {
682
670
  }
683
671
  const image = rawImage;
684
672
  return {
685
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
686
673
  data: ensureUint8Array(image["data"]),
687
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
688
674
  format: image["format"] ?? "unknown",
689
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
690
675
  imageIndex: image["imageIndex"] ?? 0,
691
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
692
676
  pageNumber: image["pageNumber"] ?? null,
693
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
694
677
  width: image["width"] ?? null,
695
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
696
678
  height: image["height"] ?? null,
697
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
698
679
  colorspace: image["colorspace"] ?? null,
699
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
700
680
  bitsPerComponent: image["bitsPerComponent"] ?? null,
701
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
702
681
  isMask: image["isMask"] ?? false,
703
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
704
682
  description: image["description"] ?? null,
705
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
706
683
  ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
707
684
  };
708
685
  }
@@ -717,15 +694,10 @@ function convertPageContent(rawPage) {
717
694
  }
718
695
  const page = rawPage;
719
696
  return {
720
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
721
697
  pageNumber: page["pageNumber"] ?? 0,
722
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
723
698
  content: page["content"] ?? "",
724
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
725
699
  tables: Array.isArray(page["tables"]) ? page["tables"] : [],
726
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
727
700
  images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
728
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
729
701
  isBlank: page["isBlank"] ?? null
730
702
  };
731
703
  }
@@ -748,20 +720,15 @@ function convertResult(rawResult) {
748
720
  const metadata = result["metadata"];
749
721
  const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
750
722
  const returnObj = {
751
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
752
723
  content: result["content"] ?? "",
753
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
754
724
  mimeType: result["mimeType"] ?? "application/octet-stream",
755
725
  metadata: metadataValue,
756
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
757
726
  tables: Array.isArray(result["tables"]) ? result["tables"] : [],
758
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
759
727
  detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
760
728
  chunks: null,
761
729
  images: null,
762
730
  elements: null,
763
731
  pages: null,
764
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
765
732
  document: result["document"] ?? null
766
733
  };
767
734
  const chunksData = result["chunks"];
@@ -833,6 +800,42 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
833
800
  return rawResults.map(convertResult);
834
801
  }
835
802
 
803
+ // typescript/extraction/render.ts
804
+ function renderPdfPageSync(filePath, pageIndex, options) {
805
+ return getBinding().renderPdfPageSync(filePath, pageIndex, options?.dpi ?? null);
806
+ }
807
+ async function renderPdfPage(filePath, pageIndex, options) {
808
+ return getBinding().renderPdfPage(filePath, pageIndex, options?.dpi ?? null);
809
+ }
810
+ function iteratePdfPagesSync(filePath, options) {
811
+ return getBinding().iteratePdfPagesSync(filePath, options?.dpi ?? null);
812
+ }
813
+ async function iteratePdfPages(filePath, options) {
814
+ return getBinding().iteratePdfPages(filePath, options?.dpi ?? null);
815
+ }
816
+ function pdfPageCount(filePath) {
817
+ return getBinding().pdfPageCount(filePath);
818
+ }
819
+ var PdfPageIterator = class {
820
+ inner;
821
+ constructor(filePath, options) {
822
+ const Ctor = getBinding().JsPdfPageIterator;
823
+ this.inner = new Ctor(filePath, options?.dpi ?? null);
824
+ }
825
+ /** Advance and return the next page, or null when exhausted. */
826
+ next() {
827
+ return this.inner.next();
828
+ }
829
+ /** Total number of pages in the PDF. */
830
+ pageCount() {
831
+ return this.inner.pageCount();
832
+ }
833
+ /** Free native resources. Safe to call multiple times. */
834
+ close() {
835
+ this.inner.close();
836
+ }
837
+ };
838
+
836
839
  // typescript/extraction/single.ts
837
840
  var import_node_fs = require("fs");
838
841
  function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
@@ -1231,7 +1234,7 @@ function getEmbeddingPreset(name) {
1231
1234
  }
1232
1235
 
1233
1236
  // typescript/index.ts
1234
- var __version__ = "4.6.1";
1237
+ var __version__ = "4.7.0";
1235
1238
  // Annotate the CommonJS export names for ESM import in node:
1236
1239
  0 && (module.exports = {
1237
1240
  CacheError,
@@ -1242,6 +1245,7 @@ var __version__ = "4.6.1";
1242
1245
  MissingDependencyError,
1243
1246
  OcrError,
1244
1247
  ParsingError,
1248
+ PdfPageIterator,
1245
1249
  PluginError,
1246
1250
  ValidationError,
1247
1251
  __resetBindingForTests,
@@ -1273,6 +1277,8 @@ var __version__ = "4.6.1";
1273
1277
  getLastErrorCode,
1274
1278
  getLastPanicContext,
1275
1279
  getWorkerPoolStats,
1280
+ iteratePdfPages,
1281
+ iteratePdfPagesSync,
1276
1282
  listDocumentExtractors,
1277
1283
  listEmbeddingPresets,
1278
1284
  listOcrBackends,
@@ -1280,9 +1286,12 @@ var __version__ = "4.6.1";
1280
1286
  listValidators,
1281
1287
  loadConfigFile,
1282
1288
  loadConfigFromPath,
1289
+ pdfPageCount,
1283
1290
  registerOcrBackend,
1284
1291
  registerPostProcessor,
1285
1292
  registerValidator,
1293
+ renderPdfPage,
1294
+ renderPdfPageSync,
1286
1295
  unregisterDocumentExtractor,
1287
1296
  unregisterOcrBackend,
1288
1297
  unregisterPostProcessor,