@kreuzberg/node 4.6.0 → 4.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.3" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
package/dist/cli.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// biome-ignore lint/security/noGlobalEval: Required to handle both CJS and ESM contexts at runtime\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,eAAO,0BAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAGH,UAAM,MAAM,KAAK,iBAAiB;AAClC,eAAO,8BAAQ,+BAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
1
+ {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// oxlint-disable-next-line no-eval -- Required for CJS/ESM compat\n\t\t// biome-ignore lint/security/noGlobalEval: CJS/ESM compat\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,eAAO,0BAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAIH,UAAM,MAAM,KAAK,iBAAiB;AAClC,eAAO,8BAAQ,+BAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
package/dist/cli.mjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// biome-ignore lint/security/noGlobalEval: Required to handle both CJS and ESM contexts at runtime\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,WAAO,QAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAGH,UAAM,MAAM,KAAK,iBAAiB;AAClC,WAAO,QAAQ,cAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,UAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
1
+ {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// oxlint-disable-next-line no-eval -- Required for CJS/ESM compat\n\t\t// biome-ignore lint/security/noGlobalEval: CJS/ESM compat\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,WAAO,QAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAIH,UAAM,MAAM,KAAK,iBAAiB;AAClC,WAAO,QAAQ,cAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,UAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
package/dist/index.d.mts CHANGED
@@ -297,6 +297,97 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
297
297
  */
298
298
  declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
299
299
 
300
+ /**
301
+ * PDF page rendering functions.
302
+ *
303
+ * Render individual PDF pages or iterate over all pages as PNG images.
304
+ */
305
+ /**
306
+ * Render a single PDF page to a PNG buffer (synchronous).
307
+ *
308
+ * @param filePath - Path to the PDF file
309
+ * @param pageIndex - Zero-based page index
310
+ * @param options - Optional settings
311
+ * @param options.dpi - DPI for rendering (default 150)
312
+ * @returns Buffer containing PNG image data
313
+ */
314
+ declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
315
+ dpi?: number;
316
+ }): Buffer;
317
+ /**
318
+ * Render a single PDF page to a PNG buffer (asynchronous).
319
+ *
320
+ * @param filePath - Path to the PDF file
321
+ * @param pageIndex - Zero-based page index
322
+ * @param options - Optional settings
323
+ * @param options.dpi - DPI for rendering (default 150)
324
+ * @returns Promise resolving to a Buffer containing PNG image data
325
+ */
326
+ declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
327
+ dpi?: number;
328
+ }): Promise<Buffer>;
329
+ /** A rendered PDF page with its index and PNG data. */
330
+ interface PdfPageResult {
331
+ pageIndex: number;
332
+ data: Buffer;
333
+ }
334
+ /**
335
+ * Collect all PDF pages as PNG images (synchronous).
336
+ *
337
+ * @param filePath - Path to the PDF file
338
+ * @param options - Optional settings
339
+ * @param options.dpi - DPI for rendering (default 150)
340
+ * @returns Array of PdfPageResult objects
341
+ */
342
+ declare function iteratePdfPagesSync(filePath: string, options?: {
343
+ dpi?: number;
344
+ }): PdfPageResult[];
345
+ /**
346
+ * Collect all PDF pages as PNG images (asynchronous).
347
+ *
348
+ * @param filePath - Path to the PDF file
349
+ * @param options - Optional settings
350
+ * @param options.dpi - DPI for rendering (default 150)
351
+ * @returns Promise resolving to an array of PdfPageResult objects
352
+ */
353
+ declare function iteratePdfPages(filePath: string, options?: {
354
+ dpi?: number;
355
+ }): Promise<PdfPageResult[]>;
356
+ /**
357
+ * Get the number of pages in a PDF file.
358
+ *
359
+ * @param filePath - Path to the PDF file
360
+ * @returns Number of pages
361
+ */
362
+ declare function pdfPageCount(filePath: string): number;
363
+ /**
364
+ * Lazy PDF page iterator. Renders one page at a time via `.next()`.
365
+ * Call `.close()` when done to free native resources.
366
+ *
367
+ * @example
368
+ * ```typescript
369
+ * const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
370
+ * let result;
371
+ * while ((result = iter.next()) !== null) {
372
+ * const { pageIndex, data } = result;
373
+ * // process page...
374
+ * }
375
+ * iter.close();
376
+ * ```
377
+ */
378
+ declare class PdfPageIterator {
379
+ private inner;
380
+ constructor(filePath: string, options?: {
381
+ dpi?: number;
382
+ });
383
+ /** Advance and return the next page, or null when exhausted. */
384
+ next(): PdfPageResult | null;
385
+ /** Total number of pages in the PDF. */
386
+ pageCount(): number;
387
+ /** Free native resources. Safe to call multiple times. */
388
+ close(): void;
389
+ }
390
+
300
391
  /**
301
392
  * Single-document extraction APIs.
302
393
  *
@@ -1191,6 +1282,6 @@ declare function __resetBindingForTests(): void;
1191
1282
  * @module @kreuzberg/node
1192
1283
  */
1193
1284
 
1194
- declare const __version__ = "4.6.0";
1285
+ declare const __version__ = "4.6.3";
1195
1286
 
1196
- export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
1287
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.d.ts CHANGED
@@ -297,6 +297,97 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
297
297
  */
298
298
  declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
299
299
 
300
+ /**
301
+ * PDF page rendering functions.
302
+ *
303
+ * Render individual PDF pages or iterate over all pages as PNG images.
304
+ */
305
+ /**
306
+ * Render a single PDF page to a PNG buffer (synchronous).
307
+ *
308
+ * @param filePath - Path to the PDF file
309
+ * @param pageIndex - Zero-based page index
310
+ * @param options - Optional settings
311
+ * @param options.dpi - DPI for rendering (default 150)
312
+ * @returns Buffer containing PNG image data
313
+ */
314
+ declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
315
+ dpi?: number;
316
+ }): Buffer;
317
+ /**
318
+ * Render a single PDF page to a PNG buffer (asynchronous).
319
+ *
320
+ * @param filePath - Path to the PDF file
321
+ * @param pageIndex - Zero-based page index
322
+ * @param options - Optional settings
323
+ * @param options.dpi - DPI for rendering (default 150)
324
+ * @returns Promise resolving to a Buffer containing PNG image data
325
+ */
326
+ declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
327
+ dpi?: number;
328
+ }): Promise<Buffer>;
329
+ /** A rendered PDF page with its index and PNG data. */
330
+ interface PdfPageResult {
331
+ pageIndex: number;
332
+ data: Buffer;
333
+ }
334
+ /**
335
+ * Collect all PDF pages as PNG images (synchronous).
336
+ *
337
+ * @param filePath - Path to the PDF file
338
+ * @param options - Optional settings
339
+ * @param options.dpi - DPI for rendering (default 150)
340
+ * @returns Array of PdfPageResult objects
341
+ */
342
+ declare function iteratePdfPagesSync(filePath: string, options?: {
343
+ dpi?: number;
344
+ }): PdfPageResult[];
345
+ /**
346
+ * Collect all PDF pages as PNG images (asynchronous).
347
+ *
348
+ * @param filePath - Path to the PDF file
349
+ * @param options - Optional settings
350
+ * @param options.dpi - DPI for rendering (default 150)
351
+ * @returns Promise resolving to an array of PdfPageResult objects
352
+ */
353
+ declare function iteratePdfPages(filePath: string, options?: {
354
+ dpi?: number;
355
+ }): Promise<PdfPageResult[]>;
356
+ /**
357
+ * Get the number of pages in a PDF file.
358
+ *
359
+ * @param filePath - Path to the PDF file
360
+ * @returns Number of pages
361
+ */
362
+ declare function pdfPageCount(filePath: string): number;
363
+ /**
364
+ * Lazy PDF page iterator. Renders one page at a time via `.next()`.
365
+ * Call `.close()` when done to free native resources.
366
+ *
367
+ * @example
368
+ * ```typescript
369
+ * const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
370
+ * let result;
371
+ * while ((result = iter.next()) !== null) {
372
+ * const { pageIndex, data } = result;
373
+ * // process page...
374
+ * }
375
+ * iter.close();
376
+ * ```
377
+ */
378
+ declare class PdfPageIterator {
379
+ private inner;
380
+ constructor(filePath: string, options?: {
381
+ dpi?: number;
382
+ });
383
+ /** Advance and return the next page, or null when exhausted. */
384
+ next(): PdfPageResult | null;
385
+ /** Total number of pages in the PDF. */
386
+ pageCount(): number;
387
+ /** Free native resources. Safe to call multiple times. */
388
+ close(): void;
389
+ }
390
+
300
391
  /**
301
392
  * Single-document extraction APIs.
302
393
  *
@@ -1191,6 +1282,6 @@ declare function __resetBindingForTests(): void;
1191
1282
  * @module @kreuzberg/node
1192
1283
  */
1193
1284
 
1194
- declare const __version__ = "4.6.0";
1285
+ declare const __version__ = "4.6.3";
1195
1286
 
1196
- export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
1287
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.js CHANGED
@@ -28,6 +28,7 @@ __export(index_exports, {
28
28
  MissingDependencyError: () => MissingDependencyError,
29
29
  OcrError: () => OcrError,
30
30
  ParsingError: () => ParsingError,
31
+ PdfPageIterator: () => PdfPageIterator,
31
32
  PluginError: () => PluginError,
32
33
  ValidationError: () => ValidationError,
33
34
  __resetBindingForTests: () => __resetBindingForTests,
@@ -59,6 +60,8 @@ __export(index_exports, {
59
60
  getLastErrorCode: () => getLastErrorCode,
60
61
  getLastPanicContext: () => getLastPanicContext,
61
62
  getWorkerPoolStats: () => getWorkerPoolStats,
63
+ iteratePdfPages: () => iteratePdfPages,
64
+ iteratePdfPagesSync: () => iteratePdfPagesSync,
62
65
  listDocumentExtractors: () => listDocumentExtractors,
63
66
  listEmbeddingPresets: () => listEmbeddingPresets,
64
67
  listOcrBackends: () => listOcrBackends,
@@ -66,9 +69,12 @@ __export(index_exports, {
66
69
  listValidators: () => listValidators,
67
70
  loadConfigFile: () => loadConfigFile,
68
71
  loadConfigFromPath: () => loadConfigFromPath,
72
+ pdfPageCount: () => pdfPageCount,
69
73
  registerOcrBackend: () => registerOcrBackend,
70
74
  registerPostProcessor: () => registerPostProcessor,
71
75
  registerValidator: () => registerValidator,
76
+ renderPdfPage: () => renderPdfPage,
77
+ renderPdfPageSync: () => renderPdfPageSync,
72
78
  unregisterDocumentExtractor: () => unregisterDocumentExtractor,
73
79
  unregisterOcrBackend: () => unregisterOcrBackend,
74
80
  unregisterPostProcessor: () => unregisterPostProcessor,
@@ -592,26 +598,16 @@ function convertChunk(rawChunk) {
592
598
  const chunk = rawChunk;
593
599
  const metadata = chunk["metadata"] ?? {};
594
600
  return {
595
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
596
601
  content: chunk["content"] ?? "",
597
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
598
602
  embedding: chunk["embedding"] ?? null,
599
603
  metadata: {
600
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
601
604
  byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
602
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
603
605
  byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
604
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
605
606
  tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
606
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
607
607
  chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
608
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
609
608
  totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
610
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
611
609
  firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
612
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
613
610
  lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
614
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
615
611
  headingContext: (() => {
616
612
  const hc = metadata["heading_context"] ?? metadata["headingContext"];
617
613
  if (!hc) return null;
@@ -621,9 +617,7 @@ function convertChunk(rawChunk) {
621
617
  headings: headings.map((h) => {
622
618
  const heading = h;
623
619
  return {
624
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
625
620
  level: heading["level"] ?? 0,
626
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
627
621
  text: heading["text"] ?? ""
628
622
  };
629
623
  })
@@ -644,22 +638,14 @@ function convertElement(rawElement) {
644
638
  const element = rawElement;
645
639
  const elementMetadata = element["metadata"] ?? {};
646
640
  return {
647
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
648
641
  elementId: element["element_id"] ?? element["elementId"] ?? "",
649
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
650
642
  elementType: element["element_type"] ?? element["elementType"] ?? "narrative_text",
651
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
652
643
  text: element["text"] ?? "",
653
644
  metadata: {
654
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
655
645
  pageNumber: elementMetadata["page_number"] ?? elementMetadata["pageNumber"] ?? null,
656
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
657
646
  filename: elementMetadata["filename"] ?? null,
658
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
659
647
  coordinates: elementMetadata["coordinates"] ? elementMetadata["coordinates"] : null,
660
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
661
648
  elementIndex: elementMetadata["element_index"] ?? elementMetadata["elementIndex"] ?? null,
662
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
663
649
  additional: elementMetadata["additional"] ?? {}
664
650
  }
665
651
  };
@@ -682,27 +668,16 @@ function convertImage(rawImage) {
682
668
  }
683
669
  const image = rawImage;
684
670
  return {
685
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
686
671
  data: ensureUint8Array(image["data"]),
687
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
688
672
  format: image["format"] ?? "unknown",
689
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
690
673
  imageIndex: image["imageIndex"] ?? 0,
691
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
692
674
  pageNumber: image["pageNumber"] ?? null,
693
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
694
675
  width: image["width"] ?? null,
695
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
696
676
  height: image["height"] ?? null,
697
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
698
677
  colorspace: image["colorspace"] ?? null,
699
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
700
678
  bitsPerComponent: image["bitsPerComponent"] ?? null,
701
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
702
679
  isMask: image["isMask"] ?? false,
703
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
704
680
  description: image["description"] ?? null,
705
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
706
681
  ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
707
682
  };
708
683
  }
@@ -717,15 +692,10 @@ function convertPageContent(rawPage) {
717
692
  }
718
693
  const page = rawPage;
719
694
  return {
720
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
721
695
  pageNumber: page["pageNumber"] ?? 0,
722
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
723
696
  content: page["content"] ?? "",
724
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
725
697
  tables: Array.isArray(page["tables"]) ? page["tables"] : [],
726
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
727
698
  images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
728
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
729
699
  isBlank: page["isBlank"] ?? null
730
700
  };
731
701
  }
@@ -748,20 +718,15 @@ function convertResult(rawResult) {
748
718
  const metadata = result["metadata"];
749
719
  const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
750
720
  const returnObj = {
751
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
752
721
  content: result["content"] ?? "",
753
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
754
722
  mimeType: result["mimeType"] ?? "application/octet-stream",
755
723
  metadata: metadataValue,
756
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
757
724
  tables: Array.isArray(result["tables"]) ? result["tables"] : [],
758
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
759
725
  detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
760
726
  chunks: null,
761
727
  images: null,
762
728
  elements: null,
763
729
  pages: null,
764
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
765
730
  document: result["document"] ?? null
766
731
  };
767
732
  const chunksData = result["chunks"];
@@ -833,6 +798,42 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
833
798
  return rawResults.map(convertResult);
834
799
  }
835
800
 
801
+ // typescript/extraction/render.ts
802
+ function renderPdfPageSync(filePath, pageIndex, options) {
803
+ return getBinding().renderPdfPageSync(filePath, pageIndex, options?.dpi ?? null);
804
+ }
805
+ async function renderPdfPage(filePath, pageIndex, options) {
806
+ return getBinding().renderPdfPage(filePath, pageIndex, options?.dpi ?? null);
807
+ }
808
+ function iteratePdfPagesSync(filePath, options) {
809
+ return getBinding().iteratePdfPagesSync(filePath, options?.dpi ?? null);
810
+ }
811
+ async function iteratePdfPages(filePath, options) {
812
+ return getBinding().iteratePdfPages(filePath, options?.dpi ?? null);
813
+ }
814
+ function pdfPageCount(filePath) {
815
+ return getBinding().pdfPageCount(filePath);
816
+ }
817
+ var PdfPageIterator = class {
818
+ inner;
819
+ constructor(filePath, options) {
820
+ const Ctor = getBinding().JsPdfPageIterator;
821
+ this.inner = new Ctor(filePath, options?.dpi ?? null);
822
+ }
823
+ /** Advance and return the next page, or null when exhausted. */
824
+ next() {
825
+ return this.inner.next();
826
+ }
827
+ /** Total number of pages in the PDF. */
828
+ pageCount() {
829
+ return this.inner.pageCount();
830
+ }
831
+ /** Free native resources. Safe to call multiple times. */
832
+ close() {
833
+ this.inner.close();
834
+ }
835
+ };
836
+
836
837
  // typescript/extraction/single.ts
837
838
  var import_node_fs = require("fs");
838
839
  function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
@@ -1231,7 +1232,7 @@ function getEmbeddingPreset(name) {
1231
1232
  }
1232
1233
 
1233
1234
  // typescript/index.ts
1234
- var __version__ = "4.6.0";
1235
+ var __version__ = "4.6.3";
1235
1236
  // Annotate the CommonJS export names for ESM import in node:
1236
1237
  0 && (module.exports = {
1237
1238
  CacheError,
@@ -1242,6 +1243,7 @@ var __version__ = "4.6.0";
1242
1243
  MissingDependencyError,
1243
1244
  OcrError,
1244
1245
  ParsingError,
1246
+ PdfPageIterator,
1245
1247
  PluginError,
1246
1248
  ValidationError,
1247
1249
  __resetBindingForTests,
@@ -1273,6 +1275,8 @@ var __version__ = "4.6.0";
1273
1275
  getLastErrorCode,
1274
1276
  getLastPanicContext,
1275
1277
  getWorkerPoolStats,
1278
+ iteratePdfPages,
1279
+ iteratePdfPagesSync,
1276
1280
  listDocumentExtractors,
1277
1281
  listEmbeddingPresets,
1278
1282
  listOcrBackends,
@@ -1280,9 +1284,12 @@ var __version__ = "4.6.0";
1280
1284
  listValidators,
1281
1285
  loadConfigFile,
1282
1286
  loadConfigFromPath,
1287
+ pdfPageCount,
1283
1288
  registerOcrBackend,
1284
1289
  registerPostProcessor,
1285
1290
  registerValidator,
1291
+ renderPdfPage,
1292
+ renderPdfPageSync,
1286
1293
  unregisterDocumentExtractor,
1287
1294
  unregisterOcrBackend,
1288
1295
  unregisterPostProcessor,