@kreuzberg/node 4.6.0 → 4.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +93 -2
- package/dist/index.d.ts +93 -2
- package/dist/index.js +49 -42
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +43 -42
- package/dist/index.mjs.map +1 -1
- package/index.d.ts +148 -0
- package/index.js +58 -52
- package/package.json +8 -8
package/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.0" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.3" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// biome-ignore lint/security/noGlobalEval:
|
|
1
|
+
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// oxlint-disable-next-line no-eval -- Required for CJS/ESM compat\n\t\t// biome-ignore lint/security/noGlobalEval: CJS/ESM compat\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. 
\" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,eAAO,0BAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAIH,UAAM,MAAM,KAAK,iBAAiB;AAClC,eAAO,8BAAQ,+BAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
|
package/dist/cli.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// biome-ignore lint/security/noGlobalEval:
|
|
1
|
+
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// oxlint-disable-next-line no-eval -- Required for CJS/ESM compat\n\t\t// biome-ignore lint/security/noGlobalEval: CJS/ESM compat\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. 
\" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,WAAO,QAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAIH,UAAM,MAAM,KAAK,iBAAiB;AAClC,WAAO,QAAQ,cAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,UAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
|
package/dist/index.d.mts
CHANGED
|
@@ -297,6 +297,97 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
|
|
|
297
297
|
*/
|
|
298
298
|
declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
299
299
|
|
|
300
|
+
/**
|
|
301
|
+
* PDF page rendering functions.
|
|
302
|
+
*
|
|
303
|
+
* Render individual PDF pages or iterate over all pages as PNG images.
|
|
304
|
+
*/
|
|
305
|
+
/**
|
|
306
|
+
* Render a single PDF page to a PNG buffer (synchronous).
|
|
307
|
+
*
|
|
308
|
+
* @param filePath - Path to the PDF file
|
|
309
|
+
* @param pageIndex - Zero-based page index
|
|
310
|
+
* @param options - Optional settings
|
|
311
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
312
|
+
* @returns Buffer containing PNG image data
|
|
313
|
+
*/
|
|
314
|
+
declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
|
|
315
|
+
dpi?: number;
|
|
316
|
+
}): Buffer;
|
|
317
|
+
/**
|
|
318
|
+
* Render a single PDF page to a PNG buffer (asynchronous).
|
|
319
|
+
*
|
|
320
|
+
* @param filePath - Path to the PDF file
|
|
321
|
+
* @param pageIndex - Zero-based page index
|
|
322
|
+
* @param options - Optional settings
|
|
323
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
324
|
+
* @returns Promise resolving to a Buffer containing PNG image data
|
|
325
|
+
*/
|
|
326
|
+
declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
|
|
327
|
+
dpi?: number;
|
|
328
|
+
}): Promise<Buffer>;
|
|
329
|
+
/** A rendered PDF page with its index and PNG data. */
|
|
330
|
+
interface PdfPageResult {
|
|
331
|
+
pageIndex: number;
|
|
332
|
+
data: Buffer;
|
|
333
|
+
}
|
|
334
|
+
/**
|
|
335
|
+
* Collect all PDF pages as PNG images (synchronous).
|
|
336
|
+
*
|
|
337
|
+
* @param filePath - Path to the PDF file
|
|
338
|
+
* @param options - Optional settings
|
|
339
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
340
|
+
* @returns Array of PdfPageResult objects
|
|
341
|
+
*/
|
|
342
|
+
declare function iteratePdfPagesSync(filePath: string, options?: {
|
|
343
|
+
dpi?: number;
|
|
344
|
+
}): PdfPageResult[];
|
|
345
|
+
/**
|
|
346
|
+
* Collect all PDF pages as PNG images (asynchronous).
|
|
347
|
+
*
|
|
348
|
+
* @param filePath - Path to the PDF file
|
|
349
|
+
* @param options - Optional settings
|
|
350
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
351
|
+
* @returns Promise resolving to an array of PdfPageResult objects
|
|
352
|
+
*/
|
|
353
|
+
declare function iteratePdfPages(filePath: string, options?: {
|
|
354
|
+
dpi?: number;
|
|
355
|
+
}): Promise<PdfPageResult[]>;
|
|
356
|
+
/**
|
|
357
|
+
* Get the number of pages in a PDF file.
|
|
358
|
+
*
|
|
359
|
+
* @param filePath - Path to the PDF file
|
|
360
|
+
* @returns Number of pages
|
|
361
|
+
*/
|
|
362
|
+
declare function pdfPageCount(filePath: string): number;
|
|
363
|
+
/**
|
|
364
|
+
* Lazy PDF page iterator. Renders one page at a time via `.next()`.
|
|
365
|
+
* Call `.close()` when done to free native resources.
|
|
366
|
+
*
|
|
367
|
+
* @example
|
|
368
|
+
* ```typescript
|
|
369
|
+
* const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
|
|
370
|
+
* let result;
|
|
371
|
+
* while ((result = iter.next()) !== null) {
|
|
372
|
+
* const { pageIndex, data } = result;
|
|
373
|
+
* // process page...
|
|
374
|
+
* }
|
|
375
|
+
* iter.close();
|
|
376
|
+
* ```
|
|
377
|
+
*/
|
|
378
|
+
declare class PdfPageIterator {
|
|
379
|
+
private inner;
|
|
380
|
+
constructor(filePath: string, options?: {
|
|
381
|
+
dpi?: number;
|
|
382
|
+
});
|
|
383
|
+
/** Advance and return the next page, or null when exhausted. */
|
|
384
|
+
next(): PdfPageResult | null;
|
|
385
|
+
/** Total number of pages in the PDF. */
|
|
386
|
+
pageCount(): number;
|
|
387
|
+
/** Free native resources. Safe to call multiple times. */
|
|
388
|
+
close(): void;
|
|
389
|
+
}
|
|
390
|
+
|
|
300
391
|
/**
|
|
301
392
|
* Single-document extraction APIs.
|
|
302
393
|
*
|
|
@@ -1191,6 +1282,6 @@ declare function __resetBindingForTests(): void;
|
|
|
1191
1282
|
* @module @kreuzberg/node
|
|
1192
1283
|
*/
|
|
1193
1284
|
|
|
1194
|
-
declare const __version__ = "4.6.0";
|
|
1285
|
+
declare const __version__ = "4.6.3";
|
|
1195
1286
|
|
|
1196
|
-
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
|
1287
|
+
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.d.ts
CHANGED
|
@@ -297,6 +297,97 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
|
|
|
297
297
|
*/
|
|
298
298
|
declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
299
299
|
|
|
300
|
+
/**
|
|
301
|
+
* PDF page rendering functions.
|
|
302
|
+
*
|
|
303
|
+
* Render individual PDF pages or iterate over all pages as PNG images.
|
|
304
|
+
*/
|
|
305
|
+
/**
|
|
306
|
+
* Render a single PDF page to a PNG buffer (synchronous).
|
|
307
|
+
*
|
|
308
|
+
* @param filePath - Path to the PDF file
|
|
309
|
+
* @param pageIndex - Zero-based page index
|
|
310
|
+
* @param options - Optional settings
|
|
311
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
312
|
+
* @returns Buffer containing PNG image data
|
|
313
|
+
*/
|
|
314
|
+
declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
|
|
315
|
+
dpi?: number;
|
|
316
|
+
}): Buffer;
|
|
317
|
+
/**
|
|
318
|
+
* Render a single PDF page to a PNG buffer (asynchronous).
|
|
319
|
+
*
|
|
320
|
+
* @param filePath - Path to the PDF file
|
|
321
|
+
* @param pageIndex - Zero-based page index
|
|
322
|
+
* @param options - Optional settings
|
|
323
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
324
|
+
* @returns Promise resolving to a Buffer containing PNG image data
|
|
325
|
+
*/
|
|
326
|
+
declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
|
|
327
|
+
dpi?: number;
|
|
328
|
+
}): Promise<Buffer>;
|
|
329
|
+
/** A rendered PDF page with its index and PNG data. */
|
|
330
|
+
interface PdfPageResult {
|
|
331
|
+
pageIndex: number;
|
|
332
|
+
data: Buffer;
|
|
333
|
+
}
|
|
334
|
+
/**
|
|
335
|
+
* Collect all PDF pages as PNG images (synchronous).
|
|
336
|
+
*
|
|
337
|
+
* @param filePath - Path to the PDF file
|
|
338
|
+
* @param options - Optional settings
|
|
339
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
340
|
+
* @returns Array of PdfPageResult objects
|
|
341
|
+
*/
|
|
342
|
+
declare function iteratePdfPagesSync(filePath: string, options?: {
|
|
343
|
+
dpi?: number;
|
|
344
|
+
}): PdfPageResult[];
|
|
345
|
+
/**
|
|
346
|
+
* Collect all PDF pages as PNG images (asynchronous).
|
|
347
|
+
*
|
|
348
|
+
* @param filePath - Path to the PDF file
|
|
349
|
+
* @param options - Optional settings
|
|
350
|
+
* @param options.dpi - DPI for rendering (default 150)
|
|
351
|
+
* @returns Promise resolving to an array of PdfPageResult objects
|
|
352
|
+
*/
|
|
353
|
+
declare function iteratePdfPages(filePath: string, options?: {
|
|
354
|
+
dpi?: number;
|
|
355
|
+
}): Promise<PdfPageResult[]>;
|
|
356
|
+
/**
|
|
357
|
+
* Get the number of pages in a PDF file.
|
|
358
|
+
*
|
|
359
|
+
* @param filePath - Path to the PDF file
|
|
360
|
+
* @returns Number of pages
|
|
361
|
+
*/
|
|
362
|
+
declare function pdfPageCount(filePath: string): number;
|
|
363
|
+
/**
|
|
364
|
+
* Lazy PDF page iterator. Renders one page at a time via `.next()`.
|
|
365
|
+
* Call `.close()` when done to free native resources.
|
|
366
|
+
*
|
|
367
|
+
* @example
|
|
368
|
+
* ```typescript
|
|
369
|
+
* const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
|
|
370
|
+
* let result;
|
|
371
|
+
* while ((result = iter.next()) !== null) {
|
|
372
|
+
* const { pageIndex, data } = result;
|
|
373
|
+
* // process page...
|
|
374
|
+
* }
|
|
375
|
+
* iter.close();
|
|
376
|
+
* ```
|
|
377
|
+
*/
|
|
378
|
+
declare class PdfPageIterator {
|
|
379
|
+
private inner;
|
|
380
|
+
constructor(filePath: string, options?: {
|
|
381
|
+
dpi?: number;
|
|
382
|
+
});
|
|
383
|
+
/** Advance and return the next page, or null when exhausted. */
|
|
384
|
+
next(): PdfPageResult | null;
|
|
385
|
+
/** Total number of pages in the PDF. */
|
|
386
|
+
pageCount(): number;
|
|
387
|
+
/** Free native resources. Safe to call multiple times. */
|
|
388
|
+
close(): void;
|
|
389
|
+
}
|
|
390
|
+
|
|
300
391
|
/**
|
|
301
392
|
* Single-document extraction APIs.
|
|
302
393
|
*
|
|
@@ -1191,6 +1282,6 @@ declare function __resetBindingForTests(): void;
|
|
|
1191
1282
|
* @module @kreuzberg/node
|
|
1192
1283
|
*/
|
|
1193
1284
|
|
|
1194
|
-
declare const __version__ = "4.6.0";
|
|
1285
|
+
declare const __version__ = "4.6.3";
|
|
1195
1286
|
|
|
1196
|
-
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
|
1287
|
+
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.js
CHANGED
|
@@ -28,6 +28,7 @@ __export(index_exports, {
|
|
|
28
28
|
MissingDependencyError: () => MissingDependencyError,
|
|
29
29
|
OcrError: () => OcrError,
|
|
30
30
|
ParsingError: () => ParsingError,
|
|
31
|
+
PdfPageIterator: () => PdfPageIterator,
|
|
31
32
|
PluginError: () => PluginError,
|
|
32
33
|
ValidationError: () => ValidationError,
|
|
33
34
|
__resetBindingForTests: () => __resetBindingForTests,
|
|
@@ -59,6 +60,8 @@ __export(index_exports, {
|
|
|
59
60
|
getLastErrorCode: () => getLastErrorCode,
|
|
60
61
|
getLastPanicContext: () => getLastPanicContext,
|
|
61
62
|
getWorkerPoolStats: () => getWorkerPoolStats,
|
|
63
|
+
iteratePdfPages: () => iteratePdfPages,
|
|
64
|
+
iteratePdfPagesSync: () => iteratePdfPagesSync,
|
|
62
65
|
listDocumentExtractors: () => listDocumentExtractors,
|
|
63
66
|
listEmbeddingPresets: () => listEmbeddingPresets,
|
|
64
67
|
listOcrBackends: () => listOcrBackends,
|
|
@@ -66,9 +69,12 @@ __export(index_exports, {
|
|
|
66
69
|
listValidators: () => listValidators,
|
|
67
70
|
loadConfigFile: () => loadConfigFile,
|
|
68
71
|
loadConfigFromPath: () => loadConfigFromPath,
|
|
72
|
+
pdfPageCount: () => pdfPageCount,
|
|
69
73
|
registerOcrBackend: () => registerOcrBackend,
|
|
70
74
|
registerPostProcessor: () => registerPostProcessor,
|
|
71
75
|
registerValidator: () => registerValidator,
|
|
76
|
+
renderPdfPage: () => renderPdfPage,
|
|
77
|
+
renderPdfPageSync: () => renderPdfPageSync,
|
|
72
78
|
unregisterDocumentExtractor: () => unregisterDocumentExtractor,
|
|
73
79
|
unregisterOcrBackend: () => unregisterOcrBackend,
|
|
74
80
|
unregisterPostProcessor: () => unregisterPostProcessor,
|
|
@@ -592,26 +598,16 @@ function convertChunk(rawChunk) {
|
|
|
592
598
|
const chunk = rawChunk;
|
|
593
599
|
const metadata = chunk["metadata"] ?? {};
|
|
594
600
|
return {
|
|
595
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
596
601
|
content: chunk["content"] ?? "",
|
|
597
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
598
602
|
embedding: chunk["embedding"] ?? null,
|
|
599
603
|
metadata: {
|
|
600
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
601
604
|
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
602
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
603
605
|
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
604
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
605
606
|
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
606
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
607
607
|
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
608
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
609
608
|
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
610
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
611
609
|
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
612
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
613
610
|
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
|
|
614
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
615
611
|
headingContext: (() => {
|
|
616
612
|
const hc = metadata["heading_context"] ?? metadata["headingContext"];
|
|
617
613
|
if (!hc) return null;
|
|
@@ -621,9 +617,7 @@ function convertChunk(rawChunk) {
|
|
|
621
617
|
headings: headings.map((h) => {
|
|
622
618
|
const heading = h;
|
|
623
619
|
return {
|
|
624
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
625
620
|
level: heading["level"] ?? 0,
|
|
626
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
627
621
|
text: heading["text"] ?? ""
|
|
628
622
|
};
|
|
629
623
|
})
|
|
@@ -644,22 +638,14 @@ function convertElement(rawElement) {
|
|
|
644
638
|
const element = rawElement;
|
|
645
639
|
const elementMetadata = element["metadata"] ?? {};
|
|
646
640
|
return {
|
|
647
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
648
641
|
elementId: element["element_id"] ?? element["elementId"] ?? "",
|
|
649
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
650
642
|
elementType: element["element_type"] ?? element["elementType"] ?? "narrative_text",
|
|
651
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
652
643
|
text: element["text"] ?? "",
|
|
653
644
|
metadata: {
|
|
654
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
655
645
|
pageNumber: elementMetadata["page_number"] ?? elementMetadata["pageNumber"] ?? null,
|
|
656
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
657
646
|
filename: elementMetadata["filename"] ?? null,
|
|
658
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
659
647
|
coordinates: elementMetadata["coordinates"] ? elementMetadata["coordinates"] : null,
|
|
660
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
661
648
|
elementIndex: elementMetadata["element_index"] ?? elementMetadata["elementIndex"] ?? null,
|
|
662
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
663
649
|
additional: elementMetadata["additional"] ?? {}
|
|
664
650
|
}
|
|
665
651
|
};
|
|
@@ -682,27 +668,16 @@ function convertImage(rawImage) {
|
|
|
682
668
|
}
|
|
683
669
|
const image = rawImage;
|
|
684
670
|
return {
|
|
685
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
686
671
|
data: ensureUint8Array(image["data"]),
|
|
687
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
688
672
|
format: image["format"] ?? "unknown",
|
|
689
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
690
673
|
imageIndex: image["imageIndex"] ?? 0,
|
|
691
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
692
674
|
pageNumber: image["pageNumber"] ?? null,
|
|
693
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
694
675
|
width: image["width"] ?? null,
|
|
695
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
696
676
|
height: image["height"] ?? null,
|
|
697
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
698
677
|
colorspace: image["colorspace"] ?? null,
|
|
699
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
700
678
|
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
701
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
702
679
|
isMask: image["isMask"] ?? false,
|
|
703
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
704
680
|
description: image["description"] ?? null,
|
|
705
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
706
681
|
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
707
682
|
};
|
|
708
683
|
}
|
|
@@ -717,15 +692,10 @@ function convertPageContent(rawPage) {
|
|
|
717
692
|
}
|
|
718
693
|
const page = rawPage;
|
|
719
694
|
return {
|
|
720
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
721
695
|
pageNumber: page["pageNumber"] ?? 0,
|
|
722
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
723
696
|
content: page["content"] ?? "",
|
|
724
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
725
697
|
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
726
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
727
698
|
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
|
|
728
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
729
699
|
isBlank: page["isBlank"] ?? null
|
|
730
700
|
};
|
|
731
701
|
}
|
|
@@ -748,20 +718,15 @@ function convertResult(rawResult) {
|
|
|
748
718
|
const metadata = result["metadata"];
|
|
749
719
|
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
750
720
|
const returnObj = {
|
|
751
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
752
721
|
content: result["content"] ?? "",
|
|
753
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
754
722
|
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
755
723
|
metadata: metadataValue,
|
|
756
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
757
724
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
758
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
759
725
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
760
726
|
chunks: null,
|
|
761
727
|
images: null,
|
|
762
728
|
elements: null,
|
|
763
729
|
pages: null,
|
|
764
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
765
730
|
document: result["document"] ?? null
|
|
766
731
|
};
|
|
767
732
|
const chunksData = result["chunks"];
|
|
@@ -833,6 +798,42 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
|
833
798
|
return rawResults.map(convertResult);
|
|
834
799
|
}
|
|
835
800
|
|
|
801
|
+
// typescript/extraction/render.ts
|
|
802
|
+
function renderPdfPageSync(filePath, pageIndex, options) {
|
|
803
|
+
return getBinding().renderPdfPageSync(filePath, pageIndex, options?.dpi ?? null);
|
|
804
|
+
}
|
|
805
|
+
async function renderPdfPage(filePath, pageIndex, options) {
|
|
806
|
+
return getBinding().renderPdfPage(filePath, pageIndex, options?.dpi ?? null);
|
|
807
|
+
}
|
|
808
|
+
function iteratePdfPagesSync(filePath, options) {
|
|
809
|
+
return getBinding().iteratePdfPagesSync(filePath, options?.dpi ?? null);
|
|
810
|
+
}
|
|
811
|
+
async function iteratePdfPages(filePath, options) {
|
|
812
|
+
return getBinding().iteratePdfPages(filePath, options?.dpi ?? null);
|
|
813
|
+
}
|
|
814
|
+
function pdfPageCount(filePath) {
|
|
815
|
+
return getBinding().pdfPageCount(filePath);
|
|
816
|
+
}
|
|
817
|
+
var PdfPageIterator = class {
|
|
818
|
+
inner;
|
|
819
|
+
constructor(filePath, options) {
|
|
820
|
+
const Ctor = getBinding().JsPdfPageIterator;
|
|
821
|
+
this.inner = new Ctor(filePath, options?.dpi ?? null);
|
|
822
|
+
}
|
|
823
|
+
/** Advance and return the next page, or null when exhausted. */
|
|
824
|
+
next() {
|
|
825
|
+
return this.inner.next();
|
|
826
|
+
}
|
|
827
|
+
/** Total number of pages in the PDF. */
|
|
828
|
+
pageCount() {
|
|
829
|
+
return this.inner.pageCount();
|
|
830
|
+
}
|
|
831
|
+
/** Free native resources. Safe to call multiple times. */
|
|
832
|
+
close() {
|
|
833
|
+
this.inner.close();
|
|
834
|
+
}
|
|
835
|
+
};
|
|
836
|
+
|
|
836
837
|
// typescript/extraction/single.ts
|
|
837
838
|
var import_node_fs = require("fs");
|
|
838
839
|
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
@@ -1231,7 +1232,7 @@ function getEmbeddingPreset(name) {
|
|
|
1231
1232
|
}
|
|
1232
1233
|
|
|
1233
1234
|
// typescript/index.ts
|
|
1234
|
-
var __version__ = "4.6.
|
|
1235
|
+
var __version__ = "4.6.3";
|
|
1235
1236
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1236
1237
|
0 && (module.exports = {
|
|
1237
1238
|
CacheError,
|
|
@@ -1242,6 +1243,7 @@ var __version__ = "4.6.0";
|
|
|
1242
1243
|
MissingDependencyError,
|
|
1243
1244
|
OcrError,
|
|
1244
1245
|
ParsingError,
|
|
1246
|
+
PdfPageIterator,
|
|
1245
1247
|
PluginError,
|
|
1246
1248
|
ValidationError,
|
|
1247
1249
|
__resetBindingForTests,
|
|
@@ -1273,6 +1275,8 @@ var __version__ = "4.6.0";
|
|
|
1273
1275
|
getLastErrorCode,
|
|
1274
1276
|
getLastPanicContext,
|
|
1275
1277
|
getWorkerPoolStats,
|
|
1278
|
+
iteratePdfPages,
|
|
1279
|
+
iteratePdfPagesSync,
|
|
1276
1280
|
listDocumentExtractors,
|
|
1277
1281
|
listEmbeddingPresets,
|
|
1278
1282
|
listOcrBackends,
|
|
@@ -1280,9 +1284,12 @@ var __version__ = "4.6.0";
|
|
|
1280
1284
|
listValidators,
|
|
1281
1285
|
loadConfigFile,
|
|
1282
1286
|
loadConfigFromPath,
|
|
1287
|
+
pdfPageCount,
|
|
1283
1288
|
registerOcrBackend,
|
|
1284
1289
|
registerPostProcessor,
|
|
1285
1290
|
registerValidator,
|
|
1291
|
+
renderPdfPage,
|
|
1292
|
+
renderPdfPageSync,
|
|
1286
1293
|
unregisterDocumentExtractor,
|
|
1287
1294
|
unregisterOcrBackend,
|
|
1288
1295
|
unregisterPostProcessor,
|