@kreuzberg/node 4.6.0 → 4.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +93 -2
- package/dist/index.d.ts +93 -2
- package/dist/index.js +49 -42
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +43 -42
- package/dist/index.mjs.map +1 -1
- package/index.d.ts +148 -0
- package/index.js +58 -52
- package/package.json +8 -8
package/dist/index.mjs
CHANGED
|
@@ -519,26 +519,16 @@ function convertChunk(rawChunk) {
|
|
|
519
519
|
const chunk = rawChunk;
|
|
520
520
|
const metadata = chunk["metadata"] ?? {};
|
|
521
521
|
return {
|
|
522
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
523
522
|
content: chunk["content"] ?? "",
|
|
524
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
525
523
|
embedding: chunk["embedding"] ?? null,
|
|
526
524
|
metadata: {
|
|
527
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
528
525
|
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
529
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
530
526
|
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
531
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
532
527
|
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
533
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
534
528
|
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
535
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
536
529
|
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
537
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
538
530
|
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
539
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
540
531
|
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
|
|
541
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
542
532
|
headingContext: (() => {
|
|
543
533
|
const hc = metadata["heading_context"] ?? metadata["headingContext"];
|
|
544
534
|
if (!hc) return null;
|
|
@@ -548,9 +538,7 @@ function convertChunk(rawChunk) {
|
|
|
548
538
|
headings: headings.map((h) => {
|
|
549
539
|
const heading = h;
|
|
550
540
|
return {
|
|
551
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
552
541
|
level: heading["level"] ?? 0,
|
|
553
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
554
542
|
text: heading["text"] ?? ""
|
|
555
543
|
};
|
|
556
544
|
})
|
|
@@ -571,22 +559,14 @@ function convertElement(rawElement) {
|
|
|
571
559
|
const element = rawElement;
|
|
572
560
|
const elementMetadata = element["metadata"] ?? {};
|
|
573
561
|
return {
|
|
574
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
575
562
|
elementId: element["element_id"] ?? element["elementId"] ?? "",
|
|
576
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
577
563
|
elementType: element["element_type"] ?? element["elementType"] ?? "narrative_text",
|
|
578
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
579
564
|
text: element["text"] ?? "",
|
|
580
565
|
metadata: {
|
|
581
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
582
566
|
pageNumber: elementMetadata["page_number"] ?? elementMetadata["pageNumber"] ?? null,
|
|
583
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
584
567
|
filename: elementMetadata["filename"] ?? null,
|
|
585
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
586
568
|
coordinates: elementMetadata["coordinates"] ? elementMetadata["coordinates"] : null,
|
|
587
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
588
569
|
elementIndex: elementMetadata["element_index"] ?? elementMetadata["elementIndex"] ?? null,
|
|
589
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
590
570
|
additional: elementMetadata["additional"] ?? {}
|
|
591
571
|
}
|
|
592
572
|
};
|
|
@@ -609,27 +589,16 @@ function convertImage(rawImage) {
|
|
|
609
589
|
}
|
|
610
590
|
const image = rawImage;
|
|
611
591
|
return {
|
|
612
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
613
592
|
data: ensureUint8Array(image["data"]),
|
|
614
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
615
593
|
format: image["format"] ?? "unknown",
|
|
616
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
617
594
|
imageIndex: image["imageIndex"] ?? 0,
|
|
618
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
619
595
|
pageNumber: image["pageNumber"] ?? null,
|
|
620
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
621
596
|
width: image["width"] ?? null,
|
|
622
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
623
597
|
height: image["height"] ?? null,
|
|
624
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
625
598
|
colorspace: image["colorspace"] ?? null,
|
|
626
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
627
599
|
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
628
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
629
600
|
isMask: image["isMask"] ?? false,
|
|
630
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
631
601
|
description: image["description"] ?? null,
|
|
632
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
633
602
|
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
634
603
|
};
|
|
635
604
|
}
|
|
@@ -644,15 +613,10 @@ function convertPageContent(rawPage) {
|
|
|
644
613
|
}
|
|
645
614
|
const page = rawPage;
|
|
646
615
|
return {
|
|
647
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
648
616
|
pageNumber: page["pageNumber"] ?? 0,
|
|
649
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
650
617
|
content: page["content"] ?? "",
|
|
651
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
652
618
|
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
653
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
654
619
|
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
|
|
655
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
656
620
|
isBlank: page["isBlank"] ?? null
|
|
657
621
|
};
|
|
658
622
|
}
|
|
@@ -675,20 +639,15 @@ function convertResult(rawResult) {
|
|
|
675
639
|
const metadata = result["metadata"];
|
|
676
640
|
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
677
641
|
const returnObj = {
|
|
678
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
679
642
|
content: result["content"] ?? "",
|
|
680
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
681
643
|
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
682
644
|
metadata: metadataValue,
|
|
683
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
684
645
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
685
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
686
646
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
687
647
|
chunks: null,
|
|
688
648
|
images: null,
|
|
689
649
|
elements: null,
|
|
690
650
|
pages: null,
|
|
691
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
692
651
|
document: result["document"] ?? null
|
|
693
652
|
};
|
|
694
653
|
const chunksData = result["chunks"];
|
|
@@ -760,6 +719,42 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
|
760
719
|
return rawResults.map(convertResult);
|
|
761
720
|
}
|
|
762
721
|
|
|
722
|
+
// typescript/extraction/render.ts
|
|
723
|
+
function renderPdfPageSync(filePath, pageIndex, options) {
|
|
724
|
+
return getBinding().renderPdfPageSync(filePath, pageIndex, options?.dpi ?? null);
|
|
725
|
+
}
|
|
726
|
+
async function renderPdfPage(filePath, pageIndex, options) {
|
|
727
|
+
return getBinding().renderPdfPage(filePath, pageIndex, options?.dpi ?? null);
|
|
728
|
+
}
|
|
729
|
+
function iteratePdfPagesSync(filePath, options) {
|
|
730
|
+
return getBinding().iteratePdfPagesSync(filePath, options?.dpi ?? null);
|
|
731
|
+
}
|
|
732
|
+
async function iteratePdfPages(filePath, options) {
|
|
733
|
+
return getBinding().iteratePdfPages(filePath, options?.dpi ?? null);
|
|
734
|
+
}
|
|
735
|
+
function pdfPageCount(filePath) {
|
|
736
|
+
return getBinding().pdfPageCount(filePath);
|
|
737
|
+
}
|
|
738
|
+
var PdfPageIterator = class {
|
|
739
|
+
inner;
|
|
740
|
+
constructor(filePath, options) {
|
|
741
|
+
const Ctor = getBinding().JsPdfPageIterator;
|
|
742
|
+
this.inner = new Ctor(filePath, options?.dpi ?? null);
|
|
743
|
+
}
|
|
744
|
+
/** Advance and return the next page, or null when exhausted. */
|
|
745
|
+
next() {
|
|
746
|
+
return this.inner.next();
|
|
747
|
+
}
|
|
748
|
+
/** Total number of pages in the PDF. */
|
|
749
|
+
pageCount() {
|
|
750
|
+
return this.inner.pageCount();
|
|
751
|
+
}
|
|
752
|
+
/** Free native resources. Safe to call multiple times. */
|
|
753
|
+
close() {
|
|
754
|
+
this.inner.close();
|
|
755
|
+
}
|
|
756
|
+
};
|
|
757
|
+
|
|
763
758
|
// typescript/extraction/single.ts
|
|
764
759
|
import { readFileSync } from "fs";
|
|
765
760
|
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
@@ -1158,7 +1153,7 @@ function getEmbeddingPreset(name) {
|
|
|
1158
1153
|
}
|
|
1159
1154
|
|
|
1160
1155
|
// typescript/index.ts
|
|
1161
|
-
var __version__ = "4.6.0";
|
|
1156
|
+
var __version__ = "4.6.3";
|
|
1162
1157
|
export {
|
|
1163
1158
|
CacheError,
|
|
1164
1159
|
ErrorCode,
|
|
@@ -1168,6 +1163,7 @@ export {
|
|
|
1168
1163
|
MissingDependencyError,
|
|
1169
1164
|
OcrError,
|
|
1170
1165
|
ParsingError,
|
|
1166
|
+
PdfPageIterator,
|
|
1171
1167
|
PluginError,
|
|
1172
1168
|
ValidationError,
|
|
1173
1169
|
__resetBindingForTests,
|
|
@@ -1199,6 +1195,8 @@ export {
|
|
|
1199
1195
|
getLastErrorCode,
|
|
1200
1196
|
getLastPanicContext,
|
|
1201
1197
|
getWorkerPoolStats,
|
|
1198
|
+
iteratePdfPages,
|
|
1199
|
+
iteratePdfPagesSync,
|
|
1202
1200
|
listDocumentExtractors,
|
|
1203
1201
|
listEmbeddingPresets,
|
|
1204
1202
|
listOcrBackends,
|
|
@@ -1206,9 +1204,12 @@ export {
|
|
|
1206
1204
|
listValidators,
|
|
1207
1205
|
loadConfigFile,
|
|
1208
1206
|
loadConfigFromPath,
|
|
1207
|
+
pdfPageCount,
|
|
1209
1208
|
registerOcrBackend,
|
|
1210
1209
|
registerPostProcessor,
|
|
1211
1210
|
registerValidator,
|
|
1211
|
+
renderPdfPage,
|
|
1212
|
+
renderPdfPageSync,
|
|
1212
1213
|
unregisterDocumentExtractor,
|
|
1213
1214
|
unregisterOcrBackend,
|
|
1214
1215
|
unregisterPostProcessor,
|