@kreuzberg/node 4.6.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -5
- package/dist/cli.js +2 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +2 -2
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +93 -2
- package/dist/index.d.ts +93 -2
- package/dist/index.js +51 -42
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +45 -42
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +5 -1
- package/dist/types.d.ts +5 -1
- package/dist/types.js.map +1 -1
- package/index.d.ts +213 -12
- package/index.js +58 -52
- package/package.json +11 -11
package/dist/index.mjs
CHANGED
|
@@ -506,6 +506,7 @@ function convertChunk(rawChunk) {
|
|
|
506
506
|
if (!rawChunk || typeof rawChunk !== "object") {
|
|
507
507
|
return {
|
|
508
508
|
content: "",
|
|
509
|
+
chunkType: null,
|
|
509
510
|
metadata: {
|
|
510
511
|
byteStart: 0,
|
|
511
512
|
byteEnd: 0,
|
|
@@ -519,26 +520,17 @@ function convertChunk(rawChunk) {
|
|
|
519
520
|
const chunk = rawChunk;
|
|
520
521
|
const metadata = chunk["metadata"] ?? {};
|
|
521
522
|
return {
|
|
522
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
523
523
|
content: chunk["content"] ?? "",
|
|
524
|
-
|
|
524
|
+
chunkType: chunk["chunk_type"] ?? chunk["chunkType"] ?? null,
|
|
525
525
|
embedding: chunk["embedding"] ?? null,
|
|
526
526
|
metadata: {
|
|
527
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
528
527
|
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
529
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
530
528
|
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
531
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
532
529
|
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
533
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
534
530
|
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
535
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
536
531
|
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
537
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
538
532
|
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
539
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
540
533
|
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
|
|
541
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
542
534
|
headingContext: (() => {
|
|
543
535
|
const hc = metadata["heading_context"] ?? metadata["headingContext"];
|
|
544
536
|
if (!hc) return null;
|
|
@@ -548,9 +540,7 @@ function convertChunk(rawChunk) {
|
|
|
548
540
|
headings: headings.map((h) => {
|
|
549
541
|
const heading = h;
|
|
550
542
|
return {
|
|
551
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
552
543
|
level: heading["level"] ?? 0,
|
|
553
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
554
544
|
text: heading["text"] ?? ""
|
|
555
545
|
};
|
|
556
546
|
})
|
|
@@ -571,22 +561,14 @@ function convertElement(rawElement) {
|
|
|
571
561
|
const element = rawElement;
|
|
572
562
|
const elementMetadata = element["metadata"] ?? {};
|
|
573
563
|
return {
|
|
574
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
575
564
|
elementId: element["element_id"] ?? element["elementId"] ?? "",
|
|
576
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
577
565
|
elementType: element["element_type"] ?? element["elementType"] ?? "narrative_text",
|
|
578
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
579
566
|
text: element["text"] ?? "",
|
|
580
567
|
metadata: {
|
|
581
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
582
568
|
pageNumber: elementMetadata["page_number"] ?? elementMetadata["pageNumber"] ?? null,
|
|
583
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
584
569
|
filename: elementMetadata["filename"] ?? null,
|
|
585
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
586
570
|
coordinates: elementMetadata["coordinates"] ? elementMetadata["coordinates"] : null,
|
|
587
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
588
571
|
elementIndex: elementMetadata["element_index"] ?? elementMetadata["elementIndex"] ?? null,
|
|
589
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
590
572
|
additional: elementMetadata["additional"] ?? {}
|
|
591
573
|
}
|
|
592
574
|
};
|
|
@@ -609,27 +591,16 @@ function convertImage(rawImage) {
|
|
|
609
591
|
}
|
|
610
592
|
const image = rawImage;
|
|
611
593
|
return {
|
|
612
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
613
594
|
data: ensureUint8Array(image["data"]),
|
|
614
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
615
595
|
format: image["format"] ?? "unknown",
|
|
616
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
617
596
|
imageIndex: image["imageIndex"] ?? 0,
|
|
618
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
619
597
|
pageNumber: image["pageNumber"] ?? null,
|
|
620
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
621
598
|
width: image["width"] ?? null,
|
|
622
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
623
599
|
height: image["height"] ?? null,
|
|
624
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
625
600
|
colorspace: image["colorspace"] ?? null,
|
|
626
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
627
601
|
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
628
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
629
602
|
isMask: image["isMask"] ?? false,
|
|
630
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
631
603
|
description: image["description"] ?? null,
|
|
632
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
633
604
|
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
634
605
|
};
|
|
635
606
|
}
|
|
@@ -644,15 +615,10 @@ function convertPageContent(rawPage) {
|
|
|
644
615
|
}
|
|
645
616
|
const page = rawPage;
|
|
646
617
|
return {
|
|
647
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
648
618
|
pageNumber: page["pageNumber"] ?? 0,
|
|
649
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
650
619
|
content: page["content"] ?? "",
|
|
651
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
652
620
|
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
653
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
654
621
|
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
|
|
655
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
656
622
|
isBlank: page["isBlank"] ?? null
|
|
657
623
|
};
|
|
658
624
|
}
|
|
@@ -675,20 +641,15 @@ function convertResult(rawResult) {
|
|
|
675
641
|
const metadata = result["metadata"];
|
|
676
642
|
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
677
643
|
const returnObj = {
|
|
678
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
679
644
|
content: result["content"] ?? "",
|
|
680
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
681
645
|
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
682
646
|
metadata: metadataValue,
|
|
683
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
684
647
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
685
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
686
648
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
687
649
|
chunks: null,
|
|
688
650
|
images: null,
|
|
689
651
|
elements: null,
|
|
690
652
|
pages: null,
|
|
691
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
692
653
|
document: result["document"] ?? null
|
|
693
654
|
};
|
|
694
655
|
const chunksData = result["chunks"];
|
|
@@ -760,6 +721,42 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
|
760
721
|
return rawResults.map(convertResult);
|
|
761
722
|
}
|
|
762
723
|
|
|
724
|
+
// typescript/extraction/render.ts
|
|
725
|
+
function renderPdfPageSync(filePath, pageIndex, options) {
|
|
726
|
+
return getBinding().renderPdfPageSync(filePath, pageIndex, options?.dpi ?? null);
|
|
727
|
+
}
|
|
728
|
+
async function renderPdfPage(filePath, pageIndex, options) {
|
|
729
|
+
return getBinding().renderPdfPage(filePath, pageIndex, options?.dpi ?? null);
|
|
730
|
+
}
|
|
731
|
+
function iteratePdfPagesSync(filePath, options) {
|
|
732
|
+
return getBinding().iteratePdfPagesSync(filePath, options?.dpi ?? null);
|
|
733
|
+
}
|
|
734
|
+
async function iteratePdfPages(filePath, options) {
|
|
735
|
+
return getBinding().iteratePdfPages(filePath, options?.dpi ?? null);
|
|
736
|
+
}
|
|
737
|
+
function pdfPageCount(filePath) {
|
|
738
|
+
return getBinding().pdfPageCount(filePath);
|
|
739
|
+
}
|
|
740
|
+
var PdfPageIterator = class {
|
|
741
|
+
inner;
|
|
742
|
+
constructor(filePath, options) {
|
|
743
|
+
const Ctor = getBinding().JsPdfPageIterator;
|
|
744
|
+
this.inner = new Ctor(filePath, options?.dpi ?? null);
|
|
745
|
+
}
|
|
746
|
+
/** Advance and return the next page, or null when exhausted. */
|
|
747
|
+
next() {
|
|
748
|
+
return this.inner.next();
|
|
749
|
+
}
|
|
750
|
+
/** Total number of pages in the PDF. */
|
|
751
|
+
pageCount() {
|
|
752
|
+
return this.inner.pageCount();
|
|
753
|
+
}
|
|
754
|
+
/** Free native resources. Safe to call multiple times. */
|
|
755
|
+
close() {
|
|
756
|
+
this.inner.close();
|
|
757
|
+
}
|
|
758
|
+
};
|
|
759
|
+
|
|
763
760
|
// typescript/extraction/single.ts
|
|
764
761
|
import { readFileSync } from "fs";
|
|
765
762
|
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
@@ -1158,7 +1155,7 @@ function getEmbeddingPreset(name) {
|
|
|
1158
1155
|
}
|
|
1159
1156
|
|
|
1160
1157
|
// typescript/index.ts
|
|
1161
|
-
var __version__ = "4.6.1";
|
|
1158
|
+
var __version__ = "4.7.0";
|
|
1162
1159
|
export {
|
|
1163
1160
|
CacheError,
|
|
1164
1161
|
ErrorCode,
|
|
@@ -1168,6 +1165,7 @@ export {
|
|
|
1168
1165
|
MissingDependencyError,
|
|
1169
1166
|
OcrError,
|
|
1170
1167
|
ParsingError,
|
|
1168
|
+
PdfPageIterator,
|
|
1171
1169
|
PluginError,
|
|
1172
1170
|
ValidationError,
|
|
1173
1171
|
__resetBindingForTests,
|
|
@@ -1199,6 +1197,8 @@ export {
|
|
|
1199
1197
|
getLastErrorCode,
|
|
1200
1198
|
getLastPanicContext,
|
|
1201
1199
|
getWorkerPoolStats,
|
|
1200
|
+
iteratePdfPages,
|
|
1201
|
+
iteratePdfPagesSync,
|
|
1202
1202
|
listDocumentExtractors,
|
|
1203
1203
|
listEmbeddingPresets,
|
|
1204
1204
|
listOcrBackends,
|
|
@@ -1206,9 +1206,12 @@ export {
|
|
|
1206
1206
|
listValidators,
|
|
1207
1207
|
loadConfigFile,
|
|
1208
1208
|
loadConfigFromPath,
|
|
1209
|
+
pdfPageCount,
|
|
1209
1210
|
registerOcrBackend,
|
|
1210
1211
|
registerPostProcessor,
|
|
1211
1212
|
registerValidator,
|
|
1213
|
+
renderPdfPage,
|
|
1214
|
+
renderPdfPageSync,
|
|
1212
1215
|
unregisterDocumentExtractor,
|
|
1213
1216
|
unregisterOcrBackend,
|
|
1214
1217
|
unregisterPostProcessor,
|