@kreuzberg/node 4.6.0 → 4.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -519,26 +519,16 @@ function convertChunk(rawChunk) {
519
519
  const chunk = rawChunk;
520
520
  const metadata = chunk["metadata"] ?? {};
521
521
  return {
522
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
523
522
  content: chunk["content"] ?? "",
524
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
525
523
  embedding: chunk["embedding"] ?? null,
526
524
  metadata: {
527
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
528
525
  byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
529
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
530
526
  byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
531
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
532
527
  tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
533
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
534
528
  chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
535
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
536
529
  totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
537
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
538
530
  firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
539
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
540
531
  lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
541
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
542
532
  headingContext: (() => {
543
533
  const hc = metadata["heading_context"] ?? metadata["headingContext"];
544
534
  if (!hc) return null;
@@ -548,9 +538,7 @@ function convertChunk(rawChunk) {
548
538
  headings: headings.map((h) => {
549
539
  const heading = h;
550
540
  return {
551
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
552
541
  level: heading["level"] ?? 0,
553
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
554
542
  text: heading["text"] ?? ""
555
543
  };
556
544
  })
@@ -571,22 +559,14 @@ function convertElement(rawElement) {
571
559
  const element = rawElement;
572
560
  const elementMetadata = element["metadata"] ?? {};
573
561
  return {
574
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
575
562
  elementId: element["element_id"] ?? element["elementId"] ?? "",
576
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
577
563
  elementType: element["element_type"] ?? element["elementType"] ?? "narrative_text",
578
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
579
564
  text: element["text"] ?? "",
580
565
  metadata: {
581
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
582
566
  pageNumber: elementMetadata["page_number"] ?? elementMetadata["pageNumber"] ?? null,
583
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
584
567
  filename: elementMetadata["filename"] ?? null,
585
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
586
568
  coordinates: elementMetadata["coordinates"] ? elementMetadata["coordinates"] : null,
587
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
588
569
  elementIndex: elementMetadata["element_index"] ?? elementMetadata["elementIndex"] ?? null,
589
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
590
570
  additional: elementMetadata["additional"] ?? {}
591
571
  }
592
572
  };
@@ -609,27 +589,16 @@ function convertImage(rawImage) {
609
589
  }
610
590
  const image = rawImage;
611
591
  return {
612
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
613
592
  data: ensureUint8Array(image["data"]),
614
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
615
593
  format: image["format"] ?? "unknown",
616
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
617
594
  imageIndex: image["imageIndex"] ?? 0,
618
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
619
595
  pageNumber: image["pageNumber"] ?? null,
620
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
621
596
  width: image["width"] ?? null,
622
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
623
597
  height: image["height"] ?? null,
624
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
625
598
  colorspace: image["colorspace"] ?? null,
626
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
627
599
  bitsPerComponent: image["bitsPerComponent"] ?? null,
628
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
629
600
  isMask: image["isMask"] ?? false,
630
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
631
601
  description: image["description"] ?? null,
632
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
633
602
  ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
634
603
  };
635
604
  }
@@ -644,15 +613,10 @@ function convertPageContent(rawPage) {
644
613
  }
645
614
  const page = rawPage;
646
615
  return {
647
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
648
616
  pageNumber: page["pageNumber"] ?? 0,
649
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
650
617
  content: page["content"] ?? "",
651
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
652
618
  tables: Array.isArray(page["tables"]) ? page["tables"] : [],
653
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
654
619
  images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
655
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
656
620
  isBlank: page["isBlank"] ?? null
657
621
  };
658
622
  }
@@ -675,20 +639,15 @@ function convertResult(rawResult) {
675
639
  const metadata = result["metadata"];
676
640
  const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
677
641
  const returnObj = {
678
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
679
642
  content: result["content"] ?? "",
680
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
681
643
  mimeType: result["mimeType"] ?? "application/octet-stream",
682
644
  metadata: metadataValue,
683
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
684
645
  tables: Array.isArray(result["tables"]) ? result["tables"] : [],
685
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
686
646
  detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
687
647
  chunks: null,
688
648
  images: null,
689
649
  elements: null,
690
650
  pages: null,
691
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
692
651
  document: result["document"] ?? null
693
652
  };
694
653
  const chunksData = result["chunks"];
@@ -760,6 +719,42 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
760
719
  return rawResults.map(convertResult);
761
720
  }
762
721
 
722
+ // typescript/extraction/render.ts
723
+ function renderPdfPageSync(filePath, pageIndex, options) {
724
+ return getBinding().renderPdfPageSync(filePath, pageIndex, options?.dpi ?? null);
725
+ }
726
+ async function renderPdfPage(filePath, pageIndex, options) {
727
+ return getBinding().renderPdfPage(filePath, pageIndex, options?.dpi ?? null);
728
+ }
729
+ function iteratePdfPagesSync(filePath, options) {
730
+ return getBinding().iteratePdfPagesSync(filePath, options?.dpi ?? null);
731
+ }
732
+ async function iteratePdfPages(filePath, options) {
733
+ return getBinding().iteratePdfPages(filePath, options?.dpi ?? null);
734
+ }
735
+ function pdfPageCount(filePath) {
736
+ return getBinding().pdfPageCount(filePath);
737
+ }
738
+ var PdfPageIterator = class {
739
+ inner;
740
+ constructor(filePath, options) {
741
+ const Ctor = getBinding().JsPdfPageIterator;
742
+ this.inner = new Ctor(filePath, options?.dpi ?? null);
743
+ }
744
+ /** Advance and return the next page, or null when exhausted. */
745
+ next() {
746
+ return this.inner.next();
747
+ }
748
+ /** Total number of pages in the PDF. */
749
+ pageCount() {
750
+ return this.inner.pageCount();
751
+ }
752
+ /** Free native resources. Safe to call multiple times. */
753
+ close() {
754
+ this.inner.close();
755
+ }
756
+ };
757
+
763
758
  // typescript/extraction/single.ts
764
759
  import { readFileSync } from "fs";
765
760
  function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
@@ -1158,7 +1153,7 @@ function getEmbeddingPreset(name) {
1158
1153
  }
1159
1154
 
1160
1155
  // typescript/index.ts
1161
- var __version__ = "4.6.0";
1156
+ var __version__ = "4.6.3";
1162
1157
  export {
1163
1158
  CacheError,
1164
1159
  ErrorCode,
@@ -1168,6 +1163,7 @@ export {
1168
1163
  MissingDependencyError,
1169
1164
  OcrError,
1170
1165
  ParsingError,
1166
+ PdfPageIterator,
1171
1167
  PluginError,
1172
1168
  ValidationError,
1173
1169
  __resetBindingForTests,
@@ -1199,6 +1195,8 @@ export {
1199
1195
  getLastErrorCode,
1200
1196
  getLastPanicContext,
1201
1197
  getWorkerPoolStats,
1198
+ iteratePdfPages,
1199
+ iteratePdfPagesSync,
1202
1200
  listDocumentExtractors,
1203
1201
  listEmbeddingPresets,
1204
1202
  listOcrBackends,
@@ -1206,9 +1204,12 @@ export {
1206
1204
  listValidators,
1207
1205
  loadConfigFile,
1208
1206
  loadConfigFromPath,
1207
+ pdfPageCount,
1209
1208
  registerOcrBackend,
1210
1209
  registerPostProcessor,
1211
1210
  registerValidator,
1211
+ renderPdfPage,
1212
+ renderPdfPageSync,
1212
1213
  unregisterDocumentExtractor,
1213
1214
  unregisterOcrBackend,
1214
1215
  unregisterPostProcessor,