@kreuzberg/node 4.6.1 → 4.7.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -506,6 +506,7 @@ function convertChunk(rawChunk) {
506
506
  if (!rawChunk || typeof rawChunk !== "object") {
507
507
  return {
508
508
  content: "",
509
+ chunkType: null,
509
510
  metadata: {
510
511
  byteStart: 0,
511
512
  byteEnd: 0,
@@ -519,26 +520,17 @@ function convertChunk(rawChunk) {
519
520
  const chunk = rawChunk;
520
521
  const metadata = chunk["metadata"] ?? {};
521
522
  return {
522
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
523
523
  content: chunk["content"] ?? "",
524
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
524
+ chunkType: chunk["chunk_type"] ?? chunk["chunkType"] ?? null,
525
525
  embedding: chunk["embedding"] ?? null,
526
526
  metadata: {
527
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
528
527
  byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
529
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
530
528
  byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
531
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
532
529
  tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
533
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
534
530
  chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
535
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
536
531
  totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
537
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
538
532
  firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
539
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
540
533
  lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
541
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
542
534
  headingContext: (() => {
543
535
  const hc = metadata["heading_context"] ?? metadata["headingContext"];
544
536
  if (!hc) return null;
@@ -548,9 +540,7 @@ function convertChunk(rawChunk) {
548
540
  headings: headings.map((h) => {
549
541
  const heading = h;
550
542
  return {
551
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
552
543
  level: heading["level"] ?? 0,
553
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
554
544
  text: heading["text"] ?? ""
555
545
  };
556
546
  })
@@ -571,22 +561,14 @@ function convertElement(rawElement) {
571
561
  const element = rawElement;
572
562
  const elementMetadata = element["metadata"] ?? {};
573
563
  return {
574
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
575
564
  elementId: element["element_id"] ?? element["elementId"] ?? "",
576
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
577
565
  elementType: element["element_type"] ?? element["elementType"] ?? "narrative_text",
578
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
579
566
  text: element["text"] ?? "",
580
567
  metadata: {
581
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
582
568
  pageNumber: elementMetadata["page_number"] ?? elementMetadata["pageNumber"] ?? null,
583
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
584
569
  filename: elementMetadata["filename"] ?? null,
585
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
586
570
  coordinates: elementMetadata["coordinates"] ? elementMetadata["coordinates"] : null,
587
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
588
571
  elementIndex: elementMetadata["element_index"] ?? elementMetadata["elementIndex"] ?? null,
589
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
590
572
  additional: elementMetadata["additional"] ?? {}
591
573
  }
592
574
  };
@@ -609,27 +591,16 @@ function convertImage(rawImage) {
609
591
  }
610
592
  const image = rawImage;
611
593
  return {
612
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
613
594
  data: ensureUint8Array(image["data"]),
614
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
615
595
  format: image["format"] ?? "unknown",
616
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
617
596
  imageIndex: image["imageIndex"] ?? 0,
618
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
619
597
  pageNumber: image["pageNumber"] ?? null,
620
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
621
598
  width: image["width"] ?? null,
622
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
623
599
  height: image["height"] ?? null,
624
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
625
600
  colorspace: image["colorspace"] ?? null,
626
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
627
601
  bitsPerComponent: image["bitsPerComponent"] ?? null,
628
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
629
602
  isMask: image["isMask"] ?? false,
630
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
631
603
  description: image["description"] ?? null,
632
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
633
604
  ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
634
605
  };
635
606
  }
@@ -644,15 +615,10 @@ function convertPageContent(rawPage) {
644
615
  }
645
616
  const page = rawPage;
646
617
  return {
647
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
648
618
  pageNumber: page["pageNumber"] ?? 0,
649
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
650
619
  content: page["content"] ?? "",
651
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
652
620
  tables: Array.isArray(page["tables"]) ? page["tables"] : [],
653
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
654
621
  images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
655
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
656
622
  isBlank: page["isBlank"] ?? null
657
623
  };
658
624
  }
@@ -675,20 +641,15 @@ function convertResult(rawResult) {
675
641
  const metadata = result["metadata"];
676
642
  const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
677
643
  const returnObj = {
678
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
679
644
  content: result["content"] ?? "",
680
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
681
645
  mimeType: result["mimeType"] ?? "application/octet-stream",
682
646
  metadata: metadataValue,
683
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
684
647
  tables: Array.isArray(result["tables"]) ? result["tables"] : [],
685
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
686
648
  detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
687
649
  chunks: null,
688
650
  images: null,
689
651
  elements: null,
690
652
  pages: null,
691
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
692
653
  document: result["document"] ?? null
693
654
  };
694
655
  const chunksData = result["chunks"];
@@ -760,6 +721,42 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
760
721
  return rawResults.map(convertResult);
761
722
  }
762
723
 
724
+ // typescript/extraction/render.ts
725
+ function renderPdfPageSync(filePath, pageIndex, options) {
726
+ return getBinding().renderPdfPageSync(filePath, pageIndex, options?.dpi ?? null);
727
+ }
728
+ async function renderPdfPage(filePath, pageIndex, options) {
729
+ return getBinding().renderPdfPage(filePath, pageIndex, options?.dpi ?? null);
730
+ }
731
+ function iteratePdfPagesSync(filePath, options) {
732
+ return getBinding().iteratePdfPagesSync(filePath, options?.dpi ?? null);
733
+ }
734
+ async function iteratePdfPages(filePath, options) {
735
+ return getBinding().iteratePdfPages(filePath, options?.dpi ?? null);
736
+ }
737
+ function pdfPageCount(filePath) {
738
+ return getBinding().pdfPageCount(filePath);
739
+ }
740
+ var PdfPageIterator = class {
741
+ inner;
742
+ constructor(filePath, options) {
743
+ const Ctor = getBinding().JsPdfPageIterator;
744
+ this.inner = new Ctor(filePath, options?.dpi ?? null);
745
+ }
746
+ /** Advance and return the next page, or null when exhausted. */
747
+ next() {
748
+ return this.inner.next();
749
+ }
750
+ /** Total number of pages in the PDF. */
751
+ pageCount() {
752
+ return this.inner.pageCount();
753
+ }
754
+ /** Free native resources. Safe to call multiple times. */
755
+ close() {
756
+ this.inner.close();
757
+ }
758
+ };
759
+
763
760
  // typescript/extraction/single.ts
764
761
  import { readFileSync } from "fs";
765
762
  function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
@@ -1158,7 +1155,7 @@ function getEmbeddingPreset(name) {
1158
1155
  }
1159
1156
 
1160
1157
  // typescript/index.ts
1161
- var __version__ = "4.6.1";
1158
+ var __version__ = "4.7.0";
1162
1159
  export {
1163
1160
  CacheError,
1164
1161
  ErrorCode,
@@ -1168,6 +1165,7 @@ export {
1168
1165
  MissingDependencyError,
1169
1166
  OcrError,
1170
1167
  ParsingError,
1168
+ PdfPageIterator,
1171
1169
  PluginError,
1172
1170
  ValidationError,
1173
1171
  __resetBindingForTests,
@@ -1199,6 +1197,8 @@ export {
1199
1197
  getLastErrorCode,
1200
1198
  getLastPanicContext,
1201
1199
  getWorkerPoolStats,
1200
+ iteratePdfPages,
1201
+ iteratePdfPagesSync,
1202
1202
  listDocumentExtractors,
1203
1203
  listEmbeddingPresets,
1204
1204
  listOcrBackends,
@@ -1206,9 +1206,12 @@ export {
1206
1206
  listValidators,
1207
1207
  loadConfigFile,
1208
1208
  loadConfigFromPath,
1209
+ pdfPageCount,
1209
1210
  registerOcrBackend,
1210
1211
  registerPostProcessor,
1211
1212
  registerValidator,
1213
+ renderPdfPage,
1214
+ renderPdfPageSync,
1212
1215
  unregisterDocumentExtractor,
1213
1216
  unregisterOcrBackend,
1214
1217
  unregisterPostProcessor,