@kreuzberg/node 4.2.15 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/types.d.mts CHANGED
@@ -38,6 +38,124 @@ interface TesseractConfig {
38
38
  */
39
39
  tesseditCharWhitelist?: string;
40
40
  }
41
+ /**
42
+ * OCR element hierarchy level.
43
+ *
44
+ * Defines the granularity of OCR element extraction.
45
+ */
46
+ type OcrElementLevel = "word" | "line" | "block" | "page";
47
+ /**
48
+ * Bounding geometry for OCR elements using rectangle coordinates.
49
+ *
50
+ * Represents rectangular coordinates with position and dimensions.
51
+ */
52
+ interface OcrBoundingGeometryRectangle {
53
+ type: "rectangle";
54
+ left: number;
55
+ top: number;
56
+ width: number;
57
+ height: number;
58
+ }
59
+ /**
60
+ * Bounding geometry for OCR elements using quadrilateral points.
61
+ *
62
+ * Represents irregular quadrilateral shapes with four corner points.
63
+ */
64
+ interface OcrBoundingGeometryQuadrilateral {
65
+ type: "quadrilateral";
66
+ points: number[][];
67
+ }
68
+ /**
69
+ * Bounding geometry for OCR elements.
70
+ *
71
+ * Can be either rectangular or quadrilateral based on the OCR engine's detection capability.
72
+ */
73
+ type OcrBoundingGeometry = OcrBoundingGeometryRectangle | OcrBoundingGeometryQuadrilateral;
74
+ /**
75
+ * Confidence scores for OCR operations.
76
+ *
77
+ * Tracks confidence levels for different aspects of OCR processing.
78
+ */
79
+ interface OcrConfidence {
80
+ /** Confidence score (0.0-1.0) for text detection. */
81
+ detection?: number;
82
+ /** Confidence score (0.0-1.0) for text recognition. */
83
+ recognition?: number;
84
+ }
85
+ /**
86
+ * Rotation information for OCR elements.
87
+ *
88
+ * Tracks detected text rotation and associated confidence.
89
+ */
90
+ interface OcrRotation {
91
+ /** Angle of rotation in degrees. */
92
+ angleDegrees?: number;
93
+ /** Confidence score (0.0-1.0) for rotation detection. */
94
+ confidence?: number;
95
+ }
96
+ /**
97
+ * Individual OCR element (word, line, block, or page).
98
+ *
99
+ * Represents a granular unit of text extracted by OCR with geometric and confidence information.
100
+ */
101
+ interface OcrElement {
102
+ /** Extracted text content */
103
+ text: string;
104
+ /** Bounding geometry of the element in the image */
105
+ geometry?: OcrBoundingGeometry;
106
+ /** Confidence scores for detection and recognition */
107
+ confidence?: OcrConfidence;
108
+ /** Hierarchy level of this element */
109
+ level?: OcrElementLevel;
110
+ /** Rotation information if text is rotated */
111
+ rotation?: OcrRotation;
112
+ /** Page number where this element was found (1-indexed) */
113
+ pageNumber?: number;
114
+ /** Parent element ID for hierarchical relationships */
115
+ parentId?: string;
116
+ /** Backend-specific metadata that doesn't fit standard fields */
117
+ backendMetadata?: Record<string, unknown>;
118
+ }
119
+ /**
120
+ * Configuration for OCR element extraction.
121
+ *
122
+ * Controls how granular OCR elements are extracted and organized.
123
+ */
124
+ interface OcrElementConfig {
125
+ /** Enable extraction of granular OCR elements. Default: false. */
126
+ includeElements?: boolean;
127
+ /** Minimum hierarchy level to extract. Default: 'word'. */
128
+ minLevel?: OcrElementLevel;
129
+ /** Minimum confidence threshold (0.0-1.0) for including elements. Default: 0.0. */
130
+ minConfidence?: number;
131
+ /** Build hierarchical relationships between elements. Default: false. */
132
+ buildHierarchy?: boolean;
133
+ }
134
+ /**
135
+ * PaddleOCR engine configuration options.
136
+ *
137
+ * Specific configuration for the PaddleOCR backend.
138
+ */
139
+ interface PaddleOcrConfig {
140
+ /** Language code(s) for OCR (e.g., 'en', 'zh', 'multi'). */
141
+ language?: string;
142
+ /** Directory to cache downloaded OCR models. */
143
+ cacheDir?: string;
144
+ /** Enable angle classification for rotated text detection. Default: false. */
145
+ useAngleCls?: boolean;
146
+ /** Enable table structure detection. Default: false. */
147
+ enableTableDetection?: boolean;
148
+ /** Database threshold for text detection (0.0-1.0). Default: 0.3. */
149
+ detDbThresh?: number;
150
+ /** Box threshold for text detection (0.0-1.0). Default: 0.5. */
151
+ detDbBoxThresh?: number;
152
+ /** Unclip ratio for expanding detected text regions. Default: 1.5. */
153
+ detDbUnclipRatio?: number;
154
+ /** Maximum side length for detection preprocessing. Default: 960. */
155
+ detLimitSideLen?: number;
156
+ /** Batch size for text recognition. Default: 30. */
157
+ recBatchNum?: number;
158
+ }
41
159
  /**
42
160
  * OCR (Optical Character Recognition) configuration.
43
161
  *
@@ -50,6 +168,10 @@ interface OcrConfig {
50
168
  language?: string;
51
169
  /** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */
52
170
  tesseractConfig?: TesseractConfig;
171
+ /** PaddleOCR engine-specific configuration options. Only used when backend is 'paddle-ocr'. */
172
+ paddleOcrConfig?: PaddleOcrConfig;
173
+ /** OCR element extraction configuration. */
174
+ elementConfig?: OcrElementConfig;
53
175
  }
54
176
  /**
55
177
  * Document chunking configuration for splitting large documents.
@@ -344,6 +466,8 @@ interface ExtractionConfig {
344
466
  ocr?: OcrConfig;
345
467
  /** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */
346
468
  forceOcr?: boolean;
469
+ /** Include structured document tree in the extraction result. Default: false. */
470
+ includeDocumentStructure?: boolean;
347
471
  /** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */
348
472
  chunking?: ChunkingConfig;
349
473
  /** Image extraction and optimization configuration. */
@@ -564,6 +688,8 @@ interface PageInfo {
564
688
  tableCount?: number | null;
565
689
  /** Whether this page is hidden (e.g., in presentations) */
566
690
  hidden?: boolean | null;
691
+ /** Whether this page is blank (contains no meaningful content) */
692
+ isBlank?: boolean | null;
567
693
  }
568
694
  /**
569
695
  * Page structure metadata.
@@ -660,6 +786,8 @@ interface PageContent {
660
786
  tables: Table[];
661
787
  /** Images found and extracted from this page */
662
788
  images: ExtractedImage[];
789
+ /** Whether this page is blank (contains no meaningful content) */
790
+ isBlank?: boolean | null;
663
791
  }
664
792
  /**
665
793
  * Extraction result metadata.
@@ -819,6 +947,10 @@ interface ExtractionResult {
819
947
  pages?: PageContent[] | null;
820
948
  /** Extracted keywords when keyword extraction is enabled, null otherwise */
821
949
  keywords?: ExtractedKeyword[] | null;
950
+ /** Granular OCR elements (words, lines, blocks, pages) when OCR element extraction is enabled, null otherwise */
951
+ ocrElements?: OcrElement[] | null;
952
+ /** Structured document tree when `includeDocumentStructure` is enabled, null otherwise */
953
+ document?: Record<string, unknown> | null;
822
954
  }
823
955
  /** Post-processor execution stage in the extraction pipeline. */
824
956
  type ProcessingStage = "early" | "middle" | "late";
@@ -946,17 +1078,11 @@ interface ValidatorProtocol {
946
1078
  *
947
1079
  * @example
948
1080
  * ```typescript
949
- * import { GutenOcrBackend } from '@kreuzberg/node/ocr/guten-ocr';
950
1081
  * import { registerOcrBackend, extractFile } from '@kreuzberg/node';
951
1082
  *
952
- * // Create and register the backend
953
- * const backend = new GutenOcrBackend();
954
- * await backend.initialize();
955
- * registerOcrBackend(backend);
956
- *
957
- * // Use with extraction
1083
+ * // PaddleOCR is built into the native Rust core - just use the backend name
958
1084
  * const result = await extractFile('scanned.pdf', null, {
959
- * ocr: { backend: 'guten-ocr', language: 'en' }
1085
+ * ocr: { backend: 'paddle-ocr', language: 'en' }
960
1086
  * });
961
1087
  * ```
962
1088
  */
@@ -966,10 +1092,10 @@ interface OcrBackendProtocol {
966
1092
  *
967
1093
  * This name is used in ExtractionConfig to select the backend:
968
1094
  * ```typescript
969
- * { ocr: { backend: 'guten-ocr', language: 'en' } }
1095
+ * { ocr: { backend: 'paddle-ocr', language: 'en' } }
970
1096
  * ```
971
1097
  *
972
- * @returns Unique backend identifier (e.g., "guten-ocr", "tesseract")
1098
+ * @returns Unique backend identifier (e.g., "paddle-ocr", "tesseract")
973
1099
  */
974
1100
  name(): string;
975
1101
  /**
@@ -1151,4 +1277,4 @@ interface WorkerPoolStats {
1151
1277
  queuedTasks: number;
1152
1278
  }
1153
1279
 
1154
- export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
1280
+ export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrBoundingGeometry, OcrBoundingGeometryQuadrilateral, OcrBoundingGeometryRectangle, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrElementLevel, OcrMetadata, OcrRotation, PaddleOcrConfig, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
package/dist/types.d.ts CHANGED
@@ -38,6 +38,124 @@ interface TesseractConfig {
38
38
  */
39
39
  tesseditCharWhitelist?: string;
40
40
  }
41
+ /**
42
+ * OCR element hierarchy level.
43
+ *
44
+ * Defines the granularity of OCR element extraction.
45
+ */
46
+ type OcrElementLevel = "word" | "line" | "block" | "page";
47
+ /**
48
+ * Bounding geometry for OCR elements using rectangle coordinates.
49
+ *
50
+ * Represents rectangular coordinates with position and dimensions.
51
+ */
52
+ interface OcrBoundingGeometryRectangle {
53
+ type: "rectangle";
54
+ left: number;
55
+ top: number;
56
+ width: number;
57
+ height: number;
58
+ }
59
+ /**
60
+ * Bounding geometry for OCR elements using quadrilateral points.
61
+ *
62
+ * Represents irregular quadrilateral shapes with four corner points.
63
+ */
64
+ interface OcrBoundingGeometryQuadrilateral {
65
+ type: "quadrilateral";
66
+ points: number[][];
67
+ }
68
+ /**
69
+ * Bounding geometry for OCR elements.
70
+ *
71
+ * Can be either rectangular or quadrilateral based on the OCR engine's detection capability.
72
+ */
73
+ type OcrBoundingGeometry = OcrBoundingGeometryRectangle | OcrBoundingGeometryQuadrilateral;
74
+ /**
75
+ * Confidence scores for OCR operations.
76
+ *
77
+ * Tracks confidence levels for different aspects of OCR processing.
78
+ */
79
+ interface OcrConfidence {
80
+ /** Confidence score (0.0-1.0) for text detection. */
81
+ detection?: number;
82
+ /** Confidence score (0.0-1.0) for text recognition. */
83
+ recognition?: number;
84
+ }
85
+ /**
86
+ * Rotation information for OCR elements.
87
+ *
88
+ * Tracks detected text rotation and associated confidence.
89
+ */
90
+ interface OcrRotation {
91
+ /** Angle of rotation in degrees. */
92
+ angleDegrees?: number;
93
+ /** Confidence score (0.0-1.0) for rotation detection. */
94
+ confidence?: number;
95
+ }
96
+ /**
97
+ * Individual OCR element (word, line, block, or page).
98
+ *
99
+ * Represents a granular unit of text extracted by OCR with geometric and confidence information.
100
+ */
101
+ interface OcrElement {
102
+ /** Extracted text content */
103
+ text: string;
104
+ /** Bounding geometry of the element in the image */
105
+ geometry?: OcrBoundingGeometry;
106
+ /** Confidence scores for detection and recognition */
107
+ confidence?: OcrConfidence;
108
+ /** Hierarchy level of this element */
109
+ level?: OcrElementLevel;
110
+ /** Rotation information if text is rotated */
111
+ rotation?: OcrRotation;
112
+ /** Page number where this element was found (1-indexed) */
113
+ pageNumber?: number;
114
+ /** Parent element ID for hierarchical relationships */
115
+ parentId?: string;
116
+ /** Backend-specific metadata that doesn't fit standard fields */
117
+ backendMetadata?: Record<string, unknown>;
118
+ }
119
+ /**
120
+ * Configuration for OCR element extraction.
121
+ *
122
+ * Controls how granular OCR elements are extracted and organized.
123
+ */
124
+ interface OcrElementConfig {
125
+ /** Enable extraction of granular OCR elements. Default: false. */
126
+ includeElements?: boolean;
127
+ /** Minimum hierarchy level to extract. Default: 'word'. */
128
+ minLevel?: OcrElementLevel;
129
+ /** Minimum confidence threshold (0.0-1.0) for including elements. Default: 0.0. */
130
+ minConfidence?: number;
131
+ /** Build hierarchical relationships between elements. Default: false. */
132
+ buildHierarchy?: boolean;
133
+ }
134
+ /**
135
+ * PaddleOCR engine configuration options.
136
+ *
137
+ * Specific configuration for the PaddleOCR backend.
138
+ */
139
+ interface PaddleOcrConfig {
140
+ /** Language code(s) for OCR (e.g., 'en', 'zh', 'multi'). */
141
+ language?: string;
142
+ /** Directory to cache downloaded OCR models. */
143
+ cacheDir?: string;
144
+ /** Enable angle classification for rotated text detection. Default: false. */
145
+ useAngleCls?: boolean;
146
+ /** Enable table structure detection. Default: false. */
147
+ enableTableDetection?: boolean;
148
+ /** Database threshold for text detection (0.0-1.0). Default: 0.3. */
149
+ detDbThresh?: number;
150
+ /** Box threshold for text detection (0.0-1.0). Default: 0.5. */
151
+ detDbBoxThresh?: number;
152
+ /** Unclip ratio for expanding detected text regions. Default: 1.5. */
153
+ detDbUnclipRatio?: number;
154
+ /** Maximum side length for detection preprocessing. Default: 960. */
155
+ detLimitSideLen?: number;
156
+ /** Batch size for text recognition. Default: 30. */
157
+ recBatchNum?: number;
158
+ }
41
159
  /**
42
160
  * OCR (Optical Character Recognition) configuration.
43
161
  *
@@ -50,6 +168,10 @@ interface OcrConfig {
50
168
  language?: string;
51
169
  /** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */
52
170
  tesseractConfig?: TesseractConfig;
171
+ /** PaddleOCR engine-specific configuration options. Only used when backend is 'paddle-ocr'. */
172
+ paddleOcrConfig?: PaddleOcrConfig;
173
+ /** OCR element extraction configuration. */
174
+ elementConfig?: OcrElementConfig;
53
175
  }
54
176
  /**
55
177
  * Document chunking configuration for splitting large documents.
@@ -344,6 +466,8 @@ interface ExtractionConfig {
344
466
  ocr?: OcrConfig;
345
467
  /** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */
346
468
  forceOcr?: boolean;
469
+ /** Include structured document tree in the extraction result. Default: false. */
470
+ includeDocumentStructure?: boolean;
347
471
  /** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */
348
472
  chunking?: ChunkingConfig;
349
473
  /** Image extraction and optimization configuration. */
@@ -564,6 +688,8 @@ interface PageInfo {
564
688
  tableCount?: number | null;
565
689
  /** Whether this page is hidden (e.g., in presentations) */
566
690
  hidden?: boolean | null;
691
+ /** Whether this page is blank (contains no meaningful content) */
692
+ isBlank?: boolean | null;
567
693
  }
568
694
  /**
569
695
  * Page structure metadata.
@@ -660,6 +786,8 @@ interface PageContent {
660
786
  tables: Table[];
661
787
  /** Images found and extracted from this page */
662
788
  images: ExtractedImage[];
789
+ /** Whether this page is blank (contains no meaningful content) */
790
+ isBlank?: boolean | null;
663
791
  }
664
792
  /**
665
793
  * Extraction result metadata.
@@ -819,6 +947,10 @@ interface ExtractionResult {
819
947
  pages?: PageContent[] | null;
820
948
  /** Extracted keywords when keyword extraction is enabled, null otherwise */
821
949
  keywords?: ExtractedKeyword[] | null;
950
+ /** Granular OCR elements (words, lines, blocks, pages) when OCR element extraction is enabled, null otherwise */
951
+ ocrElements?: OcrElement[] | null;
952
+ /** Structured document tree when `includeDocumentStructure` is enabled, null otherwise */
953
+ document?: Record<string, unknown> | null;
822
954
  }
823
955
  /** Post-processor execution stage in the extraction pipeline. */
824
956
  type ProcessingStage = "early" | "middle" | "late";
@@ -946,17 +1078,11 @@ interface ValidatorProtocol {
946
1078
  *
947
1079
  * @example
948
1080
  * ```typescript
949
- * import { GutenOcrBackend } from '@kreuzberg/node/ocr/guten-ocr';
950
1081
  * import { registerOcrBackend, extractFile } from '@kreuzberg/node';
951
1082
  *
952
- * // Create and register the backend
953
- * const backend = new GutenOcrBackend();
954
- * await backend.initialize();
955
- * registerOcrBackend(backend);
956
- *
957
- * // Use with extraction
1083
+ * // PaddleOCR is built into the native Rust core - just use the backend name
958
1084
  * const result = await extractFile('scanned.pdf', null, {
959
- * ocr: { backend: 'guten-ocr', language: 'en' }
1085
+ * ocr: { backend: 'paddle-ocr', language: 'en' }
960
1086
  * });
961
1087
  * ```
962
1088
  */
@@ -966,10 +1092,10 @@ interface OcrBackendProtocol {
966
1092
  *
967
1093
  * This name is used in ExtractionConfig to select the backend:
968
1094
  * ```typescript
969
- * { ocr: { backend: 'guten-ocr', language: 'en' } }
1095
+ * { ocr: { backend: 'paddle-ocr', language: 'en' } }
970
1096
  * ```
971
1097
  *
972
- * @returns Unique backend identifier (e.g., "guten-ocr", "tesseract")
1098
+ * @returns Unique backend identifier (e.g., "paddle-ocr", "tesseract")
973
1099
  */
974
1100
  name(): string;
975
1101
  /**
@@ -1151,4 +1277,4 @@ interface WorkerPoolStats {
1151
1277
  queuedTasks: number;
1152
1278
  }
1153
1279
 
1154
- export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
1280
+ export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrBoundingGeometry, OcrBoundingGeometryQuadrilateral, OcrBoundingGeometryRectangle, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrElementLevel, OcrMetadata, OcrRotation, PaddleOcrConfig, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };