npm - @kreuzberg/node - Versions diffs - 4.2.15 → 4.3.1 - Mend

@kreuzberg/node 4.2.15 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/README.md +9 -11
package/dist/errors.d.mts +2 -3
package/dist/errors.d.ts +2 -3
package/dist/errors.js.map +1 -1
package/dist/errors.mjs.map +1 -1
package/dist/index.d.mts +5 -14
package/dist/index.d.ts +5 -14
package/dist/index.js +19 -215
package/dist/index.js.map +1 -1
package/dist/index.mjs +19 -204
package/dist/index.mjs.map +1 -1
package/dist/types.d.mts +137 -11
package/dist/types.d.ts +137 -11
package/dist/types.js.map +1 -1
package/index.d.ts +27 -0
package/index.js +52 -52
package/package.json +11 -9
package/dist/ocr/guten-ocr.d.mts +0 -193
package/dist/ocr/guten-ocr.d.ts +0 -193
package/dist/ocr/guten-ocr.js +0 -234
package/dist/ocr/guten-ocr.js.map +0 -1
package/dist/ocr/guten-ocr.mjs +0 -199
package/dist/ocr/guten-ocr.mjs.map +0 -1

package/dist/types.d.mts CHANGED Viewed

@@ -38,6 +38,124 @@ interface TesseractConfig {
      */
     tesseditCharWhitelist?: string;
 }
+/**
+ * OCR element hierarchy level.
+ *
+ * Defines the granularity of OCR element extraction.
+ */
+type OcrElementLevel = "word" | "line" | "block" | "page";
+/**
+ * Bounding geometry for OCR elements using rectangle coordinates.
+ *
+ * Represents rectangular coordinates with position and dimensions.
+ */
+interface OcrBoundingGeometryRectangle {
+    type: "rectangle";
+    left: number;
+    top: number;
+    width: number;
+    height: number;
+}
+/**
+ * Bounding geometry for OCR elements using quadrilateral points.
+ *
+ * Represents irregular quadrilateral shapes with four corner points.
+ */
+interface OcrBoundingGeometryQuadrilateral {
+    type: "quadrilateral";
+    points: number[][];
+}
+/**
+ * Bounding geometry for OCR elements.
+ *
+ * Can be either rectangular or quadrilateral based on the OCR engine's detection capability.
+ */
+type OcrBoundingGeometry = OcrBoundingGeometryRectangle | OcrBoundingGeometryQuadrilateral;
+/**
+ * Confidence scores for OCR operations.
+ *
+ * Tracks confidence levels for different aspects of OCR processing.
+ */
+interface OcrConfidence {
+    /** Confidence score (0.0-1.0) for text detection. */
+    detection?: number;
+    /** Confidence score (0.0-1.0) for text recognition. */
+    recognition?: number;
+}
+/**
+ * Rotation information for OCR elements.
+ *
+ * Tracks detected text rotation and associated confidence.
+ */
+interface OcrRotation {
+    /** Angle of rotation in degrees. */
+    angleDegrees?: number;
+    /** Confidence score (0.0-1.0) for rotation detection. */
+    confidence?: number;
+}
+/**
+ * Individual OCR element (word, line, block, or page).
+ *
+ * Represents a granular unit of text extracted by OCR with geometric and confidence information.
+ */
+interface OcrElement {
+    /** Extracted text content */
+    text: string;
+    /** Bounding geometry of the element in the image */
+    geometry?: OcrBoundingGeometry;
+    /** Confidence scores for detection and recognition */
+    confidence?: OcrConfidence;
+    /** Hierarchy level of this element */
+    level?: OcrElementLevel;
+    /** Rotation information if text is rotated */
+    rotation?: OcrRotation;
+    /** Page number where this element was found (1-indexed) */
+    pageNumber?: number;
+    /** Parent element ID for hierarchical relationships */
+    parentId?: string;
+    /** Backend-specific metadata that doesn't fit standard fields */
+    backendMetadata?: Record<string, unknown>;
+}
+/**
+ * Configuration for OCR element extraction.
+ *
+ * Controls how granular OCR elements are extracted and organized.
+ */
+interface OcrElementConfig {
+    /** Enable extraction of granular OCR elements. Default: false. */
+    includeElements?: boolean;
+    /** Minimum hierarchy level to extract. Default: 'word'. */
+    minLevel?: OcrElementLevel;
+    /** Minimum confidence threshold (0.0-1.0) for including elements. Default: 0.0. */
+    minConfidence?: number;
+    /** Build hierarchical relationships between elements. Default: false. */
+    buildHierarchy?: boolean;
+}
+/**
+ * PaddleOCR engine configuration options.
+ *
+ * Specific configuration for the PaddleOCR backend.
+ */
+interface PaddleOcrConfig {
+    /** Language code(s) for OCR (e.g., 'en', 'zh', 'multi'). */
+    language?: string;
+    /** Directory to cache downloaded OCR models. */
+    cacheDir?: string;
+    /** Enable angle classification for rotated text detection. Default: false. */
+    useAngleCls?: boolean;
+    /** Enable table structure detection. Default: false. */
+    enableTableDetection?: boolean;
+    /** Database threshold for text detection (0.0-1.0). Default: 0.3. */
+    detDbThresh?: number;
+    /** Box threshold for text detection (0.0-1.0). Default: 0.5. */
+    detDbBoxThresh?: number;
+    /** Unclip ratio for expanding detected text regions. Default: 1.5. */
+    detDbUnclipRatio?: number;
+    /** Maximum side length for detection preprocessing. Default: 960. */
+    detLimitSideLen?: number;
+    /** Batch size for text recognition. Default: 30. */
+    recBatchNum?: number;
+}
 /**
  * OCR (Optical Character Recognition) configuration.
  *
@@ -50,6 +168,10 @@ interface OcrConfig {
     language?: string;
     /** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */
     tesseractConfig?: TesseractConfig;
+    /** PaddleOCR engine-specific configuration options. Only used when backend is 'paddle-ocr'. */
+    paddleOcrConfig?: PaddleOcrConfig;
+    /** OCR element extraction configuration. */
+    elementConfig?: OcrElementConfig;
 }
 /**
  * Document chunking configuration for splitting large documents.
@@ -344,6 +466,8 @@ interface ExtractionConfig {
     ocr?: OcrConfig;
     /** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */
     forceOcr?: boolean;
+    /** Include structured document tree in the extraction result. Default: false. */
+    includeDocumentStructure?: boolean;
     /** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */
     chunking?: ChunkingConfig;
     /** Image extraction and optimization configuration. */
@@ -564,6 +688,8 @@ interface PageInfo {
     tableCount?: number | null;
     /** Whether this page is hidden (e.g., in presentations) */
     hidden?: boolean | null;
+    /** Whether this page is blank (contains no meaningful content) */
+    isBlank?: boolean | null;
 }
 /**
  * Page structure metadata.
@@ -660,6 +786,8 @@ interface PageContent {
     tables: Table[];
     /** Images found and extracted from this page */
     images: ExtractedImage[];
+    /** Whether this page is blank (contains no meaningful content) */
+    isBlank?: boolean | null;
 }
 /**
  * Extraction result metadata.
@@ -819,6 +947,10 @@ interface ExtractionResult {
     pages?: PageContent[] | null;
     /** Extracted keywords when keyword extraction is enabled, null otherwise */
     keywords?: ExtractedKeyword[] | null;
+    /** Granular OCR elements (words, lines, blocks) when OCR element extraction is enabled, null otherwise */
+    ocrElements?: OcrElement[] | null;
+    /** Structured document tree when includeDocumentStructure is enabled, null otherwise */
+    document?: Record<string, unknown> | null;
 }
 /** Post-processor execution stage in the extraction pipeline. */
 type ProcessingStage = "early" | "middle" | "late";
@@ -946,17 +1078,11 @@ interface ValidatorProtocol {
  *
  * @example
  * ```typescript
- * import { GutenOcrBackend } from '@kreuzberg/node/ocr/guten-ocr';
  * import { registerOcrBackend, extractFile } from '@kreuzberg/node';
  *
- * // Create and register the backend
- * const backend = new GutenOcrBackend();
- * await backend.initialize();
- * registerOcrBackend(backend);
- *
- * // Use with extraction
+ * // PaddleOCR is built into the native Rust core - just use the backend name
  * const result = await extractFile('scanned.pdf', null, {
- *   ocr: { backend: 'guten-ocr', language: 'en' }
+ *   ocr: { backend: 'paddle-ocr', language: 'en' }
  * });
  * ```
  */
@@ -966,10 +1092,10 @@ interface OcrBackendProtocol {
      *
      * This name is used in ExtractionConfig to select the backend:
      * ```typescript
-     * { ocr: { backend: 'guten-ocr', language: 'en' } }
+     * { ocr: { backend: 'paddle-ocr', language: 'en' } }
      * ```
      *
-     * @returns Unique backend identifier (e.g., "guten-ocr", "tesseract")
+     * @returns Unique backend identifier (e.g., "paddle-ocr", "tesseract")
      */
     name(): string;
     /**
@@ -1151,4 +1277,4 @@ interface WorkerPoolStats {
     queuedTasks: number;
 }
-export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
+export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrBoundingGeometry, OcrBoundingGeometryQuadrilateral, OcrBoundingGeometryRectangle, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrElementLevel, OcrMetadata, OcrRotation, PaddleOcrConfig, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };

package/dist/types.d.ts CHANGED Viewed

@@ -38,6 +38,124 @@ interface TesseractConfig {
      */
     tesseditCharWhitelist?: string;
 }
+/**
+ * OCR element hierarchy level.
+ *
+ * Defines the granularity of OCR element extraction.
+ */
+type OcrElementLevel = "word" | "line" | "block" | "page";
+/**
+ * Bounding geometry for OCR elements using rectangle coordinates.
+ *
+ * Represents rectangular coordinates with position and dimensions.
+ */
+interface OcrBoundingGeometryRectangle {
+    type: "rectangle";
+    left: number;
+    top: number;
+    width: number;
+    height: number;
+}
+/**
+ * Bounding geometry for OCR elements using quadrilateral points.
+ *
+ * Represents irregular quadrilateral shapes with four corner points.
+ */
+interface OcrBoundingGeometryQuadrilateral {
+    type: "quadrilateral";
+    points: number[][];
+}
+/**
+ * Bounding geometry for OCR elements.
+ *
+ * Can be either rectangular or quadrilateral based on the OCR engine's detection capability.
+ */
+type OcrBoundingGeometry = OcrBoundingGeometryRectangle | OcrBoundingGeometryQuadrilateral;
+/**
+ * Confidence scores for OCR operations.
+ *
+ * Tracks confidence levels for different aspects of OCR processing.
+ */
+interface OcrConfidence {
+    /** Confidence score (0.0-1.0) for text detection. */
+    detection?: number;
+    /** Confidence score (0.0-1.0) for text recognition. */
+    recognition?: number;
+}
+/**
+ * Rotation information for OCR elements.
+ *
+ * Tracks detected text rotation and associated confidence.
+ */
+interface OcrRotation {
+    /** Angle of rotation in degrees. */
+    angleDegrees?: number;
+    /** Confidence score (0.0-1.0) for rotation detection. */
+    confidence?: number;
+}
+/**
+ * Individual OCR element (word, line, block, or page).
+ *
+ * Represents a granular unit of text extracted by OCR with geometric and confidence information.
+ */
+interface OcrElement {
+    /** Extracted text content */
+    text: string;
+    /** Bounding geometry of the element in the image */
+    geometry?: OcrBoundingGeometry;
+    /** Confidence scores for detection and recognition */
+    confidence?: OcrConfidence;
+    /** Hierarchy level of this element */
+    level?: OcrElementLevel;
+    /** Rotation information if text is rotated */
+    rotation?: OcrRotation;
+    /** Page number where this element was found (1-indexed) */
+    pageNumber?: number;
+    /** Parent element ID for hierarchical relationships */
+    parentId?: string;
+    /** Backend-specific metadata that doesn't fit standard fields */
+    backendMetadata?: Record<string, unknown>;
+}
+/**
+ * Configuration for OCR element extraction.
+ *
+ * Controls how granular OCR elements are extracted and organized.
+ */
+interface OcrElementConfig {
+    /** Enable extraction of granular OCR elements. Default: false. */
+    includeElements?: boolean;
+    /** Minimum hierarchy level to extract. Default: 'word'. */
+    minLevel?: OcrElementLevel;
+    /** Minimum confidence threshold (0.0-1.0) for including elements. Default: 0.0. */
+    minConfidence?: number;
+    /** Build hierarchical relationships between elements. Default: false. */
+    buildHierarchy?: boolean;
+}
+/**
+ * PaddleOCR engine configuration options.
+ *
+ * Specific configuration for the PaddleOCR backend.
+ */
+interface PaddleOcrConfig {
+    /** Language code(s) for OCR (e.g., 'en', 'zh', 'multi'). */
+    language?: string;
+    /** Directory to cache downloaded OCR models. */
+    cacheDir?: string;
+    /** Enable angle classification for rotated text detection. Default: false. */
+    useAngleCls?: boolean;
+    /** Enable table structure detection. Default: false. */
+    enableTableDetection?: boolean;
+    /** Database threshold for text detection (0.0-1.0). Default: 0.3. */
+    detDbThresh?: number;
+    /** Box threshold for text detection (0.0-1.0). Default: 0.5. */
+    detDbBoxThresh?: number;
+    /** Unclip ratio for expanding detected text regions. Default: 1.5. */
+    detDbUnclipRatio?: number;
+    /** Maximum side length for detection preprocessing. Default: 960. */
+    detLimitSideLen?: number;
+    /** Batch size for text recognition. Default: 30. */
+    recBatchNum?: number;
+}
 /**
  * OCR (Optical Character Recognition) configuration.
  *
@@ -50,6 +168,10 @@ interface OcrConfig {
     language?: string;
     /** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */
     tesseractConfig?: TesseractConfig;
+    /** PaddleOCR engine-specific configuration options. Only used when backend is 'paddle-ocr'. */
+    paddleOcrConfig?: PaddleOcrConfig;
+    /** OCR element extraction configuration. */
+    elementConfig?: OcrElementConfig;
 }
 /**
  * Document chunking configuration for splitting large documents.
@@ -344,6 +466,8 @@ interface ExtractionConfig {
     ocr?: OcrConfig;
     /** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */
     forceOcr?: boolean;
+    /** Include structured document tree in the extraction result. Default: false. */
+    includeDocumentStructure?: boolean;
     /** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */
     chunking?: ChunkingConfig;
     /** Image extraction and optimization configuration. */
@@ -564,6 +688,8 @@ interface PageInfo {
     tableCount?: number | null;
     /** Whether this page is hidden (e.g., in presentations) */
     hidden?: boolean | null;
+    /** Whether this page is blank (contains no meaningful content) */
+    isBlank?: boolean | null;
 }
 /**
  * Page structure metadata.
@@ -660,6 +786,8 @@ interface PageContent {
     tables: Table[];
     /** Images found and extracted from this page */
     images: ExtractedImage[];
+    /** Whether this page is blank (contains no meaningful content) */
+    isBlank?: boolean | null;
 }
 /**
  * Extraction result metadata.
@@ -819,6 +947,10 @@ interface ExtractionResult {
     pages?: PageContent[] | null;
     /** Extracted keywords when keyword extraction is enabled, null otherwise */
     keywords?: ExtractedKeyword[] | null;
+    /** Granular OCR elements (words, lines, blocks) when OCR element extraction is enabled, null otherwise */
+    ocrElements?: OcrElement[] | null;
+    /** Structured document tree when includeDocumentStructure is enabled, null otherwise */
+    document?: Record<string, unknown> | null;
 }
 /** Post-processor execution stage in the extraction pipeline. */
 type ProcessingStage = "early" | "middle" | "late";
@@ -946,17 +1078,11 @@ interface ValidatorProtocol {
  *
  * @example
  * ```typescript
- * import { GutenOcrBackend } from '@kreuzberg/node/ocr/guten-ocr';
  * import { registerOcrBackend, extractFile } from '@kreuzberg/node';
  *
- * // Create and register the backend
- * const backend = new GutenOcrBackend();
- * await backend.initialize();
- * registerOcrBackend(backend);
- *
- * // Use with extraction
+ * // PaddleOCR is built into the native Rust core - just use the backend name
  * const result = await extractFile('scanned.pdf', null, {
- *   ocr: { backend: 'guten-ocr', language: 'en' }
+ *   ocr: { backend: 'paddle-ocr', language: 'en' }
  * });
  * ```
  */
@@ -966,10 +1092,10 @@ interface OcrBackendProtocol {
      *
      * This name is used in ExtractionConfig to select the backend:
      * ```typescript
-     * { ocr: { backend: 'guten-ocr', language: 'en' } }
+     * { ocr: { backend: 'paddle-ocr', language: 'en' } }
      * ```
      *
-     * @returns Unique backend identifier (e.g., "guten-ocr", "tesseract")
+     * @returns Unique backend identifier (e.g., "paddle-ocr", "tesseract")
      */
     name(): string;
     /**
@@ -1151,4 +1277,4 @@ interface WorkerPoolStats {
     queuedTasks: number;
 }
-export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
+export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrBoundingGeometry, OcrBoundingGeometryQuadrilateral, OcrBoundingGeometryRectangle, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrElementLevel, OcrMetadata, OcrRotation, PaddleOcrConfig, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };