npm - @kreuzberg/node - Versions diffs - 4.0.0-rc.21 → 4.0.0-rc.24 - Mend

@kreuzberg/node 4.0.0-rc.21 → 4.0.0-rc.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/dist/types.d.mts CHANGED

@@ -107,6 +107,21 @@ interface TokenReductionConfig {
     /** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
     preserveImportantWords?: boolean;
 }
+/**
+ * Hierarchy extraction configuration.
+ *
+ * Controls document hierarchy detection based on font size clustering.
+ */
+interface HierarchyConfig {
+    /** Enable hierarchy extraction. Default: true. */
+    enabled?: boolean;
+    /** Number of font size clusters (2-10). Default: 6. */
+    kClusters?: number;
+    /** Include bounding box information. Default: true. */
+    includeBbox?: boolean;
+    /** OCR coverage threshold (0.0-1.0). Default: null. */
+    ocrCoverageThreshold?: number | null;
+}
 /**
  * PDF-specific extraction configuration.
  *
@@ -119,6 +134,8 @@ interface PdfConfig {
     passwords?: string[];
     /** Extract document metadata (title, author, creation date, etc.). Default: true. */
     extractMetadata?: boolean;
+    /** Hierarchy extraction configuration. */
+    hierarchy?: HierarchyConfig;
 }
 /**
  * Image extraction and processing configuration.
@@ -281,6 +298,22 @@ interface KeywordConfig {
     /** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
     rakeParams?: RakeParams;
 }
+/**
+ * Extracted keyword with relevance metadata.
+ *
+ * Represents a single keyword extracted from text along with its relevance score,
+ * the algorithm that extracted it, and optional position information.
+ */
+interface ExtractedKeyword {
+    /** The keyword text */
+    text: string;
+    /** Relevance score (higher is better, algorithm-specific range) */
+    score: number;
+    /** Algorithm that extracted this keyword */
+    algorithm: KeywordAlgorithm;
+    /** Optional positions where keyword appears in text (character offsets) */
+    positions?: number[];
+}
 /**
  * Page tracking and extraction configuration.
  *
@@ -288,7 +321,7 @@ interface KeywordConfig {
  * Page range information in chunk metadata (first_page/last_page) is automatically
  * enabled when page boundaries are available and chunking is configured.
  */
-interface PageConfig {
+interface PageExtractionConfig {
     /** Extract pages as separate array (ExtractionResult.pages) */
     extractPages?: boolean;
     /** Insert page markers in main content string */
@@ -328,7 +361,7 @@ interface ExtractionConfig {
     /** Keyword extraction configuration for extracting important phrases. */
     keywords?: KeywordConfig;
     /** Page tracking and extraction configuration for multi-page documents. */
-    pages?: PageConfig;
+    pages?: PageExtractionConfig;
     /** Maximum number of concurrent extractions in batch operations. Default: 4. */
     maxConcurrentExtractions?: number;
 }
@@ -383,28 +416,50 @@ interface TextMetadata {
     links?: [string, string][] | null;
     codeBlocks?: [string, string][] | null;
 }
+interface HeaderMetadata {
+    level: number;
+    text: string;
+    id?: string | null;
+    depth: number;
+    htmlOffset: number;
+}
+interface LinkMetadata {
+    href: string;
+    text: string;
+    title?: string | null;
+    linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
+    rel: string[];
+    attributes: Record<string, string>;
+}
+interface HtmlImageMetadata {
+    src: string;
+    alt?: string | null;
+    title?: string | null;
+    dimensions?: [number, number] | null;
+    imageType: "data_uri" | "inline_svg" | "external" | "relative";
+    attributes: Record<string, string>;
+}
+interface StructuredData {
+    dataType: "json_ld" | "microdata" | "rdfa";
+    rawJson: string;
+    schemaType?: string | null;
+}
 interface HtmlMetadata {
     title?: string | null;
     description?: string | null;
-    keywords?: string | null;
+    keywords: string[];
     author?: string | null;
-    canonical?: string | null;
+    canonicalUrl?: string | null;
     baseHref?: string | null;
-    ogTitle?: string | null;
-    ogDescription?: string | null;
-    ogImage?: string | null;
-    ogUrl?: string | null;
-    ogType?: string | null;
-    ogSiteName?: string | null;
-    twitterCard?: string | null;
-    twitterTitle?: string | null;
-    twitterDescription?: string | null;
-    twitterImage?: string | null;
-    twitterSite?: string | null;
-    twitterCreator?: string | null;
-    linkAuthor?: string | null;
-    linkLicense?: string | null;
-    linkAlternate?: string | null;
+    language?: string | null;
+    textDirection?: "ltr" | "rtl" | "auto" | null;
+    openGraph: Record<string, string>;
+    twitterCard: Record<string, string>;
+    metaTags: Record<string, string>;
+    htmlHeaders: HeaderMetadata[];
+    htmlLinks: LinkMetadata[];
+    htmlImages: HtmlImageMetadata[];
+    structuredData: StructuredData[];
 }
 interface PdfMetadata {
     title?: string | null;
@@ -640,23 +695,17 @@ interface Metadata {
     headers?: string[] | null;
     links?: [string, string][] | null;
     code_blocks?: [string, string][] | null;
-    canonical?: string | null;
+    canonical_url?: string | null;
     base_href?: string | null;
-    og_title?: string | null;
-    og_description?: string | null;
-    og_image?: string | null;
-    og_url?: string | null;
-    og_type?: string | null;
-    og_site_name?: string | null;
-    twitter_card?: string | null;
-    twitter_title?: string | null;
-    twitter_description?: string | null;
-    twitter_image?: string | null;
-    twitter_site?: string | null;
-    twitter_creator?: string | null;
-    link_author?: string | null;
-    link_license?: string | null;
-    link_alternate?: string | null;
+    open_graph?: Record<string, string>;
+    twitter_card?: Record<string, string>;
+    meta_tags?: Record<string, string>;
+    html_language?: string | null;
+    text_direction?: "ltr" | "rtl" | "auto" | null;
+    html_headers?: HeaderMetadata[];
+    html_links?: LinkMetadata[];
+    html_images?: HtmlImageMetadata[];
+    structured_data?: StructuredData[];
     psm?: number;
     output_format?: string;
     table_count?: number;
@@ -695,6 +744,8 @@ interface ExtractionResult {
     images: ExtractedImage[] | null;
     /** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
     pages?: PageContent[] | null;
+    /** Extracted keywords when keyword extraction is enabled, null otherwise */
+    keywords?: ExtractedKeyword[] | null;
 }
 /** Post-processor execution stage in the extraction pipeline. */
 type ProcessingStage = "early" | "middle" | "late";
@@ -972,5 +1023,59 @@ interface ErrorClassification {
      */
     confidence: number;
 }
+/**
+ * Opaque handle to a worker pool for concurrent extraction operations.
+ *
+ * Worker pools enable parallel processing of CPU-bound document extraction
+ * tasks by distributing work across multiple threads. This is especially
+ * useful for batch processing large numbers of documents.
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4); // 4 concurrent workers
+ * try {
+ *   const result = await extractFileInWorker(pool, 'document.pdf');
+ *   console.log(result.content);
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+interface WorkerPool {
+    /** Internal pool identifier (opaque) */
+    readonly poolId: number;
+}
+/**
+ * Worker pool statistics.
+ *
+ * Provides information about the current state of a worker pool including
+ * pool size, number of active workers, and queued tasks.
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ * const stats = getWorkerPoolStats(pool);
+ * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
+ * console.log(`Queued: ${stats.queuedTasks}`);
+ * ```
+ */
+interface WorkerPoolStats {
+    /**
+     * Maximum number of concurrent workers in the pool.
+     */
+    size: number;
+    /**
+     * Number of currently active (executing) workers.
+     */
+    activeWorkers: number;
+    /**
+     * Number of tasks waiting in the queue.
+     */
+    queuedTasks: number;
+}
-export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, XmlMetadata, YakeParams };
+export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };

package/dist/types.d.ts CHANGED

@@ -107,6 +107,21 @@ interface TokenReductionConfig {
     /** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
     preserveImportantWords?: boolean;
 }
+/**
+ * Hierarchy extraction configuration.
+ *
+ * Controls document hierarchy detection based on font size clustering.
+ */
+interface HierarchyConfig {
+    /** Enable hierarchy extraction. Default: true. */
+    enabled?: boolean;
+    /** Number of font size clusters (2-10). Default: 6. */
+    kClusters?: number;
+    /** Include bounding box information. Default: true. */
+    includeBbox?: boolean;
+    /** OCR coverage threshold (0.0-1.0). Default: null. */
+    ocrCoverageThreshold?: number | null;
+}
 /**
  * PDF-specific extraction configuration.
  *
@@ -119,6 +134,8 @@ interface PdfConfig {
     passwords?: string[];
     /** Extract document metadata (title, author, creation date, etc.). Default: true. */
     extractMetadata?: boolean;
+    /** Hierarchy extraction configuration. */
+    hierarchy?: HierarchyConfig;
 }
 /**
  * Image extraction and processing configuration.
@@ -281,6 +298,22 @@ interface KeywordConfig {
     /** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
     rakeParams?: RakeParams;
 }
+/**
+ * Extracted keyword with relevance metadata.
+ *
+ * Represents a single keyword extracted from text along with its relevance score,
+ * the algorithm that extracted it, and optional position information.
+ */
+interface ExtractedKeyword {
+    /** The keyword text */
+    text: string;
+    /** Relevance score (higher is better, algorithm-specific range) */
+    score: number;
+    /** Algorithm that extracted this keyword */
+    algorithm: KeywordAlgorithm;
+    /** Optional positions where keyword appears in text (character offsets) */
+    positions?: number[];
+}
 /**
  * Page tracking and extraction configuration.
  *
@@ -288,7 +321,7 @@ interface KeywordConfig {
  * Page range information in chunk metadata (first_page/last_page) is automatically
  * enabled when page boundaries are available and chunking is configured.
  */
-interface PageConfig {
+interface PageExtractionConfig {
     /** Extract pages as separate array (ExtractionResult.pages) */
     extractPages?: boolean;
     /** Insert page markers in main content string */
@@ -328,7 +361,7 @@ interface ExtractionConfig {
     /** Keyword extraction configuration for extracting important phrases. */
     keywords?: KeywordConfig;
     /** Page tracking and extraction configuration for multi-page documents. */
-    pages?: PageConfig;
+    pages?: PageExtractionConfig;
     /** Maximum number of concurrent extractions in batch operations. Default: 4. */
     maxConcurrentExtractions?: number;
 }
@@ -383,28 +416,50 @@ interface TextMetadata {
     links?: [string, string][] | null;
     codeBlocks?: [string, string][] | null;
 }
+interface HeaderMetadata {
+    level: number;
+    text: string;
+    id?: string | null;
+    depth: number;
+    htmlOffset: number;
+}
+interface LinkMetadata {
+    href: string;
+    text: string;
+    title?: string | null;
+    linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
+    rel: string[];
+    attributes: Record<string, string>;
+}
+interface HtmlImageMetadata {
+    src: string;
+    alt?: string | null;
+    title?: string | null;
+    dimensions?: [number, number] | null;
+    imageType: "data_uri" | "inline_svg" | "external" | "relative";
+    attributes: Record<string, string>;
+}
+interface StructuredData {
+    dataType: "json_ld" | "microdata" | "rdfa";
+    rawJson: string;
+    schemaType?: string | null;
+}
 interface HtmlMetadata {
     title?: string | null;
     description?: string | null;
-    keywords?: string | null;
+    keywords: string[];
     author?: string | null;
-    canonical?: string | null;
+    canonicalUrl?: string | null;
     baseHref?: string | null;
-    ogTitle?: string | null;
-    ogDescription?: string | null;
-    ogImage?: string | null;
-    ogUrl?: string | null;
-    ogType?: string | null;
-    ogSiteName?: string | null;
-    twitterCard?: string | null;
-    twitterTitle?: string | null;
-    twitterDescription?: string | null;
-    twitterImage?: string | null;
-    twitterSite?: string | null;
-    twitterCreator?: string | null;
-    linkAuthor?: string | null;
-    linkLicense?: string | null;
-    linkAlternate?: string | null;
+    language?: string | null;
+    textDirection?: "ltr" | "rtl" | "auto" | null;
+    openGraph: Record<string, string>;
+    twitterCard: Record<string, string>;
+    metaTags: Record<string, string>;
+    htmlHeaders: HeaderMetadata[];
+    htmlLinks: LinkMetadata[];
+    htmlImages: HtmlImageMetadata[];
+    structuredData: StructuredData[];
 }
 interface PdfMetadata {
     title?: string | null;
@@ -640,23 +695,17 @@ interface Metadata {
     headers?: string[] | null;
     links?: [string, string][] | null;
     code_blocks?: [string, string][] | null;
-    canonical?: string | null;
+    canonical_url?: string | null;
     base_href?: string | null;
-    og_title?: string | null;
-    og_description?: string | null;
-    og_image?: string | null;
-    og_url?: string | null;
-    og_type?: string | null;
-    og_site_name?: string | null;
-    twitter_card?: string | null;
-    twitter_title?: string | null;
-    twitter_description?: string | null;
-    twitter_image?: string | null;
-    twitter_site?: string | null;
-    twitter_creator?: string | null;
-    link_author?: string | null;
-    link_license?: string | null;
-    link_alternate?: string | null;
+    open_graph?: Record<string, string>;
+    twitter_card?: Record<string, string>;
+    meta_tags?: Record<string, string>;
+    html_language?: string | null;
+    text_direction?: "ltr" | "rtl" | "auto" | null;
+    html_headers?: HeaderMetadata[];
+    html_links?: LinkMetadata[];
+    html_images?: HtmlImageMetadata[];
+    structured_data?: StructuredData[];
     psm?: number;
     output_format?: string;
     table_count?: number;
@@ -695,6 +744,8 @@ interface ExtractionResult {
     images: ExtractedImage[] | null;
     /** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
     pages?: PageContent[] | null;
+    /** Extracted keywords when keyword extraction is enabled, null otherwise */
+    keywords?: ExtractedKeyword[] | null;
 }
 /** Post-processor execution stage in the extraction pipeline. */
 type ProcessingStage = "early" | "middle" | "late";
@@ -972,5 +1023,59 @@ interface ErrorClassification {
      */
     confidence: number;
 }
+/**
+ * Opaque handle to a worker pool for concurrent extraction operations.
+ *
+ * Worker pools enable parallel processing of CPU-bound document extraction
+ * tasks by distributing work across multiple threads. This is especially
+ * useful for batch processing large numbers of documents.
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4); // 4 concurrent workers
+ * try {
+ *   const result = await extractFileInWorker(pool, 'document.pdf');
+ *   console.log(result.content);
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+interface WorkerPool {
+    /** Internal pool identifier (opaque) */
+    readonly poolId: number;
+}
+/**
+ * Worker pool statistics.
+ *
+ * Provides information about the current state of a worker pool including
+ * pool size, number of active workers, and queued tasks.
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ * const stats = getWorkerPoolStats(pool);
+ * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
+ * console.log(`Queued: ${stats.queuedTasks}`);
+ * ```
+ */
+interface WorkerPoolStats {
+    /**
+     * Maximum number of concurrent workers in the pool.
+     */
+    size: number;
+    /**
+     * Number of currently active (executing) workers.
+     */
+    activeWorkers: number;
+    /**
+     * Number of tasks waiting in the queue.
+     */
+    queuedTasks: number;
+}
-export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, XmlMetadata, YakeParams };
+export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };