@kreuzberg/node 4.0.0-rc.6 → 4.0.0-rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,666 @@
1
+ /**
2
+ * Type definitions for Kreuzberg extraction results.
3
+ *
4
+ * These types mirror the strongly-typed Rust metadata structures,
5
+ * providing type safety for TypeScript users.
6
+ */
7
+ interface TesseractConfig { // Tesseract-specific OCR options (OcrConfig.tesseractConfig); every field is optional.
8
+ psm?: number; // presumably the Tesseract page segmentation mode; TODO confirm accepted values
9
+ enableTableDetection?: boolean;
10
+ tesseditCharWhitelist?: string; // presumably the tessedit_char_whitelist engine variable; verify
11
+ }
12
+ interface OcrConfig { // OCR settings (ExtractionConfig.ocr); backend is the only required field.
13
+ backend: string; // backend identifier, i.e. the value an OcrBackendProtocol returns from name()
14
+ language?: string; // language code such as en; see OcrBackendProtocol.supportedLanguages()
15
+ tesseractConfig?: TesseractConfig; // NOTE(review): presumably ignored by non-Tesseract backends; confirm
16
+ }
17
+ interface ChunkingConfig { // Text chunking options (ExtractionConfig.chunking); every field is optional.
18
+ maxChars?: number;
19
+ maxOverlap?: number;
20
+ chunkSize?: number; // NOTE(review): chunkSize/chunkOverlap look like alternate spellings of maxChars/maxOverlap; confirm precedence
21
+ chunkOverlap?: number;
22
+ preset?: string; // accepted preset names are not declared in this file
23
+ embedding?: Record<string, unknown>; // untyped settings object; its keys are not declared in this file
24
+ enabled?: boolean;
25
+ }
26
+ interface LanguageDetectionConfig { // Language detection options (ExtractionConfig.languageDetection); every field is optional.
27
+ enabled?: boolean;
28
+ minConfidence?: number; // presumably a 0..1 threshold; TODO confirm range
29
+ detectMultiple?: boolean;
30
+ }
31
+ interface TokenReductionConfig { // Token reduction options (ExtractionConfig.tokenReduction); every field is optional.
32
+ mode?: string; // plain string; accepted mode names are not declared in this file
33
+ preserveImportantWords?: boolean;
34
+ }
35
+ interface PdfConfig { // PDF-specific options (ExtractionConfig.pdfOptions); every field is optional.
36
+ extractImages?: boolean;
37
+ passwords?: string[]; // presumably candidate passwords for encrypted PDFs; verify how they are tried
38
+ extractMetadata?: boolean;
39
+ }
40
+ interface ImageExtractionConfig { // Image extraction and DPI options (ExtractionConfig.images); every field is optional.
41
+ extractImages?: boolean;
42
+ targetDpi?: number;
43
+ maxImageDimension?: number; // presumably in pixels; TODO confirm
44
+ autoAdjustDpi?: boolean;
45
+ minDpi?: number;
46
+ maxDpi?: number;
47
+ }
48
+ interface PostProcessorConfig { // Post-processing options (ExtractionConfig.postprocessor); every field is optional.
49
+ enabled?: boolean;
50
+ enabledProcessors?: string[]; // presumably names as returned by PostProcessorProtocol.name(); verify
51
+ disabledProcessors?: string[];
52
+ }
53
+ interface HtmlPreprocessingOptions { // HTML clean-up options (HtmlConversionOptions.preprocessing); every field is optional.
54
+ enabled?: boolean;
55
+ preset?: "minimal" | "standard" | "aggressive"; // closed set of preset names
56
+ removeNavigation?: boolean;
57
+ removeForms?: boolean;
58
+ }
59
+ interface HtmlConversionOptions { // HTML conversion options (ExtractionConfig.htmlOptions); every field is optional.
60
+ headingStyle?: "atx" | "underlined" | "atx_closed"; // NOTE(review): the option names suggest Markdown output; confirm the target format
61
+ listIndentType?: "spaces" | "tabs";
62
+ listIndentWidth?: number;
63
+ bullets?: string;
64
+ strongEmSymbol?: string;
65
+ escapeAsterisks?: boolean;
66
+ escapeUnderscores?: boolean;
67
+ escapeMisc?: boolean;
68
+ escapeAscii?: boolean;
69
+ codeLanguage?: string;
70
+ autolinks?: boolean;
71
+ defaultTitle?: boolean;
72
+ brInTables?: boolean;
73
+ hocrSpatialTables?: boolean;
74
+ highlightStyle?: "double_equal" | "html" | "bold" | "none";
75
+ extractMetadata?: boolean;
76
+ whitespaceMode?: "normalized" | "strict";
77
+ stripNewlines?: boolean;
78
+ wrap?: boolean;
79
+ wrapWidth?: number; // presumably only meaningful when wrap is true; verify
80
+ convertAsInline?: boolean;
81
+ subSymbol?: string;
82
+ supSymbol?: string;
83
+ newlineStyle?: "spaces" | "backslash";
84
+ codeBlockStyle?: "indented" | "backticks" | "tildes";
85
+ keepInlineImagesIn?: string[];
86
+ encoding?: string;
87
+ debug?: boolean;
88
+ stripTags?: string[];
89
+ preserveTags?: string[];
90
+ preprocessing?: HtmlPreprocessingOptions; // nested clean-up options, see HtmlPreprocessingOptions
91
+ }
92
+ type KeywordAlgorithm = "yake" | "rake"; // algorithm selector used by KeywordConfig.algorithm
93
+ interface YakeParams { // tuning parameters supplied through KeywordConfig.yakeParams
94
+ windowSize?: number;
95
+ }
96
+ interface RakeParams { // tuning parameters supplied through KeywordConfig.rakeParams
97
+ minWordLength?: number;
98
+ maxWordsPerPhrase?: number;
99
+ }
100
+ interface KeywordConfig { // Keyword extraction options (ExtractionConfig.keywords); every field is optional.
101
+ algorithm?: KeywordAlgorithm;
102
+ maxKeywords?: number;
103
+ minScore?: number;
104
+ ngramRange?: [number, number]; // two-element tuple; presumably [min, max] n-gram length; TODO confirm
105
+ language?: string;
106
+ yakeParams?: YakeParams; // NOTE(review): presumably only read when algorithm is yake; confirm
107
+ rakeParams?: RakeParams; // NOTE(review): presumably only read when algorithm is rake; confirm
108
+ }
109
+ /**
110
+ * Page tracking and extraction configuration (ExtractionConfig.pages); every field is optional.
111
+ *
112
+ * Controls how pages/slides/sheets are extracted and tracked in the document.
113
+ * Page range information in chunk metadata (ChunkMetadata.firstPage/lastPage) is automatically
114
+ * enabled when page boundaries are available and chunking is configured.
115
+ */
116
+ interface PageConfig {
117
+ /** Extract pages as a separate array (ExtractionResult.pages) */
118
+ extractPages?: boolean;
119
+ /** Insert page markers into the main content string */
120
+ insertPageMarkers?: boolean;
121
+ /** Page marker format (use the {page_num} placeholder) */
122
+ markerFormat?: string;
123
+ }
124
+ interface ExtractionConfig { // Top-level extraction options; every field is optional.
125
+ useCache?: boolean;
126
+ enableQualityProcessing?: boolean;
127
+ ocr?: OcrConfig; // selects and configures the OCR backend
128
+ forceOcr?: boolean;
129
+ chunking?: ChunkingConfig;
130
+ images?: ImageExtractionConfig;
131
+ pdfOptions?: PdfConfig;
132
+ tokenReduction?: TokenReductionConfig;
133
+ languageDetection?: LanguageDetectionConfig;
134
+ postprocessor?: PostProcessorConfig;
135
+ htmlOptions?: HtmlConversionOptions;
136
+ keywords?: KeywordConfig;
137
+ pages?: PageConfig; // page extraction and page-marker settings
138
+ maxConcurrentExtractions?: number; // NOTE(review): presumably caps parallel extractions in batch calls; confirm scope
139
+ }
140
+ interface Table { // One extracted table; all three fields are required.
141
+ cells: string[][]; // two-dimensional cell text; presumably rows of columns; TODO confirm orientation
142
+ markdown: string; // the table rendered as a Markdown string
143
+ pageNumber: number; // presumably 1-indexed like PageContent.pageNumber; confirm
144
+ }
145
+ interface ExcelMetadata { // Spreadsheet metadata; names parallel sheet_count/sheet_names on Metadata.
146
+ sheetCount?: number;
147
+ sheetNames?: string[];
148
+ }
149
+ interface EmailMetadata { // Email metadata; names parallel the snake_case email fields on Metadata.
150
+ fromEmail?: string | null;
151
+ fromName?: string | null;
152
+ toEmails?: string[];
153
+ ccEmails?: string[];
154
+ bccEmails?: string[];
155
+ messageId?: string | null;
156
+ attachments?: string[]; // presumably attachment file names rather than contents; verify
157
+ }
158
+ interface ArchiveMetadata { // Archive metadata; names parallel the snake_case archive fields on Metadata.
159
+ format?: string;
160
+ fileCount?: number;
161
+ fileList?: string[];
162
+ totalSize?: number; // presumably bytes; TODO confirm unit
163
+ compressedSize?: number | null; // may be absent or explicitly null
164
+ }
165
+ interface ImageMetadata { // Image metadata; names parallel width/height/format/exif on Metadata.
166
+ width?: number; // presumably pixels; TODO confirm
167
+ height?: number;
168
+ format?: string;
169
+ exif?: Record<string, string>; // string-to-string map of EXIF entries
170
+ }
171
+ interface XmlMetadata { // XML metadata; names parallel element_count/unique_elements on Metadata.
172
+ elementCount?: number;
173
+ uniqueElements?: string[];
174
+ }
175
+ interface TextMetadata { // Plain-text metadata; names parallel the snake_case text fields on Metadata.
176
+ lineCount?: number;
177
+ wordCount?: number;
178
+ characterCount?: number;
179
+ headers?: string[] | null;
180
+ links?: [string, string][] | null; // list of string pairs; presumably [text, url]; TODO confirm order
181
+ codeBlocks?: [string, string][] | null; // list of string pairs; presumably [language, code]; TODO confirm order
182
+ }
183
+ interface HtmlMetadata { // HTML page metadata; every field is an optional, nullable string.
184
+ title?: string | null;
185
+ description?: string | null;
186
+ keywords?: string | null;
187
+ author?: string | null;
188
+ canonical?: string | null;
189
+ baseHref?: string | null;
190
+ ogTitle?: string | null; // og* fields: presumably Open Graph meta tags; verify
191
+ ogDescription?: string | null;
192
+ ogImage?: string | null;
193
+ ogUrl?: string | null;
194
+ ogType?: string | null;
195
+ ogSiteName?: string | null;
196
+ twitterCard?: string | null; // twitter* fields: presumably Twitter card meta tags; verify
197
+ twitterTitle?: string | null;
198
+ twitterDescription?: string | null;
199
+ twitterImage?: string | null;
200
+ twitterSite?: string | null;
201
+ twitterCreator?: string | null;
202
+ linkAuthor?: string | null; // link* fields: presumably link rel values; verify
203
+ linkLicense?: string | null;
204
+ linkAlternate?: string | null;
205
+ }
206
+ interface PdfMetadata { // PDF metadata; names parallel the snake_case PDF fields on Metadata.
207
+ title?: string | null;
208
+ author?: string | null;
209
+ subject?: string | null;
210
+ keywords?: string | null;
211
+ creator?: string | null;
212
+ producer?: string | null;
213
+ creationDate?: string | null; // string-typed date; format is not declared in this file
214
+ modificationDate?: string | null; // string-typed date; format is not declared in this file
215
+ pageCount?: number;
216
+ }
217
+ interface PptxMetadata { // Presentation metadata; names parallel title/author/description/summary/fonts on Metadata.
218
+ title?: string | null;
219
+ author?: string | null;
220
+ description?: string | null;
221
+ summary?: string | null;
222
+ fonts?: string[];
223
+ }
224
+ interface OcrMetadata { // OCR run metadata; names parallel the snake_case OCR fields on Metadata.
225
+ language?: string;
226
+ psm?: number; // presumably echoes TesseractConfig.psm; confirm
227
+ outputFormat?: string;
228
+ tableCount?: number;
229
+ tableRows?: number | null;
230
+ tableCols?: number | null;
231
+ }
232
+ interface ImagePreprocessingMetadata { // Image preprocessing record (Metadata.image_preprocessing); every field is optional.
233
+ originalDimensions?: [number, number]; // two-element tuple; presumably [width, height]; TODO confirm
234
+ originalDpi?: [number, number]; // two-element tuple; presumably [x, y] DPI; TODO confirm
235
+ targetDpi?: number;
236
+ scaleFactor?: number;
237
+ autoAdjusted?: boolean;
238
+ finalDpi?: number;
239
+ newDimensions?: [number, number] | null;
240
+ resampleMethod?: string;
241
+ dimensionClamped?: boolean;
242
+ calculatedDpi?: number | null;
243
+ skippedResize?: boolean;
244
+ resizeError?: string | null;
245
+ }
246
+ interface ErrorMetadata { // Error details (Metadata.error); both fields are optional.
247
+ errorType?: string;
248
+ message?: string;
249
+ }
250
+ /**
251
+ * Page boundary information for chunk metadata (PageStructure.boundaries).
252
+ *
253
+ * Tracks where a specific page's content starts and ends in the main content string,
254
+ * enabling mapping from byte positions to page numbers. All offsets are guaranteed to be
255
+ * at valid UTF-8 character boundaries. The range is half-open: [byteStart, byteEnd).
256
+ */
257
+ interface PageBoundary {
258
+ /** Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive) */
259
+ byteStart: number;
260
+ /** Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive) */
261
+ byteEnd: number;
262
+ /** Page number (1-indexed) */
263
+ pageNumber: number;
264
+ }
265
+ /**
266
+ * Type of paginated unit in a document (PageStructure.unitType).
267
+ *
268
+ * Distinguishes between different types of "pages":
269
+ * - "page": Standard document pages (PDF, DOCX, images)
270
+ * - "slide": Presentation slides (PPTX, ODP)
271
+ * - "sheet": Spreadsheet sheets (XLSX, ODS)
272
+ */
273
+ type PageUnitType = "page" | "slide" | "sheet";
274
+ /**
275
+ * Detailed per-page metadata (PageStructure.pages).
276
+ *
277
+ * Captures information about a single page/slide/sheet including dimensions,
278
+ * content counts, and visibility state. Only `number` is required; the rest may be absent or null.
279
+ */
280
+ interface PageInfo {
281
+ /** Page number (1-indexed) */
282
+ number: number;
283
+ /** Page title (usually for presentations) */
284
+ title?: string | null;
285
+ /** Dimensions in points (PDF) or pixels (images): [width, height] */
286
+ dimensions?: [number, number] | null;
287
+ /** Number of images on this page */
288
+ imageCount?: number | null;
289
+ /** Number of tables on this page */
290
+ tableCount?: number | null;
291
+ /** Whether this page is hidden (e.g., in presentations) */
292
+ hidden?: boolean | null;
293
+ }
294
+ /**
295
+ * Page structure metadata (Metadata.page_structure).
296
+ *
297
+ * Contains information about pages/slides/sheets in a document, including
298
+ * boundaries for mapping chunks to pages and detailed per-page metadata.
299
+ */
300
+ interface PageStructure {
301
+ /** Total number of pages/slides/sheets */
302
+ totalCount: number;
303
+ /** Type of paginated unit (page, slide, or sheet) */
304
+ unitType: PageUnitType;
305
+ /** Byte offset boundaries for each page (may be absent or null) */
306
+ boundaries?: PageBoundary[] | null;
307
+ /** Detailed per-page metadata (optional, only when needed; may be null) */
308
+ pages?: PageInfo[] | null;
309
+ }
310
+ /**
311
+ * Metadata about a chunk's position and properties in the document (Chunk.metadata).
312
+ *
313
+ * Tracks where a chunk appears in the original document, including byte offsets
314
+ * and page ranges when page tracking is enabled.
315
+ */
316
+ interface ChunkMetadata {
317
+ /** Byte offset where this chunk starts in the original text (UTF-8 valid boundary) */
318
+ byteStart: number;
319
+ /** Byte offset where this chunk ends in the original text (UTF-8 valid boundary) */
320
+ byteEnd: number; // NOTE(review): presumably exclusive, like PageBoundary.byteEnd; confirm
321
+ /** Number of tokens in this chunk (if available from embedding model) */
322
+ tokenCount?: number | null;
323
+ /** Zero-based index of this chunk in the document */
324
+ chunkIndex: number;
325
+ /** Total number of chunks in the document */
326
+ totalChunks: number;
327
+ /** First page number this chunk spans (1-indexed, only when page tracking enabled) */
328
+ firstPage?: number | null;
329
+ /** Last page number this chunk spans (1-indexed, only when page tracking enabled) */
330
+ lastPage?: number | null;
331
+ }
332
+ interface Chunk { // One chunk of extracted text (ExtractionResult.chunks).
333
+ content: string; // the chunk text
334
+ embedding?: number[] | null; // numeric vector; may be absent or null
335
+ metadata: ChunkMetadata; // position of the chunk within the document
336
+ }
337
+ interface ExtractedImage { // One image extracted from a document; data, format, imageIndex and isMask are required.
338
+ data: Uint8Array; // image bytes; presumably encoded in the container named by format; verify
339
+ format: string;
340
+ imageIndex: number; // NOTE(review): zero- or one-based is not stated in this file; confirm
341
+ pageNumber?: number | null;
342
+ width?: number | null;
343
+ height?: number | null;
344
+ colorspace?: string | null;
345
+ bitsPerComponent?: number | null;
346
+ isMask: boolean;
347
+ description?: string | null;
348
+ ocrResult?: ExtractionResult | null; // nested ExtractionResult, which makes the type recursive
349
+ }
350
+ /**
351
+ * Content for a single page/slide/sheet (element type of ExtractionResult.pages).
352
+ *
353
+ * When page extraction is enabled, documents are split into per-page content
354
+ * with associated tables and images mapped to each page. All four fields are required.
355
+ */
356
+ interface PageContent {
357
+ /** Page number (1-indexed) */
358
+ pageNumber: number;
359
+ /** Text content for this page */
360
+ content: string;
361
+ /** Tables found on this page */
362
+ tables: Table[];
363
+ /** Images found on this page */
364
+ images: ExtractedImage[];
365
+ }
366
+ /**
367
+ * Extraction result metadata.
368
+ *
369
+ * Uses a flattened discriminated union approach with format_type as the discriminator.
370
+ * When format_type is set (e.g., "archive"), the corresponding format-specific fields
371
+ * are available at the root level of the metadata object.
372
+ *
373
+ * This structure matches the Rust serialization with serde's tagged enum flattening.
374
+ */
375
+ interface Metadata {
376
+ language?: string | null;
377
+ date?: string | null;
378
+ subject?: string | null;
379
+ format_type?: "pdf" | "excel" | "email" | "pptx" | "archive" | "image" | "xml" | "text" | "html" | "ocr"; // discriminator for the field groups below
380
+ title?: string | null; // from here: snake_case counterparts of PdfMetadata fields
381
+ author?: string | null;
382
+ keywords?: string | null;
383
+ creator?: string | null;
384
+ producer?: string | null;
385
+ creation_date?: string | null;
386
+ modification_date?: string | null;
387
+ page_count?: number;
388
+ sheet_count?: number; // from here: counterparts of ExcelMetadata fields
389
+ sheet_names?: string[];
390
+ from_email?: string | null; // from here: counterparts of EmailMetadata fields
391
+ from_name?: string | null;
392
+ to_emails?: string[];
393
+ cc_emails?: string[];
394
+ bcc_emails?: string[];
395
+ message_id?: string | null;
396
+ attachments?: string[];
397
+ description?: string | null; // from here: counterparts of PptxMetadata fields
398
+ summary?: string | null;
399
+ fonts?: string[];
400
+ format?: string; // from here: counterparts of ArchiveMetadata fields (format also appears on ImageMetadata)
401
+ file_count?: number;
402
+ file_list?: string[];
403
+ total_size?: number;
404
+ compressed_size?: number | null;
405
+ width?: number; // from here: counterparts of ImageMetadata fields
406
+ height?: number;
407
+ exif?: Record<string, string>;
408
+ element_count?: number; // from here: counterparts of XmlMetadata fields
409
+ unique_elements?: string[];
410
+ line_count?: number; // from here: counterparts of TextMetadata fields
411
+ word_count?: number;
412
+ character_count?: number;
413
+ headers?: string[] | null;
414
+ links?: [string, string][] | null;
415
+ code_blocks?: [string, string][] | null;
416
+ canonical?: string | null; // from here: counterparts of HtmlMetadata fields
417
+ base_href?: string | null;
418
+ og_title?: string | null;
419
+ og_description?: string | null;
420
+ og_image?: string | null;
421
+ og_url?: string | null;
422
+ og_type?: string | null;
423
+ og_site_name?: string | null;
424
+ twitter_card?: string | null;
425
+ twitter_title?: string | null;
426
+ twitter_description?: string | null;
427
+ twitter_image?: string | null;
428
+ twitter_site?: string | null;
429
+ twitter_creator?: string | null;
430
+ link_author?: string | null;
431
+ link_license?: string | null;
432
+ link_alternate?: string | null;
433
+ psm?: number; // from here: counterparts of OcrMetadata fields
434
+ output_format?: string;
435
+ table_count?: number;
436
+ table_rows?: number | null;
437
+ table_cols?: number | null;
438
+ image_preprocessing?: ImagePreprocessingMetadata | null; // nested object whose own keys are camelCase
439
+ json_schema?: Record<string, unknown> | null;
440
+ page_structure?: PageStructure | null; // nested object whose own keys are camelCase
441
+ error?: ErrorMetadata | null;
442
+ [key: string]: any; // index signature: any other key is accepted and typed any, so misspelled keys are not caught by the compiler
443
+ }
444
+ interface ExtractionResult { // Result of one extraction; also the input to post-processors and validators.
445
+ content: string; // extracted text
446
+ mimeType: string;
447
+ metadata: Metadata;
448
+ tables: Table[]; // always an array, unlike the nullable fields below
449
+ detectedLanguages: string[] | null; // key is required but the value may be null
450
+ chunks: Chunk[] | null; // key is required but the value may be null
451
+ images: ExtractedImage[] | null; // key is required but the value may be null
452
+ pages?: PageContent[] | null; // the only optional key; see PageConfig.extractPages
453
+ }
454
+ type ProcessingStage = "early" | "middle" | "late"; // return type of PostProcessorProtocol.processingStage()
455
+ interface PostProcessorProtocol { // Contract for custom post-processors: name() and process() are required, the other methods optional.
456
+ /**
457
+ * Return the unique name of this postprocessor.
458
+ */
459
+ name(): string;
460
+ /**
461
+ * Process and enrich an extraction result. May return the result directly or a Promise of it.
462
+ *
463
+ * @param result - ExtractionResult with extracted content, metadata, and tables
464
+ * @returns Modified result with enriched metadata
465
+ */
466
+ process(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;
467
+ /**
468
+ * Return the processing stage for this processor (synchronous; optional).
469
+ *
470
+ * @returns One of "early", "middle", or "late" (default: "middle")
471
+ */
472
+ processingStage?(): ProcessingStage;
473
+ /**
474
+ * Initialize the processor (e.g., load ML models). Optional; may be async.
475
+ *
476
+ * Called once when the processor is registered.
477
+ */
478
+ initialize?(): void | Promise<void>;
479
+ /**
480
+ * Shutdown the processor and release resources. Optional; may be async.
481
+ *
482
+ * Called when the processor is unregistered.
483
+ */
484
+ shutdown?(): void | Promise<void>;
485
+ }
486
+ interface ValidatorProtocol { // Contract for custom validators: name() and validate() are required, the other methods optional.
487
+ /**
488
+ * Return the unique name of this validator.
489
+ */
490
+ name(): string;
491
+ /**
492
+ * Validate an extraction result.
493
+ *
494
+ * Throw an error if validation fails. The error message should explain why validation failed.
495
+ * If validation passes, return without throwing. May be synchronous or return a Promise.
496
+ *
497
+ * @param result - ExtractionResult to validate
498
+ * @throws Error if validation fails (extraction will fail)
499
+ */
500
+ validate(result: ExtractionResult): void | Promise<void>;
501
+ /**
502
+ * Return the validation priority (optional).
503
+ *
504
+ * Higher priority validators run first. Useful for running cheap validations before expensive ones.
505
+ *
506
+ * @returns Priority value (higher = runs earlier, default: 50)
507
+ */
508
+ priority?(): number;
509
+ /**
510
+ * Check if this validator should run for a given result (optional).
511
+ *
512
+ * Allows conditional validation based on MIME type, metadata, or content.
513
+ *
514
+ * @param result - ExtractionResult to check
515
+ * @returns true if validator should run, false to skip (default: true); synchronous only, unlike validate()
516
+ */
517
+ shouldValidate?(result: ExtractionResult): boolean;
518
+ /**
519
+ * Initialize the validator. Optional; may be async.
520
+ *
521
+ * Called once when the validator is registered.
522
+ */
523
+ initialize?(): void | Promise<void>;
524
+ /**
525
+ * Shutdown the validator and release resources. Optional; may be async.
526
+ *
527
+ * Called when the validator is unregistered.
528
+ */
529
+ shutdown?(): void | Promise<void>;
530
+ }
531
+ /**
532
+ * OCR backend protocol for implementing custom OCR engines.
533
+ *
534
+ * This interface defines the contract for OCR backends that can be registered
535
+ * with Kreuzberg's extraction pipeline.
536
+ *
537
+ * ## Implementation Requirements
538
+ *
539
+ * OCR backends must implement:
540
+ * - `name()`: Return a unique backend identifier
541
+ * - `supportedLanguages()`: Return list of supported ISO 639-1/2/3 language codes
542
+ * - `processImage()`: Process image bytes and return extraction result
543
+ *
544
+ * ## Optional Methods
545
+ *
546
+ * - `initialize()`: Called when backend is registered (load models, etc.)
547
+ * - `shutdown()`: Called when backend is unregistered (cleanup resources)
548
+ *
549
+ * @example
550
+ * ```typescript
551
+ * import { GutenOcrBackend } from '@kreuzberg/node/ocr/guten-ocr';
552
+ * import { registerOcrBackend, extractFile } from '@kreuzberg/node';
553
+ *
554
+ * // Create and register the backend
555
+ * const backend = new GutenOcrBackend();
556
+ * await backend.initialize();
557
+ * registerOcrBackend(backend);
558
+ *
559
+ * // Use with extraction
560
+ * const result = await extractFile('scanned.pdf', null, {
561
+ * ocr: { backend: 'guten-ocr', language: 'en' }
562
+ * });
563
+ * ```
564
+ */
565
+ interface OcrBackendProtocol {
566
+ /**
567
+ * Return the unique name of this OCR backend.
568
+ *
569
+ * This name is used in ExtractionConfig to select the backend (OcrConfig.backend):
570
+ * ```typescript
571
+ * { ocr: { backend: 'guten-ocr', language: 'en' } }
572
+ * ```
573
+ *
574
+ * @returns Unique backend identifier (e.g., "guten-ocr", "tesseract")
575
+ */
576
+ name(): string;
577
+ /**
578
+ * Return list of supported language codes.
579
+ *
580
+ * Language codes should follow ISO 639-1 (2-letter) or ISO 639-2/3 (3-letter) standards.
581
+ * Common codes: "en", "eng" (English), "de", "deu" (German), "fr", "fra" (French).
582
+ *
583
+ * @returns Array of supported language codes
584
+ *
585
+ * @example
586
+ * ```typescript
587
+ * supportedLanguages(): string[] {
588
+ * return ["en", "eng", "de", "deu", "fr", "fra"];
589
+ * }
590
+ * ```
591
+ */
592
+ supportedLanguages(): string[];
593
+ /**
594
+ * Process image bytes and extract text via OCR.
595
+ *
596
+ * This method receives raw image data and must return a result object with:
597
+ * - `content`: Extracted text content
598
+ * - `mime_type`: MIME type (usually "text/plain")
599
+ * - `metadata`: Additional information (confidence, dimensions, etc.)
600
+ * - `tables`: Array of detected tables (required by the return type; use `[]` when none were found)
601
+ *
602
+ * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string (when called from Rust bindings)
603
+ * @param language - Language code from supportedLanguages()
604
+ * @returns Promise resolving to extraction result
605
+ *
606
+ * @example
607
+ * ```typescript
608
+ * async processImage(imageBytes: Uint8Array | string, language: string): Promise<{
609
+ * content: string;
610
+ * mime_type: string;
611
+ * metadata: Record<string, unknown>;
612
+ * tables: unknown[];
613
+ * }> {
614
+ * const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
615
+ * const text = await myOcrEngine.recognize(buffer, language);
616
+ * return {
617
+ * content: text,
618
+ * mime_type: "text/plain",
619
+ * metadata: { confidence: 0.95, language },
620
+ * tables: []
621
+ * };
622
+ * }
623
+ * ```
624
+ */
625
+ processImage(imageBytes: Uint8Array | string, language: string): Promise<{
626
+ content: string;
627
+ mime_type: string; // snake_case key here, unlike ExtractionResult.mimeType
628
+ metadata: Record<string, unknown>;
629
+ tables: unknown[]; // element shape is not declared here; NOTE(review): presumably compatible with Table; confirm
630
+ }>;
631
+ /**
632
+ * Initialize the OCR backend (optional).
633
+ *
634
+ * Called once when the backend is registered. Use this to:
635
+ * - Load ML models
636
+ * - Initialize libraries
637
+ * - Validate dependencies
638
+ *
639
+ * @example
640
+ * ```typescript
641
+ * async initialize(): Promise<void> {
642
+ * this.model = await loadModel('./path/to/model');
643
+ * }
644
+ * ```
645
+ */
646
+ initialize?(): void | Promise<void>;
647
+ /**
648
+ * Shutdown the OCR backend and release resources (optional).
649
+ *
650
+ * Called when the backend is unregistered. Use this to:
651
+ * - Free model memory
652
+ * - Close file handles
653
+ * - Cleanup temporary files
654
+ *
655
+ * @example
656
+ * ```typescript
657
+ * async shutdown(): Promise<void> {
658
+ * await this.model.dispose();
659
+ * this.model = null;
660
+ * }
661
+ * ```
662
+ */
663
+ shutdown?(): void | Promise<void>;
664
+ }
665
+
666
+ export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, XmlMetadata, YakeParams }; // type-only export of the interfaces and type aliases declared in this file