kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py CHANGED
@@ -1,8 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
- from collections.abc import Awaitable, Callable
4
+ from collections.abc import Awaitable, Callable, Iterable, Mapping
5
5
  from dataclasses import asdict, dataclass, field
6
+ from enum import Enum
6
7
  from typing import TYPE_CHECKING, Any, Literal, TypedDict
7
8
 
8
9
  import msgspec
@@ -15,23 +16,515 @@ from kreuzberg._utils._table import (
15
16
  )
16
17
  from kreuzberg.exceptions import ValidationError
17
18
 
19
+ if TYPE_CHECKING:
20
+ from kreuzberg._utils._device import DeviceType
21
+
18
22
  if sys.version_info < (3, 11): # pragma: no cover
19
23
  from typing_extensions import NotRequired
20
24
  else: # pragma: no cover
21
25
  from typing import NotRequired
22
26
 
23
27
  if TYPE_CHECKING:
24
- from pandas import DataFrame
25
- from PIL.Image import Image
28
+ from pathlib import Path
26
29
 
27
- from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
28
- from kreuzberg._gmft import GMFTConfig
29
- from kreuzberg._language_detection import LanguageDetectionConfig
30
- from kreuzberg._ocr._easyocr import EasyOCRConfig
31
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
32
- from kreuzberg._ocr._tesseract import TesseractConfig
30
+ from PIL.Image import Image
31
+ from polars import DataFrame
33
32
 
34
33
  OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
34
+ OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
35
+
36
+
37
+ class ConfigDict:
38
+ """Base class for configuration objects that can be converted to dictionaries."""
39
+
40
+ def to_dict(self, include_none: bool = False) -> dict[str, Any]:
41
+ """Convert configuration to dictionary.
42
+
43
+ Args:
44
+ include_none: If True, include fields with None values.
45
+ If False (default), exclude None values.
46
+
47
+ Returns:
48
+ Dictionary representation of the configuration.
49
+ """
50
+ result = msgspec.to_builtins(
51
+ self,
52
+ builtin_types=(type(None),),
53
+ order="deterministic",
54
+ )
55
+
56
+ if include_none:
57
+ return result # type: ignore[no-any-return]
58
+
59
+ return {k: v for k, v in result.items() if v is not None}
60
+
61
+
62
+ class PSMMode(Enum):
63
+ """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
64
+
65
+ OSD_ONLY = 0
66
+ """Orientation and script detection only."""
67
+ AUTO_OSD = 1
68
+ """Automatic page segmentation with orientation and script detection."""
69
+ AUTO_ONLY = 2
70
+ """Automatic page segmentation without OSD."""
71
+ AUTO = 3
72
+ """Fully automatic page segmentation (default)."""
73
+ SINGLE_COLUMN = 4
74
+ """Assume a single column of text."""
75
+ SINGLE_BLOCK_VERTICAL = 5
76
+ """Assume a single uniform block of vertically aligned text."""
77
+ SINGLE_BLOCK = 6
78
+ """Assume a single uniform block of text."""
79
+ SINGLE_LINE = 7
80
+ """Treat the image as a single text line."""
81
+ SINGLE_WORD = 8
82
+ """Treat the image as a single word."""
83
+ CIRCLE_WORD = 9
84
+ """Treat the image as a single word in a circle."""
85
+ SINGLE_CHAR = 10
86
+ """Treat the image as a single character."""
87
+
88
+
89
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
90
+ class TesseractConfig(ConfigDict):
91
+ """Configuration options for Tesseract OCR engine."""
92
+
93
+ classify_use_pre_adapted_templates: bool = True
94
+ """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
95
+ language: str = "eng"
96
+ """Language code to use for OCR.
97
+ Examples:
98
+ - 'eng' for English
99
+ - 'deu' for German
100
+ - multiple languages combined with '+', e.g. 'eng+deu'
101
+ """
102
+ language_model_ngram_on: bool = False
103
+ """Enable or disable the use of n-gram-based language models for improved text recognition.
104
+ Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
105
+ psm: PSMMode = PSMMode.AUTO
106
+ """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
107
+ tessedit_dont_blkrej_good_wds: bool = True
108
+ """If True, prevents block rejection of words identified as good, improving text output quality."""
109
+ tessedit_dont_rowrej_good_wds: bool = True
110
+ """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
111
+ tessedit_enable_dict_correction: bool = True
112
+ """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
113
+ tessedit_char_whitelist: str = ""
114
+ """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
115
+ tessedit_use_primary_params_model: bool = True
116
+ """If True, forces the use of the primary parameters model for text recognition."""
117
+ textord_space_size_is_variable: bool = True
118
+ """Allow variable spacing between words, useful for text with irregular spacing."""
119
+ thresholding_method: bool = False
120
+ """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
121
+ output_format: OutputFormatType = "markdown"
122
+ """Output format: 'markdown' (default), 'text', 'tsv' (for structured data), or 'hocr' (HTML-based)."""
123
+ enable_table_detection: bool = False
124
+ """Enable table structure detection from TSV output."""
125
+ table_column_threshold: int = 20
126
+ """Pixel threshold for column clustering in table detection."""
127
+ table_row_threshold_ratio: float = 0.5
128
+ """Row threshold as ratio of mean text height for table detection."""
129
+ table_min_confidence: float = 30.0
130
+ """Minimum confidence score to include a word in table extraction."""
131
+
132
+
133
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
134
+ class EasyOCRConfig(ConfigDict):
135
+ """Configuration options for EasyOCR."""
136
+
137
+ add_margin: float = 0.1
138
+ """Extend bounding boxes in all directions."""
139
+ adjust_contrast: float = 0.5
140
+ """Target contrast level for low contrast text."""
141
+ beam_width: int = 5
142
+ """Beam width for beam search in recognition."""
143
+ canvas_size: int = 2560
144
+ """Maximum image dimension for detection."""
145
+ contrast_ths: float = 0.1
146
+ """Contrast threshold for preprocessing."""
147
+ decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
148
+ """Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
149
+ height_ths: float = 0.5
150
+ """Maximum difference in box height for merging."""
151
+ language: str | list[str] = "en"
152
+ """Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
153
+ a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
154
+ link_threshold: float = 0.4
155
+ """Link confidence threshold."""
156
+ low_text: float = 0.4
157
+ """Text low-bound score."""
158
+ mag_ratio: float = 1.0
159
+ """Image magnification ratio."""
160
+ min_size: int = 10
161
+ """Minimum text box size in pixels."""
162
+ rotation_info: list[int] | None = None
163
+ """List of angles to try for detection."""
164
+ slope_ths: float = 0.1
165
+ """Maximum slope for merging text boxes."""
166
+ text_threshold: float = 0.7
167
+ """Text confidence threshold."""
168
+ use_gpu: bool = False
169
+ """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
170
+ device: DeviceType = "auto"
171
+ """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
172
+ gpu_memory_limit: float | None = None
173
+ """Maximum GPU memory to use in GB. None for no limit."""
174
+ fallback_to_cpu: bool = True
175
+ """Whether to fallback to CPU if requested device is unavailable."""
176
+ width_ths: float = 0.5
177
+ """Maximum horizontal distance for merging boxes."""
178
+ x_ths: float = 1.0
179
+ """Maximum horizontal distance for paragraph merging."""
180
+ y_ths: float = 0.5
181
+ """Maximum vertical distance for paragraph merging."""
182
+ ycenter_ths: float = 0.5
183
+ """Maximum shift in y direction for merging."""
184
+
185
+
186
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
187
+ class PaddleOCRConfig(ConfigDict):
188
+ """Configuration options for PaddleOCR.
189
+
190
+ This dataclass provides type hints and documentation for all PaddleOCR parameters.
191
+ """
192
+
193
+ cls_image_shape: str = "3,48,192"
194
+ """Image shape for classification algorithm in format 'channels,height,width'."""
195
+ det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
196
+ """Detection algorithm."""
197
+ det_db_box_thresh: float = 0.5
198
+ """Score threshold for detected boxes. Boxes below this value are discarded."""
199
+ det_db_thresh: float = 0.3
200
+ """Binarization threshold for DB output map."""
201
+ det_db_unclip_ratio: float = 2.0
202
+ """Expansion ratio for detected text boxes."""
203
+ det_east_cover_thresh: float = 0.1
204
+ """Score threshold for EAST output boxes."""
205
+ det_east_nms_thresh: float = 0.2
206
+ """NMS threshold for EAST model output boxes."""
207
+ det_east_score_thresh: float = 0.8
208
+ """Binarization threshold for EAST output map."""
209
+ det_max_side_len: int = 960
210
+ """Maximum size of image long side. Images exceeding this will be proportionally resized."""
211
+ det_model_dir: str | None = None
212
+ """Directory for detection model. If None, uses default model location."""
213
+ drop_score: float = 0.5
214
+ """Filter recognition results by confidence score. Results below this are discarded."""
215
+ enable_mkldnn: bool = False
216
+ """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
217
+ gpu_mem: int = 8000
218
+ """GPU memory size (in MB) to use for initialization."""
219
+ language: str = "en"
220
+ """Language to use for OCR."""
221
+ max_text_length: int = 25
222
+ """Maximum text length that the recognition algorithm can recognize."""
223
+ rec: bool = True
224
+ """Enable text recognition when using the ocr() function."""
225
+ rec_algorithm: Literal[
226
+ "CRNN",
227
+ "SRN",
228
+ "NRTR",
229
+ "SAR",
230
+ "SEED",
231
+ "SVTR",
232
+ "SVTR_LCNet",
233
+ "ViTSTR",
234
+ "ABINet",
235
+ "VisionLAN",
236
+ "SPIN",
237
+ "RobustScanner",
238
+ "RFL",
239
+ ] = "CRNN"
240
+ """Recognition algorithm."""
241
+ rec_image_shape: str = "3,32,320"
242
+ """Image shape for recognition algorithm in format 'channels,height,width'."""
243
+ rec_model_dir: str | None = None
244
+ """Directory for recognition model. If None, uses default model location."""
245
+ table: bool = True
246
+ """Whether to enable table recognition."""
247
+ use_angle_cls: bool = True
248
+ """Whether to use text orientation classification model."""
249
+ use_gpu: bool = False
250
+ """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
251
+ device: DeviceType = "auto"
252
+ """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
253
+ gpu_memory_limit: float | None = None
254
+ """Maximum GPU memory to use in GB. None for no limit."""
255
+ fallback_to_cpu: bool = True
256
+ """Whether to fallback to CPU if requested device is unavailable."""
257
+ use_space_char: bool = True
258
+ """Whether to recognize spaces."""
259
+ use_zero_copy_run: bool = False
260
+ """Whether to enable zero_copy_run for inference optimization."""
261
+
262
+
263
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
264
+ class GMFTConfig(ConfigDict):
265
+ """Configuration options for GMFT table extraction.
266
+
267
+ This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
268
+ """
269
+
270
+ verbosity: int = 0
271
+ """
272
+ Verbosity level for logging.
273
+
274
+ 0: errors only
275
+ 1: print warnings
276
+ 2: print warnings and info
277
+ 3: print warnings, info, and debug
278
+ """
279
+ formatter_base_threshold: float = 0.3
280
+ """
281
+ Base threshold for the confidence demanded of a table feature (row/column).
282
+
283
+ Note that a low threshold is actually better: with too many detected rows, numbers generally stay aligned and there are just many empty rows, whereas having fewer rows than expected merges cells, which is bad.
284
+ """
285
+ cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
286
+ default_factory=lambda: {
287
+ 0: 0.3,
288
+ 1: 0.3,
289
+ 2: 0.3,
290
+ 3: 0.3,
291
+ 4: 0.5,
292
+ 5: 0.5,
293
+ 6: 99,
294
+ },
295
+ hash=False,
296
+ )
297
+ """
298
+ Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
299
+
300
+ But low confidences may be better than too high confidence (see formatter_base_threshold)
301
+ """
302
+ detector_base_threshold: float = 0.9
303
+ """Minimum confidence score required for a table"""
304
+ remove_null_rows: bool = True
305
+ """
306
+ Flag to remove rows with no text.
307
+ """
308
+ enable_multi_header: bool = False
309
+ """
310
+ Enable multi-indices in the dataframe.
311
+
312
+ If false, then multiple headers will be merged column-wise.
313
+ """
314
+ semantic_spanning_cells: bool = False
315
+ """
316
+ [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
317
+ """
318
+ semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
319
+ """
320
+ [Experimental] When semantic spanning cells are enabled and a left header is detected that might represent a group of rows, that header value is duplicated for each row in the group.
321
+
322
+ Possible values: 'algorithm', 'deep', None.
323
+ """
324
+ large_table_if_n_rows_removed: int = 8
325
+ """
326
+ If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
327
+ """
328
+ large_table_threshold: int = 10
329
+ """
330
+ With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
331
+
332
+ Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
333
+ """
334
+ large_table_row_overlap_threshold: float = 0.2
335
+ """
336
+ With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
337
+
338
+ Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
339
+ """
340
+ large_table_maximum_rows: int = 1000
341
+ """
342
+ Maximum number of rows allowed for a large table.
343
+ """
344
+ force_large_table_assumption: bool | None = None
345
+ """
346
+ Force the large table assumption to be applied, regardless of the number of rows and overlap.
347
+ """
348
+ total_overlap_reject_threshold: float = 0.9
349
+ """
350
+ Reject if total overlap is > 90% of table area.
351
+ """
352
+ total_overlap_warn_threshold: float = 0.1
353
+ """
354
+ Warn if total overlap is > 10% of table area.
355
+ """
356
+ nms_warn_threshold: int = 5
357
+ """
358
+ Warn if non maxima suppression removes > 5 rows.
359
+ """
360
+ iob_reject_threshold: float = 0.05
361
+ """
362
+ Reject if iob between textbox and cell is < 5%.
363
+ """
364
+ iob_warn_threshold: float = 0.5
365
+ """
366
+ Warn if iob between textbox and cell is < 50%.
367
+ """
368
+
369
+
370
+ @dataclass(frozen=True, slots=True)
371
+ class LanguageDetectionConfig(ConfigDict):
372
+ """Configuration for language detection."""
373
+
374
+ low_memory: bool = True
375
+ """If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
376
+ Defaults to True for better memory efficiency."""
377
+ top_k: int = 3
378
+ """Maximum number of languages to return for multilingual detection."""
379
+ multilingual: bool = False
380
+ """If True, uses multilingual detection to handle mixed-language text.
381
+ If False, uses single language detection."""
382
+ cache_dir: str | None = None
383
+ """Custom directory for model cache. If None, uses system default."""
384
+ allow_fallback: bool = True
385
+ """If True, falls back to small model if large model fails."""
386
+
387
+
388
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
389
+ class SpacyEntityExtractionConfig(ConfigDict):
390
+ """Configuration for spaCy-based entity extraction."""
391
+
392
+ model_cache_dir: str | Path | None = None
393
+ """Directory to cache spaCy models. If None, uses spaCy's default."""
394
+ language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
395
+ """Mapping of language codes to spaCy model names.
396
+
397
+ If None, uses the default mappings from _get_default_language_models(), which include (among others):
398
+ - en: en_core_web_sm
399
+ - de: de_core_news_sm
400
+ - fr: fr_core_news_sm
401
+ - es: es_core_news_sm
402
+ - pt: pt_core_news_sm
403
+ - it: it_core_news_sm
404
+ - nl: nl_core_news_sm
405
+ - zh: zh_core_web_sm
406
+ - ja: ja_core_news_sm
407
+ """
408
+ fallback_to_multilingual: bool = True
409
+ """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
410
+ max_doc_length: int = 1000000
411
+ """Maximum document length for spaCy processing."""
412
+ batch_size: int = 1000
413
+ """Batch size for processing multiple texts."""
414
+
415
+ def __post_init__(self) -> None:
416
+ if self.language_models is None:
417
+ object.__setattr__(self, "language_models", self._get_default_language_models())
418
+
419
+ if isinstance(self.language_models, dict):
420
+ object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
421
+
422
+ @staticmethod
423
+ def _get_default_language_models() -> dict[str, str]:
424
+ return {
425
+ "en": "en_core_web_sm",
426
+ "de": "de_core_news_sm",
427
+ "fr": "fr_core_news_sm",
428
+ "es": "es_core_news_sm",
429
+ "pt": "pt_core_news_sm",
430
+ "it": "it_core_news_sm",
431
+ "nl": "nl_core_news_sm",
432
+ "zh": "zh_core_web_sm",
433
+ "ja": "ja_core_news_sm",
434
+ "ko": "ko_core_news_sm",
435
+ "ru": "ru_core_news_sm",
436
+ "pl": "pl_core_news_sm",
437
+ "ro": "ro_core_news_sm",
438
+ "el": "el_core_news_sm",
439
+ "da": "da_core_news_sm",
440
+ "fi": "fi_core_news_sm",
441
+ "nb": "nb_core_news_sm",
442
+ "sv": "sv_core_news_sm",
443
+ "ca": "ca_core_news_sm",
444
+ "hr": "hr_core_news_sm",
445
+ "lt": "lt_core_news_sm",
446
+ "mk": "mk_core_news_sm",
447
+ "sl": "sl_core_news_sm",
448
+ "uk": "uk_core_news_sm",
449
+ "xx": "xx_ent_wiki_sm",
450
+ }
451
+
452
+ def get_model_for_language(self, language_code: str) -> str | None:
453
+ """Get the appropriate spaCy model for a language code."""
454
+ if not self.language_models:
455
+ return None
456
+
457
+ models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
458
+
459
+ if language_code in models_dict:
460
+ return models_dict[language_code]
461
+
462
+ base_lang = language_code.split("-")[0].lower()
463
+ if base_lang in models_dict:
464
+ return models_dict[base_lang]
465
+
466
+ return None
467
+
468
+ def get_fallback_model(self) -> str | None:
469
+ """Get fallback multilingual model if enabled."""
470
+ return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
471
+
472
+
473
+ class BoundingBox(TypedDict):
474
+ """Bounding box coordinates for text elements."""
475
+
476
+ left: int
477
+ """X coordinate of the left edge."""
478
+ top: int
479
+ """Y coordinate of the top edge."""
480
+ width: int
481
+ """Width of the bounding box."""
482
+ height: int
483
+ """Height of the bounding box."""
484
+
485
+
486
+ class TSVWord(TypedDict):
487
+ """Represents a word from Tesseract TSV output."""
488
+
489
+ level: int
490
+ """Hierarchy level (1=page, 2=block, 3=para, 4=line, 5=word)."""
491
+ page_num: int
492
+ """Page number."""
493
+ block_num: int
494
+ """Block number within the page."""
495
+ par_num: int
496
+ """Paragraph number within the block."""
497
+ line_num: int
498
+ """Line number within the paragraph."""
499
+ word_num: int
500
+ """Word number within the line."""
501
+ left: int
502
+ """X coordinate of the left edge of the word."""
503
+ top: int
504
+ """Y coordinate of the top edge of the word."""
505
+ width: int
506
+ """Width of the word bounding box."""
507
+ height: int
508
+ """Height of the word bounding box."""
509
+ conf: float
510
+ """Confidence score (0-100)."""
511
+ text: str
512
+ """The recognized text content."""
513
+
514
+
515
+ class TableCell(TypedDict):
516
+ """Represents a cell in a reconstructed table."""
517
+
518
+ row: int
519
+ """Row index (0-based)."""
520
+ col: int
521
+ """Column index (0-based)."""
522
+ text: str
523
+ """Cell text content."""
524
+ bbox: BoundingBox
525
+ """Bounding box of the cell."""
526
+ confidence: float
527
+ """Average confidence of words in the cell."""
35
528
 
36
529
 
37
530
  class TableData(TypedDict):
@@ -39,8 +532,8 @@ class TableData(TypedDict):
39
532
 
40
533
  cropped_image: Image
41
534
  """The cropped image of the table."""
42
- df: DataFrame
43
- """The table data as a pandas DataFrame."""
535
+ df: DataFrame | None
536
+ """The table data as a polars DataFrame, or None."""
44
537
  page_number: int
45
538
  """The page number of the table."""
46
539
  text: str
@@ -106,8 +599,6 @@ class Metadata(TypedDict, total=False):
106
599
  """Version identifier or revision number."""
107
600
  width: NotRequired[int]
108
601
  """Width of the document page/slide/image, if applicable."""
109
-
110
- # Email-specific fields
111
602
  email_from: NotRequired[str]
112
603
  """Email sender (from field)."""
113
604
  email_to: NotRequired[str]
@@ -120,25 +611,26 @@ class Metadata(TypedDict, total=False):
120
611
  """Email date or document date."""
121
612
  attachments: NotRequired[list[str]]
122
613
  """List of attachment names."""
123
-
124
- # Additional metadata fields for various extractors
125
614
  content: NotRequired[str]
126
615
  """Content metadata field."""
127
616
  parse_error: NotRequired[str]
128
617
  """Parse error information."""
129
618
  warning: NotRequired[str]
130
619
  """Warning messages."""
131
-
132
- # Table extraction metadata
133
620
  table_count: NotRequired[int]
134
621
  """Number of tables extracted from the document."""
622
+ tables_detected: NotRequired[int]
623
+ """Number of tables detected in the document."""
135
624
  tables_summary: NotRequired[str]
136
625
  """Summary of table extraction results."""
137
626
  quality_score: NotRequired[float]
138
627
  """Quality score for extracted content (0.0-1.0)."""
628
+ source_format: NotRequired[str]
629
+ """Source format of the extracted content."""
630
+ error: NotRequired[str]
631
+ """Error message if extraction failed."""
139
632
 
140
633
 
141
- # Cache valid metadata keys at module level for performance
142
634
  _VALID_METADATA_KEYS = {
143
635
  "authors",
144
636
  "categories",
@@ -189,7 +681,6 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
189
681
  if not data:
190
682
  return {}
191
683
 
192
- # Filter and return only valid metadata
193
684
  normalized: Metadata = {}
194
685
  for key, value in data.items():
195
686
  if key in _VALID_METADATA_KEYS and value is not None:
@@ -249,18 +740,15 @@ class ExtractionResult:
249
740
  Returns:
250
741
  Dictionary representation of the ExtractionResult.
251
742
  """
252
- # Use msgspec.to_builtins for efficient conversion
253
- # The builtin_types parameter allows DataFrames to pass through
254
743
  result = msgspec.to_builtins(
255
744
  self,
256
- builtin_types=(type(None),), # Allow None to pass through
257
- order="deterministic", # Ensure consistent output
745
+ builtin_types=(type(None),),
746
+ order="deterministic",
258
747
  )
259
748
 
260
749
  if include_none:
261
750
  return result # type: ignore[no-any-return]
262
751
 
263
- # Remove None values to match expected behavior
264
752
  return {k: v for k, v in result.items() if v is not None}
265
753
 
266
754
  def export_tables_to_csv(self) -> list[str]:
@@ -302,7 +790,7 @@ ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
302
790
 
303
791
 
304
792
  @dataclass(unsafe_hash=True, slots=True)
305
- class ExtractionConfig:
793
+ class ExtractionConfig(ConfigDict):
306
794
  """Represents configuration settings for an extraction process.
307
795
 
308
796
  This class encapsulates the configuration options for extracting text
@@ -317,6 +805,8 @@ class ExtractionConfig:
317
805
  """Whether to chunk the content into smaller chunks."""
318
806
  extract_tables: bool = False
319
807
  """Whether to extract tables from the content. This requires the 'gmft' dependency."""
808
+ extract_tables_from_ocr: bool = False
809
+ """Extract tables from OCR output using TSV format (Tesseract only)."""
320
810
  max_chars: int = DEFAULT_MAX_CHARACTERS
321
811
  """The size of each chunk in characters."""
322
812
  max_overlap: int = DEFAULT_MAX_OVERLAP
@@ -359,6 +849,10 @@ class ExtractionConfig:
359
849
  """Whether to apply quality post-processing to improve extraction results."""
360
850
  pdf_password: str | list[str] = ""
361
851
  """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
852
+ html_to_markdown_config: HTMLToMarkdownConfig | None = None
853
+ """Configuration for HTML to Markdown conversion. If None, uses default settings."""
854
+ use_cache: bool = True
855
+ """Whether to use caching for extraction results. Set to False to disable all caching."""
362
856
 
363
857
  def __post_init__(self) -> None:
364
858
  if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
@@ -367,9 +861,6 @@ class ExtractionConfig:
367
861
  object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
368
862
  if self.validators is not None and isinstance(self.validators, list):
369
863
  object.__setattr__(self, "validators", tuple(self.validators))
370
- from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
371
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
372
- from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
373
864
 
374
865
  if self.ocr_backend is None and self.ocr_config is not None:
375
866
  raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
@@ -391,23 +882,130 @@ class ExtractionConfig:
391
882
  A dict of the OCR configuration, always including 'use_cache'; if no backend is provided, the dict contains only 'use_cache'.
392
883
  """
393
884
  if self.ocr_backend is None:
394
- return {}
885
+ return {"use_cache": self.use_cache}
395
886
 
396
887
  if self.ocr_config is not None:
397
- # Use asdict for OCR configs to preserve enum objects correctly
398
- return asdict(self.ocr_config)
888
+ config_dict = asdict(self.ocr_config)
889
+ config_dict["use_cache"] = self.use_cache
890
+ return config_dict
399
891
 
400
- # Lazy load and cache default configs instead of creating new instances
401
892
  match self.ocr_backend:
402
893
  case "tesseract":
403
- from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
404
-
405
- return asdict(TesseractConfig())
894
+ config_dict = asdict(TesseractConfig())
895
+ config_dict["use_cache"] = self.use_cache
896
+ return config_dict
406
897
  case "easyocr":
407
- from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
898
+ config_dict = asdict(EasyOCRConfig())
899
+ config_dict["use_cache"] = self.use_cache
900
+ return config_dict
901
+ case _:
902
+ config_dict = asdict(PaddleOCRConfig())
903
+ config_dict["use_cache"] = self.use_cache
904
+ return config_dict
905
+
906
+ def to_dict(self, include_none: bool = False) -> dict[str, Any]:
907
+ """Convert configuration to dictionary recursively.
408
908
 
409
- return asdict(EasyOCRConfig())
410
- case _: # paddleocr or any other backend
411
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
909
+ Args:
910
+ include_none: If True, include fields with None values.
911
+ If False (default), exclude None values.
412
912
 
413
- return asdict(PaddleOCRConfig())
913
+ Returns:
914
+ Dictionary representation of the configuration with nested configs converted.
915
+ """
916
+ result = msgspec.to_builtins(
917
+ self,
918
+ builtin_types=(type(None),),
919
+ order="deterministic",
920
+ )
921
+
922
+ for field_name, value in result.items():
923
+ if hasattr(value, "to_dict"):
924
+ result[field_name] = value.to_dict(include_none=include_none)
925
+
926
+ if include_none:
927
+ return result # type: ignore[no-any-return]
928
+
929
+ return {k: v for k, v in result.items() if v is not None}
930
+
931
+
932
+ @dataclass(frozen=True)
933
+ class HTMLToMarkdownConfig:
934
+ """Configuration for HTML to Markdown conversion.
935
+
936
+ This configuration class provides fine-grained control over how HTML content
937
+ is converted to Markdown format. Most fields have sensible defaults that work
938
+ well for typical document extraction scenarios.
939
+ """
940
+
941
+ stream_processing: bool = False
942
+ """Enable streaming mode for processing large HTML documents."""
943
+ chunk_size: int = 1024
944
+ """Size of chunks when stream_processing is enabled."""
945
+ chunk_callback: Callable[[str], None] | None = None
946
+ """Callback function invoked for each chunk during stream processing."""
947
+ progress_callback: Callable[[int, int], None] | None = None
948
+ """Callback function for progress updates (current, total)."""
949
+ parser: str | None = "lxml"
950
+ """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
951
+ autolinks: bool = True
952
+ """Convert URLs to clickable links automatically."""
953
+ bullets: str = "*+-"
954
+ """Characters to use for unordered list bullets."""
955
+ code_language: str = ""
956
+ """Default language for code blocks."""
957
+ code_language_callback: Callable[[Any], str] | None = None
958
+ """Callback to determine code language dynamically."""
959
+ convert: str | Iterable[str] | None = None
960
+ """HTML tags to convert. If None, all supported tags are converted."""
961
+ convert_as_inline: bool = False
962
+ """Convert block elements as inline elements."""
963
+ custom_converters: Mapping[Any, Any] | None = None
964
+ """Custom converters for specific HTML elements."""
965
+ default_title: bool = False
966
+ """Use a default title if none is found."""
967
+ escape_asterisks: bool = True
968
+ """Escape asterisks in text to prevent unintended emphasis."""
969
+ escape_misc: bool = True
970
+ """Escape miscellaneous characters that have special meaning in Markdown."""
971
+ escape_underscores: bool = True
972
+ """Escape underscores in text to prevent unintended emphasis."""
973
+ extract_metadata: bool = True
974
+ """Extract metadata from HTML head section."""
975
+ heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
976
+ """Style for markdown headings."""
977
+ highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
978
+ """Style for highlighting text."""
979
+ keep_inline_images_in: Iterable[str] | None = None
980
+ """HTML tags where inline images should be preserved."""
981
+ newline_style: Literal["spaces", "backslash"] = "spaces"
982
+ """Style for line breaks in markdown."""
983
+ strip: str | Iterable[str] | None = None
984
+ """HTML tags to strip completely from output."""
985
+ strip_newlines: bool = False
986
+ """Strip newlines from the output."""
987
+ strong_em_symbol: Literal["*", "_"] = "*"
988
+ """Symbol to use for strong/emphasis formatting."""
989
+ sub_symbol: str = ""
990
+ """Symbol to use for subscript text."""
991
+ sup_symbol: str = ""
992
+ """Symbol to use for superscript text."""
993
+ wrap: bool = False
994
+ """Enable text wrapping."""
995
+ wrap_width: int = 80
996
+ """Width for text wrapping when wrap is True."""
997
+ preprocess_html: bool = True
998
+ """Enable HTML preprocessing to clean up the input."""
999
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
1000
+ """Preprocessing level for cleaning HTML."""
1001
+ remove_navigation: bool = True
1002
+ """Remove navigation elements from HTML."""
1003
+ remove_forms: bool = True
1004
+ """Remove form elements from HTML."""
1005
+
1006
+ def to_dict(self) -> dict[str, Any]:
1007
+ """Convert config to dictionary for passing to convert_to_markdown.
1008
+
1009
+ Excludes fields whose value is None.
1010
+ """
1011
+ return {key: value for key, value in self.__dict__.items() if value is not None}