kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py CHANGED
@@ -1,8 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
- from collections.abc import Awaitable, Callable
4
+ from collections.abc import Awaitable, Callable, Iterable, Mapping
5
5
  from dataclasses import asdict, dataclass, field
6
+ from enum import Enum
6
7
  from typing import TYPE_CHECKING, Any, Literal, TypedDict
7
8
 
8
9
  import msgspec
@@ -15,32 +16,492 @@ from kreuzberg._utils._table import (
15
16
  )
16
17
  from kreuzberg.exceptions import ValidationError
17
18
 
19
+ if TYPE_CHECKING:
20
+ from kreuzberg._utils._device import DeviceType
21
+
18
22
  if sys.version_info < (3, 11): # pragma: no cover
19
23
  from typing_extensions import NotRequired
20
24
  else: # pragma: no cover
21
25
  from typing import NotRequired
22
26
 
23
27
  if TYPE_CHECKING:
24
- from pandas import DataFrame
25
- from PIL.Image import Image
28
+ from pathlib import Path
26
29
 
27
- from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
28
- from kreuzberg._gmft import GMFTConfig
29
- from kreuzberg._language_detection import LanguageDetectionConfig
30
- from kreuzberg._ocr._easyocr import EasyOCRConfig
31
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
32
- from kreuzberg._ocr._tesseract import TesseractConfig
30
+ from PIL.Image import Image
31
+ from polars import DataFrame
33
32
 
34
33
  OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
34
+ OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
35
35
 
36
36
 
37
- class TableData(TypedDict):
38
- """Table data, returned from table extraction."""
37
+ class ConfigDict:
38
+ def to_dict(self, include_none: bool = False) -> dict[str, Any]:
39
+ result = msgspec.to_builtins(
40
+ self,
41
+ builtin_types=(type(None),),
42
+ order="deterministic",
43
+ )
44
+
45
+ if include_none:
46
+ return result # type: ignore[no-any-return]
47
+
48
+ return {k: v for k, v in result.items() if v is not None}
49
+
39
50
 
51
+ class PSMMode(Enum):
52
+ OSD_ONLY = 0
53
+ """Orientation and script detection only."""
54
+ AUTO_OSD = 1
55
+ """Automatic page segmentation with orientation and script detection."""
56
+ AUTO_ONLY = 2
57
+ """Automatic page segmentation without OSD."""
58
+ AUTO = 3
59
+ """Fully automatic page segmentation (default)."""
60
+ SINGLE_COLUMN = 4
61
+ """Assume a single column of text."""
62
+ SINGLE_BLOCK_VERTICAL = 5
63
+ """Assume a single uniform block of vertically aligned text."""
64
+ SINGLE_BLOCK = 6
65
+ """Assume a single uniform block of text."""
66
+ SINGLE_LINE = 7
67
+ """Treat the image as a single text line."""
68
+ SINGLE_WORD = 8
69
+ """Treat the image as a single word."""
70
+ CIRCLE_WORD = 9
71
+ """Treat the image as a single word in a circle."""
72
+ SINGLE_CHAR = 10
73
+ """Treat the image as a single character."""
74
+
75
+
76
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
77
+ class TesseractConfig(ConfigDict):
78
+ classify_use_pre_adapted_templates: bool = True
79
+ """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
80
+ language: str = "eng"
81
+ """Language code to use for OCR.
82
+ Examples:
83
+ - 'eng' for English
84
+ - 'deu' for German
85
+ - multiple languages combined with '+', e.g. 'eng+deu'
86
+ """
87
+ language_model_ngram_on: bool = False
88
+ """Enable or disable the use of n-gram-based language models for improved text recognition.
89
+ Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
90
+ psm: PSMMode = PSMMode.AUTO
91
+ """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
92
+ tessedit_dont_blkrej_good_wds: bool = True
93
+ """If True, prevents block rejection of words identified as good, improving text output quality."""
94
+ tessedit_dont_rowrej_good_wds: bool = True
95
+ """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
96
+ tessedit_enable_dict_correction: bool = True
97
+ """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
98
+ tessedit_char_whitelist: str = ""
99
+ """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
100
+ tessedit_use_primary_params_model: bool = True
101
+ """If True, forces the use of the primary parameters model for text recognition."""
102
+ textord_space_size_is_variable: bool = True
103
+ """Allow variable spacing between words, useful for text with irregular spacing."""
104
+ thresholding_method: bool = False
105
+ """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
106
+ output_format: OutputFormatType = "markdown"
107
+ """Output format: 'markdown' (default), 'text', 'tsv' (for structured data), or 'hocr' (HTML-based)."""
108
+ enable_table_detection: bool = False
109
+ """Enable table structure detection from TSV output."""
110
+ table_column_threshold: int = 20
111
+ """Pixel threshold for column clustering in table detection."""
112
+ table_row_threshold_ratio: float = 0.5
113
+ """Row threshold as ratio of mean text height for table detection."""
114
+ table_min_confidence: float = 30.0
115
+ """Minimum confidence score to include a word in table extraction."""
116
+
117
+
118
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
119
+ class EasyOCRConfig(ConfigDict):
120
+ add_margin: float = 0.1
121
+ """Extend bounding boxes in all directions."""
122
+ adjust_contrast: float = 0.5
123
+ """Target contrast level for low contrast text."""
124
+ beam_width: int = 5
125
+ """Beam width for beam search in recognition."""
126
+ canvas_size: int = 2560
127
+ """Maximum image dimension for detection."""
128
+ contrast_ths: float = 0.1
129
+ """Contrast threshold for preprocessing."""
130
+ decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
131
+ """Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
132
+ height_ths: float = 0.5
133
+ """Maximum difference in box height for merging."""
134
+ language: str | list[str] = "en"
135
+ """Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
136
+ a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
137
+ link_threshold: float = 0.4
138
+ """Link confidence threshold."""
139
+ low_text: float = 0.4
140
+ """Text low-bound score."""
141
+ mag_ratio: float = 1.0
142
+ """Image magnification ratio."""
143
+ min_size: int = 10
144
+ """Minimum text box size in pixels."""
145
+ rotation_info: list[int] | None = None
146
+ """List of angles to try for detection."""
147
+ slope_ths: float = 0.1
148
+ """Maximum slope for merging text boxes."""
149
+ text_threshold: float = 0.7
150
+ """Text confidence threshold."""
151
+ use_gpu: bool = False
152
+ """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
153
+ device: DeviceType = "auto"
154
+ """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
155
+ gpu_memory_limit: float | None = None
156
+ """Maximum GPU memory to use in GB. None for no limit."""
157
+ fallback_to_cpu: bool = True
158
+ """Whether to fallback to CPU if requested device is unavailable."""
159
+ width_ths: float = 0.5
160
+ """Maximum horizontal distance for merging boxes."""
161
+ x_ths: float = 1.0
162
+ """Maximum horizontal distance for paragraph merging."""
163
+ y_ths: float = 0.5
164
+ """Maximum vertical distance for paragraph merging."""
165
+ ycenter_ths: float = 0.5
166
+ """Maximum shift in y direction for merging."""
167
+
168
+
169
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
170
+ class PaddleOCRConfig(ConfigDict):
171
+ cls_image_shape: str = "3,48,192"
172
+ """Image shape for classification algorithm in format 'channels,height,width'."""
173
+ det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
174
+ """Detection algorithm."""
175
+ det_db_box_thresh: float = 0.5
176
+ """DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_box_thresh' instead. Score threshold for detected boxes."""
177
+ det_db_thresh: float = 0.3
178
+ """DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_thresh' instead. Binarization threshold for DB output map."""
179
+ det_db_unclip_ratio: float = 2.0
180
+ """DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_unclip_ratio' instead. Expansion ratio for detected text boxes."""
181
+ det_east_cover_thresh: float = 0.1
182
+ """Score threshold for EAST output boxes."""
183
+ det_east_nms_thresh: float = 0.2
184
+ """NMS threshold for EAST model output boxes."""
185
+ det_east_score_thresh: float = 0.8
186
+ """Binarization threshold for EAST output map."""
187
+ det_max_side_len: int = 960
188
+ """Maximum size of image long side. Images exceeding this will be proportionally resized."""
189
+ det_model_dir: str | None = None
190
+ """Directory for detection model. If None, uses default model location."""
191
+ drop_score: float = 0.5
192
+ """Filter recognition results by confidence score. Results below this are discarded."""
193
+ enable_mkldnn: bool = False
194
+ """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
195
+ gpu_mem: int = 8000
196
+ """DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. GPU memory size (in MB) to use for initialization."""
197
+ language: str = "en"
198
+ """Language to use for OCR."""
199
+ max_text_length: int = 25
200
+ """Maximum text length that the recognition algorithm can recognize."""
201
+ rec: bool = True
202
+ """Enable text recognition when using the ocr() function."""
203
+ rec_algorithm: Literal[
204
+ "CRNN",
205
+ "SRN",
206
+ "NRTR",
207
+ "SAR",
208
+ "SEED",
209
+ "SVTR",
210
+ "SVTR_LCNet",
211
+ "ViTSTR",
212
+ "ABINet",
213
+ "VisionLAN",
214
+ "SPIN",
215
+ "RobustScanner",
216
+ "RFL",
217
+ ] = "CRNN"
218
+ """Recognition algorithm."""
219
+ rec_image_shape: str = "3,32,320"
220
+ """Image shape for recognition algorithm in format 'channels,height,width'."""
221
+ rec_model_dir: str | None = None
222
+ """Directory for recognition model. If None, uses default model location."""
223
+ table: bool = True
224
+ """Whether to enable table recognition."""
225
+ use_angle_cls: bool = True
226
+ """DEPRECATED in PaddleOCR 3.2.0+: Use 'use_textline_orientation' instead. Whether to use text orientation classification model."""
227
+ use_gpu: bool = False
228
+ """DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. Use hardware acceleration flags instead."""
229
+ device: DeviceType = "auto"
230
+ """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
231
+ gpu_memory_limit: float | None = None
232
+ """DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. Maximum GPU memory to use in GB."""
233
+ fallback_to_cpu: bool = True
234
+ """Whether to fallback to CPU if requested device is unavailable."""
235
+ use_space_char: bool = True
236
+ """Whether to recognize spaces."""
237
+ use_zero_copy_run: bool = False
238
+ """Whether to enable zero_copy_run for inference optimization."""
239
+
240
+ text_det_thresh: float = 0.3
241
+ """Binarization threshold for text detection output map (replaces det_db_thresh)."""
242
+ text_det_box_thresh: float = 0.5
243
+ """Score threshold for detected text boxes (replaces det_db_box_thresh)."""
244
+ text_det_unclip_ratio: float = 2.0
245
+ """Expansion ratio for detected text boxes (replaces det_db_unclip_ratio)."""
246
+ use_textline_orientation: bool = True
247
+ """Whether to use text line orientation classification model (replaces use_angle_cls)."""
248
+
249
+
250
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
251
+ class GMFTConfig(ConfigDict):
252
+ verbosity: int = 0
253
+ """
254
+ Verbosity level for logging.
255
+
256
+ 0: errors only
257
+ 1: print warnings
258
+ 2: print warnings and info
259
+ 3: print warnings, info, and debug
260
+ """
261
+ formatter_base_threshold: float = 0.3
262
+ """
263
+ Base threshold for the confidence demanded of a table feature (row/column).
264
+
265
+ Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
266
+ """
267
+ cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
268
+ default_factory=lambda: {
269
+ 0: 0.3,
270
+ 1: 0.3,
271
+ 2: 0.3,
272
+ 3: 0.3,
273
+ 4: 0.5,
274
+ 5: 0.5,
275
+ 6: 99,
276
+ },
277
+ hash=False,
278
+ )
279
+ """
280
+ Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
281
+
282
+ But low confidences may be better than too high confidence (see formatter_base_threshold)
283
+ """
284
+ detector_base_threshold: float = 0.9
285
+ """Minimum confidence score required for a table"""
286
+ remove_null_rows: bool = True
287
+ """
288
+ Flag to remove rows with no text.
289
+ """
290
+ enable_multi_header: bool = False
291
+ """
292
+ Enable multi-indices in the dataframe.
293
+
294
+ If false, then multiple headers will be merged column-wise.
295
+ """
296
+ semantic_spanning_cells: bool = False
297
+ """
298
+ [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
299
+ """
300
+ semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
301
+ """
302
+ [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
303
+
304
+ Possible values: 'algorithm', 'deep', None.
305
+ """
306
+ large_table_if_n_rows_removed: int = 8
307
+ """
308
+ If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
309
+ """
310
+ large_table_threshold: int = 10
311
+ """
312
+ With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
313
+
314
+ Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
315
+ """
316
+ large_table_row_overlap_threshold: float = 0.2
317
+ """
318
+ With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
319
+
320
+ Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
321
+ """
322
+ large_table_maximum_rows: int = 1000
323
+ """
324
+ Maximum number of rows allowed for a large table.
325
+ """
326
+ force_large_table_assumption: bool | None = None
327
+ """
328
+ Force the large table assumption to be applied, regardless of the number of rows and overlap.
329
+ """
330
+ total_overlap_reject_threshold: float = 0.9
331
+ """
332
+ Reject if total overlap is > 90% of table area.
333
+ """
334
+ total_overlap_warn_threshold: float = 0.1
335
+ """
336
+ Warn if total overlap is > 10% of table area.
337
+ """
338
+ nms_warn_threshold: int = 5
339
+ """
340
+ Warn if non maxima suppression removes > 5 rows.
341
+ """
342
+ iob_reject_threshold: float = 0.05
343
+ """
344
+ Reject if iob between textbox and cell is < 5%.
345
+ """
346
+ iob_warn_threshold: float = 0.5
347
+ """
348
+ Warn if iob between textbox and cell is < 50%.
349
+ """
350
+
351
+
352
+ @dataclass(frozen=True, slots=True)
353
+ class LanguageDetectionConfig(ConfigDict):
354
+ low_memory: bool = True
355
+ """If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
356
+ Defaults to True for better memory efficiency."""
357
+ top_k: int = 3
358
+ """Maximum number of languages to return for multilingual detection."""
359
+ multilingual: bool = False
360
+ """If True, uses multilingual detection to handle mixed-language text.
361
+ If False, uses single language detection."""
362
+ cache_dir: str | None = None
363
+ """Custom directory for model cache. If None, uses system default."""
364
+ allow_fallback: bool = True
365
+ """If True, falls back to small model if large model fails."""
366
+
367
+
368
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
369
+ class SpacyEntityExtractionConfig(ConfigDict):
370
+ model_cache_dir: str | Path | None = None
371
+ """Directory to cache spaCy models. If None, uses spaCy's default."""
372
+ language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
373
+ """Mapping of language codes to spaCy model names.
374
+
375
+ If None, uses default mappings:
376
+ - en: en_core_web_sm
377
+ - de: de_core_news_sm
378
+ - fr: fr_core_news_sm
379
+ - es: es_core_news_sm
380
+ - pt: pt_core_news_sm
381
+ - it: it_core_news_sm
382
+ - nl: nl_core_news_sm
383
+ - zh: zh_core_web_sm
384
+ - ja: ja_core_news_sm
385
+ """
386
+ fallback_to_multilingual: bool = True
387
+ """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
388
+ max_doc_length: int = 1000000
389
+ """Maximum document length for spaCy processing."""
390
+ batch_size: int = 1000
391
+ """Batch size for processing multiple texts."""
392
+
393
+ def __post_init__(self) -> None:
394
+ if self.language_models is None:
395
+ object.__setattr__(self, "language_models", self._get_default_language_models())
396
+
397
+ if isinstance(self.language_models, dict):
398
+ object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
399
+
400
+ @staticmethod
401
+ def _get_default_language_models() -> dict[str, str]:
402
+ return {
403
+ "en": "en_core_web_sm",
404
+ "de": "de_core_news_sm",
405
+ "fr": "fr_core_news_sm",
406
+ "es": "es_core_news_sm",
407
+ "pt": "pt_core_news_sm",
408
+ "it": "it_core_news_sm",
409
+ "nl": "nl_core_news_sm",
410
+ "zh": "zh_core_web_sm",
411
+ "ja": "ja_core_news_sm",
412
+ "ko": "ko_core_news_sm",
413
+ "ru": "ru_core_news_sm",
414
+ "pl": "pl_core_news_sm",
415
+ "ro": "ro_core_news_sm",
416
+ "el": "el_core_news_sm",
417
+ "da": "da_core_news_sm",
418
+ "fi": "fi_core_news_sm",
419
+ "nb": "nb_core_news_sm",
420
+ "sv": "sv_core_news_sm",
421
+ "ca": "ca_core_news_sm",
422
+ "hr": "hr_core_news_sm",
423
+ "lt": "lt_core_news_sm",
424
+ "mk": "mk_core_news_sm",
425
+ "sl": "sl_core_news_sm",
426
+ "uk": "uk_core_news_sm",
427
+ "xx": "xx_ent_wiki_sm",
428
+ }
429
+
430
+ def get_model_for_language(self, language_code: str) -> str | None:
431
+ if not self.language_models:
432
+ return None
433
+
434
+ models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
435
+
436
+ if language_code in models_dict:
437
+ return models_dict[language_code]
438
+
439
+ base_lang = language_code.split("-")[0].lower()
440
+ if base_lang in models_dict:
441
+ return models_dict[base_lang]
442
+
443
+ return None
444
+
445
+ def get_fallback_model(self) -> str | None:
446
+ return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
447
+
448
+
449
+ class BoundingBox(TypedDict):
450
+ left: int
451
+ """X coordinate of the left edge."""
452
+ top: int
453
+ """Y coordinate of the top edge."""
454
+ width: int
455
+ """Width of the bounding box."""
456
+ height: int
457
+ """Height of the bounding box."""
458
+
459
+
460
+ class TSVWord(TypedDict):
461
+ level: int
462
+ """Hierarchy level (1=page, 2=block, 3=para, 4=line, 5=word)."""
463
+ page_num: int
464
+ """Page number."""
465
+ block_num: int
466
+ """Block number within the page."""
467
+ par_num: int
468
+ """Paragraph number within the block."""
469
+ line_num: int
470
+ """Line number within the paragraph."""
471
+ word_num: int
472
+ """Word number within the line."""
473
+ left: int
474
+ """X coordinate of the left edge of the word."""
475
+ top: int
476
+ """Y coordinate of the top edge of the word."""
477
+ width: int
478
+ """Width of the word bounding box."""
479
+ height: int
480
+ """Height of the word bounding box."""
481
+ conf: float
482
+ """Confidence score (0-100)."""
483
+ text: str
484
+ """The recognized text content."""
485
+
486
+
487
+ class TableCell(TypedDict):
488
+ row: int
489
+ """Row index (0-based)."""
490
+ col: int
491
+ """Column index (0-based)."""
492
+ text: str
493
+ """Cell text content."""
494
+ bbox: BoundingBox
495
+ """Bounding box of the cell."""
496
+ confidence: float
497
+ """Average confidence of words in the cell."""
498
+
499
+
500
+ class TableData(TypedDict):
40
501
  cropped_image: Image
41
502
  """The cropped image of the table."""
42
- df: DataFrame
43
- """The table data as a pandas DataFrame."""
503
+ df: DataFrame | None
504
+ """The table data as a polars DataFrame."""
44
505
  page_number: int
45
506
  """The page number of the table."""
46
507
  text: str
@@ -48,12 +509,6 @@ class TableData(TypedDict):
48
509
 
49
510
 
50
511
  class Metadata(TypedDict, total=False):
51
- """Base metadata common to all document types.
52
-
53
- All fields will only be included if they contain non-empty values.
54
- Any field that would be empty or None is omitted from the dictionary.
55
- """
56
-
57
512
  authors: NotRequired[list[str]]
58
513
  """List of document authors."""
59
514
  categories: NotRequired[list[str]]
@@ -106,8 +561,6 @@ class Metadata(TypedDict, total=False):
106
561
  """Version identifier or revision number."""
107
562
  width: NotRequired[int]
108
563
  """Width of the document page/slide/image, if applicable."""
109
-
110
- # Email-specific fields
111
564
  email_from: NotRequired[str]
112
565
  """Email sender (from field)."""
113
566
  email_to: NotRequired[str]
@@ -120,25 +573,26 @@ class Metadata(TypedDict, total=False):
120
573
  """Email date or document date."""
121
574
  attachments: NotRequired[list[str]]
122
575
  """List of attachment names."""
123
-
124
- # Additional metadata fields for various extractors
125
576
  content: NotRequired[str]
126
577
  """Content metadata field."""
127
578
  parse_error: NotRequired[str]
128
579
  """Parse error information."""
129
580
  warning: NotRequired[str]
130
581
  """Warning messages."""
131
-
132
- # Table extraction metadata
133
582
  table_count: NotRequired[int]
134
583
  """Number of tables extracted from the document."""
584
+ tables_detected: NotRequired[int]
585
+ """Number of tables detected in the document."""
135
586
  tables_summary: NotRequired[str]
136
587
  """Summary of table extraction results."""
137
588
  quality_score: NotRequired[float]
138
589
  """Quality score for extracted content (0.0-1.0)."""
590
+ source_format: NotRequired[str]
591
+ """Source format of the extracted content."""
592
+ error: NotRequired[str]
593
+ """Error message if extraction failed."""
139
594
 
140
595
 
141
- # Cache valid metadata keys at module level for performance
142
596
  _VALID_METADATA_KEYS = {
143
597
  "authors",
144
598
  "categories",
@@ -182,14 +636,9 @@ _VALID_METADATA_KEYS = {
182
636
 
183
637
 
184
638
  def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
185
- """Normalize any dict to proper Metadata TypedDict.
186
-
187
- Filters out invalid keys and ensures type safety.
188
- """
189
639
  if not data:
190
640
  return {}
191
641
 
192
- # Filter and return only valid metadata
193
642
  normalized: Metadata = {}
194
643
  for key, value in data.items():
195
644
  if key in _VALID_METADATA_KEYS and value is not None:
@@ -200,8 +649,6 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
200
649
 
201
650
  @dataclass(frozen=True, slots=True)
202
651
  class Entity:
203
- """Represents an extracted entity with type, text, and position."""
204
-
205
652
  type: str
206
653
  """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
207
654
  text: str
@@ -214,8 +661,6 @@ class Entity:
214
661
 
215
662
  @dataclass(slots=True)
216
663
  class ExtractionResult:
217
- """The result of a file extraction."""
218
-
219
664
  content: str
220
665
  """The extracted content."""
221
666
  mime_type: str
@@ -240,57 +685,30 @@ class ExtractionResult:
240
685
  """Internal layout data from OCR, not for public use."""
241
686
 
242
687
  def to_dict(self, include_none: bool = False) -> dict[str, Any]:
243
- """Converts the ExtractionResult to a dictionary.
244
-
245
- Args:
246
- include_none: If True, include fields with None values.
247
- If False (default), exclude None values.
248
-
249
- Returns:
250
- Dictionary representation of the ExtractionResult.
251
- """
252
- # Use msgspec.to_builtins for efficient conversion
253
- # The builtin_types parameter allows DataFrames to pass through
254
688
  result = msgspec.to_builtins(
255
689
  self,
256
- builtin_types=(type(None),), # Allow None to pass through
257
- order="deterministic", # Ensure consistent output
690
+ builtin_types=(type(None),),
691
+ order="deterministic",
258
692
  )
259
693
 
260
694
  if include_none:
261
695
  return result # type: ignore[no-any-return]
262
696
 
263
- # Remove None values to match expected behavior
264
697
  return {k: v for k, v in result.items() if v is not None}
265
698
 
266
699
  def export_tables_to_csv(self) -> list[str]:
267
- """Export all tables to CSV format.
268
-
269
- Returns:
270
- List of CSV strings, one per table
271
- """
272
700
  if not self.tables: # pragma: no cover
273
701
  return []
274
702
 
275
703
  return [export_table_to_csv(table) for table in self.tables]
276
704
 
277
705
  def export_tables_to_tsv(self) -> list[str]:
278
- """Export all tables to TSV format.
279
-
280
- Returns:
281
- List of TSV strings, one per table
282
- """
283
706
  if not self.tables: # pragma: no cover
284
707
  return []
285
708
 
286
709
  return [export_table_to_tsv(table) for table in self.tables]
287
710
 
288
711
  def get_table_summaries(self) -> list[dict[str, Any]]:
289
- """Get structural information for all tables.
290
-
291
- Returns:
292
- List of table structure dictionaries
293
- """
294
712
  if not self.tables: # pragma: no cover
295
713
  return []
296
714
 
@@ -302,21 +720,15 @@ ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
302
720
 
303
721
 
304
722
  @dataclass(unsafe_hash=True, slots=True)
305
- class ExtractionConfig:
306
- """Represents configuration settings for an extraction process.
307
-
308
- This class encapsulates the configuration options for extracting text
309
- from images or documents using Optical Character Recognition (OCR). It
310
- provides options to customize the OCR behavior, select the backend
311
- engine, and configure engine-specific parameters.
312
- """
313
-
723
+ class ExtractionConfig(ConfigDict):
314
724
  force_ocr: bool = False
315
725
  """Whether to force OCR."""
316
726
  chunk_content: bool = False
317
727
  """Whether to chunk the content into smaller chunks."""
318
728
  extract_tables: bool = False
319
729
  """Whether to extract tables from the content. This requires the 'gmft' dependency."""
730
+ extract_tables_from_ocr: bool = False
731
+ """Extract tables from OCR output using TSV format (Tesseract only)."""
320
732
  max_chars: int = DEFAULT_MAX_CHARACTERS
321
733
  """The size of each chunk in characters."""
322
734
  max_overlap: int = DEFAULT_MAX_OVERLAP
@@ -359,6 +771,10 @@ class ExtractionConfig:
359
771
  """Whether to apply quality post-processing to improve extraction results."""
360
772
  pdf_password: str | list[str] = ""
361
773
  """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
774
+ html_to_markdown_config: HTMLToMarkdownConfig | None = None
775
+ """Configuration for HTML to Markdown conversion. If None, uses default settings."""
776
+ use_cache: bool = True
777
+ """Whether to use caching for extraction results. Set to False to disable all caching."""
362
778
 
363
779
  def __post_init__(self) -> None:
364
780
  if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
@@ -367,9 +783,6 @@ class ExtractionConfig:
367
783
  object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
368
784
  if self.validators is not None and isinstance(self.validators, list):
369
785
  object.__setattr__(self, "validators", tuple(self.validators))
370
- from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
371
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
372
- from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
373
786
 
374
787
  if self.ocr_backend is None and self.ocr_config is not None:
375
788
  raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
@@ -385,29 +798,111 @@ class ExtractionConfig:
385
798
  )
386
799
 
387
800
  def get_config_dict(self) -> dict[str, Any]:
388
- """Returns the OCR configuration object based on the backend specified.
389
-
390
- Returns:
391
- A dict of the OCR configuration or an empty dict if no backend is provided.
392
- """
393
801
  if self.ocr_backend is None:
394
- return {}
802
+ return {"use_cache": self.use_cache}
395
803
 
396
804
  if self.ocr_config is not None:
397
- # Use asdict for OCR configs to preserve enum objects correctly
398
- return asdict(self.ocr_config)
805
+ config_dict = asdict(self.ocr_config)
806
+ config_dict["use_cache"] = self.use_cache
807
+ return config_dict
399
808
 
400
- # Lazy load and cache default configs instead of creating new instances
401
809
  match self.ocr_backend:
402
810
  case "tesseract":
403
- from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
404
-
405
- return asdict(TesseractConfig())
811
+ config_dict = asdict(TesseractConfig())
812
+ config_dict["use_cache"] = self.use_cache
813
+ return config_dict
406
814
  case "easyocr":
407
- from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
815
+ config_dict = asdict(EasyOCRConfig())
816
+ config_dict["use_cache"] = self.use_cache
817
+ return config_dict
818
+ case _:
819
+ config_dict = asdict(PaddleOCRConfig())
820
+ config_dict["use_cache"] = self.use_cache
821
+ return config_dict
822
+
823
+ def to_dict(self, include_none: bool = False) -> dict[str, Any]:
824
+ result = msgspec.to_builtins(
825
+ self,
826
+ builtin_types=(type(None),),
827
+ order="deterministic",
828
+ )
829
+
830
+ for field_name, value in result.items():
831
+ if hasattr(value, "to_dict"):
832
+ result[field_name] = value.to_dict(include_none=include_none)
833
+
834
+ if include_none:
835
+ return result # type: ignore[no-any-return]
836
+
837
+ return {k: v for k, v in result.items() if v is not None}
408
838
 
409
- return asdict(EasyOCRConfig())
410
- case _: # paddleocr or any other backend
411
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
412
839
 
413
- return asdict(PaddleOCRConfig())
840
+ @dataclass(frozen=True)
841
+ class HTMLToMarkdownConfig:
842
+ stream_processing: bool = False
843
+ """Enable streaming mode for processing large HTML documents."""
844
+ chunk_size: int = 1024
845
+ """Size of chunks when stream_processing is enabled."""
846
+ chunk_callback: Callable[[str], None] | None = None
847
+ """Callback function invoked for each chunk during stream processing."""
848
+ progress_callback: Callable[[int, int], None] | None = None
849
+ """Callback function for progress updates (current, total)."""
850
+ parser: str | None = "lxml"
851
+ """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
852
+ autolinks: bool = True
853
+ """Convert URLs to clickable links automatically."""
854
+ bullets: str = "*+-"
855
+ """Characters to use for unordered list bullets."""
856
+ code_language: str = ""
857
+ """Default language for code blocks."""
858
+ code_language_callback: Callable[[Any], str] | None = None
859
+ """Callback to determine code language dynamically."""
860
+ convert: str | Iterable[str] | None = None
861
+ """HTML tags to convert. If None, all supported tags are converted."""
862
+ convert_as_inline: bool = False
863
+ """Convert block elements as inline elements."""
864
+ custom_converters: Mapping[Any, Any] | None = None
865
+ """Custom converters for specific HTML elements."""
866
+ default_title: bool = False
867
+ """Use a default title if none is found."""
868
+ escape_asterisks: bool = True
869
+ """Escape asterisks in text to prevent unintended emphasis."""
870
+ escape_misc: bool = True
871
+ """Escape miscellaneous characters that have special meaning in Markdown."""
872
+ escape_underscores: bool = True
873
+ """Escape underscores in text to prevent unintended emphasis."""
874
+ extract_metadata: bool = True
875
+ """Extract metadata from HTML head section."""
876
+ heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
877
+ """Style for markdown headings."""
878
+ highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
879
+ """Style for highlighting text."""
880
+ keep_inline_images_in: Iterable[str] | None = None
881
+ """HTML tags where inline images should be preserved."""
882
+ newline_style: Literal["spaces", "backslash"] = "spaces"
883
+ """Style for line breaks in markdown."""
884
+ strip: str | Iterable[str] | None = None
885
+ """HTML tags to strip completely from output."""
886
+ strip_newlines: bool = False
887
+ """Strip newlines from the output."""
888
+ strong_em_symbol: Literal["*", "_"] = "*"
889
+ """Symbol to use for strong/emphasis formatting."""
890
+ sub_symbol: str = ""
891
+ """Symbol to use for subscript text."""
892
+ sup_symbol: str = ""
893
+ """Symbol to use for superscript text."""
894
+ wrap: bool = False
895
+ """Enable text wrapping."""
896
+ wrap_width: int = 80
897
+ """Width for text wrapping when wrap is True."""
898
+ preprocess_html: bool = True
899
+ """Enable HTML preprocessing to clean up the input."""
900
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
901
+ """Preprocessing level for cleaning HTML."""
902
+ remove_navigation: bool = True
903
+ """Remove navigation elements from HTML."""
904
+ remove_forms: bool = True
905
+ """Remove form elements from HTML."""
906
+
907
+ def to_dict(self) -> dict[str, Any]:
908
+ return {key: value for key, value in self.__dict__.items() if value is not None}