kreuzberg-3.11.4-py3-none-any.whl → kreuzberg-3.13.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_gmft.py
CHANGED
@@ -7,16 +7,17 @@ import queue
 import signal
 import time
 import traceback
-from dataclasses import dataclass, field
 from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any

 import anyio
 import msgspec
+import pandas as pd
 from PIL import Image

-from kreuzberg._types import TableData
+from kreuzberg._types import GMFTConfig, TableData
+from kreuzberg._utils._cache import get_table_cache
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError

@@ -27,139 +28,9 @@ if TYPE_CHECKING:
     from pandas import DataFrame


-@dataclass(unsafe_hash=True, slots=True)
-class GMFTConfig:
-    """Configuration options for GMFT.
-
-    This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
-    """
-
-    verbosity: int = 0
-    """
-    Verbosity level for logging.
-
-    0: errors only
-    1: print warnings
-    2: print warnings and info
-    3: print warnings, info, and debug
-    """
-    formatter_base_threshold: float = 0.3
-    """
-    Base threshold for the confidence demanded of a table feature (row/column).
-
-    Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
-    """
-    cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
-        default_factory=lambda: {
-            0: 0.3,
-            1: 0.3,
-            2: 0.3,
-            3: 0.3,
-            4: 0.5,
-            5: 0.5,
-            6: 99,
-        },
-        hash=False,
-    )
-    """
-    Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
-
-    But low confidences may be better than too high confidence (see formatter_base_threshold)
-    """
-    detector_base_threshold: float = 0.9
-    """Minimum confidence score required for a table"""
-    remove_null_rows: bool = True
-    """
-    Flag to remove rows with no text.
-    """
-    enable_multi_header: bool = False
-    """
-    Enable multi-indices in the dataframe.
-
-    If false, then multiple headers will be merged column-wise.
-    """
-    semantic_spanning_cells: bool = False
-    """
-    [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
-    """
-    semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
-    """
-    [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
-
-    Possible values: 'algorithm', 'deep', None.
-
-    'algorithm': assumes that the higher-level header is always the first row followed by several empty rows.
-    'deep': merges headers according to the spanning cells detected by the Table Transformer.
-    None: headers are not duplicated.
-    """
-    large_table_if_n_rows_removed: int = 8
-    """
-    If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
-    """
-    large_table_threshold: int = 10
-    """
-    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
-
-    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
-    """
-    large_table_row_overlap_threshold: float = 0.2
-    """
-    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
-
-    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
-    """
-    large_table_maximum_rows: int = 1000
-    """
-    Maximum number of rows allowed for a large table.
-    """
-    force_large_table_assumption: bool | None = None
-    """
-    Force the large table assumption to be applied, regardless of the number of rows and overlap.
-    """
-    total_overlap_reject_threshold: float = 0.9
-    """
-    Reject if total overlap is > 90% of table area.
-    """
-    total_overlap_warn_threshold: float = 0.1
-    """
-    Warn if total overlap is > 10% of table area.
-    """
-    nms_warn_threshold: int = 5
-    """
-    Warn if non maxima suppression removes > 5 rows.
-    """
-    iob_reject_threshold: float = 0.05
-    """
-    Reject if iob between textbox and cell is < 5%.
-    """
-    iob_warn_threshold: float = 0.5
-    """
-    Warn if iob between textbox and cell is < 50%.
-    """
-
-
 async def extract_tables(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
-    """Extracts tables from a PDF file.
-
-    This function takes a file path to a PDF file, and an optional configuration object.
-    It returns a list of strings, where each string is a markdown-formatted table.
-
-    Args:
-        file_path: The path to the PDF file.
-        config: An optional configuration object.
-        use_isolated_process: Whether to use an isolated process for extraction.
-            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
-
-    Raises:
-        MissingDependencyError: Raised when the required dependencies are not installed.
-
-    Returns:
-        A list of table data dictionaries.
-    """
-    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
-
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
         use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -211,15 +82,15 @@ async def extract_tables(
         return result

     try:
-        from gmft.auto import (  # type: ignore[attr-defined]  # noqa: PLC0415
+        from gmft.auto import (  # type: ignore[attr-defined]  # noqa: PLC0415
             AutoTableDetector,
             AutoTableFormatter,
         )
         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]  # noqa: PLC0415
-        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
+        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415

-        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
             config=TATRFormatConfig(
                 verbosity=config.verbosity,
                 formatter_base_threshold=config.formatter_base_threshold,
@@ -235,7 +106,7 @@ async def extract_tables(
                 force_large_table_assumption=config.force_large_table_assumption,
             )
         )
-        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
             config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
         )
         doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -276,19 +147,6 @@ async def extract_tables(
 def extract_tables_sync(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
-    """Synchronous wrapper for extract_tables.
-
-    Args:
-        file_path: The path to the PDF file.
-        config: An optional configuration object.
-        use_isolated_process: Whether to use an isolated process for extraction.
-            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
-
-    Returns:
-        A list of table data dictionaries.
-    """
-    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
-
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
         use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -390,13 +248,6 @@ def _extract_tables_in_process(
     config_dict: dict[str, Any],
     result_queue: queue.Queue[tuple[bool, Any]],
 ) -> None:
-    """Extract tables in an isolated process to handle potential segfaults.
-
-    Args:
-        file_path: Path to the PDF file
-        config_dict: Serialized GMFTConfig as a dict
-        result_queue: Queue to put results or errors
-    """
     signal.signal(signal.SIGINT, signal.SIG_IGN)

     try:
@@ -480,19 +331,6 @@ def _extract_tables_isolated(
     config: GMFTConfig | None = None,
     timeout: float = 300.0,
 ) -> list[TableData]:
-    """Extract tables using an isolated process to handle segfaults.
-
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-
-    Returns:
-        List of extracted tables
-
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
     config = config or GMFTConfig()
     config_dict = msgspec.to_builtins(config)

@@ -542,7 +380,6 @@ def _extract_tables_isolated(
         tables = []
         for table_dict in result:
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-            import pandas as pd  # noqa: PLC0415

             if table_dict["df_csv"] is None:
                 df = pd.DataFrame(columns=table_dict["df_columns"])
@@ -592,19 +429,6 @@ async def _extract_tables_isolated_async(
     config: GMFTConfig | None = None,
     timeout: float = 300.0,  # noqa: ASYNC109
 ) -> list[TableData]:
-    """Async version of extract_tables_isolated using asyncio.
-
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-
-    Returns:
-        List of extracted tables
-
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
    config = config or GMFTConfig()
     config_dict = msgspec.to_builtins(config)

@@ -620,38 +444,29 @@ async def _extract_tables_isolated_async(

     try:

-
+        def get_result_sync() -> tuple[bool, Any]:
             while True:
                 try:
-                    return result_queue.
+                    return result_queue.get(timeout=0.1)  # type: ignore[no-any-return]
                 except queue.Empty:  # noqa: PERF203
-                    await anyio.sleep(0.1)
                     if not process.is_alive():
-                        # Process died without putting result  # ~keep
                         if process.exitcode == -signal.SIGSEGV:
                             raise ParsingError(
                                 "GMFT process crashed with segmentation fault",
-                                context={
-                                    "file_path": str(file_path),
-                                    "exit_code": process.exitcode,
-                                },
+                                context={"file_path": str(file_path), "exit_code": process.exitcode},
                             ) from None
                         raise ParsingError(
                             f"GMFT process died unexpectedly with exit code {process.exitcode}",
-                            context={
-                                "file_path": str(file_path),
-                                "exit_code": process.exitcode,
-                            },
+                            context={"file_path": str(file_path), "exit_code": process.exitcode},
                         ) from None

         with anyio.fail_after(timeout):
-            success, result = await
+            success, result = await anyio.to_thread.run_sync(get_result_sync)

         if success:
             tables = []
             for table_dict in result:
                 img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-                import pandas as pd  # noqa: PLC0415

                 if table_dict["df_csv"] is None:
                     df = pd.DataFrame(columns=table_dict["df_columns"])
kreuzberg/_language_detection.py
CHANGED
@@ -1,9 +1,9 @@
 from __future__ import annotations

-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any

+from kreuzberg._types import LanguageDetectionConfig
 from kreuzberg.exceptions import MissingDependencyError

 if TYPE_CHECKING:
@@ -23,29 +23,7 @@ except ImportError:  # pragma: no cover
 _CACHE_SIZE = 128


-@dataclass(frozen=True, slots=True)
-class LanguageDetectionConfig:
-    """Configuration for language detection.
-
-    Attributes:
-        low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
-            Defaults to True for better memory efficiency.
-        top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
-        multilingual: If True, uses multilingual detection to handle mixed-language text.
-            If False, uses single language detection. Defaults to False.
-        cache_dir: Custom directory for model cache. If None, uses system default.
-        allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
-    """
-
-    low_memory: bool = True
-    top_k: int = 3
-    multilingual: bool = False
-    cache_dir: str | None = None
-    allow_fallback: bool = True
-
-
 def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
-    """Create FastLangDetectConfig from our config."""
     if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
         return None

@@ -60,19 +38,6 @@ def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:

 @lru_cache(maxsize=_CACHE_SIZE)
 def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
-    """Detect the most probable languages in the given text using fast-langdetect.
-
-    Args:
-        text: The text to analyze.
-        config: Configuration for language detection. If None, uses defaults.
-
-    Returns:
-        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
-        or None if detection fails.
-
-    Raises:
-        MissingDependencyError: If fast-langdetect is not installed.
-    """
     if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
         raise MissingDependencyError.create_for_package(
             dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
kreuzberg/_mcp/__init__.py
CHANGED
kreuzberg/_mcp/server.py
CHANGED
@@ -1,5 +1,3 @@
-"""Kreuzberg MCP server implementation."""
-
 from __future__ import annotations

 import base64
@@ -10,11 +8,10 @@ import msgspec
 from mcp.server import FastMCP
 from mcp.types import TextContent

-from kreuzberg._config import try_discover_config
+from kreuzberg._config import discover_config
 from kreuzberg._types import ExtractionConfig, OcrBackendType
 from kreuzberg.extraction import extract_bytes_sync, extract_file_sync

-# Create the MCP server
 mcp = FastMCP("Kreuzberg Text Extraction")


@@ -27,14 +24,11 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     Returns:
         ExtractionConfig instance.
     """
-
-    base_config = try_discover_config()
+    base_config = discover_config()

     if base_config is None:
-        # No config file found, use defaults
         return ExtractionConfig(**kwargs)

-    # Merge discovered config with tool parameters (tool params take precedence)
     config_dict: dict[str, Any] = {
         "force_ocr": base_config.force_ocr,
         "chunk_content": base_config.chunk_content,
@@ -50,7 +44,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
         "gmft_config": base_config.gmft_config,
     }

-    # Override with provided parameters
     config_dict = config_dict | kwargs

     return ExtractionConfig(**config_dict)
@@ -189,7 +182,7 @@ def get_default_config() -> str:
 @mcp.resource("config://discovered")
 def get_discovered_config() -> str:
     """Get the discovered configuration from config files."""
-    config = try_discover_config()
+    config = discover_config()
     if config is None:
         return "No configuration file found"
     return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
kreuzberg/_mime_types.py
CHANGED
@@ -4,6 +4,7 @@ from mimetypes import guess_type
 from pathlib import Path
 from typing import TYPE_CHECKING, Final

+from kreuzberg._utils._cache import get_mime_cache
 from kreuzberg.exceptions import ValidationError

 if TYPE_CHECKING:  # pragma: no cover
@@ -172,27 +173,10 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
 def validate_mime_type(
     *, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
 ) -> str:
-    """Validate and detect the MIME type for a given file.
-
-    Args:
-        file_path: The path to the file.
-        mime_type: Optional explicit MIME type. If provided, this will be validated.
-            If not provided, the function will attempt to detect the MIME type.
-        check_file_exists: Whether to check if the file exists. Default is True.
-            Set to False in tests where you want to validate a mime type without an actual file.
-
-    Raises:
-        ValidationError: If the MIME type is not supported or cannot be determined.
-
-    Returns:
-        The validated MIME type.
-    """
     if mime_type:
         return _validate_explicit_mime_type(mime_type)

     if file_path:
-        from kreuzberg._utils._cache import get_mime_cache  # noqa: PLC0415
-
         path = Path(file_path)

         try:
@@ -228,7 +212,6 @@ def validate_mime_type(


 def _validate_explicit_mime_type(mime_type: str) -> str:
-    """Validate an explicitly provided MIME type."""
     if mime_type in SUPPORTED_MIME_TYPES:
         return mime_type

@@ -243,7 +226,6 @@ def _validate_explicit_mime_type(mime_type: str) -> str:


 def _detect_mime_type_uncached(file_path: PathLike[str] | str | None = None, check_file_exists: bool = True) -> str:
-    """Detect MIME type without caching (internal function)."""
     if file_path and check_file_exists:
         path = Path(file_path)
         if not path.exists():
kreuzberg/_ocr/_base.py
CHANGED
@@ -16,98 +16,26 @@ T = TypeVar("T")


 class OCRBackend(ABC, Generic[T]):
-    """Abstract base class for Optical Character Recognition (OCR) backend implementations.
-
-    This class provides the blueprint for OCR backend implementations,
-    offering both synchronous and asynchronous methods to process images
-    and files for text extraction.
-    """
-
     @abstractmethod
-    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Asynchronously process an image and extract its text and metadata.
-
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        ...
+    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult: ...

     @abstractmethod
-    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Asynchronously process a file and extract its text and metadata.
-
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        ...
+    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult: ...

     @abstractmethod
-    def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Synchronously process an image and extract its text and metadata.
-
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        ...
+    def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult: ...

     @abstractmethod
-    def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Synchronously process a file and extract its text and metadata.
-
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        ...
+    def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult: ...

     def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
-        """Synchronously process a batch of files and extract their text and metadata.
-
-        Default implementation processes files sequentially. Backends can override
-        for more efficient batch processing.
-
-        Args:
-            paths: List of Path objects representing files to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            List of extraction result objects in the same order as input paths
-        """
         return [self.process_file_sync(path, **kwargs) for path in paths]  # pragma: no cover

     async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
-        """Asynchronously process a batch of files and extract their text and metadata.
-
-        Default implementation processes files concurrently. Backends can override
-        for more efficient batch processing.
-
-        Args:
-            paths: List of Path objects representing files to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            List of extraction result objects in the same order as input paths
-        """
         from kreuzberg._utils._sync import run_taskgroup  # noqa: PLC0415

         tasks = [self.process_file(path, **kwargs) for path in paths]
         return await run_taskgroup(*tasks)  # pragma: no cover

     def __hash__(self) -> int:
-        """Hash function for allowing caching."""
         return hash(type(self).__name__)  # pragma: no cover