kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +0 -124
- kreuzberg/_document_classification.py +20 -39
- kreuzberg/_entity_extraction.py +0 -29
- kreuzberg/_extractors/_base.py +4 -66
- kreuzberg/_extractors/_email.py +0 -4
- kreuzberg/_extractors/_image.py +0 -2
- kreuzberg/_extractors/_pandoc.py +0 -58
- kreuzberg/_extractors/_pdf.py +0 -3
- kreuzberg/_extractors/_presentation.py +0 -82
- kreuzberg/_extractors/_spread_sheet.py +0 -2
- kreuzberg/_gmft.py +0 -61
- kreuzberg/_language_detection.py +0 -14
- kreuzberg/_mime_types.py +0 -17
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +110 -85
- kreuzberg/_ocr/_paddleocr.py +146 -138
- kreuzberg/_ocr/_table_extractor.py +0 -76
- kreuzberg/_ocr/_tesseract.py +0 -206
- kreuzberg/_playa.py +0 -27
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +16 -119
- kreuzberg/_utils/_cache.py +0 -52
- kreuzberg/_utils/_device.py +0 -56
- kreuzberg/_utils/_document_cache.py +0 -73
- kreuzberg/_utils/_errors.py +0 -47
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -14
- kreuzberg/_utils/_process_pool.py +0 -47
- kreuzberg/_utils/_quality.py +0 -17
- kreuzberg/_utils/_ref.py +0 -16
- kreuzberg/_utils/_serialization.py +0 -25
- kreuzberg/_utils/_string.py +0 -20
- kreuzberg/_utils/_sync.py +0 -76
- kreuzberg/_utils/_table.py +0 -45
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +2 -2
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
- kreuzberg-3.13.2.dist-info/RECORD +57 -0
- kreuzberg-3.13.0.dist-info/RECORD +0 -56
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -11,6 +11,16 @@ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
|
11
11
|
from kreuzberg._ocr._base import OCRBackend
|
12
12
|
from kreuzberg._types import ExtractionResult, Metadata, PaddleOCRConfig
|
13
13
|
from kreuzberg._utils._device import DeviceInfo, validate_device_request
|
14
|
+
from kreuzberg._utils._ocr_cache import (
|
15
|
+
build_cache_kwargs,
|
16
|
+
cache_and_complete_async,
|
17
|
+
cache_and_complete_sync,
|
18
|
+
generate_image_hash,
|
19
|
+
get_file_info,
|
20
|
+
handle_cache_lookup_async,
|
21
|
+
handle_cache_lookup_sync,
|
22
|
+
mark_processing_complete,
|
23
|
+
)
|
14
24
|
from kreuzberg._utils._string import normalize_spaces
|
15
25
|
from kreuzberg._utils._sync import run_sync
|
16
26
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
@@ -23,15 +33,21 @@ try: # pragma: no cover
|
|
23
33
|
except ImportError: # pragma: no cover
|
24
34
|
from typing_extensions import Unpack
|
25
35
|
|
26
|
-
|
36
|
+
if TYPE_CHECKING:
|
27
37
|
import numpy as np
|
28
38
|
from paddleocr import PaddleOCR
|
29
39
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
40
|
+
HAS_PADDLEOCR: bool
|
41
|
+
if not TYPE_CHECKING:
|
42
|
+
try:
|
43
|
+
import numpy as np
|
44
|
+
from paddleocr import PaddleOCR
|
45
|
+
|
46
|
+
HAS_PADDLEOCR = True
|
47
|
+
except ImportError:
|
48
|
+
HAS_PADDLEOCR = False
|
49
|
+
np: Any = None
|
50
|
+
PaddleOCR: Any = None
|
35
51
|
|
36
52
|
|
37
53
|
PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
|
@@ -41,61 +57,68 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
41
57
|
_paddle_ocr: ClassVar[Any] = None
|
42
58
|
|
43
59
|
async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
|
44
|
-
""
|
60
|
+
use_cache = kwargs.pop("use_cache", True)
|
45
61
|
|
46
|
-
|
47
|
-
|
48
|
-
|
62
|
+
cache_kwargs = None
|
63
|
+
if use_cache:
|
64
|
+
image_hash = generate_image_hash(image)
|
65
|
+
cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
|
49
66
|
|
50
|
-
|
51
|
-
|
67
|
+
cached_result = await handle_cache_lookup_async(cache_kwargs)
|
68
|
+
if cached_result:
|
69
|
+
return cached_result
|
52
70
|
|
53
|
-
|
54
|
-
|
55
|
-
"""
|
56
|
-
await self._init_paddle_ocr(**kwargs)
|
71
|
+
try:
|
72
|
+
await self._init_paddle_ocr(**kwargs)
|
57
73
|
|
58
|
-
|
59
|
-
|
74
|
+
if image.mode != "RGB":
|
75
|
+
image = image.convert("RGB")
|
60
76
|
|
61
|
-
|
62
|
-
|
63
|
-
result = await run_sync(self._paddle_ocr.ocr, image_np, cls=
|
64
|
-
|
77
|
+
image_np = np.array(image)
|
78
|
+
use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
|
79
|
+
result = await run_sync(self._paddle_ocr.ocr, image_np, cls=use_textline_orientation)
|
80
|
+
|
81
|
+
extraction_result = self._process_paddle_result(result, image)
|
82
|
+
|
83
|
+
if use_cache and cache_kwargs:
|
84
|
+
await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
|
85
|
+
|
86
|
+
return extraction_result
|
65
87
|
except Exception as e:
|
88
|
+
if use_cache and cache_kwargs:
|
89
|
+
mark_processing_complete(cache_kwargs)
|
66
90
|
raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
|
67
91
|
|
68
92
|
async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
|
69
|
-
""
|
93
|
+
use_cache = kwargs.pop("use_cache", True)
|
70
94
|
|
71
|
-
|
72
|
-
|
73
|
-
|
95
|
+
cache_kwargs = None
|
96
|
+
if use_cache:
|
97
|
+
file_info = get_file_info(path)
|
98
|
+
cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
|
74
99
|
|
75
|
-
|
76
|
-
|
100
|
+
cached_result = await handle_cache_lookup_async(cache_kwargs)
|
101
|
+
if cached_result:
|
102
|
+
return cached_result
|
77
103
|
|
78
|
-
Raises:
|
79
|
-
OCRError: If file loading or OCR processing fails.
|
80
|
-
"""
|
81
|
-
await self._init_paddle_ocr(**kwargs)
|
82
104
|
try:
|
105
|
+
await self._init_paddle_ocr(**kwargs)
|
83
106
|
image = await run_sync(Image.open, path)
|
84
|
-
|
107
|
+
|
108
|
+
kwargs["use_cache"] = False
|
109
|
+
extraction_result = await self.process_image(image, **kwargs)
|
110
|
+
|
111
|
+
if use_cache and cache_kwargs:
|
112
|
+
await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
|
113
|
+
|
114
|
+
return extraction_result
|
85
115
|
except Exception as e:
|
116
|
+
if use_cache and cache_kwargs:
|
117
|
+
mark_processing_complete(cache_kwargs)
|
86
118
|
raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
|
87
119
|
|
88
120
|
@staticmethod
|
89
121
|
def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
|
90
|
-
"""Process PaddleOCR result into an ExtractionResult with metadata.
|
91
|
-
|
92
|
-
Args:
|
93
|
-
result: The raw result from PaddleOCR.
|
94
|
-
image: The original PIL image.
|
95
|
-
|
96
|
-
Returns:
|
97
|
-
ExtractionResult: The extraction result containing text content, mime type, and metadata.
|
98
|
-
"""
|
99
122
|
text_content = ""
|
100
123
|
confidence_sum = 0
|
101
124
|
confidence_count = 0
|
@@ -155,11 +178,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
155
178
|
|
156
179
|
@classmethod
|
157
180
|
def _is_mkldnn_supported(cls) -> bool:
|
158
|
-
"""Check if the current architecture supports MKL-DNN optimization.
|
159
|
-
|
160
|
-
Returns:
|
161
|
-
True if MKL-DNN is supported on this architecture.
|
162
|
-
"""
|
163
181
|
system = platform.system().lower()
|
164
182
|
processor = platform.processor().lower()
|
165
183
|
machine = platform.machine().lower()
|
@@ -174,15 +192,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
174
192
|
|
175
193
|
@classmethod
|
176
194
|
async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
|
177
|
-
"""Initialize PaddleOCR with the provided configuration.
|
178
|
-
|
179
|
-
Args:
|
180
|
-
**kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
|
181
|
-
|
182
|
-
Raises:
|
183
|
-
MissingDependencyError: If PaddleOCR is not installed.
|
184
|
-
OCRError: If initialization fails.
|
185
|
-
"""
|
186
195
|
if cls._paddle_ocr is not None:
|
187
196
|
return
|
188
197
|
|
@@ -193,38 +202,34 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
193
202
|
|
194
203
|
language = cls._validate_language_code(kwargs.pop("language", "en"))
|
195
204
|
|
196
|
-
|
197
|
-
use_gpu = device_info.device_type == "cuda"
|
205
|
+
cls._resolve_device_config(**kwargs)
|
198
206
|
|
199
|
-
|
200
|
-
kwargs.setdefault("use_angle_cls", True)
|
201
|
-
kwargs["use_gpu"] = use_gpu and has_gpu_package
|
202
|
-
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
|
203
|
-
kwargs.setdefault("det_db_thresh", 0.3)
|
204
|
-
kwargs.setdefault("det_db_box_thresh", 0.5)
|
205
|
-
kwargs.setdefault("det_db_unclip_ratio", 1.6)
|
207
|
+
bool(find_spec("paddlepaddle_gpu"))
|
206
208
|
|
207
|
-
|
208
|
-
|
209
|
+
use_angle_cls = kwargs.pop("use_angle_cls", True)
|
210
|
+
kwargs.setdefault("use_textline_orientation", use_angle_cls)
|
211
|
+
|
212
|
+
det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
|
213
|
+
det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
|
214
|
+
det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
|
215
|
+
|
216
|
+
kwargs.setdefault("text_det_thresh", det_db_thresh)
|
217
|
+
kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
|
218
|
+
kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
|
219
|
+
|
220
|
+
kwargs.pop("use_gpu", None)
|
221
|
+
kwargs.pop("gpu_mem", None)
|
222
|
+
kwargs.pop("gpu_memory_limit", None)
|
223
|
+
|
224
|
+
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
|
209
225
|
|
210
226
|
try:
|
211
|
-
cls._paddle_ocr = await run_sync(PaddleOCR, lang=language,
|
227
|
+
cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, **kwargs)
|
212
228
|
except Exception as e:
|
213
229
|
raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
|
214
230
|
|
215
231
|
@classmethod
|
216
232
|
def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
|
217
|
-
"""Resolve device configuration with backward compatibility.
|
218
|
-
|
219
|
-
Args:
|
220
|
-
**kwargs: Configuration parameters including device settings.
|
221
|
-
|
222
|
-
Returns:
|
223
|
-
DeviceInfo object for the selected device.
|
224
|
-
|
225
|
-
Raises:
|
226
|
-
ValidationError: If requested device is not available and fallback is disabled.
|
227
|
-
"""
|
228
233
|
use_gpu = kwargs.get("use_gpu", False)
|
229
234
|
device = kwargs.get("device", "auto")
|
230
235
|
memory_limit = kwargs.get("gpu_memory_limit")
|
@@ -269,17 +274,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
269
274
|
|
270
275
|
@staticmethod
|
271
276
|
def _validate_language_code(lang_code: str) -> str:
|
272
|
-
"""Convert a language code to PaddleOCR format.
|
273
|
-
|
274
|
-
Args:
|
275
|
-
lang_code: ISO language code or language name
|
276
|
-
|
277
|
-
Raises:
|
278
|
-
ValidationError: If the language is not supported by PaddleOCR
|
279
|
-
|
280
|
-
Returns:
|
281
|
-
Language code compatible with PaddleOCR
|
282
|
-
"""
|
283
277
|
normalized = lang_code.lower()
|
284
278
|
if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
|
285
279
|
return normalized
|
@@ -293,61 +287,68 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
293
287
|
)
|
294
288
|
|
295
289
|
def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
|
296
|
-
""
|
290
|
+
use_cache = kwargs.pop("use_cache", True)
|
297
291
|
|
298
|
-
|
299
|
-
|
300
|
-
|
292
|
+
cache_kwargs = None
|
293
|
+
if use_cache:
|
294
|
+
image_hash = generate_image_hash(image)
|
295
|
+
cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
|
301
296
|
|
302
|
-
|
303
|
-
|
297
|
+
cached_result = handle_cache_lookup_sync(cache_kwargs)
|
298
|
+
if cached_result:
|
299
|
+
return cached_result
|
304
300
|
|
305
|
-
|
306
|
-
|
307
|
-
"""
|
308
|
-
self._init_paddle_ocr_sync(**kwargs)
|
301
|
+
try:
|
302
|
+
self._init_paddle_ocr_sync(**kwargs)
|
309
303
|
|
310
|
-
|
311
|
-
|
304
|
+
if image.mode != "RGB":
|
305
|
+
image = image.convert("RGB")
|
312
306
|
|
313
|
-
|
314
|
-
|
315
|
-
result = self._paddle_ocr.ocr(image_np, cls=
|
316
|
-
|
307
|
+
image_np = np.array(image)
|
308
|
+
use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
|
309
|
+
result = self._paddle_ocr.ocr(image_np, cls=use_textline_orientation)
|
310
|
+
|
311
|
+
extraction_result = self._process_paddle_result(result, image)
|
312
|
+
|
313
|
+
if use_cache and cache_kwargs:
|
314
|
+
cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
|
315
|
+
|
316
|
+
return extraction_result
|
317
317
|
except Exception as e:
|
318
|
+
if use_cache and cache_kwargs:
|
319
|
+
mark_processing_complete(cache_kwargs)
|
318
320
|
raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
|
319
321
|
|
320
322
|
def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
|
321
|
-
""
|
323
|
+
use_cache = kwargs.pop("use_cache", True)
|
322
324
|
|
323
|
-
|
324
|
-
|
325
|
-
|
325
|
+
cache_kwargs = None
|
326
|
+
if use_cache:
|
327
|
+
file_info = get_file_info(path)
|
328
|
+
cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
|
326
329
|
|
327
|
-
|
328
|
-
|
330
|
+
cached_result = handle_cache_lookup_sync(cache_kwargs)
|
331
|
+
if cached_result:
|
332
|
+
return cached_result
|
329
333
|
|
330
|
-
Raises:
|
331
|
-
OCRError: If file loading or OCR processing fails.
|
332
|
-
"""
|
333
|
-
self._init_paddle_ocr_sync(**kwargs)
|
334
334
|
try:
|
335
|
+
self._init_paddle_ocr_sync(**kwargs)
|
335
336
|
image = Image.open(path)
|
336
|
-
|
337
|
+
|
338
|
+
kwargs["use_cache"] = False
|
339
|
+
extraction_result = self.process_image_sync(image, **kwargs)
|
340
|
+
|
341
|
+
if use_cache and cache_kwargs:
|
342
|
+
cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
|
343
|
+
|
344
|
+
return extraction_result
|
337
345
|
except Exception as e:
|
346
|
+
if use_cache and cache_kwargs:
|
347
|
+
mark_processing_complete(cache_kwargs)
|
338
348
|
raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
|
339
349
|
|
340
350
|
@classmethod
|
341
351
|
def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
|
342
|
-
"""Synchronously initialize PaddleOCR with the provided configuration.
|
343
|
-
|
344
|
-
Args:
|
345
|
-
**kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
|
346
|
-
|
347
|
-
Raises:
|
348
|
-
MissingDependencyError: If PaddleOCR is not installed.
|
349
|
-
OCRError: If initialization fails.
|
350
|
-
"""
|
351
352
|
if cls._paddle_ocr is not None:
|
352
353
|
return
|
353
354
|
|
@@ -358,21 +359,28 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
358
359
|
|
359
360
|
language = cls._validate_language_code(kwargs.pop("language", "en"))
|
360
361
|
|
361
|
-
|
362
|
-
|
362
|
+
cls._resolve_device_config(**kwargs)
|
363
|
+
|
364
|
+
bool(find_spec("paddlepaddle_gpu"))
|
365
|
+
|
366
|
+
use_angle_cls = kwargs.pop("use_angle_cls", True)
|
367
|
+
kwargs.setdefault("use_textline_orientation", use_angle_cls)
|
368
|
+
|
369
|
+
det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
|
370
|
+
det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
|
371
|
+
det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
|
372
|
+
|
373
|
+
kwargs.setdefault("text_det_thresh", det_db_thresh)
|
374
|
+
kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
|
375
|
+
kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
|
363
376
|
|
364
|
-
|
365
|
-
kwargs.
|
366
|
-
kwargs
|
367
|
-
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
|
368
|
-
kwargs.setdefault("det_db_thresh", 0.3)
|
369
|
-
kwargs.setdefault("det_db_box_thresh", 0.5)
|
370
|
-
kwargs.setdefault("det_db_unclip_ratio", 1.6)
|
377
|
+
kwargs.pop("use_gpu", None)
|
378
|
+
kwargs.pop("gpu_mem", None)
|
379
|
+
kwargs.pop("gpu_memory_limit", None)
|
371
380
|
|
372
|
-
|
373
|
-
kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
|
381
|
+
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
|
374
382
|
|
375
383
|
try:
|
376
|
-
cls._paddle_ocr = PaddleOCR(lang=language,
|
384
|
+
cls._paddle_ocr = PaddleOCR(lang=language, **kwargs)
|
377
385
|
except Exception as e:
|
378
386
|
raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
|
kreuzberg/_ocr/_table_extractor.py
CHANGED
@@ -13,18 +13,6 @@ if TYPE_CHECKING:
|
|
13
13
|
|
14
14
|
|
15
15
|
def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWord]:
|
16
|
-
"""Parse TSV output into structured word data.
|
17
|
-
|
18
|
-
Args:
|
19
|
-
tsv_data: Raw TSV output from Tesseract.
|
20
|
-
min_confidence: Minimum confidence score to include a word.
|
21
|
-
|
22
|
-
Returns:
|
23
|
-
List of word dictionaries with position and text data.
|
24
|
-
|
25
|
-
Raises:
|
26
|
-
ParsingError: If TSV data cannot be parsed.
|
27
|
-
"""
|
28
16
|
try:
|
29
17
|
reader = csv.DictReader(StringIO(tsv_data), delimiter="\t")
|
30
18
|
words: list[TSVWord] = []
|
@@ -62,15 +50,6 @@ def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWor
|
|
62
50
|
|
63
51
|
|
64
52
|
def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[int]:
|
65
|
-
"""Detect columns using X position clustering.
|
66
|
-
|
67
|
-
Args:
|
68
|
-
words: List of word dictionaries from TSV.
|
69
|
-
column_threshold: Pixel threshold for column clustering.
|
70
|
-
|
71
|
-
Returns:
|
72
|
-
Sorted list of column X positions.
|
73
|
-
"""
|
74
53
|
if not words:
|
75
54
|
return []
|
76
55
|
|
@@ -94,15 +73,6 @@ def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[
|
|
94
73
|
|
95
74
|
|
96
75
|
def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> list[int]:
|
97
|
-
"""Detect rows using Y position clustering.
|
98
|
-
|
99
|
-
Args:
|
100
|
-
words: List of word dictionaries from TSV.
|
101
|
-
row_threshold_ratio: Row threshold as ratio of mean text height.
|
102
|
-
|
103
|
-
Returns:
|
104
|
-
Sorted list of row Y positions.
|
105
|
-
"""
|
106
76
|
if not words:
|
107
77
|
return []
|
108
78
|
|
@@ -129,15 +99,6 @@ def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> li
|
|
129
99
|
|
130
100
|
|
131
101
|
def _find_closest_index(value: float, positions: list[int]) -> int:
|
132
|
-
"""Find index of closest position.
|
133
|
-
|
134
|
-
Args:
|
135
|
-
value: The value to match.
|
136
|
-
positions: List of positions to search.
|
137
|
-
|
138
|
-
Returns:
|
139
|
-
Index of the closest position.
|
140
|
-
"""
|
141
102
|
if not positions:
|
142
103
|
return 0
|
143
104
|
|
@@ -146,14 +107,6 @@ def _find_closest_index(value: float, positions: list[int]) -> int:
|
|
146
107
|
|
147
108
|
|
148
109
|
def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
|
149
|
-
"""Remove completely empty rows and columns.
|
150
|
-
|
151
|
-
Args:
|
152
|
-
table: 2D table array.
|
153
|
-
|
154
|
-
Returns:
|
155
|
-
Cleaned table with empty rows/columns removed.
|
156
|
-
"""
|
157
110
|
if not table:
|
158
111
|
return table
|
159
112
|
|
@@ -175,16 +128,6 @@ def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
|
|
175
128
|
def reconstruct_table(
|
176
129
|
words: list[TSVWord], *, column_threshold: int = 20, row_threshold_ratio: float = 0.5
|
177
130
|
) -> list[list[str]]:
|
178
|
-
"""Reconstruct table from words and detected structure.
|
179
|
-
|
180
|
-
Args:
|
181
|
-
words: List of word dictionaries from TSV.
|
182
|
-
column_threshold: Pixel threshold for column clustering.
|
183
|
-
row_threshold_ratio: Row threshold as ratio of mean text height.
|
184
|
-
|
185
|
-
Returns:
|
186
|
-
2D list representing the table structure.
|
187
|
-
"""
|
188
131
|
if not words:
|
189
132
|
return []
|
190
133
|
|
@@ -211,14 +154,6 @@ def reconstruct_table(
|
|
211
154
|
|
212
155
|
|
213
156
|
def to_markdown(table: list[list[str]]) -> str:
|
214
|
-
"""Convert table to markdown format.
|
215
|
-
|
216
|
-
Args:
|
217
|
-
table: 2D list representing the table.
|
218
|
-
|
219
|
-
Returns:
|
220
|
-
Markdown-formatted table string.
|
221
|
-
"""
|
222
157
|
if not table or not table[0]:
|
223
158
|
return ""
|
224
159
|
|
@@ -238,17 +173,6 @@ def to_markdown(table: list[list[str]]) -> str:
|
|
238
173
|
def extract_table_from_tsv(
|
239
174
|
tsv_data: str, *, column_threshold: int = 20, row_threshold_ratio: float = 0.5, min_confidence: float = 30.0
|
240
175
|
) -> str:
|
241
|
-
"""Extract table from TSV data and convert to markdown.
|
242
|
-
|
243
|
-
Args:
|
244
|
-
tsv_data: Raw TSV output from Tesseract.
|
245
|
-
column_threshold: Pixel threshold for column clustering.
|
246
|
-
row_threshold_ratio: Row threshold as ratio of mean text height.
|
247
|
-
min_confidence: Minimum confidence score to include a word.
|
248
|
-
|
249
|
-
Returns:
|
250
|
-
Markdown-formatted table string, or empty string if no table detected.
|
251
|
-
"""
|
252
176
|
words = extract_words(tsv_data, min_confidence=min_confidence)
|
253
177
|
if not words:
|
254
178
|
return ""
|