kreuzberg 3.13.0__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. kreuzberg/_chunker.py +0 -15
  2. kreuzberg/_config.py +0 -124
  3. kreuzberg/_document_classification.py +20 -39
  4. kreuzberg/_entity_extraction.py +0 -29
  5. kreuzberg/_extractors/_base.py +4 -66
  6. kreuzberg/_extractors/_email.py +0 -4
  7. kreuzberg/_extractors/_image.py +0 -2
  8. kreuzberg/_extractors/_pandoc.py +0 -58
  9. kreuzberg/_extractors/_pdf.py +0 -3
  10. kreuzberg/_extractors/_presentation.py +0 -82
  11. kreuzberg/_extractors/_spread_sheet.py +0 -2
  12. kreuzberg/_gmft.py +0 -61
  13. kreuzberg/_language_detection.py +0 -14
  14. kreuzberg/_mime_types.py +0 -17
  15. kreuzberg/_ocr/_base.py +4 -76
  16. kreuzberg/_ocr/_easyocr.py +110 -85
  17. kreuzberg/_ocr/_paddleocr.py +146 -138
  18. kreuzberg/_ocr/_table_extractor.py +0 -76
  19. kreuzberg/_ocr/_tesseract.py +0 -206
  20. kreuzberg/_playa.py +0 -27
  21. kreuzberg/_registry.py +0 -36
  22. kreuzberg/_types.py +16 -119
  23. kreuzberg/_utils/_cache.py +0 -52
  24. kreuzberg/_utils/_device.py +0 -56
  25. kreuzberg/_utils/_document_cache.py +0 -73
  26. kreuzberg/_utils/_errors.py +0 -47
  27. kreuzberg/_utils/_ocr_cache.py +136 -0
  28. kreuzberg/_utils/_pdf_lock.py +0 -14
  29. kreuzberg/_utils/_process_pool.py +0 -47
  30. kreuzberg/_utils/_quality.py +0 -17
  31. kreuzberg/_utils/_ref.py +0 -16
  32. kreuzberg/_utils/_serialization.py +0 -25
  33. kreuzberg/_utils/_string.py +0 -20
  34. kreuzberg/_utils/_sync.py +0 -76
  35. kreuzberg/_utils/_table.py +0 -45
  36. kreuzberg/_utils/_tmp.py +0 -9
  37. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +3 -2
  38. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  39. kreuzberg-3.13.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
@@ -11,6 +11,16 @@ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
11
11
  from kreuzberg._ocr._base import OCRBackend
12
12
  from kreuzberg._types import ExtractionResult, Metadata, PaddleOCRConfig
13
13
  from kreuzberg._utils._device import DeviceInfo, validate_device_request
14
+ from kreuzberg._utils._ocr_cache import (
15
+ build_cache_kwargs,
16
+ cache_and_complete_async,
17
+ cache_and_complete_sync,
18
+ generate_image_hash,
19
+ get_file_info,
20
+ handle_cache_lookup_async,
21
+ handle_cache_lookup_sync,
22
+ mark_processing_complete,
23
+ )
14
24
  from kreuzberg._utils._string import normalize_spaces
15
25
  from kreuzberg._utils._sync import run_sync
16
26
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -23,15 +33,21 @@ try: # pragma: no cover
23
33
  except ImportError: # pragma: no cover
24
34
  from typing_extensions import Unpack
25
35
 
26
- try:
36
+ if TYPE_CHECKING:
27
37
  import numpy as np
28
38
  from paddleocr import PaddleOCR
29
39
 
30
- HAS_PADDLEOCR = True
31
- except ImportError:
32
- HAS_PADDLEOCR = False
33
- np = None # type: ignore[assignment]
34
- PaddleOCR = None
40
+ HAS_PADDLEOCR: bool
41
+ if not TYPE_CHECKING:
42
+ try:
43
+ import numpy as np
44
+ from paddleocr import PaddleOCR
45
+
46
+ HAS_PADDLEOCR = True
47
+ except ImportError:
48
+ HAS_PADDLEOCR = False
49
+ np: Any = None
50
+ PaddleOCR: Any = None
35
51
 
36
52
 
37
53
  PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
@@ -41,61 +57,68 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
41
57
  _paddle_ocr: ClassVar[Any] = None
42
58
 
43
59
  async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
44
- """Asynchronously process an image and extract its text and metadata using PaddleOCR.
60
+ use_cache = kwargs.pop("use_cache", True)
45
61
 
46
- Args:
47
- image: An instance of PIL.Image representing the input image.
48
- **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
62
+ cache_kwargs = None
63
+ if use_cache:
64
+ image_hash = generate_image_hash(image)
65
+ cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
49
66
 
50
- Returns:
51
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
67
+ cached_result = await handle_cache_lookup_async(cache_kwargs)
68
+ if cached_result:
69
+ return cached_result
52
70
 
53
- Raises:
54
- OCRError: If OCR processing fails.
55
- """
56
- await self._init_paddle_ocr(**kwargs)
71
+ try:
72
+ await self._init_paddle_ocr(**kwargs)
57
73
 
58
- if image.mode != "RGB":
59
- image = image.convert("RGB")
74
+ if image.mode != "RGB":
75
+ image = image.convert("RGB")
60
76
 
61
- image_np = np.array(image)
62
- try:
63
- result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
64
- return self._process_paddle_result(result, image)
77
+ image_np = np.array(image)
78
+ use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
79
+ result = await run_sync(self._paddle_ocr.ocr, image_np, cls=use_textline_orientation)
80
+
81
+ extraction_result = self._process_paddle_result(result, image)
82
+
83
+ if use_cache and cache_kwargs:
84
+ await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
85
+
86
+ return extraction_result
65
87
  except Exception as e:
88
+ if use_cache and cache_kwargs:
89
+ mark_processing_complete(cache_kwargs)
66
90
  raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
67
91
 
68
92
  async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
69
- """Asynchronously process a file and extract its text and metadata using PaddleOCR.
93
+ use_cache = kwargs.pop("use_cache", True)
70
94
 
71
- Args:
72
- path: A Path object representing the file to be processed.
73
- **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
95
+ cache_kwargs = None
96
+ if use_cache:
97
+ file_info = get_file_info(path)
98
+ cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
74
99
 
75
- Returns:
76
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
100
+ cached_result = await handle_cache_lookup_async(cache_kwargs)
101
+ if cached_result:
102
+ return cached_result
77
103
 
78
- Raises:
79
- OCRError: If file loading or OCR processing fails.
80
- """
81
- await self._init_paddle_ocr(**kwargs)
82
104
  try:
105
+ await self._init_paddle_ocr(**kwargs)
83
106
  image = await run_sync(Image.open, path)
84
- return await self.process_image(image, **kwargs)
107
+
108
+ kwargs["use_cache"] = False
109
+ extraction_result = await self.process_image(image, **kwargs)
110
+
111
+ if use_cache and cache_kwargs:
112
+ await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
113
+
114
+ return extraction_result
85
115
  except Exception as e:
116
+ if use_cache and cache_kwargs:
117
+ mark_processing_complete(cache_kwargs)
86
118
  raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
87
119
 
88
120
  @staticmethod
89
121
  def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
90
- """Process PaddleOCR result into an ExtractionResult with metadata.
91
-
92
- Args:
93
- result: The raw result from PaddleOCR.
94
- image: The original PIL image.
95
-
96
- Returns:
97
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
98
- """
99
122
  text_content = ""
100
123
  confidence_sum = 0
101
124
  confidence_count = 0
@@ -155,11 +178,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
155
178
 
156
179
  @classmethod
157
180
  def _is_mkldnn_supported(cls) -> bool:
158
- """Check if the current architecture supports MKL-DNN optimization.
159
-
160
- Returns:
161
- True if MKL-DNN is supported on this architecture.
162
- """
163
181
  system = platform.system().lower()
164
182
  processor = platform.processor().lower()
165
183
  machine = platform.machine().lower()
@@ -174,15 +192,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
174
192
 
175
193
  @classmethod
176
194
  async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
177
- """Initialize PaddleOCR with the provided configuration.
178
-
179
- Args:
180
- **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
181
-
182
- Raises:
183
- MissingDependencyError: If PaddleOCR is not installed.
184
- OCRError: If initialization fails.
185
- """
186
195
  if cls._paddle_ocr is not None:
187
196
  return
188
197
 
@@ -193,38 +202,34 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
193
202
 
194
203
  language = cls._validate_language_code(kwargs.pop("language", "en"))
195
204
 
196
- device_info = cls._resolve_device_config(**kwargs)
197
- use_gpu = device_info.device_type == "cuda"
205
+ cls._resolve_device_config(**kwargs)
198
206
 
199
- has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
200
- kwargs.setdefault("use_angle_cls", True)
201
- kwargs["use_gpu"] = use_gpu and has_gpu_package
202
- kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
203
- kwargs.setdefault("det_db_thresh", 0.3)
204
- kwargs.setdefault("det_db_box_thresh", 0.5)
205
- kwargs.setdefault("det_db_unclip_ratio", 1.6)
207
+ bool(find_spec("paddlepaddle_gpu"))
206
208
 
207
- if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
208
- kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
209
+ use_angle_cls = kwargs.pop("use_angle_cls", True)
210
+ kwargs.setdefault("use_textline_orientation", use_angle_cls)
211
+
212
+ det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
213
+ det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
214
+ det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
215
+
216
+ kwargs.setdefault("text_det_thresh", det_db_thresh)
217
+ kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
218
+ kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
219
+
220
+ kwargs.pop("use_gpu", None)
221
+ kwargs.pop("gpu_mem", None)
222
+ kwargs.pop("gpu_memory_limit", None)
223
+
224
+ kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
209
225
 
210
226
  try:
211
- cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
227
+ cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, **kwargs)
212
228
  except Exception as e:
213
229
  raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
214
230
 
215
231
  @classmethod
216
232
  def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
217
- """Resolve device configuration with backward compatibility.
218
-
219
- Args:
220
- **kwargs: Configuration parameters including device settings.
221
-
222
- Returns:
223
- DeviceInfo object for the selected device.
224
-
225
- Raises:
226
- ValidationError: If requested device is not available and fallback is disabled.
227
- """
228
233
  use_gpu = kwargs.get("use_gpu", False)
229
234
  device = kwargs.get("device", "auto")
230
235
  memory_limit = kwargs.get("gpu_memory_limit")
@@ -269,17 +274,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
269
274
 
270
275
  @staticmethod
271
276
  def _validate_language_code(lang_code: str) -> str:
272
- """Convert a language code to PaddleOCR format.
273
-
274
- Args:
275
- lang_code: ISO language code or language name
276
-
277
- Raises:
278
- ValidationError: If the language is not supported by PaddleOCR
279
-
280
- Returns:
281
- Language code compatible with PaddleOCR
282
- """
283
277
  normalized = lang_code.lower()
284
278
  if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
285
279
  return normalized
@@ -293,61 +287,68 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
293
287
  )
294
288
 
295
289
  def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
296
- """Synchronously process an image and extract its text and metadata using PaddleOCR.
290
+ use_cache = kwargs.pop("use_cache", True)
297
291
 
298
- Args:
299
- image: An instance of PIL.Image representing the input image.
300
- **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
292
+ cache_kwargs = None
293
+ if use_cache:
294
+ image_hash = generate_image_hash(image)
295
+ cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
301
296
 
302
- Returns:
303
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
297
+ cached_result = handle_cache_lookup_sync(cache_kwargs)
298
+ if cached_result:
299
+ return cached_result
304
300
 
305
- Raises:
306
- OCRError: If OCR processing fails.
307
- """
308
- self._init_paddle_ocr_sync(**kwargs)
301
+ try:
302
+ self._init_paddle_ocr_sync(**kwargs)
309
303
 
310
- if image.mode != "RGB":
311
- image = image.convert("RGB")
304
+ if image.mode != "RGB":
305
+ image = image.convert("RGB")
312
306
 
313
- image_np = np.array(image)
314
- try:
315
- result = self._paddle_ocr.ocr(image_np, cls=kwargs.get("use_angle_cls", True))
316
- return self._process_paddle_result(result, image)
307
+ image_np = np.array(image)
308
+ use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
309
+ result = self._paddle_ocr.ocr(image_np, cls=use_textline_orientation)
310
+
311
+ extraction_result = self._process_paddle_result(result, image)
312
+
313
+ if use_cache and cache_kwargs:
314
+ cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
315
+
316
+ return extraction_result
317
317
  except Exception as e:
318
+ if use_cache and cache_kwargs:
319
+ mark_processing_complete(cache_kwargs)
318
320
  raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
319
321
 
320
322
  def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
321
- """Synchronously process a file and extract its text and metadata using PaddleOCR.
323
+ use_cache = kwargs.pop("use_cache", True)
322
324
 
323
- Args:
324
- path: A Path object representing the file to be processed.
325
- **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
325
+ cache_kwargs = None
326
+ if use_cache:
327
+ file_info = get_file_info(path)
328
+ cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
326
329
 
327
- Returns:
328
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
330
+ cached_result = handle_cache_lookup_sync(cache_kwargs)
331
+ if cached_result:
332
+ return cached_result
329
333
 
330
- Raises:
331
- OCRError: If file loading or OCR processing fails.
332
- """
333
- self._init_paddle_ocr_sync(**kwargs)
334
334
  try:
335
+ self._init_paddle_ocr_sync(**kwargs)
335
336
  image = Image.open(path)
336
- return self.process_image_sync(image, **kwargs)
337
+
338
+ kwargs["use_cache"] = False
339
+ extraction_result = self.process_image_sync(image, **kwargs)
340
+
341
+ if use_cache and cache_kwargs:
342
+ cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
343
+
344
+ return extraction_result
337
345
  except Exception as e:
346
+ if use_cache and cache_kwargs:
347
+ mark_processing_complete(cache_kwargs)
338
348
  raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
339
349
 
340
350
  @classmethod
341
351
  def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
342
- """Synchronously initialize PaddleOCR with the provided configuration.
343
-
344
- Args:
345
- **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
346
-
347
- Raises:
348
- MissingDependencyError: If PaddleOCR is not installed.
349
- OCRError: If initialization fails.
350
- """
351
352
  if cls._paddle_ocr is not None:
352
353
  return
353
354
 
@@ -358,21 +359,28 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
358
359
 
359
360
  language = cls._validate_language_code(kwargs.pop("language", "en"))
360
361
 
361
- device_info = cls._resolve_device_config(**kwargs)
362
- use_gpu = device_info.device_type == "cuda"
362
+ cls._resolve_device_config(**kwargs)
363
+
364
+ bool(find_spec("paddlepaddle_gpu"))
365
+
366
+ use_angle_cls = kwargs.pop("use_angle_cls", True)
367
+ kwargs.setdefault("use_textline_orientation", use_angle_cls)
368
+
369
+ det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
370
+ det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
371
+ det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
372
+
373
+ kwargs.setdefault("text_det_thresh", det_db_thresh)
374
+ kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
375
+ kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
363
376
 
364
- has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
365
- kwargs.setdefault("use_angle_cls", True)
366
- kwargs["use_gpu"] = use_gpu and has_gpu_package
367
- kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
368
- kwargs.setdefault("det_db_thresh", 0.3)
369
- kwargs.setdefault("det_db_box_thresh", 0.5)
370
- kwargs.setdefault("det_db_unclip_ratio", 1.6)
377
+ kwargs.pop("use_gpu", None)
378
+ kwargs.pop("gpu_mem", None)
379
+ kwargs.pop("gpu_memory_limit", None)
371
380
 
372
- if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
373
- kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
381
+ kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
374
382
 
375
383
  try:
376
- cls._paddle_ocr = PaddleOCR(lang=language, show_log=False, **kwargs)
384
+ cls._paddle_ocr = PaddleOCR(lang=language, **kwargs)
377
385
  except Exception as e:
378
386
  raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
@@ -13,18 +13,6 @@ if TYPE_CHECKING:
13
13
 
14
14
 
15
15
  def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWord]:
16
- """Parse TSV output into structured word data.
17
-
18
- Args:
19
- tsv_data: Raw TSV output from Tesseract.
20
- min_confidence: Minimum confidence score to include a word.
21
-
22
- Returns:
23
- List of word dictionaries with position and text data.
24
-
25
- Raises:
26
- ParsingError: If TSV data cannot be parsed.
27
- """
28
16
  try:
29
17
  reader = csv.DictReader(StringIO(tsv_data), delimiter="\t")
30
18
  words: list[TSVWord] = []
@@ -62,15 +50,6 @@ def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWor
62
50
 
63
51
 
64
52
  def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[int]:
65
- """Detect columns using X position clustering.
66
-
67
- Args:
68
- words: List of word dictionaries from TSV.
69
- column_threshold: Pixel threshold for column clustering.
70
-
71
- Returns:
72
- Sorted list of column X positions.
73
- """
74
53
  if not words:
75
54
  return []
76
55
 
@@ -94,15 +73,6 @@ def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[
94
73
 
95
74
 
96
75
  def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> list[int]:
97
- """Detect rows using Y position clustering.
98
-
99
- Args:
100
- words: List of word dictionaries from TSV.
101
- row_threshold_ratio: Row threshold as ratio of mean text height.
102
-
103
- Returns:
104
- Sorted list of row Y positions.
105
- """
106
76
  if not words:
107
77
  return []
108
78
 
@@ -129,15 +99,6 @@ def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> li
129
99
 
130
100
 
131
101
  def _find_closest_index(value: float, positions: list[int]) -> int:
132
- """Find index of closest position.
133
-
134
- Args:
135
- value: The value to match.
136
- positions: List of positions to search.
137
-
138
- Returns:
139
- Index of the closest position.
140
- """
141
102
  if not positions:
142
103
  return 0
143
104
 
@@ -146,14 +107,6 @@ def _find_closest_index(value: float, positions: list[int]) -> int:
146
107
 
147
108
 
148
109
  def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
149
- """Remove completely empty rows and columns.
150
-
151
- Args:
152
- table: 2D table array.
153
-
154
- Returns:
155
- Cleaned table with empty rows/columns removed.
156
- """
157
110
  if not table:
158
111
  return table
159
112
 
@@ -175,16 +128,6 @@ def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
175
128
  def reconstruct_table(
176
129
  words: list[TSVWord], *, column_threshold: int = 20, row_threshold_ratio: float = 0.5
177
130
  ) -> list[list[str]]:
178
- """Reconstruct table from words and detected structure.
179
-
180
- Args:
181
- words: List of word dictionaries from TSV.
182
- column_threshold: Pixel threshold for column clustering.
183
- row_threshold_ratio: Row threshold as ratio of mean text height.
184
-
185
- Returns:
186
- 2D list representing the table structure.
187
- """
188
131
  if not words:
189
132
  return []
190
133
 
@@ -211,14 +154,6 @@ def reconstruct_table(
211
154
 
212
155
 
213
156
  def to_markdown(table: list[list[str]]) -> str:
214
- """Convert table to markdown format.
215
-
216
- Args:
217
- table: 2D list representing the table.
218
-
219
- Returns:
220
- Markdown-formatted table string.
221
- """
222
157
  if not table or not table[0]:
223
158
  return ""
224
159
 
@@ -238,17 +173,6 @@ def to_markdown(table: list[list[str]]) -> str:
238
173
  def extract_table_from_tsv(
239
174
  tsv_data: str, *, column_threshold: int = 20, row_threshold_ratio: float = 0.5, min_confidence: float = 30.0
240
175
  ) -> str:
241
- """Extract table from TSV data and convert to markdown.
242
-
243
- Args:
244
- tsv_data: Raw TSV output from Tesseract.
245
- column_threshold: Pixel threshold for column clustering.
246
- row_threshold_ratio: Row threshold as ratio of mean text height.
247
- min_confidence: Minimum confidence score to include a word.
248
-
249
- Returns:
250
- Markdown-formatted table string, or empty string if no table detected.
251
- """
252
176
  words = extract_words(tsv_data, min_confidence=min_confidence)
253
177
  if not words:
254
178
  return ""