kreuzberg-3.3.0-py3-none-any.whl → kreuzberg-3.8.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
--- kreuzberg/_multiprocessing/gmft_isolated.py
+++ /dev/null
@@ -1,332 +0,0 @@
-"""Isolated GMFT table extraction to handle segmentation faults."""
-
-from __future__ import annotations
-
-import multiprocessing as mp
-import pickle
-import queue
-import signal
-import traceback
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
-    from os import PathLike
-
-    from kreuzberg._gmft import GMFTConfig
-    from kreuzberg._types import TableData
-
-
-def _extract_tables_in_process(
-    file_path: str | PathLike[str],
-    config_dict: dict[str, Any],
-    result_queue: queue.Queue[tuple[bool, Any]],
-) -> None:
-    """Extract tables in an isolated process to handle potential segfaults.
-
-    Args:
-        file_path: Path to the PDF file
-        config_dict: Serialized GMFTConfig as a dict
-        result_queue: Queue to put results or errors
-    """
-    signal.signal(signal.SIGINT, signal.SIG_IGN)
-
-    try:
-        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-        from gmft.formatters.tatr import TATRFormatConfig
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document
-
-        from kreuzberg._gmft import GMFTConfig
-
-        config = GMFTConfig(**config_dict)
-
-        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
-            config=TATRFormatConfig(
-                verbosity=config.verbosity,
-                formatter_base_threshold=config.formatter_base_threshold,
-                cell_required_confidence=config.cell_required_confidence,
-                remove_null_rows=config.remove_null_rows,
-                enable_multi_header=config.enable_multi_header,
-                semantic_spanning_cells=config.semantic_spanning_cells,
-                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
-                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
-                large_table_threshold=config.large_table_threshold,
-                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
-                large_table_maximum_rows=config.large_table_maximum_rows,
-                force_large_table_assumption=config.force_large_table_assumption,
-            )
-        )
-        detector = AutoTableDetector(  # type: ignore[no-untyped-call]
-            config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
-        )
-
-        doc = PyPDFium2Document(str(file_path))
-        cropped_tables = []
-        dataframes = []
-
-        try:
-            for page in doc:
-                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
-
-            for cropped_table in cropped_tables:
-                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
-                dataframes.append(formatted_table.df())
-
-            results = []
-            for data_frame, cropped_table in zip(dataframes, cropped_tables):
-                import io
-
-                img_bytes = io.BytesIO()
-                cropped_image = cropped_table.image()
-                cropped_image.save(img_bytes, format="PNG")
-                img_bytes.seek(0)
-
-                results.append(
-                    {
-                        "cropped_image_bytes": img_bytes.getvalue(),
-                        "page_number": cropped_table.page.page_number,
-                        "text": data_frame.to_markdown(),
-                        "df_pickle": pickle.dumps(data_frame),
-                    }
-                )
-
-            result_queue.put((True, results))
-
-        finally:
-            doc.close()  # type: ignore[no-untyped-call]
-
-    except Exception as e:  # noqa: BLE001
-        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
-        result_queue.put((False, error_info))
-
-
-def extract_tables_isolated(
-    file_path: str | PathLike[str],
-    config: GMFTConfig | None = None,
-    timeout: float = 300.0,
-) -> list[TableData]:
-    """Extract tables using an isolated process to handle segfaults.
-
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-
-    Returns:
-        List of extracted tables
-
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
-    from kreuzberg._gmft import GMFTConfig
-    from kreuzberg._types import TableData
-    from kreuzberg.exceptions import ParsingError
-
-    config = config or GMFTConfig()
-    config_dict = config.__dict__.copy()
-
-    ctx = mp.get_context("spawn")
-    result_queue = ctx.Queue()
-
-    process = ctx.Process(
-        target=_extract_tables_in_process,
-        args=(str(file_path), config_dict, result_queue),
-    )
-
-    process.start()
-
-    try:
-        # Wait for result with timeout, checking for process death  # ~keep
-        import time
-
-        start_time = time.time()
-        while True:
-            try:
-                success, result = result_queue.get_nowait()
-                break
-            except queue.Empty:
-                if time.time() - start_time > timeout:
-                    raise
-
-                if not process.is_alive():
-                    # Process died without putting result  # ~keep
-                    if process.exitcode == -signal.SIGSEGV:
-                        raise ParsingError(
-                            "GMFT process crashed with segmentation fault",
-                            context={
-                                "file_path": str(file_path),
-                                "exit_code": process.exitcode,
-                            },
-                        ) from None
-                    raise ParsingError(
-                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
-                        context={
-                            "file_path": str(file_path),
-                            "exit_code": process.exitcode,
-                        },
-                    ) from None
-
-                time.sleep(0.1)
-
-        if success:
-            tables = []
-            for table_dict in result:
-                import io
-                import pickle
-
-                from PIL import Image
-
-                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-                df = pickle.loads(table_dict["df_pickle"])  # noqa: S301
-
-                tables.append(
-                    TableData(
-                        cropped_image=img,
-                        page_number=table_dict["page_number"],
-                        text=table_dict["text"],
-                        df=df,
-                    )
-                )
-
-            return tables
-
-        error_info = result
-        raise ParsingError(
-            f"GMFT table extraction failed: {error_info['error']}",
-            context={
-                "file_path": str(file_path),
-                "error_type": error_info["type"],
-                "traceback": error_info["traceback"],
-            },
-        )
-
-    except queue.Empty as e:
-        raise ParsingError(
-            "GMFT table extraction timed out",
-            context={
-                "file_path": str(file_path),
-                "timeout": timeout,
-            },
-        ) from e
-    finally:
-        if process.is_alive():
-            process.terminate()
-            process.join(timeout=5)
-            if process.is_alive():
-                process.kill()
-                process.join()
-
-
-async def extract_tables_isolated_async(
-    file_path: str | PathLike[str],
-    config: GMFTConfig | None = None,
-    timeout: float = 300.0,
-) -> list[TableData]:
-    """Async version of extract_tables_isolated using asyncio.
-
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-
-    Returns:
-        List of extracted tables
-
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
-    import anyio
-
-    from kreuzberg._gmft import GMFTConfig
-    from kreuzberg._types import TableData
-    from kreuzberg.exceptions import ParsingError
-
-    config = config or GMFTConfig()
-    config_dict = config.__dict__.copy()
-
-    ctx = mp.get_context("spawn")
-    result_queue = ctx.Queue()
-
-    process = ctx.Process(
-        target=_extract_tables_in_process,
-        args=(str(file_path), config_dict, result_queue),
-    )
-
-    process.start()
-
-    try:
-
-        async def wait_for_result() -> tuple[bool, Any]:
-            while True:
-                try:
-                    return result_queue.get_nowait()  # type: ignore[no-any-return]
-                except queue.Empty:  # noqa: PERF203
-                    await anyio.sleep(0.1)
-                    if not process.is_alive():
-                        # Process died without putting result  # ~keep
-                        if process.exitcode == -signal.SIGSEGV:
-                            raise ParsingError(
-                                "GMFT process crashed with segmentation fault",
-                                context={
-                                    "file_path": str(file_path),
-                                    "exit_code": process.exitcode,
-                                },
-                            ) from None
-                        raise ParsingError(
-                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
-                            context={
-                                "file_path": str(file_path),
-                                "exit_code": process.exitcode,
-                            },
-                        ) from None
-
-        with anyio.fail_after(timeout):
-            success, result = await wait_for_result()
-
-        if success:
-            tables = []
-            for table_dict in result:
-                import io
-                import pickle
-
-                from PIL import Image
-
-                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-                df = pickle.loads(table_dict["df_pickle"])  # noqa: S301
-
-                tables.append(
-                    TableData(
-                        cropped_image=img,
-                        page_number=table_dict["page_number"],
-                        text=table_dict["text"],
-                        df=df,
-                    )
-                )
-
-            return tables
-
-        error_info = result
-        raise ParsingError(
-            f"GMFT table extraction failed: {error_info['error']}",
-            context={
-                "file_path": str(file_path),
-                "error_type": error_info["type"],
-                "traceback": error_info["traceback"],
-            },
-        )
-
-    except TimeoutError as e:
-        raise ParsingError(
-            "GMFT table extraction timed out",
-            context={
-                "file_path": str(file_path),
-                "timeout": timeout,
-            },
-        ) from e
-    finally:
-        if process.is_alive():
-            process.terminate()
-            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
-            if process.is_alive():
-                process.kill()
-                await anyio.to_thread.run_sync(process.join)
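
Note on this removal: the module above ran GMFT's native table-detection stack in a spawned child process and polled a result queue under a deadline, so that a segmentation fault in native code could only kill the child, never the caller. Judging by the file list, that responsibility appears to have been folded into the much-expanded kreuzberg/_gmft.py (+318 lines). Below is a minimal, self-contained sketch of the isolation pattern itself; the names (_risky_work, run_isolated) are illustrative, not kreuzberg API, and it simplifies the original by blocking on the queue instead of additionally polling process.is_alive() to detect a crash before the deadline.

import multiprocessing as mp
import queue
from typing import Any


def _risky_work(path: str, result_queue: Any) -> None:
    # Runs in the child process; a segfault here cannot crash the parent.
    result_queue.put((True, f"processed {path}"))


def run_isolated(path: str, timeout: float = 30.0) -> Any:
    ctx = mp.get_context("spawn")  # fresh interpreter, no inherited native state
    result_queue = ctx.Queue()
    process = ctx.Process(target=_risky_work, args=(path, result_queue))
    process.start()
    try:
        success, payload = result_queue.get(timeout=timeout)
    except queue.Empty:
        # Covers both a hung child and one that died without reporting.
        raise TimeoutError(f"no result for {path!r} within {timeout}s") from None
    finally:
        if process.is_alive():
            process.terminate()
        process.join()
    if not success:
        raise RuntimeError(str(payload))
    return payload


if __name__ == "__main__":
    print(run_isolated("example.pdf"))
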
--- kreuzberg/_multiprocessing/process_manager.py
+++ /dev/null
@@ -1,188 +0,0 @@
-"""Process pool manager for resource-aware multiprocessing."""
-
-from __future__ import annotations
-
-import multiprocessing as mp
-from concurrent.futures import ProcessPoolExecutor
-from typing import TYPE_CHECKING, Any, Callable, TypeVar
-
-import anyio
-import psutil
-from typing_extensions import Self
-
-if TYPE_CHECKING:
-    import types
-
-T = TypeVar("T")
-
-
-class ProcessPoolManager:
-    """Resource-aware process pool manager for CPU-intensive tasks."""
-
-    def __init__(
-        self,
-        max_processes: int | None = None,
-        memory_limit_gb: float | None = None,
-    ) -> None:
-        """Initialize the process pool manager.
-
-        Args:
-            max_processes: Maximum number of processes. Defaults to CPU count.
-            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
-        """
-        self.max_processes = max_processes or mp.cpu_count()
-
-        if memory_limit_gb is None:
-            available_memory = psutil.virtual_memory().available
-            self.memory_limit_bytes = int(available_memory * 0.75)  # Use 75% of available  # ~keep
-        else:
-            self.memory_limit_bytes = int(memory_limit_gb * 1024**3)
-
-        self._executor: ProcessPoolExecutor | None = None
-        self._active_tasks = 0
-
-    def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
-        """Calculate optimal number of workers based on memory constraints.
-
-        Args:
-            task_memory_mb: Estimated memory usage per task in MB.
-
-        Returns:
-            Optimal number of workers.
-        """
-        task_memory_bytes = task_memory_mb * 1024**2
-        memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
-
-        return min(self.max_processes, memory_based_limit)
-
-    def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
-        """Ensure process pool executor is initialized."""
-        if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
-            if self._executor is not None:
-                self._executor.shutdown(wait=False)
-
-            workers = max_workers or self.max_processes
-            self._executor = ProcessPoolExecutor(max_workers=workers)
-
-        return self._executor
-
-    async def submit_task(
-        self,
-        func: Callable[..., T],
-        *args: Any,
-        task_memory_mb: float = 100,
-    ) -> T:
-        """Submit a task to the process pool.
-
-        Args:
-            func: Function to execute.
-            *args: Positional arguments for the function.
-            task_memory_mb: Estimated memory usage in MB.
-
-        Returns:
-            Result of the function execution.
-        """
-        workers = self.get_optimal_workers(task_memory_mb)
-        self._ensure_executor(workers)
-
-        self._active_tasks += 1
-
-        try:
-            return await anyio.to_thread.run_sync(func, *args)
-        finally:
-            self._active_tasks -= 1
-
-    async def submit_batch(
-        self,
-        func: Callable[..., T],
-        arg_batches: list[tuple[Any, ...]],
-        task_memory_mb: float = 100,
-        max_concurrent: int | None = None,
-    ) -> list[T]:
-        """Submit a batch of tasks to the process pool.
-
-        Args:
-            func: Function to execute.
-            arg_batches: List of argument tuples for each task.
-            task_memory_mb: Estimated memory usage per task in MB.
-            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
-
-        Returns:
-            List of results in the same order as input.
-        """
-        if not arg_batches:
-            return []
-
-        workers = self.get_optimal_workers(task_memory_mb)
-        max_concurrent = max_concurrent or workers
-
-        self._ensure_executor(workers)
-
-        semaphore = anyio.CapacityLimiter(max_concurrent)
-
-        async def submit_single(args: tuple[Any, ...]) -> T:
-            async with semaphore:
-                self._active_tasks += 1
-                try:
-                    return await anyio.to_thread.run_sync(func, *args)
-                finally:
-                    self._active_tasks -= 1
-
-        async with anyio.create_task_group() as tg:
-            results: list[T] = [None] * len(arg_batches)  # type: ignore[list-item]
-
-            async def run_task(idx: int, args: tuple[Any, ...]) -> None:
-                results[idx] = await submit_single(args)
-
-            for idx, args in enumerate(arg_batches):
-                tg.start_soon(run_task, idx, args)
-
-        return results
-
-    def get_system_info(self) -> dict[str, Any]:
-        """Get current system resource information."""
-        memory = psutil.virtual_memory()
-        cpu_percent = psutil.cpu_percent(interval=1)
-
-        return {
-            "cpu_count": mp.cpu_count(),
-            "cpu_percent": cpu_percent,
-            "memory_total": memory.total,
-            "memory_available": memory.available,
-            "memory_percent": memory.percent,
-            "active_tasks": self._active_tasks,
-            "max_processes": self.max_processes,
-            "memory_limit": self.memory_limit_bytes,
-        }
-
-    def shutdown(self, wait: bool = True) -> None:
-        """Shutdown the process pool."""
-        if self._executor is not None:
-            self._executor.shutdown(wait=wait)
-            self._executor = None
-
-    def __enter__(self) -> Self:
-        """Context manager entry."""
-        return self
-
-    def __exit__(
-        self,
-        exc_type: type[BaseException] | None,
-        exc_val: BaseException | None,
-        exc_tb: types.TracebackType | None,
-    ) -> None:
-        """Context manager exit."""
-        self.shutdown()
-
-    async def __aenter__(self) -> Self:
-        """Async context manager entry."""
-        return self
-
-    async def __aexit__(
-        self,
-        exc_type: type[BaseException] | None,
-        exc_val: BaseException | None,
-        exc_tb: types.TracebackType | None,
-    ) -> None:
-        """Async context manager exit."""
-        self.shutdown()
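
The sizing rule worth keeping in mind from this deleted class: workers are capped both by CPU count and by how many tasks of the estimated footprint fit into a memory budget (75% of available RAM by default), i.e. workers = min(max_processes, max(1, memory_limit_bytes // task_memory_bytes)). Per the file list, this role appears to pass to kreuzberg/_utils/_process_pool.py (item 29, +182 −9). A standalone sketch of the same arithmetic follows; optimal_workers is an illustrative helper, not a kreuzberg API.

import multiprocessing as mp

import psutil


def optimal_workers(task_memory_mb: float = 100, memory_fraction: float = 0.75) -> int:
    # Budget a fraction of currently available RAM, as the removed manager did.
    budget_bytes = int(psutil.virtual_memory().available * memory_fraction)
    memory_based = max(1, budget_bytes // int(task_memory_mb * 1024**2))
    # Never exceed the CPU-based cap.
    return min(mp.cpu_count(), memory_based)


if __name__ == "__main__":
    # Example: with a 12 GiB budget and 512 MiB per task, the memory cap is 24 workers.
    print(optimal_workers(task_memory_mb=512))
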