kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pandoc.py
@@ -84,8 +84,6 @@ NodeType = Literal[
 
 
  class PandocExtractor(Extractor):
- """Extractor for documents supported by Pandoc."""
-
  _checked_version: bool = False
 
  MIMETYPE_TO_PANDOC_TYPE_MAPPING: ClassVar[Mapping[str, str]] = {
@@ -153,14 +151,6 @@ class PandocExtractor(Extractor):
  }
 
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
- """Extract text and metadata from bytes content using Pandoc.
-
- Args:
- content: The content bytes to process.
-
- Returns:
- ExtractionResult with the extracted text and metadata.
- """
  extension = self._get_pandoc_type_from_mime_type(self.mime_type)
  input_file, unlink = await create_temp_file(f".{extension}")
 
@@ -171,17 +161,6 @@ class PandocExtractor(Extractor):
  await unlink()
 
  async def extract_path_async(self, path: Path) -> ExtractionResult:
- """Extract text and metadata from a file using Pandoc.
-
- Args:
- path: The path to the file to process.
-
- Raises:
- ParsingError: If the file data could not be extracted.
-
- Returns:
- ExtractionResult with the extracted text and metadata.
- """
  await self._validate_pandoc_version()
  self._get_pandoc_type_from_mime_type(self.mime_type)
 
@@ -198,14 +177,6 @@ class PandocExtractor(Extractor):
  raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
 
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
- """Pure sync implementation of extract_bytes.
-
- Args:
- content: The content bytes to process.
-
- Returns:
- ExtractionResult with the extracted text and metadata.
- """
  extension = self._get_pandoc_type_from_mime_type(self.mime_type)
  fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
 
@@ -219,17 +190,6 @@ class PandocExtractor(Extractor):
  Path(temp_path).unlink()
 
  def extract_path_sync(self, path: Path) -> ExtractionResult:
- """Pure sync implementation of extract_path.
-
- Args:
- path: The path to the file to process.
-
- Returns:
- ExtractionResult with the extracted text and metadata.
-
- Raises:
- ParsingError: When file processing fails.
- """
  self._validate_pandoc_version_sync()
  self._get_pandoc_type_from_mime_type(self.mime_type)
 
@@ -244,18 +204,13 @@ class PandocExtractor(Extractor):
  raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
 
  async def _validate_pandoc_version(self) -> None:
- """Validate that the installed Pandoc version meets the minimum requirement.
-
- Raises:
- MissingDependencyError: If Pandoc is not installed or version is too low
- """
  try:
  if self._checked_version:
  return
 
  command = ["pandoc", "--version"]
  result = await run_process(command)
- stdout = result.stdout.decode()
+ stdout = result.stdout.decode("utf-8")
 
  version_match = re.search(
  r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
@@ -299,14 +254,6 @@ class PandocExtractor(Extractor):
 
  @staticmethod
  def _get_pandoc_key(key: str) -> str | None:
- """Map Pandoc metadata keys to our standard metadata keys.
-
- Args:
- key: The key from Pandoc metadata
-
- Returns:
- The mapped key name for our system, or None if not mapped
- """
  if key == "abstract":
  return "summary"
 
@@ -325,17 +272,6 @@ class PandocExtractor(Extractor):
  return key
 
  def _get_pandoc_type_from_mime_type(self, mime_type: str) -> str:
- """Get Pandoc format type from MIME type.
-
- Args:
- mime_type: The MIME type to look up
-
- Returns:
- The corresponding Pandoc type
-
- Raises:
- ValidationError: If mime_type is not supported
- """
  if pandoc_type := (self.MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
  return pandoc_type
 
@@ -349,17 +285,6 @@ class PandocExtractor(Extractor):
  raise ValidationError(f"Unsupported mime type: {mime_type}")
 
  async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
- """Extract metadata from a file using Pandoc.
-
- Args:
- input_file: The file to extract metadata from
-
- Returns:
- The extracted metadata
-
- Raises:
- ParsingError: If metadata extraction fails
- """
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
  metadata_file, unlink = await create_temp_file(".json")
  try:
@@ -389,17 +314,6 @@ class PandocExtractor(Extractor):
  await unlink()
 
  async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
- """Extract text content from a file using Pandoc.
-
- Args:
- input_file: The file to extract content from
-
- Returns:
- The extracted text content
-
- Raises:
- ParsingError: If content extraction fails
- """
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
  output_path, unlink = await create_temp_file(".md")
  try:
@@ -431,14 +345,6 @@ class PandocExtractor(Extractor):
  await unlink()
 
  def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
- """Extract structured metadata from Pandoc JSON metadata.
-
- Args:
- raw_meta: The raw metadata from Pandoc
-
- Returns:
- Structured metadata
- """
  meta: Metadata = {}
 
  if (
@@ -485,16 +391,6 @@ class PandocExtractor(Extractor):
  return meta
 
  def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
- """Extract text from an inline node in a document structure.
-
- Args:
- node: The node to extract text from
- type_field: The field name for the node type
- content_field: The field name for the node content
-
- Returns:
- The extracted text or None if no text could be extracted
- """
  if node_type := node.get(type_field):
  if node_type == "Str":
  return node.get(content_field)
@@ -505,29 +401,11 @@ class PandocExtractor(Extractor):
  return None
 
  def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
- """Extract text from a list of inline nodes.
-
- Args:
- nodes: The list of nodes to extract text from
-
- Returns:
- The extracted text or None if no text could be extracted
- """
  texts = [text for node in nodes if (text := self._extract_inline_text(node))]
  result = "".join(texts).strip()
  return result if result else None
 
  def _extract_meta_value(self, node: Any, type_field: str = "t", content_field: str = "c") -> str | list[str] | None:
- """Extract a metadata value from a node.
-
- Args:
- node: The node to extract metadata from
- type_field: The field name for the node type
- content_field: The field name for the node content
-
- Returns:
- The extracted metadata value or None if no metadata could be extracted
- """
  if not isinstance(node, dict) or type_field not in node:
  return None
 
@@ -577,12 +455,17 @@ class PandocExtractor(Extractor):
  return None
 
  def _validate_pandoc_version_sync(self) -> None:
- """Synchronous version of _validate_pandoc_version."""
  try:
  if self._checked_version:
  return
 
- result = subprocess.run(["pandoc", "--version"], capture_output=True, text=True, check=False)  # noqa: S607
+ result = subprocess.run(
+ ["pandoc", "--version"],  # noqa: S607
+ capture_output=True,
+ text=True,
+ check=False,
+ encoding="utf-8",
+ )
 
  if result.returncode != 0:
  raise MissingDependencyError(
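
The encoding additions in this hunk are behavioral rather than cosmetic: when subprocess.run is called with text=True but no encoding, CPython decodes the child's output with locale.getpreferredencoding(False), which is often cp1252 on Windows, so non-ASCII bytes in Pandoc's output could be mis-decoded. A minimal standalone sketch of the pinned-encoding call (illustration only, not kreuzberg code; assumes pandoc is on PATH):

import subprocess

result = subprocess.run(
    ["pandoc", "--version"],
    capture_output=True,
    text=True,
    check=False,
    encoding="utf-8",  # decode explicitly instead of relying on the locale default
)
if result.returncode == 0:
    print(result.stdout.splitlines()[0])  # e.g. "pandoc 3.x.y"
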
@@ -621,7 +504,6 @@ class PandocExtractor(Extractor):
  ) from e
 
  def _extract_metadata_sync(self, path: Path) -> Metadata:
- """Synchronous version of _handle_extract_metadata."""
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
  fd, metadata_file = tempfile.mkstemp(suffix=".json")
  os.close(fd)
@@ -638,7 +520,7 @@ class PandocExtractor(Extractor):
  str(metadata_file),
  ]
 
- result = subprocess.run(command, capture_output=True, text=True, check=False)
+ result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
 
  if result.returncode != 0:
  raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
@@ -655,7 +537,6 @@ class PandocExtractor(Extractor):
  Path(metadata_file).unlink()
 
  def _extract_file_sync(self, path: Path) -> str:
- """Synchronous version of _handle_extract_file."""
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
  fd, output_path = tempfile.mkstemp(suffix=".md")
  os.close(fd)
@@ -673,7 +554,7 @@ class PandocExtractor(Extractor):
  str(output_path),
  ]
 
- result = subprocess.run(command, capture_output=True, text=True, check=False)
+ result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
 
  if result.returncode != 0:
  raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
@@ -691,8 +572,6 @@ class PandocExtractor(Extractor):
 
 
  class MarkdownExtractor(PandocExtractor):
- """Extractor for Markdown-based document formats."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "text/x-markdown",
  "text/x-commonmark",
@@ -704,8 +583,6 @@ class MarkdownExtractor(PandocExtractor):
 
 
  class OfficeDocumentExtractor(PandocExtractor):
- """Extractor for Office document formats (Word, ODT)."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  "application/vnd.oasis.opendocument.text",
@@ -713,8 +590,6 @@ class OfficeDocumentExtractor(PandocExtractor):
 
 
  class EbookExtractor(PandocExtractor):
- """Extractor for e-book formats (EPUB, FB2)."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "application/epub+zip",
  "application/x-fictionbook+xml",
@@ -722,8 +597,6 @@ class EbookExtractor(PandocExtractor):
 
 
  class StructuredTextExtractor(PandocExtractor):
- """Extractor for structured text formats (RST, Org, etc.)."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "text/x-rst",
  "text/x-org",
@@ -733,8 +606,6 @@ class StructuredTextExtractor(PandocExtractor):
 
 
  class LaTeXExtractor(PandocExtractor):
- """Extractor for LaTeX and Typst documents."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "application/x-latex",
  "application/x-typst",
@@ -742,8 +613,6 @@ class LaTeXExtractor(PandocExtractor):
 
 
  class BibliographyExtractor(PandocExtractor):
- """Extractor for bibliography formats (BibTeX, CSL JSON, etc.)."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "application/x-bibtex",
  "application/x-biblatex",
@@ -754,8 +623,6 @@ class BibliographyExtractor(PandocExtractor):
 
 
  class XMLBasedExtractor(PandocExtractor):
- """Extractor for XML-based document formats (DocBook, JATS, OPML)."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "application/docbook+xml",
  "application/x-jats+xml",
@@ -764,8 +631,6 @@ class XMLBasedExtractor(PandocExtractor):
 
 
  class TabularDataExtractor(PandocExtractor):
- """Extractor for tabular data formats (CSV, TSV)."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "text/csv",
  "text/tab-separated-values",
@@ -773,8 +638,6 @@ class TabularDataExtractor(PandocExtractor):
 
 
  class MiscFormatExtractor(PandocExtractor):
- """Extractor for miscellaneous formats (RTF, man, Jupyter notebooks)."""
-
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
  "application/rtf",
  "text/troff",
kreuzberg/_extractors/_pdf.py
@@ -18,11 +18,8 @@ from playa import parse
  from kreuzberg._extractors._base import Extractor
  from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
  from kreuzberg._ocr import get_ocr_backend
- from kreuzberg._ocr._easyocr import EasyOCRConfig
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
- from kreuzberg._ocr._tesseract import TesseractConfig
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
- from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
+ from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
  from kreuzberg._utils._errors import create_error_context, should_retry
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
  from kreuzberg._utils._string import normalize_spaces
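
This import consolidation means the OCR configuration dataclasses are now available from kreuzberg._types rather than the individual backend modules. A hedged sketch of what imports look like after the change; the no-argument constructors are taken from later hunks in this diff, the rest is illustrative:

from dataclasses import fields

from kreuzberg._types import EasyOCRConfig, PaddleOCRConfig, TesseractConfig

# Later hunks call TesseractConfig(), PaddleOCRConfig() and EasyOCRConfig() with
# no arguments, so default construction is expected to work; listing the dataclass
# field names here is purely illustrative.
for config_cls in (TesseractConfig, PaddleOCRConfig, EasyOCRConfig):
    print(config_cls.__name__, sorted(f.name for f in fields(config_cls)))
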
@@ -65,7 +62,6 @@ class PDFExtractor(Extractor):
  if self._validate_extracted_text(content):
  result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
  except ParsingError:
- # If searchable text extraction fails, continue to OCR or empty result
  pass
 
  if not result and self.config.ocr_backend is not None:
@@ -77,7 +73,7 @@ class PDFExtractor(Extractor):
  result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
 
  if self.config.extract_tables:
- # GMFT is optional dependency
+ # GMFT is optional dependency ~keep
  try:
  from kreuzberg._gmft import extract_tables  # noqa: PLC0415
 
@@ -85,7 +81,6 @@ class PDFExtractor(Extractor):
  except ImportError:  # pragma: no cover
  result.tables = []
 
- # Enhance metadata with table information
  if result.tables:
  table_summary = generate_table_summary(result.tables)
  result.metadata = result.metadata | {
@@ -98,7 +93,6 @@ class PDFExtractor(Extractor):
  return self._apply_quality_processing(result)
 
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
- """Pure sync implementation of PDF extraction from bytes."""
  fd, temp_path = tempfile.mkstemp(suffix=".pdf")
  try:
  with os.fdopen(fd, "wb") as f:
@@ -115,7 +109,6 @@ class PDFExtractor(Extractor):
  Path(temp_path).unlink()
 
  def extract_path_sync(self, path: Path) -> ExtractionResult:
- """Pure sync implementation of PDF extraction from path."""
  try:
  text = self._extract_pdf_searchable_text_sync(path)
  except ParsingError:
@@ -126,7 +119,7 @@ class PDFExtractor(Extractor):
 
  tables = []
  if self.config.extract_tables:
- # GMFT is optional dependency
+ # GMFT is optional dependency ~keep
  try:
  from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
 
@@ -134,7 +127,6 @@ class PDFExtractor(Extractor):
  except ImportError:
  tables = []
 
- # Use playa for better text structure preservation when not using OCR
  if not self.config.force_ocr and self._validate_extracted_text(text):
  text = self._extract_with_playa_sync(path, fallback_text=text)
 
@@ -148,7 +140,6 @@ class PDFExtractor(Extractor):
  chunks=[],
  )
 
- # Enhance metadata with table information
  if tables:
  table_summary = generate_table_summary(tables)
  result.metadata = result.metadata | {
@@ -158,25 +149,9 @@ class PDFExtractor(Extractor):
  f"{table_summary['total_rows']} total rows",
  }
 
- # Apply quality processing
  return self._apply_quality_processing(result)
 
  def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
- """Check if text extracted from PDF is valid or corrupted.
-
- This checks for indicators of corrupted PDF text extraction:
- 1. Empty or whitespace-only text
- 2. High concentration of control characters and null bytes
- 3. High concentration of Unicode replacement characters
-
- Args:
- text: The extracted text to validate
- corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
- characters (default: 0.05 or 5%)
-
- Returns:
- True if the text appears valid, False if it seems corrupted
- """
  if not text or not text.strip():
  return False
 
@@ -188,17 +163,6 @@ class PDFExtractor(Extractor):
  return (len(corruption_matches) / len(text)) < corruption_threshold
 
  async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
- """Convert a PDF file to images.
-
- Args:
- input_file: The path to the PDF file.
-
- Raises:
- ParsingError: If the PDF file could not be converted to images.
-
- Returns:
- A list of Pillow Images.
- """
  document: pypdfium2.PdfDocument | None = None
  last_error = None
 
@@ -206,7 +170,7 @@ class PDFExtractor(Extractor):
  try:
  with pypdfium_file_lock(input_file):
  document = await run_sync(pypdfium2.PdfDocument, str(input_file))
- return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
+ return [page.render(scale=200 / 72).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
  except pypdfium2.PdfiumError as e:  # noqa: PERF203
  last_error = e
  if not should_retry(e, attempt + 1):
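
The scale change above is a resolution change: pypdfium2 documents render(scale=...) as a multiplier on PDF canvas units of 1/72 inch, so scale=1 corresponds to 72 DPI. The old factor of 4.25 rasterized pages at roughly 306 DPI, while 200 / 72 targets about 200 DPI, shrinking the bitmaps handed to OCR. A quick arithmetic sketch (not kreuzberg code):

# DPI implied by each scale factor, and pixel size of a US Letter page (8.5 x 11 in).
old_scale, new_scale = 4.25, 200 / 72
print(old_scale * 72, new_scale * 72)              # 306.0 vs roughly 200.0 DPI
print(8.5 * old_scale * 72, 11 * old_scale * 72)   # about 2601 x 3366 px before
print(8.5 * new_scale * 72, 11 * new_scale * 72)   # about 1700 x 2200 px after
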
@@ -238,39 +202,18 @@ class PDFExtractor(Extractor):
  ) from last_error
 
  async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
- """Extract text from a scanned PDF file using OCR.
-
- Args:
- input_file: The path to the PDF file.
- ocr_backend: The OCR backend to use.
-
- Returns:
- The extraction result with text content and metadata.
- """
  images = await self._convert_pdf_to_images(input_file)
  backend = get_ocr_backend(ocr_backend)
  ocr_results = await run_taskgroup_batched(
  *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
  batch_size=cpu_count(),
  )
- # Use list comprehension and join for efficient string building
  content = "\n".join(result.content for result in ocr_results)
 
  return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
 
  @staticmethod
  async def _extract_pdf_searchable_text(input_file: Path) -> str:
- """Extract text from a searchable PDF file using pypdfium2.
-
- Args:
- input_file: The path to the PDF file.
-
- Raises:
- ParsingError: If the text could not be extracted from the PDF file.
-
- Returns:
- The extracted text.
- """
  document: pypdfium2.PdfDocument | None = None
  try:
  with pypdfium_file_lock(input_file):
@@ -318,7 +261,6 @@ class PDFExtractor(Extractor):
  await run_sync(document.close)
 
  def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
- """Extract searchable text from PDF using pypdfium2 (sync version)."""
  pdf = None
  try:
  with pypdfium_file_lock(path):
@@ -339,7 +281,6 @@ class PDFExtractor(Extractor):
  pdf.close()
 
  def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
- """Extract text from PDF using OCR (sync version)."""
  pdf = None
  try:
  images = []
@@ -352,23 +293,7 @@ class PDFExtractor(Extractor):
  bitmap.close()
  page.close()
 
- image_paths = []
- temp_files = []
-
- try:
- for i, img in enumerate(images):
- fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
- temp_files.append((fd, temp_path))
- img.save(temp_path, format="PNG")
- os.close(fd)
- image_paths.append(temp_path)
-
- return self._process_pdf_images_with_ocr(image_paths)
-
- finally:
- for _, temp_path in temp_files:
- with contextlib.suppress(OSError):
- Path(temp_path).unlink()
+ return self._process_pdf_images_with_ocr_direct(images)
 
  except Exception as e:
  raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -378,7 +303,6 @@ class PDFExtractor(Extractor):
  pdf.close()
 
  def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
- """Process PDF images with the configured OCR backend."""
  backend = get_ocr_backend(self.config.ocr_backend)
  paths = [Path(p) for p in image_paths]
 
@@ -401,18 +325,47 @@ class PDFExtractor(Extractor):
  case _:
  raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
 
- # Use list comprehension and join for efficient string building
+ return "\n\n".join(result.content for result in results)
+
+ def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
+ backend = get_ocr_backend(self.config.ocr_backend)
+
+ match self.config.ocr_backend:
+ case "tesseract":
+ config = (
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+ )
+ results = []
+ for image in images:
+ result = backend.process_image_sync(image, **asdict(config))
+ results.append(result)
+ case "paddleocr":
+ paddle_config = (
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+ )
+ results = []
+ for image in images:
+ result = backend.process_image_sync(image, **asdict(paddle_config))
+ results.append(result)
+ case "easyocr":
+ easy_config = (
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+ )
+ results = []
+ for image in images:
+ result = backend.process_image_sync(image, **asdict(easy_config))
+ results.append(result)
+ case _:
+ raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
+
  return "\n\n".join(result.content for result in results)
 
  def _parse_with_password_attempts(self, content: bytes) -> Document:
- """Parse PDF with password attempts."""
- # Normalize password to list
  if isinstance(self.config.pdf_password, str):
  passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
  else:
  passwords = list(self.config.pdf_password)
 
- # Try each password in sequence
  last_exception = None
  for password in passwords:
  try:
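
The three match arms in the new _process_pdf_images_with_ocr_direct differ only in which config dataclass supplies defaults when self.config.ocr_config is missing or of the wrong type. A condensed sketch of the same selection written against a mapping rather than a match statement; this mirrors the diff's logic but is not the shipped implementation, and the imports follow the layout shown above:

from dataclasses import asdict

from kreuzberg._ocr import get_ocr_backend
from kreuzberg._types import EasyOCRConfig, PaddleOCRConfig, TesseractConfig

_DEFAULT_CONFIGS = {
    "tesseract": TesseractConfig,
    "paddleocr": PaddleOCRConfig,
    "easyocr": EasyOCRConfig,
}

def ocr_images_direct(ocr_backend: str, ocr_config, images) -> str:
    # Fall back to the backend's default config when none (or a mismatched one) is supplied.
    config_cls = _DEFAULT_CONFIGS[ocr_backend]
    config = ocr_config if isinstance(ocr_config, config_cls) else config_cls()
    backend = get_ocr_backend(ocr_backend)
    results = [backend.process_image_sync(image, **asdict(config)) for image in images]
    return "\n\n".join(result.content for result in results)
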
@@ -421,21 +374,17 @@ class PDFExtractor(Extractor):
  last_exception = e
  continue
 
- # If all passwords failed, raise the last exception
  if last_exception:
  raise last_exception from None
 
- # Fallback to no password
  return parse(content, max_workers=1, password="")
 
  def _get_passwords_to_try(self) -> list[str]:
- """Get list of passwords to try in sequence."""
  if isinstance(self.config.pdf_password, str):
  return [self.config.pdf_password] if self.config.pdf_password else [""]
  return list(self.config.pdf_password) if self.config.pdf_password else [""]
 
  async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
- """Extract PDF metadata with password attempts."""
  passwords = self._get_passwords_to_try()
 
  last_exception = None
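
Per _get_passwords_to_try and the hunk above, config.pdf_password may be a single string or a sequence of candidates that are tried in order, with an empty password as the final fallback. A usage sketch under the assumption that ExtractionConfig and extract_file_sync are the public entry points (they are not part of this diff):

from kreuzberg import ExtractionConfig, extract_file_sync  # assumed public API

# Hypothetical candidate passwords and file name; each password is attempted in order.
config = ExtractionConfig(pdf_password=["owner-secret", "user-secret", ""])
result = extract_file_sync("statement.pdf", config=config)
print(result.metadata.get("title"))
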
@@ -446,7 +395,6 @@ class PDFExtractor(Extractor):
  last_exception = e
  continue
 
- # If all passwords failed, try with empty password as fallback
  try:
  return await extract_pdf_metadata(content, password="")
  except Exception:
@@ -455,7 +403,6 @@ class PDFExtractor(Extractor):
  raise
 
  def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
- """Extract PDF metadata with password attempts (sync version)."""
  passwords = self._get_passwords_to_try()
 
  last_exception = None
@@ -466,7 +413,6 @@ class PDFExtractor(Extractor):
  last_exception = e
  continue
 
- # If all passwords failed, try with empty password as fallback
  try:
  return extract_pdf_metadata_sync(content, password="")
  except Exception:
@@ -475,12 +421,10 @@ class PDFExtractor(Extractor):
  raise
 
  def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
- """Extract text using playa for better structure preservation."""
  with contextlib.suppress(Exception):
  content = path.read_bytes()
  document = self._parse_with_password_attempts(content)
 
- # Extract text while preserving structure
  pages_text = []
  for page in document.pages:
  page_text = page.extract_text()