kreuzberg 3.13.0__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/_chunker.py +0 -15
  2. kreuzberg/_config.py +0 -124
  3. kreuzberg/_document_classification.py +20 -39
  4. kreuzberg/_entity_extraction.py +0 -29
  5. kreuzberg/_extractors/_base.py +4 -66
  6. kreuzberg/_extractors/_email.py +0 -4
  7. kreuzberg/_extractors/_image.py +0 -2
  8. kreuzberg/_extractors/_pandoc.py +0 -58
  9. kreuzberg/_extractors/_pdf.py +0 -3
  10. kreuzberg/_extractors/_presentation.py +0 -82
  11. kreuzberg/_extractors/_spread_sheet.py +0 -2
  12. kreuzberg/_gmft.py +0 -61
  13. kreuzberg/_language_detection.py +0 -14
  14. kreuzberg/_mime_types.py +0 -17
  15. kreuzberg/_ocr/_base.py +4 -76
  16. kreuzberg/_ocr/_easyocr.py +110 -85
  17. kreuzberg/_ocr/_paddleocr.py +146 -138
  18. kreuzberg/_ocr/_table_extractor.py +0 -76
  19. kreuzberg/_ocr/_tesseract.py +0 -206
  20. kreuzberg/_playa.py +0 -27
  21. kreuzberg/_registry.py +0 -36
  22. kreuzberg/_types.py +16 -119
  23. kreuzberg/_utils/_cache.py +0 -52
  24. kreuzberg/_utils/_device.py +0 -56
  25. kreuzberg/_utils/_document_cache.py +0 -73
  26. kreuzberg/_utils/_errors.py +0 -47
  27. kreuzberg/_utils/_ocr_cache.py +136 -0
  28. kreuzberg/_utils/_pdf_lock.py +0 -14
  29. kreuzberg/_utils/_process_pool.py +0 -47
  30. kreuzberg/_utils/_quality.py +0 -17
  31. kreuzberg/_utils/_ref.py +0 -16
  32. kreuzberg/_utils/_serialization.py +0 -25
  33. kreuzberg/_utils/_string.py +0 -20
  34. kreuzberg/_utils/_sync.py +0 -76
  35. kreuzberg/_utils/_table.py +0 -45
  36. kreuzberg/_utils/_tmp.py +0 -9
  37. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +3 -2
  38. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  39. kreuzberg-3.13.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
@@ -231,7 +231,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
231
231
  ocr_cache.mark_complete(**cache_kwargs)
232
232
 
233
233
  async def _handle_cache_lookup(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
234
- """Handle cache lookup before processing."""
235
234
  ocr_cache = get_ocr_cache()
236
235
 
237
236
  cached_result = await ocr_cache.aget(**cache_kwargs)
@@ -249,7 +248,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
249
248
  return None
250
249
 
251
250
  def _prepare_tesseract_run_config(self, **kwargs: Any) -> dict[str, Any]:
252
- """Prepare configuration for a Tesseract run."""
253
251
  language = self._validate_language_code(kwargs.pop("language", "eng"))
254
252
  psm = kwargs.pop("psm", PSMMode.AUTO)
255
253
  output_format = kwargs.pop("output_format", "markdown")
@@ -282,7 +280,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
282
280
  }
283
281
 
284
282
  async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
285
- """Build and execute the Tesseract command."""
286
283
  command = [
287
284
  "tesseract",
288
285
  str(path),
@@ -327,7 +324,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
327
324
  ) from e
328
325
 
329
326
  async def _process_tesseract_output(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
330
- """Process the raw output from Tesseract based on the requested format."""
331
327
  output_format = run_config["output_format"]
332
328
  enable_table_detection = run_config["enable_table_detection"]
333
329
  kwargs = run_config["remaining_kwargs"]
@@ -413,17 +409,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
413
409
  table_row_threshold_ratio: float = 0.5,
414
410
  table_min_confidence: float = 30.0,
415
411
  ) -> ExtractionResult:
416
- """Process TSV output and extract tables if detected.
417
-
418
- Args:
419
- tsv_content: Raw TSV output from Tesseract.
420
- table_column_threshold: Pixel threshold for column clustering.
421
- table_row_threshold_ratio: Row threshold as ratio of mean text height.
422
- table_min_confidence: Minimum confidence score to include a word.
423
-
424
- Returns:
425
- ExtractionResult with extracted content and tables.
426
- """
427
412
  text_result = self._extract_text_from_tsv(tsv_content)
428
413
 
429
414
  try:
@@ -460,14 +445,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
460
445
  return text_result
461
446
 
462
447
  def _extract_text_from_tsv(self, tsv_content: str) -> ExtractionResult:
463
- """Extract plain text from TSV output.
464
-
465
- Args:
466
- tsv_content: Raw TSV output from Tesseract.
467
-
468
- Returns:
469
- ExtractionResult with extracted text.
470
- """
471
448
  try:
472
449
  reader = csv.DictReader(StringIO(tsv_content), delimiter="\t")
473
450
 
@@ -527,20 +504,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
527
504
  table_min_confidence: float = 30.0,
528
505
  **_kwargs: Any,
529
506
  ) -> ExtractionResult:
530
- """Convert hOCR content to Markdown with table detection.
531
-
532
- Args:
533
- hocr_content: Raw hOCR HTML/XML content from Tesseract.
534
- enable_table_detection: Whether to detect and format tables.
535
- html_to_markdown_config: Configuration for HTML to Markdown conversion.
536
- table_column_threshold: Pixel threshold for column clustering.
537
- table_row_threshold_ratio: Row threshold as ratio of mean text height.
538
- table_min_confidence: Minimum confidence score to include a word.
539
- **kwargs: Additional configuration options.
540
-
541
- Returns:
542
- ExtractionResult with Markdown content and detected tables.
543
- """
544
507
  config = html_to_markdown_config or HTMLToMarkdownConfig(
545
508
  escape_asterisks=False,
546
509
  escape_underscores=False,
@@ -610,20 +573,15 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
610
573
  )
611
574
 
612
575
  def _create_basic_converters(self) -> dict[str, Any]:
613
- """Create basic converters for individual hOCR elements."""
614
-
615
576
  def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
616
- """Custom converter for hOCR word elements - adds spaces between words."""
617
577
  del tag
618
578
  return f"{text.strip()} "
619
579
 
620
580
  def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
621
- """Custom converter for hOCR line elements - handles line breaks."""
622
581
  del tag
623
582
  return f"{text.strip()}\n"
624
583
 
625
584
  def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
626
- """Custom converter for hOCR paragraph elements - handles paragraph breaks."""
627
585
  del tag
628
586
  content = text.strip()
629
587
  if not content:
@@ -631,7 +589,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
631
589
  return f"{content}\n\n"
632
590
 
633
591
  def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
634
- """Custom converter for hOCR content area elements."""
635
592
  del tag
636
593
  content = text.strip()
637
594
  if not content:
@@ -639,17 +596,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
639
596
  return f"{content}\n\n"
640
597
 
641
598
  def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
642
- """Custom converter for hOCR page elements."""
643
599
  del tag
644
600
  return text.strip()
645
601
 
646
602
  def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
647
- """Custom converter for hOCR separator elements - convert to horizontal rules."""
648
603
  del tag, text
649
604
  return "---\n"
650
605
 
651
606
  def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
652
- """Custom converter for hOCR photo/image elements - indicate image presence."""
653
607
  del text
654
608
  title = tag.get("title", "")
655
609
  if isinstance(title, str):
@@ -672,18 +626,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
672
626
  }
673
627
 
674
628
  def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
675
- """Create custom converters for hOCR elements that preserve spacing.
676
-
677
- Args:
678
- tables: List of detected tables (not used for filtering, tables added separately).
679
-
680
- Returns:
681
- Dictionary mapping HTML tags to converter functions.
682
- """
683
629
  basic_converters = self._create_basic_converters()
684
630
 
685
631
  def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
686
- """Generic converter for div elements based on class."""
687
632
  class_attr = tag.get("class", "")
688
633
  if isinstance(class_attr, list):
689
634
  class_attr = " ".join(class_attr)
@@ -697,7 +642,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
697
642
  return text
698
643
 
699
644
  def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
700
- """Generic converter for span elements based on class."""
701
645
  class_attr = tag.get("class", "")
702
646
  if isinstance(class_attr, list):
703
647
  class_attr = " ".join(class_attr)
@@ -717,15 +661,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
717
661
  }
718
662
 
719
663
  def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
720
- """Synchronously process hOCR content to markdown format.
721
-
722
- Args:
723
- hocr_content: Raw hOCR content as string
724
- config: Tesseract configuration object
725
-
726
- Returns:
727
- ExtractionResult with markdown content
728
- """
729
664
  tables: list[TableData] = []
730
665
 
731
666
  if config.enable_table_detection:
@@ -795,17 +730,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
795
730
  table_row_threshold_ratio: float = 0.5,
796
731
  table_min_confidence: float = 30.0,
797
732
  ) -> ExtractionResult:
798
- """Synchronously process TSV output and extract tables if detected.
799
-
800
- Args:
801
- tsv_content: Raw TSV output from Tesseract.
802
- table_column_threshold: Pixel threshold for column clustering.
803
- table_row_threshold_ratio: Row threshold as ratio of mean text height.
804
- table_min_confidence: Minimum confidence score to include a word.
805
-
806
- Returns:
807
- ExtractionResult with extracted content and tables.
808
- """
809
733
  text_result = self._extract_text_from_tsv(tsv_content)
810
734
 
811
735
  try:
@@ -848,17 +772,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
848
772
  row_threshold_ratio: float = 0.5,
849
773
  min_confidence: float = 30.0,
850
774
  ) -> list[TableData]:
851
- """Extract tables from hOCR structure using coordinate analysis.
852
-
853
- Args:
854
- soup: Parsed hOCR BeautifulSoup object.
855
- column_threshold: Pixel threshold for column clustering.
856
- row_threshold_ratio: Row threshold as ratio of mean text height.
857
- min_confidence: Minimum confidence score to include a word.
858
-
859
- Returns:
860
- List of detected tables as TableData objects.
861
- """
862
775
  tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
863
776
 
864
777
  if not tsv_data:
@@ -903,15 +816,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
903
816
  return tables
904
817
 
905
818
  async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
906
- """Convert hOCR structure to TSV format for table extraction.
907
-
908
- Args:
909
- soup: Parsed hOCR BeautifulSoup object.
910
- min_confidence: Minimum confidence score to include.
911
-
912
- Returns:
913
- TSV formatted string compatible with table extractor.
914
- """
915
819
  tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
916
820
 
917
821
  words = soup.find_all("span", class_="ocrx_word")
@@ -947,14 +851,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
947
851
  return "\n".join(tsv_lines)
948
852
 
949
853
  def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
950
- """Identify potential table regions from word coordinates.
951
-
952
- Args:
953
- words: List of word dictionaries with coordinates.
954
-
955
- Returns:
956
- List of word groups representing potential tables.
957
- """
958
854
  if not words:
959
855
  return []
960
856
 
@@ -962,11 +858,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
962
858
 
963
859
  @classmethod
964
860
  async def _validate_tesseract_version(cls) -> None:
965
- """Validate that Tesseract is installed and is version 5 or above.
966
-
967
- Raises:
968
- MissingDependencyError: If Tesseract is not installed or is below version 5.
969
- """
970
861
  try:
971
862
  if cls._version_checked:
972
863
  return
@@ -992,7 +883,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
992
883
  ) from e
993
884
 
994
885
  def _handle_cache_lookup_sync(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
995
- """Handle cache lookup before processing (sync)."""
996
886
  ocr_cache = get_ocr_cache()
997
887
 
998
888
  cached_result = ocr_cache.get(**cache_kwargs)
@@ -1010,7 +900,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1010
900
  return None
1011
901
 
1012
902
  def _execute_tesseract_sync(self, command: list[str]) -> None:
1013
- """Run tesseract command synchronously."""
1014
903
  env = os.environ.copy()
1015
904
  if sys.platform.startswith("linux"):
1016
905
  env["OMP_THREAD_LIMIT"] = "1"
@@ -1038,7 +927,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1038
927
  ) from e
1039
928
 
1040
929
  def _process_tesseract_output_sync(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
1041
- """Process the raw output from Tesseract based on the requested format (sync)."""
1042
930
  output_format = run_config["output_format"]
1043
931
  enable_table_detection = run_config["enable_table_detection"]
1044
932
  kwargs = run_config["remaining_kwargs"]
@@ -1063,7 +951,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1063
951
  )
1064
952
 
1065
953
  def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
1066
- """Synchronously process an image and extract its text and metadata."""
1067
954
  use_cache = kwargs.pop("use_cache", True)
1068
955
 
1069
956
  save_image = image
@@ -1107,7 +994,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1107
994
  ocr_cache.mark_complete(**cache_kwargs)
1108
995
 
1109
996
  def process_file_sync(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
1110
- """Synchronously process a file and extract its text and metadata."""
1111
997
  use_cache = kwargs.pop("use_cache", True)
1112
998
 
1113
999
  file_info = self._get_file_info(path)
@@ -1188,7 +1074,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1188
1074
  ocr_cache.mark_complete(**cache_kwargs)
1189
1075
 
1190
1076
  def _get_file_info(self, path: Path) -> dict[str, Any]:
1191
- """Get file information for caching."""
1192
1077
  try:
1193
1078
  stat = path.stat()
1194
1079
  return {
@@ -1206,7 +1091,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1206
1091
  def _build_tesseract_command(
1207
1092
  self, path: Path, output_base: str, language: str, psm: PSMMode, output_format: str = "text", **kwargs: Any
1208
1093
  ) -> list[str]:
1209
- """Build tesseract command with all parameters."""
1210
1094
  command = [
1211
1095
  "tesseract",
1212
1096
  str(path),
@@ -1235,11 +1119,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1235
1119
 
1236
1120
  @classmethod
1237
1121
  def _validate_tesseract_version_sync(cls) -> None:
1238
- """Synchronously validate that Tesseract is installed and is version 5 or above.
1239
-
1240
- Raises:
1241
- MissingDependencyError: If Tesseract is not installed or is below version 5.
1242
- """
1243
1122
  try:
1244
1123
  if cls._version_checked:
1245
1124
  return
@@ -1265,17 +1144,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1265
1144
 
1266
1145
  @staticmethod
1267
1146
  def _validate_language_code(language_code: str) -> str:
1268
- """Convert a language code to Tesseract format.
1269
-
1270
- Args:
1271
- language_code: Tesseract supported language code or multiple language codes connected with '+'
1272
-
1273
- Raises:
1274
- ValidationError: If the language is not supported by Tesseract
1275
-
1276
- Returns:
1277
- Language code compatible with Tesseract
1278
- """
1279
1147
  normalized = language_code.lower()
1280
1148
  if normalized in TESSERACT_SUPPORTED_LANGUAGE_CODES:
1281
1149
  return normalized
@@ -1300,18 +1168,6 @@ def _process_image_with_tesseract(
1300
1168
  image_path: str,
1301
1169
  config_dict: dict[str, Any],
1302
1170
  ) -> dict[str, Any]:
1303
- """Process a single image with Tesseract in a separate process.
1304
-
1305
- This function is designed to be executed in a subprocess.
1306
- It uses direct tesseract command execution to avoid async complications.
1307
-
1308
- Args:
1309
- image_path: Path to the image file.
1310
- config_dict: Tesseract configuration as dictionary.
1311
-
1312
- Returns:
1313
- OCR result as dictionary.
1314
- """
1315
1171
  try:
1316
1172
  with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
1317
1173
  output_base = tmp_file.name.replace(".txt", "")
@@ -1399,15 +1255,6 @@ def _process_image_bytes_with_tesseract(
1399
1255
  image_bytes: bytes,
1400
1256
  config_dict: dict[str, Any],
1401
1257
  ) -> dict[str, Any]:
1402
- """Process image bytes with Tesseract in a separate process.
1403
-
1404
- Args:
1405
- image_bytes: Image data as bytes.
1406
- config_dict: Tesseract configuration as dictionary.
1407
-
1408
- Returns:
1409
- OCR result as dictionary.
1410
- """
1411
1258
  try:
1412
1259
  with (
1413
1260
  tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image,
@@ -1433,21 +1280,12 @@ def _process_image_bytes_with_tesseract(
1433
1280
 
1434
1281
 
1435
1282
  class TesseractProcessPool:
1436
- """Process pool for parallel Tesseract OCR processing."""
1437
-
1438
1283
  def __init__(
1439
1284
  self,
1440
1285
  config: TesseractConfig | None = None,
1441
1286
  max_processes: int | None = None,
1442
1287
  memory_limit_gb: float | None = None,
1443
1288
  ) -> None:
1444
- """Initialize the Tesseract process pool.
1445
-
1446
- Args:
1447
- config: Default Tesseract configuration.
1448
- max_processes: Maximum number of processes.
1449
- memory_limit_gb: Memory limit in GB.
1450
- """
1451
1289
  from kreuzberg._utils._process_pool import ProcessPoolManager # noqa: PLC0415
1452
1290
 
1453
1291
  self.config = config or TesseractConfig()
@@ -1457,7 +1295,6 @@ class TesseractProcessPool:
1457
1295
  )
1458
1296
 
1459
1297
  def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
1460
- """Convert TesseractConfig to dictionary for pickling."""
1461
1298
  cfg = config or self.config
1462
1299
 
1463
1300
  config_dict = {}
@@ -1472,7 +1309,6 @@ class TesseractProcessPool:
1472
1309
  return config_dict
1473
1310
 
1474
1311
  def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
1475
- """Convert result dictionary back to OCRResult."""
1476
1312
  if not result_dict["success"]:
1477
1313
  raise OCRError(f"Tesseract processing failed: {result_dict['error']}")
1478
1314
 
@@ -1488,15 +1324,6 @@ class TesseractProcessPool:
1488
1324
  image_path: str | Path,
1489
1325
  config: TesseractConfig | None = None,
1490
1326
  ) -> ExtractionResult:
1491
- """Process a single image file with Tesseract.
1492
-
1493
- Args:
1494
- image_path: Path to the image file.
1495
- config: Tesseract configuration (uses default if None).
1496
-
1497
- Returns:
1498
- OCR result.
1499
- """
1500
1327
  config_dict = self._config_to_dict(config)
1501
1328
 
1502
1329
  task_memory_mb = 80
@@ -1515,15 +1342,6 @@ class TesseractProcessPool:
1515
1342
  image_bytes: bytes,
1516
1343
  config: TesseractConfig | None = None,
1517
1344
  ) -> ExtractionResult:
1518
- """Process image bytes with Tesseract.
1519
-
1520
- Args:
1521
- image_bytes: Image data as bytes.
1522
- config: Tesseract configuration (uses default if None).
1523
-
1524
- Returns:
1525
- OCR result.
1526
- """
1527
1345
  config_dict = self._config_to_dict(config)
1528
1346
 
1529
1347
  image_size_mb = len(image_bytes) / 1024 / 1024
@@ -1544,16 +1362,6 @@ class TesseractProcessPool:
1544
1362
  config: TesseractConfig | None = None,
1545
1363
  max_concurrent: int | None = None,
1546
1364
  ) -> list[ExtractionResult]:
1547
- """Process a batch of images in parallel.
1548
-
1549
- Args:
1550
- image_paths: List of image file paths.
1551
- config: Tesseract configuration (uses default if None).
1552
- max_concurrent: Maximum concurrent processes.
1553
-
1554
- Returns:
1555
- List of OCR results in the same order as input.
1556
- """
1557
1365
  if not image_paths:
1558
1366
  return []
1559
1367
 
@@ -1578,16 +1386,6 @@ class TesseractProcessPool:
1578
1386
  config: TesseractConfig | None = None,
1579
1387
  max_concurrent: int | None = None,
1580
1388
  ) -> list[ExtractionResult]:
1581
- """Process a batch of image bytes in parallel.
1582
-
1583
- Args:
1584
- image_bytes_list: List of image data as bytes.
1585
- config: Tesseract configuration (uses default if None).
1586
- max_concurrent: Maximum concurrent processes.
1587
-
1588
- Returns:
1589
- List of OCR results in the same order as input.
1590
- """
1591
1389
  if not image_bytes_list:
1592
1390
  return []
1593
1391
 
@@ -1608,15 +1406,12 @@ class TesseractProcessPool:
1608
1406
  return [self._result_from_dict(result_dict) for result_dict in result_dicts]
1609
1407
 
1610
1408
  def get_system_info(self) -> dict[str, Any]:
1611
- """Get system information from the process manager."""
1612
1409
  return self.process_manager.get_system_info()
1613
1410
 
1614
1411
  def shutdown(self, wait: bool = True) -> None:
1615
- """Shutdown the process pool."""
1616
1412
  self.process_manager.shutdown(wait=wait)
1617
1413
 
1618
1414
  async def __aenter__(self) -> Self:
1619
- """Async context manager entry."""
1620
1415
  return self
1621
1416
 
1622
1417
  async def __aexit__(
@@ -1625,5 +1420,4 @@ class TesseractProcessPool:
1625
1420
  exc_val: BaseException | None,
1626
1421
  exc_tb: object,
1627
1422
  ) -> None:
1628
- """Async context manager exit."""
1629
1423
  self.shutdown()
kreuzberg/_playa.py CHANGED
@@ -25,18 +25,6 @@ BOM_CHAR = "\ufeff"
25
25
 
26
26
 
27
27
  async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
28
- """Extract metadata from a PDF document.
29
-
30
- Args:
31
- pdf_content: The bytes of the PDF document.
32
- password: Password for encrypted PDF files.
33
-
34
- Raises:
35
- ParsingError: If the PDF metadata could not be extracted.
36
-
37
- Returns:
38
- A dictionary of metadata extracted from the PDF.
39
- """
40
28
  try:
41
29
  document = parse(pdf_content, max_workers=1, password=password)
42
30
  metadata: Metadata = {}
@@ -247,7 +235,6 @@ def _collect_document_permissions(document: Document) -> list[str]:
247
235
 
248
236
 
249
237
  def _extract_structure_information(document: Document, result: Metadata) -> None:
250
- """Extract language and subtitle from document structure."""
251
238
  if document.structure:
252
239
  languages = set()
253
240
  subtitle = None
@@ -280,20 +267,6 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
280
267
 
281
268
 
282
269
  def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
283
- """Synchronous version of extract_pdf_metadata.
284
-
285
- Extract metadata from a PDF document without using async/await.
286
-
287
- Args:
288
- pdf_content: The bytes of the PDF document.
289
- password: Password for encrypted PDF files.
290
-
291
- Raises:
292
- ParsingError: If the PDF metadata could not be extracted.
293
-
294
- Returns:
295
- A dictionary of metadata extracted from the PDF.
296
- """
297
270
  try:
298
271
  document = parse(pdf_content, max_workers=1, password=password)
299
272
  metadata: Metadata = {}
kreuzberg/_registry.py CHANGED
@@ -28,14 +28,6 @@ if TYPE_CHECKING:
28
28
 
29
29
 
30
30
  class ExtractorRegistry:
31
- """Manages extractors for different MIME types and their configurations.
32
-
33
- This class provides functionality to register, unregister, and retrieve
34
- extractors based on MIME types. It supports both synchronous and asynchronous
35
- operations for managing extractors. A default set of extractors is also
36
- maintained alongside user-registered extractors.
37
- """
38
-
39
31
  _default_extractors: ClassVar[list[type[Extractor]]] = [
40
32
  PDFExtractor,
41
33
  OfficeDocumentExtractor,
@@ -59,15 +51,6 @@ class ExtractorRegistry:
59
51
  @classmethod
60
52
  @lru_cache
61
53
  def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
62
- """Gets the extractor for the mimetype.
63
-
64
- Args:
65
- mime_type: The mime type of the content.
66
- config: Extraction options object, defaults to the default object.
67
-
68
- Returns:
69
- The extractor
70
- """
71
54
  extractors: list[type[Extractor]] = [
72
55
  *cls._registered_extractors,
73
56
  *cls._default_extractors,
@@ -81,30 +64,11 @@ class ExtractorRegistry:
81
64
 
82
65
  @classmethod
83
66
  def add_extractor(cls, extractor: type[Extractor]) -> None:
84
- """Add an extractor to the registry.
85
-
86
- Note:
87
- Extractors are tried in the order they are added: first added, first tried.
88
-
89
- Args:
90
- extractor: The extractor to add.
91
-
92
- Returns:
93
- None
94
- """
95
67
  cls._registered_extractors.append(extractor)
96
68
  cls.get_extractor.cache_clear()
97
69
 
98
70
  @classmethod
99
71
  def remove_extractor(cls, extractor: type[Extractor]) -> None:
100
- """Remove an extractor from the registry.
101
-
102
- Args:
103
- extractor: The extractor to remove.
104
-
105
- Returns:
106
- None
107
- """
108
72
  try:
109
73
  cls._registered_extractors.remove(extractor)
110
74
  cls.get_extractor.cache_clear()