kreuzberg 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. kreuzberg/__init__.py +4 -0
  2. kreuzberg/_api/main.py +22 -1
  3. kreuzberg/_config.py +404 -0
  4. kreuzberg/_entity_extraction.py +4 -5
  5. kreuzberg/_extractors/_base.py +3 -5
  6. kreuzberg/_extractors/_image.py +18 -32
  7. kreuzberg/_extractors/_pandoc.py +3 -14
  8. kreuzberg/_extractors/_pdf.py +39 -57
  9. kreuzberg/_extractors/_spread_sheet.py +2 -3
  10. kreuzberg/_extractors/_structured.py +10 -7
  11. kreuzberg/_gmft.py +314 -10
  12. kreuzberg/_language_detection.py +1 -1
  13. kreuzberg/_mcp/server.py +58 -8
  14. kreuzberg/_ocr/__init__.py +1 -22
  15. kreuzberg/_ocr/_base.py +59 -0
  16. kreuzberg/_ocr/_easyocr.py +92 -1
  17. kreuzberg/_ocr/_paddleocr.py +90 -1
  18. kreuzberg/_ocr/_tesseract.py +556 -5
  19. kreuzberg/_playa.py +2 -3
  20. kreuzberg/_types.py +46 -24
  21. kreuzberg/_utils/_cache.py +35 -4
  22. kreuzberg/_utils/_device.py +10 -20
  23. kreuzberg/_utils/_errors.py +44 -45
  24. kreuzberg/_utils/_process_pool.py +2 -6
  25. kreuzberg/_utils/_quality.py +7 -11
  26. kreuzberg/_utils/_serialization.py +21 -16
  27. kreuzberg/_utils/_string.py +22 -12
  28. kreuzberg/_utils/_table.py +3 -4
  29. kreuzberg/cli.py +4 -5
  30. kreuzberg/exceptions.py +10 -0
  31. kreuzberg/extraction.py +6 -24
  32. kreuzberg-3.8.2.dist-info/METADATA +265 -0
  33. kreuzberg-3.8.2.dist-info/RECORD +53 -0
  34. kreuzberg/_cli_config.py +0 -175
  35. kreuzberg/_multiprocessing/__init__.py +0 -5
  36. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  37. kreuzberg/_ocr/_pool.py +0 -357
  38. kreuzberg/_ocr/_sync.py +0 -566
  39. kreuzberg-3.8.0.dist-info/METADATA +0 -313
  40. kreuzberg-3.8.0.dist-info/RECORD +0 -57
  41. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
  42. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
  43. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,14 +1,20 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import hashlib
4
+ import io
5
+ import os
4
6
  import re
7
+ import subprocess
5
8
  import sys
9
+ import tempfile
6
10
  from dataclasses import dataclass
7
11
  from enum import Enum
12
+ from pathlib import Path
8
13
  from typing import TYPE_CHECKING, Any, ClassVar, Final
9
14
 
10
15
  from anyio import Path as AsyncPath
11
16
  from anyio import run_process
17
+ from typing_extensions import Self
12
18
 
13
19
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
14
20
  from kreuzberg._ocr._base import OCRBackend
@@ -19,8 +25,6 @@ from kreuzberg._utils._tmp import create_temp_file
19
25
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
20
26
 
21
27
  if TYPE_CHECKING:
22
- from pathlib import Path
23
-
24
28
  from PIL.Image import Image
25
29
 
26
30
  try: # pragma: no cover
@@ -189,7 +193,7 @@ class PSMMode(Enum):
189
193
  """Treat the image as a single character."""
190
194
 
191
195
 
192
- @dataclass(unsafe_hash=True, frozen=True)
196
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
193
197
  class TesseractConfig:
194
198
  """Configuration options for Tesseract OCR engine."""
195
199
 
@@ -232,8 +236,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
232
236
  image: Image,
233
237
  **kwargs: Unpack[TesseractConfig],
234
238
  ) -> ExtractionResult:
235
- import io
236
-
237
239
  from kreuzberg._utils._cache import get_ocr_cache
238
240
 
239
241
  image_buffer = io.BytesIO()
@@ -407,6 +409,223 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
407
409
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
408
410
  ) from e
409
411
 
412
def process_image_sync(
    self,
    image: Image,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Synchronously process an image and extract its text and metadata.

    The image is serialized to PNG exactly once: the resulting bytes are
    hashed to build the OCR cache key and are also written to the temporary
    file that is handed to ``process_file_sync``.

    Args:
        image: An instance of PIL.Image representing the input image.
        **kwargs: Any kwargs related to the given backend

    Returns:
        The extraction result object
    """
    from kreuzberg._utils._cache import get_ocr_cache

    image_buffer = io.BytesIO()
    image.save(image_buffer, format="PNG")
    image_content = image_buffer.getvalue()

    cache_kwargs = {
        "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted(kwargs.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = ocr_cache.get(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        event = ocr_cache.mark_processing(**cache_kwargs)
        event.wait()

        # Try cache again after waiting for other process to complete
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        self._validate_tesseract_version_sync()
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
            # Reuse the PNG bytes produced above instead of encoding the
            # image a second time.
            tmp_file.write(image_content)
            image_path = Path(tmp_file.name)
        try:
            result = self.process_file_sync(image_path, **kwargs)

            ocr_cache.set(result, **cache_kwargs)

            return result
        finally:
            # delete=False above means the file must be removed by hand.
            image_path.unlink(missing_ok=True)
    finally:
        ocr_cache.mark_complete(**cache_kwargs)
470
+
471
def process_file_sync(
    self,
    path: Path,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Synchronously process a file and extract its text and metadata.

    Args:
        path: A Path object representing the file to be processed.
        **kwargs: Any kwargs related to the given backend

    Returns:
        The extraction result object

    Raises:
        OCRError: If running tesseract or reading its output fails with a
            RuntimeError or OSError.
    """
    from kreuzberg._utils._cache import get_ocr_cache

    file_info = self._get_file_info(path)

    cache_kwargs = {
        "file_info": str(sorted(file_info.items())),
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted(kwargs.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = ocr_cache.get(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        event = ocr_cache.mark_processing(**cache_kwargs)
        event.wait()

        # Try cache again after waiting for other process to complete
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        self._validate_tesseract_version_sync()

        # Validate the options before creating the scratch file, so that a
        # rejected language code cannot leave a stray temp file behind.
        language = self._validate_language_code(kwargs.pop("language", "eng"))
        psm = kwargs.pop("psm", PSMMode.AUTO)

        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
            # Strip only the trailing suffix: str.replace would also remove
            # ".txt" from any directory component of the temp path.
            output_base = tmp_file.name.removesuffix(".txt")
        # tesseract appends ".txt" to the output base it is given.
        output_path = Path(output_base + ".txt")

        try:
            command = self._build_tesseract_command(path, output_base, language, psm, **kwargs)
            self._run_tesseract_sync(command)

            output = output_path.read_text(encoding="utf-8")
            extraction_result = ExtractionResult(
                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
            )

            # NOTE(review): this key always contains "language" and "psm",
            # while the lookup key above only contains what the caller
            # passed, so the two differ when either option is omitted.
            # Confirm whether that is intended.
            final_cache_kwargs = cache_kwargs.copy()
            final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
            ocr_cache.set(extraction_result, **final_cache_kwargs)

            return extraction_result
        except (RuntimeError, OSError) as e:
            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
            output_path.unlink(missing_ok=True)
    finally:
        ocr_cache.mark_complete(**cache_kwargs)
542
+
543
def _get_file_info(self, path: Path) -> dict[str, Any]:
    """Collect the path, size and modification time of a file for cache keys.

    Args:
        path: The file to describe.

    Returns:
        A dict with ``path``, ``size`` and ``mtime``. When the file cannot be
        inspected (OSError), the unresolved path is returned with ``size``
        and ``mtime`` set to 0.
    """
    fallback: dict[str, Any] = {"path": str(path), "size": 0, "mtime": 0}
    try:
        file_stat = path.stat()
        resolved = str(path.resolve())
    except OSError:
        return fallback
    return dict(path=resolved, size=file_stat.st_size, mtime=file_stat.st_mtime)
558
+
559
def _build_tesseract_command(
    self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
) -> list[str]:
    """Assemble the tesseract argument list for one input file.

    Args:
        path: The input file passed to tesseract.
        output_base: Output path passed to tesseract as its output base.
        language: Language code passed via ``-l``.
        psm: Page segmentation mode; its ``value`` is passed via ``--psm``.
        **kwargs: Extra options, each emitted as ``-c name=value``. Boolean
            values are rendered as 1 or 0.

    Returns:
        The command as a list of strings.
    """
    base_args = ["tesseract", str(path), output_base]
    fixed_flags = ["-l", language, "--psm", str(psm.value), "--oem", "1", "--loglevel", "OFF"]

    extra_flags: list[str] = []
    for name, setting in kwargs.items():
        # bool is checked explicitly so True/False become 1/0, not "True"/"False".
        rendered = int(setting) if isinstance(setting, bool) else setting
        extra_flags += ["-c", f"{name}={rendered}"]

    return [*base_args, *fixed_flags, *extra_flags]
582
+
583
def _run_tesseract_sync(self, command: list[str]) -> None:
    """Run a tesseract command synchronously.

    The child process gets a copy of the current environment; on Linux
    ``OMP_THREAD_LIMIT`` is set to ``"1"`` in that copy. The call uses a
    30 second timeout.

    Args:
        command: The full command line as a list of strings.

    Raises:
        OCRError: If the process exits with a non-zero return code; the
            captured stderr is attached as context.
    """
    child_env = dict(os.environ)
    if sys.platform.startswith("linux"):
        child_env["OMP_THREAD_LIMIT"] = "1"

    completed = subprocess.run(
        command,
        check=False,
        env=child_env,
        capture_output=True,
        text=True,
        timeout=30,
    )

    if completed.returncode == 0:
        return

    raise OCRError(
        "OCR failed with a non-0 return code.",
        context={"error": completed.stderr},
    )
603
+
604
@classmethod
def _validate_tesseract_version_sync(cls) -> None:
    """Synchronously validate that Tesseract is installed and is version 5 or above.

    The check runs ``tesseract --version`` and parses the major version from
    its stdout. A successful check is remembered on the class via
    ``_version_checked`` so later calls return immediately.

    Raises:
        MissingDependencyError: If Tesseract is not installed or is below version 5.
    """
    if cls._version_checked:
        return

    missing_message = "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."

    try:
        completed = subprocess.run(["tesseract", "--version"], capture_output=True, text=True, check=False)
    except FileNotFoundError as e:
        # The executable itself could not be found.
        raise MissingDependencyError(missing_message) from e

    found = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", completed.stdout)
    if found is None or int(found.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
        raise MissingDependencyError(missing_message)

    cls._version_checked = True
628
+
410
629
  @staticmethod
411
630
  def _validate_language_code(language_code: str) -> str:
412
631
  """Convert a language code to Tesseract format.
@@ -438,3 +657,335 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
438
657
  "supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
439
658
  },
440
659
  )
660
+
661
+
662
def _process_image_with_tesseract(
    image_path: str,
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    """Process a single image with Tesseract in a separate process.

    This function is designed to be executed in a subprocess.
    It uses direct tesseract command execution to avoid async complications.
    It never raises: any failure is reported through the returned dictionary.

    Args:
        image_path: Path to the image file.
        config_dict: Tesseract configuration as dictionary. ``language``
            (default ``"eng"``) and ``psm`` (default ``3``) are read
            directly; the options listed in ``boolean_options`` are passed
            as ``-c name=1`` or ``-c name=0`` when present.

    Returns:
        OCR result as dictionary with the keys ``success``, ``text``,
        ``confidence`` (always ``None`` here) and ``error``.
    """
    try:
        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
            # Strip only the trailing suffix: str.replace would also remove
            # ".txt" from any directory component of the temp path, which
            # would misplace the output file and skip its cleanup.
            output_base = tmp_file.name.removesuffix(".txt")
        # tesseract appends ".txt" to the output base it is given.
        output_path = Path(output_base + ".txt")

        try:
            language = config_dict.get("language", "eng")
            psm = config_dict.get("psm", 3)

            command = [
                "tesseract",
                image_path,
                output_base,
                "-l",
                language,
                "--psm",
                str(psm),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]

            boolean_options = [
                "classify_use_pre_adapted_templates",
                "language_model_ngram_on",
                "tessedit_dont_blkrej_good_wds",
                "tessedit_dont_rowrej_good_wds",
                "tessedit_enable_dict_correction",
                "tessedit_use_primary_params_model",
                "textord_space_size_is_variable",
                "thresholding_method",
            ]

            for option in boolean_options:
                if option in config_dict:
                    value = 1 if config_dict[option] else 0
                    command.extend(["-c", f"{option}={value}"])

            env = os.environ.copy()
            env["OMP_THREAD_LIMIT"] = "1"

            result = subprocess.run(
                command,
                check=False,
                env=env,
                capture_output=True,
                text=True,
                timeout=30,
            )

            if result.returncode != 0:
                # Caught by the outer handler and turned into a failure dict.
                raise RuntimeError(f"Tesseract failed with return code {result.returncode}: {result.stderr}")

            text = normalize_spaces(output_path.read_text(encoding="utf-8"))

            return {
                "success": True,
                "text": text,
                "confidence": None,
                "error": None,
            }

        finally:
            # delete=False above means the file must be removed by hand.
            output_path.unlink(missing_ok=True)

    except Exception as e:  # noqa: BLE001
        return {
            "success": False,
            "text": "",
            "confidence": None,
            "error": str(e),
        }
758
+
759
+
760
def _process_image_bytes_with_tesseract(
    image_bytes: bytes,
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    """Process image bytes with Tesseract in a separate process.

    The bytes are decoded with PIL, re-saved as a temporary PNG file and
    passed to ``_process_image_with_tesseract``. The temporary file is
    removed on every path, including when the bytes cannot be decoded.
    This function never raises: failures are reported in the result dict.

    Args:
        image_bytes: Image data as bytes.
        config_dict: Tesseract configuration as dictionary.

    Returns:
        OCR result as dictionary with the keys ``success``, ``text``,
        ``confidence`` and ``error``.
    """
    try:
        from PIL import Image

        # Only reserve a unique file name here; the handle is closed before
        # PIL writes to the path.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
            image_path = Path(tmp_image.name)

        try:
            with Image.open(io.BytesIO(image_bytes)) as image:
                image.save(str(image_path), format="PNG")
            return _process_image_with_tesseract(str(image_path), config_dict)
        finally:
            # Covers decode/save failures too, so the delete=False temp
            # file cannot be left behind.
            image_path.unlink(missing_ok=True)

    except Exception as e:  # noqa: BLE001
        return {
            "success": False,
            "text": "",
            "confidence": None,
            "error": str(e),
        }
795
+
796
+
797
class TesseractProcessPool:
    """Process pool for parallel Tesseract OCR processing.

    Work is delegated to a ``ProcessPoolManager``; configuration and results
    cross the process boundary as plain dictionaries.
    """

    def __init__(
        self,
        config: TesseractConfig | None = None,
        max_processes: int | None = None,
        memory_limit_gb: float | None = None,
    ) -> None:
        """Initialize the Tesseract process pool.

        Args:
            config: Default Tesseract configuration.
            max_processes: Maximum number of processes.
            memory_limit_gb: Memory limit in GB.
        """
        from kreuzberg._utils._process_pool import ProcessPoolManager

        self.config = config or TesseractConfig()
        self.process_manager = ProcessPoolManager(
            max_processes=max_processes,
            memory_limit_gb=memory_limit_gb,
        )

    def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
        """Convert TesseractConfig to dictionary for pickling.

        Args:
            config: Configuration to convert (uses the pool default if None).

        Returns:
            A mapping of every dataclass field name to its value. A value
            that exposes a ``value`` attribute (e.g. an enum member) is
            replaced by that attribute.
        """
        cfg = config or self.config
        return {
            field_name: getattr(field_value, "value", field_value)
            for field_name in cfg.__dataclass_fields__
            for field_value in (getattr(cfg, field_name),)
        }

    def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
        """Convert a worker result dictionary back to an ExtractionResult.

        Args:
            result_dict: Dictionary with ``success``, ``text``,
                ``confidence`` and ``error`` keys.

        Returns:
            The extraction result; ``confidence`` is stored in the metadata
            whenever the worker reported one.

        Raises:
            OCRError: If the worker reported a failure.
        """
        if not result_dict["success"]:
            raise OCRError(f"Tesseract processing failed: {result_dict['error']}")

        confidence = result_dict["confidence"]
        return ExtractionResult(
            content=result_dict["text"],
            mime_type=PLAIN_TEXT_MIME_TYPE,
            # Compare against None so a reported confidence of 0 is kept.
            metadata={"confidence": confidence} if confidence is not None else {},  # type: ignore[typeddict-unknown-key]
            chunks=[],
        )

    async def process_image(
        self,
        image_path: str | Path,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """Process a single image file with Tesseract.

        Args:
            image_path: Path to the image file.
            config: Tesseract configuration (uses default if None).

        Returns:
            OCR result.
        """
        config_dict = self._config_to_dict(config)

        task_memory_mb = 80

        result_dict = await self.process_manager.submit_task(
            _process_image_with_tesseract,
            str(image_path),
            config_dict,
            task_memory_mb=task_memory_mb,
        )

        return self._result_from_dict(result_dict)

    async def process_image_bytes(
        self,
        image_bytes: bytes,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """Process image bytes with Tesseract.

        Args:
            image_bytes: Image data as bytes.
            config: Tesseract configuration (uses default if None).

        Returns:
            OCR result.
        """
        config_dict = self._config_to_dict(config)

        # Memory estimate: twice the payload size plus 50 MB, at least 80 MB.
        image_size_mb = len(image_bytes) / 1024 / 1024
        task_memory_mb = max(80, image_size_mb * 2 + 50)

        result_dict = await self.process_manager.submit_task(
            _process_image_bytes_with_tesseract,
            image_bytes,
            config_dict,
            task_memory_mb=task_memory_mb,
        )

        return self._result_from_dict(result_dict)

    async def process_batch_images(
        self,
        image_paths: list[str | Path],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """Process a batch of images in parallel.

        Args:
            image_paths: List of image file paths.
            config: Tesseract configuration (uses default if None).
            max_concurrent: Maximum concurrent processes.

        Returns:
            List of OCR results in the same order as input.
        """
        if not image_paths:
            return []

        config_dict = self._config_to_dict(config)

        arg_batches = [(str(path), config_dict) for path in image_paths]

        task_memory_mb = 80

        result_dicts = await self.process_manager.submit_batch(
            _process_image_with_tesseract,
            arg_batches,
            task_memory_mb=task_memory_mb,
            max_concurrent=max_concurrent,
        )

        return [self._result_from_dict(result_dict) for result_dict in result_dicts]

    async def process_batch_bytes(
        self,
        image_bytes_list: list[bytes],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """Process a batch of image bytes in parallel.

        Args:
            image_bytes_list: List of image data as bytes.
            config: Tesseract configuration (uses default if None).
            max_concurrent: Maximum concurrent processes.

        Returns:
            List of OCR results in the same order as input.
        """
        if not image_bytes_list:
            return []

        config_dict = self._config_to_dict(config)

        arg_batches = [(image_bytes, config_dict) for image_bytes in image_bytes_list]

        # Memory estimate is based on the average payload size of the batch.
        avg_image_size_mb = sum(len(img) for img in image_bytes_list) / len(image_bytes_list) / 1024 / 1024
        task_memory_mb = max(80, avg_image_size_mb * 2 + 50)

        result_dicts = await self.process_manager.submit_batch(
            _process_image_bytes_with_tesseract,
            arg_batches,
            task_memory_mb=task_memory_mb,
            max_concurrent=max_concurrent,
        )

        return [self._result_from_dict(result_dict) for result_dict in result_dicts]

    def get_system_info(self) -> dict[str, Any]:
        """Get system information from the process manager."""
        return self.process_manager.get_system_info()

    def shutdown(self, wait: bool = True) -> None:
        """Shutdown the process pool."""
        self.process_manager.shutdown(wait=wait)

    async def __aenter__(self) -> Self:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Async context manager exit."""
        self.shutdown()
kreuzberg/_playa.py CHANGED
@@ -114,9 +114,8 @@ def _extract_keyword_metadata(pdf_info: dict[str, Any], result: Metadata) -> Non
114
114
  if keywords := pdf_info.get("keywords"):
115
115
  if isinstance(keywords, (str, bytes)):
116
116
  kw_str = decode_text(keywords)
117
- kw_list = [k.strip() for k in kw_str.split(",")]
118
- kw_list = [k.strip() for k in " ".join(kw_list).split(";")]
119
- result["keywords"] = [k for k in kw_list if k]
117
+ # Combine multiple operations into a single comprehension
118
+ result["keywords"] = [k.strip() for part in kw_str.replace(";", ",").split(",") if (k := part.strip())]
120
119
  elif isinstance(keywords, list):
121
120
  result["keywords"] = [decode_text(k) for k in keywords]
122
121