natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. docs/finetuning/index.md +176 -0
  2. docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
  3. natural_pdf/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/gemini.py +63 -47
  5. natural_pdf/collections/pdf_collection.py +5 -2
  6. natural_pdf/core/element_manager.py +6 -4
  7. natural_pdf/core/page.py +36 -27
  8. natural_pdf/core/pdf.py +25 -16
  9. natural_pdf/elements/base.py +1 -3
  10. natural_pdf/elements/collections.py +13 -14
  11. natural_pdf/elements/region.py +7 -6
  12. natural_pdf/exporters/__init__.py +4 -0
  13. natural_pdf/exporters/base.py +61 -0
  14. natural_pdf/exporters/paddleocr.py +345 -0
  15. natural_pdf/ocr/__init__.py +16 -8
  16. natural_pdf/ocr/engine.py +46 -30
  17. natural_pdf/ocr/engine_easyocr.py +81 -40
  18. natural_pdf/ocr/engine_paddle.py +39 -28
  19. natural_pdf/ocr/engine_surya.py +32 -16
  20. natural_pdf/ocr/ocr_factory.py +34 -23
  21. natural_pdf/ocr/ocr_manager.py +15 -11
  22. natural_pdf/ocr/ocr_options.py +5 -0
  23. natural_pdf/ocr/utils.py +46 -31
  24. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  25. natural_pdf/utils/debug.py +4 -2
  26. natural_pdf/utils/identifiers.py +9 -5
  27. natural_pdf/utils/packaging.py +172 -105
  28. natural_pdf/utils/text_extraction.py +44 -64
  29. natural_pdf/utils/visualization.py +1 -1
  30. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
  31. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
  32. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
  33. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
  34. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
@@ -13,14 +13,14 @@ class OCRFactory:
13
13
  @staticmethod
14
14
  def create_engine(engine_type: str, **kwargs) -> OCREngine:
15
15
  """Create and return an OCR engine instance.
16
-
16
+
17
17
  Args:
18
18
  engine_type: One of 'surya', 'easyocr', 'paddle'
19
19
  **kwargs: Arguments to pass to the engine constructor
20
-
20
+
21
21
  Returns:
22
22
  An initialized OCR engine
23
-
23
+
24
24
  Raises:
25
25
  ImportError: If the required dependencies aren't installed
26
26
  ValueError: If the engine_type is unknown
@@ -28,72 +28,83 @@ class OCRFactory:
28
28
  if engine_type == "surya":
29
29
  try:
30
30
  from .engine_surya import SuryaOCREngine
31
+
31
32
  return SuryaOCREngine(**kwargs)
32
33
  except ImportError:
33
- raise ImportError("Surya engine requires the 'surya' package. "
34
- "Install with: pip install surya")
34
+ raise ImportError(
35
+ "Surya engine requires the 'surya' package. " "Install with: pip install surya"
36
+ )
35
37
  elif engine_type == "easyocr":
36
38
  try:
37
39
  from .engine_easyocr import EasyOCREngine
40
+
38
41
  return EasyOCREngine(**kwargs)
39
42
  except ImportError:
40
- raise ImportError("EasyOCR engine requires the 'easyocr' package. "
41
- "Install with: pip install easyocr")
43
+ raise ImportError(
44
+ "EasyOCR engine requires the 'easyocr' package. "
45
+ "Install with: pip install easyocr"
46
+ )
42
47
  elif engine_type == "paddle":
43
48
  try:
44
49
  from .engine_paddle import PaddleOCREngine
50
+
45
51
  return PaddleOCREngine(**kwargs)
46
52
  except ImportError:
47
- raise ImportError("PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
48
- "Install with: pip install paddleocr paddlepaddle")
53
+ raise ImportError(
54
+ "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
55
+ "Install with: pip install paddleocr paddlepaddle"
56
+ )
49
57
  else:
50
58
  raise ValueError(f"Unknown engine type: {engine_type}")
51
-
59
+
52
60
  @staticmethod
53
61
  def list_available_engines() -> Dict[str, bool]:
54
62
  """Returns a dictionary of engine names and their availability status."""
55
63
  engines = {}
56
-
64
+
57
65
  # Check Surya
58
66
  try:
59
67
  engines["surya"] = importlib.util.find_spec("surya") is not None
60
68
  except ImportError:
61
69
  engines["surya"] = False
62
-
70
+
63
71
  # Check EasyOCR
64
72
  try:
65
73
  engines["easyocr"] = importlib.util.find_spec("easyocr") is not None
66
74
  except ImportError:
67
75
  engines["easyocr"] = False
68
-
76
+
69
77
  # Check PaddleOCR
70
78
  try:
71
- paddle = importlib.util.find_spec("paddle") is not None or importlib.util.find_spec("paddlepaddle") is not None
79
+ paddle = (
80
+ importlib.util.find_spec("paddle") is not None
81
+ or importlib.util.find_spec("paddlepaddle") is not None
82
+ )
72
83
  paddleocr = importlib.util.find_spec("paddleocr") is not None
73
84
  engines["paddle"] = paddle and paddleocr
74
85
  except ImportError:
75
86
  engines["paddle"] = False
76
-
87
+
77
88
  return engines
78
-
89
+
79
90
  @staticmethod
80
91
  def get_recommended_engine(**kwargs) -> OCREngine:
81
92
  """Returns the best available OCR engine based on what's installed.
82
-
93
+
83
94
  First tries engines in order of preference: EasyOCR, Paddle, Surya.
84
95
  If none are available, raises ImportError with installation instructions.
85
-
96
+
86
97
  Args:
87
98
  **kwargs: Arguments to pass to the engine constructor
88
-
99
+
89
100
  Returns:
90
101
  The best available OCR engine instance
91
-
102
+
92
103
  Raises:
93
104
  ImportError: If no engines are available
94
105
  """
95
106
  available = OCRFactory.list_available_engines()
96
-
107
+
97
108
  # Try engines in order of recommendation
98
109
  if available.get("easyocr", False):
99
110
  logger.info("Using EasyOCR engine (recommended)")
@@ -104,11 +115,11 @@ class OCRFactory:
104
115
  elif available.get("surya", False):
105
116
  logger.info("Using Surya OCR engine")
106
117
  return OCRFactory.create_engine("surya", **kwargs)
107
-
118
+
108
119
  # If we get here, no engines are available
109
120
  raise ImportError(
110
121
  "No OCR engines available. Please install at least one of: \n"
111
122
  "- EasyOCR (recommended): pip install easyocr\n"
112
123
  "- PaddleOCR: pip install paddleocr paddlepaddle\n"
113
124
  "- Surya OCR: pip install surya"
114
- )
125
+ )
@@ -65,7 +65,7 @@ class OCRManager:
65
65
  device: Optional[str] = None,
66
66
  detect_only: bool = False,
67
67
  # --- Engine-Specific Options ---
68
- options: Optional[Any] = None, # e.g. EasyOCROptions(), PaddleOCROptions()
68
+ options: Optional[Any] = None, # e.g. EasyOCROptions(), PaddleOCROptions()
69
69
  ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
70
70
  """
71
71
  Applies OCR to a single image or a batch of images.
@@ -100,7 +100,7 @@ class OCRManager:
100
100
  if not is_batch and not isinstance(images, Image.Image):
101
101
  raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
102
102
 
103
- # --- Determine Engine ---
103
+ # --- Determine Engine ---
104
104
  selected_engine_name = (engine or "easyocr").lower()
105
105
  if selected_engine_name not in self.ENGINE_REGISTRY:
106
106
  raise ValueError(
@@ -108,35 +108,39 @@ class OCRManager:
108
108
  )
109
109
  logger.debug(f"Selected engine: '{selected_engine_name}'")
110
110
 
111
- # --- Prepare Options ---
111
+ # --- Prepare Options ---
112
112
  final_options = copy.deepcopy(options) if options is not None else None
113
-
113
+
114
114
  # Type check options object if provided
115
115
  if final_options is not None:
116
- options_class = self.ENGINE_REGISTRY[selected_engine_name].get("options_class", BaseOCROptions)
116
+ options_class = self.ENGINE_REGISTRY[selected_engine_name].get(
117
+ "options_class", BaseOCROptions
118
+ )
117
119
  if not isinstance(final_options, options_class):
118
- # Allow dicts to be passed directly too, assuming engine handles them
120
+ # Allow dicts to be passed directly too, assuming engine handles them
119
121
  if not isinstance(final_options, dict):
120
- raise TypeError(
122
+ raise TypeError(
121
123
  f"Provided options type '{type(final_options).__name__}' is not compatible with engine '{selected_engine_name}'. Expected '{options_class.__name__}' or dict."
122
124
  )
123
125
 
124
- # --- Get Engine Instance and Process ---
126
+ # --- Get Engine Instance and Process ---
125
127
  try:
126
128
  engine_instance = self._get_engine_instance(selected_engine_name)
127
129
  processing_mode = "batch" if is_batch else "single image"
128
130
  logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")
129
- logger.debug(f" Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}")
131
+ logger.debug(
132
+ f" Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
133
+ )
130
134
 
131
135
  # Call the engine's process_image, passing common args and options object
132
136
  # **ASSUMPTION**: Engine process_image signatures are updated to accept these common args.
133
137
  results = engine_instance.process_image(
134
- images=images,
138
+ images=images,
135
139
  languages=languages,
136
140
  min_confidence=min_confidence,
137
141
  device=device,
138
142
  detect_only=detect_only,
139
- options=final_options
143
+ options=final_options,
140
144
  )
141
145
 
142
146
  # Log result summary based on mode
@@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
13
13
  @dataclass
14
14
  class BaseOCROptions:
15
15
  """Base class for OCR engine options."""
16
+
16
17
  extra_args: Dict[str, Any] = field(default_factory=dict)
17
18
 
18
19
 
@@ -20,6 +21,7 @@ class BaseOCROptions:
20
21
  @dataclass
21
22
  class EasyOCROptions(BaseOCROptions):
22
23
  """Specific options for the EasyOCR engine."""
24
+
23
25
  model_storage_directory: Optional[str] = None
24
26
  user_network_directory: Optional[str] = None
25
27
  recog_network: str = "english_g2"
@@ -64,6 +66,7 @@ class EasyOCROptions(BaseOCROptions):
64
66
  @dataclass
65
67
  class PaddleOCROptions(BaseOCROptions):
66
68
  """Specific options for the PaddleOCR engine."""
69
+
67
70
  use_angle_cls: bool = True
68
71
  use_gpu: Optional[bool] = None
69
72
  gpu_mem: int = 500
@@ -90,6 +93,7 @@ class PaddleOCROptions(BaseOCROptions):
90
93
 
91
94
  def __post_init__(self):
92
95
  pass
96
+
93
97
  # if self.use_gpu is None:
94
98
  # if self.device and "cuda" in self.device.lower():
95
99
  # self.use_gpu = True
@@ -102,6 +106,7 @@ class PaddleOCROptions(BaseOCROptions):
102
106
  @dataclass
103
107
  class SuryaOCROptions(BaseOCROptions):
104
108
  """Specific options for the Surya OCR engine."""
109
+
105
110
  # Currently, Surya example shows languages passed at prediction time.
106
111
  pass
107
112
 
natural_pdf/ocr/utils.py CHANGED
@@ -10,51 +10,71 @@ if TYPE_CHECKING:
10
10
 
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
+
13
14
  def _apply_ocr_correction_to_elements(
14
15
  elements: Iterable["Element"],
15
16
  correction_callback: Callable[[Any], Optional[str]],
17
+ caller_info: str = "Utility",
16
18
  ) -> None:
17
19
  """
18
- Applies correction callback to a list of elements in place,
20
+ Applies OCR correction callback to a list of elements in place,
19
21
  showing a progress bar.
20
22
 
21
- Iterates through elements, calls the callback, and updates
22
- element.text if a new string is returned.
23
-
23
+ Iterates through elements, checks if source starts with 'ocr', calls
24
+ the callback, and updates element.text if a new string is returned.
25
+
24
26
  Args:
25
27
  elements: An iterable of Element objects.
26
28
  correction_callback: A function accepting an element and returning
27
29
  Optional[str] (new text or None).
30
+ caller_info: String identifying the calling context for logs.
28
31
  """
32
+ if not callable(correction_callback):
33
+ # Raise error here so individual methods don't need to repeat the check
34
+ raise TypeError("`correction_callback` must be a callable function.")
35
+
36
+ if not elements:
37
+ logger.warning(f"{caller_info}: No elements provided for correction.")
38
+ return
39
+
29
40
  corrections_applied = 0
30
41
  elements_checked = 0
31
42
 
32
43
  # Prepare the iterable with tqdm
33
- element_iterable = tqdm(elements, desc=f"Correcting OCR", unit="element")
44
+ element_iterable = tqdm(elements, desc=f"Correcting OCR ({caller_info})", unit="element")
34
45
 
35
46
  for element in element_iterable:
36
47
  # Check if the element is likely from OCR and has text attribute
37
- element_source = getattr(element, 'source', None)
38
- if isinstance(element_source, str) and element_source.startswith('ocr') and hasattr(element, 'text'):
48
+ element_source = getattr(element, "source", None)
49
+ if (
50
+ isinstance(element_source, str)
51
+ and element_source.startswith("ocr")
52
+ and hasattr(element, "text")
53
+ ):
39
54
  elements_checked += 1
40
- current_text = getattr(element, 'text')
55
+ current_text = getattr(element, "text") # Already checked hasattr
41
56
 
42
57
  new_text = correction_callback(element)
43
58
 
44
59
  if new_text is not None:
45
60
  if new_text != current_text:
46
- element.text = new_text
61
+ element.text = new_text # Update in place
47
62
  corrections_applied += 1
48
63
 
49
- logger.info(f"OCR correction finished. Checked: {elements_checked}, Applied: {corrections_applied}")
64
+ logger.info(
65
+ f"{caller_info}: OCR correction finished. Checked: {elements_checked}, Applied: {corrections_applied}"
66
+ )
67
+ # No return value needed, modifies elements in place
50
68
 
51
69
 
52
- def direct_ocr_llm(element,
53
- client,
54
- model="",
55
- resolution=150,
56
- prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
57
- padding=2) -> str:
70
+ def direct_ocr_llm(
71
+ element,
72
+ client,
73
+ model="",
74
+ resolution=150,
75
+ prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
76
+ padding=2,
77
+ ) -> str:
58
78
  """Convenience method to directly OCR a region of the page."""
59
79
 
60
80
  if isinstance(element, TextElement):
@@ -65,34 +85,29 @@ def direct_ocr_llm(element,
65
85
  buffered = io.BytesIO()
66
86
  region_img = region.to_image(resolution=resolution, include_highlights=False)
67
87
  region_img.save(buffered, format="PNG")
68
- base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')
88
+ base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
69
89
 
70
90
  response = client.chat.completions.create(
71
91
  model=model,
72
92
  messages=[
73
93
  {
74
94
  "role": "system",
75
- "content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image."
95
+ "content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image.",
76
96
  },
77
97
  {
78
98
  "role": "user",
79
99
  "content": [
80
- {
81
- "type": "text",
82
- "text": prompt
83
- },
100
+ {"type": "text", "text": prompt},
84
101
  {
85
102
  "type": "image_url",
86
- "image_url": {
87
- "url": f"data:image/png;base64,{base64_image}"
88
- }
89
- }
90
- ]
91
- }
92
- ]
103
+ "image_url": {"url": f"data:image/png;base64,{base64_image}"},
104
+ },
105
+ ],
106
+ },
107
+ ],
93
108
  )
94
-
109
+
95
110
  corrected = response.choices[0].message.content
96
111
  logger.debug(f"Corrected {region.extract_text()} to {corrected}")
97
112
 
98
- return corrected
113
+ return corrected