natural-pdf 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1009,8 +1009,7 @@ class PageCollection(Generic[P]):
1009
1009
  """
1010
1010
  Applies OCR to all pages within this collection using batch processing.
1011
1011
 
1012
- This delegates the work to the parent PDF object's `apply_ocr_to_pages`
1013
- method for efficiency. The OCR results (TextElements) are added directly
1012
+ This delegates the work to the parent PDF object's `apply_ocr` method for efficiency. The OCR results (TextElements) are added directly
1014
1013
  to the respective Page objects within this collection.
1015
1014
 
1016
1015
  Args:
@@ -1028,8 +1027,8 @@ class PageCollection(Generic[P]):
1028
1027
  Raises:
1029
1028
  RuntimeError: If pages in the collection lack a parent PDF object
1030
1029
  or if the parent PDF object lacks the required
1031
- `apply_ocr_to_pages` method.
1032
- (Propagates exceptions from PDF.apply_ocr_to_pages)
1030
+ `apply_ocr` method.
1031
+ (Propagates exceptions from PDF.apply_ocr)
1033
1032
  """
1034
1033
  if not self.pages:
1035
1034
  logger.warning("Cannot apply OCR to an empty PageCollection.")
@@ -1042,16 +1041,17 @@ class PageCollection(Generic[P]):
1042
1041
 
1043
1042
  parent_pdf = first_page._parent
1044
1043
 
1045
- if not hasattr(parent_pdf, 'apply_ocr_to_pages') or not callable(parent_pdf.apply_ocr_to_pages):
1046
- raise RuntimeError("Parent PDF object does not have the required 'apply_ocr_to_pages' method.")
1044
+ # Updated check for renamed method
1045
+ if not hasattr(parent_pdf, 'apply_ocr') or not callable(parent_pdf.apply_ocr):
1046
+ raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
1047
1047
 
1048
1048
  # Get the 0-based indices of the pages in this collection
1049
1049
  page_indices = [p.index for p in self.pages]
1050
1050
 
1051
1051
  logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
1052
1052
 
1053
- # Delegate the batch call to the parent PDF object
1054
- parent_pdf.apply_ocr_to_pages(
1053
+ # Delegate the batch call to the parent PDF object (using renamed method)
1054
+ parent_pdf.apply_ocr(
1055
1055
  pages=page_indices,
1056
1056
  engine=engine,
1057
1057
  options=options,
@@ -18,7 +18,7 @@ class Region(DirectionalMixin):
18
18
  Represents a rectangular region on a page.
19
19
  """
20
20
 
21
- def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None):
21
+ def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None, label: Optional[str] = None):
22
22
  """
23
23
  Initialize a region.
24
24
 
@@ -27,6 +27,7 @@ class Region(DirectionalMixin):
27
27
  bbox: Bounding box as (x0, top, x1, bottom)
28
28
  polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
29
29
  parent: Optional parent region (for hierarchical document structure)
30
+ label: Optional label for the region (e.g., for exclusions)
30
31
  """
31
32
  self._page = page
32
33
  self._bbox = bbox
@@ -49,6 +50,7 @@ class Region(DirectionalMixin):
49
50
  # Region management attributes
50
51
  self.name = None
51
52
  self.source = None # Will be set by creation methods
53
+ self.label = label
52
54
 
53
55
  # Hierarchy support for nested document structure
54
56
  self.parent_region = parent
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,252 @@
1
+ """
2
+ Module for exporting PDF content to various formats.
3
+ """
4
+
5
+ import logging
6
+ import os
7
+ import tempfile
8
+ from typing import TYPE_CHECKING, List, Dict, Any, Tuple
9
+
10
+ # Lazy imports for optional dependencies
11
+ try:
12
+ from PIL import Image
13
+ except ImportError:
14
+ Image = None # type: ignore
15
+
16
+ try:
17
+ import pikepdf
18
+ except ImportError:
19
+ pikepdf = None # type: ignore
20
+
21
+ try:
22
+ from ocrmypdf.hocrtransform import HocrTransform
23
+ except ImportError:
24
+ HocrTransform = None # type: ignore
25
+
26
+ if TYPE_CHECKING:
27
+ from natural_pdf.core.pdf import PDF
28
+ from natural_pdf.core.page import Page
29
+
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
# --- Constants ---
# String templates used to assemble a minimal hOCR (XHTML) document.
# A document is HEADER + PAGE + zero or more WORD spans + FOOTER; the
# placeholders in braces are filled with ``str.format``.

# Document prologue: XML declaration, XHTML doctype, <head> metadata that
# names 'natural-pdf' as the OCR system, and the opening <body> tag.
HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='natural-pdf' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
'''

# Opening tag of the page container.
# Placeholders: page_num, image_path, width, height (bbox is "0 0 width height").
HOCR_TEMPLATE_PAGE = ''' <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
'''

# One recognised word.
# Placeholders: page_num, word_id, x0, y0, x1, y1, confidence, text.
# `text` is inserted verbatim, so it must already be XML-escaped by the caller.
HOCR_TEMPLATE_WORD = ''' <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
'''

# Opening/closing tags for a text line (placeholders: page_num, line_id,
# x0, y0, x1, y1). Not referenced by the functions visible in this part of
# the module; presumably reserved for the line-grouping TODO -- confirm.
HOCR_TEMPLATE_LINE_START = ''' <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}'>
'''
HOCR_TEMPLATE_LINE_END = ''' </span>
'''

# Closes the page <div> opened by HOCR_TEMPLATE_PAGE, then <body> and <html>.
HOCR_TEMPLATE_FOOTER = ''' </div>
</body>
</html>
'''
# --- End Constants ---
63
+
64
+
65
def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -> str:
    """
    Generates an hOCR string for a given Page object based on its OCR elements.

    Every element matched by the ``text[source=ocr]`` selector is emitted as
    one ``ocrx_word`` span placed directly inside the ``ocr_page`` div; words
    are not grouped into lines (see the TODO below).

    Args:
        page: The Page object containing OCR elements (TextElements).
        image_width: The width of the rendered image for coordinate scaling.
        image_height: The height of the rendered image for coordinate scaling.

    Returns:
        An hOCR XML string. If the page has no OCR elements a warning is
        logged and a minimal document containing only an empty page div is
        returned (no exception is raised).
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from xml.sax.saxutils import escape

    # find_all returns an ElementCollection; `.elements` is the underlying list.
    ocr_elements = page.find_all('text[source=ocr]').elements

    # image_path is left empty; the image is supplied separately to the
    # hOCR-to-PDF step by the caller.
    page_div = HOCR_TEMPLATE_PAGE.format(
        page_num=page.index, image_path="", width=image_width, height=image_height
    )

    if not ocr_elements:
        logger.warning(f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from.")
        # Return minimal valid hOCR for an empty page
        return HOCR_TEMPLATE_HEADER + page_div + HOCR_TEMPLATE_FOOTER

    # TODO: group words geometrically into lines (ocr_line spans) if needed.
    # For now words are written out individually in the order find_all
    # returned them.

    # Scale factors from PDF points (page dims) to image pixels (rendered image dims).
    # Note: assumes OCR element coordinates are in PDF points (page.width/height).
    # NOTE(review): Region bboxes in this package are documented as
    # (x0, top, x1, bottom); confirm OCR elements expose y0/y1 measured from
    # the top of the page, which is what hOCR expects.
    scale_x = image_width / page.width if page.width > 0 else 1
    scale_y = image_height / page.height if page.height > 0 else 1

    # C0 control characters other than TAB/LF/CR are not legal in XML 1.0,
    # even when escaped; a single one would make the whole file unparseable,
    # so they are dropped from the word text.
    xml_illegal = {code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)}

    # Collect fragments and join once instead of growing a string with `+=`.
    parts: List[str] = [HOCR_TEMPLATE_HEADER, page_div]

    for word_id, word in enumerate(ocr_elements):
        # Scale to image pixels and clamp into the image; the upper corner is
        # additionally kept >= the lower corner so a box can never be inverted.
        img_x0 = min(max(0, int(word.x0 * scale_x)), image_width)
        img_y0 = min(max(0, int(word.y0 * scale_y)), image_height)
        img_x1 = min(max(img_x0, int(word.x1 * scale_x)), image_width)
        img_y1 = min(max(img_y0, int(word.y1 * scale_y)), image_height)

        # Tolerate elements whose text is None, strip XML-illegal control
        # characters, then escape &, < and > for use as element content.
        text = escape((word.text or "").translate(xml_illegal))

        # Confidence is assumed to be a 0-1 fraction; a missing *or* None
        # value falls back to 0.99. hOCR's x_wconf is an integer percentage,
        # so the result is clamped to 0-100.
        raw_confidence = getattr(word, 'confidence', None)
        if raw_confidence is None:
            raw_confidence = 0.99
        confidence = min(100, max(0, int(raw_confidence * 100)))

        parts.append(HOCR_TEMPLATE_WORD.format(
            page_num=page.index,
            word_id=word_id,
            x0=img_x0,
            y0=img_y0,
            x1=img_x1,
            y1=img_y1,
            confidence=confidence,
            text=text,
        ))
        parts.append("\n")  # Add newline for readability

    parts.append(HOCR_TEMPLATE_FOOTER)
    return "".join(parts)
158
+
159
+
160
def _render_searchable_page(page: 'Page', page_idx: int, tmpdir: str, dpi: int) -> str:
    """Render one page to a single-page searchable PDF in *tmpdir* and return its path."""
    page_base_name = f"page_{page_idx}"
    img_path = os.path.join(tmpdir, f"{page_base_name}.png")  # Use PNG for potentially better quality
    hocr_path = os.path.join(tmpdir, f"{page_base_name}.hocr")
    pdf_page_path = os.path.join(tmpdir, f"{page_base_name}.pdf")

    # 1. Render page image at target DPI
    logger.debug(f" Rendering page {page_idx} to image ({dpi} DPI)...")
    pil_image = page.to_image(resolution=dpi, include_highlights=False)
    pil_image.save(img_path, format='PNG')
    img_width, img_height = pil_image.size
    logger.debug(f" Image saved to {img_path} ({img_width}x{img_height})")

    # 2. Generate hOCR sized to the rendered image
    logger.debug(" Generating hOCR...")
    hocr_content = _generate_hocr_for_page(page, img_width, img_height)
    with open(hocr_path, 'w', encoding='utf-8') as f:
        f.write(hocr_content)
    logger.debug(f" hOCR saved to {hocr_path}")

    # 3. Use HocrTransform to create searchable PDF page
    logger.debug(" Running HocrTransform...")
    hocr_transform = HocrTransform(hocr_filename=hocr_path, dpi=dpi)
    # Pass image_filename explicitly
    hocr_transform.to_pdf(out_filename=pdf_page_path, image_filename=img_path)
    logger.debug(f" Temporary PDF page saved to {pdf_page_path}")
    return pdf_page_path


def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
    """
    Creates a searchable PDF from a natural_pdf.PDF object using OCR results.

    Each page is rendered to a PNG, an hOCR file is generated from the page's
    OCR elements, ocrmypdf's HocrTransform turns the pair into a one-page
    PDF, and the one-page PDFs are merged with pikepdf. All intermediate
    files live in a temporary directory that is removed afterwards.

    Pages that fail to process are logged and skipped (best effort), so the
    output may contain fewer pages than the input.

    Args:
        pdf_object: The natural_pdf.PDF instance (OCR should have been run).
        output_path: The path to save the resulting searchable PDF.
        dpi: The resolution (dots per inch) for rendering page images and hOCR.

    Raises:
        ImportError: If Pillow, pikepdf or ocrmypdf could not be imported.
        RuntimeError: If no page could be processed, or if merging/saving the
            final PDF fails (the original exception is chained).
    """
    # Local import: ExitStack is only needed for the merge step below.
    import contextlib

    # --- Ensure dependencies are loaded (they should be if installed) ---
    if Image is None or pikepdf is None or HocrTransform is None:
        # This should ideally not happen if dependencies are in main install,
        # but serves as a safeguard during development or if install is broken.
        raise ImportError(
            "Required dependencies (Pillow, pikepdf, ocrmypdf) are missing. "
            "Please ensure natural-pdf is installed correctly with all dependencies."
        )
    # --- End Safeguard Check ---

    logger.info(f"Starting searchable PDF creation for '{pdf_object.source_path}' -> '{output_path}' at {dpi} DPI.")

    temp_pdf_pages: List[str] = []
    skipped_pages = 0
    output_abs_path = os.path.abspath(output_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        logger.debug(f"Using temporary directory: {tmpdir}")

        for i, page in enumerate(pdf_object.pages):
            logger.debug(f"Processing page {page.number} (index {i})...")
            try:
                temp_pdf_pages.append(_render_searchable_page(page, i, tmpdir, dpi))
            except Exception as e:
                # Deliberate best effort: one bad page must not abort the export.
                logger.error(f" Failed to process page {page.number}: {e}", exc_info=True)
                logger.warning(f" Skipping page {page.number} due to error.")
                skipped_pages += 1

        # 4. Merge temporary PDF pages
        if not temp_pdf_pages:
            logger.error("No pages were successfully processed. Cannot create output PDF.")
            raise RuntimeError("Failed to process any pages for searchable PDF creation.")

        if skipped_pages:
            logger.warning(f"{skipped_pages} page(s) were skipped and will be missing from the output PDF.")

        logger.info(f"Merging {len(temp_pdf_pages)} processed pages into final PDF...")
        try:
            # Every source PDF stays open until the merged file has been
            # written: pages copied between pikepdf documents may still read
            # data from their source at save time. The ExitStack closes all
            # sources afterwards, before the temporary directory is deleted.
            with contextlib.ExitStack() as open_sources, pikepdf.Pdf.new() as output_pdf:
                for temp_pdf_path in temp_pdf_pages:
                    src_page_pdf = open_sources.enter_context(pikepdf.Pdf.open(temp_pdf_path))
                    # Assuming each temp PDF has exactly one page
                    if len(src_page_pdf.pages) == 1:
                        output_pdf.pages.append(src_page_pdf.pages[0])
                    else:
                        logger.warning(f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping.")
                output_pdf.save(output_abs_path)
            logger.info(f"Successfully saved merged searchable PDF to: {output_abs_path}")
        except Exception as e:
            logger.error(f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True)
            raise RuntimeError(f"Failed to save final PDF: {e}") from e

    logger.debug("Temporary directory cleaned up.")
@@ -0,0 +1,94 @@
1
+ """Makes search functionality easily importable and provides factory functions."""
2
+
3
+ import logging
4
+ from typing import Optional
5
+
6
+ # --- Service Implementation Import ---
7
+ # Import the concrete implementation
8
+ from .haystack_search_service import HaystackSearchService
9
+
10
+ # --- Protocol Import ---
11
+ # Import the protocol for type hinting
12
+ from .search_service_protocol import (
13
+ SearchServiceProtocol,
14
+ IndexConfigurationError,
15
+ Indexable
16
+ )
17
+
18
+ # --- Option Imports (for convenience) ---
19
+ # Make options easily available via `from natural_pdf.search import ...`
20
+ from .search_options import (
21
+ BaseSearchOptions,
22
+ SearchOptions, # Alias for TextSearchOptions for simplicity?
23
+ TextSearchOptions,
24
+ MultiModalSearchOptions
25
+ )
26
+ # --- Utils Import ---
27
+ from .haystack_utils import HAS_HAYSTACK_EXTRAS, check_haystack_availability # Re-export flag and helper
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # --- Factory Function ---
32
+
33
def get_search_service(
    collection_name: str,
    persist: bool = False,
    default_persist_path: Optional[str] = None,
    default_embedding_model: Optional[str] = None,
) -> SearchServiceProtocol:
    """
    Build a search service bound to a single collection.

    This is the factory entry point for search. At present it always
    constructs a new HaystackSearchService; the indirection exists so other
    implementations can be plugged in later.

    Args:
        collection_name: Name of the collection the returned service manages.
        persist: True configures the service for persistent storage
            (ChromaDB); False (the default) keeps it in memory.
        default_persist_path: Optional override for the persistent storage
            path. Only forwarded to the service when not None.
        default_embedding_model: Optional override for the embedding model.
            Only forwarded to the service when not None.

    Returns:
        A new object conforming to SearchServiceProtocol for the collection.

    Raises:
        ImportError: If the service cannot be constructed because of missing
            dependencies (the original ImportError is chained).
        RuntimeError: If construction fails for any other reason (the
            original exception is chained).
    """
    logger.debug(f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})...")

    # Only one implementation exists, so assemble HaystackSearchService's
    # constructor arguments directly. Overrides left at None are omitted so
    # the service falls back to its own defaults.
    optional_overrides = {
        'default_persist_path': default_persist_path,
        'default_embedding_model': default_embedding_model,
    }
    init_kwargs = {'collection_name': collection_name, 'persist': persist}
    init_kwargs.update(
        {key: value for key, value in optional_overrides.items() if value is not None}
    )

    # TODO: Implement caching/registry if needed to return the same instance
    # for the same configuration instead of always creating a new one.

    try:
        created_service = HaystackSearchService(**init_kwargs)
    except ImportError as e:
        logger.error(f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True)
        raise ImportError("Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]") from e
    except Exception as e:
        logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
        raise RuntimeError("Could not create Search Service instance.") from e
    else:
        logger.info(f"Created new HaystackSearchService instance for collection '{collection_name}'.")
        return created_service
88
+
89
+ # --- Optional: Define a default instance for extreme ease of use? ---
90
+ # try:
91
+ # default_search_service = get_search_service()
92
+ # except Exception:
93
+ # default_search_service = None
94
+ # logger.warning("Could not create default search service instance on import.")