natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,14 @@ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_t
13
13
 
14
14
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
15
15
 
16
+ # --- Classification Imports --- #
17
+ from natural_pdf.classification.mixin import ClassificationMixin
18
+ from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
19
+ # --- End Classification Imports --- #
20
+
21
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
22
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
23
+
16
24
  if TYPE_CHECKING:
17
25
  from natural_pdf.core.page import Page
18
26
  from natural_pdf.elements.text import TextElement
@@ -27,7 +35,7 @@ except ImportError:
27
35
  logger = logging.getLogger(__name__)
28
36
 
29
37
 
30
- class Region(DirectionalMixin):
38
+ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
31
39
  """
32
40
  Represents a rectangular region on a page.
33
41
  """
@@ -57,6 +65,12 @@ class Region(DirectionalMixin):
57
65
  self.start_element = None
58
66
  self.end_element = None
59
67
 
68
+ # --- ADDED --- Metadata store for mixins
69
+ self.metadata: Dict[str, Any] = {}
70
+ # --- NEW --- Central registry for analysis results
71
+ self.analyses: Dict[str, Any] = {}
72
+ # --- END ADDED ---
73
+
60
74
  # Standard attributes for all elements
61
75
  self.object_type = "region" # For selector compatibility
62
76
 
@@ -600,6 +614,18 @@ class Region(DirectionalMixin):
600
614
  x1 = int(self.x1 * scale_factor)
601
615
  bottom = int(self.bottom * scale_factor)
602
616
 
617
+ # Ensure coords are valid for cropping (left < right, top < bottom)
618
+ if x0 >= x1:
619
+ logger.warning(
620
+ f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
621
+ )
622
+ return None
623
+ if top >= bottom:
624
+ logger.warning(
625
+ f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
626
+ )
627
+ return None
628
+
603
629
  # Crop the image to just this region
604
630
  region_image = page_image.crop((x0, top, x1, bottom))
605
631
 
@@ -776,11 +802,6 @@ class Region(DirectionalMixin):
776
802
  debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
777
803
  logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
778
804
 
779
- # --- Handle Docling source (priority) --- DEPRECATED or Adapt?
780
- # For now, let's bypass this and always use the standard extraction flow
781
- # based on contained elements to ensure consistency.
782
- # if self.model == 'docling' or hasattr(self, 'text_content'): ...
783
-
784
805
  # 1. Get Word Elements potentially within this region (initial broad phase)
785
806
  # Optimization: Could use spatial query if page elements were indexed
786
807
  page_words = self.page.words # Get all words from the page
@@ -829,7 +850,7 @@ class Region(DirectionalMixin):
829
850
  result = generate_text_layout(
830
851
  char_dicts=filtered_chars,
831
852
  layout_context_bbox=self.bbox, # Use region's bbox for context
832
- user_kwargs=kwargs,
853
+ user_kwargs=kwargs, # Pass original kwargs to layout generator
833
854
  )
834
855
 
835
856
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -1084,11 +1105,14 @@ class Region(DirectionalMixin):
1084
1105
  filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1085
1106
  return ElementCollection(filtered_elements)
1086
1107
 
1087
- def apply_ocr(self, **ocr_params) -> "Region":
1108
+ def apply_ocr(self, replace=True, **ocr_params) -> "Region":
1088
1109
  """
1089
1110
  Apply OCR to this region and return the created text elements.
1090
1111
 
1091
1112
  Args:
1113
+ replace: If True (default), removes existing OCR elements in the region
1114
+ before adding new ones. If False, adds new OCR elements without
1115
+ removing existing ones.
1092
1116
  **ocr_params: Keyword arguments passed to the OCR Manager.
1093
1117
  Common parameters like `engine`, `languages`, `min_confidence`,
1094
1118
  `device`, and `resolution` (for image rendering) should be
@@ -1098,12 +1122,28 @@ class Region(DirectionalMixin):
1098
1122
  an `options` object (e.g., `options=EasyOCROptions(...)`).
1099
1123
 
1100
1124
  Returns:
1101
- List of created TextElement objects representing OCR words/lines.
1125
+ Self for method chaining.
1102
1126
  """
1103
1127
  # Ensure OCRManager is available
1104
1128
  if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
1105
1129
  logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
1106
- return []
1130
+ return self
1131
+
1132
+ # If replace is True, find and remove existing OCR elements in this region
1133
+ if replace:
1134
+ logger.info(f"Region {self.bbox}: Removing existing OCR elements before applying new OCR.")
1135
+ # Find all OCR elements in this region
1136
+ ocr_selector = "text[source=ocr]"
1137
+ ocr_elements = self.find_all(ocr_selector)
1138
+
1139
+ if ocr_elements:
1140
+ logger.info(f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove.")
1141
+ # Remove these elements from their page
1142
+ removed_count = ocr_elements.remove()
1143
+ logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
1144
+ else:
1145
+ logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
1146
+
1107
1147
  ocr_mgr = self.page._parent._ocr_manager
1108
1148
 
1109
1149
  # Determine rendering resolution from parameters
@@ -1123,11 +1163,11 @@ class Region(DirectionalMixin):
1123
1163
  )
1124
1164
  if not region_image:
1125
1165
  logger.error("Failed to render region to image for OCR.")
1126
- return []
1166
+ return self
1127
1167
  logger.debug(f"Region rendered to image size: {region_image.size}")
1128
1168
  except Exception as e:
1129
1169
  logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
1130
- return []
1170
+ return self
1131
1171
 
1132
1172
  # Prepare args for the OCR Manager
1133
1173
  manager_args = {
@@ -1148,11 +1188,11 @@ class Region(DirectionalMixin):
1148
1188
  logger.error(
1149
1189
  f"OCRManager returned unexpected type for single region image: {type(results)}"
1150
1190
  )
1151
- return []
1191
+ return self
1152
1192
  logger.debug(f"Region OCR processing returned {len(results)} results.")
1153
1193
  except Exception as e:
1154
1194
  logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
1155
- return []
1195
+ return self
1156
1196
 
1157
1197
  # Convert results to TextElements
1158
1198
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -1719,7 +1759,7 @@ class Region(DirectionalMixin):
1719
1759
  """
1720
1760
  # Find OCR elements specifically within this region
1721
1761
  # Note: We typically want to correct even if the element falls in an excluded area
1722
- target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
1762
+ target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
1723
1763
 
1724
1764
  # Delegate to the utility function
1725
1765
  _apply_ocr_correction_to_elements(
@@ -1729,3 +1769,47 @@ class Region(DirectionalMixin):
1729
1769
  )
1730
1770
 
1731
1771
  return self # Return self for chaining
1772
+
1773
+ # --- Classification Mixin Implementation --- #
1774
+ def _get_classification_manager(self) -> "ClassificationManager":
1775
+ if not hasattr(self, 'page') or not hasattr(self.page, 'pdf') or not hasattr(self.page.pdf, 'get_manager'):
1776
+ raise AttributeError("ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing.")
1777
+ try:
1778
+ # Use the PDF's manager registry accessor via page
1779
+ return self.page.pdf.get_manager('classification')
1780
+ except (ValueError, RuntimeError, AttributeError) as e:
1781
+ # Wrap potential errors from get_manager for clarity
1782
+ raise AttributeError(f"Failed to get ClassificationManager from PDF via Page: {e}") from e
1783
+
1784
+ def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
1785
+ if model_type == 'text':
1786
+ text_content = self.extract_text(layout=False) # Simple join for classification
1787
+ if not text_content or text_content.isspace():
1788
+ raise ValueError("Cannot classify region with 'text' model: No text content found.")
1789
+ return text_content
1790
+ elif model_type == 'vision':
1791
+ # Get resolution from manager/kwargs if possible, else default
1792
+ # We access manager via the method to ensure it's available
1793
+ manager = self._get_classification_manager()
1794
+ default_resolution = 150 # Manager doesn't store default res, set here
1795
+ # Note: classify() passes resolution via **kwargs if user specifies
1796
+ resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
1797
+
1798
+ img = self.to_image(
1799
+ resolution=resolution,
1800
+ include_highlights=False, # No highlights for classification input
1801
+ crop_only=True # Just the region content
1802
+ )
1803
+ if img is None:
1804
+ raise ValueError("Cannot classify region with 'vision' model: Failed to render image.")
1805
+ return img
1806
+ else:
1807
+ raise ValueError(f"Unsupported model_type for classification: {model_type}")
1808
+
1809
+ def _get_metadata_storage(self) -> Dict[str, Any]:
1810
+ # Ensure metadata exists
1811
+ if not hasattr(self, 'metadata') or self.metadata is None:
1812
+ self.metadata = {}
1813
+ return self.metadata
1814
+
1815
+ # --- End Classification Mixin Implementation --- #
@@ -274,9 +274,12 @@ class TextElement(Element):
274
274
 
275
275
  return False
276
276
 
277
- def __repr__(self) -> str:
277
+ def __repr__(self) -> str:
278
278
  """String representation of the text element."""
279
- preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
279
+ if self.text:
280
+ preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
281
+ else:
282
+ preview = "..."
280
283
  font_style = []
281
284
  if self.bold:
282
285
  font_style.append("bold")
@@ -48,7 +48,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
48
48
  selector: CSS-like selector to filter which TextElements to export.
49
49
  If None and corrected_only is False, all 'text' elements are considered.
50
50
  corrected_only: If True, overrides selector and exports only elements likely
51
- originating from a correction manifest (selector="text[source^=manifest]").
51
+ originating from a correction manifest (selector="text[source=manifest]").
52
52
  (default: False).
53
53
  split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
54
54
  If None, creates a single `label.txt` file (default: 0.9).
@@ -0,0 +1,134 @@
1
+ import logging
2
+ from typing import Any, Type, Optional
3
+ from pydantic import BaseModel
4
+ import io
5
+ import base64
6
+ from PIL import Image
7
+
8
+ from natural_pdf.extraction.result import StructuredDataResult
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class StructuredDataManager:
    """
    Manages the process of extracting structured data from elements using LLMs.

    This manager is typically accessed via `pdf.get_manager('structured_data')`.
    It is stateless and relies on parameters passed during method calls.
    """

    # Model identifiers used when the caller does not pass `model`.
    DEFAULT_TEXT_MODEL = "gpt-4o-mini"
    DEFAULT_VISION_MODEL = "gpt-4o"

    def __init__(self):
        """Initializes the manager."""
        logger.info("Initialized StructuredDataManager.")

    def is_available(self) -> bool:
        """Checks if necessary dependencies are available."""
        try:
            import pydantic  # noqa: F401 -- imported only to probe availability
            return True
        except ImportError:
            logger.warning("Pydantic is required for structured data extraction.")
            return False

    def _prepare_llm_messages(
        self,
        content: Any,
        prompt: Optional[str],
        using: str,
        schema: Type[BaseModel]
    ) -> list:
        """
        Prepares the message list for the LLM API call.

        Args:
            content: Text (converted with `str()`) for using='text', or a
                PIL image for using='vision'.
            prompt: System prompt; a default mentioning the schema name is
                used when this is None or empty.
            using: 'text' or 'vision'.
            schema: Pydantic model class; only its `__name__` is used here.

        Returns:
            A list with one system message followed by one user message.

        Raises:
            TypeError: If using='vision' and `content` is not a PIL image.
            ValueError: If `using` is neither 'text' nor 'vision'.
        """
        system_prompt = prompt or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."

        messages = [
            {"role": "system", "content": system_prompt}
        ]

        if using == 'text':
            messages.append({"role": "user", "content": str(content)})
        elif using == 'vision':
            if not isinstance(content, Image.Image):
                raise TypeError(f"Content must be a PIL Image for using='vision', got {type(content)}")

            # Embed the image inline as a base64 PNG data URL.
            buffered = io.BytesIO()
            content.save(buffered, format="PNG")
            base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
            messages.append({
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract information from this image based on the schema."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            })
        else:
            raise ValueError(f"Unsupported value for 'using': {using}")

        return messages

    def extract(
        self,
        content: Any,
        schema: Type[BaseModel],
        client: Any,
        prompt: Optional[str] = None,
        using: str = 'text',
        model: Optional[str] = None,
        **kwargs
    ) -> StructuredDataResult:
        """
        Extract structured data from content using an LLM.

        Args:
            content: Text string or Image object
            schema: Pydantic model class for the desired structure
            client: Initialized LLM client (e.g., OpenAI client)
            prompt: Optional user-provided instructions
            using: Modality ('text' or 'vision')
            model: Specific LLM model identifier
            **kwargs: Additional parameters for the LLM API call

        Returns:
            StructuredDataResult object. Failures of the LLM call itself are
            reported through `success=False` / `error_message` rather than
            raised; `model_used` records the model that was selected.

        Raises:
            NotImplementedError: If using='vision' and `content` is a list
                with more than one item.
            TypeError, ValueError: Propagated from message preparation
                (wrong content type for vision, unsupported `using`).
        """
        logger.debug(f"Extract request: using='{using}', schema='{schema.__name__}'")

        if isinstance(content, list) and using == 'vision':
            if len(content) == 1:
                content = content[0]
            elif len(content) > 1:
                logger.error("Vision extraction not supported for multi-page PDFs")
                raise NotImplementedError("Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead.")

        selected_model = model or (self.DEFAULT_VISION_MODEL if using == 'vision' else self.DEFAULT_TEXT_MODEL)
        messages = self._prepare_llm_messages(content, prompt, using, schema)

        try:
            logger.debug(f"Extracting with model '{selected_model}'")
            completion = client.beta.chat.completions.parse(
                model=selected_model,
                messages=messages,
                response_format=schema,
                **kwargs
            )
            parsed_data = completion.choices[0].message.parsed
            # The result model's field is `model_used`; an unknown keyword
            # such as `model=` would not populate it.
            return StructuredDataResult(
                data=parsed_data,
                success=True,
                error_message=None,
                model_used=selected_model
            )
        except Exception as e:
            # Deliberate best-effort boundary: any client/API failure becomes
            # a failed result instead of propagating to the caller.
            logger.error(f"Extraction failed: {str(e)}")
            return StructuredDataResult(
                data=None,
                success=False,
                error_message=str(e),
                model_used=selected_model
            )
@@ -0,0 +1,246 @@
1
+ import logging
2
+ from typing import TYPE_CHECKING, Any, Type, Optional
3
+ from abc import ABC, abstractmethod
4
+ from pydantic import BaseModel
5
+
6
+ # Avoid circular import
7
+ if TYPE_CHECKING:
8
+ from natural_pdf.extraction.result import StructuredDataResult
9
+ from natural_pdf.core.page import Page
10
+ from natural_pdf.elements.base import Element
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ DEFAULT_STRUCTURED_KEY = "default-structured" # Define default key
15
+
16
class ExtractionMixin(ABC):
    """
    Mixin class providing structured data extraction capabilities to elements.
    Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
    """

    def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
        """
        Retrieves the content (text or image) for extraction.

        Args:
            using: 'text' or 'vision'
            **kwargs: Additional arguments passed to extract_text or to_image

        Returns:
            str: Extracted text if using='text'
            PIL.Image.Image: Rendered image if using='vision'
            None: If content cannot be retrieved (missing methods, unsupported
                `using`, or any exception raised while producing the content)
        """
        if not hasattr(self, 'extract_text') or not callable(self.extract_text):
            logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
            return None
        if not hasattr(self, 'to_image') or not callable(self.to_image):
            logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
            return None

        try:
            if using == 'text':
                layout = kwargs.pop('layout', True)
                return self.extract_text(layout=layout, **kwargs)
            elif using == 'vision':
                resolution = kwargs.pop('resolution', 72)
                include_highlights = kwargs.pop('include_highlights', False)
                labels = kwargs.pop('labels', False)
                return self.to_image(
                    resolution=resolution,
                    include_highlights=include_highlights,
                    labels=labels,
                    **kwargs
                )
            else:
                logger.error(f"Unsupported value for 'using': {using}")
                return None
        except Exception as e:
            # Best-effort: content failures are logged and reported as None.
            logger.error(f"Error getting {using} content from {self!r}: {e}")
            return None

    def extract(
        self: Any,
        schema: Type[BaseModel],
        client: Any,
        analysis_key: str = DEFAULT_STRUCTURED_KEY,  # Default key
        prompt: Optional[str] = None,
        using: str = 'text',
        model: Optional[str] = None,
        overwrite: bool = False,
        **kwargs
    ) -> Any:
        """
        Extracts structured data according to the provided schema.

        Results are stored in the element's `analyses` dictionary.

        Args:
            schema: Pydantic model class defining the desired structure
            client: Initialized LLM client
            analysis_key: Key to store the result under in `analyses`. Defaults to "default-structured".
            prompt: Optional user-provided prompt for the LLM
            using: Modality ('text' or 'vision')
            model: Optional specific LLM model identifier
            overwrite: If True, allow overwriting an existing result at `analysis_key`.
            **kwargs: Additional parameters for extraction. `layout` is
                consumed here; everything else is forwarded both to content
                retrieval and to the manager's `extract` call.

        Returns:
            Self for method chaining

        Raises:
            ValueError: If `analysis_key` is empty, or already present in
                `analyses` while `overwrite` is False.
            RuntimeError: If no PDF with `get_manager` can be reached, the
                manager lookup fails, or the manager reports it is unavailable.
        """
        if not analysis_key:
            raise ValueError("analysis_key cannot be empty for extract operation")

        # --- Overwrite Check --- #
        if not hasattr(self, 'analyses') or self.analyses is None:
            self.analyses = {}

        if analysis_key in self.analyses and not overwrite:
            raise ValueError(
                f"Analysis key '{analysis_key}' already exists in analyses. "
                f"Use overwrite=True to replace it. Available keys: {list(self.analyses.keys())}"
            )
        # --- End Overwrite Check --- #

        # Determine PDF instance to get manager
        pdf_instance = None

        if hasattr(self, 'get_manager') and callable(self.get_manager):
            # Handle case where self is the PDF instance itself
            pdf_instance = self
            logger.debug(f"Manager access via self ({type(self).__name__})")
        elif hasattr(self, 'pdf') and hasattr(self.pdf, 'get_manager') and callable(self.pdf.get_manager):
            # Handle Page or other elements with direct .pdf reference
            pdf_instance = self.pdf
            logger.debug(f"Manager access via self.pdf ({type(self).__name__})")
        elif hasattr(self, 'page') and hasattr(self.page, 'pdf') and hasattr(self.page.pdf, 'get_manager') and callable(self.page.pdf.get_manager):
            # Handle Region or other elements with .page.pdf reference
            pdf_instance = self.page.pdf
            logger.debug(f"Manager access via self.page.pdf ({type(self).__name__})")
        else:
            logger.error(f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf")
            raise RuntimeError(f"Cannot access PDF manager: {type(self).__name__} lacks necessary references")

        try:
            manager = pdf_instance.get_manager('structured_data')
        except Exception as e:
            # Keep the original failure attached as the cause for debugging.
            raise RuntimeError(f"Failed to get StructuredDataManager: {e}") from e

        if not manager or not manager.is_available():
            raise RuntimeError("StructuredDataManager is not available")

        # Get content.
        # `layout` is always removed from kwargs so it never reaches the LLM
        # call below. The implicit default (True) is a text-extraction setting,
        # so it is only injected for using='text'; for other modalities it is
        # forwarded only when the caller supplied it explicitly.
        # NOTE(review): whether `to_image` tolerates a `layout` keyword is not
        # visible from here -- confirm against the inheriting classes.
        unset = object()
        layout_arg = kwargs.pop('layout', unset)
        content_kwargs = dict(kwargs)
        if using == 'text':
            content_kwargs['layout'] = True if layout_arg is unset else layout_arg
        elif layout_arg is not unset:
            content_kwargs['layout'] = layout_arg
        content = self._get_extraction_content(using=using, **content_kwargs)

        if content is None or (using == 'text' and isinstance(content, str) and not content.strip()):
            logger.warning(f"No content available for extraction (using='{using}') on {self!r}")
            # Import here to avoid circularity at module level
            from natural_pdf.extraction.result import StructuredDataResult
            result = StructuredDataResult(
                data=None,
                success=False,
                error_message=f"No content available for extraction (using='{using}')",
                model_used=model  # Record the model requested, even though nothing ran
            )
        else:
            result = manager.extract(
                content=content,
                schema=schema,
                client=client,
                prompt=prompt,
                using=using,
                model=model,
                **kwargs
            )

        # Store the result
        self.analyses[analysis_key] = result
        logger.info(f"Stored extraction result under key '{analysis_key}' (Success: {result.success})")

        return self

    def extracted(self, field_name: Optional[str] = None, analysis_key: Optional[str] = None) -> Any:
        """
        Convenience method to access results from structured data extraction.

        Args:
            field_name: The specific field to retrieve from the extracted data.
                        If None, returns the entire data object.
            analysis_key: The key under which the extraction result was stored in `analyses`.
                          If None, defaults to "default-structured".

        Returns:
            The requested field value, or the entire extracted data object.

        Raises:
            AttributeError: If the element has no `analyses` attribute (or it is
                None), or `field_name` is not an attribute of the extracted object.
            KeyError: If `analysis_key` is not found in `analyses`, or the
                extracted data is a dict that lacks `field_name`.
            TypeError: If the stored item is not a StructuredDataResult, or
                attribute access on the data fails with a non-AttributeError.
            ValueError: If the stored result indicates a failed extraction, or
                reports success but carries no data.
        """
        target_key = analysis_key if analysis_key is not None else DEFAULT_STRUCTURED_KEY

        if not hasattr(self, 'analyses') or self.analyses is None:
            raise AttributeError(f"{type(self).__name__} object has no 'analyses' attribute yet.")

        if target_key not in self.analyses:
            available_keys = list(self.analyses.keys())
            raise KeyError(
                f"Extraction '{target_key}' not found in analyses. "
                f"Available extractions: {available_keys}"
            )

        # Import here to avoid circularity and allow type checking
        from natural_pdf.extraction.result import StructuredDataResult
        result: StructuredDataResult = self.analyses[target_key]

        if not isinstance(result, StructuredDataResult):
            logger.warning(f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process.")
            raise TypeError(f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}")

        if not result.success:
            raise ValueError(
                f"Stored result for '{target_key}' indicates a failed extraction attempt. "
                f"Error: {result.error_message}"
            )

        if result.data is None:
            # This case might occur if success=True but data is somehow None
            raise ValueError(f"Extraction result for '{target_key}' has no data available, despite success flag.")

        if field_name is None:
            # Return the whole data object (Pydantic model instance or dict)
            return result.data

        # Dictionary data: key access
        if isinstance(result.data, dict):
            try:
                return result.data[field_name]
            except KeyError:
                available_keys = list(result.data.keys())
                raise KeyError(
                    f"Field/Key '{field_name}' not found in extracted dictionary "
                    f"for key '{target_key}'. Available keys: {available_keys}"
                )

        # Otherwise assume it's an object and use attribute access
        try:
            return getattr(result.data, field_name)
        except AttributeError:
            # Try to list the available fields to make the error actionable
            available_fields = []
            if hasattr(result.data, 'model_fields'):  # Pydantic v2
                available_fields = list(result.data.model_fields.keys())
            elif hasattr(result.data, '__fields__'):  # Pydantic v1
                available_fields = list(result.data.__fields__.keys())
            elif hasattr(result.data, '__dict__'):  # Fallback
                available_fields = list(result.data.__dict__.keys())

            raise AttributeError(
                f"Field/Attribute '{field_name}' not found on extracted object of type {type(result.data).__name__} "
                f"for key '{target_key}'. Available fields/attributes: {available_fields}"
            )
        except Exception as e:  # Catch other potential errors during getattr
            raise TypeError(f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}") from e
@@ -0,0 +1,37 @@
1
+ from typing import Optional, TypeVar, Generic, Any
2
+ from pydantic import BaseModel, Field
3
+
4
+ # Generic type for the Pydantic model used in the schema
5
+ T_Schema = TypeVar("T_Schema", bound=BaseModel)
6
+
7
+
8
class StructuredDataResult(BaseModel, Generic[T_Schema]):
    """
    Outcome of one structured data extraction attempt.

    Carries the extracted data (if any), a success flag, an optional error
    message, optional raw model output, and the identifier of the model used.
    Only `success` is required; every other field defaults to None.
    """

    class Config:
        arbitrary_types_allowed = True

    # Parsed schema instance; left as None when extraction failed.
    data: Optional[T_Schema] = Field(default=None, description="Validated data model or None on failure")

    # Required flag: callers must state explicitly whether extraction worked.
    success: bool = Field(default=..., description="Whether extraction succeeded")

    # Human-readable failure reason.
    error_message: Optional[str] = Field(default=None, description="Error details if extraction failed")

    # Unprocessed model response, when it is kept.
    raw_output: Optional[Any] = Field(default=None, description="Raw output from the language model")

    # Name/identifier of the model that produced (or was asked for) the result.
    model_used: Optional[str] = Field(default=None, description="Identifier of the language model used")