natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -276,7 +276,10 @@ class TextElement(Element):
276
276
 
277
277
  def __repr__(self) -> str:
278
278
  """String representation of the text element."""
279
- preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
279
+ if self.text:
280
+ preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
281
+ else:
282
+ preview = "..."
280
283
  font_style = []
281
284
  if self.bold:
282
285
  font_style.append("bold")
@@ -0,0 +1,137 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class ExportMixin:
    """
    Mixin for exporting analyses from collections of elements.

    This mixin is designed to be used with PDF, PDFCollection,
    PageCollection, and ElementCollection classes. The host class supplies
    the data by implementing ``_gather_analysis_data``; this mixin only
    prepares the output location and serializes the gathered records.
    """

    def export_analyses(
        self,
        output_path: str,
        analysis_keys: Union[str, List[str]],
        format: str = "json",
        include_content: bool = True,
        include_images: bool = False,
        image_dir: Optional[str] = None,
        image_format: str = "jpg",
        image_resolution: int = 72,
        overwrite: bool = True,
        **kwargs,
    ) -> str:
        """
        Export analysis results to a file.

        Args:
            output_path: Path to save the export file
            analysis_keys: Key(s) in the analyses dictionary to export
            format: Export format ('json', 'csv', 'excel'), case-insensitive
            include_content: Whether to include extracted text
            include_images: Whether to export images of elements
            image_dir: Directory to save images (created if doesn't exist).
                Defaults to ``<output stem>_images`` next to the output file.
            image_format: Format to save images ('jpg', 'png')
            image_resolution: Resolution for exported images
            overwrite: Whether to overwrite existing files
            **kwargs: Additional format-specific options, forwarded to the
                underlying writer (``json.dump``, ``DataFrame.to_csv`` or
                ``DataFrame.to_excel``)

        Returns:
            Path to the exported file

        Raises:
            ValueError: If ``format`` is not one of the supported formats.
            FileExistsError: If the output file exists and ``overwrite`` is False.
        """
        # Convert single key to list for consistency
        if isinstance(analysis_keys, str):
            analysis_keys = [analysis_keys]

        # Resolve the writer first so an unsupported format fails before any
        # directory is created or any (potentially expensive) data is gathered.
        writers = {
            "json": self._export_to_json,
            "csv": self._export_to_csv,
            "excel": self._export_to_excel,
        }
        writer = writers.get(format.lower())
        if writer is None:
            raise ValueError(f"Unsupported export format: {format}")

        output_path = Path(output_path)

        # Refuse to clobber an existing file before touching the filesystem
        if output_path.exists() and not overwrite:
            raise FileExistsError(f"Output file {output_path} already exists and overwrite=False")

        # Create output directory
        os.makedirs(output_path.parent, exist_ok=True)

        # Prepare image directory if needed
        if include_images:
            if image_dir is None:
                image_dir = output_path.parent / f"{output_path.stem}_images"
            os.makedirs(image_dir, exist_ok=True)
            image_dir = Path(image_dir)  # Convert to Path object

        # Gather data from collection
        data = self._gather_analysis_data(
            analysis_keys=analysis_keys,
            include_content=include_content,
            include_images=include_images,
            image_dir=image_dir,
            image_format=image_format,
            image_resolution=image_resolution,
        )

        return writer(data, output_path, **kwargs)

    def _gather_analysis_data(
        self,
        analysis_keys: List[str],
        include_content: bool,
        include_images: bool,
        image_dir: Optional[Path],
        image_format: str,
        image_resolution: int,
    ) -> List[Dict[str, Any]]:
        """
        Gather analysis data from elements in the collection.

        This method should be implemented by each collection class.
        """
        raise NotImplementedError("Subclasses must implement _gather_analysis_data")

    def _export_to_json(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
        """Export data to JSON format (UTF-8, 2-space indent unless overridden)."""
        # Merge instead of passing indent explicitly so a caller-supplied
        # ``indent`` overrides the default rather than raising TypeError.
        dump_options = {"indent": 2, **kwargs}
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, **dump_options)
        logger.info(f"Exported analysis data to {output_path}")
        return str(output_path)

    def _export_to_csv(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
        """Export data to CSV format; nested records are flattened first."""
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "Pandas is required for CSV export. Install with: pip install pandas"
            ) from exc

        # Normalize nested data
        df = pd.json_normalize(data)
        df.to_csv(output_path, index=False, **kwargs)
        logger.info(f"Exported analysis data to {output_path}")
        return str(output_path)

    def _export_to_excel(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
        """Export data to Excel format; nested records are flattened first."""
        try:
            import pandas as pd

            # Normalize nested data
            df = pd.json_normalize(data)
            # Kept inside the try: the Excel writer backend is imported lazily
            # by pandas, so a missing backend surfaces here as ImportError.
            df.to_excel(output_path, index=False, **kwargs)
        except ImportError as exc:
            raise ImportError(
                "Pandas and openpyxl are required for Excel export. Install with: pip install pandas openpyxl"
            ) from exc

        logger.info(f"Exported analysis data to {output_path}")
        return str(output_path)
@@ -1,10 +1,10 @@
1
1
  import abc
2
2
  import logging
3
- from typing import Union, List, TYPE_CHECKING
3
+ from typing import TYPE_CHECKING, List, Union
4
4
 
5
5
  if TYPE_CHECKING:
6
- from natural_pdf.core.pdf import PDF
7
6
  from natural_pdf.collections.pdf_collection import PDFCollection
7
+ from natural_pdf.core.pdf import PDF
8
8
 
9
9
  logger = logging.getLogger(__name__)
10
10
 
@@ -40,8 +40,8 @@ class FinetuneExporter(abc.ABC):
40
40
  """
41
41
  Helper to consistently resolve the input source to a list of PDF objects.
42
42
  """
43
- from natural_pdf.core.pdf import PDF # Avoid circular import at module level
44
43
  from natural_pdf.collections.pdf_collection import PDFCollection # Avoid circular import
44
+ from natural_pdf.core.pdf import PDF # Avoid circular import at module level
45
45
 
46
46
  pdfs_to_process: List["PDF"] = []
47
47
  if isinstance(source, PDF):
@@ -1,8 +1,9 @@
1
- import os
2
1
  import logging
2
+ import os
3
3
  import random
4
4
  import shutil
5
- from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
5
+ from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
6
+
6
7
  from tqdm import tqdm
7
8
 
8
9
  from natural_pdf.exporters.base import FinetuneExporter
@@ -11,8 +12,8 @@ from natural_pdf.exporters.base import FinetuneExporter
11
12
  from natural_pdf.utils.identifiers import generate_short_path_hash
12
13
 
13
14
  if TYPE_CHECKING:
14
- from natural_pdf.core.pdf import PDF
15
15
  from natural_pdf.collections.pdf_collection import PDFCollection
16
+ from natural_pdf.core.pdf import PDF
16
17
  from natural_pdf.elements.text import TextElement
17
18
 
18
19
  logger = logging.getLogger(__name__)
@@ -48,7 +49,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
48
49
  selector: CSS-like selector to filter which TextElements to export.
49
50
  If None and corrected_only is False, all 'text' elements are considered.
50
51
  corrected_only: If True, overrides selector and exports only elements likely
51
- originating from a correction manifest (selector="text[source^=manifest]").
52
+ originating from a correction manifest (selector="text[source=manifest]").
52
53
  (default: False).
53
54
  split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
54
55
  If None, creates a single `label.txt` file (default: 0.9).
@@ -0,0 +1,135 @@
1
+ import base64
2
+ import io
3
+ import logging
4
+ from typing import Any, Optional, Type
5
+
6
+ from PIL import Image
7
+ from pydantic import BaseModel
8
+
9
+ from natural_pdf.extraction.result import StructuredDataResult
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class StructuredDataManager:
    """
    Manages the process of extracting structured data from elements using LLMs.

    This manager is typically accessed via `pdf.get_manager('structured_data')`.
    It is stateless and relies on parameters passed during method calls.
    """

    DEFAULT_TEXT_MODEL = "gpt-4o-mini"
    DEFAULT_VISION_MODEL = "gpt-4o"

    def __init__(self):
        """Initializes the manager."""
        logger.info("Initialized StructuredDataManager.")

    def is_available(self) -> bool:
        """Checks if necessary dependencies are available."""
        try:
            import pydantic

            return True
        except ImportError:
            logger.warning("Pydantic is required for structured data extraction.")
            return False

    def _prepare_llm_messages(
        self, content: Any, prompt: Optional[str], using: str, schema: Type[BaseModel]
    ) -> list:
        """
        Prepares the message list for the LLM API call.

        Args:
            content: Text (anything ``str()``-able) for ``using='text'``, or a
                PIL image for ``using='vision'``
            prompt: System prompt; a generic one naming the schema is used when empty
            using: Modality ('text' or 'vision')
            schema: Pydantic model class, used only for its name in the default prompt

        Returns:
            List of chat messages: the system prompt followed by one user message

        Raises:
            TypeError: If ``using='vision'`` and ``content`` is not a PIL image
            ValueError: If ``using`` is neither 'text' nor 'vision'
        """
        system_prompt = (
            prompt
            or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."
        )

        messages = [{"role": "system", "content": system_prompt}]

        if using == "text":
            messages.append({"role": "user", "content": str(content)})
        elif using == "vision":
            if isinstance(content, Image.Image):
                # Images are sent inline as a base64-encoded PNG data URL
                buffered = io.BytesIO()
                content.save(buffered, format="PNG")
                base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Extract information from this image based on the schema.",
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                            },
                        ],
                    }
                )
            else:
                raise TypeError(
                    f"Content must be a PIL Image for using='vision', got {type(content)}"
                )
        else:
            raise ValueError(f"Unsupported value for 'using': {using}")

        return messages

    def extract(
        self,
        content: Any,
        schema: Type[BaseModel],
        client: Any,
        prompt: Optional[str] = None,
        using: str = "text",
        model: Optional[str] = None,
        **kwargs,
    ) -> StructuredDataResult:
        """
        Extract structured data from content using an LLM.

        Args:
            content: Text string or Image object
            schema: Pydantic model class for the desired structure
            client: Initialized LLM client (e.g., OpenAI client)
            prompt: Optional user-provided instructions
            using: Modality ('text' or 'vision')
            model: Specific LLM model identifier
            **kwargs: Additional parameters for the LLM API call

        Returns:
            StructuredDataResult object. Failures of the LLM call itself are
            reported through ``success=False`` / ``error_message`` rather than
            raised.

        Raises:
            NotImplementedError: If ``using='vision'`` and ``content`` is a list
                of more than one image
            TypeError, ValueError: Propagated from message preparation for an
                invalid ``content`` / ``using`` combination
        """
        logger.debug(f"Extract request: using='{using}', schema='{schema.__name__}'")

        if isinstance(content, list) and using == "vision":
            if len(content) == 1:
                content = content[0]
            elif len(content) > 1:
                logger.error("Vision extraction not supported for multi-page PDFs")
                raise NotImplementedError(
                    "Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead."
                )

        selected_model = model or (
            self.DEFAULT_VISION_MODEL if using == "vision" else self.DEFAULT_TEXT_MODEL
        )
        messages = self._prepare_llm_messages(content, prompt, using, schema)

        try:
            logger.debug(f"Extracting with model '{selected_model}'")
            completion = client.beta.chat.completions.parse(
                model=selected_model, messages=messages, response_format=schema, **kwargs
            )
            parsed_data = completion.choices[0].message.parsed
            # The result field is named `model_used`; an unknown `model=` keyword
            # would be silently dropped and the model id never recorded.
            return StructuredDataResult(
                data=parsed_data, success=True, error_message=None, model_used=selected_model
            )
        except Exception as e:
            logger.error(f"Extraction failed: {str(e)}")
            return StructuredDataResult(
                data=None, success=False, error_message=str(e), model_used=selected_model
            )
@@ -0,0 +1,279 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import TYPE_CHECKING, Any, Optional, Type
4
+
5
+ from pydantic import BaseModel
6
+
7
+ # Avoid circular import
8
+ if TYPE_CHECKING:
9
+ from natural_pdf.core.page import Page
10
+ from natural_pdf.elements.base import Element
11
+ from natural_pdf.extraction.result import StructuredDataResult
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ DEFAULT_STRUCTURED_KEY = "structured" # Define default key
16
+
17
+
18
class ExtractionMixin(ABC):
    """
    Mixin class providing structured data extraction capabilities to elements.
    Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
    """

    def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
        """
        Retrieves the content (text or image) for extraction.

        Args:
            using: 'text' or 'vision'
            **kwargs: Additional arguments passed to extract_text or to_image

        Returns:
            str: Extracted text if using='text'
            PIL.Image.Image: Rendered image if using='vision'
            None: If content cannot be retrieved
        """
        if not hasattr(self, "extract_text") or not callable(self.extract_text):
            logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
            return None
        if not hasattr(self, "to_image") or not callable(self.to_image):
            logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
            return None

        try:
            if using == "text":
                layout = kwargs.pop("layout", True)
                return self.extract_text(layout=layout, **kwargs)
            elif using == "vision":
                resolution = kwargs.pop("resolution", 72)
                include_highlights = kwargs.pop("include_highlights", False)
                labels = kwargs.pop("labels", False)
                return self.to_image(
                    resolution=resolution,
                    include_highlights=include_highlights,
                    labels=labels,
                    **kwargs,
                )
            else:
                logger.error(f"Unsupported value for 'using': {using}")
                return None
        except Exception as e:
            # Best effort: any failure while producing content is reported as "no content"
            logger.error(f"Error getting {using} content from {self!r}: {e}")
            return None

    def extract(
        self: Any,
        schema: Type[BaseModel],
        client: Any,
        analysis_key: str = DEFAULT_STRUCTURED_KEY,  # Default key
        prompt: Optional[str] = None,
        using: str = "text",
        model: Optional[str] = None,
        overwrite: bool = False,  # Add overwrite parameter
        **kwargs,
    ) -> Any:
        """
        Extracts structured data according to the provided schema.

        Results are stored in the element's `analyses` dictionary.

        Args:
            schema: Pydantic model class defining the desired structure
            client: Initialized LLM client
            analysis_key: Key to store the result under in `analyses`. Defaults to "structured".
            prompt: Optional user-provided prompt for the LLM
            using: Modality ('text' or 'vision')
            model: Optional specific LLM model identifier
            overwrite: If True, allow overwriting an existing result at `analysis_key`.
            **kwargs: Additional parameters for extraction. `layout`, `resolution`,
                `include_highlights` and `labels` only shape the content handed to the
                model and are not forwarded to the LLM call.

        Returns:
            Self for method chaining

        Raises:
            ValueError: If `analysis_key` is empty, or already present in
                `analyses` while `overwrite` is False.
            RuntimeError: If no PDF manager can be reached from this object, or
                the structured data manager is missing or unavailable.
        """
        if not analysis_key:
            raise ValueError("analysis_key cannot be empty for extract operation")

        # --- Overwrite Check --- #
        if not hasattr(self, "analyses") or self.analyses is None:
            self.analyses = {}

        if analysis_key in self.analyses and not overwrite:
            raise ValueError(
                f"Analysis key '{analysis_key}' already exists in analyses. "
                f"Use overwrite=True to replace it. Available keys: {list(self.analyses.keys())}"
            )
        # --- End Overwrite Check --- #

        # Determine PDF instance to get manager
        pdf_instance = None

        if hasattr(self, "get_manager") and callable(self.get_manager):
            # Handle case where self is the PDF instance itself
            pdf_instance = self
            logger.debug(f"Manager access via self ({type(self).__name__})")
        elif (
            hasattr(self, "pdf")
            and hasattr(self.pdf, "get_manager")
            and callable(self.pdf.get_manager)
        ):
            # Handle Page or other elements with direct .pdf reference
            pdf_instance = self.pdf
            logger.debug(f"Manager access via self.pdf ({type(self).__name__})")
        elif (
            hasattr(self, "page")
            and hasattr(self.page, "pdf")
            and hasattr(self.page.pdf, "get_manager")
            and callable(self.page.pdf.get_manager)
        ):
            # Handle Region or other elements with .page.pdf reference
            pdf_instance = self.page.pdf
            logger.debug(f"Manager access via self.page.pdf ({type(self).__name__})")
        else:
            logger.error(
                f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf"
            )
            raise RuntimeError(
                f"Cannot access PDF manager: {type(self).__name__} lacks necessary references"
            )

        try:
            manager = pdf_instance.get_manager("structured_data")
        except Exception as e:
            raise RuntimeError(f"Failed to get StructuredDataManager: {e}") from e

        if not manager or not manager.is_available():
            raise RuntimeError("StructuredDataManager is not available")

        # Get content
        layout_for_text = kwargs.pop("layout", True)
        content = self._get_extraction_content(
            using=using, layout=layout_for_text, **kwargs
        )  # Pass kwargs

        # Rendering options are consumed while producing the image; forwarding
        # them to the manager would pass them on to the LLM client call.
        content_only_options = ("resolution", "include_highlights", "labels")
        llm_kwargs = {
            name: value for name, value in kwargs.items() if name not in content_only_options
        }

        if content is None or (
            using == "text" and isinstance(content, str) and not content.strip()
        ):
            logger.warning(f"No content available for extraction (using='{using}') on {self!r}")
            # Import here to avoid circularity at module level
            from natural_pdf.extraction.result import StructuredDataResult

            # The result field is `model_used`; an unknown `model=` keyword would be dropped.
            result = StructuredDataResult(
                data=None,
                success=False,
                error_message=f"No content available for extraction (using='{using}')",
                model_used=model,  # Use model requested, even if failed
            )
        else:
            result = manager.extract(
                content=content,
                schema=schema,
                client=client,
                prompt=prompt,
                using=using,
                model=model,
                **llm_kwargs,
            )

        # Store the result
        self.analyses[analysis_key] = result
        logger.info(
            f"Stored extraction result under key '{analysis_key}' (Success: {result.success})"
        )

        return self

    def extracted(
        self, field_name: Optional[str] = None, analysis_key: Optional[str] = None
    ) -> Any:
        """
        Convenience method to access results from structured data extraction.

        Args:
            field_name: The specific field to retrieve from the extracted data.
                        If None, returns the entire data object.
            analysis_key: The key under which the extraction result was stored in `analyses`.
                          If None, defaults to "structured".

        Returns:
            The requested field value, or the entire extracted data object.

        Raises:
            KeyError: If the specified `analysis_key` is not found in `analyses`,
                or `field_name` is missing from dictionary data.
            TypeError: If the stored item is not a StructuredDataResult, or the
                field access fails for a reason other than a missing attribute.
            ValueError: If the stored result indicates a failed extraction or has no data.
            AttributeError: If the element does not have an `analyses` attribute,
                or `field_name` is not an attribute of the extracted object.
        """
        target_key = analysis_key if analysis_key is not None else DEFAULT_STRUCTURED_KEY

        if not hasattr(self, "analyses") or self.analyses is None:
            raise AttributeError(f"{type(self).__name__} object has no 'analyses' attribute yet.")

        if target_key not in self.analyses:
            available_keys = list(self.analyses.keys())
            raise KeyError(
                f"Extraction '{target_key}' not found in analyses. "
                f"Available extractions: {available_keys}"
            )

        # Import here to avoid circularity and allow type checking
        from natural_pdf.extraction.result import StructuredDataResult

        result: StructuredDataResult = self.analyses[target_key]

        if not isinstance(result, StructuredDataResult):
            logger.warning(
                f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process."
            )
            raise TypeError(
                f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}"
            )

        if not result.success:
            raise ValueError(
                f"Stored result for '{target_key}' indicates a failed extraction attempt. "
                f"Error: {result.error_message}"
            )

        if result.data is None:
            # This case might occur if success=True but data is somehow None
            raise ValueError(
                f"Extraction result for '{target_key}' has no data available, despite success flag."
            )

        if field_name is None:
            # Return the whole data object (Pydantic model instance or dict)
            return result.data

        # Try dictionary key access first, then attribute access
        if isinstance(result.data, dict):
            try:
                return result.data[field_name]
            except KeyError:
                available_keys = list(result.data.keys())
                raise KeyError(
                    f"Field/Key '{field_name}' not found in extracted dictionary "
                    f"for key '{target_key}'. Available keys: {available_keys}"
                )

        # Assume it's an object, try attribute access
        try:
            return getattr(result.data, field_name)
        except AttributeError:
            # Try to get available fields from the object; the field maps are
            # read from the class so the lookup works for both Pydantic majors.
            data_type = type(result.data)
            available_fields = []
            if hasattr(data_type, "model_fields"):  # Pydantic v2
                available_fields = list(data_type.model_fields.keys())
            elif hasattr(data_type, "__fields__"):  # Pydantic v1
                available_fields = list(data_type.__fields__.keys())
            elif hasattr(result.data, "__dict__"):  # Fallback
                available_fields = list(result.data.__dict__.keys())

            raise AttributeError(
                f"Field/Attribute '{field_name}' not found on extracted object of type {type(result.data).__name__} "
                f"for key '{target_key}'. Available fields/attributes: {available_fields}"
            )
        except Exception as e:  # Catch other potential errors during getattr
            raise TypeError(
                f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}"
            ) from e
@@ -0,0 +1,23 @@
1
+ from typing import Any, Generic, Optional, TypeVar
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ # Generic type for the Pydantic model used in the schema
6
+ T_Schema = TypeVar("T_Schema", bound=BaseModel)
7
+
8
+
9
class StructuredDataResult(BaseModel, Generic[T_Schema]):
    """
    Outcome of one structured data extraction attempt.

    Carries the validated data (when extraction succeeded), a success flag,
    and the error text, raw model output and model identifier when available.
    """

    class Config:
        # Lets fields hold values of types Pydantic has no built-in validator for
        arbitrary_types_allowed = True

    data: Optional[T_Schema] = Field(
        default=None, description="Validated data model or None on failure"
    )
    # Required: every result must state explicitly whether it succeeded
    success: bool = Field(..., description="Whether extraction succeeded")
    error_message: Optional[str] = Field(
        default=None, description="Error details if extraction failed"
    )
    raw_output: Optional[Any] = Field(
        default=None, description="Raw output from the language model"
    )
    model_used: Optional[str] = Field(
        default=None, description="Identifier of the language model used"
    )
@@ -11,15 +11,15 @@ logger = logging.getLogger("natural_pdf.ocr")
11
11
 
12
12
  # Import the base classes that are always available
13
13
  from .engine import OCREngine
14
+ from .ocr_factory import OCRFactory
15
+ from .ocr_manager import OCRManager
14
16
  from .ocr_options import (
15
- OCROptions,
16
17
  BaseOCROptions,
17
18
  EasyOCROptions,
19
+ OCROptions,
18
20
  PaddleOCROptions,
19
21
  SuryaOCROptions,
20
22
  )
21
- from .ocr_manager import OCRManager
22
- from .ocr_factory import OCRFactory
23
23
 
24
24
  # Add all public symbols that should be available when importing this module
25
25
  __all__ = [
@@ -41,7 +41,7 @@ def get_engine(engine_name=None, **kwargs):
41
41
  Get OCR engine by name with graceful handling of missing dependencies.
42
42
 
43
43
  Args:
44
- engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya')
44
+ engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya', 'doctr')
45
45
  If None, the best available engine is used
46
46
  **kwargs: Additional arguments to pass to the engine constructor
47
47
 
@@ -63,7 +63,7 @@ def get_engine(engine_name=None, **kwargs):
63
63
 
64
64
  # Use the factory to create a specific engine
65
65
  normalized_name = engine_name.lower()
66
- if normalized_name in ["easyocr", "paddle", "surya"]:
66
+ if normalized_name in ["easyocr", "paddle", "surya", "doctr"]:
67
67
  return OCRFactory.create_engine(normalized_name, **kwargs)
68
68
  else:
69
69
  raise ValueError(f"Unknown OCR engine: {engine_name}")