natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +230 -151
  13. natural_pdf/classification/mixin.py +49 -35
  14. natural_pdf/classification/results.py +64 -46
  15. natural_pdf/collections/mixins.py +68 -20
  16. natural_pdf/collections/pdf_collection.py +177 -64
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +633 -190
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +503 -131
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -274,7 +274,7 @@ class TextElement(Element):
274
274
 
275
275
  return False
276
276
 
277
- def __repr__(self) -> str:
277
+ def __repr__(self) -> str:
278
278
  """String representation of the text element."""
279
279
  if self.text:
280
280
  preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
@@ -0,0 +1,137 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class ExportMixin:
11
+ """
12
+ Mixin for exporting analyses from collections of elements.
13
+
14
+ This mixin is designed to be used with PDF, PDFCollection,
15
+ PageCollection, and ElementCollection classes.
16
+ """
17
+
18
+ def export_analyses(
19
+ self,
20
+ output_path: str,
21
+ analysis_keys: Union[str, List[str]],
22
+ format: str = "json",
23
+ include_content: bool = True,
24
+ include_images: bool = False,
25
+ image_dir: Optional[str] = None,
26
+ image_format: str = "jpg",
27
+ image_resolution: int = 72,
28
+ overwrite: bool = True,
29
+ **kwargs,
30
+ ) -> str:
31
+ """
32
+ Export analysis results to a file.
33
+
34
+ Args:
35
+ output_path: Path to save the export file
36
+ analysis_keys: Key(s) in the analyses dictionary to export
37
+ format: Export format ('json', 'csv', 'excel')
38
+ include_content: Whether to include extracted text
39
+ include_images: Whether to export images of elements
40
+ image_dir: Directory to save images (created if doesn't exist)
41
+ image_format: Format to save images ('jpg', 'png')
42
+ image_resolution: Resolution for exported images
43
+ overwrite: Whether to overwrite existing files
44
+ **kwargs: Additional format-specific options
45
+
46
+ Returns:
47
+ Path to the exported file
48
+ """
49
+ # Convert single key to list for consistency
50
+ if isinstance(analysis_keys, str):
51
+ analysis_keys = [analysis_keys]
52
+
53
+ # Create output directory
54
+ output_path = Path(output_path)
55
+ os.makedirs(output_path.parent, exist_ok=True)
56
+
57
+ # Check if file exists and handle overwrite
58
+ if output_path.exists() and not overwrite:
59
+ raise FileExistsError(f"Output file {output_path} already exists and overwrite=False")
60
+
61
+ # Prepare image directory if needed
62
+ if include_images:
63
+ if image_dir is None:
64
+ image_dir = output_path.parent / f"{output_path.stem}_images"
65
+ os.makedirs(image_dir, exist_ok=True)
66
+ image_dir = Path(image_dir) # Convert to Path object
67
+
68
+ # Gather data from collection
69
+ data = self._gather_analysis_data(
70
+ analysis_keys=analysis_keys,
71
+ include_content=include_content,
72
+ include_images=include_images,
73
+ image_dir=image_dir,
74
+ image_format=image_format,
75
+ image_resolution=image_resolution,
76
+ )
77
+
78
+ # Export based on format
79
+ if format.lower() == "json":
80
+ return self._export_to_json(data, output_path, **kwargs)
81
+ elif format.lower() == "csv":
82
+ return self._export_to_csv(data, output_path, **kwargs)
83
+ elif format.lower() == "excel":
84
+ return self._export_to_excel(data, output_path, **kwargs)
85
+ else:
86
+ raise ValueError(f"Unsupported export format: {format}")
87
+
88
+ def _gather_analysis_data(
89
+ self,
90
+ analysis_keys: List[str],
91
+ include_content: bool,
92
+ include_images: bool,
93
+ image_dir: Optional[Path],
94
+ image_format: str,
95
+ image_resolution: int,
96
+ ) -> List[Dict[str, Any]]:
97
+ """
98
+ Gather analysis data from elements in the collection.
99
+
100
+ This method should be implemented by each collection class.
101
+ """
102
+ raise NotImplementedError("Subclasses must implement _gather_analysis_data")
103
+
104
+ def _export_to_json(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
105
+ """Export data to JSON format."""
106
+ with open(output_path, "w") as f:
107
+ json.dump(data, f, indent=2, **kwargs)
108
+ logger.info(f"Exported analysis data to {output_path}")
109
+ return str(output_path)
110
+
111
+ def _export_to_csv(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
112
+ """Export data to CSV format."""
113
+ try:
114
+ import pandas as pd
115
+
116
+ # Normalize nested data
117
+ df = pd.json_normalize(data)
118
+ df.to_csv(output_path, index=False, **kwargs)
119
+ logger.info(f"Exported analysis data to {output_path}")
120
+ return str(output_path)
121
+ except ImportError:
122
+ raise ImportError("Pandas is required for CSV export. Install with: pip install pandas")
123
+
124
+ def _export_to_excel(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
125
+ """Export data to Excel format."""
126
+ try:
127
+ import pandas as pd
128
+
129
+ # Normalize nested data
130
+ df = pd.json_normalize(data)
131
+ df.to_excel(output_path, index=False, **kwargs)
132
+ logger.info(f"Exported analysis data to {output_path}")
133
+ return str(output_path)
134
+ except ImportError:
135
+ raise ImportError(
136
+ "Pandas and openpyxl are required for Excel export. Install with: pip install pandas openpyxl"
137
+ )
@@ -1,10 +1,10 @@
1
1
  import abc
2
2
  import logging
3
- from typing import Union, List, TYPE_CHECKING
3
+ from typing import TYPE_CHECKING, List, Union
4
4
 
5
5
  if TYPE_CHECKING:
6
- from natural_pdf.core.pdf import PDF
7
6
  from natural_pdf.collections.pdf_collection import PDFCollection
7
+ from natural_pdf.core.pdf import PDF
8
8
 
9
9
  logger = logging.getLogger(__name__)
10
10
 
@@ -40,8 +40,8 @@ class FinetuneExporter(abc.ABC):
40
40
  """
41
41
  Helper to consistently resolve the input source to a list of PDF objects.
42
42
  """
43
- from natural_pdf.core.pdf import PDF # Avoid circular import at module level
44
43
  from natural_pdf.collections.pdf_collection import PDFCollection # Avoid circular import
44
+ from natural_pdf.core.pdf import PDF # Avoid circular import at module level
45
45
 
46
46
  pdfs_to_process: List["PDF"] = []
47
47
  if isinstance(source, PDF):
@@ -1,8 +1,9 @@
1
- import os
2
1
  import logging
2
+ import os
3
3
  import random
4
4
  import shutil
5
- from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
5
+ from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
6
+
6
7
  from tqdm import tqdm
7
8
 
8
9
  from natural_pdf.exporters.base import FinetuneExporter
@@ -11,8 +12,8 @@ from natural_pdf.exporters.base import FinetuneExporter
11
12
  from natural_pdf.utils.identifiers import generate_short_path_hash
12
13
 
13
14
  if TYPE_CHECKING:
14
- from natural_pdf.core.pdf import PDF
15
15
  from natural_pdf.collections.pdf_collection import PDFCollection
16
+ from natural_pdf.core.pdf import PDF
16
17
  from natural_pdf.elements.text import TextElement
17
18
 
18
19
  logger = logging.getLogger(__name__)
@@ -1,9 +1,10 @@
1
- import logging
2
- from typing import Any, Type, Optional
3
- from pydantic import BaseModel
4
- import io
5
1
  import base64
2
+ import io
3
+ import logging
4
+ from typing import Any, Optional, Type
5
+
6
6
  from PIL import Image
7
+ from pydantic import BaseModel
7
8
 
8
9
  from natural_pdf.extraction.result import StructuredDataResult
9
10
 
@@ -29,47 +30,52 @@ class StructuredDataManager:
29
30
  """Checks if necessary dependencies are available."""
30
31
  try:
31
32
  import pydantic
33
+
32
34
  return True
33
35
  except ImportError:
34
36
  logger.warning("Pydantic is required for structured data extraction.")
35
37
  return False
36
38
 
37
39
  def _prepare_llm_messages(
38
- self,
39
- content: Any,
40
- prompt: Optional[str],
41
- using: str,
42
- schema: Type[BaseModel]
40
+ self, content: Any, prompt: Optional[str], using: str, schema: Type[BaseModel]
43
41
  ) -> list:
44
42
  """Prepares the message list for the LLM API call."""
45
- system_prompt = prompt or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."
46
-
47
- messages = [
48
- {"role": "system", "content": system_prompt}
49
- ]
50
-
51
- if using == 'text':
43
+ system_prompt = (
44
+ prompt
45
+ or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."
46
+ )
47
+
48
+ messages = [{"role": "system", "content": system_prompt}]
49
+
50
+ if using == "text":
52
51
  messages.append({"role": "user", "content": str(content)})
53
- elif using == 'vision':
52
+ elif using == "vision":
54
53
  if isinstance(content, Image.Image):
55
54
  buffered = io.BytesIO()
56
55
  content.save(buffered, format="PNG")
57
56
  base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
58
- messages.append({
59
- "role": "user",
60
- "content": [
61
- {"type": "text", "text": "Extract information from this image based on the schema."},
62
- {
63
- "type": "image_url",
64
- "image_url": {"url": f"data:image/png;base64,{base64_image}"},
65
- },
66
- ],
67
- })
57
+ messages.append(
58
+ {
59
+ "role": "user",
60
+ "content": [
61
+ {
62
+ "type": "text",
63
+ "text": "Extract information from this image based on the schema.",
64
+ },
65
+ {
66
+ "type": "image_url",
67
+ "image_url": {"url": f"data:image/png;base64,{base64_image}"},
68
+ },
69
+ ],
70
+ }
71
+ )
68
72
  else:
69
- raise TypeError(f"Content must be a PIL Image for using='vision', got {type(content)}")
73
+ raise TypeError(
74
+ f"Content must be a PIL Image for using='vision', got {type(content)}"
75
+ )
70
76
  else:
71
- raise ValueError(f"Unsupported value for 'using': {using}")
72
-
77
+ raise ValueError(f"Unsupported value for 'using': {using}")
78
+
73
79
  return messages
74
80
 
75
81
  def extract(
@@ -78,9 +84,9 @@ class StructuredDataManager:
78
84
  schema: Type[BaseModel],
79
85
  client: Any,
80
86
  prompt: Optional[str] = None,
81
- using: str = 'text',
87
+ using: str = "text",
82
88
  model: Optional[str] = None,
83
- **kwargs
89
+ **kwargs,
84
90
  ) -> StructuredDataResult:
85
91
  """
86
92
  Extract structured data from content using an LLM.
@@ -99,36 +105,31 @@ class StructuredDataManager:
99
105
  """
100
106
  logger.debug(f"Extract request: using='{using}', schema='{schema.__name__}'")
101
107
 
102
- if isinstance(content, list) and using == 'vision':
108
+ if isinstance(content, list) and using == "vision":
103
109
  if len(content) == 1:
104
110
  content = content[0]
105
111
  elif len(content) > 1:
106
112
  logger.error("Vision extraction not supported for multi-page PDFs")
107
- raise NotImplementedError("Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead.")
108
-
109
- selected_model = model or (self.DEFAULT_VISION_MODEL if using == 'vision' else self.DEFAULT_TEXT_MODEL)
113
+ raise NotImplementedError(
114
+ "Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead."
115
+ )
116
+
117
+ selected_model = model or (
118
+ self.DEFAULT_VISION_MODEL if using == "vision" else self.DEFAULT_TEXT_MODEL
119
+ )
110
120
  messages = self._prepare_llm_messages(content, prompt, using, schema)
111
121
 
112
122
  try:
113
123
  logger.debug(f"Extracting with model '{selected_model}'")
114
124
  completion = client.beta.chat.completions.parse(
115
- model=selected_model,
116
- messages=messages,
117
- response_format=schema,
118
- **kwargs
125
+ model=selected_model, messages=messages, response_format=schema, **kwargs
119
126
  )
120
127
  parsed_data = completion.choices[0].message.parsed
121
128
  return StructuredDataResult(
122
- data=parsed_data,
123
- success=True,
124
- error_message=None,
125
- model=selected_model
129
+ data=parsed_data, success=True, error_message=None, model=selected_model
126
130
  )
127
131
  except Exception as e:
128
132
  logger.error(f"Extraction failed: {str(e)}")
129
133
  return StructuredDataResult(
130
- data=None,
131
- success=False,
132
- error_message=str(e),
133
- model=selected_model
134
- )
134
+ data=None, success=False, error_message=str(e), model=selected_model
135
+ )
@@ -1,17 +1,19 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Type, Optional
3
2
  from abc import ABC, abstractmethod
3
+ from typing import TYPE_CHECKING, Any, Optional, Type
4
+
4
5
  from pydantic import BaseModel
5
6
 
6
7
  # Avoid circular import
7
8
  if TYPE_CHECKING:
8
- from natural_pdf.extraction.result import StructuredDataResult
9
9
  from natural_pdf.core.page import Page
10
10
  from natural_pdf.elements.base import Element
11
+ from natural_pdf.extraction.result import StructuredDataResult
11
12
 
12
13
  logger = logging.getLogger(__name__)
13
14
 
14
- DEFAULT_STRUCTURED_KEY = "default-structured" # Define default key
15
+ DEFAULT_STRUCTURED_KEY = "structured" # Define default key
16
+
15
17
 
16
18
  class ExtractionMixin(ABC):
17
19
  """
@@ -19,7 +21,7 @@ class ExtractionMixin(ABC):
19
21
  Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
20
22
  """
21
23
 
22
- def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
24
+ def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
23
25
  """
24
26
  Retrieves the content (text or image) for extraction.
25
27
 
@@ -32,26 +34,26 @@ class ExtractionMixin(ABC):
32
34
  PIL.Image.Image: Rendered image if using='vision'
33
35
  None: If content cannot be retrieved
34
36
  """
35
- if not hasattr(self, 'extract_text') or not callable(self.extract_text):
36
- logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
37
- return None
38
- if not hasattr(self, 'to_image') or not callable(self.to_image):
39
- logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
40
- return None
41
-
37
+ if not hasattr(self, "extract_text") or not callable(self.extract_text):
38
+ logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
39
+ return None
40
+ if not hasattr(self, "to_image") or not callable(self.to_image):
41
+ logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
42
+ return None
43
+
42
44
  try:
43
- if using == 'text':
44
- layout = kwargs.pop('layout', True)
45
+ if using == "text":
46
+ layout = kwargs.pop("layout", True)
45
47
  return self.extract_text(layout=layout, **kwargs)
46
- elif using == 'vision':
47
- resolution = kwargs.pop('resolution', 72)
48
- include_highlights = kwargs.pop('include_highlights', False)
49
- labels = kwargs.pop('labels', False)
48
+ elif using == "vision":
49
+ resolution = kwargs.pop("resolution", 72)
50
+ include_highlights = kwargs.pop("include_highlights", False)
51
+ labels = kwargs.pop("labels", False)
50
52
  return self.to_image(
51
- resolution=resolution,
52
- include_highlights=include_highlights,
53
- labels=labels,
54
- **kwargs
53
+ resolution=resolution,
54
+ include_highlights=include_highlights,
55
+ labels=labels,
56
+ **kwargs,
55
57
  )
56
58
  else:
57
59
  logger.error(f"Unsupported value for 'using': {using}")
@@ -64,12 +66,12 @@ class ExtractionMixin(ABC):
64
66
  self: Any,
65
67
  schema: Type[BaseModel],
66
68
  client: Any,
67
- analysis_key: str = DEFAULT_STRUCTURED_KEY, # Default key
69
+ analysis_key: str = DEFAULT_STRUCTURED_KEY, # Default key
68
70
  prompt: Optional[str] = None,
69
- using: str = 'text',
71
+ using: str = "text",
70
72
  model: Optional[str] = None,
71
- overwrite: bool = False, # Add overwrite parameter
72
- **kwargs
73
+ overwrite: bool = False, # Add overwrite parameter
74
+ **kwargs,
73
75
  ) -> Any:
74
76
  """
75
77
  Extracts structured data according to the provided schema.
@@ -91,39 +93,52 @@ class ExtractionMixin(ABC):
91
93
  """
92
94
  if not analysis_key:
93
95
  raise ValueError("analysis_key cannot be empty for extract operation")
94
-
96
+
95
97
  # --- Overwrite Check --- #
96
- if not hasattr(self, 'analyses') or self.analyses is None:
98
+ if not hasattr(self, "analyses") or self.analyses is None:
97
99
  self.analyses = {}
98
-
100
+
99
101
  if analysis_key in self.analyses and not overwrite:
100
102
  raise ValueError(
101
103
  f"Analysis key '{analysis_key}' already exists in analyses. "
102
104
  f"Use overwrite=True to replace it. Available keys: {list(self.analyses.keys())}"
103
105
  )
104
106
  # --- End Overwrite Check --- #
105
-
107
+
106
108
  # Determine PDF instance to get manager
107
109
  pdf_instance = None
108
-
109
- if hasattr(self, 'get_manager') and callable(self.get_manager):
110
+
111
+ if hasattr(self, "get_manager") and callable(self.get_manager):
110
112
  # Handle case where self is the PDF instance itself
111
113
  pdf_instance = self
112
114
  logger.debug(f"Manager access via self ({type(self).__name__})")
113
- elif hasattr(self, 'pdf') and hasattr(self.pdf, 'get_manager') and callable(self.pdf.get_manager):
115
+ elif (
116
+ hasattr(self, "pdf")
117
+ and hasattr(self.pdf, "get_manager")
118
+ and callable(self.pdf.get_manager)
119
+ ):
114
120
  # Handle Page or other elements with direct .pdf reference
115
121
  pdf_instance = self.pdf
116
122
  logger.debug(f"Manager access via self.pdf ({type(self).__name__})")
117
- elif hasattr(self, 'page') and hasattr(self.page, 'pdf') and hasattr(self.page.pdf, 'get_manager') and callable(self.page.pdf.get_manager):
123
+ elif (
124
+ hasattr(self, "page")
125
+ and hasattr(self.page, "pdf")
126
+ and hasattr(self.page.pdf, "get_manager")
127
+ and callable(self.page.pdf.get_manager)
128
+ ):
118
129
  # Handle Region or other elements with .page.pdf reference
119
130
  pdf_instance = self.page.pdf
120
131
  logger.debug(f"Manager access via self.page.pdf ({type(self).__name__})")
121
132
  else:
122
- logger.error(f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf")
123
- raise RuntimeError(f"Cannot access PDF manager: {type(self).__name__} lacks necessary references")
124
-
133
+ logger.error(
134
+ f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf"
135
+ )
136
+ raise RuntimeError(
137
+ f"Cannot access PDF manager: {type(self).__name__} lacks necessary references"
138
+ )
139
+
125
140
  try:
126
- manager = pdf_instance.get_manager('structured_data')
141
+ manager = pdf_instance.get_manager("structured_data")
127
142
  except Exception as e:
128
143
  raise RuntimeError(f"Failed to get StructuredDataManager: {e}")
129
144
 
@@ -131,18 +146,23 @@ class ExtractionMixin(ABC):
131
146
  raise RuntimeError("StructuredDataManager is not available")
132
147
 
133
148
  # Get content
134
- layout_for_text = kwargs.pop('layout', True)
135
- content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs) # Pass kwargs
149
+ layout_for_text = kwargs.pop("layout", True)
150
+ content = self._get_extraction_content(
151
+ using=using, layout=layout_for_text, **kwargs
152
+ ) # Pass kwargs
136
153
 
137
- if content is None or (using == 'text' and isinstance(content, str) and not content.strip()):
154
+ if content is None or (
155
+ using == "text" and isinstance(content, str) and not content.strip()
156
+ ):
138
157
  logger.warning(f"No content available for extraction (using='{using}') on {self!r}")
139
158
  # Import here to avoid circularity at module level
140
- from natural_pdf.extraction.result import StructuredDataResult
159
+ from natural_pdf.extraction.result import StructuredDataResult
160
+
141
161
  result = StructuredDataResult(
142
162
  data=None,
143
163
  success=False,
144
164
  error_message=f"No content available for extraction (using='{using}')",
145
- model=model # Use model requested, even if failed
165
+ model=model, # Use model requested, even if failed
146
166
  )
147
167
  else:
148
168
  result = manager.extract(
@@ -152,16 +172,20 @@ class ExtractionMixin(ABC):
152
172
  prompt=prompt,
153
173
  using=using,
154
174
  model=model,
155
- **kwargs
175
+ **kwargs,
156
176
  )
157
177
 
158
178
  # Store the result
159
179
  self.analyses[analysis_key] = result
160
- logger.info(f"Stored extraction result under key '{analysis_key}' (Success: {result.success})")
180
+ logger.info(
181
+ f"Stored extraction result under key '{analysis_key}' (Success: {result.success})"
182
+ )
161
183
 
162
184
  return self
163
185
 
164
- def extracted(self, field_name: Optional[str] = None, analysis_key: Optional[str] = None) -> Any:
186
+ def extracted(
187
+ self, field_name: Optional[str] = None, analysis_key: Optional[str] = None
188
+ ) -> Any:
165
189
  """
166
190
  Convenience method to access results from structured data extraction.
167
191
 
@@ -182,7 +206,7 @@ class ExtractionMixin(ABC):
182
206
  """
183
207
  target_key = analysis_key if analysis_key is not None else DEFAULT_STRUCTURED_KEY
184
208
 
185
- if not hasattr(self, 'analyses') or self.analyses is None:
209
+ if not hasattr(self, "analyses") or self.analyses is None:
186
210
  raise AttributeError(f"{type(self).__name__} object has no 'analyses' attribute yet.")
187
211
 
188
212
  if target_key not in self.analyses:
@@ -194,21 +218,28 @@ class ExtractionMixin(ABC):
194
218
 
195
219
  # Import here to avoid circularity and allow type checking
196
220
  from natural_pdf.extraction.result import StructuredDataResult
221
+
197
222
  result: StructuredDataResult = self.analyses[target_key]
198
223
 
199
224
  if not isinstance(result, StructuredDataResult):
200
- logger.warning(f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process.")
201
- raise TypeError(f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}")
225
+ logger.warning(
226
+ f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process."
227
+ )
228
+ raise TypeError(
229
+ f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}"
230
+ )
202
231
 
203
232
  if not result.success:
204
233
  raise ValueError(
205
234
  f"Stored result for '{target_key}' indicates a failed extraction attempt. "
206
235
  f"Error: {result.error_message}"
207
236
  )
208
-
237
+
209
238
  if result.data is None:
210
- # This case might occur if success=True but data is somehow None
211
- raise ValueError(f"Extraction result for '{target_key}' has no data available, despite success flag.")
239
+ # This case might occur if success=True but data is somehow None
240
+ raise ValueError(
241
+ f"Extraction result for '{target_key}' has no data available, despite success flag."
242
+ )
212
243
 
213
244
  if field_name is None:
214
245
  # Return the whole data object (Pydantic model instance or dict)
@@ -231,16 +262,18 @@ class ExtractionMixin(ABC):
231
262
  except AttributeError:
232
263
  # Try to get available fields from the object
233
264
  available_fields = []
234
- if hasattr(result.data, 'model_fields'): # Pydantic v2
265
+ if hasattr(result.data, "model_fields"): # Pydantic v2
235
266
  available_fields = list(result.data.model_fields.keys())
236
- elif hasattr(result.data, '__fields__'): # Pydantic v1
267
+ elif hasattr(result.data, "__fields__"): # Pydantic v1
237
268
  available_fields = list(result.data.__fields__.keys())
238
- elif hasattr(result.data, '__dict__'): # Fallback
269
+ elif hasattr(result.data, "__dict__"): # Fallback
239
270
  available_fields = list(result.data.__dict__.keys())
240
-
271
+
241
272
  raise AttributeError(
242
273
  f"Field/Attribute '{field_name}' not found on extracted object of type {type(result.data).__name__} "
243
274
  f"for key '{target_key}'. Available fields/attributes: {available_fields}"
244
275
  )
245
- except Exception as e: # Catch other potential errors during getattr
246
- raise TypeError(f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}") from e
276
+ except Exception as e: # Catch other potential errors during getattr
277
+ raise TypeError(
278
+ f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}"
279
+ ) from e
@@ -1,4 +1,5 @@
1
- from typing import Optional, TypeVar, Generic, Any
1
+ from typing import Any, Generic, Optional, TypeVar
2
+
2
3
  from pydantic import BaseModel, Field
3
4
 
4
5
  # Generic type for the Pydantic model used in the schema
@@ -8,30 +9,15 @@ T_Schema = TypeVar("T_Schema", bound=BaseModel)
8
9
  class StructuredDataResult(BaseModel, Generic[T_Schema]):
9
10
  """
10
11
  Represents the result of a structured data extraction operation.
11
-
12
+
12
13
  Contains the extracted data, success status, and error information.
13
14
  """
14
15
 
15
- data: Optional[T_Schema] = Field(
16
- None,
17
- description="Validated data model or None on failure"
18
- )
19
- success: bool = Field(
20
- ...,
21
- description="Whether extraction succeeded"
22
- )
23
- error_message: Optional[str] = Field(
24
- None,
25
- description="Error details if extraction failed"
26
- )
27
- raw_output: Optional[Any] = Field(
28
- None,
29
- description="Raw output from the language model"
30
- )
31
- model_used: Optional[str] = Field(
32
- None,
33
- description="Identifier of the language model used"
34
- )
16
+ data: Optional[T_Schema] = Field(None, description="Validated data model or None on failure")
17
+ success: bool = Field(..., description="Whether extraction succeeded")
18
+ error_message: Optional[str] = Field(None, description="Error details if extraction failed")
19
+ raw_output: Optional[Any] = Field(None, description="Raw output from the language model")
20
+ model_used: Optional[str] = Field(None, description="Identifier of the language model used")
35
21
 
36
22
  class Config:
37
- arbitrary_types_allowed = True
23
+ arbitrary_types_allowed = True