natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/text.py
CHANGED
@@ -276,7 +276,10 @@ class TextElement(Element):
 
     def __repr__(self) -> str:
         """String representation of the text element."""
-        preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
+        if self.text:
+            preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
+        else:
+            preview = "..."
         font_style = []
         if self.bold:
             font_style.append("bold")
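The change guards against `text` being `None` or empty, which previously made `repr()` raise a `TypeError` from `len(None)`. A standalone sketch of the new preview logic (the helper name is hypothetical, for illustration only):

```python
def preview(text):
    # Mirrors the new __repr__ branch: fall back to "..." when text is falsy.
    if text:
        return text[:10] + "..." if len(text) > 10 else text
    return "..."

print(preview("Hello, natural-pdf"))  # -> 'Hello, nat...'
print(preview(None))                  # -> '...' (the old one-liner raised TypeError)
```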
natural_pdf/export/mixin.py
ADDED
@@ -0,0 +1,137 @@
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+
+logger = logging.getLogger(__name__)
+
+
+class ExportMixin:
+    """
+    Mixin for exporting analyses from collections of elements.
+
+    This mixin is designed to be used with PDF, PDFCollection,
+    PageCollection, and ElementCollection classes.
+    """
+
+    def export_analyses(
+        self,
+        output_path: str,
+        analysis_keys: Union[str, List[str]],
+        format: str = "json",
+        include_content: bool = True,
+        include_images: bool = False,
+        image_dir: Optional[str] = None,
+        image_format: str = "jpg",
+        image_resolution: int = 72,
+        overwrite: bool = True,
+        **kwargs,
+    ) -> str:
+        """
+        Export analysis results to a file.
+
+        Args:
+            output_path: Path to save the export file
+            analysis_keys: Key(s) in the analyses dictionary to export
+            format: Export format ('json', 'csv', 'excel')
+            include_content: Whether to include extracted text
+            include_images: Whether to export images of elements
+            image_dir: Directory to save images (created if doesn't exist)
+            image_format: Format to save images ('jpg', 'png')
+            image_resolution: Resolution for exported images
+            overwrite: Whether to overwrite existing files
+            **kwargs: Additional format-specific options
+
+        Returns:
+            Path to the exported file
+        """
+        # Convert single key to list for consistency
+        if isinstance(analysis_keys, str):
+            analysis_keys = [analysis_keys]
+
+        # Create output directory
+        output_path = Path(output_path)
+        os.makedirs(output_path.parent, exist_ok=True)
+
+        # Check if file exists and handle overwrite
+        if output_path.exists() and not overwrite:
+            raise FileExistsError(f"Output file {output_path} already exists and overwrite=False")
+
+        # Prepare image directory if needed
+        if include_images:
+            if image_dir is None:
+                image_dir = output_path.parent / f"{output_path.stem}_images"
+            os.makedirs(image_dir, exist_ok=True)
+            image_dir = Path(image_dir)  # Convert to Path object
+
+        # Gather data from collection
+        data = self._gather_analysis_data(
+            analysis_keys=analysis_keys,
+            include_content=include_content,
+            include_images=include_images,
+            image_dir=image_dir,
+            image_format=image_format,
+            image_resolution=image_resolution,
+        )
+
+        # Export based on format
+        if format.lower() == "json":
+            return self._export_to_json(data, output_path, **kwargs)
+        elif format.lower() == "csv":
+            return self._export_to_csv(data, output_path, **kwargs)
+        elif format.lower() == "excel":
+            return self._export_to_excel(data, output_path, **kwargs)
+        else:
+            raise ValueError(f"Unsupported export format: {format}")
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from elements in the collection.
+
+        This method should be implemented by each collection class.
+        """
+        raise NotImplementedError("Subclasses must implement _gather_analysis_data")
+
+    def _export_to_json(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to JSON format."""
+        with open(output_path, "w") as f:
+            json.dump(data, f, indent=2, **kwargs)
+        logger.info(f"Exported analysis data to {output_path}")
+        return str(output_path)
+
+    def _export_to_csv(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to CSV format."""
+        try:
+            import pandas as pd
+
+            # Normalize nested data
+            df = pd.json_normalize(data)
+            df.to_csv(output_path, index=False, **kwargs)
+            logger.info(f"Exported analysis data to {output_path}")
+            return str(output_path)
+        except ImportError:
+            raise ImportError("Pandas is required for CSV export. Install with: pip install pandas")
+
+    def _export_to_excel(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to Excel format."""
+        try:
+            import pandas as pd
+
+            # Normalize nested data
+            df = pd.json_normalize(data)
+            df.to_excel(output_path, index=False, **kwargs)
+            logger.info(f"Exported analysis data to {output_path}")
+            return str(output_path)
+        except ImportError:
+            raise ImportError(
+                "Pandas and openpyxl are required for Excel export. Install with: pip install pandas openpyxl"
+            )
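`export_analyses` normalizes `analysis_keys` to a list, gathers rows via the abstract `_gather_analysis_data` hook, then dispatches on `format`; only JSON works without pandas. A usage sketch, assuming a collection class that implements the gather hook (the file list shows matching changes in `pdf.py` and the collection modules); the file names and analysis key here are hypothetical:

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical input
# ... run analyses that populate each element's `analyses` dict ...

out = pdf.export_analyses(
    output_path="out/report_analyses.json",
    analysis_keys="structured",  # a str is wrapped into a list internally
    format="json",               # "csv"/"excel" require pandas (and openpyxl)
)
print(out)  # path to the exported file
```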
natural_pdf/exporters/base.py
CHANGED
@@ -1,10 +1,10 @@
 import abc
 import logging
-from typing import
+from typing import TYPE_CHECKING, List, Union
 
 if TYPE_CHECKING:
-    from natural_pdf.core.pdf import PDF
     from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.core.pdf import PDF
 
 logger = logging.getLogger(__name__)
 
@@ -40,8 +40,8 @@ class FinetuneExporter(abc.ABC):
         """
         Helper to consistently resolve the input source to a list of PDF objects.
         """
-        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
         from natural_pdf.collections.pdf_collection import PDFCollection  # Avoid circular import
+        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
 
         pdfs_to_process: List["PDF"] = []
         if isinstance(source, PDF):
natural_pdf/exporters/paddleocr.py
CHANGED
@@ -1,8 +1,9 @@
-import os
 import logging
+import os
 import random
 import shutil
-from typing import
+from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
+
 from tqdm import tqdm
 
 from natural_pdf.exporters.base import FinetuneExporter
@@ -11,8 +12,8 @@ from natural_pdf.exporters.base import FinetuneExporter
 from natural_pdf.utils.identifiers import generate_short_path_hash
 
 if TYPE_CHECKING:
-    from natural_pdf.core.pdf import PDF
     from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.core.pdf import PDF
     from natural_pdf.elements.text import TextElement
 
 logger = logging.getLogger(__name__)
@@ -48,7 +49,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
             selector: CSS-like selector to filter which TextElements to export.
                 If None and corrected_only is False, all 'text' elements are considered.
             corrected_only: If True, overrides selector and exports only elements likely
-                originating from a correction manifest (selector="text[source
+                originating from a correction manifest (selector="text[source=manifest]").
                 (default: False).
             split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
                 If None, creates a single `label.txt` file (default: 0.9).
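Per the repaired docstring line, `corrected_only=True` is shorthand for filtering with `selector="text[source=manifest]"`. A hedged construction sketch using only the parameters documented above (the exporter's remaining arguments and its export entry point are not shown in this diff):

```python
from natural_pdf.exporters.paddleocr import PaddleOCRRecognitionExporter

# Export only elements that came from a correction manifest,
# split 90/10 into training and validation sets.
exporter = PaddleOCRRecognitionExporter(corrected_only=True, split_ratio=0.9)
```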
natural_pdf/extraction/manager.py
ADDED
@@ -0,0 +1,135 @@
+import base64
+import io
+import logging
+from typing import Any, Optional, Type
+
+from PIL import Image
+from pydantic import BaseModel
+
+from natural_pdf.extraction.result import StructuredDataResult
+
+logger = logging.getLogger(__name__)
+
+
+class StructuredDataManager:
+    """
+    Manages the process of extracting structured data from elements using LLMs.
+
+    This manager is typically accessed via `pdf.get_manager('structured_data')`.
+    It is stateless and relies on parameters passed during method calls.
+    """
+
+    DEFAULT_TEXT_MODEL = "gpt-4o-mini"
+    DEFAULT_VISION_MODEL = "gpt-4o"
+
+    def __init__(self):
+        """Initializes the manager."""
+        logger.info("Initialized StructuredDataManager.")
+
+    def is_available(self) -> bool:
+        """Checks if necessary dependencies are available."""
+        try:
+            import pydantic
+
+            return True
+        except ImportError:
+            logger.warning("Pydantic is required for structured data extraction.")
+            return False
+
+    def _prepare_llm_messages(
+        self, content: Any, prompt: Optional[str], using: str, schema: Type[BaseModel]
+    ) -> list:
+        """Prepares the message list for the LLM API call."""
+        system_prompt = (
+            prompt
+            or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."
+        )
+
+        messages = [{"role": "system", "content": system_prompt}]
+
+        if using == "text":
+            messages.append({"role": "user", "content": str(content)})
+        elif using == "vision":
+            if isinstance(content, Image.Image):
+                buffered = io.BytesIO()
+                content.save(buffered, format="PNG")
+                base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "Extract information from this image based on the schema.",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/png;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                )
+            else:
+                raise TypeError(
+                    f"Content must be a PIL Image for using='vision', got {type(content)}"
+                )
+        else:
+            raise ValueError(f"Unsupported value for 'using': {using}")
+
+        return messages
+
+    def extract(
+        self,
+        content: Any,
+        schema: Type[BaseModel],
+        client: Any,
+        prompt: Optional[str] = None,
+        using: str = "text",
+        model: Optional[str] = None,
+        **kwargs,
+    ) -> StructuredDataResult:
+        """
+        Extract structured data from content using an LLM.
+
+        Args:
+            content: Text string or Image object
+            schema: Pydantic model class for the desired structure
+            client: Initialized LLM client (e.g., OpenAI client)
+            prompt: Optional user-provided instructions
+            using: Modality ('text' or 'vision')
+            model: Specific LLM model identifier
+            **kwargs: Additional parameters for the LLM API call
+
+        Returns:
+            StructuredDataResult object
+        """
+        logger.debug(f"Extract request: using='{using}', schema='{schema.__name__}'")
+
+        if isinstance(content, list) and using == "vision":
+            if len(content) == 1:
+                content = content[0]
+            elif len(content) > 1:
+                logger.error("Vision extraction not supported for multi-page PDFs")
+                raise NotImplementedError(
+                    "Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead."
+                )
+
+        selected_model = model or (
+            self.DEFAULT_VISION_MODEL if using == "vision" else self.DEFAULT_TEXT_MODEL
+        )
+        messages = self._prepare_llm_messages(content, prompt, using, schema)
+
+        try:
+            logger.debug(f"Extracting with model '{selected_model}'")
+            completion = client.beta.chat.completions.parse(
+                model=selected_model, messages=messages, response_format=schema, **kwargs
+            )
+            parsed_data = completion.choices[0].message.parsed
+            return StructuredDataResult(
+                data=parsed_data, success=True, error_message=None, model=selected_model
+            )
+        except Exception as e:
+            logger.error(f"Extraction failed: {str(e)}")
+            return StructuredDataResult(
+                data=None, success=False, error_message=str(e), model=selected_model
+            )
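The manager delegates to the OpenAI structured-output endpoint (`client.beta.chat.completions.parse`), so any client object exposing that method should work. A sketch under those assumptions; the schema and file name are made up, and `pdf.get_manager("structured_data")` follows the class docstring:

```python
from openai import OpenAI
from pydantic import BaseModel

from natural_pdf import PDF

class Invoice(BaseModel):
    vendor: str
    total: float

pdf = PDF("invoice.pdf")                      # hypothetical input
manager = pdf.get_manager("structured_data")  # as documented above

result = manager.extract(
    content=pdf.pages[0].extract_text(),
    schema=Invoice,
    client=OpenAI(),  # assumes OPENAI_API_KEY is set in the environment
)
print(result.data if result.success else result.error_message)
```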
natural_pdf/extraction/mixin.py
ADDED
@@ -0,0 +1,279 @@
+import logging
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Optional, Type
+
+from pydantic import BaseModel
+
+# Avoid circular import
+if TYPE_CHECKING:
+    from natural_pdf.core.page import Page
+    from natural_pdf.elements.base import Element
+    from natural_pdf.extraction.result import StructuredDataResult
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_STRUCTURED_KEY = "structured"  # Define default key
+
+
+class ExtractionMixin(ABC):
+    """
+    Mixin class providing structured data extraction capabilities to elements.
+    Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
+    """
+
+    def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
+        """
+        Retrieves the content (text or image) for extraction.
+
+        Args:
+            using: 'text' or 'vision'
+            **kwargs: Additional arguments passed to extract_text or to_image
+
+        Returns:
+            str: Extracted text if using='text'
+            PIL.Image.Image: Rendered image if using='vision'
+            None: If content cannot be retrieved
+        """
+        if not hasattr(self, "extract_text") or not callable(self.extract_text):
+            logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
+            return None
+        if not hasattr(self, "to_image") or not callable(self.to_image):
+            logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
+            return None
+
+        try:
+            if using == "text":
+                layout = kwargs.pop("layout", True)
+                return self.extract_text(layout=layout, **kwargs)
+            elif using == "vision":
+                resolution = kwargs.pop("resolution", 72)
+                include_highlights = kwargs.pop("include_highlights", False)
+                labels = kwargs.pop("labels", False)
+                return self.to_image(
+                    resolution=resolution,
+                    include_highlights=include_highlights,
+                    labels=labels,
+                    **kwargs,
+                )
+            else:
+                logger.error(f"Unsupported value for 'using': {using}")
+                return None
+        except Exception as e:
+            logger.error(f"Error getting {using} content from {self!r}: {e}")
+            return None
+
+    def extract(
+        self: Any,
+        schema: Type[BaseModel],
+        client: Any,
+        analysis_key: str = DEFAULT_STRUCTURED_KEY,  # Default key
+        prompt: Optional[str] = None,
+        using: str = "text",
+        model: Optional[str] = None,
+        overwrite: bool = False,  # Add overwrite parameter
+        **kwargs,
+    ) -> Any:
+        """
+        Extracts structured data according to the provided schema.
+
+        Results are stored in the element's `analyses` dictionary.
+
+        Args:
+            schema: Pydantic model class defining the desired structure
+            client: Initialized LLM client
+            analysis_key: Key to store the result under in `analyses`. Defaults to "default-structured".
+            prompt: Optional user-provided prompt for the LLM
+            using: Modality ('text' or 'vision')
+            model: Optional specific LLM model identifier
+            overwrite: If True, allow overwriting an existing result at `analysis_key`.
+            **kwargs: Additional parameters for extraction
+
+        Returns:
+            Self for method chaining
+        """
+        if not analysis_key:
+            raise ValueError("analysis_key cannot be empty for extract operation")
+
+        # --- Overwrite Check --- #
+        if not hasattr(self, "analyses") or self.analyses is None:
+            self.analyses = {}
+
+        if analysis_key in self.analyses and not overwrite:
+            raise ValueError(
+                f"Analysis key '{analysis_key}' already exists in analyses. "
+                f"Use overwrite=True to replace it. Available keys: {list(self.analyses.keys())}"
+            )
+        # --- End Overwrite Check --- #
+
+        # Determine PDF instance to get manager
+        pdf_instance = None
+
+        if hasattr(self, "get_manager") and callable(self.get_manager):
+            # Handle case where self is the PDF instance itself
+            pdf_instance = self
+            logger.debug(f"Manager access via self ({type(self).__name__})")
+        elif (
+            hasattr(self, "pdf")
+            and hasattr(self.pdf, "get_manager")
+            and callable(self.pdf.get_manager)
+        ):
+            # Handle Page or other elements with direct .pdf reference
+            pdf_instance = self.pdf
+            logger.debug(f"Manager access via self.pdf ({type(self).__name__})")
+        elif (
+            hasattr(self, "page")
+            and hasattr(self.page, "pdf")
+            and hasattr(self.page.pdf, "get_manager")
+            and callable(self.page.pdf.get_manager)
+        ):
+            # Handle Region or other elements with .page.pdf reference
+            pdf_instance = self.page.pdf
+            logger.debug(f"Manager access via self.page.pdf ({type(self).__name__})")
+        else:
+            logger.error(
+                f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf"
+            )
+            raise RuntimeError(
+                f"Cannot access PDF manager: {type(self).__name__} lacks necessary references"
+            )
+
+        try:
+            manager = pdf_instance.get_manager("structured_data")
+        except Exception as e:
+            raise RuntimeError(f"Failed to get StructuredDataManager: {e}")
+
+        if not manager or not manager.is_available():
+            raise RuntimeError("StructuredDataManager is not available")
+
+        # Get content
+        layout_for_text = kwargs.pop("layout", True)
+        content = self._get_extraction_content(
+            using=using, layout=layout_for_text, **kwargs
+        )  # Pass kwargs
+
+        if content is None or (
+            using == "text" and isinstance(content, str) and not content.strip()
+        ):
+            logger.warning(f"No content available for extraction (using='{using}') on {self!r}")
+            # Import here to avoid circularity at module level
+            from natural_pdf.extraction.result import StructuredDataResult
+
+            result = StructuredDataResult(
+                data=None,
+                success=False,
+                error_message=f"No content available for extraction (using='{using}')",
+                model=model,  # Use model requested, even if failed
+            )
+        else:
+            result = manager.extract(
+                content=content,
+                schema=schema,
+                client=client,
+                prompt=prompt,
+                using=using,
+                model=model,
+                **kwargs,
+            )
+
+        # Store the result
+        self.analyses[analysis_key] = result
+        logger.info(
+            f"Stored extraction result under key '{analysis_key}' (Success: {result.success})"
+        )
+
+        return self
+
+    def extracted(
+        self, field_name: Optional[str] = None, analysis_key: Optional[str] = None
+    ) -> Any:
+        """
+        Convenience method to access results from structured data extraction.
+
+        Args:
+            field_name: The specific field to retrieve from the extracted data dictionary.
+                If None, returns the entire data dictionary.
+            analysis_key: The key under which the extraction result was stored in `analyses`.
+                If None, defaults to "default-structured".
+
+        Returns:
+            The requested field value, the entire data dictionary, or raises an error.
+
+        Raises:
+            KeyError: If the specified `analysis_key` is not found in `analyses`.
+            ValueError: If the stored result for `analysis_key` indicates a failed extraction.
+            AttributeError: If the element does not have an `analyses` attribute.
+            KeyError: (Standard Python) If `field_name` is specified but not found in the data.
+        """
+        target_key = analysis_key if analysis_key is not None else DEFAULT_STRUCTURED_KEY
+
+        if not hasattr(self, "analyses") or self.analyses is None:
+            raise AttributeError(f"{type(self).__name__} object has no 'analyses' attribute yet.")
+
+        if target_key not in self.analyses:
+            available_keys = list(self.analyses.keys())
+            raise KeyError(
+                f"Extraction '{target_key}' not found in analyses. "
+                f"Available extractions: {available_keys}"
+            )
+
+        # Import here to avoid circularity and allow type checking
+        from natural_pdf.extraction.result import StructuredDataResult
+
+        result: StructuredDataResult = self.analyses[target_key]
+
+        if not isinstance(result, StructuredDataResult):
+            logger.warning(
+                f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process."
+            )
+            raise TypeError(
+                f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}"
+            )
+
+        if not result.success:
+            raise ValueError(
+                f"Stored result for '{target_key}' indicates a failed extraction attempt. "
+                f"Error: {result.error_message}"
+            )
+
+        if result.data is None:
+            # This case might occur if success=True but data is somehow None
+            raise ValueError(
+                f"Extraction result for '{target_key}' has no data available, despite success flag."
+            )
+
+        if field_name is None:
+            # Return the whole data object (Pydantic model instance or dict)
+            return result.data
+        else:
+            # Try dictionary key access first, then attribute access
+            if isinstance(result.data, dict):
+                try:
+                    return result.data[field_name]
+                except KeyError:
+                    available_keys = list(result.data.keys())
+                    raise KeyError(
+                        f"Field/Key '{field_name}' not found in extracted dictionary "
+                        f"for key '{target_key}'. Available keys: {available_keys}"
+                    )
+            else:
+                # Assume it's an object, try attribute access
+                try:
+                    return getattr(result.data, field_name)
+                except AttributeError:
+                    # Try to get available fields from the object
+                    available_fields = []
+                    if hasattr(result.data, "model_fields"):  # Pydantic v2
+                        available_fields = list(result.data.model_fields.keys())
+                    elif hasattr(result.data, "__fields__"):  # Pydantic v1
+                        available_fields = list(result.data.__fields__.keys())
+                    elif hasattr(result.data, "__dict__"):  # Fallback
+                        available_fields = list(result.data.__dict__.keys())
+
+                    raise AttributeError(
+                        f"Field/Attribute '{field_name}' not found on extracted object of type {type(result.data).__name__} "
+                        f"for key '{target_key}'. Available fields/attributes: {available_fields}"
+                    )
+                except Exception as e:  # Catch other potential errors during getattr
+                    raise TypeError(
+                        f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}"
+                    ) from e
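Because `extract` stores a `StructuredDataResult` under `analysis_key` and returns `self`, while `extracted` re-reads it (raising on failed results), the pair composes into a fluent workflow. A sketch assuming a `Page` that inherits this mixin, which matches the `self.pdf` / `self.page.pdf` manager lookups above; the schema and file name are made up:

```python
from openai import OpenAI
from pydantic import BaseModel

from natural_pdf import PDF

class Summary(BaseModel):
    title: str
    page_topic: str

page = PDF("paper.pdf").pages[0]  # hypothetical input

# Store the result under the default key ("structured"), then read it back.
page.extract(schema=Summary, client=OpenAI(), using="text")
print(page.extracted("title"))  # one field, via dict key or attribute access
print(page.extracted())         # the whole Summary instance
```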
natural_pdf/extraction/result.py
ADDED
@@ -0,0 +1,23 @@
+from typing import Any, Generic, Optional, TypeVar
+
+from pydantic import BaseModel, Field
+
+# Generic type for the Pydantic model used in the schema
+T_Schema = TypeVar("T_Schema", bound=BaseModel)
+
+
+class StructuredDataResult(BaseModel, Generic[T_Schema]):
+    """
+    Represents the result of a structured data extraction operation.
+
+    Contains the extracted data, success status, and error information.
+    """
+
+    data: Optional[T_Schema] = Field(None, description="Validated data model or None on failure")
+    success: bool = Field(..., description="Whether extraction succeeded")
+    error_message: Optional[str] = Field(None, description="Error details if extraction failed")
+    raw_output: Optional[Any] = Field(None, description="Raw output from the language model")
+    model_used: Optional[str] = Field(None, description="Identifier of the language model used")
+
+    class Config:
+        arbitrary_types_allowed = True
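Since the model is generic in `T_Schema`, a result can be parameterized by the schema it carries (subscripting a generic `BaseModel` this way requires pydantic v2). A small illustration with a made-up schema:

```python
from pydantic import BaseModel

from natural_pdf.extraction.result import StructuredDataResult

class Person(BaseModel):
    name: str

ok = StructuredDataResult[Person](data=Person(name="Ada"), success=True)
err = StructuredDataResult[Person](success=False, error_message="model refused")

print(ok.data.name)  # 'Ada'
print(err.data)      # None; error_message explains why
```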
natural_pdf/ocr/__init__.py
CHANGED
@@ -11,15 +11,15 @@ logger = logging.getLogger("natural_pdf.ocr")
 
 # Import the base classes that are always available
 from .engine import OCREngine
+from .ocr_factory import OCRFactory
+from .ocr_manager import OCRManager
 from .ocr_options import (
-    OCROptions,
     BaseOCROptions,
     EasyOCROptions,
+    OCROptions,
     PaddleOCROptions,
     SuryaOCROptions,
 )
-from .ocr_manager import OCRManager
-from .ocr_factory import OCRFactory
 
 # Add all public symbols that should be available when importing this module
 __all__ = [
@@ -41,7 +41,7 @@ def get_engine(engine_name=None, **kwargs):
     Get OCR engine by name with graceful handling of missing dependencies.
 
     Args:
-        engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya')
+        engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya', 'doctr')
            If None, the best available engine is used
        **kwargs: Additional arguments to pass to the engine constructor
 
@@ -63,7 +63,7 @@ def get_engine(engine_name=None, **kwargs):
 
     # Use the factory to create a specific engine
     normalized_name = engine_name.lower()
-    if normalized_name in ["easyocr", "paddle", "surya"]:
+    if normalized_name in ["easyocr", "paddle", "surya", "doctr"]:
         return OCRFactory.create_engine(normalized_name, **kwargs)
     else:
         raise ValueError(f"Unknown OCR engine: {engine_name}")
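
With doctr added to the accepted names, engine selection might look like the sketch below; `OCRFactory.create_engine` will still fail at runtime if the optional doctr dependency is not installed:

```python
from natural_pdf.ocr import get_engine

engine = get_engine()         # pick the best available engine
engine = get_engine("doctr")  # explicit; requires the doctr extra to be installed
```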
|