natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +230 -151
- natural_pdf/classification/mixin.py +49 -35
- natural_pdf/classification/results.py +64 -46
- natural_pdf/collections/mixins.py +68 -20
- natural_pdf/collections/pdf_collection.py +177 -64
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +633 -190
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +503 -131
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/text.py
CHANGED
@@ -274,7 +274,7 @@ class TextElement(Element):
 
         return False
 
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:
         """String representation of the text element."""
         if self.text:
             preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
natural_pdf/export/mixin.py
ADDED
@@ -0,0 +1,137 @@
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+
+logger = logging.getLogger(__name__)
+
+
+class ExportMixin:
+    """
+    Mixin for exporting analyses from collections of elements.
+
+    This mixin is designed to be used with PDF, PDFCollection,
+    PageCollection, and ElementCollection classes.
+    """
+
+    def export_analyses(
+        self,
+        output_path: str,
+        analysis_keys: Union[str, List[str]],
+        format: str = "json",
+        include_content: bool = True,
+        include_images: bool = False,
+        image_dir: Optional[str] = None,
+        image_format: str = "jpg",
+        image_resolution: int = 72,
+        overwrite: bool = True,
+        **kwargs,
+    ) -> str:
+        """
+        Export analysis results to a file.
+
+        Args:
+            output_path: Path to save the export file
+            analysis_keys: Key(s) in the analyses dictionary to export
+            format: Export format ('json', 'csv', 'excel')
+            include_content: Whether to include extracted text
+            include_images: Whether to export images of elements
+            image_dir: Directory to save images (created if doesn't exist)
+            image_format: Format to save images ('jpg', 'png')
+            image_resolution: Resolution for exported images
+            overwrite: Whether to overwrite existing files
+            **kwargs: Additional format-specific options
+
+        Returns:
+            Path to the exported file
+        """
+        # Convert single key to list for consistency
+        if isinstance(analysis_keys, str):
+            analysis_keys = [analysis_keys]
+
+        # Create output directory
+        output_path = Path(output_path)
+        os.makedirs(output_path.parent, exist_ok=True)
+
+        # Check if file exists and handle overwrite
+        if output_path.exists() and not overwrite:
+            raise FileExistsError(f"Output file {output_path} already exists and overwrite=False")
+
+        # Prepare image directory if needed
+        if include_images:
+            if image_dir is None:
+                image_dir = output_path.parent / f"{output_path.stem}_images"
+            os.makedirs(image_dir, exist_ok=True)
+            image_dir = Path(image_dir)  # Convert to Path object
+
+        # Gather data from collection
+        data = self._gather_analysis_data(
+            analysis_keys=analysis_keys,
+            include_content=include_content,
+            include_images=include_images,
+            image_dir=image_dir,
+            image_format=image_format,
+            image_resolution=image_resolution,
+        )
+
+        # Export based on format
+        if format.lower() == "json":
+            return self._export_to_json(data, output_path, **kwargs)
+        elif format.lower() == "csv":
+            return self._export_to_csv(data, output_path, **kwargs)
+        elif format.lower() == "excel":
+            return self._export_to_excel(data, output_path, **kwargs)
+        else:
+            raise ValueError(f"Unsupported export format: {format}")
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from elements in the collection.
+
+        This method should be implemented by each collection class.
+        """
+        raise NotImplementedError("Subclasses must implement _gather_analysis_data")
+
+    def _export_to_json(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to JSON format."""
+        with open(output_path, "w") as f:
+            json.dump(data, f, indent=2, **kwargs)
+        logger.info(f"Exported analysis data to {output_path}")
+        return str(output_path)
+
+    def _export_to_csv(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to CSV format."""
+        try:
+            import pandas as pd
+
+            # Normalize nested data
+            df = pd.json_normalize(data)
+            df.to_csv(output_path, index=False, **kwargs)
+            logger.info(f"Exported analysis data to {output_path}")
+            return str(output_path)
+        except ImportError:
+            raise ImportError("Pandas is required for CSV export. Install with: pip install pandas")
+
+    def _export_to_excel(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to Excel format."""
+        try:
+            import pandas as pd
+
+            # Normalize nested data
+            df = pd.json_normalize(data)
+            df.to_excel(output_path, index=False, **kwargs)
+            logger.info(f"Exported analysis data to {output_path}")
+            return str(output_path)
+        except ImportError:
+            raise ImportError(
+                "Pandas and openpyxl are required for Excel export. Install with: pip install pandas openpyxl"
+            )
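A minimal usage sketch for the new ExportMixin, assuming a collection class that implements _gather_analysis_data (the release wires this mixin into PDF and the collection classes); the file path and analysis key here are hypothetical:

    from natural_pdf import PDF

    pdf = PDF("report.pdf")  # hypothetical input document
    # ... run an analysis that stores its result under a key in .analyses ...

    # Export the stored analysis; "json" needs only the stdlib,
    # "csv" requires pandas, "excel" requires pandas + openpyxl.
    out = pdf.export_analyses(
        output_path="out/analysis.json",
        analysis_keys="my-analysis-key",  # hypothetical key
        format="json",
    )
    print(out)  # path of the written file, as a string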
natural_pdf/exporters/base.py
CHANGED
@@ -1,10 +1,10 @@
 import abc
 import logging
-from typing import
+from typing import TYPE_CHECKING, List, Union
 
 if TYPE_CHECKING:
-    from natural_pdf.core.pdf import PDF
     from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.core.pdf import PDF
 
 logger = logging.getLogger(__name__)
 
@@ -40,8 +40,8 @@ class FinetuneExporter(abc.ABC):
         """
         Helper to consistently resolve the input source to a list of PDF objects.
         """
-        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
         from natural_pdf.collections.pdf_collection import PDFCollection  # Avoid circular import
+        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
 
         pdfs_to_process: List["PDF"] = []
         if isinstance(source, PDF):
natural_pdf/exporters/paddleocr.py
CHANGED
@@ -1,8 +1,9 @@
-import os
 import logging
+import os
 import random
 import shutil
-from typing import
+from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
+
 from tqdm import tqdm
 
 from natural_pdf.exporters.base import FinetuneExporter
@@ -11,8 +12,8 @@ from natural_pdf.exporters.base import FinetuneExporter
 from natural_pdf.utils.identifiers import generate_short_path_hash
 
 if TYPE_CHECKING:
-    from natural_pdf.core.pdf import PDF
     from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.core.pdf import PDF
     from natural_pdf.elements.text import TextElement
 
 logger = logging.getLogger(__name__)
natural_pdf/extraction/manager.py
CHANGED
@@ -1,9 +1,10 @@
-import logging
-from typing import Any, Type, Optional
-from pydantic import BaseModel
-import io
 import base64
+import io
+import logging
+from typing import Any, Optional, Type
+
 from PIL import Image
+from pydantic import BaseModel
 
 from natural_pdf.extraction.result import StructuredDataResult
 
@@ -29,47 +30,52 @@ class StructuredDataManager:
         """Checks if necessary dependencies are available."""
         try:
             import pydantic
+
            return True
        except ImportError:
            logger.warning("Pydantic is required for structured data extraction.")
            return False

    def _prepare_llm_messages(
-        self,
-        content: Any,
-        prompt: Optional[str],
-        using: str,
-        schema: Type[BaseModel]
+        self, content: Any, prompt: Optional[str], using: str, schema: Type[BaseModel]
    ) -> list:
        """Prepares the message list for the LLM API call."""
-        system_prompt =
-
-
-
-
-
-
+        system_prompt = (
+            prompt
+            or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."
+        )
+
+        messages = [{"role": "system", "content": system_prompt}]
+
+        if using == "text":
            messages.append({"role": "user", "content": str(content)})
-        elif using ==
+        elif using == "vision":
            if isinstance(content, Image.Image):
                buffered = io.BytesIO()
                content.save(buffered, format="PNG")
                base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
-                messages.append(
-
-
-
-
-
-
-
-
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "Extract information from this image based on the schema.",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/png;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                )
            else:
-                raise TypeError(
+                raise TypeError(
+                    f"Content must be a PIL Image for using='vision', got {type(content)}"
+                )
        else:
-
-
+            raise ValueError(f"Unsupported value for 'using': {using}")
+
        return messages

    def extract(
@@ -78,9 +84,9 @@ class StructuredDataManager:
        schema: Type[BaseModel],
        client: Any,
        prompt: Optional[str] = None,
-        using: str =
+        using: str = "text",
        model: Optional[str] = None,
-        **kwargs
+        **kwargs,
    ) -> StructuredDataResult:
        """
        Extract structured data from content using an LLM.
@@ -99,36 +105,31 @@ class StructuredDataManager:
        """
        logger.debug(f"Extract request: using='{using}', schema='{schema.__name__}'")

-        if isinstance(content, list) and using ==
+        if isinstance(content, list) and using == "vision":
            if len(content) == 1:
                content = content[0]
            elif len(content) > 1:
                logger.error("Vision extraction not supported for multi-page PDFs")
-                raise NotImplementedError(
-
-
+                raise NotImplementedError(
+                    "Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead."
+                )
+
+        selected_model = model or (
+            self.DEFAULT_VISION_MODEL if using == "vision" else self.DEFAULT_TEXT_MODEL
+        )
        messages = self._prepare_llm_messages(content, prompt, using, schema)

        try:
            logger.debug(f"Extracting with model '{selected_model}'")
            completion = client.beta.chat.completions.parse(
-                model=selected_model,
-                messages=messages,
-                response_format=schema,
-                **kwargs
+                model=selected_model, messages=messages, response_format=schema, **kwargs
            )
            parsed_data = completion.choices[0].message.parsed
            return StructuredDataResult(
-                data=parsed_data,
-                success=True,
-                error_message=None,
-                model=selected_model
+                data=parsed_data, success=True, error_message=None, model=selected_model
            )
        except Exception as e:
            logger.error(f"Extraction failed: {str(e)}")
            return StructuredDataResult(
-                data=None,
-                success=False,
-                error_message=str(e),
-                model=selected_model
-            )
+                data=None, success=False, error_message=str(e), model=selected_model
+            )
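A sketch of how StructuredDataManager is driven, assuming an OpenAI-style client (the code calls client.beta.chat.completions.parse); the Invoice schema and file name are hypothetical:

    from natural_pdf import PDF
    from openai import OpenAI
    from pydantic import BaseModel

    class Invoice(BaseModel):  # hypothetical schema
        vendor: str
        total: float

    pdf = PDF("invoice.pdf")  # hypothetical input
    manager = pdf.get_manager("structured_data")

    result = manager.extract(
        content=pdf.pages[0].extract_text(),
        schema=Invoice,
        client=OpenAI(),
        using="text",  # "vision" expects a single PIL image instead
    )
    if result.success:
        print(result.data.vendor, result.data.total)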
natural_pdf/extraction/mixin.py
CHANGED
@@ -1,17 +1,19 @@
 import logging
-from typing import TYPE_CHECKING, Any, Type, Optional
 from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Optional, Type
+
 from pydantic import BaseModel
 
 # Avoid circular import
 if TYPE_CHECKING:
-    from natural_pdf.extraction.result import StructuredDataResult
     from natural_pdf.core.page import Page
     from natural_pdf.elements.base import Element
+    from natural_pdf.extraction.result import StructuredDataResult
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_STRUCTURED_KEY = "
+DEFAULT_STRUCTURED_KEY = "structured"  # Define default key
+
 
 class ExtractionMixin(ABC):
     """
@@ -19,7 +21,7 @@ class ExtractionMixin(ABC):
     Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
     """
 
-    def _get_extraction_content(self, using: str =
+    def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
        """
        Retrieves the content (text or image) for extraction.

@@ -32,26 +34,26 @@ class ExtractionMixin(ABC):
            PIL.Image.Image: Rendered image if using='vision'
            None: If content cannot be retrieved
        """
-        if not hasattr(self,
-
-
-        if not hasattr(self,
-
-
-
+        if not hasattr(self, "extract_text") or not callable(self.extract_text):
+            logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
+            return None
+        if not hasattr(self, "to_image") or not callable(self.to_image):
+            logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
+            return None
+
        try:
-            if using ==
-                layout = kwargs.pop(
+            if using == "text":
+                layout = kwargs.pop("layout", True)
                return self.extract_text(layout=layout, **kwargs)
-            elif using ==
-                resolution = kwargs.pop(
-                include_highlights = kwargs.pop(
-                labels = kwargs.pop(
+            elif using == "vision":
+                resolution = kwargs.pop("resolution", 72)
+                include_highlights = kwargs.pop("include_highlights", False)
+                labels = kwargs.pop("labels", False)
                return self.to_image(
-                    resolution=resolution,
-                    include_highlights=include_highlights,
-                    labels=labels,
-                    **kwargs
+                    resolution=resolution,
+                    include_highlights=include_highlights,
+                    labels=labels,
+                    **kwargs,
                )
            else:
                logger.error(f"Unsupported value for 'using': {using}")
@@ -64,12 +66,12 @@ class ExtractionMixin(ABC):
        self: Any,
        schema: Type[BaseModel],
        client: Any,
-        analysis_key: str = DEFAULT_STRUCTURED_KEY,
+        analysis_key: str = DEFAULT_STRUCTURED_KEY,  # Default key
        prompt: Optional[str] = None,
-        using: str =
+        using: str = "text",
        model: Optional[str] = None,
-        overwrite: bool = False,
-        **kwargs
+        overwrite: bool = False,  # Add overwrite parameter
+        **kwargs,
    ) -> Any:
        """
        Extracts structured data according to the provided schema.
@@ -91,39 +93,52 @@ class ExtractionMixin(ABC):
        """
        if not analysis_key:
            raise ValueError("analysis_key cannot be empty for extract operation")
-
+
        # --- Overwrite Check --- #
-        if not hasattr(self,
+        if not hasattr(self, "analyses") or self.analyses is None:
            self.analyses = {}
-
+
        if analysis_key in self.analyses and not overwrite:
            raise ValueError(
                f"Analysis key '{analysis_key}' already exists in analyses. "
                f"Use overwrite=True to replace it. Available keys: {list(self.analyses.keys())}"
            )
        # --- End Overwrite Check --- #
-
+
        # Determine PDF instance to get manager
        pdf_instance = None
-
-        if hasattr(self,
+
+        if hasattr(self, "get_manager") and callable(self.get_manager):
            # Handle case where self is the PDF instance itself
            pdf_instance = self
            logger.debug(f"Manager access via self ({type(self).__name__})")
-        elif
+        elif (
+            hasattr(self, "pdf")
+            and hasattr(self.pdf, "get_manager")
+            and callable(self.pdf.get_manager)
+        ):
            # Handle Page or other elements with direct .pdf reference
            pdf_instance = self.pdf
            logger.debug(f"Manager access via self.pdf ({type(self).__name__})")
-        elif
+        elif (
+            hasattr(self, "page")
+            and hasattr(self.page, "pdf")
+            and hasattr(self.page.pdf, "get_manager")
+            and callable(self.page.pdf.get_manager)
+        ):
            # Handle Region or other elements with .page.pdf reference
            pdf_instance = self.page.pdf
            logger.debug(f"Manager access via self.page.pdf ({type(self).__name__})")
        else:
-            logger.error(
-
-
+            logger.error(
+                f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf"
+            )
+            raise RuntimeError(
+                f"Cannot access PDF manager: {type(self).__name__} lacks necessary references"
+            )
+
        try:
-            manager = pdf_instance.get_manager(
+            manager = pdf_instance.get_manager("structured_data")
        except Exception as e:
            raise RuntimeError(f"Failed to get StructuredDataManager: {e}")
 
@@ -131,18 +146,23 @@ class ExtractionMixin(ABC):
            raise RuntimeError("StructuredDataManager is not available")

        # Get content
-        layout_for_text = kwargs.pop(
-        content = self._get_extraction_content(
+        layout_for_text = kwargs.pop("layout", True)
+        content = self._get_extraction_content(
+            using=using, layout=layout_for_text, **kwargs
+        )  # Pass kwargs

-        if content is None or (
+        if content is None or (
+            using == "text" and isinstance(content, str) and not content.strip()
+        ):
            logger.warning(f"No content available for extraction (using='{using}') on {self!r}")
            # Import here to avoid circularity at module level
-            from natural_pdf.extraction.result import StructuredDataResult
+            from natural_pdf.extraction.result import StructuredDataResult
+
            result = StructuredDataResult(
                data=None,
                success=False,
                error_message=f"No content available for extraction (using='{using}')",
-                model=model
+                model=model,  # Use model requested, even if failed
            )
        else:
            result = manager.extract(
@@ -152,16 +172,20 @@ class ExtractionMixin(ABC):
                prompt=prompt,
                using=using,
                model=model,
-                **kwargs
+                **kwargs,
            )

        # Store the result
        self.analyses[analysis_key] = result
-        logger.info(
+        logger.info(
+            f"Stored extraction result under key '{analysis_key}' (Success: {result.success})"
+        )

        return self

-    def extracted(
+    def extracted(
+        self, field_name: Optional[str] = None, analysis_key: Optional[str] = None
+    ) -> Any:
        """
        Convenience method to access results from structured data extraction.

@@ -182,7 +206,7 @@ class ExtractionMixin(ABC):
        """
        target_key = analysis_key if analysis_key is not None else DEFAULT_STRUCTURED_KEY

-        if not hasattr(self,
+        if not hasattr(self, "analyses") or self.analyses is None:
            raise AttributeError(f"{type(self).__name__} object has no 'analyses' attribute yet.")

        if target_key not in self.analyses:
@@ -194,21 +218,28 @@ class ExtractionMixin(ABC):

        # Import here to avoid circularity and allow type checking
        from natural_pdf.extraction.result import StructuredDataResult
+
        result: StructuredDataResult = self.analyses[target_key]

        if not isinstance(result, StructuredDataResult):
-            logger.warning(
-
+            logger.warning(
+                f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process."
+            )
+            raise TypeError(
+                f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}"
+            )

        if not result.success:
            raise ValueError(
                f"Stored result for '{target_key}' indicates a failed extraction attempt. "
                f"Error: {result.error_message}"
            )
-
+
        if result.data is None:
-
-
+            # This case might occur if success=True but data is somehow None
+            raise ValueError(
+                f"Extraction result for '{target_key}' has no data available, despite success flag."
+            )

        if field_name is None:
            # Return the whole data object (Pydantic model instance or dict)
@@ -231,16 +262,18 @@ class ExtractionMixin(ABC):
        except AttributeError:
            # Try to get available fields from the object
            available_fields = []
-            if hasattr(result.data,
+            if hasattr(result.data, "model_fields"):  # Pydantic v2
                available_fields = list(result.data.model_fields.keys())
-            elif hasattr(result.data,
+            elif hasattr(result.data, "__fields__"):  # Pydantic v1
                available_fields = list(result.data.__fields__.keys())
-            elif hasattr(result.data,
+            elif hasattr(result.data, "__dict__"):  # Fallback
                available_fields = list(result.data.__dict__.keys())
-
+
            raise AttributeError(
                f"Field/Attribute '{field_name}' not found on extracted object of type {type(result.data).__name__} "
                f"for key '{target_key}'. Available fields/attributes: {available_fields}"
            )
-        except Exception as e:
-
+        except Exception as e:  # Catch other potential errors during getattr
+            raise TypeError(
+                f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}"
+            ) from e
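Taken together with the manager above, this mixin gives pages, regions, and whole PDFs an extract-then-read workflow. A minimal sketch, again with a hypothetical schema and input file:

    from natural_pdf import PDF
    from openai import OpenAI
    from pydantic import BaseModel

    class Summary(BaseModel):  # hypothetical schema
        title: str
        date: str

    page = PDF("memo.pdf").pages[0]  # hypothetical input

    # Runs the LLM extraction and stores a StructuredDataResult under
    # analyses["structured"] (the new DEFAULT_STRUCTURED_KEY); pass
    # overwrite=True to replace an existing result at the same key.
    page.extract(schema=Summary, client=OpenAI(), using="text")

    whole = page.extracted()         # the full Summary instance
    title = page.extracted("title")  # one field; raises if missing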
natural_pdf/extraction/result.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import
+from typing import Any, Generic, Optional, TypeVar
+
 from pydantic import BaseModel, Field
 
 # Generic type for the Pydantic model used in the schema
@@ -8,30 +9,15 @@ T_Schema = TypeVar("T_Schema", bound=BaseModel)
 class StructuredDataResult(BaseModel, Generic[T_Schema]):
     """
     Represents the result of a structured data extraction operation.
-
+
     Contains the extracted data, success status, and error information.
     """
 
-    data: Optional[T_Schema] = Field(
-        None,
-        description="Validated data model or None on failure"
-    )
-    success: bool = Field(
-        ...,
-        description="Whether extraction succeeded"
-    )
-    error_message: Optional[str] = Field(
-        None,
-        description="Error details if extraction failed"
-    )
-    raw_output: Optional[Any] = Field(
-        None,
-        description="Raw output from the language model"
-    )
-    model_used: Optional[str] = Field(
-        None,
-        description="Identifier of the language model used"
-    )
+    data: Optional[T_Schema] = Field(None, description="Validated data model or None on failure")
+    success: bool = Field(..., description="Whether extraction succeeded")
+    error_message: Optional[str] = Field(None, description="Error details if extraction failed")
+    raw_output: Optional[Any] = Field(None, description="Raw output from the language model")
+    model_used: Optional[str] = Field(None, description="Identifier of the language model used")
 
     class Config:
-        arbitrary_types_allowed = True
+        arbitrary_types_allowed = True
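For reference, a sketch of the flattened result model in isolation; the Person schema and values are hypothetical, and raw_output/model_used simply default to None:

    from pydantic import BaseModel

    from natural_pdf.extraction.result import StructuredDataResult

    class Person(BaseModel):  # hypothetical schema
        name: str

    # Success and failure cases; only data/success/error_message are set here.
    ok = StructuredDataResult(data=Person(name="Ada"), success=True)
    failed = StructuredDataResult(data=None, success=False, error_message="LLM call failed")

    print(ok.data.name)          # Ada
    print(failed.error_message)  # LLM call failed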