natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +241 -158
- natural_pdf/classification/mixin.py +52 -38
- natural_pdf/classification/results.py +71 -45
- natural_pdf/collections/mixins.py +85 -20
- natural_pdf/collections/pdf_collection.py +245 -100
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +694 -195
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +610 -134
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
- natural_pdf-0.1.10.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
natural_pdf/__init__.py
CHANGED
@@ -98,7 +98,7 @@ class LayoutDetector(ABC):
|
|
98
98
|
self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
|
99
99
|
# Remove potentially corrupted cache entry
|
100
100
|
self._model_cache.pop(cache_key, None)
|
101
|
-
raise
|
101
|
+
raise
|
102
102
|
else:
|
103
103
|
self.logger.debug(f"Using cached model for key: {cache_key}")
|
104
104
|
return self._model_cache[cache_key]
|
@@ -135,7 +135,6 @@ class LayoutDetector(ABC):
|
|
135
135
|
return
|
136
136
|
|
137
137
|
if classes:
|
138
|
-
# Normalize both requested and supported classes for comparison
|
139
138
|
normalized_supported = {self._normalize_class_name(c) for c in self.supported_classes}
|
140
139
|
normalized_requested = {self._normalize_class_name(c) for c in classes}
|
141
140
|
unsupported_normalized = normalized_requested - normalized_supported
|
@@ -153,7 +152,4 @@ class LayoutDetector(ABC):
|
|
153
152
|
def __del__(self):
|
154
153
|
"""Cleanup resources."""
|
155
154
|
self.logger.info(f"Cleaning up {self.__class__.__name__} resources.")
|
156
|
-
# Clear model cache to free up memory/GPU resources if models are large
|
157
|
-
# Consider implications if models are shared or expensive to reload
|
158
|
-
# del self._model_cache # Optional: uncomment if models should be released aggressively
|
159
155
|
self._model_cache.clear()
|
@@ -1,13 +1,13 @@
|
|
1
1
|
# layout_detector_gemini.py
|
2
|
+
import base64
|
2
3
|
import importlib.util
|
4
|
+
import io
|
3
5
|
import logging
|
4
6
|
import os
|
5
7
|
from typing import Any, Dict, List, Optional
|
6
|
-
import base64
|
7
|
-
import io
|
8
8
|
|
9
|
-
from pydantic import BaseModel, Field
|
10
9
|
from PIL import Image
|
10
|
+
from pydantic import BaseModel, Field
|
11
11
|
|
12
12
|
# Use OpenAI library for interaction
|
13
13
|
try:
|
@@ -53,10 +53,8 @@ logger = logging.getLogger(__name__)
|
|
53
53
|
# This is used by the openai library's `response_format`
|
54
54
|
class DetectedRegion(BaseModel):
|
55
55
|
label: str = Field(description="The identified class name.")
|
56
|
-
bbox: List[float] = Field(
|
57
|
-
|
58
|
-
)
|
59
|
-
confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)
|
56
|
+
bbox: List[float] = Field(description="Bounding box coordinates [xmin, ymin, xmax, ymax].")
|
57
|
+
confidence: float = Field(description="Confidence score [0.0, 1.0].")
|
60
58
|
|
61
59
|
|
62
60
|
class GeminiLayoutDetector(LayoutDetector):
|
@@ -70,16 +68,10 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
70
68
|
self.supported_classes = set() # Indicate dynamic nature
|
71
69
|
|
72
70
|
def is_available(self) -> bool:
|
73
|
-
"""Check if openai library is installed
|
74
|
-
api_key = os.environ.get("GOOGLE_API_KEY")
|
75
|
-
if not api_key:
|
76
|
-
logger.warning(
|
77
|
-
"GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available."
|
78
|
-
)
|
79
|
-
return False
|
71
|
+
"""Check if openai library is installed."""
|
80
72
|
if OpenAI is None:
|
81
73
|
logger.warning(
|
82
|
-
"openai package not found. Gemini detector (via OpenAI lib) will not be available."
|
74
|
+
"openai package not found. Gemini detector (via OpenAI lib) will not be available. Run: pip install openai"
|
83
75
|
)
|
84
76
|
return False
|
85
77
|
return True
|
@@ -96,44 +88,65 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
96
88
|
def _load_model_from_options(self, options: GeminiLayoutOptions) -> Any:
|
97
89
|
"""Validate options and return the model name."""
|
98
90
|
if not self.is_available():
|
99
|
-
raise RuntimeError(
|
100
|
-
"OpenAI library not installed or GOOGLE_API_KEY not set. Please run: pip install openai"
|
101
|
-
)
|
91
|
+
raise RuntimeError("OpenAI library not installed. Please run: pip install openai")
|
102
92
|
|
103
93
|
if not isinstance(options, GeminiLayoutOptions):
|
104
94
|
raise TypeError("Incorrect options type provided for Gemini model loading.")
|
105
95
|
|
106
|
-
#
|
96
|
+
# Model loading is deferred to detect() based on whether a client is provided
|
107
97
|
return options.model_name
|
108
98
|
|
109
99
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
110
100
|
"""Detect layout elements in an image using Gemini via OpenAI library."""
|
111
101
|
if not self.is_available():
|
112
|
-
|
102
|
+
# The is_available check now only confirms library presence
|
103
|
+
raise RuntimeError("OpenAI library not installed. Please run: pip install openai")
|
113
104
|
|
114
105
|
# Ensure options are the correct type
|
115
|
-
|
106
|
+
final_options: GeminiLayoutOptions
|
107
|
+
if isinstance(options, GeminiLayoutOptions):
|
108
|
+
final_options = options
|
109
|
+
else:
|
110
|
+
# If base options are passed, try to convert, keeping extra_args
|
111
|
+
# Note: This won't transfer a 'client' if it was somehow attached to BaseLayoutOptions
|
116
112
|
self.logger.warning(
|
117
|
-
"Received BaseLayoutOptions, expected GeminiLayoutOptions.
|
113
|
+
"Received BaseLayoutOptions, expected GeminiLayoutOptions. Converting and using defaults."
|
118
114
|
)
|
119
|
-
|
115
|
+
final_options = GeminiLayoutOptions(
|
120
116
|
confidence=options.confidence,
|
121
117
|
classes=options.classes,
|
122
118
|
exclude_classes=options.exclude_classes,
|
123
|
-
device=options.device,
|
119
|
+
device=options.device, # device is not used by Gemini detector currently
|
124
120
|
extra_args=options.extra_args,
|
121
|
+
# client will be None here, forcing default client creation below
|
125
122
|
)
|
126
123
|
|
127
|
-
model_name = self._get_model(
|
128
|
-
api_key = os.environ.get("GOOGLE_API_KEY")
|
129
|
-
|
124
|
+
model_name = self._get_model(final_options)
|
130
125
|
detections = []
|
126
|
+
|
131
127
|
try:
|
132
|
-
# --- 1. Initialize OpenAI Client
|
133
|
-
client
|
128
|
+
# --- 1. Initialize OpenAI Client ---
|
129
|
+
client: Optional[OpenAI] = None
|
130
|
+
# Use the provided client instance
|
131
|
+
if hasattr(final_options.client, "beta") and hasattr(
|
132
|
+
final_options.client.beta.chat.completions, "parse"
|
133
|
+
):
|
134
|
+
client = final_options.client
|
135
|
+
logger.debug("Using provided client instance.")
|
136
|
+
else:
|
137
|
+
logger.error(
|
138
|
+
"Provided client does not seem compatible (missing beta.chat.completions.parse)."
|
139
|
+
)
|
140
|
+
raise TypeError(
|
141
|
+
"Provided client is not compatible with the expected OpenAI interface."
|
142
|
+
)
|
143
|
+
|
144
|
+
if not client:
|
145
|
+
# This should not happen if logic above is correct, but as a safeguard
|
146
|
+
raise RuntimeError("Failed to obtain a valid client for Gemini detection.")
|
134
147
|
|
135
148
|
# --- 2. Prepare Input for OpenAI API ---
|
136
|
-
if not
|
149
|
+
if not final_options.classes:
|
137
150
|
logger.error("Gemini layout detection requires a list of classes to find.")
|
138
151
|
return []
|
139
152
|
|
@@ -145,15 +158,13 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
145
158
|
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
146
159
|
image_url = f"data:image/png;base64,{img_base64}"
|
147
160
|
|
148
|
-
|
149
|
-
class_list_str = ", ".join(f"`{c}`" for c in options.classes)
|
161
|
+
class_list_str = ", ".join(f"`{c}`" for c in final_options.classes)
|
150
162
|
prompt_text = (
|
151
163
|
f"Analyze the provided image of a document page ({width}x{height}). "
|
152
164
|
f"Identify all regions corresponding to the following types: {class_list_str}. "
|
153
|
-
f"Return ONLY the structured data requested."
|
165
|
+
f"Return ONLY the structured data requested as formatted JSON."
|
154
166
|
)
|
155
167
|
|
156
|
-
# Prepare messages for chat completions endpoint
|
157
168
|
messages = [
|
158
169
|
{
|
159
170
|
"role": "user",
|
@@ -167,27 +178,26 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
167
178
|
}
|
168
179
|
]
|
169
180
|
|
170
|
-
# --- 3. Call OpenAI API using .parse for structured output ---
|
171
181
|
logger.debug(
|
172
|
-
f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {
|
182
|
+
f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {final_options.classes}"
|
173
183
|
)
|
174
184
|
|
175
|
-
# Extract relevant generation parameters from extra_args if provided
|
176
|
-
# Mapping common names: temperature, top_p, max_tokens
|
177
185
|
completion_kwargs = {
|
178
|
-
"temperature":
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
), # Map from max_output_tokens
|
186
|
+
"temperature": final_options.extra_args.get(
|
187
|
+
"temperature", 0.0
|
188
|
+
), # Default to low temp
|
189
|
+
"max_tokens": final_options.extra_args.get("max_tokens", 4096),
|
183
190
|
}
|
184
|
-
|
191
|
+
|
185
192
|
completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
|
186
193
|
|
194
|
+
class ImageContents(BaseModel):
|
195
|
+
regions: List[DetectedRegion]
|
196
|
+
|
187
197
|
completion: ChatCompletion = client.beta.chat.completions.parse(
|
188
198
|
model=model_name,
|
189
199
|
messages=messages,
|
190
|
-
response_format=
|
200
|
+
response_format=ImageContents,
|
191
201
|
**completion_kwargs,
|
192
202
|
)
|
193
203
|
|
@@ -199,7 +209,7 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
199
209
|
return []
|
200
210
|
|
201
211
|
# Get the parsed Pydantic objects
|
202
|
-
parsed_results = completion.choices[0].message.parsed
|
212
|
+
parsed_results = completion.choices[0].message.parsed.regions
|
203
213
|
if not parsed_results or not isinstance(parsed_results, list):
|
204
214
|
logger.error(
|
205
215
|
f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}"
|
@@ -207,10 +217,10 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
207
217
|
return []
|
208
218
|
|
209
219
|
# --- 5. Convert to Detections & Filter ---
|
210
|
-
normalized_classes_req = {self._normalize_class_name(c) for c in
|
220
|
+
normalized_classes_req = {self._normalize_class_name(c) for c in final_options.classes}
|
211
221
|
normalized_classes_excl = (
|
212
|
-
{self._normalize_class_name(c) for c in
|
213
|
-
if
|
222
|
+
{self._normalize_class_name(c) for c in final_options.exclude_classes}
|
223
|
+
if final_options.exclude_classes
|
214
224
|
else set()
|
215
225
|
)
|
216
226
|
|
@@ -242,9 +252,9 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
242
252
|
continue
|
243
253
|
|
244
254
|
# Check against base confidence threshold from options
|
245
|
-
if confidence_score <
|
255
|
+
if confidence_score < final_options.confidence:
|
246
256
|
logger.debug(
|
247
|
-
f"Skipping item with confidence {confidence_score:.3f} below threshold {
|
257
|
+
f"Skipping item with confidence {confidence_score:.3f} below threshold {final_options.confidence}."
|
248
258
|
)
|
249
259
|
continue
|
250
260
|
|
@@ -7,6 +7,7 @@ from PIL import Image
|
|
7
7
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
8
8
|
from natural_pdf.analyzers.layout.layout_options import (
|
9
9
|
BaseLayoutOptions,
|
10
|
+
GeminiLayoutOptions,
|
10
11
|
LayoutOptions,
|
11
12
|
TATRLayoutOptions,
|
12
13
|
)
|
@@ -82,10 +83,10 @@ class LayoutAnalyzer:
|
|
82
83
|
f" Rendering page {self._page.number} to image for initial layout detection..."
|
83
84
|
)
|
84
85
|
try:
|
85
|
-
layout_scale = getattr(self._page._parent, "_config", {}).get("layout_image_scale", 1.
|
86
|
+
layout_scale = getattr(self._page._parent, "_config", {}).get("layout_image_scale", 1.0)
|
86
87
|
layout_resolution = layout_scale * 72
|
87
88
|
std_res_page_image = self._page.to_image(
|
88
|
-
resolution=layout_resolution, include_highlights=False
|
89
|
+
resolution=layout_resolution, include_highlights=False, scale=1.0
|
89
90
|
)
|
90
91
|
if not std_res_page_image:
|
91
92
|
raise ValueError("Initial page rendering returned None")
|
@@ -110,12 +111,11 @@ class LayoutAnalyzer:
|
|
110
111
|
final_options: BaseLayoutOptions
|
111
112
|
|
112
113
|
if options is not None:
|
113
|
-
# User provided a complete options object, use it directly
|
114
114
|
logger.debug("Using user-provided options object.")
|
115
115
|
final_options = copy.deepcopy(options) # Copy to avoid modifying original user object
|
116
116
|
if kwargs:
|
117
117
|
logger.warning(
|
118
|
-
f"Ignoring
|
118
|
+
f"Ignoring simple mode keyword arguments {list(kwargs.keys())} because a full options object was provided."
|
119
119
|
)
|
120
120
|
# Infer engine from options type if engine arg wasn't provided
|
121
121
|
if engine is None:
|
@@ -145,16 +145,39 @@ class LayoutAnalyzer:
|
|
145
145
|
# Get base defaults
|
146
146
|
base_defaults = BaseLayoutOptions()
|
147
147
|
|
148
|
+
# Separate client from other kwargs
|
149
|
+
client_instance = kwargs.pop("client", None) # Get client, remove from kwargs
|
150
|
+
|
151
|
+
# Separate model_name if provided for Gemini
|
152
|
+
model_name_kwarg = None
|
153
|
+
if issubclass(options_class, GeminiLayoutOptions):
|
154
|
+
model_name_kwarg = kwargs.pop("model_name", None)
|
155
|
+
|
148
156
|
# Prepare args for constructor, prioritizing explicit args over defaults
|
149
157
|
constructor_args = {
|
150
158
|
"confidence": confidence if confidence is not None else base_defaults.confidence,
|
151
159
|
"classes": classes, # Pass None if not provided
|
152
160
|
"exclude_classes": exclude_classes, # Pass None if not provided
|
153
161
|
"device": device if device is not None else base_defaults.device,
|
154
|
-
|
162
|
+
# Pass client explicitly if constructing Gemini options
|
163
|
+
# Note: We check issubclass *before* calling constructor
|
164
|
+
**(
|
165
|
+
{"client": client_instance}
|
166
|
+
if client_instance and issubclass(options_class, GeminiLayoutOptions)
|
167
|
+
else {}
|
168
|
+
),
|
169
|
+
# Pass model_name explicitly if constructing Gemini options and it was provided
|
170
|
+
**(
|
171
|
+
{"model_name": model_name_kwarg}
|
172
|
+
if model_name_kwarg and issubclass(options_class, GeminiLayoutOptions)
|
173
|
+
else {}
|
174
|
+
),
|
175
|
+
"extra_args": kwargs, # Pass REMAINING kwargs here
|
155
176
|
}
|
156
177
|
# Remove None values unless they are valid defaults (like classes=None)
|
157
178
|
# We can pass all to the dataclass constructor; it handles defaults
|
179
|
+
# **Filter constructor_args to remove None values that aren't defaults?**
|
180
|
+
# For simplicity, let dataclass handle it for now.
|
158
181
|
|
159
182
|
try:
|
160
183
|
final_options = options_class(**constructor_args)
|
@@ -167,24 +190,30 @@ class LayoutAnalyzer:
|
|
167
190
|
# Re-raise for now, indicates programming error or invalid kwarg.
|
168
191
|
raise e
|
169
192
|
|
170
|
-
# --- Add Internal Context to extra_args (
|
193
|
+
# --- Add Internal Context to extra_args (Applies to the final_options object) ---
|
171
194
|
if not hasattr(final_options, "extra_args") or final_options.extra_args is None:
|
195
|
+
# Ensure extra_args exists, potentially overwriting if needed
|
196
|
+
final_options.extra_args = {}
|
197
|
+
elif not isinstance(final_options.extra_args, dict):
|
198
|
+
logger.warning(
|
199
|
+
f"final_options.extra_args was not a dict ({type(final_options.extra_args)}), replacing with internal context."
|
200
|
+
)
|
172
201
|
final_options.extra_args = {}
|
202
|
+
|
173
203
|
final_options.extra_args["_page_ref"] = self._page
|
174
204
|
final_options.extra_args["_img_scale_x"] = img_scale_x
|
175
205
|
final_options.extra_args["_img_scale_y"] = img_scale_y
|
176
206
|
logger.debug(
|
177
|
-
f"Added internal context
|
207
|
+
f"Added/updated internal context in final_options.extra_args: {final_options.extra_args}"
|
178
208
|
)
|
179
209
|
|
180
|
-
# --- Call Layout Manager with
|
210
|
+
# --- Call Layout Manager (ALWAYS with options object) ---
|
181
211
|
logger.debug(f"Calling Layout Manager with final options object.")
|
182
212
|
try:
|
183
|
-
#
|
213
|
+
# ALWAYS pass the constructed/modified options object
|
184
214
|
detections = self._layout_manager.analyze_layout(
|
185
215
|
image=std_res_page_image,
|
186
|
-
options=final_options,
|
187
|
-
# No engine, confidence, classes etc. passed here directly
|
216
|
+
options=final_options, # Pass the final object with internal context
|
188
217
|
)
|
189
218
|
logger.info(f" Layout Manager returned {len(detections)} detections.")
|
190
219
|
# Specifically let errors about unknown/unavailable engines propagate
|
@@ -96,9 +96,6 @@ class LayoutManager:
|
|
96
96
|
"options_class": GeminiLayoutOptions,
|
97
97
|
}
|
98
98
|
|
99
|
-
# Define the limited set of kwargs allowed for the simple analyze_layout call
|
100
|
-
SIMPLE_MODE_ALLOWED_KWARGS = {"engine", "confidence", "classes", "exclude_classes", "device"}
|
101
|
-
|
102
99
|
def __init__(self):
|
103
100
|
"""Initializes the Layout Manager."""
|
104
101
|
# Cache for detector instances (different from model cache inside detector)
|
@@ -145,109 +142,54 @@ class LayoutManager:
|
|
145
142
|
def analyze_layout(
|
146
143
|
self,
|
147
144
|
image: Image.Image,
|
148
|
-
|
149
|
-
options: Optional[LayoutOptions] = None,
|
150
|
-
**kwargs,
|
145
|
+
options: LayoutOptions,
|
151
146
|
) -> List[Dict[str, Any]]:
|
152
147
|
"""
|
153
|
-
Analyzes layout of a single image using
|
148
|
+
Analyzes layout of a single image using a specific options object.
|
154
149
|
|
155
150
|
Args:
|
156
151
|
image: The PIL Image to analyze.
|
157
|
-
|
158
|
-
|
159
|
-
options: Specific LayoutOptions object for advanced configuration.
|
160
|
-
**kwargs: For simple mode, accepts: 'confidence', 'classes',
|
161
|
-
'exclude_classes', 'device'.
|
152
|
+
options: Specific LayoutOptions object containing configuration and context.
|
153
|
+
This object MUST be provided.
|
162
154
|
|
163
155
|
Returns:
|
164
156
|
A list of standardized detection dictionaries.
|
165
157
|
"""
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
if not
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
# --- Determine Options and Engine ---
|
180
|
-
if options is not None:
|
181
|
-
# Advanced Mode: An options object was provided directly (or constructed by LayoutAnalyzer)
|
182
|
-
# Use this object directly, do not deep copy or reconstruct.
|
183
|
-
logger.debug(f"LayoutManager: Using provided options object: {type(options).__name__}")
|
184
|
-
final_options = options # Use the provided object directly
|
185
|
-
found_engine = False
|
186
|
-
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
187
|
-
if isinstance(options, registry_entry["options_class"]):
|
188
|
-
selected_engine_name = name
|
189
|
-
found_engine = True
|
190
|
-
break
|
191
|
-
if not found_engine:
|
192
|
-
raise TypeError(
|
193
|
-
f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options."
|
194
|
-
)
|
195
|
-
# Ignore simple kwargs if options object is present
|
196
|
-
if kwargs:
|
197
|
-
logger.warning(
|
198
|
-
f"Keyword arguments {list(kwargs.keys())} were provided alongside an 'options' object and will be ignored."
|
199
|
-
)
|
200
|
-
else:
|
201
|
-
# Simple Mode: No options object provided initially.
|
202
|
-
# Determine engine from kwargs or default, then construct options.
|
203
|
-
selected_engine_name = default_engine.lower()
|
204
|
-
logger.debug(
|
205
|
-
f"LayoutManager: Using simple mode. Engine: '{selected_engine_name}', kwargs: {kwargs}"
|
158
|
+
selected_engine_name: Optional[str] = None
|
159
|
+
found_engine = False
|
160
|
+
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
161
|
+
if isinstance(options, registry_entry["options_class"]):
|
162
|
+
selected_engine_name = name
|
163
|
+
found_engine = True
|
164
|
+
break
|
165
|
+
if not found_engine or selected_engine_name is None:
|
166
|
+
available_options_types = [
|
167
|
+
reg["options_class"].__name__ for reg in self.ENGINE_REGISTRY.values()
|
168
|
+
]
|
169
|
+
raise TypeError(
|
170
|
+
f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options: {available_options_types}"
|
206
171
|
)
|
207
172
|
|
208
|
-
if selected_engine_name not in self.ENGINE_REGISTRY:
|
209
|
-
raise ValueError(
|
210
|
-
f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}"
|
211
|
-
)
|
212
|
-
|
213
|
-
unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
|
214
|
-
if unexpected_kwargs:
|
215
|
-
raise TypeError(
|
216
|
-
f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration."
|
217
|
-
)
|
218
|
-
|
219
|
-
options_class = self.ENGINE_REGISTRY[selected_engine_name]["options_class"]
|
220
|
-
# Use BaseLayoutOptions defaults unless overridden by kwargs
|
221
|
-
base_defaults = BaseLayoutOptions()
|
222
|
-
simple_args = {
|
223
|
-
"confidence": kwargs.get("confidence", base_defaults.confidence),
|
224
|
-
"classes": kwargs.get("classes"),
|
225
|
-
"exclude_classes": kwargs.get("exclude_classes"),
|
226
|
-
"device": kwargs.get("device", base_defaults.device),
|
227
|
-
}
|
228
|
-
# Filter out None values before passing to constructor
|
229
|
-
simple_args_filtered = {k: v for k, v in simple_args.items() if v is not None}
|
230
|
-
final_options = options_class(**simple_args_filtered)
|
231
|
-
logger.debug(f"LayoutManager: Constructed options for simple mode: {final_options}")
|
232
|
-
|
233
|
-
# --- Get Engine Instance and Process ---
|
234
173
|
try:
|
235
174
|
engine_instance = self._get_engine_instance(selected_engine_name)
|
236
175
|
logger.info(f"Analyzing layout with engine '{selected_engine_name}'...")
|
237
176
|
|
238
|
-
|
239
|
-
detections = engine_instance.detect(image, final_options)
|
177
|
+
detections = engine_instance.detect(image, options) # Pass options directly
|
240
178
|
|
241
179
|
logger.info(f"Layout analysis complete. Found {len(detections)} regions.")
|
242
180
|
return detections
|
243
181
|
|
244
182
|
except (ImportError, RuntimeError, ValueError, TypeError) as e:
|
245
|
-
|
246
|
-
|
247
|
-
)
|
183
|
+
# Add engine name to error message if possible
|
184
|
+
engine_context = f" for engine '{selected_engine_name}'" if selected_engine_name else ""
|
185
|
+
logger.error(f"Layout analysis failed{engine_context}: {e}", exc_info=True)
|
248
186
|
raise # Re-raise expected errors
|
249
187
|
except Exception as e:
|
250
|
-
|
188
|
+
engine_context = f" for engine '{selected_engine_name}'" if selected_engine_name else ""
|
189
|
+
logger.error(
|
190
|
+
f"An unexpected error occurred during layout analysis{engine_context}: {e}",
|
191
|
+
exc_info=True,
|
192
|
+
)
|
251
193
|
raise # Re-raise unexpected errors
|
252
194
|
|
253
195
|
def get_available_engines(self) -> List[str]:
|
@@ -43,6 +43,12 @@ class TATRLayoutOptions(BaseLayoutOptions):
|
|
43
43
|
max_structure_size: int = 1000
|
44
44
|
# Whether to create cell regions (can be slow)
|
45
45
|
create_cells: bool = True
|
46
|
+
# Image enhancement options
|
47
|
+
enhance_contrast: float = 1.5 # Contrast enhancement factor (1.0 = no change)
|
48
|
+
# Special thresholds for specific elements
|
49
|
+
column_threshold: Optional[float] = (
|
50
|
+
None # Lower threshold for columns (default: confidence * 0.8)
|
51
|
+
)
|
46
52
|
|
47
53
|
|
48
54
|
# --- Paddle Specific Options ---
|
@@ -86,6 +92,7 @@ class GeminiLayoutOptions(BaseLayoutOptions):
|
|
86
92
|
"""Options specific to Gemini-based layout detection (using OpenAI compatibility)."""
|
87
93
|
|
88
94
|
model_name: str = "gemini-2.0-flash"
|
95
|
+
client: Optional[Any] = None # Allow passing a pre-configured client
|
89
96
|
# Removed: prompt_template, temperature, top_p, max_output_tokens
|
90
97
|
# These are typically passed directly to the chat completion call or via extra_args
|
91
98
|
|