natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/__init__.py CHANGED
@@ -3,6 +3,9 @@ Natural PDF - A more intuitive interface for working with PDFs.
3
3
  """
4
4
 
5
5
  import logging
6
+ import os
7
+
8
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
6
9
 
7
10
  # Create library logger
8
11
  logger = logging.getLogger("natural_pdf")
@@ -98,7 +98,7 @@ class LayoutDetector(ABC):
98
98
  self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
99
99
  # Remove potentially corrupted cache entry
100
100
  self._model_cache.pop(cache_key, None)
101
- raise # Re-raise exception after logging
101
+ raise
102
102
  else:
103
103
  self.logger.debug(f"Using cached model for key: {cache_key}")
104
104
  return self._model_cache[cache_key]
@@ -135,7 +135,6 @@ class LayoutDetector(ABC):
135
135
  return
136
136
 
137
137
  if classes:
138
- # Normalize both requested and supported classes for comparison
139
138
  normalized_supported = {self._normalize_class_name(c) for c in self.supported_classes}
140
139
  normalized_requested = {self._normalize_class_name(c) for c in classes}
141
140
  unsupported_normalized = normalized_requested - normalized_supported
@@ -153,7 +152,4 @@ class LayoutDetector(ABC):
153
152
  def __del__(self):
154
153
  """Cleanup resources."""
155
154
  self.logger.info(f"Cleaning up {self.__class__.__name__} resources.")
156
- # Clear model cache to free up memory/GPU resources if models are large
157
- # Consider implications if models are shared or expensive to reload
158
- # del self._model_cache # Optional: uncomment if models should be released aggressively
159
155
  self._model_cache.clear()
@@ -1,13 +1,13 @@
1
1
  # layout_detector_gemini.py
2
+ import base64
2
3
  import importlib.util
4
+ import io
3
5
  import logging
4
6
  import os
5
7
  from typing import Any, Dict, List, Optional
6
- import base64
7
- import io
8
8
 
9
- from pydantic import BaseModel, Field
10
9
  from PIL import Image
10
+ from pydantic import BaseModel, Field
11
11
 
12
12
  # Use OpenAI library for interaction
13
13
  try:
@@ -53,10 +53,8 @@ logger = logging.getLogger(__name__)
53
53
  # This is used by the openai library's `response_format`
54
54
  class DetectedRegion(BaseModel):
55
55
  label: str = Field(description="The identified class name.")
56
- bbox: List[float] = Field(
57
- description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4
58
- )
59
- confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)
56
+ bbox: List[float] = Field(description="Bounding box coordinates [xmin, ymin, xmax, ymax].")
57
+ confidence: float = Field(description="Confidence score [0.0, 1.0].")
60
58
 
61
59
 
62
60
  class GeminiLayoutDetector(LayoutDetector):
@@ -70,16 +68,10 @@ class GeminiLayoutDetector(LayoutDetector):
70
68
  self.supported_classes = set() # Indicate dynamic nature
71
69
 
72
70
  def is_available(self) -> bool:
73
- """Check if openai library is installed and GOOGLE_API_KEY is available."""
74
- api_key = os.environ.get("GOOGLE_API_KEY")
75
- if not api_key:
76
- logger.warning(
77
- "GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available."
78
- )
79
- return False
71
+ """Check if openai library is installed."""
80
72
  if OpenAI is None:
81
73
  logger.warning(
82
- "openai package not found. Gemini detector (via OpenAI lib) will not be available."
74
+ "openai package not found. Gemini detector (via OpenAI lib) will not be available. Run: pip install openai"
83
75
  )
84
76
  return False
85
77
  return True
@@ -96,44 +88,65 @@ class GeminiLayoutDetector(LayoutDetector):
96
88
  def _load_model_from_options(self, options: GeminiLayoutOptions) -> Any:
97
89
  """Validate options and return the model name."""
98
90
  if not self.is_available():
99
- raise RuntimeError(
100
- "OpenAI library not installed or GOOGLE_API_KEY not set. Please run: pip install openai"
101
- )
91
+ raise RuntimeError("OpenAI library not installed. Please run: pip install openai")
102
92
 
103
93
  if not isinstance(options, GeminiLayoutOptions):
104
94
  raise TypeError("Incorrect options type provided for Gemini model loading.")
105
95
 
106
- # Simply return the model name, client is created in detect()
96
+ # Model loading is deferred to detect() based on whether a client is provided
107
97
  return options.model_name
108
98
 
109
99
  def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
110
100
  """Detect layout elements in an image using Gemini via OpenAI library."""
111
101
  if not self.is_available():
112
- raise RuntimeError("OpenAI library not installed or GOOGLE_API_KEY not set.")
102
+ # The is_available check now only confirms library presence
103
+ raise RuntimeError("OpenAI library not installed. Please run: pip install openai")
113
104
 
114
105
  # Ensure options are the correct type
115
- if not isinstance(options, GeminiLayoutOptions):
106
+ final_options: GeminiLayoutOptions
107
+ if isinstance(options, GeminiLayoutOptions):
108
+ final_options = options
109
+ else:
110
+ # If base options are passed, try to convert, keeping extra_args
111
+ # Note: This won't transfer a 'client' if it was somehow attached to BaseLayoutOptions
116
112
  self.logger.warning(
117
- "Received BaseLayoutOptions, expected GeminiLayoutOptions. Using defaults."
113
+ "Received BaseLayoutOptions, expected GeminiLayoutOptions. Converting and using defaults."
118
114
  )
119
- options = GeminiLayoutOptions(
115
+ final_options = GeminiLayoutOptions(
120
116
  confidence=options.confidence,
121
117
  classes=options.classes,
122
118
  exclude_classes=options.exclude_classes,
123
- device=options.device,
119
+ device=options.device, # device is not used by Gemini detector currently
124
120
  extra_args=options.extra_args,
121
+ # client will be None here, forcing default client creation below
125
122
  )
126
123
 
127
- model_name = self._get_model(options)
128
- api_key = os.environ.get("GOOGLE_API_KEY")
129
-
124
+ model_name = self._get_model(final_options)
130
125
  detections = []
126
+
131
127
  try:
132
- # --- 1. Initialize OpenAI Client for Gemini ---
133
- client = OpenAI(api_key=api_key, base_url=self.GEMINI_BASE_URL)
128
+ # --- 1. Initialize OpenAI Client ---
129
+ client: Optional[OpenAI] = None
130
+ # Use the provided client instance
131
+ if hasattr(final_options.client, "beta") and hasattr(
132
+ final_options.client.beta.chat.completions, "parse"
133
+ ):
134
+ client = final_options.client
135
+ logger.debug("Using provided client instance.")
136
+ else:
137
+ logger.error(
138
+ "Provided client does not seem compatible (missing beta.chat.completions.parse)."
139
+ )
140
+ raise TypeError(
141
+ "Provided client is not compatible with the expected OpenAI interface."
142
+ )
143
+
144
+ if not client:
145
+ # This should not happen if logic above is correct, but as a safeguard
146
+ raise RuntimeError("Failed to obtain a valid client for Gemini detection.")
134
147
 
135
148
  # --- 2. Prepare Input for OpenAI API ---
136
- if not options.classes:
149
+ if not final_options.classes:
137
150
  logger.error("Gemini layout detection requires a list of classes to find.")
138
151
  return []
139
152
 
@@ -145,15 +158,13 @@ class GeminiLayoutDetector(LayoutDetector):
145
158
  img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
146
159
  image_url = f"data:image/png;base64,{img_base64}"
147
160
 
148
- # Construct the prompt text
149
- class_list_str = ", ".join(f"`{c}`" for c in options.classes)
161
+ class_list_str = ", ".join(f"`{c}`" for c in final_options.classes)
150
162
  prompt_text = (
151
163
  f"Analyze the provided image of a document page ({width}x{height}). "
152
164
  f"Identify all regions corresponding to the following types: {class_list_str}. "
153
- f"Return ONLY the structured data requested."
165
+ f"Return ONLY the structured data requested as formatted JSON."
154
166
  )
155
167
 
156
- # Prepare messages for chat completions endpoint
157
168
  messages = [
158
169
  {
159
170
  "role": "user",
@@ -167,27 +178,26 @@ class GeminiLayoutDetector(LayoutDetector):
167
178
  }
168
179
  ]
169
180
 
170
- # --- 3. Call OpenAI API using .parse for structured output ---
171
181
  logger.debug(
172
- f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}"
182
+ f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {final_options.classes}"
173
183
  )
174
184
 
175
- # Extract relevant generation parameters from extra_args if provided
176
- # Mapping common names: temperature, top_p, max_tokens
177
185
  completion_kwargs = {
178
- "temperature": options.extra_args.get("temperature", 0.2), # Default to low temp
179
- "top_p": options.extra_args.get("top_p"),
180
- "max_tokens": options.extra_args.get(
181
- "max_tokens", 4096
182
- ), # Map from max_output_tokens
186
+ "temperature": final_options.extra_args.get(
187
+ "temperature", 0.0
188
+ ), # Default to low temp
189
+ "max_tokens": final_options.extra_args.get("max_tokens", 4096),
183
190
  }
184
- # Filter out None values
191
+
185
192
  completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
186
193
 
194
+ class ImageContents(BaseModel):
195
+ regions: List[DetectedRegion]
196
+
187
197
  completion: ChatCompletion = client.beta.chat.completions.parse(
188
198
  model=model_name,
189
199
  messages=messages,
190
- response_format=List[DetectedRegion], # Pass the Pydantic model list
200
+ response_format=ImageContents,
191
201
  **completion_kwargs,
192
202
  )
193
203
 
@@ -199,7 +209,7 @@ class GeminiLayoutDetector(LayoutDetector):
199
209
  return []
200
210
 
201
211
  # Get the parsed Pydantic objects
202
- parsed_results = completion.choices[0].message.parsed
212
+ parsed_results = completion.choices[0].message.parsed.regions
203
213
  if not parsed_results or not isinstance(parsed_results, list):
204
214
  logger.error(
205
215
  f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}"
@@ -207,10 +217,10 @@ class GeminiLayoutDetector(LayoutDetector):
207
217
  return []
208
218
 
209
219
  # --- 5. Convert to Detections & Filter ---
210
- normalized_classes_req = {self._normalize_class_name(c) for c in options.classes}
220
+ normalized_classes_req = {self._normalize_class_name(c) for c in final_options.classes}
211
221
  normalized_classes_excl = (
212
- {self._normalize_class_name(c) for c in options.exclude_classes}
213
- if options.exclude_classes
222
+ {self._normalize_class_name(c) for c in final_options.exclude_classes}
223
+ if final_options.exclude_classes
214
224
  else set()
215
225
  )
216
226
 
@@ -242,9 +252,9 @@ class GeminiLayoutDetector(LayoutDetector):
242
252
  continue
243
253
 
244
254
  # Check against base confidence threshold from options
245
- if confidence_score < options.confidence:
255
+ if confidence_score < final_options.confidence:
246
256
  logger.debug(
247
- f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}."
257
+ f"Skipping item with confidence {confidence_score:.3f} below threshold {final_options.confidence}."
248
258
  )
249
259
  continue
250
260
 
@@ -7,6 +7,7 @@ from PIL import Image
7
7
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
8
8
  from natural_pdf.analyzers.layout.layout_options import (
9
9
  BaseLayoutOptions,
10
+ GeminiLayoutOptions,
10
11
  LayoutOptions,
11
12
  TATRLayoutOptions,
12
13
  )
@@ -82,10 +83,10 @@ class LayoutAnalyzer:
82
83
  f" Rendering page {self._page.number} to image for initial layout detection..."
83
84
  )
84
85
  try:
85
- layout_scale = getattr(self._page._parent, "_config", {}).get("layout_image_scale", 1.5)
86
+ layout_scale = getattr(self._page._parent, "_config", {}).get("layout_image_scale", 1.0)
86
87
  layout_resolution = layout_scale * 72
87
88
  std_res_page_image = self._page.to_image(
88
- resolution=layout_resolution, include_highlights=False
89
+ resolution=layout_resolution, include_highlights=False, scale=1.0
89
90
  )
90
91
  if not std_res_page_image:
91
92
  raise ValueError("Initial page rendering returned None")
@@ -110,12 +111,11 @@ class LayoutAnalyzer:
110
111
  final_options: BaseLayoutOptions
111
112
 
112
113
  if options is not None:
113
- # User provided a complete options object, use it directly
114
114
  logger.debug("Using user-provided options object.")
115
115
  final_options = copy.deepcopy(options) # Copy to avoid modifying original user object
116
116
  if kwargs:
117
117
  logger.warning(
118
- f"Ignoring kwargs {list(kwargs.keys())} because a full options object was provided."
118
+ f"Ignoring simple mode keyword arguments {list(kwargs.keys())} because a full options object was provided."
119
119
  )
120
120
  # Infer engine from options type if engine arg wasn't provided
121
121
  if engine is None:
@@ -145,16 +145,39 @@ class LayoutAnalyzer:
145
145
  # Get base defaults
146
146
  base_defaults = BaseLayoutOptions()
147
147
 
148
+ # Separate client from other kwargs
149
+ client_instance = kwargs.pop("client", None) # Get client, remove from kwargs
150
+
151
+ # Separate model_name if provided for Gemini
152
+ model_name_kwarg = None
153
+ if issubclass(options_class, GeminiLayoutOptions):
154
+ model_name_kwarg = kwargs.pop("model_name", None)
155
+
148
156
  # Prepare args for constructor, prioritizing explicit args over defaults
149
157
  constructor_args = {
150
158
  "confidence": confidence if confidence is not None else base_defaults.confidence,
151
159
  "classes": classes, # Pass None if not provided
152
160
  "exclude_classes": exclude_classes, # Pass None if not provided
153
161
  "device": device if device is not None else base_defaults.device,
154
- "extra_args": kwargs, # Pass other kwargs here
162
+ # Pass client explicitly if constructing Gemini options
163
+ # Note: We check issubclass *before* calling constructor
164
+ **(
165
+ {"client": client_instance}
166
+ if client_instance and issubclass(options_class, GeminiLayoutOptions)
167
+ else {}
168
+ ),
169
+ # Pass model_name explicitly if constructing Gemini options and it was provided
170
+ **(
171
+ {"model_name": model_name_kwarg}
172
+ if model_name_kwarg and issubclass(options_class, GeminiLayoutOptions)
173
+ else {}
174
+ ),
175
+ "extra_args": kwargs, # Pass REMAINING kwargs here
155
176
  }
156
177
  # Remove None values unless they are valid defaults (like classes=None)
157
178
  # We can pass all to the dataclass constructor; it handles defaults
179
+ # **Filter constructor_args to remove None values that aren't defaults?**
180
+ # For simplicity, let dataclass handle it for now.
158
181
 
159
182
  try:
160
183
  final_options = options_class(**constructor_args)
@@ -167,24 +190,30 @@ class LayoutAnalyzer:
167
190
  # Re-raise for now, indicates programming error or invalid kwarg.
168
191
  raise e
169
192
 
170
- # --- Add Internal Context to extra_args (ALWAYS) ---
193
+ # --- Add Internal Context to extra_args (Applies to the final_options object) ---
171
194
  if not hasattr(final_options, "extra_args") or final_options.extra_args is None:
195
+ # Ensure extra_args exists, potentially overwriting if needed
196
+ final_options.extra_args = {}
197
+ elif not isinstance(final_options.extra_args, dict):
198
+ logger.warning(
199
+ f"final_options.extra_args was not a dict ({type(final_options.extra_args)}), replacing with internal context."
200
+ )
172
201
  final_options.extra_args = {}
202
+
173
203
  final_options.extra_args["_page_ref"] = self._page
174
204
  final_options.extra_args["_img_scale_x"] = img_scale_x
175
205
  final_options.extra_args["_img_scale_y"] = img_scale_y
176
206
  logger.debug(
177
- f"Added internal context to final_options.extra_args: {final_options.extra_args}"
207
+ f"Added/updated internal context in final_options.extra_args: {final_options.extra_args}"
178
208
  )
179
209
 
180
- # --- Call Layout Manager with the Final Options ---
210
+ # --- Call Layout Manager (ALWAYS with options object) ---
181
211
  logger.debug(f"Calling Layout Manager with final options object.")
182
212
  try:
183
- # Pass only image and the constructed options object
213
+ # ALWAYS pass the constructed/modified options object
184
214
  detections = self._layout_manager.analyze_layout(
185
215
  image=std_res_page_image,
186
- options=final_options,
187
- # No engine, confidence, classes etc. passed here directly
216
+ options=final_options, # Pass the final object with internal context
188
217
  )
189
218
  logger.info(f" Layout Manager returned {len(detections)} detections.")
190
219
  # Specifically let errors about unknown/unavailable engines propagate
@@ -96,9 +96,6 @@ class LayoutManager:
96
96
  "options_class": GeminiLayoutOptions,
97
97
  }
98
98
 
99
- # Define the limited set of kwargs allowed for the simple analyze_layout call
100
- SIMPLE_MODE_ALLOWED_KWARGS = {"engine", "confidence", "classes", "exclude_classes", "device"}
101
-
102
99
  def __init__(self):
103
100
  """Initializes the Layout Manager."""
104
101
  # Cache for detector instances (different from model cache inside detector)
@@ -145,109 +142,54 @@ class LayoutManager:
145
142
  def analyze_layout(
146
143
  self,
147
144
  image: Image.Image,
148
- engine: Optional[str] = None, # Default engine handled below
149
- options: Optional[LayoutOptions] = None,
150
- **kwargs,
145
+ options: LayoutOptions,
151
146
  ) -> List[Dict[str, Any]]:
152
147
  """
153
- Analyzes layout of a single image using simple args or an options object.
148
+ Analyzes layout of a single image using a specific options object.
154
149
 
155
150
  Args:
156
151
  image: The PIL Image to analyze.
157
- engine: Name of the engine (e.g., 'yolo', 'tatr'). Ignored if 'options' provided.
158
- Defaults to the first available engine if None.
159
- options: Specific LayoutOptions object for advanced configuration.
160
- **kwargs: For simple mode, accepts: 'confidence', 'classes',
161
- 'exclude_classes', 'device'.
152
+ options: Specific LayoutOptions object containing configuration and context.
153
+ This object MUST be provided.
162
154
 
163
155
  Returns:
164
156
  A list of standardized detection dictionaries.
165
157
  """
166
- final_options: BaseLayoutOptions
167
- selected_engine_name: str
168
-
169
- if not isinstance(image, Image.Image):
170
- raise TypeError("Input 'image' must be a PIL Image.")
171
-
172
- available_engines = self.get_available_engines()
173
- if not available_engines:
174
- raise RuntimeError("No layout engines are available. Please check dependencies.")
175
-
176
- # Determine default engine if not specified
177
- default_engine = engine if engine else available_engines[0]
178
-
179
- # --- Determine Options and Engine ---
180
- if options is not None:
181
- # Advanced Mode: An options object was provided directly (or constructed by LayoutAnalyzer)
182
- # Use this object directly, do not deep copy or reconstruct.
183
- logger.debug(f"LayoutManager: Using provided options object: {type(options).__name__}")
184
- final_options = options # Use the provided object directly
185
- found_engine = False
186
- for name, registry_entry in self.ENGINE_REGISTRY.items():
187
- if isinstance(options, registry_entry["options_class"]):
188
- selected_engine_name = name
189
- found_engine = True
190
- break
191
- if not found_engine:
192
- raise TypeError(
193
- f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options."
194
- )
195
- # Ignore simple kwargs if options object is present
196
- if kwargs:
197
- logger.warning(
198
- f"Keyword arguments {list(kwargs.keys())} were provided alongside an 'options' object and will be ignored."
199
- )
200
- else:
201
- # Simple Mode: No options object provided initially.
202
- # Determine engine from kwargs or default, then construct options.
203
- selected_engine_name = default_engine.lower()
204
- logger.debug(
205
- f"LayoutManager: Using simple mode. Engine: '{selected_engine_name}', kwargs: {kwargs}"
158
+ selected_engine_name: Optional[str] = None
159
+ found_engine = False
160
+ for name, registry_entry in self.ENGINE_REGISTRY.items():
161
+ if isinstance(options, registry_entry["options_class"]):
162
+ selected_engine_name = name
163
+ found_engine = True
164
+ break
165
+ if not found_engine or selected_engine_name is None:
166
+ available_options_types = [
167
+ reg["options_class"].__name__ for reg in self.ENGINE_REGISTRY.values()
168
+ ]
169
+ raise TypeError(
170
+ f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options: {available_options_types}"
206
171
  )
207
172
 
208
- if selected_engine_name not in self.ENGINE_REGISTRY:
209
- raise ValueError(
210
- f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}"
211
- )
212
-
213
- unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
214
- if unexpected_kwargs:
215
- raise TypeError(
216
- f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration."
217
- )
218
-
219
- options_class = self.ENGINE_REGISTRY[selected_engine_name]["options_class"]
220
- # Use BaseLayoutOptions defaults unless overridden by kwargs
221
- base_defaults = BaseLayoutOptions()
222
- simple_args = {
223
- "confidence": kwargs.get("confidence", base_defaults.confidence),
224
- "classes": kwargs.get("classes"),
225
- "exclude_classes": kwargs.get("exclude_classes"),
226
- "device": kwargs.get("device", base_defaults.device),
227
- }
228
- # Filter out None values before passing to constructor
229
- simple_args_filtered = {k: v for k, v in simple_args.items() if v is not None}
230
- final_options = options_class(**simple_args_filtered)
231
- logger.debug(f"LayoutManager: Constructed options for simple mode: {final_options}")
232
-
233
- # --- Get Engine Instance and Process ---
234
173
  try:
235
174
  engine_instance = self._get_engine_instance(selected_engine_name)
236
175
  logger.info(f"Analyzing layout with engine '{selected_engine_name}'...")
237
176
 
238
- # Call the engine's detect method
239
- detections = engine_instance.detect(image, final_options)
177
+ detections = engine_instance.detect(image, options) # Pass options directly
240
178
 
241
179
  logger.info(f"Layout analysis complete. Found {len(detections)} regions.")
242
180
  return detections
243
181
 
244
182
  except (ImportError, RuntimeError, ValueError, TypeError) as e:
245
- logger.error(
246
- f"Layout analysis failed for engine '{selected_engine_name}': {e}", exc_info=True
247
- )
183
+ # Add engine name to error message if possible
184
+ engine_context = f" for engine '{selected_engine_name}'" if selected_engine_name else ""
185
+ logger.error(f"Layout analysis failed{engine_context}: {e}", exc_info=True)
248
186
  raise # Re-raise expected errors
249
187
  except Exception as e:
250
- logger.error(f"An unexpected error occurred during layout analysis: {e}", exc_info=True)
188
+ engine_context = f" for engine '{selected_engine_name}'" if selected_engine_name else ""
189
+ logger.error(
190
+ f"An unexpected error occurred during layout analysis{engine_context}: {e}",
191
+ exc_info=True,
192
+ )
251
193
  raise # Re-raise unexpected errors
252
194
 
253
195
  def get_available_engines(self) -> List[str]:
@@ -43,6 +43,12 @@ class TATRLayoutOptions(BaseLayoutOptions):
43
43
  max_structure_size: int = 1000
44
44
  # Whether to create cell regions (can be slow)
45
45
  create_cells: bool = True
46
+ # Image enhancement options
47
+ enhance_contrast: float = 1.5 # Contrast enhancement factor (1.0 = no change)
48
+ # Special thresholds for specific elements
49
+ column_threshold: Optional[float] = (
50
+ None # Lower threshold for columns (default: confidence * 0.8)
51
+ )
46
52
 
47
53
 
48
54
  # --- Paddle Specific Options ---
@@ -86,6 +92,7 @@ class GeminiLayoutOptions(BaseLayoutOptions):
86
92
  """Options specific to Gemini-based layout detection (using OpenAI compatibility)."""
87
93
 
88
94
  model_name: str = "gemini-2.0-flash"
95
+ client: Optional[Any] = None # Allow passing a pre-configured client
89
96
  # Removed: prompt_template, temperature, top_p, max_output_tokens
90
97
  # These are typically passed directly to the chat completion call or via extra_args
91
98