natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,15 +1,16 @@
1
1
  # layout_manager.py
2
+ import copy
2
3
  import logging
3
- from typing import Dict, List, Any, Optional, Union, Type
4
+ from typing import Any, Dict, List, Optional, Type, Union
5
+
4
6
  from PIL import Image
5
- import copy
6
7
 
7
8
  # --- Import detector classes and options ---
8
9
  # Use try-except blocks for robustness if some detectors might be missing dependencies
9
10
  try:
10
11
  from .base import LayoutDetector
11
12
  except ImportError:
12
- LayoutDetector = type('LayoutDetector', (), {})
13
+ LayoutDetector = type("LayoutDetector", (), {})
13
14
 
14
15
  try:
15
16
  from .yolo import YOLODocLayoutDetector
@@ -35,14 +36,26 @@ try:
35
36
  from .docling import DoclingLayoutDetector
36
37
  except ImportError:
37
38
  DoclingLayoutDetector = None
38
-
39
+
40
+ try:
41
+ from .gemini import GeminiLayoutDetector
42
+ except ImportError:
43
+ GeminiLayoutDetector = None
44
+
39
45
  from .layout_options import (
40
- BaseLayoutOptions, YOLOLayoutOptions, TATRLayoutOptions,
41
- PaddleLayoutOptions, SuryaLayoutOptions, DoclingLayoutOptions, LayoutOptions
46
+ BaseLayoutOptions,
47
+ DoclingLayoutOptions,
48
+ GeminiLayoutOptions,
49
+ LayoutOptions,
50
+ PaddleLayoutOptions,
51
+ SuryaLayoutOptions,
52
+ TATRLayoutOptions,
53
+ YOLOLayoutOptions,
42
54
  )
43
55
 
44
56
  logger = logging.getLogger(__name__)
45
57
 
58
+
46
59
  class LayoutManager:
47
60
  """Manages layout detector selection, configuration, and execution."""
48
61
 
@@ -50,46 +63,91 @@ class LayoutManager:
50
63
  ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {}
51
64
 
52
65
  # Populate registry only with available detectors
53
- if YOLODocLayoutDetector: ENGINE_REGISTRY['yolo'] = {'class': YOLODocLayoutDetector, 'options_class': YOLOLayoutOptions}
54
- if TableTransformerDetector: ENGINE_REGISTRY['tatr'] = {'class': TableTransformerDetector, 'options_class': TATRLayoutOptions}
55
- if PaddleLayoutDetector: ENGINE_REGISTRY['paddle'] = {'class': PaddleLayoutDetector, 'options_class': PaddleLayoutOptions}
56
- if SuryaLayoutDetector: ENGINE_REGISTRY['surya'] = {'class': SuryaLayoutDetector, 'options_class': SuryaLayoutOptions}
57
- if DoclingLayoutDetector: ENGINE_REGISTRY['docling'] = {'class': DoclingLayoutDetector, 'options_class': DoclingLayoutOptions}
66
+ if YOLODocLayoutDetector:
67
+ ENGINE_REGISTRY["yolo"] = {
68
+ "class": YOLODocLayoutDetector,
69
+ "options_class": YOLOLayoutOptions,
70
+ }
71
+ if TableTransformerDetector:
72
+ ENGINE_REGISTRY["tatr"] = {
73
+ "class": TableTransformerDetector,
74
+ "options_class": TATRLayoutOptions,
75
+ }
76
+ if PaddleLayoutDetector:
77
+ ENGINE_REGISTRY["paddle"] = {
78
+ "class": PaddleLayoutDetector,
79
+ "options_class": PaddleLayoutOptions,
80
+ }
81
+ if SuryaLayoutDetector:
82
+ ENGINE_REGISTRY["surya"] = {
83
+ "class": SuryaLayoutDetector,
84
+ "options_class": SuryaLayoutOptions,
85
+ }
86
+ if DoclingLayoutDetector:
87
+ ENGINE_REGISTRY["docling"] = {
88
+ "class": DoclingLayoutDetector,
89
+ "options_class": DoclingLayoutOptions,
90
+ }
91
+
92
+ # Add Gemini entry if available
93
+ if GeminiLayoutDetector:
94
+ ENGINE_REGISTRY["gemini"] = {
95
+ "class": GeminiLayoutDetector,
96
+ "options_class": GeminiLayoutOptions,
97
+ }
58
98
 
59
99
  # Define the limited set of kwargs allowed for the simple analyze_layout call
60
- SIMPLE_MODE_ALLOWED_KWARGS = {
61
- 'engine', 'confidence', 'classes', 'exclude_classes', 'device'
62
- }
100
+ SIMPLE_MODE_ALLOWED_KWARGS = {"engine", "confidence", "classes", "exclude_classes", "device"}
63
101
 
64
102
  def __init__(self):
65
103
  """Initializes the Layout Manager."""
66
104
  # Cache for detector instances (different from model cache inside detector)
67
105
  self._detector_instances: Dict[str, LayoutDetector] = {}
68
- logger.info(f"LayoutManager initialized. Available engines: {list(self.ENGINE_REGISTRY.keys())}")
106
+ logger.info(
107
+ f"LayoutManager initialized. Available engines: {list(self.ENGINE_REGISTRY.keys())}"
108
+ )
69
109
 
70
110
  def _get_engine_instance(self, engine_name: str) -> LayoutDetector:
71
111
  """Retrieves or creates an instance of the specified layout detector."""
72
112
  engine_name = engine_name.lower()
73
113
  if engine_name not in self.ENGINE_REGISTRY:
74
- raise ValueError(f"Unknown layout engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")
114
+ raise ValueError(
115
+ f"Unknown layout engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
116
+ )
75
117
 
76
118
  if engine_name not in self._detector_instances:
77
119
  logger.info(f"Creating instance of layout engine: {engine_name}")
78
- engine_class = self.ENGINE_REGISTRY[engine_name]['class']
79
- detector_instance = engine_class() # Instantiate
120
+ engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
121
+ detector_instance = engine_class() # Instantiate
80
122
  if not detector_instance.is_available():
81
- # Check availability before storing
82
- raise RuntimeError(f"Layout engine '{engine_name}' is not available. Please check dependencies.")
83
- self._detector_instances[engine_name] = detector_instance # Store if available
123
+ # Check availability before storing
124
+ # Construct helpful error message with install hint
125
+ install_hint = ""
126
+ if engine_name == "yolo":
127
+ install_hint = "pip install 'natural-pdf[layout_yolo]'"
128
+ elif engine_name == "tatr":
129
+ install_hint = "pip install 'natural-pdf[core-ml]'"
130
+ elif engine_name == "paddle":
131
+ install_hint = "pip install 'natural-pdf[paddle]'"
132
+ elif engine_name == "surya":
133
+ install_hint = "pip install 'natural-pdf[surya]'"
134
+ # Add other engines like docling if they become optional extras
135
+ else:
136
+ install_hint = f"(Check installation requirements for {engine_name})"
137
+
138
+ raise RuntimeError(
139
+ f"Layout engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
140
+ )
141
+ self._detector_instances[engine_name] = detector_instance # Store if available
84
142
 
85
143
  return self._detector_instances[engine_name]
86
144
 
87
145
  def analyze_layout(
88
146
  self,
89
147
  image: Image.Image,
90
- engine: Optional[str] = None, # Default engine handled below
148
+ engine: Optional[str] = None, # Default engine handled below
91
149
  options: Optional[LayoutOptions] = None,
92
- **kwargs
150
+ **kwargs,
93
151
  ) -> List[Dict[str, Any]]:
94
152
  """
95
153
  Analyzes layout of a single image using simple args or an options object.
@@ -109,11 +167,11 @@ class LayoutManager:
109
167
  selected_engine_name: str
110
168
 
111
169
  if not isinstance(image, Image.Image):
112
- raise TypeError("Input 'image' must be a PIL Image.")
170
+ raise TypeError("Input 'image' must be a PIL Image.")
113
171
 
114
172
  available_engines = self.get_available_engines()
115
173
  if not available_engines:
116
- raise RuntimeError("No layout engines are available. Please check dependencies.")
174
+ raise RuntimeError("No layout engines are available. Please check dependencies.")
117
175
 
118
176
  # Determine default engine if not specified
119
177
  default_engine = engine if engine else available_engines[0]
@@ -123,46 +181,55 @@ class LayoutManager:
123
181
  # Advanced Mode: An options object was provided directly (or constructed by LayoutAnalyzer)
124
182
  # Use this object directly, do not deep copy or reconstruct.
125
183
  logger.debug(f"LayoutManager: Using provided options object: {type(options).__name__}")
126
- final_options = options # Use the provided object directly
184
+ final_options = options # Use the provided object directly
127
185
  found_engine = False
128
186
  for name, registry_entry in self.ENGINE_REGISTRY.items():
129
- if isinstance(options, registry_entry['options_class']):
187
+ if isinstance(options, registry_entry["options_class"]):
130
188
  selected_engine_name = name
131
189
  found_engine = True
132
190
  break
133
191
  if not found_engine:
134
- raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options.")
192
+ raise TypeError(
193
+ f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options."
194
+ )
135
195
  # Ignore simple kwargs if options object is present
136
196
  if kwargs:
137
- logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside an 'options' object and will be ignored.")
197
+ logger.warning(
198
+ f"Keyword arguments {list(kwargs.keys())} were provided alongside an 'options' object and will be ignored."
199
+ )
138
200
  else:
139
- # Simple Mode: No options object provided initially.
201
+ # Simple Mode: No options object provided initially.
140
202
  # Determine engine from kwargs or default, then construct options.
141
203
  selected_engine_name = default_engine.lower()
142
- logger.debug(f"LayoutManager: Using simple mode. Engine: '{selected_engine_name}', kwargs: {kwargs}")
204
+ logger.debug(
205
+ f"LayoutManager: Using simple mode. Engine: '{selected_engine_name}', kwargs: {kwargs}"
206
+ )
143
207
 
144
208
  if selected_engine_name not in self.ENGINE_REGISTRY:
145
- raise ValueError(f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}")
209
+ raise ValueError(
210
+ f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}"
211
+ )
146
212
 
147
213
  unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
148
214
  if unexpected_kwargs:
149
- raise TypeError(f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration.")
215
+ raise TypeError(
216
+ f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration."
217
+ )
150
218
 
151
- options_class = self.ENGINE_REGISTRY[selected_engine_name]['options_class']
219
+ options_class = self.ENGINE_REGISTRY[selected_engine_name]["options_class"]
152
220
  # Use BaseLayoutOptions defaults unless overridden by kwargs
153
221
  base_defaults = BaseLayoutOptions()
154
222
  simple_args = {
155
- 'confidence': kwargs.get('confidence', base_defaults.confidence),
156
- 'classes': kwargs.get('classes'),
157
- 'exclude_classes': kwargs.get('exclude_classes'),
158
- 'device': kwargs.get('device', base_defaults.device)
223
+ "confidence": kwargs.get("confidence", base_defaults.confidence),
224
+ "classes": kwargs.get("classes"),
225
+ "exclude_classes": kwargs.get("exclude_classes"),
226
+ "device": kwargs.get("device", base_defaults.device),
159
227
  }
160
228
  # Filter out None values before passing to constructor
161
229
  simple_args_filtered = {k: v for k, v in simple_args.items() if v is not None}
162
230
  final_options = options_class(**simple_args_filtered)
163
231
  logger.debug(f"LayoutManager: Constructed options for simple mode: {final_options}")
164
232
 
165
-
166
233
  # --- Get Engine Instance and Process ---
167
234
  try:
168
235
  engine_instance = self._get_engine_instance(selected_engine_name)
@@ -175,29 +242,29 @@ class LayoutManager:
175
242
  return detections
176
243
 
177
244
  except (ImportError, RuntimeError, ValueError, TypeError) as e:
178
- logger.error(f"Layout analysis failed for engine '{selected_engine_name}': {e}", exc_info=True)
179
- raise # Re-raise expected errors
245
+ logger.error(
246
+ f"Layout analysis failed for engine '{selected_engine_name}': {e}", exc_info=True
247
+ )
248
+ raise # Re-raise expected errors
180
249
  except Exception as e:
181
- logger.error(f"An unexpected error occurred during layout analysis: {e}", exc_info=True)
182
- raise # Re-raise unexpected errors
183
-
250
+ logger.error(f"An unexpected error occurred during layout analysis: {e}", exc_info=True)
251
+ raise # Re-raise unexpected errors
184
252
 
185
253
  def get_available_engines(self) -> List[str]:
186
254
  """Returns a list of registered layout engine names that are currently available."""
187
255
  available = []
188
256
  for name, registry_entry in self.ENGINE_REGISTRY.items():
189
- try:
190
- engine_class = registry_entry['class']
191
- # Check availability without full instantiation if possible
192
- if hasattr(engine_class, 'is_available') and callable(engine_class.is_available):
193
- # Create temporary instance only for check if needed, or use classmethod
194
- if engine_class().is_available(): # Assumes instance needed for check
195
- available.append(name)
196
- else:
197
- # Assume available if class exists (less robust)
198
- available.append(name)
199
- except Exception as e:
200
- logger.debug(f"Layout engine '{name}' check failed: {e}")
201
- pass
257
+ try:
258
+ engine_class = registry_entry["class"]
259
+ # Check availability without full instantiation if possible
260
+ if hasattr(engine_class, "is_available") and callable(engine_class.is_available):
261
+ # Create temporary instance only for check if needed, or use classmethod
262
+ if engine_class().is_available(): # Assumes instance needed for check
263
+ available.append(name)
264
+ else:
265
+ # Assume available if class exists (less robust)
266
+ available.append(name)
267
+ except Exception as e:
268
+ logger.debug(f"Layout engine '{name}' check failed: {e}")
269
+ pass
202
270
  return available
203
-
@@ -1,32 +1,40 @@
1
1
  # layout_options.py
2
2
  import logging
3
3
  from dataclasses import dataclass, field
4
- from typing import List, Optional, Dict, Any, Tuple, Union
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
5
 
6
6
  logger = logging.getLogger(__name__)
7
7
 
8
+
8
9
  # --- Base Layout Options ---
9
10
  @dataclass
10
11
  class BaseLayoutOptions:
11
12
  """Base options for layout detection engines."""
12
- confidence: float = 0.5 # Minimum confidence threshold for detections
13
- classes: Optional[List[str]] = None # Specific classes to detect (None for all)
14
- exclude_classes: Optional[List[str]] = None # Classes to exclude
15
- device: Optional[str] = 'cpu' # Preferred device ('cpu', 'cuda', 'mps', etc.)
16
- extra_args: Dict[str, Any] = field(default_factory=dict) # For engine-specific args not yet fields
13
+
14
+ confidence: float = 0.5 # Minimum confidence threshold for detections
15
+ classes: Optional[List[str]] = None # Specific classes to detect (None for all)
16
+ exclude_classes: Optional[List[str]] = None # Classes to exclude
17
+ device: Optional[str] = "cpu" # Preferred device ('cpu', 'cuda', 'mps', etc.)
18
+ extra_args: Dict[str, Any] = field(
19
+ default_factory=dict
20
+ ) # For engine-specific args not yet fields
21
+
17
22
 
18
23
  # --- YOLO Specific Options ---
19
24
  @dataclass
20
25
  class YOLOLayoutOptions(BaseLayoutOptions):
21
26
  """Options specific to YOLO-based layout detection."""
27
+
22
28
  model_repo: str = "juliozhao/DocLayout-YOLO-DocStructBench"
23
29
  model_file: str = "doclayout_yolo_docstructbench_imgsz1024.pt"
24
- image_size: int = 1024 # Input image size for the model
30
+ image_size: int = 1024 # Input image size for the model
31
+
25
32
 
26
33
  # --- TATR Specific Options ---
27
34
  @dataclass
28
35
  class TATRLayoutOptions(BaseLayoutOptions):
29
36
  """Options specific to Table Transformer (TATR) layout detection."""
37
+
30
38
  # Which models to use (can be local paths or HF identifiers)
31
39
  detection_model: str = "microsoft/table-transformer-detection"
32
40
  structure_model: str = "microsoft/table-transformer-structure-recognition-v1.1-all"
@@ -36,35 +44,52 @@ class TATRLayoutOptions(BaseLayoutOptions):
36
44
  # Whether to create cell regions (can be slow)
37
45
  create_cells: bool = True
38
46
 
47
+
39
48
  # --- Paddle Specific Options ---
40
49
  @dataclass
41
50
  class PaddleLayoutOptions(BaseLayoutOptions):
42
51
  """Options specific to PaddlePaddle PP-Structure layout detection."""
43
- lang: str = "en" # Language ('en', 'ch', etc.)
44
- use_angle_cls: bool = False # Use text angle classification?
45
- enable_table: bool = True # Enable table structure detection?
46
- show_log: bool = False # Show Paddle internal logs?
47
- detect_text: bool = True # Also detect raw text boxes using PaddleOCR?
48
- verbose: bool = False # Verbose logging for the detector class
52
+
53
+ lang: str = "en" # Language ('en', 'ch', etc.)
54
+ use_angle_cls: bool = False # Use text angle classification?
55
+ enable_table: bool = True # Enable table structure detection?
56
+ show_log: bool = False # Show Paddle internal logs?
57
+ detect_text: bool = True # Also detect raw text boxes using PaddleOCR?
58
+ verbose: bool = False # Verbose logging for the detector class
59
+
49
60
 
50
61
  # --- Surya Specific Options ---
51
62
  @dataclass
52
63
  class SuryaLayoutOptions(BaseLayoutOptions):
53
64
  """Options specific to Surya layout detection."""
54
- model_name: str = "default" # Placeholder if different models become available
55
- recognize_table_structure: bool = True # Automatically run table structure recognition?
65
+
66
+ model_name: str = "default" # Placeholder if different models become available
67
+ recognize_table_structure: bool = True # Automatically run table structure recognition?
68
+
56
69
 
57
70
  # --- Docling Specific Options ---
58
71
  @dataclass
59
72
  class DoclingLayoutOptions(BaseLayoutOptions):
60
73
  """Options specific to Docling layout detection."""
74
+
61
75
  # Pass kwargs directly to Docling's DocumentConverter via extra_args
62
76
  # Common examples shown here for documentation, add others as needed to extra_args
63
77
  # model_name: str = "ds4sd/SmolDocling-256M-preview" # Example model (pass via extra_args)
64
78
  # prompt_text: Optional[str] = None # Optional prompt (pass via extra_args)
65
- verbose: bool = False # Verbose logging for the detector class
79
+ verbose: bool = False # Verbose logging for the detector class
66
80
  # Other kwargs like 'device', 'batch_size' can go in extra_args
67
81
 
82
+
83
+ # --- Gemini Specific Options ---
84
+ @dataclass
85
+ class GeminiLayoutOptions(BaseLayoutOptions):
86
+ """Options specific to Gemini-based layout detection (using OpenAI compatibility)."""
87
+
88
+ model_name: str = "gemini-2.0-flash"
89
+ # Removed: prompt_template, temperature, top_p, max_output_tokens
90
+ # These are typically passed directly to the chat completion call or via extra_args
91
+
92
+
68
93
  # --- Union Type ---
69
94
  LayoutOptions = Union[
70
95
  YOLOLayoutOptions,
@@ -72,5 +97,6 @@ LayoutOptions = Union[
72
97
  PaddleLayoutOptions,
73
98
  SuryaLayoutOptions,
74
99
  DoclingLayoutOptions,
75
- BaseLayoutOptions # Include base for typing flexibility
100
+ GeminiLayoutOptions,
101
+ BaseLayoutOptions, # Include base for typing flexibility
76
102
  ]