natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1 +1 @@
1
- # Templates package
1
+ # Templates package
@@ -4,9 +4,15 @@ Highlighting utilities for natural-pdf.
4
4
  This module primarily re-exports core highlighting utilities from the visualization module.
5
5
  The main highlighting logic is now centralized in `natural_pdf.core.highlighting_service.HighlightingService`.
6
6
  """
7
+
7
8
  # Re-export necessary functions from visualization
8
- from .visualization import get_next_highlight_color, create_legend, merge_images_with_legend, reset_highlight_colors
9
+ from .visualization import (
10
+ create_legend,
11
+ get_next_highlight_color,
12
+ merge_images_with_legend,
13
+ reset_highlight_colors,
14
+ )
9
15
 
10
16
  # --- The Highlight class and HighlightManager class previously defined here have been removed ---
11
17
  # --- The functionality is now handled by natural_pdf.core.highlighting_service.HighlightingService ---
12
- # --- and its internal HighlightRenderer class. ---
18
+ # --- and its internal HighlightRenderer class. ---
@@ -1,26 +1,28 @@
1
1
  """
2
2
  Reading order utilities for natural-pdf.
3
3
  """
4
- from typing import List, Dict, Any, Callable, Optional
5
4
 
5
+ from typing import Any, Callable, Dict, List, Optional
6
6
 
7
- def establish_reading_order(elements: List[Dict[str, Any]],
8
- algorithm: str = 'basic') -> List[Dict[str, Any]]:
7
+
8
+ def establish_reading_order(
9
+ elements: List[Dict[str, Any]], algorithm: str = "basic"
10
+ ) -> List[Dict[str, Any]]:
9
11
  """
10
12
  Establish reading order for a collection of elements.
11
-
13
+
12
14
  Args:
13
15
  elements: List of elements to order
14
16
  algorithm: Algorithm to use ('basic', 'column', 'complex')
15
-
17
+
16
18
  Returns:
17
19
  List of elements in reading order
18
20
  """
19
- if algorithm == 'basic':
21
+ if algorithm == "basic":
20
22
  return _basic_reading_order(elements)
21
- elif algorithm == 'column':
23
+ elif algorithm == "column":
22
24
  return _column_reading_order(elements)
23
- elif algorithm == 'complex':
25
+ elif algorithm == "complex":
24
26
  return _complex_reading_order(elements)
25
27
  else:
26
28
  # Default to basic
@@ -30,55 +32,52 @@ def establish_reading_order(elements: List[Dict[str, Any]],
30
32
  def _basic_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
31
33
  """
32
34
  Basic top-to-bottom, left-to-right reading order.
33
-
35
+
34
36
  Args:
35
37
  elements: List of elements to order
36
-
38
+
37
39
  Returns:
38
40
  List of elements in reading order
39
41
  """
40
42
  # Simple sort by y0 (top), then by x0 (left)
41
- return sorted(elements, key=lambda e: (
42
- e.get('top', e.get('y0', 0)),
43
- e.get('x0', 0)
44
- ))
43
+ return sorted(elements, key=lambda e: (e.get("top", e.get("y0", 0)), e.get("x0", 0)))
45
44
 
46
45
 
47
- def _column_reading_order(elements: List[Dict[str, Any]],
48
- column_threshold: float = 0.2,
49
- x_tolerance: float = 10.0) -> List[Dict[str, Any]]:
46
+ def _column_reading_order(
47
+ elements: List[Dict[str, Any]], column_threshold: float = 0.2, x_tolerance: float = 10.0
48
+ ) -> List[Dict[str, Any]]:
50
49
  """
51
50
  Reading order that accounts for columns.
52
-
51
+
53
52
  This is more complex as it needs to detect columns first,
54
53
  then read each column in order.
55
-
54
+
56
55
  Args:
57
56
  elements: List of elements to order
58
57
  column_threshold: Percentage overlap threshold for column detection (0.0 to 1.0)
59
58
  x_tolerance: Horizontal tolerance for determining column edges
60
-
59
+
61
60
  Returns:
62
61
  List of elements in reading order
63
62
  """
64
63
  if not elements:
65
64
  return []
66
-
65
+
67
66
  # 1. Group elements by line
68
67
  lines = group_elements_by_line(elements)
69
-
68
+
70
69
  # 2. For each line, find the x-coordinate ranges (potential column boundaries)
71
70
  line_x_ranges = []
72
71
  for line in lines:
73
72
  for el in line:
74
- x0 = el.get('x0', 0)
75
- x1 = el.get('x1', 0)
73
+ x0 = el.get("x0", 0)
74
+ x1 = el.get("x1", 0)
76
75
  line_x_ranges.append((x0, x1))
77
-
76
+
78
77
  # If we don't have enough ranges to detect columns, just use basic ordering
79
78
  if len(line_x_ranges) < 3:
80
79
  return _basic_reading_order(elements)
81
-
80
+
82
81
  # 3. Detect columns by clustering x-coordinate ranges
83
82
  def overlaps(range1, range2, threshold=column_threshold):
84
83
  """Determine if two ranges overlap by more than threshold percentage."""
@@ -86,25 +85,25 @@ def _column_reading_order(elements: List[Dict[str, Any]],
86
85
  overlap_start = max(range1[0], range2[0])
87
86
  overlap_end = min(range1[1], range2[1])
88
87
  overlap = max(0, overlap_end - overlap_start)
89
-
88
+
90
89
  # Calculate lengths
91
90
  len1 = range1[1] - range1[0]
92
91
  len2 = range2[1] - range2[0]
93
-
92
+
94
93
  # Calculate overlap as percentage of the shorter range
95
94
  shorter_len = min(len1, len2)
96
95
  if shorter_len == 0:
97
96
  return False
98
-
97
+
99
98
  return overlap / shorter_len >= threshold
100
-
99
+
101
100
  # Cluster x-ranges into columns
102
101
  columns = []
103
102
  for x_range in line_x_ranges:
104
103
  # Skip zero-width ranges
105
104
  if x_range[1] - x_range[0] <= 0:
106
105
  continue
107
-
106
+
108
107
  # Try to find an existing column to add to
109
108
  added = False
110
109
  for col in columns:
@@ -112,68 +111,70 @@ def _column_reading_order(elements: List[Dict[str, Any]],
112
111
  col.append(x_range)
113
112
  added = True
114
113
  break
115
-
114
+
116
115
  # If not added to an existing column, create a new one
117
116
  if not added:
118
117
  columns.append([x_range])
119
-
118
+
120
119
  # 4. Get column boundaries by averaging x-ranges in each column
121
120
  column_bounds = []
122
121
  for col in columns:
123
122
  left = sum(r[0] for r in col) / len(col)
124
123
  right = sum(r[1] for r in col) / len(col)
125
124
  column_bounds.append((left, right))
126
-
125
+
127
126
  # Sort columns by x-coordinate (left to right)
128
127
  column_bounds.sort(key=lambda b: b[0])
129
-
128
+
130
129
  # 5. Assign each element to a column
131
130
  element_columns = {}
132
131
  for el in elements:
133
132
  # Get element x-coordinates
134
- el_x0 = el.get('x0', 0)
135
- el_x1 = el.get('x1', 0)
133
+ el_x0 = el.get("x0", 0)
134
+ el_x1 = el.get("x1", 0)
136
135
  el_center = (el_x0 + el_x1) / 2
137
-
136
+
138
137
  # Find the column this element belongs to
139
138
  for i, (left, right) in enumerate(column_bounds):
140
139
  # Extend bounds by tolerance
141
140
  extended_left = left - x_tolerance
142
141
  extended_right = right + x_tolerance
143
-
142
+
144
143
  # Check if center point is within extended column bounds
145
144
  if extended_left <= el_center <= extended_right:
146
145
  element_columns[el] = i
147
146
  break
148
147
  else:
149
148
  # If no column found, assign to nearest column
150
- distances = [(i, min(abs(el_center - left), abs(el_center - right)))
151
- for i, (left, right) in enumerate(column_bounds)]
149
+ distances = [
150
+ (i, min(abs(el_center - left), abs(el_center - right)))
151
+ for i, (left, right) in enumerate(column_bounds)
152
+ ]
152
153
  nearest_col = min(distances, key=lambda d: d[1])[0]
153
154
  element_columns[el] = nearest_col
154
-
155
+
155
156
  # 6. Sort elements by column, then by vertical position
156
157
  sorted_elements = []
157
158
  for col_idx, _ in enumerate(column_bounds):
158
159
  # Get elements in this column
159
160
  col_elements = [el for el in elements if element_columns.get(el) == col_idx]
160
161
  # Sort by top coordinate
161
- col_elements.sort(key=lambda e: e.get('top', e.get('y0', 0)))
162
+ col_elements.sort(key=lambda e: e.get("top", e.get("y0", 0)))
162
163
  # Add to final list
163
164
  sorted_elements.extend(col_elements)
164
-
165
+
165
166
  return sorted_elements
166
167
 
167
168
 
168
169
  def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
169
170
  """
170
171
  Complex reading order that accounts for various document structures.
171
-
172
+
172
173
  This considers columns, text flow around images, tables, etc.
173
-
174
+
174
175
  Args:
175
176
  elements: List of elements to order
176
-
177
+
177
178
  Returns:
178
179
  List of elements in reading order
179
180
  """
@@ -182,31 +183,32 @@ def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any
182
183
  return _column_reading_order(elements)
183
184
 
184
185
 
185
- def group_elements_by_line(elements: List[Dict[str, Any]],
186
- tolerance: float = 3.0) -> List[List[Dict[str, Any]]]:
186
+ def group_elements_by_line(
187
+ elements: List[Dict[str, Any]], tolerance: float = 3.0
188
+ ) -> List[List[Dict[str, Any]]]:
187
189
  """
188
190
  Group elements into lines based on vertical position.
189
-
191
+
190
192
  Args:
191
193
  elements: List of elements to group
192
194
  tolerance: Maximum vertical distance for elements to be considered on the same line
193
-
195
+
194
196
  Returns:
195
197
  List of lists, where each sublist contains elements on the same line
196
198
  """
197
199
  if not elements:
198
200
  return []
199
-
201
+
200
202
  # Sort by top coordinate
201
- sorted_elements = sorted(elements, key=lambda e: e.get('top', e.get('y0', 0)))
202
-
203
+ sorted_elements = sorted(elements, key=lambda e: e.get("top", e.get("y0", 0)))
204
+
203
205
  lines = []
204
206
  current_line = [sorted_elements[0]]
205
- current_top = sorted_elements[0].get('top', sorted_elements[0].get('y0', 0))
206
-
207
+ current_top = sorted_elements[0].get("top", sorted_elements[0].get("y0", 0))
208
+
207
209
  for element in sorted_elements[1:]:
208
- element_top = element.get('top', element.get('y0', 0))
209
-
210
+ element_top = element.get("top", element.get("y0", 0))
211
+
210
212
  # If element is close enough to current line's top, add to current line
211
213
  if abs(element_top - current_top) <= tolerance:
212
214
  current_line.append(element)
@@ -215,13 +217,13 @@ def group_elements_by_line(elements: List[Dict[str, Any]],
215
217
  lines.append(current_line)
216
218
  current_line = [element]
217
219
  current_top = element_top
218
-
220
+
219
221
  # Add the last line
220
222
  if current_line:
221
223
  lines.append(current_line)
222
-
224
+
223
225
  # Sort elements within each line by x0
224
226
  for line in lines:
225
- line.sort(key=lambda e: e.get('x0', 0))
226
-
227
- return lines
227
+ line.sort(key=lambda e: e.get("x0", 0))
228
+
229
+ return lines
@@ -0,0 +1,195 @@
1
+ # natural_pdf/utils/text_extraction.py
2
+ import logging
3
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
4
+
5
+ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
6
+ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
7
+
8
+ if TYPE_CHECKING:
9
+ from natural_pdf.elements.region import Region # Use type hint
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def filter_chars_spatially(
    char_dicts: List[Dict[str, Any]],
    exclusion_regions: List["Region"],
    target_region: Optional["Region"] = None,
    debug: bool = False,
) -> List[Dict[str, Any]]:
    """
    Filters a list of character dictionaries spatially based on exclusions
    and an optional target region.

    A character is kept by the target filter when its bbox overlaps the
    target's bbox and, for polygon targets, its center point is inside the
    region. A character is removed by the exclusion filter when its bbox
    overlaps an exclusion's bbox and its center point is inside that
    exclusion.

    Characters missing any of the "x0", "top", "x1", "bottom" keys are dropped
    by whichever filter pass runs (with a warning when ``debug`` is set). If
    neither a target region nor exclusions are supplied, no pass runs and the
    input list is returned unchanged.

    Args:
        char_dicts: List of character dictionaries to filter.
        exclusion_regions: List of Region objects to exclude characters from.
        target_region: Optional Region object. If provided, only characters within
                       this region (respecting polygons) are kept.
        debug: Enable debug logging.

    Returns:
        Filtered list of character dictionaries.
    """
    if not char_dicts:
        return []

    geometry_keys = ("x0", "top", "x1", "bottom")

    def _bbox_of(char_dict: Dict[str, Any]) -> Optional[Tuple[Any, ...]]:
        """Return (x0, top, x1, bottom) for a char, or None if a key is missing."""
        if not all(key in char_dict for key in geometry_keys):
            # Warn in both filter passes: a char without geometry can reach the
            # exclusion pass only when no target region was given, in which
            # case nothing has reported it yet.
            if debug:
                logger.warning(
                    f"Skipping char due to missing geometry: {char_dict.get('text', '?')}"
                )
            return None
        return tuple(char_dict[key] for key in geometry_keys)

    def _center_of(char_dict: Dict[str, Any]) -> Tuple[float, float]:
        """Return the center point of a char's bounding box."""
        return (
            (char_dict["x0"] + char_dict["x1"]) / 2,
            (char_dict["top"] + char_dict["bottom"]) / 2,
        )

    initial_count = len(char_dicts)
    filtered_chars = char_dicts

    # 1. Filter by Target Region (if provided)
    if target_region:
        target_bbox = target_region.bbox
        target_is_polygon = target_region.has_polygon  # Check once
        region_filtered_chars = []
        for char_dict in filtered_chars:
            char_bbox = _bbox_of(char_dict)
            if char_bbox is None:
                continue
            # Cheap bbox pre-filter first
            if get_bbox_overlap(char_bbox, target_bbox) is None:
                continue
            # Rectangular targets need nothing beyond the bbox overlap;
            # polygon targets additionally require the center to be inside.
            if target_is_polygon:
                center_x, center_y = _center_of(char_dict)
                if not target_region.is_point_inside(center_x, center_y):
                    continue
            region_filtered_chars.append(char_dict)
        filtered_chars = region_filtered_chars
        if debug:
            logger.debug(
                f"filter_chars_spatially: {len(filtered_chars)}/{initial_count} chars remaining after target region filter."
            )
        if not filtered_chars:
            return []

    # 2. Filter by Exclusions (if any)
    if exclusion_regions:
        final_chars = []
        # Union of all exclusion bboxes: a char that misses it cannot be excluded
        union_bbox = merge_bboxes(excl.bbox for excl in exclusion_regions)
        for char_dict in filtered_chars:  # Process only chars within target
            char_bbox = _bbox_of(char_dict)
            if char_bbox is None:
                continue
            if get_bbox_overlap(char_bbox, union_bbox) is None:
                final_chars.append(char_dict)  # Cannot be excluded
                continue
            # Precise check against individual overlapping exclusions
            center_x, center_y = _center_of(char_dict)
            is_excluded = False
            for exclusion in exclusion_regions:
                if get_bbox_overlap(char_bbox, exclusion.bbox) is None:
                    continue
                if exclusion.is_point_inside(center_x, center_y):
                    is_excluded = True
                    if debug:
                        char_text = char_dict.get("text", "?")
                        log_msg = f"  - Excluding char '{char_text}' at {char_bbox} due to overlap with exclusion {exclusion.bbox}"
                        logger.debug(log_msg)
                    break
            if not is_excluded:
                final_chars.append(char_dict)
        filtered_chars = final_chars
        if debug:
            logger.debug(
                f"filter_chars_spatially: {len(filtered_chars)}/{initial_count} chars remaining after exclusion filter."
            )

    return filtered_chars
115
+
116
+
117
def generate_text_layout(
    char_dicts: List[Dict[str, Any]],
    layout_context_bbox: Tuple[float, float, float, float],
    user_kwargs: Dict[str, Any],
) -> str:
    """
    Takes a list of filtered character dictionaries and generates
    text output using pdfplumber's layout engine.

    Only keys listed in pdfplumber's WORD_EXTRACTOR_KWARGS or TEXTMAP_KWARGS
    are forwarded from ``user_kwargs``; anything else is ignored. Layout mode
    is on unless the user passes ``layout=False``. In layout mode, any of
    ``layout_bbox``, ``layout_width`` (unless ``layout_width_chars`` is given),
    ``layout_height``, ``x_shift`` and ``y_shift`` that the user did not
    supply are derived from ``layout_context_bbox``.

    The input list is not modified; sorting is done on a copy.

    Args:
        char_dicts: The final list of character dictionaries to include.
        layout_context_bbox: The bounding box (x0, top, x1, bottom) to use for
                             calculating default layout width/height/shifts.
        user_kwargs: Dictionary of user-provided keyword arguments
                     (``None`` is treated as empty).

    Returns:
        The formatted text string ("" when there are no characters).
    """
    if not char_dicts:
        logger.debug("generate_text_layout: No characters provided.")
        return ""

    def _reading_key(char: Dict[str, Any]):
        # Top-to-bottom, then left-to-right.
        return (char.get("top", 0), char.get("x0", 0))

    # Prepare layout kwargs, prioritizing user input
    allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
    layout_kwargs = {
        key: value for key, value in (user_kwargs or {}).items() if key in allowed_keys
    }

    # Default to layout=True unless explicitly False
    use_layout = layout_kwargs.get("layout", True)
    layout_kwargs["layout"] = use_layout

    if not use_layout:
        # Simple join if layout=False
        logger.debug("generate_text_layout: Using simple join (layout=False).")
        # Sort a copy by document order; the caller's list keeps its order.
        ordered = sorted(
            char_dicts,
            key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0)),
        )
        return "".join(c.get("text", "") for c in ordered)

    ctx_x0, ctx_top, ctx_x1, ctx_bottom = layout_context_bbox

    # Set layout defaults based on context_bbox if not overridden by user
    layout_kwargs.setdefault("layout_bbox", layout_context_bbox)
    # Only set default layout_width if neither width specifier is present
    if "layout_width_chars" not in layout_kwargs and "layout_width" not in layout_kwargs:
        layout_kwargs["layout_width"] = ctx_x1 - ctx_x0
    layout_kwargs.setdefault("layout_height", ctx_bottom - ctx_top)
    # Adjust shift based on context's top-left corner
    layout_kwargs.setdefault("x_shift", ctx_x0)
    layout_kwargs.setdefault("y_shift", ctx_top)

    logger.debug(
        f"generate_text_layout: Calling chars_to_textmap with {len(char_dicts)} chars and kwargs: {layout_kwargs}"
    )
    try:
        # Sort a copy by reading order before passing to textmap, so the
        # caller's list is never reordered as a side effect.
        # TODO: Make sorting key dynamic based on layout_kwargs directions?
        ordered = sorted(char_dicts, key=_reading_key)
        textmap = chars_to_textmap(ordered, **layout_kwargs)
        result = textmap.as_string
    except Exception as e:
        # Deliberate best-effort: a layout failure degrades to a plain join
        # rather than propagating.
        logger.error(
            f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=True
        )
        logger.warning(
            "generate_text_layout: Falling back to simple character join due to layout error."
        )
        # Ensure chars are sorted before fallback join
        fallback_chars = sorted(char_dicts, key=_reading_key)
        result = "".join(c.get("text", "") for c in fallback_chars)

    return result