natural-pdf 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +99 -40
- natural_pdf/core/page.py +76 -3
- natural_pdf/core/pdf.py +38 -3
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +270 -14
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/RECORD +14 -18
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/top_level.txt +0 -0
natural_pdf/core/element_manager.py
CHANGED
@@ -146,7 +146,7 @@ class ElementManager:
     contained in the Page class, providing better separation of concerns.
     """
 
-    def __init__(self, page, font_attrs=None):
+    def __init__(self, page, font_attrs=None, load_text: bool = True):
         """
         Initialize the ElementManager.
 
@@ -156,9 +156,11 @@ class ElementManager:
                 Default: ['fontname', 'size', 'bold', 'italic']
                 None: Only consider spatial relationships
                 List: Custom attributes to consider
+            load_text: Whether to load text elements from the PDF (default: True).
         """
         self._page = page
         self._elements = None  # Lazy-loaded
+        self._load_text = load_text
         # Default to splitting by fontname, size, bold, italic if not specified
         # Renamed internal variable for clarity
         self._word_split_attributes = (
@@ -175,11 +177,15 @@ class ElementManager:
 
         logger.debug(f"Page {self._page.number}: Loading elements...")
 
-        # 1. Prepare character dictionaries
-
-
-
-
+        # 1. Prepare character dictionaries only if loading text
+        if self._load_text:
+            prepared_char_dicts = self._prepare_char_dicts()
+            logger.debug(
+                f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
+            )
+        else:
+            prepared_char_dicts = []
+            logger.debug(f"Page {self._page.number}: Skipping text loading (load_text=False)")
 
         # -------------------------------------------------------------
         # Detect strikethrough (horizontal strike-out lines) on raw
@@ -189,52 +195,105 @@ class ElementManager:
         # belong to the same word.
         # -------------------------------------------------------------
 
-
-
-
-
-
-
-
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_strikethrough_chars(prepared_char_dicts)
+            except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
+                logger.warning(
+                    f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
+                    exc_info=True,
+                )
 
         # -------------------------------------------------------------
         # Detect underlines on raw characters (must come after strike so
         # both attributes are present before word grouping).
         # -------------------------------------------------------------
 
-
-
-
-
-
-
-
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_underline_chars(prepared_char_dicts)
+            except Exception as u_err:  # pragma: no cover
+                logger.warning(
+                    f"Page {self._page.number}: Underline detection failed – {u_err}",
+                    exc_info=True,
+                )
 
         # Detect highlights
-
-
-
-
-
-
-
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_highlight_chars(prepared_char_dicts)
+            except Exception as h_err:
+                logger.warning(
+                    f"Page {self._page.number}: Highlight detection failed – {h_err}",
+                    exc_info=True,
+                )
 
         # Create a mapping from character dict to index for efficient lookup
-
-
-
-
-
-
-
-
+        if self._load_text:
+            char_to_index = {}
+            for idx, char_dict in enumerate(prepared_char_dicts):
+                key = (
+                    char_dict.get("x0", 0),
+                    char_dict.get("top", 0),
+                    char_dict.get("text", ""),
+                )
+                char_to_index[key] = idx
+        else:
+            char_to_index = {}
 
         # 2. Instantiate the custom word extractor
-        #
+        # Prefer page-level config over PDF-level for tolerance lookup
+        word_elements: List[TextElement] = []
+
+        # Get config objects (needed for auto_text_tolerance check)
+        page_config = getattr(self._page, "_config", {})
         pdf_config = getattr(self._page._parent, "_config", {})
-
-
+
+        # Initialize tolerance variables
+        xt = None
+        yt = None
         use_flow = pdf_config.get("use_text_flow", False)
+
+        if self._load_text and prepared_char_dicts:
+            # Start with any explicitly supplied tolerances (may be None)
+            xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
+            yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
+
+            # ------------------------------------------------------------------
+            # Auto-adaptive tolerance: scale based on median character size when
+            # requested and explicit values are absent.
+            # ------------------------------------------------------------------
+            if self._load_text and pdf_config.get("auto_text_tolerance", True):
+                import statistics
+
+                sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
+                median_size = None
+                if sizes:
+                    median_size = statistics.median(sizes)
+                    if xt is None:
+                        xt = 0.25 * median_size  # ~kerning width
+                        # Record back to page config for downstream users
+                        page_config["x_tolerance"] = xt
+                    if yt is None:
+                        yt = 0.6 * median_size  # ~line spacing fraction
+                        page_config["y_tolerance"] = yt
+
+                # Warn users when the page's font size is extremely small –
+                # this is often the root cause of merged-row/column issues.
+                if median_size and median_size < 6:  # 6 pt is unusually small
+                    logger.warning(
+                        f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
+                        f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
+                        "If the output looks wrong you can override these values via "
+                        "PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
+                        "auto_text_tolerance=False)."
+                    )
+
+            # Fallback to pdfplumber defaults if still None
+            if xt is None:
+                xt = 3
+            if yt is None:
+                yt = 3
 
         # List of attributes to preserve on word objects
         attributes_to_preserve = list(
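The adaptive tolerance added above boils down to two constants: a quarter of the median character size for the horizontal tolerance and 0.6 of it for the vertical one, with pdfplumber's default of 3 as the fallback. A standalone sketch of that arithmetic (the sample sizes are made up; this is not a call into natural-pdf itself):

    import statistics

    def auto_text_tolerance(char_sizes, x_tolerance=None, y_tolerance=None):
        """Scale word-grouping tolerances from the median character size."""
        sizes = [s for s in char_sizes if s]
        if sizes:
            median_size = statistics.median(sizes)
            if x_tolerance is None:
                x_tolerance = 0.25 * median_size  # ~kerning width
            if y_tolerance is None:
                y_tolerance = 0.6 * median_size  # ~line spacing fraction
        # Fall back to pdfplumber's defaults when nothing could be derived
        return (x_tolerance if x_tolerance is not None else 3,
                y_tolerance if y_tolerance is not None else 3)

    print(auto_text_tolerance([8.0, 9.5, 10.0, 10.0]))  # (2.4375, 5.85) for a 9.75 pt median
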
@@ -284,7 +343,6 @@ class ElementManager:
                 current_line_key = line_key
                 lines[-1].append(char_dict)
 
-        word_elements: List[TextElement] = []
         # Process each line separately with direction detection
         for line_chars in lines:
             if not line_chars:
@@ -441,7 +499,8 @@ class ElementManager:
                 except Exception:
                     w._obj["highlight_color"] = dominant_color
 
-        generated_words
+        # generated_words defaults to empty list if text loading is disabled
+        generated_words = word_elements if self._load_text else []
         logger.debug(
             f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
        )
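Taken together, the element_manager.py changes make the text layer optional end to end: when load_text is False the character dictionaries are never prepared, every decoration pass (strikethrough, underline, highlight) is skipped, and the page ends up with an empty word list instead of raising. A condensed sketch of that control flow, not the real load_elements (which also does RTL-aware word grouping and attribute handling):

    def load_elements_sketch(manager):
        # Nothing text-related is prepared when text loading is disabled.
        prepared_char_dicts = manager._prepare_char_dicts() if manager._load_text else []

        # Every downstream step carries the same guard ...
        if manager._load_text and prepared_char_dicts:
            manager._mark_strikethrough_chars(prepared_char_dicts)

        # ... so the final word list simply stays empty.
        word_elements = []  # filled by the word extractor when text is loaded
        return word_elements if manager._load_text else []
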
natural_pdf/core/page.py
CHANGED
@@ -101,7 +101,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
     with improved selection, navigation, extraction, and question-answering capabilities.
     """
 
-    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
+    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None, load_text: bool = True):
         """
         Initialize a page wrapper.
 
@@ -110,10 +110,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             parent: Parent PDF object
             index: Index of this page in the PDF (0-based)
             font_attrs: Font attributes to consider when grouping characters into words.
+            load_text: Whether to load text elements from the PDF (default: True).
         """
         self._page = page
         self._parent = parent
         self._index = index
+        self._load_text = load_text
         self._text_styles = None  # Lazy-loaded text style analyzer results
         self._exclusions = []  # List to store exclusion functions/regions
         self._skew_angle: Optional[float] = None  # Stores detected skew angle
@@ -128,8 +130,15 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             "named": {},  # Named regions (name -> region)
         }
 
+        # -------------------------------------------------------------
+        # Page-scoped configuration begins as a shallow copy of the parent
+        # PDF-level configuration so that auto-computed tolerances or other
+        # page-specific values do not overwrite siblings.
+        # -------------------------------------------------------------
+        self._config = dict(getattr(self._parent, "_config", {}))
+
         # Initialize ElementManager, passing font_attrs
-        self._element_mgr = ElementManager(self, font_attrs=font_attrs)
+        self._element_mgr = ElementManager(self, font_attrs=font_attrs, load_text=self._load_text)
         # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
         # --- NEW --- Central registry for analysis results
         self.analyses: Dict[str, Any] = {}
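Why the shallow copy matters: every Page now gets its own _config dict seeded from the PDF, so a tolerance that one page auto-computes is not silently inherited by its siblings, while PDF-level defaults still reach every page. A minimal illustration with plain dicts (no natural-pdf objects involved):

    pdf_config = {"auto_text_tolerance": True, "keep_spaces": True}

    page_a_config = dict(pdf_config)  # shallow copy, as in Page.__init__ above
    page_b_config = dict(pdf_config)

    page_a_config["x_tolerance"] = 2.4  # auto-computed for one small-font page

    print("x_tolerance" in page_b_config)  # False – siblings are unaffected
    print(page_b_config["keep_spaces"])    # True – PDF-level defaults still apply
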
@@ -1153,10 +1162,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # 5. Generate Text Layout using Utility
         # Pass page bbox as layout context
         page_bbox = (0, 0, self.width, self.height)
+        # Merge PDF-level default tolerances if caller did not override
+        merged_kwargs = dict(kwargs)
+        tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
+        for k in tol_keys:
+            if k not in merged_kwargs:
+                if k in self._config:
+                    merged_kwargs[k] = self._config[k]
+                elif k in getattr(self._parent, "_config", {}):
+                    merged_kwargs[k] = self._parent._config[k]
+
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=page_bbox,
-            user_kwargs=
+            user_kwargs=merged_kwargs,
         )
 
         # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
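The merge above establishes a clear precedence for text tolerances during extraction: explicit keyword arguments win, then the page's _config, then the parent PDF's _config. The same rule in isolation, with hypothetical values:

    def merge_tolerances(kwargs, page_config, pdf_config,
                         keys=("x_tolerance", "x_tolerance_ratio", "y_tolerance")):
        merged = dict(kwargs)
        for k in keys:
            if k not in merged:
                if k in page_config:
                    merged[k] = page_config[k]
                elif k in pdf_config:
                    merged[k] = pdf_config[k]
        return merged

    print(merge_tolerances({"y_tolerance": 4},            # caller override wins
                           {"x_tolerance": 2.4},          # page-level auto value
                           {"x_tolerance": 3, "y_tolerance": 3}))
    # {'y_tolerance': 4, 'x_tolerance': 2.4}
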
@@ -1356,6 +1375,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
 
         # Use the selected method
         if effective_method == "pdfplumber":
+            # ---------------------------------------------------------
+            # Inject auto-computed or user-specified text tolerances so
+            # pdfplumber uses the same numbers we used for word grouping
+            # whenever the table algorithm relies on word positions.
+            # ---------------------------------------------------------
+            if "text" in (
+                table_settings.get("vertical_strategy"),
+                table_settings.get("horizontal_strategy"),
+            ):
+                print("SETTING IT UP")
+                pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
+                if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+                    x_tol = pdf_cfg.get("x_tolerance")
+                    if x_tol is not None:
+                        table_settings.setdefault("text_x_tolerance", x_tol)
+                if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+                    y_tol = pdf_cfg.get("y_tolerance")
+                    if y_tol is not None:
+                        table_settings.setdefault("text_y_tolerance", y_tol)
+
+                # pdfplumber's text strategy benefits from a tight snap tolerance.
+                if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+                    # Derive from y_tol if available, else default 1
+                    snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
+                    table_settings.setdefault("snap_tolerance", snap)
+                if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+                    join = table_settings.get("snap_tolerance", 1)
+                    table_settings.setdefault("join_tolerance", join)
+                    table_settings.setdefault("join_x_tolerance", join)
+                    table_settings.setdefault("join_y_tolerance", join)
+
             return self._page.extract_tables(table_settings)
         else:
             raise ValueError(
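When a text-based table strategy is selected, the block above only fills in keys the caller has not set: the word-grouping tolerances are copied into text_x_tolerance/text_y_tolerance, and snap/join tolerances are derived from the vertical tolerance. The derivation on a plain dict, with a hypothetical config:

    table_settings = {"vertical_strategy": "text", "horizontal_strategy": "lines"}
    pdf_cfg = {"x_tolerance": 2.4, "y_tolerance": 5.85}

    if "text" in (table_settings.get("vertical_strategy"),
                  table_settings.get("horizontal_strategy")):
        table_settings.setdefault("text_x_tolerance", pdf_cfg["x_tolerance"])
        table_settings.setdefault("text_y_tolerance", pdf_cfg["y_tolerance"])
        snap = max(1, round(pdf_cfg.get("y_tolerance", 1) * 0.9))  # round(5.265) -> 5
        table_settings.setdefault("snap_tolerance", snap)
        join = table_settings["snap_tolerance"]
        table_settings.setdefault("join_tolerance", join)

    print(table_settings["snap_tolerance"], table_settings["join_tolerance"])  # 5 5
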
@@ -2950,6 +3000,29 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         """
         return self.find_all('*').inspect(limit=limit)
 
+    def remove_text_layer(self) -> "Page":
+        """
+        Remove all text elements from this page.
+
+        This removes all text elements (words and characters) from the page,
+        effectively clearing the text layer.
+
+        Returns:
+            Self for method chaining
+        """
+        logger.info(f"Page {self.number}: Removing all text elements...")
+
+        # Remove all words and chars from the element manager
+        removed_words = len(self._element_mgr.words)
+        removed_chars = len(self._element_mgr.chars)
+
+        # Clear the lists
+        self._element_mgr._elements["words"] = []
+        self._element_mgr._elements["chars"] = []
+
+        logger.info(f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters")
+        return self
+
     @property
     def lines(self) -> List[Any]:
         """Get all line elements on this page."""
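A short usage sketch for the new remove_text_layer() method. The import path follows the file layout in this diff (the package may also re-export PDF at the top level); example.pdf is a placeholder:

    from natural_pdf.core.pdf import PDF

    pdf = PDF("example.pdf")
    page = pdf.pages[0]

    # Drops every word and character element from the page's element manager
    # and returns the page, so calls can be chained.
    page.remove_text_layer()
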
natural_pdf/core/pdf.py
CHANGED
@@ -108,12 +108,13 @@ class _LazyPageList(Sequence):
     also supported and will materialise pages on demand.
     """
 
-    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
+    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
         # One slot per pdfplumber page – initially all None
         self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
+        self._load_text = load_text
 
     # Internal helper -----------------------------------------------------
     def _create_page(self, index: int) -> "Page":
@@ -123,7 +124,7 @@ class _LazyPageList(Sequence):
         from natural_pdf.core.page import Page
 
         plumber_page = self._plumber_pdf.pages[index]
-        cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
+        cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs, load_text=self._load_text)
         self._cache[index] = cached
         return cached
 
@@ -168,6 +169,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         reading_order: bool = True,
         font_attrs: Optional[List[str]] = None,
         keep_spaces: bool = True,
+        text_tolerance: Optional[dict] = None,
+        auto_text_tolerance: bool = True,
+        text_layer: bool = True,
     ):
         """
         Initialize the enhanced PDF object.
@@ -177,11 +181,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             reading_order: Whether to use natural reading order
             font_attrs: Font attributes for grouping characters into words
             keep_spaces: Whether to include spaces in word elements
+            text_tolerance: PDFplumber-style tolerance settings
+            auto_text_tolerance: Whether to automatically scale text tolerance
+            text_layer: Whether to keep the existing text layer from the PDF (default: True).
+                If False, removes all existing text elements during initialization.
         """
         self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
         self._resolved_path = None
         self._is_stream = False
+        self._text_layer = text_layer
         stream_to_open = None
 
         if hasattr(path_or_url_or_stream, "read"):  # Check if it's file-like
@@ -253,7 +262,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._manager_registry = {}
 
         # Lazily instantiate pages only when accessed
-        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
+        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer)
 
         self._element_cache = {}
         self._exclusions = []
@@ -263,6 +272,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
 
         self._initialize_managers()
         self._initialize_highlighter()
+
+        # Remove text layer if requested
+        if not self._text_layer:
+            logger.info("Removing text layer as requested (text_layer=False)")
+            # Text layer is not loaded when text_layer=False, so no need to remove
+            pass
+
         # Analysis results accessed via self.analyses property (see below)
 
         # --- Automatic cleanup when object is garbage-collected ---
@@ -274,6 +290,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             getattr(self, "_is_stream", False),
         )
 
+        # --- Text tolerance settings ------------------------------------
+        # Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
+        # y_tolerance, etc.) via *text_tolerance*. We also keep a flag that
+        # enables automatic tolerance scaling when explicit values are not
+        # supplied.
+        self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
+        if text_tolerance:
+            # Only copy recognised primitives (numbers / None); ignore junk.
+            allowed = {
+                "x_tolerance",
+                "x_tolerance_ratio",
+                "y_tolerance",
+                "keep_blank_chars",  # passthrough convenience
+            }
+            for k, v in text_tolerance.items():
+                if k in allowed:
+                    self._config[k] = v
+
     def _initialize_managers(self):
         """Set up manager factories for lazy instantiation."""
         # Store factories/classes for each manager key
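The new constructor options in use, mirroring the override hint that the low-font-size warning prints earlier in this diff. Paths and tolerance values are placeholders; the import path follows the file layout shown here:

    from natural_pdf.core.pdf import PDF

    # Pin pdfplumber-style tolerances and turn the adaptive heuristic off.
    pdf = PDF(
        "example.pdf",
        text_tolerance={"x_tolerance": 1.5, "y_tolerance": 3},
        auto_text_tolerance=False,
    )

    # Or skip the embedded text layer entirely (e.g. for a scan you intend to re-OCR).
    scanned = PDF("scanned.pdf", text_layer=False)
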
@@ -1441,6 +1475,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
                 reading_order=self._reading_order,
                 font_attrs=self._font_attrs,
                 keep_spaces=self._config.get("keep_spaces", True),
+                text_layer=self._text_layer,
             )
             return new_pdf
         except Exception as e:
natural_pdf/elements/collections.py
CHANGED
@@ -1901,7 +1901,68 @@ class ElementCollection(
             )
         )
 
+    # ------------------------------------------------------------------
+    # NEW METHOD: apply_ocr for collections (supports custom function)
+    # ------------------------------------------------------------------
+    def apply_ocr(
+        self,
+        *,
+        function: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
+        """Apply OCR to every element in the collection.
+
+        This is a convenience wrapper that simply iterates over the collection
+        and calls ``el.apply_ocr(...)`` on each item.
+
+        Two modes are supported depending on the arguments provided:
+
+        1. **Built-in OCR engines** – pass parameters like ``engine='easyocr'``
+           or ``languages=['en']`` and each element delegates to the global
+           OCRManager.
+        2. **Custom function** – pass a *callable* via the ``function`` keyword
+           (alias ``ocr_function`` also recognised). The callable will receive
+           the element/region and must return the recognised text (or ``None``).
+           Internally this is forwarded through the element's own
+           :py:meth:`apply_ocr` implementation, so the behaviour mirrors the
+           single-element API.
+
+        Parameters
+        ----------
+        function : callable, optional
+            Custom OCR function to use instead of the built-in engines.
+        show_progress : bool, default True
+            Display a tqdm progress bar while processing.
+        **kwargs
+            Additional parameters forwarded to each element's ``apply_ocr``.
+
+        Returns
+        -------
+        ElementCollection
+            *Self* for fluent chaining.
+        """
+        # Alias for backward-compatibility
+        if function is None and "ocr_function" in kwargs:
+            function = kwargs.pop("ocr_function")
+
+        def _process(el):
+            if hasattr(el, "apply_ocr"):
+                if function is not None:
+                    return el.apply_ocr(function=function, **kwargs)
+                else:
+                    return el.apply_ocr(**kwargs)
+            else:
+                logger.warning(
+                    f"Element of type {type(el).__name__} does not support apply_ocr. Skipping."
+                )
+                return el
+
+        # Use collection's apply helper for optional progress bar
+        self.apply(_process, show_progress=show_progress)
+        return self
 
+    # ------------------------------------------------------------------
 
 
 class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
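A usage sketch for the collection-level apply_ocr wrapper added above, covering both modes described in its docstring. The selector string and the custom callable are illustrative; only the function, show_progress, engine, and languages parameters are taken from the diff itself:

    from natural_pdf.core.pdf import PDF

    page = PDF("scanned.pdf").pages[0]
    regions = page.find_all("text")  # any ElementCollection works

    # 1. Built-in engine mode – kwargs are forwarded to each element's apply_ocr.
    regions.apply_ocr(engine="easyocr", languages=["en"])

    # 2. Custom-function mode – the callable receives each element/region and
    #    returns the recognised text (or None to leave it unchanged).
    def my_ocr(region):
        # Placeholder: a real implementation would render the region and run a model.
        return "recognised text"

    regions.apply_ocr(function=my_ocr, show_progress=False)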