natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +241 -158
- natural_pdf/classification/mixin.py +52 -38
- natural_pdf/classification/results.py +71 -45
- natural_pdf/collections/mixins.py +85 -20
- natural_pdf/collections/pdf_collection.py +245 -100
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +694 -195
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +610 -134
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
- natural_pdf-0.1.10.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
The hunks that follow detail natural_pdf/elements/collections.py (+610 -134):

@@ -1,32 +1,42 @@
 import logging
+from collections.abc import MutableSequence
+from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
     Dict,
     Generic,
+    Iterable,
     Iterator,
     List,
     Optional,
+    Sequence,
     Tuple,
+    Type,
     TypeVar,
     Union,
-
+    overload,
 )

 from pdfplumber.utils.geometry import objects_to_bbox
-from
+from PIL import Image, ImageDraw, ImageFont

 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+from tqdm.auto import tqdm

+from natural_pdf.classification.manager import ClassificationManager
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
+from natural_pdf.core.pdf import PDF
+from natural_pdf.elements.base import Element
+from natural_pdf.elements.region import Region
 from natural_pdf.elements.text import TextElement
+from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
-from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
-from natural_pdf.
-from natural_pdf.classification.manager import ClassificationManager
-from natural_pdf.collections.mixins import ApplyMixin
+from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

 logger = logging.getLogger(__name__)

@@ -38,7 +48,9 @@ T = TypeVar("T")
 P = TypeVar("P", bound="Page")


-class ElementCollection(Generic[T], ApplyMixin):
+class ElementCollection(
+    Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+):
     """
     Collection of PDF elements with batch operations.
     """
@@ -60,10 +72,6 @@ class ElementCollection(Generic[T], ApplyMixin):
         """Get an element by index."""
         return self._elements[index]

-    def __iter__(self):
-        """Iterate over elements."""
-        return iter(self._elements)
-
     def __repr__(self) -> str:
         """Return a string representation showing the element count."""
         element_type = "Mixed"
@@ -73,6 +81,20 @@ class ElementCollection(Generic[T], ApplyMixin):
             element_type = types.pop()
         return f"<ElementCollection[{element_type}](count={len(self)})>"

+    def __add__(self, other: "ElementCollection") -> "ElementCollection":
+        if not isinstance(other, ElementCollection):
+            return NotImplemented
+        return ElementCollection(self._elements + other._elements)
+
+    def __setitem__(self, index, value):
+        self._elements[index] = value
+
+    def __delitem__(self, index):
+        del self._elements[index]
+
+    def insert(self, index, value):
+        self._elements.insert(index, value)
+
     @property
     def elements(self) -> List["Element"]:
         """Get the elements in this collection."""
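With ElementCollection now registered as a MutableSequence (which derives iteration from `__getitem__`/`__len__`, making the custom `__iter__` above redundant) and the `__add__`/`__setitem__`/`__delitem__`/`insert` additions, list-style operations work directly on collections. A minimal sketch, assuming a local example.pdf; the file name and selectors are illustrative:

    from natural_pdf import PDF

    pdf = PDF("example.pdf")  # hypothetical input file
    page = pdf.pages[0]
    big = page.find_all("text[size>=12]")    # illustrative selector
    small = page.find_all("text[size<12]")

    merged = big + small           # __add__ builds a new ElementCollection
    merged.insert(0, small[0])     # MutableSequence insert
    merged[0] = big[0]             # __setitem__
    del merged[0]                  # __delitem__
    print(len(merged))
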
@@ -125,9 +147,7 @@ class ElementCollection(Generic[T], ApplyMixin):

         # Check if any element is from a different PDF
         return any(
-            hasattr(e, "page") and
-            hasattr(e.page, "pdf") and
-            e.page.pdf is not first_pdf
+            hasattr(e, "page") and hasattr(e.page, "pdf") and e.page.pdf is not first_pdf
             for e in self._elements
         )

@@ -1113,62 +1133,23 @@ class ElementCollection(Generic[T], ApplyMixin):
             logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
             return None

-    def find_all(
-        self, selector: str, regex: bool = False, case: bool = True, **kwargs
-    ) -> "ElementCollection[T]":
+    def find(self, selector: str, **kwargs) -> "ElementCollection":
         """
-
+        Find elements in this collection matching the selector.

         Args:
-            selector: CSS-like selector string
-
-            case: Whether to do case-sensitive text search (default: True).
-            **kwargs: Additional filter parameters passed to the selector function.
-
-        Returns:
-            A new ElementCollection containing only the matching elements from this collection.
+            selector: CSS-like selector string
+            apply_exclusions: Whether to exclude elements in exclusion regions
         """
-
-            return ElementCollection([])
+        return self.apply(lambda element: element.find(selector, **kwargs))

-
-            selector_obj = parse_selector(selector)
-        except Exception as e:
-            logger.error(f"Error parsing selector '{selector}': {e}")
-            return ElementCollection([])  # Return empty on parse error
-
-        # Pass regex and case flags to selector function generator
-        kwargs["regex"] = regex
-        kwargs["case"] = case
-
-        try:
-            filter_func = selector_to_filter_func(selector_obj, **kwargs)
-        except Exception as e:
-            logger.error(f"Error creating filter function for selector '{selector}': {e}")
-            return ElementCollection([])  # Return empty on filter creation error
-
-        matching_elements = [element for element in self._elements if filter_func(element)]
-
-        # Note: Unlike Page.find_all, this doesn't re-sort.
-        # Sorting should be done explicitly on the collection if needed.
-
-        return ElementCollection(matching_elements)
-
-    def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
+    def extract_each_text(self, **kwargs) -> List[str]:
         """
-
-
-        Args:
-            selector: CSS-like selector string.
-            regex: Whether to use regex for text search in :contains (default: False).
-            case: Whether to do case-sensitive text search (default: True).
-            **kwargs: Additional filter parameters passed to the selector function.
-
-        Returns:
-            The first matching element or None.
+        Extract text from each element in this region.
         """
-
-
+        return self.apply(
+            lambda element: element.extract_text(**kwargs) if element is not None else None
+        )

     def correct_ocr(
         self,
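Note the changed semantics here: `find` on a collection now maps `element.find` over each member via ApplyMixin.apply (one result per member, possibly None) instead of filtering the collection in place. A small sketch, assuming `regions` is an existing ElementCollection and the selector is illustrative:

    first_bold_per_region = regions.find("text:bold")  # one hit (or None) per member
    texts = regions.extract_each_text()                # one extracted string per member
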
@@ -1214,23 +1195,23 @@ class ElementCollection(Generic[T], ApplyMixin):
     def remove(self) -> int:
         """
         Remove all elements in this collection from their respective pages.
-
+
         This method removes elements from the page's _element_mgr storage.
         It's particularly useful for removing OCR elements before applying new OCR.
-
+
         Returns:
             int: Number of elements successfully removed
         """
         if not self._elements:
             return 0
-
+
         removed_count = 0
-
+
         for element in self._elements:
             # Each element should have a reference to its page
             if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
                 element_mgr = element.page._element_mgr
-
+
                 # Determine element type
                 element_type = getattr(element, "object_type", None)
                 if element_type:
@@ -1243,7 +1224,7 @@ class ElementCollection(Generic[T], ApplyMixin):
                         element_type = "rects"
                     elif element_type == "line":
                         element_type = "lines"
-
+
                     # Try to remove from the element manager
                     if hasattr(element_mgr, "remove_element"):
                         success = element_mgr.remove_element(element, element_type)
@@ -1253,27 +1234,27 @@ class ElementCollection(Generic[T], ApplyMixin):
                         logger.warning("ElementManager does not have remove_element method")
             else:
                 logger.warning(f"Element has no page or page has no _element_mgr: {element}")
-
+
         return removed_count

     # --- Classification Method --- #
     def classify_all(
         self,
-
+        labels: List[str],
         model: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
-        analysis_key: str =
+        analysis_key: str = "classification",
         multi_label: bool = False,
         batch_size: int = 8,
         max_workers: Optional[int] = None,
         progress_bar: bool = True,
-        **kwargs
+        **kwargs,
     ):
         """Classifies all elements in the collection in batch.

         Args:
-
+            labels: List of category labels.
             model: Model ID (or alias 'text', 'vision').
             using: Optional processing mode ('text' or 'vision'). Inferred if None.
             min_confidence: Minimum confidence threshold.
@@ -1292,21 +1273,21 @@ class ElementCollection(Generic[T], ApplyMixin):
         # Requires access to the PDF's manager. Assume first element has it.
         first_element = self.elements[0]
         manager_source = None
-        if hasattr(first_element,
-
-        elif hasattr(first_element,
-
-
-        if not manager_source or not hasattr(manager_source,
-
+        if hasattr(first_element, "page") and hasattr(first_element.page, "pdf"):
+            manager_source = first_element.page.pdf
+        elif hasattr(first_element, "pdf"):  # Maybe it's a PageCollection?
+            manager_source = first_element.pdf
+
+        if not manager_source or not hasattr(manager_source, "get_manager"):
+            raise RuntimeError("Cannot access ClassificationManager via elements.")

         try:
-            manager = manager_source.get_manager(
+            manager = manager_source.get_manager("classification")
         except Exception as e:
-
+            raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e

         if not manager or not manager.is_available():
-
+            raise RuntimeError("ClassificationManager is not available.")

         # Determine engine type early for content gathering
         inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
@@ -1314,60 +1295,187 @@ class ElementCollection(Generic[T], ApplyMixin):
         # Gather content from all elements
         items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
         original_elements: List[Any] = []
-        logger.info(
+        logger.info(
+            f"Gathering content for {len(self.elements)} elements for batch classification..."
+        )
         for element in self.elements:
-
-
-
-
-
-
-
-
-
-
-
-
+            if not isinstance(element, ClassificationMixin):
+                logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
+                continue
+            try:
+                # Delegate content fetching to the element itself
+                content = element._get_classification_content(model_type=inferred_using, **kwargs)
+                items_to_classify.append(content)
+                original_elements.append(element)
+            except (ValueError, NotImplementedError) as e:
+                logger.warning(
+                    f"Skipping element {element!r}: Cannot get content for classification - {e}"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Skipping element {element!r}: Error getting classification content - {e}"
+                )

         if not items_to_classify:
-
-
+            logger.warning("No content could be gathered from elements for batch classification.")
+            return self

-        logger.info(
+        logger.info(
+            f"Collected content for {len(items_to_classify)} elements. Running batch classification..."
+        )

         # Call manager's batch classify
         batch_results: List[ClassificationResult] = manager.classify_batch(
             item_contents=items_to_classify,
-
+            labels=labels,
             model_id=model,
             using=inferred_using,
             min_confidence=min_confidence,
             multi_label=multi_label,
             batch_size=batch_size,
             progress_bar=progress_bar,
-            **kwargs
+            **kwargs,
         )

         # Assign results back to elements
         if len(batch_results) != len(original_elements):
-
-
-
-
-
+            logger.error(
+                f"Batch classification result count ({len(batch_results)}) mismatch "
+                f"with elements processed ({len(original_elements)}). Cannot assign results."
+            )
+            # Decide how to handle mismatch - maybe store errors?
         else:
-
-
-
-
-
-
-
-
+            logger.info(
+                f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'."
+            )
+            for element, result_obj in zip(original_elements, batch_results):
+                try:
+                    if not hasattr(element, "analyses") or element.analyses is None:
+                        element.analyses = {}
+                    element.analyses[analysis_key] = result_obj
+                except Exception as e:
+                    logger.warning(f"Failed to store classification result for {element!r}: {e}")

         return self
+
     # --- End Classification Method --- #

+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all elements in the collection.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self.elements:
+            logger.warning("No elements found in collection")
+            return []
+
+        all_data = []
+
+        for i, element in enumerate(self.elements):
+            # Base element information
+            element_data = {
+                "element_index": i,
+                "element_type": getattr(element, "type", type(element).__name__),
+            }
+
+            # Add geometry if available
+            for attr in ["x0", "top", "x1", "bottom", "width", "height"]:
+                if hasattr(element, attr):
+                    element_data[attr] = getattr(element, attr)
+
+            # Add page information if available
+            if hasattr(element, "page"):
+                page = element.page
+                if page:
+                    element_data["page_number"] = getattr(page, "number", None)
+                    element_data["pdf_path"] = (
+                        getattr(page.pdf, "path", None) if hasattr(page, "pdf") else None
+                    )
+
+            # Include extracted text if requested
+            if include_content and hasattr(element, "extract_text"):
+                try:
+                    element_data["content"] = element.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from element {i}: {e}")
+                    element_data["content"] = ""
+
+            # Save image if requested
+            if include_images and hasattr(element, "to_image"):
+                try:
+                    # Create identifier for the element
+                    pdf_name = "unknown"
+                    page_num = "unknown"
+
+                    if hasattr(element, "page") and element.page:
+                        page_num = element.page.number
+                        if hasattr(element.page, "pdf") and element.page.pdf:
+                            pdf_name = Path(element.page.pdf.path).stem
+
+                    # Create image filename
+                    element_type = element_data.get("element_type", "element").lower()
+                    image_filename = f"{pdf_name}_page{page_num}_{element_type}_{i}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    element.to_image(
+                        path=str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for element {i}: {e}")
+                    element_data["image_path"] = None
+
+            # Add analyses data
+            if hasattr(element, "analyses"):
+                for key in analysis_keys:
+                    if key not in element.analyses:
+                        # Skip this key if it doesn't exist - elements might have different analyses
+                        logger.warning(f"Analysis key '{key}' not found in element {i}")
+                        continue
+
+                    # Get the analysis result
+                    analysis_result = element.analyses[key]
+
+                    # If the result has a to_dict method, use it
+                    if hasattr(analysis_result, "to_dict"):
+                        analysis_data = analysis_result.to_dict()
+                    else:
+                        # Otherwise, use the result directly if it's dict-like
+                        try:
+                            analysis_data = dict(analysis_result)
+                        except (TypeError, ValueError):
+                            # Last resort: convert to string
+                            analysis_data = {"raw_result": str(analysis_result)}
+
+                    # Add analysis data to element data with the key as prefix
+                    for k, v in analysis_data.items():
+                        element_data[f"{key}.{k}"] = v
+
+            all_data.append(element_data)
+
+        return all_data
+

 class PageCollection(Generic[P], ApplyMixin):
     """
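The batch classification path now fails loudly with RuntimeError instead of silently returning, and `classify_all` takes an explicit `labels` list. A minimal sketch of a call, assuming the classification extras are installed; the labels and selector are illustrative:

    tables = pdf.find_all("region[type=table]")
    tables.classify_all(
        labels=["budget", "staffing", "other"],  # hypothetical categories
        model="text",            # alias resolved by the ClassificationManager
        analysis_key="topic",
    )
    print(tables[0].analyses["topic"])
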
@@ -1500,39 +1608,127 @@ class PageCollection(Generic[P], ApplyMixin):

         return self  # Return self for chaining

-
+    @overload
+    def find(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]: ...
+
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]: ...
+
+    def find(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]:
         """
-        Find the first element matching the selector across all pages.
+        Find the first element matching the selector OR text across all pages in the collection.
+
+        Provide EITHER `selector` OR `text`, but not both.

         Args:
-            selector: CSS-like selector string
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.

         Returns:
-            First matching element or None
+            First matching element or None.
         """
+        # Input validation happens within page.find
         for page in self.pages:
-            element = page.find(
+            element = page.find(
+                selector=selector,
+                text=text,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
             if element:
                 return element
         return None

-
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-        Find all elements matching the selector across all pages.
+        Find all elements matching the selector OR text across all pages in the collection.
+
+        Provide EITHER `selector` OR `text`, but not both.

         Args:
-            selector: CSS-like selector string
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.

         Returns:
-            ElementCollection with matching elements from all pages
+            ElementCollection with matching elements from all pages.
         """
         all_elements = []
+        # Input validation happens within page.find_all
         for page in self.pages:
-            elements = page.find_all(
+            elements = page.find_all(
+                selector=selector,
+                text=text,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
             if elements:
                 all_elements.extend(elements.elements)

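The new overloads make `text=` a keyword-only alternative to a selector, mirroring the Page-level API. A sketch against a PageCollection; the search strings are illustrative:

    pages = pdf.pages
    intro = pages.find(text="Introduction", case=False)  # keyword-only text search
    totals = pages.find_all('text:contains("Total")')    # selector form, as before
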
@@ -1571,10 +1767,14 @@ class PageCollection(Generic[P], ApplyMixin):

         # Assume all pages share the same parent PDF object
         parent_pdf = self.pages[0]._parent
-        if
-
-
-
+        if (
+            not parent_pdf
+            or not hasattr(parent_pdf, "correct_ocr")
+            or not callable(parent_pdf.correct_ocr)
+        ):
+            raise RuntimeError(
+                "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+            )

         page_indices = [p.index for p in self.pages]
         logger.info(
@@ -1586,7 +1786,7 @@ class PageCollection(Generic[P], ApplyMixin):
         parent_pdf.correct_ocr(
             correction_callback=correction_callback,
             pages=page_indices,
-            max_workers=max_workers
+            max_workers=max_workers,  # Pass it here
         )

         return self
@@ -1891,3 +2091,279 @@ class PageCollection(Generic[P], ApplyMixin):
             sections.append(region)

         return sections
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all pages in the collection.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self.elements:
+            logger.warning("No pages found in collection")
+            return []
+
+        all_data = []
+
+        for page in self.elements:
+            # Basic page information
+            page_data = {
+                "page_number": page.number,
+                "page_index": page.index,
+                "width": page.width,
+                "height": page.height,
+            }
+
+            # Add PDF information if available
+            if hasattr(page, "pdf") and page.pdf:
+                page_data["pdf_path"] = page.pdf.path
+                page_data["pdf_filename"] = Path(page.pdf.path).name
+
+            # Include extracted text if requested
+            if include_content:
+                try:
+                    page_data["content"] = page.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page.number}: {e}")
+                    page_data["content"] = ""
+
+            # Save image if requested
+            if include_images:
+                try:
+                    # Create image filename
+                    pdf_name = "unknown"
+                    if hasattr(page, "pdf") and page.pdf:
+                        pdf_name = Path(page.pdf.path).stem
+
+                    image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    page.save_image(
+                        str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for page {page.number}: {e}")
+                    page_data["image_path"] = None
+
+            # Add analyses data
+            if hasattr(page, "analyses") and page.analyses:
+                for key in analysis_keys:
+                    if key not in page.analyses:
+                        raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                    # Get the analysis result
+                    analysis_result = page.analyses[key]
+
+                    # If the result has a to_dict method, use it
+                    if hasattr(analysis_result, "to_dict"):
+                        analysis_data = analysis_result.to_dict()
+                    else:
+                        # Otherwise, use the result directly if it's dict-like
+                        try:
+                            analysis_data = dict(analysis_result)
+                        except (TypeError, ValueError):
+                            # Last resort: convert to string
+                            analysis_data = {"raw_result": str(analysis_result)}
+
+                    # Add analysis data to page data with the key as prefix
+                    for k, v in analysis_data.items():
+                        page_data[f"{key}.{k}"] = v
+
+            all_data.append(page_data)
+
+        return all_data
+
+    # --- Deskew Method --- #
+
+    def deskew(
+        self,
+        resolution: int = 300,
+        detection_resolution: int = 72,
+        force_overwrite: bool = False,
+        **deskew_kwargs,
+    ) -> "PDF":  # Changed return type
+        """
+        Creates a new, in-memory PDF object containing deskewed versions of the pages
+        in this collection.
+
+        This method delegates the actual processing to the parent PDF object's
+        `deskew` method.
+
+        Important: The returned PDF is image-based. Any existing text, OCR results,
+        annotations, or other elements from the original pages will *not* be carried over.
+
+        Args:
+            resolution: DPI resolution for rendering the output deskewed pages.
+            detection_resolution: DPI resolution used for skew detection if angles are not
+                already cached on the page objects.
+            force_overwrite: If False (default), raises a ValueError if any target page
+                already contains processed elements (text, OCR, regions) to
+                prevent accidental data loss. Set to True to proceed anyway.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+        Returns:
+            A new PDF object representing the deskewed document.
+
+        Raises:
+            ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
+            ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
+                or if the collection is empty.
+            RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
+        """
+        if not self.pages:
+            logger.warning("Cannot deskew an empty PageCollection.")
+            raise ValueError("Cannot deskew an empty PageCollection.")
+
+        # Assume all pages share the same parent PDF object
+        # Need to hint the type of _parent for type checkers
+        if TYPE_CHECKING:
+            parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
+        else:
+            parent_pdf = self.pages[0]._parent
+
+        if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
+            raise RuntimeError(
+                "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
+            )
+
+        # Get the 0-based indices of the pages in this collection
+        page_indices = [p.index for p in self.pages]
+        logger.info(
+            f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
+        )
+
+        # Delegate the call to the parent PDF object for the relevant pages
+        # Pass all relevant arguments through (no output_path anymore)
+        return parent_pdf.deskew(
+            pages=page_indices,
+            resolution=resolution,
+            detection_resolution=detection_resolution,
+            force_overwrite=force_overwrite,
+            **deskew_kwargs,
+        )
+
+    # --- End Deskew Method --- #
+
+    def to_image(
+        self,
+        page_width: int = 300,
+        cols: Optional[int] = 4,
+        rows: Optional[int] = None,
+        max_pages: Optional[int] = None,
+        spacing: int = 10,
+        add_labels: bool = True,
+        show_category: bool = False,  # Add new flag
+    ) -> Optional["Image.Image"]:
+        """
+        Generate a grid of page images for this collection.
+
+        Args:
+            page_width: Width in pixels for rendering individual pages
+            cols: Number of columns in grid (default: 4)
+            rows: Number of rows in grid (calculated automatically if None)
+            max_pages: Maximum number of pages to include (default: all)
+            spacing: Spacing between page thumbnails in pixels
+            add_labels: Whether to add page number labels
+            show_category: Whether to add category and confidence labels (if available)
+
+        Returns:
+            PIL Image of the page grid or None if no pages
+        """
+        if not self.pages:
+            logger.warning("Cannot generate image for empty PageCollection")
+            return None
+
+        # Limit pages if max_pages is specified
+        pages_to_render = self.pages[:max_pages] if max_pages else self.pages
+
+        # Load font once outside the loop
+        font = ImageFont.load_default(16) if add_labels else None
+
+        # Render individual page images
+        page_images = []
+        for page in pages_to_render:
+            img = page.to_image(width=page_width)
+
+            # Add page number label
+            if add_labels and font:  # Check if font was loaded
+                draw = ImageDraw.Draw(img)
+                pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf else ""
+                label_text = f"p{page.number} - {pdf_name}"
+
+                # Add category if requested and available
+                if show_category:
+                    category = getattr(page, "category", None)
+                    confidence = getattr(page, "category_confidence", None)
+                    if category is not None and confidence is not None:
+                        category_str = f"{category} {confidence:.3f}"
+                        label_text += f"\n{category_str}"
+
+                # Calculate bounding box for multi-line text
+                # Use (5, 5) as top-left anchor for textbbox calculation for padding
+                # Use multiline_textbbox for accurate bounds with newlines
+                bbox = draw.multiline_textbbox((5, 5), label_text, font=font)
+                # Add padding to the calculated bbox for the white background
+                bg_rect = (bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2)
+
+                # Draw white background rectangle
+                draw.rectangle(bg_rect, fill=(255, 255, 255))
+
+                # Draw the potentially multi-line text using multiline_text
+                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font)
+
+            page_images.append(img)
+
+        # Calculate grid dimensions if not provided
+        if not rows and not cols:
+            # Default to a square-ish grid
+            cols = min(4, int(len(page_images) ** 0.5) + 1)
+            rows = (len(page_images) + cols - 1) // cols
+        elif rows and not cols:
+            cols = (len(page_images) + rows - 1) // rows
+        elif cols and not rows:
+            rows = (len(page_images) + cols - 1) // cols
+
+        # Get maximum dimensions for consistent grid cells
+        max_width = max(img.width for img in page_images)
+        max_height = max(img.height for img in page_images)
+
+        # Create grid image
+        grid_width = cols * max_width + (cols + 1) * spacing
+        grid_height = rows * max_height + (rows + 1) * spacing
+        grid_img = Image.new("RGB", (grid_width, grid_height), (255, 255, 255))
+
+        # Place images in grid
+        for i, img in enumerate(page_images):
+            if i >= rows * cols:
+                break
+
+            row = i // cols
+            col = i % cols
+
+            x = col * max_width + (col + 1) * spacing
+            y = row * max_height + (row + 1) * spacing
+
+            grid_img.paste(img, (x, y))
+
+        return grid_img
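Both new PageCollection methods delegate upward: `deskew` hands off to PDF.deskew and returns a new image-only PDF, while `to_image` tiles page thumbnails into a labeled grid. A usage sketch; the output file name is illustrative:

    deskewed = pdf.pages.deskew(resolution=300, force_overwrite=True)
    # Note: the result is image-based; prior text/OCR elements are not carried over.

    grid = pdf.pages.to_image(page_width=200, cols=4, show_category=False)
    if grid is not None:
        grid.save("page_grid.png")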