natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -16,6 +16,7 @@ from typing import (
|
|
16
16
|
Dict,
|
17
17
|
Iterable,
|
18
18
|
List,
|
19
|
+
Literal,
|
19
20
|
Optional,
|
20
21
|
Tuple,
|
21
22
|
Type,
|
@@ -31,6 +32,7 @@ from natural_pdf.classification.manager import ClassificationError
|
|
31
32
|
from natural_pdf.classification.mixin import ClassificationMixin
|
32
33
|
from natural_pdf.classification.results import ClassificationResult
|
33
34
|
from natural_pdf.core.highlighting_service import HighlightingService
|
35
|
+
from natural_pdf.core.render_spec import RenderSpec, Visualizable
|
34
36
|
from natural_pdf.elements.base import Element
|
35
37
|
from natural_pdf.elements.region import Region
|
36
38
|
from natural_pdf.export.mixin import ExportMixin
|
@@ -38,8 +40,12 @@ from natural_pdf.extraction.manager import StructuredDataManager
|
|
38
40
|
from natural_pdf.extraction.mixin import ExtractionMixin
|
39
41
|
from natural_pdf.ocr import OCRManager, OCROptions
|
40
42
|
from natural_pdf.selectors.parser import parse_selector
|
43
|
+
from natural_pdf.text_mixin import TextMixin
|
41
44
|
from natural_pdf.utils.locks import pdf_render_lock
|
42
45
|
|
46
|
+
if TYPE_CHECKING:
|
47
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
48
|
+
|
43
49
|
try:
|
44
50
|
from typing import Any as TypingAny
|
45
51
|
|
@@ -103,7 +109,6 @@ except ImportError:
|
|
103
109
|
from collections.abc import Sequence
|
104
110
|
|
105
111
|
|
106
|
-
|
107
112
|
class _LazyPageList(Sequence):
|
108
113
|
"""A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
|
109
114
|
|
@@ -141,18 +146,18 @@ class _LazyPageList(Sequence):
|
|
141
146
|
"""
|
142
147
|
|
143
148
|
def __init__(
|
144
|
-
self,
|
145
|
-
parent_pdf: "PDF",
|
146
|
-
plumber_pdf: "pdfplumber.PDF",
|
147
|
-
font_attrs=None,
|
149
|
+
self,
|
150
|
+
parent_pdf: "PDF",
|
151
|
+
plumber_pdf: "pdfplumber.PDF",
|
152
|
+
font_attrs=None,
|
148
153
|
load_text=True,
|
149
|
-
indices: Optional[List[int]] = None
|
154
|
+
indices: Optional[List[int]] = None,
|
150
155
|
):
|
151
156
|
self._parent_pdf = parent_pdf
|
152
157
|
self._plumber_pdf = plumber_pdf
|
153
158
|
self._font_attrs = font_attrs
|
154
159
|
self._load_text = load_text
|
155
|
-
|
160
|
+
|
156
161
|
# If indices is provided, this is a sliced view
|
157
162
|
if indices is not None:
|
158
163
|
self._indices = indices
|
@@ -180,23 +185,23 @@ class _LazyPageList(Sequence):
|
|
180
185
|
font_attrs=self._font_attrs,
|
181
186
|
load_text=self._load_text,
|
182
187
|
)
|
183
|
-
|
188
|
+
|
184
189
|
# Apply any stored exclusions to the newly created page
|
185
|
-
if hasattr(self._parent_pdf,
|
190
|
+
if hasattr(self._parent_pdf, "_exclusions"):
|
186
191
|
for exclusion_data in self._parent_pdf._exclusions:
|
187
192
|
exclusion_func, label = exclusion_data
|
188
193
|
try:
|
189
194
|
cached.add_exclusion(exclusion_func, label=label)
|
190
195
|
except Exception as e:
|
191
196
|
logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
|
192
|
-
|
197
|
+
|
193
198
|
# Apply any stored regions to the newly created page
|
194
|
-
if hasattr(self._parent_pdf,
|
199
|
+
if hasattr(self._parent_pdf, "_regions"):
|
195
200
|
for region_data in self._parent_pdf._regions:
|
196
201
|
region_func, name = region_data
|
197
202
|
try:
|
198
203
|
region_instance = region_func(cached)
|
199
|
-
if region_instance and hasattr(region_instance,
|
204
|
+
if region_instance and hasattr(region_instance, "__class__"):
|
200
205
|
# Check if it's a Region-like object (avoid importing Region here)
|
201
206
|
cached.add_region(region_instance, name=name, source="named")
|
202
207
|
elif region_instance is not None:
|
@@ -205,7 +210,7 @@ class _LazyPageList(Sequence):
|
|
205
210
|
)
|
206
211
|
except Exception as e:
|
207
212
|
logger.warning(f"Failed to apply region to page {cached.number}: {e}")
|
208
|
-
|
213
|
+
|
209
214
|
self._cache[index] = cached
|
210
215
|
return cached
|
211
216
|
|
@@ -215,7 +220,7 @@ class _LazyPageList(Sequence):
|
|
215
220
|
|
216
221
|
def __getitem__(self, key):
|
217
222
|
if isinstance(key, slice):
|
218
|
-
# Get the slice of our current indices
|
223
|
+
# Get the slice of our current indices
|
219
224
|
slice_indices = range(*key.indices(len(self)))
|
220
225
|
# Extract the actual page indices for this slice
|
221
226
|
actual_indices = [self._indices[i] for i in slice_indices]
|
@@ -225,7 +230,7 @@ class _LazyPageList(Sequence):
|
|
225
230
|
self._plumber_pdf,
|
226
231
|
font_attrs=self._font_attrs,
|
227
232
|
load_text=self._load_text,
|
228
|
-
indices=actual_indices
|
233
|
+
indices=actual_indices,
|
229
234
|
)
|
230
235
|
elif isinstance(key, int):
|
231
236
|
if key < 0:
|
@@ -247,7 +252,7 @@ class _LazyPageList(Sequence):
|
|
247
252
|
# --- End Lazy Page List Helper --- #
|
248
253
|
|
249
254
|
|
250
|
-
class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
255
|
+
class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
|
251
256
|
"""Enhanced PDF wrapper built on top of pdfplumber.
|
252
257
|
|
253
258
|
This class provides a fluent interface for working with PDF documents,
|
@@ -576,7 +581,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
576
581
|
print(f"Page {page.index} has {len(page.chars)} characters")
|
577
582
|
```
|
578
583
|
"""
|
579
|
-
from natural_pdf.
|
584
|
+
from natural_pdf.core.page_collection import PageCollection
|
580
585
|
|
581
586
|
if not hasattr(self, "_pages"):
|
582
587
|
raise AttributeError("PDF pages not yet initialized.")
|
@@ -608,7 +613,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
608
613
|
raise AttributeError("PDF pages not yet initialized.")
|
609
614
|
|
610
615
|
self._exclusions = []
|
611
|
-
|
616
|
+
|
612
617
|
# Clear exclusions only from already-created (cached) pages to avoid forcing page creation
|
613
618
|
for i in range(len(self._pages)):
|
614
619
|
if self._pages._cache[i] is not None: # Only clear from existing pages
|
@@ -618,9 +623,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
618
623
|
logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
|
619
624
|
return self
|
620
625
|
|
621
|
-
def add_exclusion(
|
622
|
-
self, exclusion_func, label: str = None
|
623
|
-
) -> "PDF":
|
626
|
+
def add_exclusion(self, exclusion_func, label: str = None) -> "PDF":
|
624
627
|
"""Add an exclusion function to the PDF.
|
625
628
|
|
626
629
|
Exclusion functions define regions of each page that should be ignored during
|
@@ -669,12 +672,12 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
669
672
|
# Support selector strings and ElementCollection objects directly.
|
670
673
|
# Store exclusion and apply only to already-created pages.
|
671
674
|
# ------------------------------------------------------------------
|
672
|
-
from natural_pdf.elements.
|
675
|
+
from natural_pdf.elements.element_collection import ElementCollection # local import
|
673
676
|
|
674
677
|
if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
|
675
678
|
# Store for bookkeeping and lazy application
|
676
679
|
self._exclusions.append((exclusion_func, label))
|
677
|
-
|
680
|
+
|
678
681
|
# Apply only to already-created (cached) pages to avoid forcing page creation
|
679
682
|
for i in range(len(self._pages)):
|
680
683
|
if self._pages._cache[i] is not None: # Only apply to existing pages
|
@@ -842,11 +845,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
842
845
|
"include_highlights": False,
|
843
846
|
"exclusions": "mask" if apply_exclusions else None,
|
844
847
|
}
|
845
|
-
|
848
|
+
# Use render() for clean image without highlights
|
849
|
+
img = page.render(resolution=final_resolution)
|
846
850
|
if img is None:
|
847
851
|
logger.error(f" Failed to render page {page.number} to image.")
|
848
852
|
continue
|
849
|
-
continue
|
850
853
|
images_pil.append(img)
|
851
854
|
page_image_map.append((page, img))
|
852
855
|
except Exception as e:
|
@@ -1140,7 +1143,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1140
1143
|
if page_elements:
|
1141
1144
|
all_elements.extend(page_elements.elements)
|
1142
1145
|
|
1143
|
-
from natural_pdf.elements.
|
1146
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
1144
1147
|
|
1145
1148
|
return ElementCollection(all_elements)
|
1146
1149
|
|
@@ -1229,6 +1232,62 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1229
1232
|
|
1230
1233
|
return all_tables
|
1231
1234
|
|
1235
|
+
def get_sections(
|
1236
|
+
self,
|
1237
|
+
start_elements=None,
|
1238
|
+
end_elements=None,
|
1239
|
+
new_section_on_page_break=False,
|
1240
|
+
include_boundaries="both",
|
1241
|
+
) -> "ElementCollection":
|
1242
|
+
"""
|
1243
|
+
Extract sections from the entire PDF based on start/end elements.
|
1244
|
+
|
1245
|
+
This method delegates to the PageCollection.get_sections() method,
|
1246
|
+
providing a convenient way to extract document sections across all pages.
|
1247
|
+
|
1248
|
+
Args:
|
1249
|
+
start_elements: Elements or selector string that mark the start of sections (optional)
|
1250
|
+
end_elements: Elements or selector string that mark the end of sections (optional)
|
1251
|
+
new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
|
1252
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
|
1253
|
+
|
1254
|
+
Returns:
|
1255
|
+
ElementCollection of Region objects representing the extracted sections
|
1256
|
+
|
1257
|
+
Example:
|
1258
|
+
Extract sections between headers:
|
1259
|
+
```python
|
1260
|
+
pdf = npdf.PDF("document.pdf")
|
1261
|
+
|
1262
|
+
# Get sections between headers
|
1263
|
+
sections = pdf.get_sections(
|
1264
|
+
start_elements='text[size>14]:bold',
|
1265
|
+
end_elements='text[size>14]:bold'
|
1266
|
+
)
|
1267
|
+
|
1268
|
+
# Get sections that break at page boundaries
|
1269
|
+
sections = pdf.get_sections(
|
1270
|
+
start_elements='text:contains("Chapter")',
|
1271
|
+
new_section_on_page_break=True
|
1272
|
+
)
|
1273
|
+
```
|
1274
|
+
|
1275
|
+
Note:
|
1276
|
+
You can provide only start_elements, only end_elements, or both.
|
1277
|
+
- With only start_elements: sections go from each start to the next start (or end of document)
|
1278
|
+
- With only end_elements: sections go from beginning of document to each end
|
1279
|
+
- With both: sections go from each start to the corresponding end
|
1280
|
+
"""
|
1281
|
+
if not hasattr(self, "_pages"):
|
1282
|
+
raise AttributeError("PDF pages not yet initialized.")
|
1283
|
+
|
1284
|
+
return self.pages.get_sections(
|
1285
|
+
start_elements=start_elements,
|
1286
|
+
end_elements=end_elements,
|
1287
|
+
new_section_on_page_break=new_section_on_page_break,
|
1288
|
+
include_boundaries=include_boundaries,
|
1289
|
+
)
|
1290
|
+
|
1232
1291
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
1233
1292
|
"""
|
1234
1293
|
DEPRECATED: Use save_pdf(..., ocr=True) instead.
|
@@ -1363,6 +1422,36 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1363
1422
|
# Re-raise exception from exporter
|
1364
1423
|
raise e
|
1365
1424
|
|
1425
|
+
def _get_render_specs(
|
1426
|
+
self,
|
1427
|
+
mode: Literal["show", "render"] = "show",
|
1428
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
1429
|
+
highlights: Optional[List[Dict[str, Any]]] = None,
|
1430
|
+
crop: Union[bool, Literal["content"]] = False,
|
1431
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
1432
|
+
**kwargs,
|
1433
|
+
) -> List[RenderSpec]:
|
1434
|
+
"""Get render specifications for this PDF.
|
1435
|
+
|
1436
|
+
For PDF objects, this delegates to the pages collection to handle
|
1437
|
+
multi-page rendering.
|
1438
|
+
|
1439
|
+
Args:
|
1440
|
+
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
1441
|
+
color: Color for highlighting pages in show mode
|
1442
|
+
highlights: Additional highlight groups to show
|
1443
|
+
crop: Whether to crop pages
|
1444
|
+
crop_bbox: Explicit crop bounds
|
1445
|
+
**kwargs: Additional parameters
|
1446
|
+
|
1447
|
+
Returns:
|
1448
|
+
List of RenderSpec objects, one per page
|
1449
|
+
"""
|
1450
|
+
# Delegate to pages collection
|
1451
|
+
return self.pages._get_render_specs(
|
1452
|
+
mode=mode, color=color, highlights=highlights, crop=crop, crop_bbox=crop_bbox, **kwargs
|
1453
|
+
)
|
1454
|
+
|
1366
1455
|
def ask(
|
1367
1456
|
self,
|
1368
1457
|
question: str,
|
@@ -1387,14 +1476,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1387
1476
|
Dict containing: answer, confidence, found, page_num, source_elements, etc.
|
1388
1477
|
"""
|
1389
1478
|
# Delegate to ask_batch and return the first result
|
1390
|
-
results = self.ask_batch(
|
1391
|
-
|
1392
|
-
|
1393
|
-
|
1394
|
-
|
1395
|
-
|
1396
|
-
|
1397
|
-
|
1479
|
+
results = self.ask_batch(
|
1480
|
+
[question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs
|
1481
|
+
)
|
1482
|
+
return (
|
1483
|
+
results[0]
|
1484
|
+
if results
|
1485
|
+
else {
|
1486
|
+
"answer": None,
|
1487
|
+
"confidence": 0.0,
|
1488
|
+
"found": False,
|
1489
|
+
"page_num": None,
|
1490
|
+
"source_elements": [],
|
1491
|
+
}
|
1492
|
+
)
|
1398
1493
|
|
1399
1494
|
def ask_batch(
|
1400
1495
|
self,
|
@@ -1464,7 +1559,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1464
1559
|
for _ in questions
|
1465
1560
|
]
|
1466
1561
|
|
1467
|
-
logger.info(
|
1562
|
+
logger.info(
|
1563
|
+
f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA..."
|
1564
|
+
)
|
1468
1565
|
|
1469
1566
|
# Collect all page images and metadata for batch processing
|
1470
1567
|
page_images = []
|
@@ -1474,26 +1571,26 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1474
1571
|
for page in target_pages:
|
1475
1572
|
# Get page image
|
1476
1573
|
try:
|
1477
|
-
|
1574
|
+
# Use render() for clean image without highlights
|
1575
|
+
page_image = page.render(resolution=150)
|
1478
1576
|
if page_image is None:
|
1479
1577
|
logger.warning(f"Failed to render image for page {page.number}, skipping")
|
1480
1578
|
continue
|
1481
|
-
|
1579
|
+
|
1482
1580
|
# Get text elements for word boxes
|
1483
1581
|
elements = page.find_all("text")
|
1484
1582
|
if not elements:
|
1485
1583
|
logger.warning(f"No text elements found on page {page.number}")
|
1486
1584
|
word_boxes = []
|
1487
1585
|
else:
|
1488
|
-
word_boxes = qa_engine._get_word_boxes_from_elements(
|
1489
|
-
|
1586
|
+
word_boxes = qa_engine._get_word_boxes_from_elements(
|
1587
|
+
elements, offset_x=0, offset_y=0
|
1588
|
+
)
|
1589
|
+
|
1490
1590
|
page_images.append(page_image)
|
1491
1591
|
page_word_boxes.append(word_boxes)
|
1492
|
-
page_metadata.append({
|
1493
|
-
|
1494
|
-
"page_object": page
|
1495
|
-
})
|
1496
|
-
|
1592
|
+
page_metadata.append({"page_number": page.number, "page_object": page})
|
1593
|
+
|
1497
1594
|
except Exception as e:
|
1498
1595
|
logger.warning(f"Error processing page {page.number}: {e}")
|
1499
1596
|
continue
|
@@ -1513,22 +1610,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1513
1610
|
|
1514
1611
|
# Process all questions against all pages in batch
|
1515
1612
|
all_results = []
|
1516
|
-
|
1613
|
+
|
1517
1614
|
for question_text in questions:
|
1518
1615
|
question_results = []
|
1519
|
-
|
1616
|
+
|
1520
1617
|
# Ask this question against each page (but in batch per page)
|
1521
|
-
for i, (page_image, word_boxes, page_meta) in enumerate(
|
1618
|
+
for i, (page_image, word_boxes, page_meta) in enumerate(
|
1619
|
+
zip(page_images, page_word_boxes, page_metadata)
|
1620
|
+
):
|
1522
1621
|
try:
|
1523
|
-
# Use the DocumentQA batch interface
|
1622
|
+
# Use the DocumentQA batch interface
|
1524
1623
|
page_result = qa_engine.ask(
|
1525
1624
|
image=page_image,
|
1526
1625
|
question=question_text,
|
1527
1626
|
word_boxes=word_boxes,
|
1528
1627
|
min_confidence=min_confidence,
|
1529
|
-
**kwargs
|
1628
|
+
**kwargs,
|
1530
1629
|
)
|
1531
|
-
|
1630
|
+
|
1532
1631
|
if page_result and page_result.found:
|
1533
1632
|
# Add page metadata to result
|
1534
1633
|
page_result_dict = {
|
@@ -1536,30 +1635,34 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1536
1635
|
"confidence": page_result.confidence,
|
1537
1636
|
"found": page_result.found,
|
1538
1637
|
"page_num": page_meta["page_number"],
|
1539
|
-
"source_elements": getattr(page_result,
|
1540
|
-
"start": getattr(page_result,
|
1541
|
-
"end": getattr(page_result,
|
1638
|
+
"source_elements": getattr(page_result, "source_elements", []),
|
1639
|
+
"start": getattr(page_result, "start", -1),
|
1640
|
+
"end": getattr(page_result, "end", -1),
|
1542
1641
|
}
|
1543
1642
|
question_results.append(page_result_dict)
|
1544
|
-
|
1643
|
+
|
1545
1644
|
except Exception as e:
|
1546
|
-
logger.warning(
|
1645
|
+
logger.warning(
|
1646
|
+
f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}"
|
1647
|
+
)
|
1547
1648
|
continue
|
1548
|
-
|
1649
|
+
|
1549
1650
|
# Sort results by confidence and take the best one for this question
|
1550
1651
|
question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
1551
|
-
|
1652
|
+
|
1552
1653
|
if question_results:
|
1553
1654
|
all_results.append(question_results[0])
|
1554
1655
|
else:
|
1555
1656
|
# No results found for this question
|
1556
|
-
all_results.append(
|
1557
|
-
|
1558
|
-
|
1559
|
-
|
1560
|
-
|
1561
|
-
|
1562
|
-
|
1657
|
+
all_results.append(
|
1658
|
+
{
|
1659
|
+
"answer": None,
|
1660
|
+
"confidence": 0.0,
|
1661
|
+
"found": False,
|
1662
|
+
"page_num": None,
|
1663
|
+
"source_elements": [],
|
1664
|
+
}
|
1665
|
+
)
|
1563
1666
|
|
1564
1667
|
return all_results
|
1565
1668
|
|
@@ -1703,32 +1806,28 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1703
1806
|
logger.error(f"Failed to export correction task: {e}")
|
1704
1807
|
raise
|
1705
1808
|
|
1706
|
-
def
|
1809
|
+
def update_text(
|
1707
1810
|
self,
|
1708
|
-
|
1811
|
+
transform: Callable[[Any], Optional[str]],
|
1709
1812
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
1813
|
+
selector: str = "text",
|
1710
1814
|
max_workers: Optional[int] = None,
|
1711
1815
|
progress_callback: Optional[Callable[[], None]] = None,
|
1712
1816
|
) -> "PDF":
|
1713
1817
|
"""
|
1714
|
-
Applies corrections to
|
1715
|
-
Applies corrections to OCR text elements using a callback function.
|
1818
|
+
Applies corrections to text elements using a callback function.
|
1716
1819
|
|
1717
1820
|
Args:
|
1718
|
-
correction_callback: Function that takes an element and returns corrected text or None
|
1719
1821
|
correction_callback: Function that takes an element and returns corrected text or None
|
1720
1822
|
pages: Optional page indices/slice to limit the scope of correction
|
1721
|
-
|
1722
|
-
progress_callback: Optional callback function for progress updates
|
1823
|
+
selector: Selector to apply corrections to (default: "text")
|
1723
1824
|
max_workers: Maximum number of threads to use for parallel execution
|
1724
1825
|
progress_callback: Optional callback function for progress updates
|
1725
1826
|
|
1726
1827
|
Returns:
|
1727
1828
|
Self for method chaining
|
1728
|
-
Self for method chaining
|
1729
1829
|
"""
|
1730
1830
|
target_page_indices = []
|
1731
|
-
target_page_indices = []
|
1732
1831
|
if pages is None:
|
1733
1832
|
target_page_indices = list(range(len(self._pages)))
|
1734
1833
|
elif isinstance(pages, slice):
|
@@ -1741,32 +1840,31 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1741
1840
|
raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
|
1742
1841
|
except (IndexError, TypeError, ValueError) as e:
|
1743
1842
|
raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
|
1744
|
-
raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
|
1745
1843
|
else:
|
1746
1844
|
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1747
|
-
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1748
1845
|
|
1749
1846
|
if not target_page_indices:
|
1750
|
-
logger.warning("No pages selected for
|
1847
|
+
logger.warning("No pages selected for text update.")
|
1751
1848
|
return self
|
1752
1849
|
|
1753
|
-
logger.info(
|
1754
|
-
|
1850
|
+
logger.info(
|
1851
|
+
f"Starting text update for pages: {target_page_indices} with selector='{selector}'"
|
1852
|
+
)
|
1755
1853
|
|
1756
1854
|
for page_idx in target_page_indices:
|
1757
1855
|
page = self._pages[page_idx]
|
1758
1856
|
try:
|
1759
|
-
page.
|
1760
|
-
|
1857
|
+
page.update_text(
|
1858
|
+
transform=transform,
|
1859
|
+
selector=selector,
|
1761
1860
|
max_workers=max_workers,
|
1762
1861
|
progress_callback=progress_callback,
|
1763
1862
|
)
|
1764
1863
|
except Exception as e:
|
1765
|
-
logger.error(f"Error during
|
1766
|
-
logger.error(f"Error during
|
1864
|
+
logger.error(f"Error during text update on page {page_idx}: {e}")
|
1865
|
+
logger.error(f"Error during text update on page {page_idx}: {e}")
|
1767
1866
|
|
1768
|
-
logger.info("
|
1769
|
-
logger.info("OCR correction process finished.")
|
1867
|
+
logger.info("Text update process finished.")
|
1770
1868
|
return self
|
1771
1869
|
|
1772
1870
|
def __len__(self) -> int:
|
@@ -1781,9 +1879,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1781
1879
|
raise AttributeError("PDF pages not initialized yet.")
|
1782
1880
|
|
1783
1881
|
if isinstance(key, slice):
|
1784
|
-
from natural_pdf.
|
1882
|
+
from natural_pdf.core.page_collection import PageCollection
|
1883
|
+
|
1785
1884
|
# Use the lazy page list's slicing which returns another _LazyPageList
|
1786
|
-
lazy_slice = self._pages[key]
|
1885
|
+
lazy_slice = self._pages[key]
|
1787
1886
|
# Wrap in PageCollection for compatibility
|
1788
1887
|
return PageCollection(lazy_slice)
|
1789
1888
|
elif isinstance(key, int):
|
@@ -2126,10 +2225,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
2126
2225
|
|
2127
2226
|
try:
|
2128
2227
|
for page in tqdm(self.pages, desc="Rendering Pages"):
|
2129
|
-
|
2228
|
+
# Use render() for clean images
|
2229
|
+
img = page.render(
|
2130
2230
|
resolution=resolution,
|
2131
|
-
include_highlights=include_highlights,
|
2132
|
-
labels=labels,
|
2133
2231
|
**kwargs,
|
2134
2232
|
)
|
2135
2233
|
if img:
|
@@ -2359,3 +2457,47 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
2359
2457
|
os.unlink(path)
|
2360
2458
|
except Exception as e:
|
2361
2459
|
logger.warning(f"Failed to clean up temporary file '{path}': {e}")
|
2460
|
+
|
2461
|
+
def analyze_layout(self, *args, **kwargs) -> "ElementCollection[Region]":
|
2462
|
+
"""
|
2463
|
+
Analyzes the layout of all pages in the PDF.
|
2464
|
+
|
2465
|
+
This is a convenience method that calls analyze_layout on the PDF's
|
2466
|
+
page collection.
|
2467
|
+
|
2468
|
+
Args:
|
2469
|
+
*args: Positional arguments passed to pages.analyze_layout().
|
2470
|
+
**kwargs: Keyword arguments passed to pages.analyze_layout().
|
2471
|
+
|
2472
|
+
Returns:
|
2473
|
+
An ElementCollection of all detected Region objects.
|
2474
|
+
"""
|
2475
|
+
return self.pages.analyze_layout(*args, **kwargs)
|
2476
|
+
|
2477
|
+
def highlights(self, show: bool = False) -> "HighlightContext":
|
2478
|
+
"""
|
2479
|
+
Create a highlight context for accumulating highlights.
|
2480
|
+
|
2481
|
+
This allows for clean syntax to show multiple highlight groups:
|
2482
|
+
|
2483
|
+
Example:
|
2484
|
+
with pdf.highlights() as h:
|
2485
|
+
h.add(pdf.find_all('table'), label='tables', color='blue')
|
2486
|
+
h.add(pdf.find_all('text:bold'), label='bold text', color='red')
|
2487
|
+
h.show()
|
2488
|
+
|
2489
|
+
Or with automatic display:
|
2490
|
+
with pdf.highlights(show=True) as h:
|
2491
|
+
h.add(pdf.find_all('table'), label='tables')
|
2492
|
+
h.add(pdf.find_all('text:bold'), label='bold')
|
2493
|
+
# Automatically shows when exiting the context
|
2494
|
+
|
2495
|
+
Args:
|
2496
|
+
show: If True, automatically show highlights when exiting context
|
2497
|
+
|
2498
|
+
Returns:
|
2499
|
+
HighlightContext for accumulating highlights
|
2500
|
+
"""
|
2501
|
+
from natural_pdf.core.highlighting_service import HighlightContext
|
2502
|
+
|
2503
|
+
return HighlightContext(self, show_on_exit=show)
|