natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -16,6 +16,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Type,
@@ -31,6 +32,7 @@ from natural_pdf.classification.manager import ClassificationError
 from natural_pdf.classification.mixin import ClassificationMixin
 from natural_pdf.classification.results import ClassificationResult
 from natural_pdf.core.highlighting_service import HighlightingService
+from natural_pdf.core.render_spec import RenderSpec, Visualizable
 from natural_pdf.elements.base import Element
 from natural_pdf.elements.region import Region
 from natural_pdf.export.mixin import ExportMixin
@@ -38,11 +40,11 @@ from natural_pdf.extraction.manager import StructuredDataManager
 from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
-from natural_pdf.utils.locks import pdf_render_lock
 from natural_pdf.text_mixin import TextMixin
+from natural_pdf.utils.locks import pdf_render_lock
 
 if TYPE_CHECKING:
-    from natural_pdf.elements.collections import ElementCollection
+    from natural_pdf.elements.element_collection import ElementCollection
 
 try:
     from typing import Any as TypingAny
@@ -107,7 +109,6 @@ except ImportError:
 from collections.abc import Sequence
 
 
-
 class _LazyPageList(Sequence):
     """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
 
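Migration note: 0.2.0 moves several modules, as the import hunks above (and the rename list at the top) show. Top-level imports from natural_pdf are likely unaffected, but code that reaches into these modules directly needs the new paths. A sketch:

    # natural-pdf 0.1.x (old paths, per the rename list above):
    #   from natural_pdf.elements.collections import ElementCollection, PageCollection
    #   from natural_pdf.collections.pdf_collection import PDFCollection

    # natural-pdf 0.2.0:
    from natural_pdf.elements.element_collection import ElementCollection
    from natural_pdf.core.page_collection import PageCollection
    from natural_pdf.core.pdf_collection import PDFCollection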
@@ -145,18 +146,18 @@ class _LazyPageList(Sequence):
     """
 
     def __init__(
-        self,
-        parent_pdf: "PDF",
-        plumber_pdf: "pdfplumber.PDF",
-        font_attrs=None,
+        self,
+        parent_pdf: "PDF",
+        plumber_pdf: "pdfplumber.PDF",
+        font_attrs=None,
         load_text=True,
-        indices: Optional[List[int]] = None
+        indices: Optional[List[int]] = None,
     ):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
         self._load_text = load_text
-
+
         # If indices is provided, this is a sliced view
         if indices is not None:
             self._indices = indices
@@ -184,23 +185,23 @@ class _LazyPageList(Sequence):
                     font_attrs=self._font_attrs,
                     load_text=self._load_text,
                 )
-
+
                 # Apply any stored exclusions to the newly created page
-                if hasattr(self._parent_pdf, '_exclusions'):
+                if hasattr(self._parent_pdf, "_exclusions"):
                     for exclusion_data in self._parent_pdf._exclusions:
                         exclusion_func, label = exclusion_data
                         try:
                             cached.add_exclusion(exclusion_func, label=label)
                         except Exception as e:
                             logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
-
+
                 # Apply any stored regions to the newly created page
-                if hasattr(self._parent_pdf, '_regions'):
+                if hasattr(self._parent_pdf, "_regions"):
                     for region_data in self._parent_pdf._regions:
                         region_func, name = region_data
                         try:
                             region_instance = region_func(cached)
-                            if region_instance and hasattr(region_instance, '__class__'):
+                            if region_instance and hasattr(region_instance, "__class__"):
                                 # Check if it's a Region-like object (avoid importing Region here)
                                 cached.add_region(region_instance, name=name, source="named")
                             elif region_instance is not None:
@@ -209,7 +210,7 @@ class _LazyPageList(Sequence):
                 )
             except Exception as e:
                 logger.warning(f"Failed to apply region to page {cached.number}: {e}")
-
+
             self._cache[index] = cached
             return cached
 
@@ -219,7 +220,7 @@ class _LazyPageList(Sequence):
 
     def __getitem__(self, key):
         if isinstance(key, slice):
-            # Get the slice of our current indices
+            # Get the slice of our current indices
             slice_indices = range(*key.indices(len(self)))
             # Extract the actual page indices for this slice
             actual_indices = [self._indices[i] for i in slice_indices]
@@ -229,7 +230,7 @@ class _LazyPageList(Sequence):
                 self._plumber_pdf,
                 font_attrs=self._font_attrs,
                 load_text=self._load_text,
-                indices=actual_indices
+                indices=actual_indices,
             )
         elif isinstance(key, int):
             if key < 0:
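The __getitem__ hunks above preserve _LazyPageList's contract: slicing yields another lazy view over the selected indices, so pages are only parsed when accessed. Illustrative usage (file name made up):

    import natural_pdf as npdf

    pdf = npdf.PDF("document.pdf")

    # Slicing returns another lazy view; no Page objects are built yet.
    first_ten = pdf.pages[0:10]

    # A Page is instantiated (and cached) only when it is actually accessed.
    page = first_ten[0]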
@@ -251,7 +252,7 @@ class _LazyPageList(Sequence):
 # --- End Lazy Page List Helper --- #
 
 
-class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
+class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
     """Enhanced PDF wrapper built on top of pdfplumber.
 
     This class provides a fluent interface for working with PDF documents,
@@ -580,7 +581,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             print(f"Page {page.index} has {len(page.chars)} characters")
             ```
         """
-        from natural_pdf.elements.collections import PageCollection
+        from natural_pdf.core.page_collection import PageCollection
 
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
@@ -612,7 +613,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not yet initialized.")
 
         self._exclusions = []
-
+
         # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
         for i in range(len(self._pages)):
             if self._pages._cache[i] is not None:  # Only clear from existing pages
@@ -622,9 +623,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
                     logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
         return self
 
-    def add_exclusion(
-        self, exclusion_func, label: str = None
-    ) -> "PDF":
+    def add_exclusion(self, exclusion_func, label: str = None) -> "PDF":
         """Add an exclusion function to the PDF.
 
         Exclusion functions define regions of each page that should be ignored during
@@ -673,12 +672,12 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
         # Support selector strings and ElementCollection objects directly.
         # Store exclusion and apply only to already-created pages.
         # ------------------------------------------------------------------
-        from natural_pdf.elements.collections import ElementCollection  # local import
+        from natural_pdf.elements.element_collection import ElementCollection  # local import
 
         if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
             # Store for bookkeeping and lazy application
             self._exclusions.append((exclusion_func, label))
-
+
             # Apply only to already-created (cached) pages to avoid forcing page creation
             for i in range(len(self._pages)):
                 if self._pages._cache[i] is not None:  # Only apply to existing pages
@@ -846,11 +845,11 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
                     "include_highlights": False,
                     "exclusions": "mask" if apply_exclusions else None,
                 }
-                …
+                # Use render() for clean image without highlights
+                img = page.render(resolution=final_resolution)
                 if img is None:
                     logger.error(f"  Failed to render page {page.number} to image.")
                     continue
-                    continue
                 images_pil.append(img)
                 page_image_map.append((page, img))
             except Exception as e:
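This hunk is one instance of a pattern repeated throughout 0.2.0: page.to_image(..., include_highlights=False) becomes page.render(), which is always highlight-free. A before/after sketch (resolution value illustrative):

    # 0.1.x: a clean page image required opting out of highlights.
    # img = page.to_image(resolution=150, include_highlights=False)

    # 0.2.0: render() is always clean; per the mode docstring later in
    # this diff, "show" is the highlight-aware path.
    img = page.render(resolution=150)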
@@ -1144,7 +1143,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
         if page_elements:
             all_elements.extend(page_elements.elements)
 
-        from natural_pdf.elements.collections import ElementCollection
+        from natural_pdf.elements.element_collection import ElementCollection
 
         return ElementCollection(all_elements)
 
@@ -1238,7 +1237,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
         start_elements=None,
         end_elements=None,
         new_section_on_page_break=False,
-        boundary_inclusion="both",
+        include_boundaries="both",
     ) -> "ElementCollection":
         """
         Extract sections from the entire PDF based on start/end elements.
@@ -1250,7 +1249,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             start_elements: Elements or selector string that mark the start of sections (optional)
             end_elements: Elements or selector string that mark the end of sections (optional)
             new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
-            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
+            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
 
         Returns:
             ElementCollection of Region objects representing the extracted sections
@@ -1259,13 +1258,13 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             Extract sections between headers:
             ```python
             pdf = npdf.PDF("document.pdf")
-
+
             # Get sections between headers
             sections = pdf.get_sections(
                 start_elements='text[size>14]:bold',
                 end_elements='text[size>14]:bold'
             )
-
+
             # Get sections that break at page boundaries
             sections = pdf.get_sections(
                 start_elements='text:contains("Chapter")',
@@ -1286,7 +1285,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             start_elements=start_elements,
             end_elements=end_elements,
             new_section_on_page_break=new_section_on_page_break,
-            boundary_inclusion=boundary_inclusion,
+            include_boundaries=include_boundaries,
         )
 
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
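The get_sections hunks rename a keyword argument. The removed name is truncated in this diff view; judging from the 0.1.x API it appears to have been boundary_inclusion. Callers would update along these lines (selector is illustrative):

    sections = pdf.get_sections(
        start_elements='text[size>14]:bold',
        end_elements='text[size>14]:bold',
        include_boundaries='start',  # 0.1.x spelled this boundary_inclusion=
    )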
@@ -1423,6 +1422,36 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             # Re-raise exception from exporter
             raise e
 
+    def _get_render_specs(
+        self,
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        **kwargs,
+    ) -> List[RenderSpec]:
+        """Get render specifications for this PDF.
+
+        For PDF objects, this delegates to the pages collection to handle
+        multi-page rendering.
+
+        Args:
+            mode: Rendering mode - 'show' includes highlights, 'render' is clean
+            color: Color for highlighting pages in show mode
+            highlights: Additional highlight groups to show
+            crop: Whether to crop pages
+            crop_bbox: Explicit crop bounds
+
+            **kwargs: Additional parameters
+
+        Returns:
+            List of RenderSpec objects, one per page
+        """
+        # Delegate to pages collection
+        return self.pages._get_render_specs(
+            mode=mode, color=color, highlights=highlights, crop=crop, crop_bbox=crop_bbox, **kwargs
+        )
+
     def ask(
         self,
         question: str,
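_get_render_specs is the hook that the new Visualizable base class (see the class-statement hunk above) presumably calls to drive show() and render(); the wrapper methods themselves are not visible in this diff. Assuming that design, document-level rendering would look like:

    import natural_pdf as npdf

    pdf = npdf.PDF("document.pdf")

    # Clean, highlight-free rendering (the exact return shape depends on the
    # Visualizable implementation in core/render_spec.py, not shown here).
    images = pdf.render(resolution=150)

    # Highlight-aware preview; both calls funnel into
    # pdf.pages._get_render_specs() via the delegation above.
    pdf.show()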
@@ -1447,14 +1476,20 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             Dict containing: answer, confidence, found, page_num, source_elements, etc.
         """
         # Delegate to ask_batch and return the first result
-        results = self.ask_batch(
-            …
+        results = self.ask_batch(
+            [question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs
+        )
+        return (
+            results[0]
+            if results
+            else {
+                "answer": None,
+                "confidence": 0.0,
+                "found": False,
+                "page_num": None,
+                "source_elements": [],
+            }
+        )
 
     def ask_batch(
         self,
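ask() is now a thin wrapper over ask_batch() with a well-defined empty result. Typical use, per the docstring above (question text and threshold are illustrative):

    result = pdf.ask("What is the invoice total?", min_confidence=0.8)
    if result["found"]:
        print(result["answer"], "on page", result["page_num"])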
@@ -1524,7 +1559,9 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             for _ in questions
         ]
 
-        logger.info(f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA...")
+        logger.info(
+            f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA..."
+        )
 
         # Collect all page images and metadata for batch processing
         page_images = []
@@ -1534,26 +1571,26 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
         for page in target_pages:
             # Get page image
             try:
-                …
+                # Use render() for clean image without highlights
+                page_image = page.render(resolution=150)
                 if page_image is None:
                     logger.warning(f"Failed to render image for page {page.number}, skipping")
                     continue
-
+
                 # Get text elements for word boxes
                 elements = page.find_all("text")
                 if not elements:
                     logger.warning(f"No text elements found on page {page.number}")
                     word_boxes = []
                 else:
-                    word_boxes = qa_engine._get_word_boxes_from_elements(
-                        elements, offset_x=0, offset_y=0)
+                    word_boxes = qa_engine._get_word_boxes_from_elements(
+                        elements, offset_x=0, offset_y=0
+                    )
 
                 page_images.append(page_image)
                 page_word_boxes.append(word_boxes)
-                page_metadata.append({
-                    "page_number": page.number,
-                    "page_object": page
-                })
-
+                page_metadata.append({"page_number": page.number, "page_object": page})
+
             except Exception as e:
                 logger.warning(f"Error processing page {page.number}: {e}")
                 continue
@@ -1573,22 +1610,24 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
 
         # Process all questions against all pages in batch
         all_results = []
-
+
         for question_text in questions:
             question_results = []
-
+
             # Ask this question against each page (but in batch per page)
-            for i, (page_image, word_boxes, page_meta) in enumerate(zip(page_images, page_word_boxes, page_metadata)):
+            for i, (page_image, word_boxes, page_meta) in enumerate(
+                zip(page_images, page_word_boxes, page_metadata)
+            ):
                 try:
-                    # Use the DocumentQA batch interface
+                    # Use the DocumentQA batch interface
                     page_result = qa_engine.ask(
                         image=page_image,
                         question=question_text,
                         word_boxes=word_boxes,
                         min_confidence=min_confidence,
-                        **kwargs
+                        **kwargs,
                     )
-
+
                     if page_result and page_result.found:
                         # Add page metadata to result
                         page_result_dict = {
@@ -1596,30 +1635,34 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
                             "confidence": page_result.confidence,
                             "found": page_result.found,
                             "page_num": page_meta["page_number"],
-                            "source_elements": getattr(page_result, 'source_elements', []),
-                            "start": getattr(page_result, 'start', -1),
-                            "end": getattr(page_result, 'end', -1),
+                            "source_elements": getattr(page_result, "source_elements", []),
+                            "start": getattr(page_result, "start", -1),
+                            "end": getattr(page_result, "end", -1),
                         }
                         question_results.append(page_result_dict)
-
+
                 except Exception as e:
-                    logger.warning(f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}")
+                    logger.warning(
+                        f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}"
+                    )
                     continue
-
+
             # Sort results by confidence and take the best one for this question
             question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
-
+
             if question_results:
                 all_results.append(question_results[0])
             else:
                 # No results found for this question
-                all_results.append(
-                    …
+                all_results.append(
+                    {
+                        "answer": None,
+                        "confidence": 0.0,
+                        "found": False,
+                        "page_num": None,
+                        "source_elements": [],
+                    }
+                )
 
         return all_results
 
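ask_batch() renders each target page, collects word boxes, runs every question against every page, and keeps the highest-confidence hit per question. A usage sketch (questions invented):

    answers = pdf.ask_batch(
        ["Who issued the permit?", "When does it expire?"],
        min_confidence=0.5,
    )
    for ans in answers:
        print(ans["found"], ans["confidence"], ans["answer"])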
@@ -1804,17 +1847,19 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             logger.warning("No pages selected for text update.")
             return self
 
-        logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
+        logger.info(
+            f"Starting text update for pages: {target_page_indices} with selector='{selector}'"
+        )
 
         for page_idx in target_page_indices:
             page = self._pages[page_idx]
             try:
-                …
+                page.update_text(
+                    transform=transform,
+                    selector=selector,
+                    max_workers=max_workers,
+                    progress_callback=progress_callback,
+                )
             except Exception as e:
                 logger.error(f"Error during text update on page {page_idx}: {e}")
                 logger.error(f"Error during text update on page {page_idx}: {e}")
@@ -1834,9 +1879,10 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not initialized yet.")
 
         if isinstance(key, slice):
-            from natural_pdf.elements.collections import PageCollection
+            from natural_pdf.core.page_collection import PageCollection
+
             # Use the lazy page list's slicing which returns another _LazyPageList
-            lazy_slice = self._pages[key]
+            lazy_slice = self._pages[key]
             # Wrap in PageCollection for compatibility
             return PageCollection(lazy_slice)
         elif isinstance(key, int):
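The update_text loop passes a transform callback through to each page. A hedged sketch; the element-in, text-out contract is inferred from the per-page call above, not confirmed by this diff:

    # Hypothetical OCR clean-up pass.
    def fix_zeros(el):
        return el.text.replace("0O", "00") if el.text else el.text

    pdf.update_text(transform=fix_zeros, selector="text", max_workers=4)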
@@ -2179,10 +2225,9 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
 
         try:
             for page in tqdm(self.pages, desc="Rendering Pages"):
-                img = page.to_image(
+                # Use render() for clean images
+                img = page.render(
                     resolution=resolution,
-                    include_highlights=include_highlights,
-                    labels=labels,
                     **kwargs,
                 )
                 if img:
@@ -2412,3 +2457,47 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
                 os.unlink(path)
         except Exception as e:
             logger.warning(f"Failed to clean up temporary file '{path}': {e}")
+
+    def analyze_layout(self, *args, **kwargs) -> "ElementCollection[Region]":
+        """
+        Analyzes the layout of all pages in the PDF.
+
+        This is a convenience method that calls analyze_layout on the PDF's
+        page collection.
+
+        Args:
+            *args: Positional arguments passed to pages.analyze_layout().
+            **kwargs: Keyword arguments passed to pages.analyze_layout().
+
+        Returns:
+            An ElementCollection of all detected Region objects.
+        """
+        return self.pages.analyze_layout(*args, **kwargs)
+
+    def highlights(self, show: bool = False) -> "HighlightContext":
+        """
+        Create a highlight context for accumulating highlights.
+
+        This allows for clean syntax to show multiple highlight groups:
+
+        Example:
+            with pdf.highlights() as h:
+                h.add(pdf.find_all('table'), label='tables', color='blue')
+                h.add(pdf.find_all('text:bold'), label='bold text', color='red')
+                h.show()
+
+        Or with automatic display:
+            with pdf.highlights(show=True) as h:
+                h.add(pdf.find_all('table'), label='tables')
+                h.add(pdf.find_all('text:bold'), label='bold')
+                # Automatically shows when exiting the context
+
+        Args:
+            show: If True, automatically show highlights when exiting context
+
+        Returns:
+            HighlightContext for accumulating highlights
+        """
+        from natural_pdf.core.highlighting_service import HighlightContext
+
+        return HighlightContext(self, show_on_exit=show)
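The two new convenience methods compose naturally (selectors and labels here are illustrative):

    regions = pdf.analyze_layout()  # wrapper over pdf.pages.analyze_layout()

    with pdf.highlights(show=True) as h:
        h.add(regions, label="layout regions")
        h.add(pdf.find_all("text:bold"), label="bold text")
    # Highlights are displayed automatically when the context exits.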
natural_pdf/{collections → core}/pdf_collection.py
CHANGED
@@ -588,24 +588,25 @@ class PDFCollection(
         # Get classification manager from first PDF
         try:
             first_pdf = self._pdfs[0]
-            if not hasattr(first_pdf, 'get_manager'):
+            if not hasattr(first_pdf, "get_manager"):
                 raise RuntimeError("PDFs do not support classification manager")
-            manager = first_pdf.get_manager('classification')
+            manager = first_pdf.get_manager("classification")
             if not manager or not manager.is_available():
                 raise RuntimeError("ClassificationManager is not available")
         except Exception as e:
             from natural_pdf.classification.manager import ClassificationError
+
             raise ClassificationError(f"Cannot access ClassificationManager: {e}") from e
 
         # Determine processing mode early
         inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
-
+
         # Gather content from all PDFs
         pdf_contents = []
         valid_pdfs = []
-
+
         logger.info(f"Gathering content from {len(self._pdfs)} PDFs for batch classification...")
-
+
         for pdf in self._pdfs:
             try:
                 # Get the content for classification - use the same logic as individual PDF classify
@@ -618,16 +619,18 @@ class PDFCollection(
                 elif inferred_using == "vision":
                     # For vision, we need single-page PDFs only
                     if len(pdf.pages) != 1:
-                        logger.warning(f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs")
+                        logger.warning(
+                            f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs"
+                        )
                         continue
                     # Get first page image
-                    content = pdf.pages[0].to_image()
+                    content = pdf.pages[0].render()
                 else:
                     raise ValueError(f"Unsupported using mode: {inferred_using}")
-
+
                 pdf_contents.append(content)
                 valid_pdfs.append(pdf)
-
+
             except Exception as e:
                 logger.warning(f"Skipping PDF {pdf.path}: Error getting content - {e}")
                 continue
@@ -636,7 +639,9 @@ class PDFCollection(
             logger.warning("No valid content could be gathered from PDFs for classification.")
             return self
 
-        logger.info(f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification...")
+        logger.info(
+            f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification..."
+        )
 
         # Run batch classification
         try:
@@ -651,6 +656,7 @@ class PDFCollection(
         except Exception as e:
             logger.error(f"Batch classification failed: {e}")
             from natural_pdf.classification.manager import ClassificationError
+
             raise ClassificationError(f"Batch classification failed: {e}") from e
 
         # Assign results back to PDFs
@@ -660,10 +666,11 @@ class PDFCollection(
                 f"with PDFs processed ({len(valid_pdfs)}). Cannot assign results."
             )
             from natural_pdf.classification.manager import ClassificationError
+
             raise ClassificationError("Batch result count mismatch with input PDFs")
 
         logger.info(f"Assigning {len(batch_results)} results to PDFs under key '{analysis_key}'.")
-
+
         processed_count = 0
         for pdf, result_obj in zip(valid_pdfs, batch_results):
             try:
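These hunks modernize PDFCollection's batch-classification path (black-style reformatting plus the to_image() to render() switch). A heavily hedged usage sketch; the entry-point name classify_all and the constructor call are guesses, since the method signature sits outside the visible hunks:

    import natural_pdf as npdf

    collection = npdf.PDFCollection(["a.pdf", "b.pdf"])

    # Hypothetical entry point for the batch-classification code above.
    # "vision" mode requires single-page PDFs, per the hunk above.
    collection.classify_all(labels=["invoice", "contract", "memo"], using="text")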