natural-pdf 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +88 -22
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +48 -46
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +169 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/top_level.txt +0 -0
@@ -207,22 +207,82 @@ class ElementCollection(
|
|
207
207
|
if not self._elements:
|
208
208
|
return []
|
209
209
|
|
210
|
-
#
|
211
|
-
|
210
|
+
# Check for FlowRegions which need special handling
|
211
|
+
from natural_pdf.flows.region import FlowRegion
|
212
|
+
|
213
|
+
flow_regions = []
|
214
|
+
regular_elements = []
|
215
|
+
|
212
216
|
for elem in self._elements:
|
217
|
+
if isinstance(elem, FlowRegion):
|
218
|
+
flow_regions.append(elem)
|
219
|
+
else:
|
220
|
+
regular_elements.append(elem)
|
221
|
+
|
222
|
+
# Start with specs from FlowRegions (they handle their own multi-page rendering)
|
223
|
+
all_specs = []
|
224
|
+
specs_by_page = {} # Track specs by page for merging
|
225
|
+
|
226
|
+
for flow_region in flow_regions:
|
227
|
+
# FlowRegions have their own _get_render_specs method
|
228
|
+
flow_specs = flow_region._get_render_specs(
|
229
|
+
mode=mode,
|
230
|
+
color=color,
|
231
|
+
highlights=highlights,
|
232
|
+
crop=crop,
|
233
|
+
crop_bbox=crop_bbox,
|
234
|
+
**kwargs,
|
235
|
+
)
|
236
|
+
for spec in flow_specs:
|
237
|
+
# Check if we already have a spec for this page
|
238
|
+
if spec.page in specs_by_page:
|
239
|
+
# Merge highlights into existing spec
|
240
|
+
existing_spec = specs_by_page[spec.page]
|
241
|
+
# Add all highlights from this spec to the existing one
|
242
|
+
existing_spec.highlights.extend(spec.highlights)
|
243
|
+
# Merge crop bbox if needed
|
244
|
+
if spec.crop_bbox and not existing_spec.crop_bbox:
|
245
|
+
existing_spec.crop_bbox = spec.crop_bbox
|
246
|
+
elif spec.crop_bbox and existing_spec.crop_bbox:
|
247
|
+
# Expand crop bbox to include both
|
248
|
+
x0 = min(spec.crop_bbox[0], existing_spec.crop_bbox[0])
|
249
|
+
y0 = min(spec.crop_bbox[1], existing_spec.crop_bbox[1])
|
250
|
+
x1 = max(spec.crop_bbox[2], existing_spec.crop_bbox[2])
|
251
|
+
y1 = max(spec.crop_bbox[3], existing_spec.crop_bbox[3])
|
252
|
+
existing_spec.crop_bbox = (x0, y0, x1, y1)
|
253
|
+
else:
|
254
|
+
# First spec for this page
|
255
|
+
all_specs.append(spec)
|
256
|
+
specs_by_page[spec.page] = spec
|
257
|
+
|
258
|
+
# Group regular elements by page
|
259
|
+
elements_by_page = {}
|
260
|
+
for elem in regular_elements:
|
213
261
|
if hasattr(elem, "page"):
|
214
262
|
page = elem.page
|
215
263
|
if page not in elements_by_page:
|
216
264
|
elements_by_page[page] = []
|
217
265
|
elements_by_page[page].append(elem)
|
218
266
|
|
219
|
-
if not elements_by_page:
|
267
|
+
if not elements_by_page and not flow_regions:
|
220
268
|
return []
|
221
269
|
|
222
|
-
# Create RenderSpec for each page
|
223
|
-
specs = []
|
270
|
+
# Create or update RenderSpec for each page with regular elements
|
224
271
|
for page, page_elements in elements_by_page.items():
|
225
|
-
spec
|
272
|
+
# Check if we already have a spec for this page from FlowRegions
|
273
|
+
existing_spec = None
|
274
|
+
for spec in all_specs:
|
275
|
+
if spec.page == page:
|
276
|
+
existing_spec = spec
|
277
|
+
break
|
278
|
+
|
279
|
+
if existing_spec:
|
280
|
+
# We'll add to the existing spec
|
281
|
+
spec = existing_spec
|
282
|
+
else:
|
283
|
+
# Create new spec for this page
|
284
|
+
spec = RenderSpec(page=page)
|
285
|
+
all_specs.append(spec)
|
226
286
|
|
227
287
|
# Handle cropping
|
228
288
|
if crop_bbox:
|
@@ -390,9 +450,7 @@ class ElementCollection(
|
|
390
450
|
element=elem, color=group_color, label=group_label
|
391
451
|
)
|
392
452
|
|
393
|
-
|
394
|
-
|
395
|
-
return specs
|
453
|
+
return all_specs
|
396
454
|
|
397
455
|
def _get_highlighter(self):
|
398
456
|
"""Get the highlighting service for rendering.
|
@@ -889,7 +947,22 @@ class ElementCollection(
|
|
889
947
|
return self
|
890
948
|
|
891
949
|
def exclude(self):
|
892
|
-
|
950
|
+
"""
|
951
|
+
Excludes all elements in the collection from their respective pages.
|
952
|
+
|
953
|
+
Since a collection can span multiple pages, this method iterates through
|
954
|
+
all elements and calls exclude() on each one individually.
|
955
|
+
|
956
|
+
Each element type is handled appropriately:
|
957
|
+
- Region elements exclude everything within their bounds
|
958
|
+
- Text/other elements exclude only the specific element, not the area
|
959
|
+
|
960
|
+
Returns:
|
961
|
+
Self for method chaining
|
962
|
+
"""
|
963
|
+
for element in self._elements:
|
964
|
+
element.exclude()
|
965
|
+
return self
|
893
966
|
|
894
967
|
def highlight(
|
895
968
|
self,
|