natural-pdf 0.1.24__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/page.py +66 -7
- natural_pdf/describe/summary.py +2 -2
- natural_pdf/elements/line.py +9 -4
- natural_pdf/elements/region.py +48 -12
- natural_pdf/elements/text.py +50 -1
- natural_pdf/qa/document_qa.py +62 -8
- natural_pdf/utils/packaging.py +23 -9
- {natural_pdf-0.1.24.dist-info → natural_pdf-0.1.27.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.24.dist-info → natural_pdf-0.1.27.dist-info}/RECORD +13 -13
- {natural_pdf-0.1.24.dist-info → natural_pdf-0.1.27.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.24.dist-info → natural_pdf-0.1.27.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.24.dist-info → natural_pdf-0.1.27.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.24.dist-info → natural_pdf-0.1.27.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1576,8 +1576,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1576
1576
|
render_ocr: Whether to render OCR text on highlights.
|
1577
1577
|
resolution: Resolution in DPI for base page image (default: scale * 72).
|
1578
1578
|
include_highlights: Whether to render highlights.
|
1579
|
-
exclusions:
|
1580
|
-
(default
|
1579
|
+
exclusions: Accepts one of the following:
|
1580
|
+
• None – no masking (default)
|
1581
|
+
• "mask" – mask using solid white (back-compat)
|
1582
|
+
• CSS/HTML colour string (e.g. "red", "#ff0000", "#ff000080")
|
1583
|
+
• Tuple of RGB or RGBA values (ints 0-255 or floats 0-1)
|
1584
|
+
All excluded regions are filled with this colour.
|
1581
1585
|
**kwargs: Additional parameters for pdfplumber.to_image.
|
1582
1586
|
|
1583
1587
|
Returns:
|
@@ -1690,7 +1694,52 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1690
1694
|
# --- Apply exclusion masking if requested ---
|
1691
1695
|
# This modifies 'rendered_image_component'
|
1692
1696
|
image_after_masking = rendered_image_component # Start with the rendered image
|
1693
|
-
|
1697
|
+
|
1698
|
+
# Determine if masking is requested and establish the fill colour
|
1699
|
+
mask_requested = exclusions is not None and self._exclusions
|
1700
|
+
mask_color: Union[str, Tuple[int, int, int, int]] = "white" # default
|
1701
|
+
|
1702
|
+
if mask_requested:
|
1703
|
+
if exclusions != "mask":
|
1704
|
+
# Attempt to parse custom colour input
|
1705
|
+
try:
|
1706
|
+
if isinstance(exclusions, tuple):
|
1707
|
+
# Handle RGB/RGBA tuples with ints 0-255 or floats 0-1
|
1708
|
+
processed = []
|
1709
|
+
all_float = all(isinstance(c, float) for c in exclusions)
|
1710
|
+
for i, c in enumerate(exclusions):
|
1711
|
+
if isinstance(c, float):
|
1712
|
+
val = int(c * 255) if all_float or i == 3 else int(c)
|
1713
|
+
else:
|
1714
|
+
val = int(c)
|
1715
|
+
processed.append(max(0, min(255, val)))
|
1716
|
+
if len(processed) == 3:
|
1717
|
+
processed.append(255) # add full alpha
|
1718
|
+
mask_color = tuple(processed) # type: ignore[assignment]
|
1719
|
+
elif isinstance(exclusions, str):
|
1720
|
+
# Try using the optional 'colour' library for rich parsing
|
1721
|
+
try:
|
1722
|
+
from colour import Color # type: ignore
|
1723
|
+
|
1724
|
+
color_obj = Color(exclusions)
|
1725
|
+
mask_color = (
|
1726
|
+
int(color_obj.red * 255),
|
1727
|
+
int(color_obj.green * 255),
|
1728
|
+
int(color_obj.blue * 255),
|
1729
|
+
255,
|
1730
|
+
)
|
1731
|
+
except Exception:
|
1732
|
+
# Fallback: if parsing fails, treat as plain string accepted by PIL
|
1733
|
+
mask_color = exclusions # e.g. "red"
|
1734
|
+
else:
|
1735
|
+
logger.warning(
|
1736
|
+
f"Unsupported exclusions colour spec: {exclusions!r}. Using white."
|
1737
|
+
)
|
1738
|
+
except Exception as colour_parse_err: # pragma: no cover
|
1739
|
+
logger.warning(
|
1740
|
+
f"Failed to parse exclusions colour {exclusions!r}: {colour_parse_err}. Using white."
|
1741
|
+
)
|
1742
|
+
|
1694
1743
|
try:
|
1695
1744
|
# Ensure image is mutable (RGB or RGBA)
|
1696
1745
|
if image_after_masking.mode not in ("RGB", "RGBA"):
|
@@ -1701,17 +1750,23 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1701
1750
|
)
|
1702
1751
|
if exclusion_regions:
|
1703
1752
|
draw = ImageDraw.Draw(image_after_masking)
|
1704
|
-
#
|
1753
|
+
# Scaling factor for converting PDF pts → image px
|
1705
1754
|
img_scale = render_resolution / 72.0
|
1706
1755
|
|
1756
|
+
# Determine fill colour compatible with current mode
|
1757
|
+
def _mode_compatible(colour):
|
1758
|
+
if isinstance(colour, tuple) and image_after_masking.mode != "RGBA":
|
1759
|
+
return colour[:3] # drop alpha for RGB images
|
1760
|
+
return colour
|
1761
|
+
|
1762
|
+
fill_colour = _mode_compatible(mask_color)
|
1763
|
+
|
1707
1764
|
for region in exclusion_regions:
|
1708
|
-
# Convert PDF points (x0, top, x1, bottom) to image pixels
|
1709
1765
|
img_x0 = region.x0 * img_scale
|
1710
1766
|
img_top = region.top * img_scale
|
1711
1767
|
img_x1 = region.x1 * img_scale
|
1712
1768
|
img_bottom = region.bottom * img_scale
|
1713
1769
|
|
1714
|
-
# Draw a white rectangle over the excluded area
|
1715
1770
|
img_coords = (
|
1716
1771
|
max(0, img_x0),
|
1717
1772
|
max(0, img_top),
|
@@ -1719,7 +1774,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1719
1774
|
min(image_after_masking.height, img_bottom),
|
1720
1775
|
)
|
1721
1776
|
if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
|
1722
|
-
draw.rectangle(img_coords, fill=
|
1777
|
+
draw.rectangle(img_coords, fill=fill_colour)
|
1723
1778
|
else: # pragma: no cover
|
1724
1779
|
logger.warning(
|
1725
1780
|
f"Skipping invalid exclusion rect for masking: {img_coords}"
|
@@ -1994,6 +2049,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1994
2049
|
)
|
1995
2050
|
return ElementCollection([]) # Return empty collection
|
1996
2051
|
|
2052
|
+
# Clear existing detected regions if 'replace' is specified
|
2053
|
+
if existing == "replace":
|
2054
|
+
self.clear_detected_layout_regions()
|
2055
|
+
|
1997
2056
|
# The analyzer's analyze_layout method already adds regions to the page
|
1998
2057
|
# and its element manager. We just need to retrieve them.
|
1999
2058
|
analyzer.analyze_layout(
|
natural_pdf/describe/summary.py
CHANGED
@@ -56,8 +56,8 @@ class ElementSummary:
|
|
56
56
|
section_title = name.replace('_', ' ').title()
|
57
57
|
|
58
58
|
if isinstance(data, dict):
|
59
|
-
lines = [f"**{section_title}**:"]
|
60
|
-
lines.extend(self._format_dict(data, indent="
|
59
|
+
lines = [f"**{section_title}**:", ""]
|
60
|
+
lines.extend(self._format_dict(data, indent=""))
|
61
61
|
elif isinstance(data, list):
|
62
62
|
lines = [f"**{section_title}**: {', '.join(str(item) for item in data)}"]
|
63
63
|
else:
|
natural_pdf/elements/line.py
CHANGED
@@ -94,6 +94,14 @@ class LineElement(Element):
|
|
94
94
|
# Vertical if x-change is within tolerance and y-change is significant
|
95
95
|
return dx <= tolerance and dy > tolerance
|
96
96
|
|
97
|
+
@property
|
98
|
+
def orientation(self) -> str:
|
99
|
+
"""Get the orientation of the line ('horizontal', 'vertical', or 'diagonal')."""
|
100
|
+
if self.is_horizontal:
|
101
|
+
return "horizontal"
|
102
|
+
elif self.is_vertical:
|
103
|
+
return "vertical"
|
104
|
+
|
97
105
|
def text_above(self, distance: float = 5, **kwargs) -> Any:
|
98
106
|
"""
|
99
107
|
Get text elements above this line.
|
@@ -142,7 +150,4 @@ class LineElement(Element):
|
|
142
150
|
|
143
151
|
def __repr__(self) -> str:
|
144
152
|
"""String representation of the line element."""
|
145
|
-
|
146
|
-
"horizontal" if self.is_horizontal else "vertical" if self.is_vertical else "diagonal"
|
147
|
-
)
|
148
|
-
return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"
|
153
|
+
return f"<LineElement type={self.orientation} width={self.width:.1f} bbox={self.bbox}>"
|
natural_pdf/elements/region.py
CHANGED
@@ -1924,18 +1924,54 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1924
1924
|
f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
|
1925
1925
|
)
|
1926
1926
|
|
1927
|
-
#
|
1928
|
-
|
1929
|
-
|
1930
|
-
|
1931
|
-
|
1932
|
-
|
1933
|
-
|
1934
|
-
|
1935
|
-
|
1936
|
-
|
1937
|
-
|
1938
|
-
)
|
1927
|
+
# --- Robust removal: iterate through all OCR elements on the page and
|
1928
|
+
# remove those that overlap this region. This avoids reliance on
|
1929
|
+
# identity‐based look-ups that can break if the ElementManager
|
1930
|
+
# rebuilt its internal lists.
|
1931
|
+
|
1932
|
+
removed_count = 0
|
1933
|
+
|
1934
|
+
# Helper to remove a single element safely
|
1935
|
+
def _safe_remove(elem):
|
1936
|
+
nonlocal removed_count
|
1937
|
+
success = False
|
1938
|
+
if hasattr(elem, "page") and hasattr(elem.page, "_element_mgr"):
|
1939
|
+
etype = getattr(elem, "object_type", "word")
|
1940
|
+
if etype == "word":
|
1941
|
+
etype_key = "words"
|
1942
|
+
elif etype == "char":
|
1943
|
+
etype_key = "chars"
|
1944
|
+
else:
|
1945
|
+
etype_key = etype + "s" if not etype.endswith("s") else etype
|
1946
|
+
try:
|
1947
|
+
success = elem.page._element_mgr.remove_element(elem, etype_key)
|
1948
|
+
except Exception:
|
1949
|
+
success = False
|
1950
|
+
if success:
|
1951
|
+
removed_count += 1
|
1952
|
+
|
1953
|
+
# Remove OCR WORD elements overlapping region
|
1954
|
+
for word in list(self.page._element_mgr.words):
|
1955
|
+
if getattr(word, "source", None) == "ocr" and self.intersects(word):
|
1956
|
+
_safe_remove(word)
|
1957
|
+
|
1958
|
+
# Remove OCR CHAR dicts overlapping region
|
1959
|
+
for char in list(self.page._element_mgr.chars):
|
1960
|
+
# char can be dict or TextElement; normalise
|
1961
|
+
char_src = char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
|
1962
|
+
if char_src == "ocr":
|
1963
|
+
# Rough bbox for dicts
|
1964
|
+
if isinstance(char, dict):
|
1965
|
+
cx0, ctop, cx1, cbottom = char.get("x0", 0), char.get("top", 0), char.get("x1", 0), char.get("bottom", 0)
|
1966
|
+
else:
|
1967
|
+
cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
|
1968
|
+
# Quick overlap check
|
1969
|
+
if not (cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom):
|
1970
|
+
_safe_remove(char)
|
1971
|
+
|
1972
|
+
logger.info(
|
1973
|
+
f"Region {self.bbox}: Removed {removed_count} existing OCR elements (words & chars) before re-applying OCR."
|
1974
|
+
)
|
1939
1975
|
|
1940
1976
|
ocr_mgr = self.page._parent._ocr_manager
|
1941
1977
|
|
natural_pdf/elements/text.py
CHANGED
@@ -43,9 +43,58 @@ class TextElement(Element):
|
|
43
43
|
|
44
44
|
@text.setter
|
45
45
|
def text(self, value: str):
|
46
|
-
"""Set the text content."""
|
46
|
+
"""Set the text content and synchronise underlying char dictionaries (if any)."""
|
47
|
+
# Update the primary text value stored on the object itself
|
47
48
|
self._obj["text"] = value
|
48
49
|
|
50
|
+
# --- Keep _char_dicts in sync so downstream utilities (e.g. extract_text)
|
51
|
+
# that rely on the raw character dictionaries see the corrected text.
|
52
|
+
# For OCR-generated words we usually have a single representative char
|
53
|
+
# dict; for native words there may be one per character.
|
54
|
+
# ---------------------------------------------------------------------
|
55
|
+
try:
|
56
|
+
if hasattr(self, "_char_dicts") and isinstance(self._char_dicts, list):
|
57
|
+
if not self._char_dicts:
|
58
|
+
return # Nothing to update
|
59
|
+
|
60
|
+
if len(self._char_dicts) == 1:
|
61
|
+
# Simple case – a single char dict represents the whole text
|
62
|
+
self._char_dicts[0]["text"] = value
|
63
|
+
else:
|
64
|
+
# Update character-by-character. If new value is shorter than
|
65
|
+
# existing char dicts, truncate remaining dicts by setting
|
66
|
+
# their text to empty string; if longer, extend by repeating
|
67
|
+
# the last char dict geometry (best-effort fallback).
|
68
|
+
for idx, char_dict in enumerate(self._char_dicts):
|
69
|
+
if idx < len(value):
|
70
|
+
char_dict["text"] = value[idx]
|
71
|
+
else:
|
72
|
+
# Clear extra characters from old text
|
73
|
+
char_dict["text"] = ""
|
74
|
+
|
75
|
+
# If new text is longer, append additional char dicts based
|
76
|
+
# on the last available geometry. This is an approximation
|
77
|
+
# but ensures text length consistency for downstream joins.
|
78
|
+
if len(value) > len(self._char_dicts):
|
79
|
+
last_dict = self._char_dicts[-1]
|
80
|
+
for extra_idx in range(len(self._char_dicts), len(value)):
|
81
|
+
new_dict = last_dict.copy()
|
82
|
+
new_dict["text"] = value[extra_idx]
|
83
|
+
# Advance x0/x1 roughly by average char width if available
|
84
|
+
char_width = last_dict.get("adv") or (
|
85
|
+
last_dict.get("width", 0) / max(len(self.text), 1)
|
86
|
+
)
|
87
|
+
if isinstance(char_width, (int, float)) and char_width > 0:
|
88
|
+
shift = char_width * (extra_idx - len(self._char_dicts) + 1)
|
89
|
+
new_dict["x0"] = last_dict.get("x0", 0) + shift
|
90
|
+
new_dict["x1"] = last_dict.get("x1", 0) + shift
|
91
|
+
self._char_dicts.append(new_dict)
|
92
|
+
except Exception as sync_err: # pragma: no cover
|
93
|
+
# Keep failures silent but logged; better to have outdated chars than crash.
|
94
|
+
import logging
|
95
|
+
logger = logging.getLogger(__name__)
|
96
|
+
logger.debug(f"TextElement: Failed to sync _char_dicts after text update: {sync_err}")
|
97
|
+
|
49
98
|
@property
|
50
99
|
def source(self) -> str:
|
51
100
|
"""Get the source of this text element (pdf or ocr)."""
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -2,6 +2,7 @@ import json
|
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import tempfile
|
5
|
+
import warnings
|
5
6
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
6
7
|
|
7
8
|
import numpy as np
|
@@ -310,13 +311,39 @@ class DocumentQA:
|
|
310
311
|
QAResult instance with answer details
|
311
312
|
"""
|
312
313
|
# Ensure we have text elements on the page
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
314
|
+
elements = page.find_all("text")
|
315
|
+
if not elements:
|
316
|
+
# Warn that no text was found and recommend OCR
|
317
|
+
warnings.warn(
|
318
|
+
f"No text elements found on page {page.index}. "
|
319
|
+
"Consider applying OCR first using page.apply_ocr() to extract text from images.",
|
320
|
+
UserWarning
|
321
|
+
)
|
322
|
+
|
323
|
+
# Return appropriate "not found" result(s)
|
324
|
+
if isinstance(question, (list, tuple)):
|
325
|
+
return [
|
326
|
+
QAResult(
|
327
|
+
question=q,
|
328
|
+
answer="",
|
329
|
+
confidence=0.0,
|
330
|
+
start=-1,
|
331
|
+
end=-1,
|
332
|
+
found=False,
|
333
|
+
)
|
334
|
+
for q in question
|
335
|
+
]
|
336
|
+
else:
|
337
|
+
return QAResult(
|
338
|
+
question=question,
|
339
|
+
answer="",
|
340
|
+
confidence=0.0,
|
341
|
+
start=-1,
|
342
|
+
end=-1,
|
343
|
+
found=False,
|
344
|
+
)
|
317
345
|
|
318
346
|
# Extract word boxes
|
319
|
-
elements = page.find_all("text")
|
320
347
|
word_boxes = self._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
|
321
348
|
|
322
349
|
# Generate a high-resolution image of the page
|
@@ -393,10 +420,37 @@ class DocumentQA:
|
|
393
420
|
# Get all text elements within the region
|
394
421
|
elements = region.find_all("text")
|
395
422
|
|
396
|
-
#
|
423
|
+
# Check if we have text elements
|
397
424
|
if not elements:
|
398
|
-
|
399
|
-
|
425
|
+
# Warn that no text was found and recommend OCR
|
426
|
+
warnings.warn(
|
427
|
+
f"No text elements found in region on page {region.page.index}. "
|
428
|
+
"Consider applying OCR first using region.apply_ocr() to extract text from images.",
|
429
|
+
UserWarning
|
430
|
+
)
|
431
|
+
|
432
|
+
# Return appropriate "not found" result(s)
|
433
|
+
if isinstance(question, (list, tuple)):
|
434
|
+
return [
|
435
|
+
QAResult(
|
436
|
+
question=q,
|
437
|
+
answer="",
|
438
|
+
confidence=0.0,
|
439
|
+
start=-1,
|
440
|
+
end=-1,
|
441
|
+
found=False,
|
442
|
+
)
|
443
|
+
for q in question
|
444
|
+
]
|
445
|
+
else:
|
446
|
+
return QAResult(
|
447
|
+
question=question,
|
448
|
+
answer="",
|
449
|
+
confidence=0.0,
|
450
|
+
start=-1,
|
451
|
+
end=-1,
|
452
|
+
found=False,
|
453
|
+
)
|
400
454
|
|
401
455
|
# Extract word boxes adjusted for the cropped region
|
402
456
|
x0, top = int(region.x0), int(region.top)
|
natural_pdf/utils/packaging.py
CHANGED
@@ -36,7 +36,7 @@ def create_correction_task_package(
|
|
36
36
|
output_zip_path: str,
|
37
37
|
overwrite: bool = False,
|
38
38
|
suggest=None,
|
39
|
-
resolution: int =
|
39
|
+
resolution: int = 300,
|
40
40
|
) -> None:
|
41
41
|
"""
|
42
42
|
Creates a zip package containing data for an OCR correction task.
|
@@ -160,8 +160,22 @@ def create_correction_task_package(
|
|
160
160
|
|
161
161
|
# 3. Prepare region data for manifest
|
162
162
|
page_regions_data = []
|
163
|
-
# Calculate scaling factor from PDF
|
164
|
-
|
163
|
+
# Calculate scaling factor *from PDF points* to *actual image pixels*.
|
164
|
+
# We prefer using the rendered image dimensions rather than the nominal
|
165
|
+
# resolution value, because the image might have been resized (e.g. via
|
166
|
+
# global `natural_pdf.options.image.width`). This guarantees that the
|
167
|
+
# bounding boxes we write to the manifest always align with the exact
|
168
|
+
# pixel grid of the exported image.
|
169
|
+
|
170
|
+
try:
|
171
|
+
scale_x = img.width / float(page.width) if page.width else 1.0
|
172
|
+
scale_y = img.height / float(page.height) if page.height else 1.0
|
173
|
+
except Exception as e:
|
174
|
+
logger.warning(
|
175
|
+
f"Could not compute per-axis scale factors for page {page.number}: {e}. "
|
176
|
+
"Falling back to resolution-based scaling."
|
177
|
+
)
|
178
|
+
scale_x = scale_y = resolution / 72.0
|
165
179
|
|
166
180
|
i = -1
|
167
181
|
for elem in tqdm(ocr_elements):
|
@@ -176,12 +190,12 @@ def create_correction_task_package(
|
|
176
190
|
continue
|
177
191
|
region_id = f"r_{page.index}_{i}" # ID unique within page
|
178
192
|
|
179
|
-
# Scale coordinates to match the
|
193
|
+
# Scale coordinates to match the **actual** image dimensions.
|
180
194
|
scaled_bbox = [
|
181
|
-
elem.x0 *
|
182
|
-
elem.top *
|
183
|
-
elem.x1 *
|
184
|
-
elem.bottom *
|
195
|
+
elem.x0 * scale_x,
|
196
|
+
elem.top * scale_y,
|
197
|
+
elem.x1 * scale_x,
|
198
|
+
elem.bottom * scale_y,
|
185
199
|
]
|
186
200
|
|
187
201
|
corrected = elem.text
|
@@ -191,7 +205,7 @@ def create_correction_task_package(
|
|
191
205
|
|
192
206
|
page_regions_data.append(
|
193
207
|
{
|
194
|
-
"resolution":
|
208
|
+
"resolution": scale_x * 72.0,
|
195
209
|
"id": region_id,
|
196
210
|
"bbox": scaled_bbox,
|
197
211
|
"ocr_text": elem.text,
|
@@ -26,20 +26,20 @@ natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm
|
|
26
26
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
27
27
|
natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
|
28
28
|
natural_pdf/core/highlighting_service.py,sha256=DKoaxiiuQsWgtf6wSroMAIcFiqJOOF7dXhciYdQKdCw,38223
|
29
|
-
natural_pdf/core/page.py,sha256=
|
29
|
+
natural_pdf/core/page.py,sha256=GqYfYiVkuL1M_GoPTcLL0yWFXISN38BUCdQIKyF6vJ0,122721
|
30
30
|
natural_pdf/core/pdf.py,sha256=qsSW4RxOJRmCnweLPMs0NhzkRfiAVdghTgnh4D_wuO4,74295
|
31
31
|
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
32
32
|
natural_pdf/describe/base.py,sha256=mUvEydumXXPJ2FkWAYm1BbWrRWY81I0dMyQrEU32rmc,17256
|
33
33
|
natural_pdf/describe/elements.py,sha256=xD8wwR1z5IKat7RIwoAwQRUEL6zJTEwcOKorF4F-xPg,12717
|
34
34
|
natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
|
35
|
-
natural_pdf/describe/summary.py,sha256=
|
35
|
+
natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
|
36
36
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
37
37
|
natural_pdf/elements/base.py,sha256=iw-Ab0o7eI69npt0gAxQvA14GPWHAAhkLrJ_JeKvIos,43309
|
38
38
|
natural_pdf/elements/collections.py,sha256=JrM42VPRtDOJ9Q9KIR3SrcbamiiCHXI4nzTq2BBkeEk,124223
|
39
|
-
natural_pdf/elements/line.py,sha256=
|
39
|
+
natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
|
40
40
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
41
|
-
natural_pdf/elements/region.py,sha256=
|
42
|
-
natural_pdf/elements/text.py,sha256=
|
41
|
+
natural_pdf/elements/region.py,sha256=63rdyjOnbmsgTN1WMSOyQWQnvJRUYco9qTWLqBi3TBk,125498
|
42
|
+
natural_pdf/elements/text.py,sha256=x163dnr2ZDEIE_WZXWH5hXJtoO-6cvTdA2BABcZd69U,14575
|
43
43
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
44
44
|
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
45
45
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
@@ -70,7 +70,7 @@ natural_pdf/ocr/ocr_manager.py,sha256=K2gpFo3e6RB1ouXOstlEAAYd14DbjBNt5RH6J7ZdDQ
|
|
70
70
|
natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
|
71
71
|
natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
|
72
72
|
natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
|
73
|
-
natural_pdf/qa/document_qa.py,sha256=
|
73
|
+
natural_pdf/qa/document_qa.py,sha256=cli1E9NBSVtT5Qo6n7ZRd7BpstnbpZfkljX69LGTYU8,19608
|
74
74
|
natural_pdf/qa/qa_result.py,sha256=_q4dlSqsjtgomcI8-pqbOT69lqQKnEMkhZNydoxEkkE,2227
|
75
75
|
natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
|
76
76
|
natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
|
@@ -86,15 +86,15 @@ natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,99
|
|
86
86
|
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
87
87
|
natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
|
88
88
|
natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
|
89
|
-
natural_pdf/utils/packaging.py,sha256=
|
89
|
+
natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
|
90
90
|
natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
|
91
91
|
natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
|
92
92
|
natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
|
93
93
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
94
94
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
95
|
-
natural_pdf-0.1.
|
96
|
-
natural_pdf-0.1.
|
97
|
-
natural_pdf-0.1.
|
98
|
-
natural_pdf-0.1.
|
99
|
-
natural_pdf-0.1.
|
100
|
-
natural_pdf-0.1.
|
95
|
+
natural_pdf-0.1.27.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
96
|
+
natural_pdf-0.1.27.dist-info/METADATA,sha256=TpH1LQXwoM6iK9l9q0HcftgezJzePczCtmYfQnAMh5w,6684
|
97
|
+
natural_pdf-0.1.27.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
98
|
+
natural_pdf-0.1.27.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
99
|
+
natural_pdf-0.1.27.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
|
100
|
+
natural_pdf-0.1.27.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|