natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/line.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
"""
|
2
2
|
Line element class for natural-pdf.
|
3
3
|
"""
|
4
|
-
|
4
|
+
|
5
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
|
5
6
|
|
6
7
|
from natural_pdf.elements.base import Element
|
7
8
|
|
@@ -12,40 +13,40 @@ if TYPE_CHECKING:
|
|
12
13
|
class LineElement(Element):
|
13
14
|
"""
|
14
15
|
Represents a line element in a PDF.
|
15
|
-
|
16
|
+
|
16
17
|
This class is a wrapper around pdfplumber's line objects,
|
17
18
|
providing additional functionality for analysis and extraction.
|
18
19
|
"""
|
19
|
-
|
20
|
-
def __init__(self, obj: Dict[str, Any], page:
|
20
|
+
|
21
|
+
def __init__(self, obj: Dict[str, Any], page: "Page"):
|
21
22
|
"""
|
22
23
|
Initialize a line element.
|
23
|
-
|
24
|
+
|
24
25
|
Args:
|
25
26
|
obj: The underlying pdfplumber object
|
26
27
|
page: The parent Page object
|
27
28
|
"""
|
28
29
|
super().__init__(obj, page)
|
29
|
-
|
30
|
+
|
30
31
|
@property
|
31
32
|
def type(self) -> str:
|
32
33
|
"""Element type."""
|
33
|
-
return
|
34
|
-
|
34
|
+
return "line"
|
35
|
+
|
35
36
|
@property
|
36
37
|
def color(self) -> Tuple:
|
37
38
|
"""Get the line color (RGB tuple)."""
|
38
39
|
# PDFs often use non-RGB values, so we handle different formats
|
39
|
-
color = self._obj.get(
|
40
|
-
|
40
|
+
color = self._obj.get("stroking_color", (0, 0, 0))
|
41
|
+
|
41
42
|
# If it's a single value, treat as grayscale
|
42
43
|
if isinstance(color, (int, float)):
|
43
44
|
return (color, color, color)
|
44
|
-
|
45
|
+
|
45
46
|
# If it's a tuple of 3 values, treat as RGB
|
46
47
|
if isinstance(color, tuple) and len(color) == 3:
|
47
48
|
return color
|
48
|
-
|
49
|
+
|
49
50
|
# If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
|
50
51
|
if isinstance(color, tuple) and len(color) == 4:
|
51
52
|
c, m, y, k = color
|
@@ -53,88 +54,90 @@ class LineElement(Element):
|
|
53
54
|
g = 1 - min(1, m + k)
|
54
55
|
b = 1 - min(1, y + k)
|
55
56
|
return (r, g, b)
|
56
|
-
|
57
|
+
|
57
58
|
# Default to black
|
58
59
|
return (0, 0, 0)
|
59
|
-
|
60
|
+
|
60
61
|
@property
|
61
62
|
def width(self) -> float:
|
62
63
|
"""Get the line thickness (extracted from PDF properties)."""
|
63
|
-
return self._obj.get(
|
64
|
-
|
64
|
+
return self._obj.get("linewidth", 0)
|
65
|
+
|
65
66
|
@property
|
66
67
|
def is_horizontal(self) -> bool:
|
67
68
|
"""Check if this is a horizontal line based on coordinates."""
|
68
69
|
# Calculate absolute difference in coordinates
|
69
70
|
dx = abs(self.x1 - self.x0)
|
70
71
|
dy = abs(self.top - self.bottom)
|
71
|
-
|
72
|
+
|
72
73
|
# Define a tolerance for near-horizontal lines (e.g., 1 point)
|
73
|
-
tolerance = 1.0
|
74
|
-
|
74
|
+
tolerance = 1.0
|
75
|
+
|
75
76
|
# Horizontal if y-change is within tolerance and x-change is significant
|
76
77
|
return dy <= tolerance and dx > tolerance
|
77
|
-
|
78
|
+
|
78
79
|
@property
|
79
80
|
def is_vertical(self) -> bool:
|
80
81
|
"""Check if this is a vertical line based on coordinates."""
|
81
82
|
# Calculate absolute difference in coordinates
|
82
83
|
dx = abs(self.x1 - self.x0)
|
83
84
|
dy = abs(self.top - self.bottom)
|
84
|
-
|
85
|
+
|
85
86
|
# Define a tolerance for near-vertical lines (e.g., 1 point)
|
86
87
|
tolerance = 1.0
|
87
|
-
|
88
|
+
|
88
89
|
# Vertical if x-change is within tolerance and y-change is significant
|
89
90
|
return dx <= tolerance and dy > tolerance
|
90
|
-
|
91
|
+
|
91
92
|
def text_above(self, distance: float = 5, **kwargs) -> Any:
|
92
93
|
"""
|
93
94
|
Get text elements above this line.
|
94
|
-
|
95
|
+
|
95
96
|
Args:
|
96
97
|
distance: Maximum distance above the line in points
|
97
98
|
**kwargs: Additional filter parameters
|
98
|
-
|
99
|
+
|
99
100
|
Returns:
|
100
101
|
ElementCollection of text elements above this line
|
101
102
|
"""
|
102
103
|
from natural_pdf.elements.collections import ElementCollection
|
103
|
-
|
104
|
+
|
104
105
|
# TODO: Implement proper filtering of elements above this line
|
105
106
|
return ElementCollection([]) # Placeholder
|
106
|
-
|
107
|
+
|
107
108
|
def text_below(self, distance: float = 5, **kwargs) -> Any:
|
108
109
|
"""
|
109
110
|
Get text elements below this line.
|
110
|
-
|
111
|
+
|
111
112
|
Args:
|
112
113
|
distance: Maximum distance below the line in points
|
113
114
|
**kwargs: Additional filter parameters
|
114
|
-
|
115
|
+
|
115
116
|
Returns:
|
116
117
|
ElementCollection of text elements below this line
|
117
118
|
"""
|
118
119
|
from natural_pdf.elements.collections import ElementCollection
|
119
|
-
|
120
|
+
|
120
121
|
# TODO: Implement proper filtering of elements below this line
|
121
122
|
return ElementCollection([]) # Placeholder
|
122
|
-
|
123
|
+
|
123
124
|
def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
|
124
125
|
"""
|
125
126
|
Lines don't have text, so this returns an empty string.
|
126
|
-
|
127
|
+
|
127
128
|
Args:
|
128
129
|
keep_blank_chars: Whether to keep blank characters (default: True)
|
129
130
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
130
131
|
**kwargs: Additional extraction parameters
|
131
|
-
|
132
|
+
|
132
133
|
Returns:
|
133
134
|
Empty string
|
134
135
|
"""
|
135
136
|
return ""
|
136
|
-
|
137
|
+
|
137
138
|
def __repr__(self) -> str:
|
138
139
|
"""String representation of the line element."""
|
139
|
-
line_type =
|
140
|
-
|
140
|
+
line_type = (
|
141
|
+
"horizontal" if self.is_horizontal else "vertical" if self.is_vertical else "diagonal"
|
142
|
+
)
|
143
|
+
return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"
|
natural_pdf/elements/rect.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
"""
|
2
2
|
Rectangle element class for natural-pdf.
|
3
3
|
"""
|
4
|
-
|
4
|
+
|
5
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
|
5
6
|
|
6
7
|
from natural_pdf.elements.base import Element
|
7
8
|
|
@@ -12,40 +13,40 @@ if TYPE_CHECKING:
|
|
12
13
|
class RectangleElement(Element):
|
13
14
|
"""
|
14
15
|
Represents a rectangle element in a PDF.
|
15
|
-
|
16
|
+
|
16
17
|
This class is a wrapper around pdfplumber's rectangle objects,
|
17
18
|
providing additional functionality for analysis and extraction.
|
18
19
|
"""
|
19
|
-
|
20
|
-
def __init__(self, obj: Dict[str, Any], page:
|
20
|
+
|
21
|
+
def __init__(self, obj: Dict[str, Any], page: "Page"):
|
21
22
|
"""
|
22
23
|
Initialize a rectangle element.
|
23
|
-
|
24
|
+
|
24
25
|
Args:
|
25
26
|
obj: The underlying pdfplumber object
|
26
27
|
page: The parent Page object
|
27
28
|
"""
|
28
29
|
super().__init__(obj, page)
|
29
|
-
|
30
|
+
|
30
31
|
@property
|
31
32
|
def type(self) -> str:
|
32
33
|
"""Element type."""
|
33
|
-
return
|
34
|
-
|
34
|
+
return "rect"
|
35
|
+
|
35
36
|
@property
|
36
37
|
def fill(self) -> Tuple:
|
37
38
|
"""Get the fill color of the rectangle (RGB tuple)."""
|
38
39
|
# PDFs often use non-RGB values, so we handle different formats
|
39
|
-
color = self._obj.get(
|
40
|
-
|
40
|
+
color = self._obj.get("non_stroking_color", (0, 0, 0))
|
41
|
+
|
41
42
|
# If it's a single value, treat as grayscale
|
42
43
|
if isinstance(color, (int, float)):
|
43
44
|
return (color, color, color)
|
44
|
-
|
45
|
+
|
45
46
|
# If it's a tuple of 3 values, treat as RGB
|
46
47
|
if isinstance(color, tuple) and len(color) == 3:
|
47
48
|
return color
|
48
|
-
|
49
|
+
|
49
50
|
# If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
|
50
51
|
if isinstance(color, tuple) and len(color) == 4:
|
51
52
|
c, m, y, k = color
|
@@ -53,24 +54,24 @@ class RectangleElement(Element):
|
|
53
54
|
g = 1 - min(1, m + k)
|
54
55
|
b = 1 - min(1, y + k)
|
55
56
|
return (r, g, b)
|
56
|
-
|
57
|
+
|
57
58
|
# Default to black
|
58
59
|
return (0, 0, 0)
|
59
|
-
|
60
|
+
|
60
61
|
@property
|
61
62
|
def stroke(self) -> Tuple:
|
62
63
|
"""Get the stroke color of the rectangle (RGB tuple)."""
|
63
64
|
# PDFs often use non-RGB values, so we handle different formats
|
64
|
-
color = self._obj.get(
|
65
|
-
|
65
|
+
color = self._obj.get("stroking_color", (0, 0, 0))
|
66
|
+
|
66
67
|
# If it's a single value, treat as grayscale
|
67
68
|
if isinstance(color, (int, float)):
|
68
69
|
return (color, color, color)
|
69
|
-
|
70
|
+
|
70
71
|
# If it's a tuple of 3 values, treat as RGB
|
71
72
|
if isinstance(color, tuple) and len(color) == 3:
|
72
73
|
return color
|
73
|
-
|
74
|
+
|
74
75
|
# If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
|
75
76
|
if isinstance(color, tuple) and len(color) == 4:
|
76
77
|
c, m, y, k = color
|
@@ -78,45 +79,46 @@ class RectangleElement(Element):
|
|
78
79
|
g = 1 - min(1, m + k)
|
79
80
|
b = 1 - min(1, y + k)
|
80
81
|
return (r, g, b)
|
81
|
-
|
82
|
+
|
82
83
|
# Default to black
|
83
84
|
return (0, 0, 0)
|
84
|
-
|
85
|
+
|
85
86
|
@property
|
86
87
|
def stroke_width(self) -> float:
|
87
88
|
"""Get the stroke width of the rectangle."""
|
88
|
-
return self._obj.get(
|
89
|
-
|
89
|
+
return self._obj.get("linewidth", 0)
|
90
|
+
|
90
91
|
def text_inside(self, **kwargs) -> Any:
|
91
92
|
"""
|
92
93
|
Get text elements inside this rectangle.
|
93
|
-
|
94
|
+
|
94
95
|
Args:
|
95
96
|
**kwargs: Additional filter parameters
|
96
|
-
|
97
|
+
|
97
98
|
Returns:
|
98
99
|
ElementCollection of text elements inside this rectangle
|
99
100
|
"""
|
100
101
|
from natural_pdf.elements.collections import ElementCollection
|
101
|
-
|
102
|
+
|
102
103
|
# TODO: Implement proper filtering of elements inside this rectangle
|
103
104
|
return ElementCollection([]) # Placeholder
|
104
|
-
|
105
|
+
|
105
106
|
def extract_text(self, **kwargs) -> str:
|
106
107
|
"""
|
107
108
|
Extract text from inside this rectangle.
|
108
|
-
|
109
|
+
|
109
110
|
Args:
|
110
111
|
**kwargs: Additional extraction parameters
|
111
|
-
|
112
|
+
|
112
113
|
Returns:
|
113
114
|
Extracted text as string
|
114
115
|
"""
|
115
116
|
# Use the region to extract text
|
116
117
|
from natural_pdf.elements.region import Region
|
118
|
+
|
117
119
|
region = Region(self.page, self.bbox)
|
118
120
|
return region.extract_text(**kwargs)
|
119
|
-
|
121
|
+
|
120
122
|
def __repr__(self) -> str:
|
121
123
|
"""String representation of the rectangle element."""
|
122
|
-
return f"<RectangleElement fill={self.fill} stroke={self.stroke} bbox={self.bbox}>"
|
124
|
+
return f"<RectangleElement fill={self.fill} stroke={self.stroke} bbox={self.bbox}>"
|