natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,124 @@
|
|
1
|
+
"""
|
2
|
+
Line element class for natural-pdf.
|
3
|
+
"""
|
4
|
+
from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING
|
5
|
+
|
6
|
+
from natural_pdf.elements.base import Element
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.core.page import Page
|
10
|
+
|
11
|
+
|
12
|
+
class LineElement(Element):
    """
    Represents a line element in a PDF.

    This class is a wrapper around pdfplumber's line objects,
    providing additional functionality for analysis and extraction.

    Note:
        ``width`` on this class is the *stroke* width (``linewidth``), not the
        geometric extent of the line. Orientation checks therefore measure the
        horizontal span from the raw object's coordinates instead of ``width``.
    """

    def __init__(self, obj: Dict[str, Any], page: 'Page'):
        """
        Initialize a line element.

        Args:
            obj: The underlying pdfplumber object
            page: The parent Page object
        """
        super().__init__(obj, page)

    @staticmethod
    def _to_rgb(color: Any) -> Tuple:
        """
        Normalize a raw PDF color value to a 3-component RGB tuple.

        Args:
            color: A bare number (grayscale), or a tuple/list holding
                1 (grayscale), 3 (RGB) or 4 (CMYK) components.

        Returns:
            An (r, g, b) tuple; black ``(0, 0, 0)`` for anything unrecognized.
        """
        # A bare number is a grayscale level.
        if isinstance(color, (int, float)):
            return (color, color, color)

        if isinstance(color, (tuple, list)):
            if len(color) == 1:
                # Single-component sequence: grayscale, e.g. (0.5,)
                gray = color[0]
                return (gray, gray, gray)
            if len(color) == 3:
                # Already RGB; tuple() leaves an existing tuple untouched.
                return tuple(color)
            if len(color) == 4:
                # CMYK -> approximate RGB.
                c, m, y, k = color
                return (
                    1 - min(1, c + k),
                    1 - min(1, m + k),
                    1 - min(1, y + k),
                )

        # Unknown format (None, pattern name, unexpected length): default to black.
        return (0, 0, 0)

    def _horizontal_extent(self) -> float:
        """
        Return the horizontal span of the line in points.

        Reads ``x0``/``x1`` from the underlying object; if either is missing,
        falls back to the object's own ``width`` entry (0 when absent). This is
        deliberately independent of the ``width`` property, which reports the
        stroke width.
        """
        x0 = self._obj.get('x0')
        x1 = self._obj.get('x1')
        if x0 is None or x1 is None:
            return self._obj.get('width', 0)
        return abs(x1 - x0)

    @property
    def type(self) -> str:
        """Element type."""
        return 'line'

    @property
    def color(self) -> Tuple:
        """Get the line color (RGB tuple)."""
        # PDFs often use non-RGB values, so we handle different formats
        return self._to_rgb(self._obj.get('stroking_color', (0, 0, 0)))

    @property
    def width(self) -> float:
        """Get the line (stroke) width, read from the object's 'linewidth' entry."""
        return self._obj.get('linewidth', 0)

    @property
    def is_horizontal(self) -> bool:
        """Check if this is a horizontal line."""
        # Use the geometric span, not self.width (which is the stroke width).
        # self.height comes from the Element base class — presumably the
        # bounding-box height; TODO confirm against Element.
        return self.height < 1 and self._horizontal_extent() > 1

    @property
    def is_vertical(self) -> bool:
        """Check if this is a vertical line."""
        # Use the geometric span, not self.width (which is the stroke width).
        return self._horizontal_extent() < 1 and self.height > 1

    def text_above(self, distance: float = 5, **kwargs) -> Any:
        """
        Get text elements above this line.

        Currently a placeholder: ``distance`` and ``kwargs`` are accepted but
        not used, and an empty collection is always returned.

        Args:
            distance: Maximum distance above the line in points
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection of text elements above this line
        """
        # Imported here rather than at module level, as in the original file.
        from natural_pdf.elements.collections import ElementCollection

        # TODO: Implement proper filtering of elements above this line
        return ElementCollection([])  # Placeholder

    def text_below(self, distance: float = 5, **kwargs) -> Any:
        """
        Get text elements below this line.

        Currently a placeholder: ``distance`` and ``kwargs`` are accepted but
        not used, and an empty collection is always returned.

        Args:
            distance: Maximum distance below the line in points
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection of text elements below this line
        """
        # Imported here rather than at module level, as in the original file.
        from natural_pdf.elements.collections import ElementCollection

        # TODO: Implement proper filtering of elements below this line
        return ElementCollection([])  # Placeholder

    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
        """
        Lines don't have text, so this returns an empty string.

        Args:
            keep_blank_chars: Whether to keep blank characters (default: True)
            apply_exclusions: Whether to apply exclusion regions (default: True)
            **kwargs: Additional extraction parameters

        Returns:
            Empty string
        """
        return ""

    def __repr__(self) -> str:
        """String representation of the line element."""
        if self.is_horizontal:
            line_type = "horizontal"
        elif self.is_vertical:
            line_type = "vertical"
        else:
            line_type = "diagonal"
        return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"
|
@@ -0,0 +1,122 @@
|
|
1
|
+
"""
|
2
|
+
Rectangle element class for natural-pdf.
|
3
|
+
"""
|
4
|
+
from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING
|
5
|
+
|
6
|
+
from natural_pdf.elements.base import Element
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.core.page import Page
|
10
|
+
|
11
|
+
|
12
|
+
class RectangleElement(Element):
    """
    Represents a rectangle element in a PDF.

    This class is a wrapper around pdfplumber's rectangle objects,
    providing additional functionality for analysis and extraction.
    """

    def __init__(self, obj: Dict[str, Any], page: 'Page'):
        """
        Initialize a rectangle element.

        Args:
            obj: The underlying pdfplumber object
            page: The parent Page object
        """
        super().__init__(obj, page)

    @staticmethod
    def _to_rgb(color: Any) -> Tuple:
        """
        Normalize a raw PDF color value to a 3-component RGB tuple.

        Shared by ``fill`` and ``stroke`` so both interpret colors the same way.

        Args:
            color: A bare number (grayscale), or a tuple/list holding
                1 (grayscale), 3 (RGB) or 4 (CMYK) components.

        Returns:
            An (r, g, b) tuple; black ``(0, 0, 0)`` for anything unrecognized.
        """
        # A bare number is a grayscale level.
        if isinstance(color, (int, float)):
            return (color, color, color)

        if isinstance(color, (tuple, list)):
            if len(color) == 1:
                # Single-component sequence: grayscale, e.g. (0.5,)
                gray = color[0]
                return (gray, gray, gray)
            if len(color) == 3:
                # Already RGB; tuple() leaves an existing tuple untouched.
                return tuple(color)
            if len(color) == 4:
                # CMYK -> approximate RGB.
                c, m, y, k = color
                return (
                    1 - min(1, c + k),
                    1 - min(1, m + k),
                    1 - min(1, y + k),
                )

        # Unknown format (None, pattern name, unexpected length): default to black.
        return (0, 0, 0)

    @property
    def type(self) -> str:
        """Element type."""
        return 'rect'

    @property
    def fill(self) -> Tuple:
        """Get the fill color of the rectangle (RGB tuple)."""
        # PDFs often use non-RGB values, so we handle different formats
        return self._to_rgb(self._obj.get('non_stroking_color', (0, 0, 0)))

    @property
    def stroke(self) -> Tuple:
        """Get the stroke color of the rectangle (RGB tuple)."""
        # PDFs often use non-RGB values, so we handle different formats
        return self._to_rgb(self._obj.get('stroking_color', (0, 0, 0)))

    @property
    def stroke_width(self) -> float:
        """Get the stroke width of the rectangle (the object's 'linewidth' entry)."""
        return self._obj.get('linewidth', 0)

    def text_inside(self, **kwargs) -> Any:
        """
        Get text elements inside this rectangle.

        Currently a placeholder: ``kwargs`` are accepted but not used, and an
        empty collection is always returned.

        Args:
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection of text elements inside this rectangle
        """
        # Imported here rather than at module level, as in the original file.
        from natural_pdf.elements.collections import ElementCollection

        # TODO: Implement proper filtering of elements inside this rectangle
        return ElementCollection([])  # Placeholder

    def extract_text(self, **kwargs) -> str:
        """
        Extract text from inside this rectangle.

        Builds a Region from this element's page and bounding box and
        delegates to its ``extract_text``.

        Args:
            **kwargs: Additional extraction parameters, forwarded unchanged

        Returns:
            Extracted text as string
        """
        # Imported here rather than at module level, as in the original file.
        from natural_pdf.elements.region import Region

        # Use the region to extract text
        region = Region(self.page, self.bbox)
        return region.extract_text(**kwargs)

    def __repr__(self) -> str:
        """String representation of the rectangle element."""
        return f"<RectangleElement fill={self.fill} stroke={self.stroke} bbox={self.bbox}>"
|