natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,124 @@
1
+ """
2
+ Line element class for natural-pdf.
3
+ """
4
+ from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING
5
+
6
+ from natural_pdf.elements.base import Element
7
+
8
+ if TYPE_CHECKING:
9
+ from natural_pdf.core.page import Page
10
+
11
+
12
class LineElement(Element):
    """
    Represents a line element in a PDF.

    This class is a wrapper around pdfplumber's line objects,
    providing additional functionality for analysis and extraction.

    Note:
        ``width`` on this class is the *stroke* width (the ``linewidth``
        entry of the underlying object), not the horizontal extent of the
        line. Orientation checks therefore measure the horizontal extent
        from the object's ``x0``/``x1`` coordinates.
    """

    def __init__(self, obj: Dict[str, Any], page: 'Page'):
        """
        Initialize a line element.

        Args:
            obj: The underlying pdfplumber object
            page: The parent Page object
        """
        super().__init__(obj, page)

    @staticmethod
    def _to_rgb(color: Any) -> Tuple:
        """
        Normalize a raw PDF color value to a 3-component tuple.

        Args:
            color: A bare number (grayscale), or a tuple/list with
                1 (grayscale), 3 (RGB) or 4 (CMYK) components.

        Returns:
            A 3-tuple; black ``(0, 0, 0)`` for any unrecognized value.
        """
        # A bare number is a grayscale level.
        if isinstance(color, (int, float)):
            return (color, color, color)

        if isinstance(color, (tuple, list)):
            # One numeric component is also grayscale; without this branch
            # a white line such as (1,) would be reported as black.
            if len(color) == 1 and isinstance(color[0], (int, float)):
                gray = color[0]
                return (gray, gray, gray)

            # Three components are already RGB.
            if len(color) == 3:
                return tuple(color)

            # Four components are CMYK; convert to an approximate RGB.
            if len(color) == 4:
                c, m, y, k = color
                return (
                    1 - min(1, c + k),
                    1 - min(1, m + k),
                    1 - min(1, y + k),
                )

        # Default to black
        return (0, 0, 0)

    @property
    def _extent_x(self) -> float:
        """Horizontal distance covered by the line (|x1 - x0|)."""
        # Assumes the underlying object carries 'x0' and 'x1', as
        # pdfplumber line objects do.
        return abs(self._obj['x1'] - self._obj['x0'])

    @property
    def type(self) -> str:
        """Element type."""
        return 'line'

    @property
    def color(self) -> Tuple:
        """Get the line color (RGB tuple)."""
        # PDFs often use non-RGB values, so we handle different formats
        return self._to_rgb(self._obj.get('stroking_color', (0, 0, 0)))

    @property
    def width(self) -> float:
        """Get the line width (stroke thickness, 0 if not set)."""
        return self._obj.get('linewidth', 0)

    @property
    def is_horizontal(self) -> bool:
        """Check if this is a horizontal line."""
        # Use the geometric extent, not self.width: width is the stroke
        # thickness here, so thin horizontal rules would never qualify.
        return self.height < 1 and self._extent_x > 1

    @property
    def is_vertical(self) -> bool:
        """Check if this is a vertical line."""
        # Same reasoning as is_horizontal: compare the x-extent, not the
        # stroke thickness.
        return self._extent_x < 1 and self.height > 1

    def text_above(self, distance: float = 5, **kwargs) -> Any:
        """
        Get text elements above this line.

        Not implemented yet: currently always returns an empty
        collection and ignores ``distance`` and ``kwargs``.

        Args:
            distance: Maximum distance above the line in points
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection of text elements above this line
            (currently always empty)
        """
        from natural_pdf.elements.collections import ElementCollection

        # TODO: Implement proper filtering of elements above this line
        return ElementCollection([])  # Placeholder

    def text_below(self, distance: float = 5, **kwargs) -> Any:
        """
        Get text elements below this line.

        Not implemented yet: currently always returns an empty
        collection and ignores ``distance`` and ``kwargs``.

        Args:
            distance: Maximum distance below the line in points
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection of text elements below this line
            (currently always empty)
        """
        from natural_pdf.elements.collections import ElementCollection

        # TODO: Implement proper filtering of elements below this line
        return ElementCollection([])  # Placeholder

    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
        """
        Lines don't have text, so this returns an empty string.

        Args:
            keep_blank_chars: Whether to keep blank characters (default: True); unused
            apply_exclusions: Whether to apply exclusion regions (default: True); unused
            **kwargs: Additional extraction parameters; unused

        Returns:
            Empty string
        """
        return ""

    def __repr__(self) -> str:
        """String representation of the line element."""
        if self.is_horizontal:
            line_type = "horizontal"
        elif self.is_vertical:
            line_type = "vertical"
        else:
            line_type = "diagonal"
        return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"
@@ -0,0 +1,122 @@
1
+ """
2
+ Rectangle element class for natural-pdf.
3
+ """
4
+ from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING
5
+
6
+ from natural_pdf.elements.base import Element
7
+
8
+ if TYPE_CHECKING:
9
+ from natural_pdf.core.page import Page
10
+
11
+
12
class RectangleElement(Element):
    """
    Represents a rectangle element in a PDF.

    This class is a wrapper around pdfplumber's rectangle objects,
    providing additional functionality for analysis and extraction.
    """

    def __init__(self, obj: Dict[str, Any], page: 'Page'):
        """
        Initialize a rectangle element.

        Args:
            obj: The underlying pdfplumber object
            page: The parent Page object
        """
        super().__init__(obj, page)

    @staticmethod
    def _to_rgb(color: Any) -> Tuple:
        """
        Normalize a raw PDF color value to a 3-component tuple.

        Shared by ``fill`` and ``stroke`` so both interpret colors the
        same way.

        Args:
            color: A bare number (grayscale), or a tuple/list with
                1 (grayscale), 3 (RGB) or 4 (CMYK) components.

        Returns:
            A 3-tuple; black ``(0, 0, 0)`` for any unrecognized value.
        """
        # A bare number is a grayscale level.
        if isinstance(color, (int, float)):
            return (color, color, color)

        if isinstance(color, (tuple, list)):
            # One numeric component is also grayscale; without this branch
            # a white fill such as (1,) would be reported as black.
            if len(color) == 1 and isinstance(color[0], (int, float)):
                gray = color[0]
                return (gray, gray, gray)

            # Three components are already RGB.
            if len(color) == 3:
                return tuple(color)

            # Four components are CMYK; convert to an approximate RGB.
            if len(color) == 4:
                c, m, y, k = color
                return (
                    1 - min(1, c + k),
                    1 - min(1, m + k),
                    1 - min(1, y + k),
                )

        # Default to black
        return (0, 0, 0)

    @property
    def type(self) -> str:
        """Element type."""
        return 'rect'

    @property
    def fill(self) -> Tuple:
        """Get the fill color of the rectangle (RGB tuple)."""
        # PDFs often use non-RGB values, so we handle different formats
        return self._to_rgb(self._obj.get('non_stroking_color', (0, 0, 0)))

    @property
    def stroke(self) -> Tuple:
        """Get the stroke color of the rectangle (RGB tuple)."""
        # PDFs often use non-RGB values, so we handle different formats
        return self._to_rgb(self._obj.get('stroking_color', (0, 0, 0)))

    @property
    def stroke_width(self) -> float:
        """Get the stroke width of the rectangle (0 if not set)."""
        return self._obj.get('linewidth', 0)

    def text_inside(self, **kwargs) -> Any:
        """
        Get text elements inside this rectangle.

        Not implemented yet: currently always returns an empty
        collection and ignores ``kwargs``.

        Args:
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection of text elements inside this rectangle
            (currently always empty)
        """
        from natural_pdf.elements.collections import ElementCollection

        # TODO: Implement proper filtering of elements inside this rectangle
        return ElementCollection([])  # Placeholder

    def extract_text(self, **kwargs) -> str:
        """
        Extract text from inside this rectangle.

        Builds a Region from this element's page and bounding box and
        delegates the extraction to it.

        Args:
            **kwargs: Additional extraction parameters, forwarded to
                ``Region.extract_text``

        Returns:
            Extracted text as string
        """
        # Imported here rather than at module level, as in the original.
        from natural_pdf.elements.region import Region

        # Use the region to extract text
        region = Region(self.page, self.bbox)
        return region.extract_text(**kwargs)

    def __repr__(self) -> str:
        """String representation of the rectangle element."""
        return f"<RectangleElement fill={self.fill} stroke={self.stroke} bbox={self.bbox}>"