natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,8 @@
1
1
  """
2
2
  Line element class for natural-pdf.
3
3
  """
4
- from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING
4
+
5
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
5
6
 
6
7
  from natural_pdf.elements.base import Element
7
8
 
@@ -12,40 +13,40 @@ if TYPE_CHECKING:
12
13
  class LineElement(Element):
13
14
  """
14
15
  Represents a line element in a PDF.
15
-
16
+
16
17
  This class is a wrapper around pdfplumber's line objects,
17
18
  providing additional functionality for analysis and extraction.
18
19
  """
19
-
20
- def __init__(self, obj: Dict[str, Any], page: 'Page'):
20
+
21
+ def __init__(self, obj: Dict[str, Any], page: "Page"):
21
22
  """
22
23
  Initialize a line element.
23
-
24
+
24
25
  Args:
25
26
  obj: The underlying pdfplumber object
26
27
  page: The parent Page object
27
28
  """
28
29
  super().__init__(obj, page)
29
-
30
+
30
31
  @property
31
32
  def type(self) -> str:
32
33
  """Element type."""
33
- return 'line'
34
-
34
+ return "line"
35
+
35
36
  @property
36
37
  def color(self) -> Tuple:
37
38
  """Get the line color (RGB tuple)."""
38
39
  # PDFs often use non-RGB values, so we handle different formats
39
- color = self._obj.get('stroking_color', (0, 0, 0))
40
-
40
+ color = self._obj.get("stroking_color", (0, 0, 0))
41
+
41
42
  # If it's a single value, treat as grayscale
42
43
  if isinstance(color, (int, float)):
43
44
  return (color, color, color)
44
-
45
+
45
46
  # If it's a tuple of 3 values, treat as RGB
46
47
  if isinstance(color, tuple) and len(color) == 3:
47
48
  return color
48
-
49
+
49
50
  # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
50
51
  if isinstance(color, tuple) and len(color) == 4:
51
52
  c, m, y, k = color
@@ -53,88 +54,90 @@ class LineElement(Element):
53
54
  g = 1 - min(1, m + k)
54
55
  b = 1 - min(1, y + k)
55
56
  return (r, g, b)
56
-
57
+
57
58
  # Default to black
58
59
  return (0, 0, 0)
59
-
60
+
60
61
  @property
61
62
  def width(self) -> float:
62
63
  """Get the line thickness (extracted from PDF properties)."""
63
- return self._obj.get('linewidth', 0)
64
-
64
+ return self._obj.get("linewidth", 0)
65
+
65
66
  @property
66
67
  def is_horizontal(self) -> bool:
67
68
  """Check if this is a horizontal line based on coordinates."""
68
69
  # Calculate absolute difference in coordinates
69
70
  dx = abs(self.x1 - self.x0)
70
71
  dy = abs(self.top - self.bottom)
71
-
72
+
72
73
  # Define a tolerance for near-horizontal lines (e.g., 1 point)
73
- tolerance = 1.0
74
-
74
+ tolerance = 1.0
75
+
75
76
  # Horizontal if y-change is within tolerance and x-change is significant
76
77
  return dy <= tolerance and dx > tolerance
77
-
78
+
78
79
  @property
79
80
  def is_vertical(self) -> bool:
80
81
  """Check if this is a vertical line based on coordinates."""
81
82
  # Calculate absolute difference in coordinates
82
83
  dx = abs(self.x1 - self.x0)
83
84
  dy = abs(self.top - self.bottom)
84
-
85
+
85
86
  # Define a tolerance for near-vertical lines (e.g., 1 point)
86
87
  tolerance = 1.0
87
-
88
+
88
89
  # Vertical if x-change is within tolerance and y-change is significant
89
90
  return dx <= tolerance and dy > tolerance
90
-
91
+
91
92
  def text_above(self, distance: float = 5, **kwargs) -> Any:
92
93
  """
93
94
  Get text elements above this line.
94
-
95
+
95
96
  Args:
96
97
  distance: Maximum distance above the line in points
97
98
  **kwargs: Additional filter parameters
98
-
99
+
99
100
  Returns:
100
101
  ElementCollection of text elements above this line
101
102
  """
102
103
  from natural_pdf.elements.collections import ElementCollection
103
-
104
+
104
105
  # TODO: Implement proper filtering of elements above this line
105
106
  return ElementCollection([]) # Placeholder
106
-
107
+
107
108
  def text_below(self, distance: float = 5, **kwargs) -> Any:
108
109
  """
109
110
  Get text elements below this line.
110
-
111
+
111
112
  Args:
112
113
  distance: Maximum distance below the line in points
113
114
  **kwargs: Additional filter parameters
114
-
115
+
115
116
  Returns:
116
117
  ElementCollection of text elements below this line
117
118
  """
118
119
  from natural_pdf.elements.collections import ElementCollection
119
-
120
+
120
121
  # TODO: Implement proper filtering of elements below this line
121
122
  return ElementCollection([]) # Placeholder
122
-
123
+
123
124
  def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
124
125
  """
125
126
  Lines don't have text, so this returns an empty string.
126
-
127
+
127
128
  Args:
128
129
  keep_blank_chars: Whether to keep blank characters (default: True)
129
130
  apply_exclusions: Whether to apply exclusion regions (default: True)
130
131
  **kwargs: Additional extraction parameters
131
-
132
+
132
133
  Returns:
133
134
  Empty string
134
135
  """
135
136
  return ""
136
-
137
+
137
138
  def __repr__(self) -> str:
138
139
  """String representation of the line element."""
139
- line_type = "horizontal" if self.is_horizontal else "vertical" if self.is_vertical else "diagonal"
140
- return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"
140
+ line_type = (
141
+ "horizontal" if self.is_horizontal else "vertical" if self.is_vertical else "diagonal"
142
+ )
143
+ return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"
@@ -1,7 +1,8 @@
1
1
  """
2
2
  Rectangle element class for natural-pdf.
3
3
  """
4
- from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING
4
+
5
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
5
6
 
6
7
  from natural_pdf.elements.base import Element
7
8
 
@@ -12,40 +13,40 @@ if TYPE_CHECKING:
12
13
  class RectangleElement(Element):
13
14
  """
14
15
  Represents a rectangle element in a PDF.
15
-
16
+
16
17
  This class is a wrapper around pdfplumber's rectangle objects,
17
18
  providing additional functionality for analysis and extraction.
18
19
  """
19
-
20
- def __init__(self, obj: Dict[str, Any], page: 'Page'):
20
+
21
+ def __init__(self, obj: Dict[str, Any], page: "Page"):
21
22
  """
22
23
  Initialize a rectangle element.
23
-
24
+
24
25
  Args:
25
26
  obj: The underlying pdfplumber object
26
27
  page: The parent Page object
27
28
  """
28
29
  super().__init__(obj, page)
29
-
30
+
30
31
  @property
31
32
  def type(self) -> str:
32
33
  """Element type."""
33
- return 'rect'
34
-
34
+ return "rect"
35
+
35
36
  @property
36
37
  def fill(self) -> Tuple:
37
38
  """Get the fill color of the rectangle (RGB tuple)."""
38
39
  # PDFs often use non-RGB values, so we handle different formats
39
- color = self._obj.get('non_stroking_color', (0, 0, 0))
40
-
40
+ color = self._obj.get("non_stroking_color", (0, 0, 0))
41
+
41
42
  # If it's a single value, treat as grayscale
42
43
  if isinstance(color, (int, float)):
43
44
  return (color, color, color)
44
-
45
+
45
46
  # If it's a tuple of 3 values, treat as RGB
46
47
  if isinstance(color, tuple) and len(color) == 3:
47
48
  return color
48
-
49
+
49
50
  # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
50
51
  if isinstance(color, tuple) and len(color) == 4:
51
52
  c, m, y, k = color
@@ -53,24 +54,24 @@ class RectangleElement(Element):
53
54
  g = 1 - min(1, m + k)
54
55
  b = 1 - min(1, y + k)
55
56
  return (r, g, b)
56
-
57
+
57
58
  # Default to black
58
59
  return (0, 0, 0)
59
-
60
+
60
61
  @property
61
62
  def stroke(self) -> Tuple:
62
63
  """Get the stroke color of the rectangle (RGB tuple)."""
63
64
  # PDFs often use non-RGB values, so we handle different formats
64
- color = self._obj.get('stroking_color', (0, 0, 0))
65
-
65
+ color = self._obj.get("stroking_color", (0, 0, 0))
66
+
66
67
  # If it's a single value, treat as grayscale
67
68
  if isinstance(color, (int, float)):
68
69
  return (color, color, color)
69
-
70
+
70
71
  # If it's a tuple of 3 values, treat as RGB
71
72
  if isinstance(color, tuple) and len(color) == 3:
72
73
  return color
73
-
74
+
74
75
  # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
75
76
  if isinstance(color, tuple) and len(color) == 4:
76
77
  c, m, y, k = color
@@ -78,45 +79,46 @@ class RectangleElement(Element):
78
79
  g = 1 - min(1, m + k)
79
80
  b = 1 - min(1, y + k)
80
81
  return (r, g, b)
81
-
82
+
82
83
  # Default to black
83
84
  return (0, 0, 0)
84
-
85
+
85
86
  @property
86
87
  def stroke_width(self) -> float:
87
88
  """Get the stroke width of the rectangle."""
88
- return self._obj.get('linewidth', 0)
89
-
89
+ return self._obj.get("linewidth", 0)
90
+
90
91
  def text_inside(self, **kwargs) -> Any:
91
92
  """
92
93
  Get text elements inside this rectangle.
93
-
94
+
94
95
  Args:
95
96
  **kwargs: Additional filter parameters
96
-
97
+
97
98
  Returns:
98
99
  ElementCollection of text elements inside this rectangle
99
100
  """
100
101
  from natural_pdf.elements.collections import ElementCollection
101
-
102
+
102
103
  # TODO: Implement proper filtering of elements inside this rectangle
103
104
  return ElementCollection([]) # Placeholder
104
-
105
+
105
106
  def extract_text(self, **kwargs) -> str:
106
107
  """
107
108
  Extract text from inside this rectangle.
108
-
109
+
109
110
  Args:
110
111
  **kwargs: Additional extraction parameters
111
-
112
+
112
113
  Returns:
113
114
  Extracted text as string
114
115
  """
115
116
  # Use the region to extract text
116
117
  from natural_pdf.elements.region import Region
118
+
117
119
  region = Region(self.page, self.bbox)
118
120
  return region.extract_text(**kwargs)
119
-
121
+
120
122
  def __repr__(self) -> str:
121
123
  """String representation of the rectangle element."""
122
- return f"<RectangleElement fill={self.fill} stroke={self.stroke} bbox={self.bbox}>"
124
+ return f"<RectangleElement fill={self.fill} stroke={self.stroke} bbox={self.bbox}>"