natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +125 -97
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +907 -513
  81. natural_pdf/core/pdf.py +385 -287
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +708 -508
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,8 @@
1
1
  """
2
2
  Text element classes for natural-pdf.
3
3
  """
4
- from typing import Dict, Any, Optional, TYPE_CHECKING
4
+
5
+ from typing import TYPE_CHECKING, Any, Dict, Optional
5
6
 
6
7
  from natural_pdf.elements.base import Element
7
8
 
@@ -12,117 +13,128 @@ if TYPE_CHECKING:
12
13
  class TextElement(Element):
13
14
  """
14
15
  Represents a text element in a PDF.
15
-
16
+
16
17
  This class is a wrapper around pdfplumber's character objects,
17
18
  providing additional functionality for text extraction and analysis.
18
19
  """
19
-
20
- def __init__(self, obj: Dict[str, Any], page: 'Page'):
20
+
21
+ def __init__(self, obj: Dict[str, Any], page: "Page"):
21
22
  """
22
23
  Initialize a text element.
23
-
24
+
24
25
  Args:
25
26
  obj: The underlying pdfplumber object. For OCR text elements,
26
27
  should include 'text', 'bbox', 'source', and 'confidence'
27
28
  page: The parent Page object
28
29
  """
29
30
  # Add object_type if not present
30
- if 'object_type' not in obj:
31
- obj['object_type'] = 'text'
32
-
31
+ if "object_type" not in obj:
32
+ obj["object_type"] = "text"
33
+
33
34
  super().__init__(obj, page)
34
-
35
+ # Explicitly store constituent characters if provided
36
+ # (Pop from obj to avoid storing it twice if super() stores _obj by ref)
37
+ self._char_dicts = obj.pop("_char_dicts", [])
38
+
35
39
  @property
36
40
  def text(self) -> str:
37
41
  """Get the text content."""
38
- return self._obj.get('text', '')
39
-
42
+ return self._obj.get("text", "")
43
+
40
44
  @property
41
45
  def source(self) -> str:
42
46
  """Get the source of this text element (pdf or ocr)."""
43
- return self._obj.get('source', 'pdf')
44
-
47
+ return self._obj.get("source", "pdf")
48
+
45
49
  @property
46
50
  def confidence(self) -> float:
47
51
  """Get the confidence score for OCR text elements."""
48
- return self._obj.get('confidence', 1.0)
49
-
52
+ return self._obj.get("confidence", 1.0)
53
+
50
54
  @property
51
55
  def fontname(self) -> str:
52
56
  """Get the font name."""
53
57
  # First check if we have a real fontname from PDF resources
54
- if 'real_fontname' in self._obj:
55
- return self._obj['real_fontname']
58
+ if "real_fontname" in self._obj:
59
+ return self._obj["real_fontname"]
56
60
  # Otherwise use standard fontname
57
- return self._obj.get('fontname', '') or self._obj.get('font', '')
58
-
61
+ return self._obj.get("fontname", "") or self._obj.get("font", "")
62
+
59
63
  @property
60
64
  def font_family(self) -> str:
61
65
  """
62
66
  Get a cleaner font family name by stripping PDF-specific prefixes.
63
-
67
+
64
68
  PDF font names often include prefixes like 'ABCDEF+' followed by the font name
65
69
  or unique identifiers. This method attempts to extract a more readable font name.
66
70
  """
67
71
  font = self.fontname
68
-
72
+
69
73
  # Remove common PDF font prefixes (e.g., 'ABCDEF+')
70
- if '+' in font:
71
- font = font.split('+', 1)[1]
72
-
74
+ if "+" in font:
75
+ font = font.split("+", 1)[1]
76
+
73
77
  # Try to extract common font family names
74
78
  common_fonts = [
75
- 'Arial', 'Helvetica', 'Times', 'Courier', 'Calibri',
76
- 'Cambria', 'Georgia', 'Verdana', 'Tahoma', 'Trebuchet'
79
+ "Arial",
80
+ "Helvetica",
81
+ "Times",
82
+ "Courier",
83
+ "Calibri",
84
+ "Cambria",
85
+ "Georgia",
86
+ "Verdana",
87
+ "Tahoma",
88
+ "Trebuchet",
77
89
  ]
78
-
90
+
79
91
  for common in common_fonts:
80
92
  if common.lower() in font.lower():
81
93
  return common
82
-
94
+
83
95
  return font
84
-
96
+
85
97
  @property
86
98
  def font_variant(self) -> str:
87
99
  """
88
100
  Get the font variant identifier (prefix before the '+' in PDF font names).
89
-
101
+
90
102
  PDF embeds font subsets with unique identifiers like 'AAAAAB+FontName'.
91
103
  Different variants of the same base font will have different prefixes.
92
- This can be used to differentiate text that looks different despite
104
+ This can be used to differentiate text that looks different despite
93
105
  having the same font name and size.
94
-
106
+
95
107
  Returns:
96
108
  The font variant prefix, or empty string if no variant is present
97
109
  """
98
110
  font = self.fontname
99
-
111
+
100
112
  # Extract the prefix before '+' if it exists
101
- if '+' in font:
102
- return font.split('+', 1)[0]
103
-
113
+ if "+" in font:
114
+ return font.split("+", 1)[0]
115
+
104
116
  return ""
105
-
117
+
106
118
  @property
107
119
  def size(self) -> float:
108
120
  """Get the font size."""
109
- return self._obj.get('size', 0)
110
-
121
+ return self._obj.get("size", 0)
122
+
111
123
  @property
112
124
  def color(self) -> tuple:
113
125
  """Get the text color (RGB tuple)."""
114
126
  # PDFs often use non-RGB values, so we handle different formats
115
127
  # In pdfplumber, colors can be in various formats depending on the PDF
116
- color = self._obj.get('non_stroking_color', (0, 0, 0))
117
-
128
+ color = self._obj.get("non_stroking_color", (0, 0, 0))
129
+
118
130
  # If it's a single value, treat as grayscale
119
131
  if isinstance(color, (int, float)):
120
132
  return (color, color, color)
121
-
133
+
122
134
  # If it's a tuple of 3 values, treat as RGB
123
135
  if isinstance(color, tuple) and len(color) == 3:
124
136
  return color
125
-
137
+
126
138
  # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
127
139
  if isinstance(color, tuple) and len(color) == 4:
128
140
  c, m, y, k = color
@@ -130,33 +142,33 @@ class TextElement(Element):
130
142
  g = 1 - min(1, m + k)
131
143
  b = 1 - min(1, y + k)
132
144
  return (r, g, b)
133
-
145
+
134
146
  # Default to black
135
147
  return (0, 0, 0)
136
-
148
+
137
149
  def extract_text(self, keep_blank_chars=True, **kwargs) -> str:
138
150
  """
139
151
  Extract text from this element.
140
-
152
+
141
153
  Args:
142
154
  keep_blank_chars: Whether to keep blank characters (default: True)
143
155
  **kwargs: Additional extraction parameters
144
-
156
+
145
157
  Returns:
146
158
  Text content
147
159
  """
148
160
  # For text elements, keep_blank_chars doesn't affect anything as we're
149
161
  # simply returning the text property. Included for API consistency.
150
162
  return self.text
151
-
163
+
152
164
  def contains(self, substring: str, case_sensitive: bool = True) -> bool:
153
165
  """
154
166
  Check if this text element contains a substring.
155
-
167
+
156
168
  Args:
157
169
  substring: The substring to check for
158
170
  case_sensitive: Whether the check is case-sensitive
159
-
171
+
160
172
  Returns:
161
173
  True if the text contains the substring
162
174
  """
@@ -164,25 +176,26 @@ class TextElement(Element):
164
176
  return substring in self.text
165
177
  else:
166
178
  return substring.lower() in self.text.lower()
167
-
179
+
168
180
  def matches(self, pattern: str) -> bool:
169
181
  """
170
182
  Check if this text element matches a regular expression pattern.
171
-
183
+
172
184
  Args:
173
185
  pattern: Regular expression pattern
174
-
186
+
175
187
  Returns:
176
188
  True if the text matches the pattern
177
189
  """
178
190
  import re
191
+
179
192
  return bool(re.search(pattern, self.text))
180
-
193
+
181
194
  @property
182
195
  def bold(self) -> bool:
183
196
  """
184
197
  Check if the text is bold based on multiple indicators in the PDF.
185
-
198
+
186
199
  PDFs encode boldness in several ways:
187
200
  1. Font name containing 'bold' or 'black'
188
201
  2. Font descriptor flags (bit 2 indicates bold)
@@ -192,43 +205,43 @@ class TextElement(Element):
192
205
  """
193
206
  # Check font name (original method)
194
207
  fontname = self.fontname.lower()
195
- if 'bold' in fontname or 'black' in fontname or self.fontname.endswith('-B'):
208
+ if "bold" in fontname or "black" in fontname or self.fontname.endswith("-B"):
196
209
  return True
197
-
210
+
198
211
  # Check font descriptor flags if available (bit 2 = bold)
199
- flags = self._obj.get('flags')
212
+ flags = self._obj.get("flags")
200
213
  if flags is not None and (flags & 4) != 0: # Check if bit 2 is set
201
214
  return True
202
-
215
+
203
216
  # Check StemV (vertical stem width) if available
204
217
  # Higher StemV values indicate bolder fonts
205
- stemv = self._obj.get('stemv') or self._obj.get('StemV')
218
+ stemv = self._obj.get("stemv") or self._obj.get("StemV")
206
219
  if stemv is not None and isinstance(stemv, (int, float)) and stemv > 120:
207
220
  return True
208
-
221
+
209
222
  # Check font weight if available (700+ is typically bold)
210
- weight = self._obj.get('weight') or self._obj.get('FontWeight')
223
+ weight = self._obj.get("weight") or self._obj.get("FontWeight")
211
224
  if weight is not None and isinstance(weight, (int, float)) and weight >= 700:
212
225
  return True
213
-
226
+
214
227
  # Check text rendering mode (mode 2 = fill and stroke, can make text appear bold)
215
- render_mode = self._obj.get('render_mode')
228
+ render_mode = self._obj.get("render_mode")
216
229
  if render_mode is not None and render_mode == 2:
217
230
  return True
218
-
231
+
219
232
  # Additional check: if we have text with the same font but different paths/strokes
220
233
  # Path widths or stroke widths can indicate boldness
221
- stroke_width = self._obj.get('stroke_width') or self._obj.get('lineWidth')
234
+ stroke_width = self._obj.get("stroke_width") or self._obj.get("lineWidth")
222
235
  if stroke_width is not None and isinstance(stroke_width, (int, float)) and stroke_width > 0:
223
236
  return True
224
-
237
+
225
238
  return False
226
-
239
+
227
240
  @property
228
241
  def italic(self) -> bool:
229
242
  """
230
243
  Check if the text is italic based on multiple indicators in the PDF.
231
-
244
+
232
245
  PDFs encode italic (oblique) text in several ways:
233
246
  1. Font name containing 'italic' or 'oblique'
234
247
  2. Font descriptor flags (bit 6 indicates italic)
@@ -236,69 +249,79 @@ class TextElement(Element):
236
249
  """
237
250
  # Check font name (original method)
238
251
  fontname = self.fontname.lower()
239
- if 'italic' in fontname or 'oblique' in fontname or self.fontname.endswith('-I'):
252
+ if "italic" in fontname or "oblique" in fontname or self.fontname.endswith("-I"):
240
253
  return True
241
-
254
+
242
255
  # Check font descriptor flags if available (bit 6 = italic)
243
- flags = self._obj.get('flags')
256
+ flags = self._obj.get("flags")
244
257
  if flags is not None and (flags & 64) != 0: # Check if bit 6 is set
245
258
  return True
246
-
259
+
247
260
  # Check italic angle if available
248
261
  # Non-zero italic angle indicates italic font
249
- italic_angle = self._obj.get('italic_angle') or self._obj.get('ItalicAngle')
250
- if italic_angle is not None and isinstance(italic_angle, (int, float)) and italic_angle != 0:
262
+ italic_angle = self._obj.get("italic_angle") or self._obj.get("ItalicAngle")
263
+ if (
264
+ italic_angle is not None
265
+ and isinstance(italic_angle, (int, float))
266
+ and italic_angle != 0
267
+ ):
251
268
  return True
252
-
269
+
253
270
  return False
254
-
271
+
255
272
  def __repr__(self) -> str:
256
273
  """String representation of the text element."""
257
- preview = self.text[:10] + '...' if len(self.text) > 10 else self.text
274
+ preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
258
275
  font_style = []
259
276
  if self.bold:
260
277
  font_style.append("bold")
261
278
  if self.italic:
262
279
  font_style.append("italic")
263
280
  style_str = f", style={font_style}" if font_style else ""
264
-
281
+
265
282
  # Use font_family for display but include raw fontname and variant
266
283
  font_display = self.font_family
267
284
  variant = self.font_variant
268
285
  variant_str = f", variant='{variant}'" if variant else ""
269
-
270
- if font_display != self.fontname and '+' in self.fontname:
271
- base_font = self.fontname.split('+', 1)[1]
286
+
287
+ if font_display != self.fontname and "+" in self.fontname:
288
+ base_font = self.fontname.split("+", 1)[1]
272
289
  font_display = f"{font_display} ({base_font})"
273
-
290
+
274
291
  return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str} bbox={self.bbox}>"
275
-
292
+
276
293
  def font_info(self) -> dict:
277
294
  """
278
295
  Get detailed font information for this text element.
279
-
296
+
280
297
  Returns a dictionary with all available font-related properties,
281
298
  useful for debugging font detection issues.
282
299
  """
283
300
  info = {
284
- 'text': self.text,
285
- 'fontname': self.fontname,
286
- 'font_family': self.font_family,
287
- 'font_variant': self.font_variant,
288
- 'size': self.size,
289
- 'bold': self.bold,
290
- 'italic': self.italic,
291
- 'color': self.color
301
+ "text": self.text,
302
+ "fontname": self.fontname,
303
+ "font_family": self.font_family,
304
+ "font_variant": self.font_variant,
305
+ "size": self.size,
306
+ "bold": self.bold,
307
+ "italic": self.italic,
308
+ "color": self.color,
292
309
  }
293
-
310
+
294
311
  # Include raw font properties from the PDF
295
312
  font_props = [
296
- 'flags', 'stemv', 'StemV', 'weight', 'FontWeight',
297
- 'render_mode', 'stroke_width', 'lineWidth'
313
+ "flags",
314
+ "stemv",
315
+ "StemV",
316
+ "weight",
317
+ "FontWeight",
318
+ "render_mode",
319
+ "stroke_width",
320
+ "lineWidth",
298
321
  ]
299
-
322
+
300
323
  for prop in font_props:
301
324
  if prop in self._obj:
302
325
  info[f"raw_{prop}"] = self._obj[prop]
303
-
304
- return info
326
+
327
+ return info
@@ -1 +0,0 @@
1
-