natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,8 @@
1
1
  """
2
2
  Text element classes for natural-pdf.
3
3
  """
4
- from typing import Dict, Any, Optional, TYPE_CHECKING
4
+
5
+ from typing import TYPE_CHECKING, Any, Dict, Optional
5
6
 
6
7
  from natural_pdf.elements.base import Element
7
8
 
@@ -12,117 +13,133 @@ if TYPE_CHECKING:
12
13
  class TextElement(Element):
13
14
  """
14
15
  Represents a text element in a PDF.
15
-
16
+
16
17
  This class is a wrapper around pdfplumber's character objects,
17
18
  providing additional functionality for text extraction and analysis.
18
19
  """
19
-
20
- def __init__(self, obj: Dict[str, Any], page: 'Page'):
20
+
21
+ def __init__(self, obj: Dict[str, Any], page: "Page"):
21
22
  """
22
23
  Initialize a text element.
23
-
24
+
24
25
  Args:
25
26
  obj: The underlying pdfplumber object. For OCR text elements,
26
27
  should include 'text', 'bbox', 'source', and 'confidence'
27
28
  page: The parent Page object
28
29
  """
29
30
  # Add object_type if not present
30
- if 'object_type' not in obj:
31
- obj['object_type'] = 'text'
32
-
31
+ if "object_type" not in obj:
32
+ obj["object_type"] = "text"
33
+
33
34
  super().__init__(obj, page)
34
-
35
+ # Explicitly store constituent characters if provided
36
+ # (Pop from obj to avoid storing it twice if super() stores _obj by ref)
37
+ self._char_dicts = obj.pop("_char_dicts", [])
38
+
35
39
  @property
36
40
  def text(self) -> str:
37
41
  """Get the text content."""
38
- return self._obj.get('text', '')
39
-
42
+ return self._obj.get("text", "")
43
+
44
+ @text.setter
45
+ def text(self, value: str):
46
+ """Set the text content."""
47
+ self._obj["text"] = value
48
+
40
49
  @property
41
50
  def source(self) -> str:
42
51
  """Get the source of this text element (pdf or ocr)."""
43
- return self._obj.get('source', 'pdf')
44
-
52
+ return self._obj.get("source", "pdf")
53
+
45
54
  @property
46
55
  def confidence(self) -> float:
47
56
  """Get the confidence score for OCR text elements."""
48
- return self._obj.get('confidence', 1.0)
49
-
57
+ return self._obj.get("confidence", 1.0)
58
+
50
59
  @property
51
60
  def fontname(self) -> str:
52
61
  """Get the font name."""
53
62
  # First check if we have a real fontname from PDF resources
54
- if 'real_fontname' in self._obj:
55
- return self._obj['real_fontname']
63
+ if "real_fontname" in self._obj:
64
+ return self._obj["real_fontname"]
56
65
  # Otherwise use standard fontname
57
- return self._obj.get('fontname', '') or self._obj.get('font', '')
58
-
66
+ return self._obj.get("fontname", "") or self._obj.get("font", "")
67
+
59
68
  @property
60
69
  def font_family(self) -> str:
61
70
  """
62
71
  Get a cleaner font family name by stripping PDF-specific prefixes.
63
-
72
+
64
73
  PDF font names often include prefixes like 'ABCDEF+' followed by the font name
65
74
  or unique identifiers. This method attempts to extract a more readable font name.
66
75
  """
67
76
  font = self.fontname
68
-
77
+
69
78
  # Remove common PDF font prefixes (e.g., 'ABCDEF+')
70
- if '+' in font:
71
- font = font.split('+', 1)[1]
72
-
79
+ if "+" in font:
80
+ font = font.split("+", 1)[1]
81
+
73
82
  # Try to extract common font family names
74
83
  common_fonts = [
75
- 'Arial', 'Helvetica', 'Times', 'Courier', 'Calibri',
76
- 'Cambria', 'Georgia', 'Verdana', 'Tahoma', 'Trebuchet'
84
+ "Arial",
85
+ "Helvetica",
86
+ "Times",
87
+ "Courier",
88
+ "Calibri",
89
+ "Cambria",
90
+ "Georgia",
91
+ "Verdana",
92
+ "Tahoma",
93
+ "Trebuchet",
77
94
  ]
78
-
95
+
79
96
  for common in common_fonts:
80
97
  if common.lower() in font.lower():
81
98
  return common
82
-
99
+
83
100
  return font
84
-
101
+
85
102
  @property
86
103
  def font_variant(self) -> str:
87
104
  """
88
105
  Get the font variant identifier (prefix before the '+' in PDF font names).
89
-
106
+
90
107
  PDF embeds font subsets with unique identifiers like 'AAAAAB+FontName'.
91
108
  Different variants of the same base font will have different prefixes.
92
- This can be used to differentiate text that looks different despite
109
+ This can be used to differentiate text that looks different despite
93
110
  having the same font name and size.
94
-
111
+
95
112
  Returns:
96
113
  The font variant prefix, or empty string if no variant is present
97
114
  """
98
115
  font = self.fontname
99
-
116
+
100
117
  # Extract the prefix before '+' if it exists
101
- if '+' in font:
102
- return font.split('+', 1)[0]
103
-
118
+ if "+" in font:
119
+ return font.split("+", 1)[0]
120
+
104
121
  return ""
105
-
122
+
106
123
  @property
107
124
  def size(self) -> float:
108
125
  """Get the font size."""
109
- return self._obj.get('size', 0)
110
-
126
+ return self._obj.get("size", 0)
127
+
111
128
  @property
112
129
  def color(self) -> tuple:
113
130
  """Get the text color (RGB tuple)."""
114
131
  # PDFs often use non-RGB values, so we handle different formats
115
132
  # In pdfplumber, colors can be in various formats depending on the PDF
116
- color = self._obj.get('non_stroking_color', (0, 0, 0))
117
-
133
+ color = self._obj.get("non_stroking_color", (0, 0, 0))
134
+
118
135
  # If it's a single value, treat as grayscale
119
136
  if isinstance(color, (int, float)):
120
137
  return (color, color, color)
121
-
138
+
122
139
  # If it's a tuple of 3 values, treat as RGB
123
140
  if isinstance(color, tuple) and len(color) == 3:
124
141
  return color
125
-
142
+
126
143
  # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
127
144
  if isinstance(color, tuple) and len(color) == 4:
128
145
  c, m, y, k = color
@@ -130,33 +147,33 @@ class TextElement(Element):
130
147
  g = 1 - min(1, m + k)
131
148
  b = 1 - min(1, y + k)
132
149
  return (r, g, b)
133
-
150
+
134
151
  # Default to black
135
152
  return (0, 0, 0)
136
-
153
+
137
154
  def extract_text(self, keep_blank_chars=True, **kwargs) -> str:
138
155
  """
139
156
  Extract text from this element.
140
-
157
+
141
158
  Args:
142
159
  keep_blank_chars: Whether to keep blank characters (default: True)
143
160
  **kwargs: Additional extraction parameters
144
-
161
+
145
162
  Returns:
146
163
  Text content
147
164
  """
148
165
  # For text elements, keep_blank_chars doesn't affect anything as we're
149
166
  # simply returning the text property. Included for API consistency.
150
167
  return self.text
151
-
168
+
152
169
  def contains(self, substring: str, case_sensitive: bool = True) -> bool:
153
170
  """
154
171
  Check if this text element contains a substring.
155
-
172
+
156
173
  Args:
157
174
  substring: The substring to check for
158
175
  case_sensitive: Whether the check is case-sensitive
159
-
176
+
160
177
  Returns:
161
178
  True if the text contains the substring
162
179
  """
@@ -164,25 +181,26 @@ class TextElement(Element):
164
181
  return substring in self.text
165
182
  else:
166
183
  return substring.lower() in self.text.lower()
167
-
184
+
168
185
  def matches(self, pattern: str) -> bool:
169
186
  """
170
187
  Check if this text element matches a regular expression pattern.
171
-
188
+
172
189
  Args:
173
190
  pattern: Regular expression pattern
174
-
191
+
175
192
  Returns:
176
193
  True if the text matches the pattern
177
194
  """
178
195
  import re
196
+
179
197
  return bool(re.search(pattern, self.text))
180
-
198
+
181
199
  @property
182
200
  def bold(self) -> bool:
183
201
  """
184
202
  Check if the text is bold based on multiple indicators in the PDF.
185
-
203
+
186
204
  PDFs encode boldness in several ways:
187
205
  1. Font name containing 'bold' or 'black'
188
206
  2. Font descriptor flags (bit 2 indicates bold)
@@ -192,43 +210,43 @@ class TextElement(Element):
192
210
  """
193
211
  # Check font name (original method)
194
212
  fontname = self.fontname.lower()
195
- if 'bold' in fontname or 'black' in fontname or self.fontname.endswith('-B'):
213
+ if "bold" in fontname or "black" in fontname or self.fontname.endswith("-B"):
196
214
  return True
197
-
215
+
198
216
  # Check font descriptor flags if available (bit 2 = bold)
199
- flags = self._obj.get('flags')
217
+ flags = self._obj.get("flags")
200
218
  if flags is not None and (flags & 4) != 0: # Check if bit 2 is set
201
219
  return True
202
-
220
+
203
221
  # Check StemV (vertical stem width) if available
204
222
  # Higher StemV values indicate bolder fonts
205
- stemv = self._obj.get('stemv') or self._obj.get('StemV')
223
+ stemv = self._obj.get("stemv") or self._obj.get("StemV")
206
224
  if stemv is not None and isinstance(stemv, (int, float)) and stemv > 120:
207
225
  return True
208
-
226
+
209
227
  # Check font weight if available (700+ is typically bold)
210
- weight = self._obj.get('weight') or self._obj.get('FontWeight')
228
+ weight = self._obj.get("weight") or self._obj.get("FontWeight")
211
229
  if weight is not None and isinstance(weight, (int, float)) and weight >= 700:
212
230
  return True
213
-
231
+
214
232
  # Check text rendering mode (mode 2 = fill and stroke, can make text appear bold)
215
- render_mode = self._obj.get('render_mode')
233
+ render_mode = self._obj.get("render_mode")
216
234
  if render_mode is not None and render_mode == 2:
217
235
  return True
218
-
236
+
219
237
  # Additional check: if we have text with the same font but different paths/strokes
220
238
  # Path widths or stroke widths can indicate boldness
221
- stroke_width = self._obj.get('stroke_width') or self._obj.get('lineWidth')
239
+ stroke_width = self._obj.get("stroke_width") or self._obj.get("lineWidth")
222
240
  if stroke_width is not None and isinstance(stroke_width, (int, float)) and stroke_width > 0:
223
241
  return True
224
-
242
+
225
243
  return False
226
-
244
+
227
245
  @property
228
246
  def italic(self) -> bool:
229
247
  """
230
248
  Check if the text is italic based on multiple indicators in the PDF.
231
-
249
+
232
250
  PDFs encode italic (oblique) text in several ways:
233
251
  1. Font name containing 'italic' or 'oblique'
234
252
  2. Font descriptor flags (bit 6 indicates italic)
@@ -236,69 +254,79 @@ class TextElement(Element):
236
254
  """
237
255
  # Check font name (original method)
238
256
  fontname = self.fontname.lower()
239
- if 'italic' in fontname or 'oblique' in fontname or self.fontname.endswith('-I'):
257
+ if "italic" in fontname or "oblique" in fontname or self.fontname.endswith("-I"):
240
258
  return True
241
-
259
+
242
260
  # Check font descriptor flags if available (bit 6 = italic)
243
- flags = self._obj.get('flags')
261
+ flags = self._obj.get("flags")
244
262
  if flags is not None and (flags & 64) != 0: # Check if bit 6 is set
245
263
  return True
246
-
264
+
247
265
  # Check italic angle if available
248
266
  # Non-zero italic angle indicates italic font
249
- italic_angle = self._obj.get('italic_angle') or self._obj.get('ItalicAngle')
250
- if italic_angle is not None and isinstance(italic_angle, (int, float)) and italic_angle != 0:
267
+ italic_angle = self._obj.get("italic_angle") or self._obj.get("ItalicAngle")
268
+ if (
269
+ italic_angle is not None
270
+ and isinstance(italic_angle, (int, float))
271
+ and italic_angle != 0
272
+ ):
251
273
  return True
252
-
274
+
253
275
  return False
254
-
276
+
255
277
  def __repr__(self) -> str:
256
278
  """String representation of the text element."""
257
- preview = self.text[:10] + '...' if len(self.text) > 10 else self.text
279
+ preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
258
280
  font_style = []
259
281
  if self.bold:
260
282
  font_style.append("bold")
261
283
  if self.italic:
262
284
  font_style.append("italic")
263
285
  style_str = f", style={font_style}" if font_style else ""
264
-
286
+
265
287
  # Use font_family for display but include raw fontname and variant
266
288
  font_display = self.font_family
267
289
  variant = self.font_variant
268
290
  variant_str = f", variant='{variant}'" if variant else ""
269
-
270
- if font_display != self.fontname and '+' in self.fontname:
271
- base_font = self.fontname.split('+', 1)[1]
291
+
292
+ if font_display != self.fontname and "+" in self.fontname:
293
+ base_font = self.fontname.split("+", 1)[1]
272
294
  font_display = f"{font_display} ({base_font})"
273
-
295
+
274
296
  return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str} bbox={self.bbox}>"
275
-
297
+
276
298
  def font_info(self) -> dict:
277
299
  """
278
300
  Get detailed font information for this text element.
279
-
301
+
280
302
  Returns a dictionary with all available font-related properties,
281
303
  useful for debugging font detection issues.
282
304
  """
283
305
  info = {
284
- 'text': self.text,
285
- 'fontname': self.fontname,
286
- 'font_family': self.font_family,
287
- 'font_variant': self.font_variant,
288
- 'size': self.size,
289
- 'bold': self.bold,
290
- 'italic': self.italic,
291
- 'color': self.color
306
+ "text": self.text,
307
+ "fontname": self.fontname,
308
+ "font_family": self.font_family,
309
+ "font_variant": self.font_variant,
310
+ "size": self.size,
311
+ "bold": self.bold,
312
+ "italic": self.italic,
313
+ "color": self.color,
292
314
  }
293
-
315
+
294
316
  # Include raw font properties from the PDF
295
317
  font_props = [
296
- 'flags', 'stemv', 'StemV', 'weight', 'FontWeight',
297
- 'render_mode', 'stroke_width', 'lineWidth'
318
+ "flags",
319
+ "stemv",
320
+ "StemV",
321
+ "weight",
322
+ "FontWeight",
323
+ "render_mode",
324
+ "stroke_width",
325
+ "lineWidth",
298
326
  ]
299
-
327
+
300
328
  for prop in font_props:
301
329
  if prop in self._obj:
302
330
  info[f"raw_{prop}"] = self._obj[prop]
303
-
304
- return info
331
+
332
+ return info
@@ -1 +0,0 @@
1
-