natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +751 -607
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +120 -23
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.35.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -10,10 +10,11 @@ This script compares memory usage before and after the optimization by:
10
10
 
11
11
  import gc
12
12
  import os
13
- import psutil
14
13
  import sys
15
14
  from pathlib import Path
16
15
 
16
+ import psutil
17
+
17
18
  import natural_pdf as npdf
18
19
 
19
20
 
@@ -22,9 +23,9 @@ def get_detailed_memory_info():
22
23
  process = psutil.Process()
23
24
  memory_info = process.memory_info()
24
25
  return {
25
- 'rss_mb': memory_info.rss / 1024 / 1024,
26
- 'vms_mb': memory_info.vms / 1024 / 1024,
27
- 'python_objects': len(gc.get_objects())
26
+ "rss_mb": memory_info.rss / 1024 / 1024,
27
+ "vms_mb": memory_info.vms / 1024 / 1024,
28
+ "python_objects": len(gc.get_objects()),
28
29
  }
29
30
 
30
31
 
@@ -32,77 +33,83 @@ def analyze_character_storage(page):
32
33
  """Analyze how characters are stored in the page"""
33
34
  # Force element loading
34
35
  text_elements = page.find_all("text")
35
-
36
+
36
37
  total_char_indices = 0
37
38
  total_char_dicts = 0
38
39
  total_chars_in_words = 0
39
40
  memory_efficient_words = 0
40
41
  legacy_words = 0
41
-
42
+
42
43
  for element in text_elements:
43
- if hasattr(element, '_char_indices') and element._char_indices:
44
+ if hasattr(element, "_char_indices") and element._char_indices:
44
45
  memory_efficient_words += 1
45
46
  total_char_indices += len(element._char_indices)
46
47
  total_chars_in_words += len(element._char_indices)
47
-
48
- if hasattr(element, '_char_dicts') and element._char_dicts:
48
+
49
+ if hasattr(element, "_char_dicts") and element._char_dicts:
49
50
  total_char_dicts += len(element._char_dicts)
50
- if not (hasattr(element, '_char_indices') and element._char_indices):
51
+ if not (hasattr(element, "_char_indices") and element._char_indices):
51
52
  legacy_words += 1
52
53
  total_chars_in_words += len(element._char_dicts)
53
-
54
+
54
55
  # Get individual character elements
55
56
  char_elements = []
56
- if hasattr(page, '_element_mgr'):
57
- char_elements = page._element_mgr.get_elements('chars')
58
-
57
+ if hasattr(page, "_element_mgr"):
58
+ char_elements = page._element_mgr.get_elements("chars")
59
+
59
60
  return {
60
- 'total_words': len(text_elements),
61
- 'memory_efficient_words': memory_efficient_words,
62
- 'legacy_words': legacy_words,
63
- 'total_char_elements': len(char_elements),
64
- 'total_char_indices': total_char_indices,
65
- 'total_char_dicts': total_char_dicts,
66
- 'total_chars_in_words': total_chars_in_words,
67
- 'estimated_duplication_ratio': total_char_dicts / max(len(char_elements), 1)
61
+ "total_words": len(text_elements),
62
+ "memory_efficient_words": memory_efficient_words,
63
+ "legacy_words": legacy_words,
64
+ "total_char_elements": len(char_elements),
65
+ "total_char_indices": total_char_indices,
66
+ "total_char_dicts": total_char_dicts,
67
+ "total_chars_in_words": total_chars_in_words,
68
+ "estimated_duplication_ratio": total_char_dicts / max(len(char_elements), 1),
68
69
  }
69
70
 
70
71
 
71
72
  def test_memory_optimization():
72
73
  """Test the memory optimization with a real PDF"""
73
-
74
+
74
75
  # Test with the practice PDF
75
76
  test_pdf = Path("pdfs/01-practice.pdf")
76
77
  if not test_pdf.exists():
77
78
  print(f"Test PDF not found: {test_pdf}")
78
79
  return
79
-
80
+
80
81
  print("=" * 60)
81
82
  print("MEMORY OPTIMIZATION ANALYSIS")
82
83
  print("=" * 60)
83
-
84
+
84
85
  # Baseline memory
85
86
  gc.collect()
86
87
  baseline_memory = get_detailed_memory_info()
87
- print(f"Baseline memory: {baseline_memory['rss_mb']:.2f} MB RSS, {baseline_memory['python_objects']:,} objects")
88
-
88
+ print(
89
+ f"Baseline memory: {baseline_memory['rss_mb']:.2f} MB RSS, {baseline_memory['python_objects']:,} objects"
90
+ )
91
+
89
92
  # Load PDF
90
93
  pdf = npdf.PDF(str(test_pdf))
91
94
  page = pdf.pages[0]
92
-
95
+
93
96
  post_load_memory = get_detailed_memory_info()
94
- print(f"After PDF load: {post_load_memory['rss_mb']:.2f} MB RSS, {post_load_memory['python_objects']:,} objects")
95
-
97
+ print(
98
+ f"After PDF load: {post_load_memory['rss_mb']:.2f} MB RSS, {post_load_memory['python_objects']:,} objects"
99
+ )
100
+
96
101
  # Analyze character storage
97
102
  storage_analysis = analyze_character_storage(page)
98
-
103
+
99
104
  final_memory = get_detailed_memory_info()
100
- print(f"After element load: {final_memory['rss_mb']:.2f} MB RSS, {final_memory['python_objects']:,} objects")
101
-
105
+ print(
106
+ f"After element load: {final_memory['rss_mb']:.2f} MB RSS, {final_memory['python_objects']:,} objects"
107
+ )
108
+
102
109
  print("\n" + "=" * 40)
103
110
  print("CHARACTER STORAGE ANALYSIS")
104
111
  print("=" * 40)
105
-
112
+
106
113
  print(f"Total words: {storage_analysis['total_words']}")
107
114
  print(f"Memory-efficient words: {storage_analysis['memory_efficient_words']}")
108
115
  print(f"Legacy words: {storage_analysis['legacy_words']}")
@@ -110,63 +117,71 @@ def test_memory_optimization():
110
117
  print(f"Character indices used: {storage_analysis['total_char_indices']}")
111
118
  print(f"Character dicts stored: {storage_analysis['total_char_dicts']}")
112
119
  print(f"Characters referenced by words: {storage_analysis['total_chars_in_words']}")
113
-
120
+
114
121
  # Calculate optimization metrics
115
- duplication_ratio = storage_analysis['estimated_duplication_ratio']
116
- optimization_percentage = storage_analysis['memory_efficient_words'] / max(storage_analysis['total_words'], 1) * 100
117
-
122
+ duplication_ratio = storage_analysis["estimated_duplication_ratio"]
123
+ optimization_percentage = (
124
+ storage_analysis["memory_efficient_words"] / max(storage_analysis["total_words"], 1) * 100
125
+ )
126
+
118
127
  print(f"\nOptimization metrics:")
119
128
  print(f"- Duplication ratio: {duplication_ratio:.2f}x")
120
129
  print(f"- Words using optimization: {optimization_percentage:.1f}%")
121
-
130
+
122
131
  # Memory savings estimation
123
- memory_used = final_memory['rss_mb'] - baseline_memory['rss_mb']
124
- chars_total = storage_analysis['total_char_elements']
125
-
132
+ memory_used = final_memory["rss_mb"] - baseline_memory["rss_mb"]
133
+ chars_total = storage_analysis["total_char_elements"]
134
+
126
135
  if chars_total > 0:
127
136
  memory_per_char = memory_used / chars_total * 1024 # KB per char
128
137
  print(f"- Memory per character: {memory_per_char:.2f} KB")
129
-
138
+
130
139
  # Estimate savings from eliminating _char_dicts duplication
131
- duplicated_chars = storage_analysis['total_char_dicts']
140
+ duplicated_chars = storage_analysis["total_char_dicts"]
132
141
  if duplicated_chars > 0:
133
142
  estimated_wasted_memory = duplicated_chars * memory_per_char / 1024 # MB
134
143
  print(f"- Estimated memory saved by optimization: {estimated_wasted_memory:.2f} MB")
135
- print(f"- Memory efficiency improvement: {estimated_wasted_memory / memory_used * 100:.1f}%")
136
-
144
+ print(
145
+ f"- Memory efficiency improvement: {estimated_wasted_memory / memory_used * 100:.1f}%"
146
+ )
147
+
137
148
  print(f"\nTotal memory used for page processing: {memory_used:.2f} MB")
138
-
149
+
139
150
  # Test functionality
140
151
  print("\n" + "=" * 40)
141
152
  print("FUNCTIONALITY VERIFICATION")
142
153
  print("=" * 40)
143
-
154
+
144
155
  # Test character access
145
156
  test_elements = page.find_all("text")[:3]
146
157
  for i, element in enumerate(test_elements):
147
158
  print(f"\nWord {i+1}: '{element.text[:30]}{'...' if len(element.text) > 30 else ''}'")
148
-
149
- if hasattr(element, '_char_indices') and element._char_indices:
159
+
160
+ if hasattr(element, "_char_indices") and element._char_indices:
150
161
  chars = element.chars
151
- print(f" - Uses character indices: {len(element._char_indices)} indices -> {len(chars)} chars")
162
+ print(
163
+ f" - Uses character indices: {len(element._char_indices)} indices -> {len(chars)} chars"
164
+ )
152
165
  print(f" - Memory optimization: ACTIVE")
153
-
166
+
154
167
  # Verify character access works
155
168
  if chars:
156
169
  first_char = chars[0]
157
- print(f" - First char: '{first_char.text}' at ({first_char.x0:.1f}, {first_char.top:.1f})")
158
-
159
- elif hasattr(element, '_char_dicts') and element._char_dicts:
170
+ print(
171
+ f" - First char: '{first_char.text}' at ({first_char.x0:.1f}, {first_char.top:.1f})"
172
+ )
173
+
174
+ elif hasattr(element, "_char_dicts") and element._char_dicts:
160
175
  print(f" - Uses character dicts: {len(element._char_dicts)} dicts")
161
176
  print(f" - Memory optimization: LEGACY MODE")
162
-
177
+
163
178
  else:
164
179
  print(f" - No character data available")
165
-
180
+
166
181
  print("\n" + "=" * 60)
167
182
  print("✅ MEMORY OPTIMIZATION ANALYSIS COMPLETE")
168
183
  print("=" * 60)
169
184
 
170
185
 
171
186
  if __name__ == "__main__":
172
- test_memory_optimization()
187
+ test_memory_optimization()