natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,156 @@
1
+ """
2
+ Example demonstrating how to use the until parameter with above() and below() methods.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
+
15
def _resolve_pdf_path():
    """Return the PDF path to use for the demonstration.

    The first command-line argument wins. Otherwise the first ``*.pdf`` found
    next to this script is used, then the first one in the sibling ``pdfs``
    directory. Exits with status 1 when no PDF can be found.
    """
    if len(sys.argv) > 1:
        return sys.argv[1]

    # Look for any PDF in the examples directory or pdfs directory
    example_dir = Path(__file__).parent
    pdf_files = list(example_dir.glob("*.pdf"))

    if not pdf_files:
        pdfs_dir = example_dir.parent / "pdfs"
        if pdfs_dir.exists():
            pdf_files = list(pdfs_dir.glob("*.pdf"))

    if not pdf_files:
        print("No PDF file found. Please provide a path to a PDF file.")
        sys.exit(1)

    return str(pdf_files[0])


def _preview(text, limit=100):
    """Return *text*, cut to *limit* characters plus "..." when it is longer."""
    return text[:limit] + "..." if len(text) > limit else text


def _describe(element):
    """Return the element's text, or 'non-text' when it has no ``text`` attribute."""
    return element.text if hasattr(element, 'text') else 'non-text'


def _demo_with_generic_elements(page):
    """Run the below()/above() demo using three arbitrary elements of *page*."""
    elements = page.get_elements()
    elements.sort(key=lambda e: (e.top, e.x0))  # Sort in reading order

    if len(elements) < 3:
        print("Not enough elements found for demonstration")
        return

    element1 = elements[0]
    element2 = elements[len(elements) // 3]  # About 1/3 down
    element3 = elements[len(elements) // 2]  # About halfway down

    # Highlight the reference elements
    element1.highlight(label="First Element")
    element2.highlight(label="Second Element")
    element3.highlight(label="Third Element")

    print(f"First element: '{_describe(element1)}' at y={element1.top}")
    print(f"Second element: '{_describe(element2)}' at y={element2.top}")
    print(f"Third element: '{_describe(element3)}' at y={element3.top}")

    # NOTE(review): the selectors below read `.text` unconditionally, so they
    # assume the chosen elements are text elements, and that the text has no
    # double quotes that would break the selector -- confirm.

    # Demonstrate below() with until parameter
    print("\nDemonstrating below() with until parameter")

    # Get the region from element1 to element2
    region1 = element1.below(until=f'text:contains("{element2.text}")')
    region1.highlight(label="Below until Second Element")

    # Get the region from element2 to element3, excluding element3
    region2 = element2.below(until=f'text:contains("{element3.text}")', include_until=False)
    region2.highlight(label="Below until Third Element (excluded)")

    # Demonstrate above() with until parameter
    print("\nDemonstrating above() with until parameter")

    # Get the region from element3 up to element2
    region3 = element3.above(until=f'text:contains("{element2.text}")')
    region3.highlight(label="Above until Second Element")

    # Get the region from element2 up to element1, excluding element1
    region4 = element2.above(until=f'text:contains("{element1.text}")', include_until=False)
    region4.highlight(label="Above until First Element (excluded)")

    # Create an output directory
    output_dir = Path(__file__).parent / "until_output"
    output_dir.mkdir(exist_ok=True)

    # Save the result
    # NOTE(review): the heading demo saves through page.to_image(path=...,
    # show_labels=True); confirm page.save(..., labels=True) is still supported.
    page.save(str(output_dir / "until_boundaries.png"), labels=True)

    # Print the contents of the regions
    print("\nContent in region 'below until second element':")
    print(_preview(region1.extract_text()))

    print("\nContent in region 'above until second element':")
    print(_preview(region3.extract_text()))

    print("\nExample completed. Check 'until_output/until_boundaries.png' for the result.")


def _demo_with_headings(page, heading1, heading2):
    """Run the below() demo between two heading elements of *page*."""
    # Highlight the headings
    heading1.highlight(label="First Heading")
    heading2.highlight(label="Second Heading")

    print(f"First heading: '{heading1.text}' at y={heading1.top}")
    print(f"Second heading: '{heading2.text}' at y={heading2.top}")

    # Demonstrate below() with until parameter
    print("\nDemonstrating below() with until parameter")

    # Get the region from heading1 to heading2
    region1 = heading1.below(until=f'text:contains("{heading2.text}")')
    region1.highlight(label="Below until Second Heading")

    # Get the region from heading1 to heading2, excluding heading2
    region2 = heading1.below(until=f'text:contains("{heading2.text}")', include_until=False)
    region2.highlight(label="Below until Second Heading (excluded)")

    # Create an output directory
    output_dir = Path(__file__).parent / "until_output"
    output_dir.mkdir(exist_ok=True)

    # Save the result
    page.to_image(path=str(output_dir / "until_boundaries_headings.png"), show_labels=True)

    # Print the contents of the regions
    print("\nContent in region 'below until second heading':")
    print(_preview(region1.extract_text()))

    print("\nContent in region 'below until second heading (excluded)':")
    print(_preview(region2.extract_text()))

    print("\nExample completed. Check 'until_output/until_boundaries_headings.png' for the result.")


def main():
    """Main entry point.

    Opens the PDF chosen by ``_resolve_pdf_path`` and demonstrates the
    ``until`` parameter on its first page. Text matching ``text[size>=12]``
    is used as headings; when fewer than two such elements exist, the demo
    falls back to arbitrary page elements.
    """
    pdf_path = _resolve_pdf_path()
    print(f"Using PDF: {pdf_path}")

    # Open the PDF (as a context manager, like the sibling until_example.py)
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Clear any existing highlights
        page.clear_highlights()

        # First, find some key elements on the page
        heading1 = page.find('text[size>=12]')
        if not heading1:
            # If no large headings, just use the first few elements as examples
            _demo_with_generic_elements(page)
            return

        # Find more headings
        headings = page.find_all('text[size>=12]')
        if len(headings) < 2:
            # Call the generic demo directly. Re-invoking main() here would
            # find the same single heading again and recurse without end.
            print("Not enough headings found. Using generic elements instead.")
            _demo_with_generic_elements(page)
            return

        # Use the first two headings
        _demo_with_headings(page, headings[0], headings[1])


if __name__ == "__main__":
    main()
@@ -0,0 +1,112 @@
1
+ """
2
+ Example demonstrating the 'until' feature of natural-pdf.
3
+ (This was previously named 'select_until')
4
+ """
5
+ import os
6
+ import sys
7
+
8
+ # Add the parent directory to the path to import the package
9
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
+ from natural_pdf import PDF
12
+
13
def until_example(pdf_path):
    """Demonstrates the 'until' method for defining content regions.

    Runs three demonstrations on the first page of the PDF and prints the
    results. Examples 1 and 2 need both a "Summary:" text element and a line
    matching ``line[width>=2]``; when either is missing they are skipped with
    a message instead of failing on a missing element.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    # Open the PDF
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")
        print(f"PDF has {len(pdf)} pages\n")

        # EXAMPLE 1: Select from "Summary:" until the thick line
        print("EXAMPLE 1: Select from Summary until thick line")
        print("----------------------------------------------")

        # Find the "Summary:" text
        summary = page.find('text:contains("Summary:")')
        if summary:
            print(f"Found 'Summary' text at: {summary.bbox}")
        else:
            print("Could not find 'Summary:' text")

        # Find the thick line
        thick_line = page.find('line[width>=2]')
        if thick_line:
            print(f"Found thick line at: {thick_line.bbox}")
        else:
            print("Could not find a thick line")

        # Both boundaries are required for Examples 1 and 2
        have_boundaries = bool(summary) and bool(thick_line)

        if have_boundaries:
            # Create a region from Summary until the thick line
            print("\nCreating region from 'Summary' until the thick line...")
            summary_region = summary.until('line[width>=2]', width="full")
            print(f"Region boundaries: {summary_region.bbox}")

            # Extract and display text from this region
            region_text = summary_region.extract_text()
            print("\nText from the region:")
            print("-" * 60)
            print(region_text)
            print("-" * 60)

            # Find all text elements in this region
            text_elements = summary_region.find_all('text')
            print(f"\nFound {len(text_elements)} text elements in the region")

            # Display the first 5 elements
            if text_elements:
                print("First 5 elements:")
                for i, el in enumerate(text_elements[:5]):
                    print(f" {i+1}. '{el.text}'")

        # EXAMPLE 2: Demonstrate include_endpoint=False option
        print("\nEXAMPLE 2: Without including endpoint element")
        print("----------------------------------------------")

        if have_boundaries:
            # Create a region from Summary until the thick line, excluding the line
            exclusive_region = summary.until('line[width>=2]', include_endpoint=False, width="full")
            print(f"Region boundaries: {exclusive_region.bbox}")

            # Compare text length (the inclusive text was extracted in Example 1)
            exclusive_text = exclusive_region.extract_text()

            print(f"\nWith include_endpoint=True: {len(region_text)} characters")
            print(f"With include_endpoint=False: {len(exclusive_text)} characters")
        else:
            print("Skipped: requires both the 'Summary:' text and a thick line")

        # EXAMPLE 3: Different elements for until
        print("\nEXAMPLE 3: Select from one text to another")
        print("----------------------------------------------")

        # Find text elements to use as boundaries
        heading = page.find('text:contains("Violations")')
        if heading:
            # Select from "Violations" to "Critical"
            target_word = page.find('text:contains("Critical")')
            if target_word:
                region = heading.until('text:contains("Critical")', width="full")
                print(f"\nRegion from 'Violations' to 'Critical': {region.bbox}")

                text = region.extract_text()
                print(f"Extracted {len(text)} characters of text")
                if len(text) > 100:
                    print(f"First 100 characters: {text[:100]}...")
            else:
                print("Could not find 'Critical' text")
        else:
            print("Could not find 'Violations' heading")

        print("\nEnd of 'until' method demonstration.")
94
+
95
if __name__ == "__main__":
    # With no argument, fall back to the example PDF in the pdfs directory;
    # otherwise use the path given on the command line.
    using_default = len(sys.argv) < 2
    if using_default:
        script_dir = os.path.dirname(__file__)
        pdf_path = os.path.abspath(os.path.join(script_dir, '..', 'pdfs', '01-practice.pdf'))
    else:
        pdf_path = sys.argv[1]

    # Stop early when the chosen file does not exist
    if not os.path.exists(pdf_path):
        if using_default:
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python until_example.py [path/to/file.pdf]")
        else:
            print(f"File not found: {pdf_path}")
        sys.exit(1)

    until_example(pdf_path)
@@ -0,0 +1,15 @@
1
+ from natural_pdf import PDF
2
+
3
# Load the practice document (path is relative to the working directory)
pdf = PDF("./pdfs/01-practice.pdf")

# Text whose color approximately matches red
serial = pdf.find('text[color~=red]')

# Region from the 'Summary' text (included) down to the thick line
summary_anchor = pdf.find('text:contains("Summary")')
summary = summary_anchor.below(include_element=True, until='line[width>=2]')

# Debug: label both results and render the first page with the labels shown
serial.highlight(label='Serial')
summary.highlight(label='Summary')
first_page = pdf.pages[0]
first_page.to_image(path="output.png", show_labels=True)
@@ -0,0 +1,55 @@
1
+ """
2
+ Natural PDF - A more intuitive interface for working with PDFs.
3
+ """
4
+ import logging
5
+
6
# Create library logger
logger = logging.getLogger("natural_pdf")

# Add a NullHandler to prevent "No handler found" warnings
# (Best practice for libraries)
logger.addHandler(logging.NullHandler())


# Utility function for users to easily configure logging
def configure_logging(level=logging.INFO, handler=None):
    """Configure Natural PDF's logging.

    Safe to call repeatedly: the default StreamHandler is created once and
    reused on later calls, so log records are not emitted multiple times.

    Args:
        level: The logging level (e.g., logging.INFO, logging.DEBUG)
        handler: A custom handler, or None to use StreamHandler
    """
    # Remove every placeholder NullHandler, wherever it sits in the list
    for existing in list(logger.handlers):
        if isinstance(existing, logging.NullHandler):
            logger.removeHandler(existing)

    if handler is None:
        # Reuse the default handler installed by an earlier call, if any
        handler = next(
            (h for h in logger.handlers if getattr(h, "_natural_pdf_default", False)),
            None,
        )
        if handler is None:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            # Private marker so later calls can recognise this handler
            handler._natural_pdf_default = True

    # Logger.addHandler ignores a handler that is already attached
    logger.addHandler(handler)
    logger.setLevel(level)

    # Propagate level to all child loggers. Iterate over a snapshot because
    # getLogger() may write to the manager's logger dictionary.
    for name in list(logging.root.manager.loggerDict):
        if name.startswith("natural_pdf."):
            logging.getLogger(name).setLevel(level)
37
+
38
+ from natural_pdf.core.pdf import PDF
39
+ from natural_pdf.core.page import Page
40
+ from natural_pdf.elements.region import Region
41
+ from natural_pdf.elements.collections import ElementCollection
42
+
43
# Import QA module if available; HAS_QA records whether the import succeeded
try:
    from natural_pdf.qa import DocumentQA, get_qa_engine
except ImportError:
    HAS_QA = False
else:
    HAS_QA = True

# Matches the released distribution (natural_pdf-25.3.16); the previous
# value "0.1.0" did not.
__version__ = "25.3.16"

# Public API; the QA names are exported only when the QA module imported
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
if HAS_QA:
    __all__ += ["DocumentQA", "get_qa_engine"]
@@ -0,0 +1,9 @@
1
+ """
2
+ Analyzers for natural-pdf.
3
+ """
4
+ from natural_pdf.analyzers.document_layout import (
5
+ LayoutDetector,
6
+ YOLODocLayoutDetector,
7
+ TableTransformerDetector,
8
+ convert_to_regions
9
+ )