natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,155 @@
1
"""
Table structure detection example using Table Transformer.

This example demonstrates how to use the Table Transformer (TATR)
to detect tables and their structure in PDF documents.

Note: This example requires additional dependencies:
- torch
- torchvision
- transformers

These will be automatically installed when you install natural-pdf.
"""
import os
from natural_pdf import PDF

# Resolve every path from the project root (the parent of this script's directory)
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
pdf_path = os.path.join(root_dir, "pdfs", "01-practice.pdf")
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

print(f"Analyzing table structure in: {pdf_path}")

# Load the PDF - this file has a single page with a table
pdf = PDF(pdf_path)
page = pdf.pages[0]  # Get the first page

print("Running YOLO layout analysis first (excluding tables)...")
# First run the YOLO detector but exclude tables; TATR handles those below
page.analyze_layout(
    model="yolo",
    confidence=0.3,
    exclude_classes=["table", "table_caption", "table_footnote"],
)

print(f"Found {len(page.detected_layout_regions)} general layout regions")

print("Now running Table Transformer detection...")
# Then run Table Transformer detection and add to the existing regions
page.analyze_layout(
    model="tatr",
    confidence=0.4,  # Table detection confidence threshold
    existing="append",
)

print(f"Found {len(page.detected_layout_regions)} total regions (including table structure)")

# Example of method chaining
print("\nDemonstrating method chaining for layout analysis and highlighting:")
# Create a highlighted image with a single method chain.
# NOTE(review): this call omits existing="append"; if the default replaces
# earlier detections, the YOLO regions found above are gone from here on
# and the 'region[model=yolo]' selector below matches nothing - confirm.
(
    page.clear_highlights()
    .analyze_layout(model="tatr", confidence=0.3)
    .highlight_layout()
    .to_image(path=os.path.join(output_dir, "chained_analysis.png"), show_labels=True)
)
print("Created highlighted image with method chaining")

# Group regions by their type
regions_by_type = {}
for region in page.detected_layout_regions:
    regions_by_type.setdefault(region.region_type, []).append(region)

# Print a summary of all detected regions by type; the model name is taken
# from the first region of each group
print("\nAll detected regions:")
for region_type, type_regions in regions_by_type.items():
    model_name = getattr(type_regions[0], 'model', "unknown")
    print(f" - {region_type} ({model_name}): {len(type_regions)} regions")

# Highlight all regions using method chaining
output_path = os.path.join(output_dir, "all_detected_regions.png")
page.clear_highlights().highlight_layout().to_image(path=output_path, show_labels=True)
print(f"\nSaved combined layout visualization to {output_path}")

# Highlight only YOLO regions using selector and chaining
output_path = os.path.join(output_dir, "yolo_regions.png")
page.clear_highlights().find_all('region[model=yolo]').highlight(label="YOLO Regions")
page.to_image(path=output_path, show_labels=True)
print(f"Saved YOLO layout visualization to {output_path}")

# Highlight only Table Transformer regions using selector and chaining
output_path = os.path.join(output_dir, "table_structure.png")
page.clear_highlights().find_all('region[model=tatr]').highlight(label="Table Structure")
page.to_image(path=output_path, show_labels=True)
print(f"Saved Table Transformer visualization to {output_path}")

# Find tables and process their content
tables = page.find_all('region[type=table]')
if tables:
    print(f"\nFound {len(tables)} tables")

    # Get the first table
    table = tables[0]
    print("Table details:")
    print(f" Confidence: {table.confidence:.2f}")
    print(f" Bounding box: {table.bbox}")

    # Find rows, columns, and headers.
    # Note: Original class names with spaces are converted to hyphenated format in selectors
    # NOTE(review): these selectors run against the whole page, not just
    # `table`, so on a multi-table page they would include every table's parts.
    rows = page.find_all('region[type=table-row]')
    columns = page.find_all('region[type=table-column]')
    headers = page.find_all('region[type=table-column-header]')

    print(f" Structure: {len(rows)} rows, {len(columns)} columns, {len(headers)} headers")

    # Extract text from the table. Truncate long text for the preview, but
    # always print the label (the original dropped it for short tables).
    table_text = table.extract_text()
    preview = f"{table_text[:150]}..." if len(table_text) > 150 else table_text
    print(f" Content preview: {preview}")

    # Highlight the table structure with distinct colors
    page.clear_highlights()

    # First highlight the table
    table.highlight(label="Table", color=(1, 0, 0, 0.3))

    # Then highlight the structure elements
    for row in rows:
        row.highlight(label="Row", color=(0, 1, 0, 0.3))
    for column in columns:
        column.highlight(label="Column", color=(0, 0, 1, 0.3))
    for header in headers:
        header.highlight(label="Header", color=(0, 1, 1, 0.3))

    # Save the highlighted table structure
    output_path = os.path.join(output_dir, "table_structure_detail.png")
    page.to_image(path=output_path, show_labels=True)
    print(f" Saved detailed table structure visualization to {output_path}")

    # Now find text elements within the table
    print("\nExtracting text from table cells:")
    table_text_elements = table.find_all('text')
    print(f" Found {len(table_text_elements)} text elements in the table")

    # Show the first few text elements
    for i, elem in enumerate(table_text_elements[:5]):
        print(f" Text {i+1}: '{elem.text}'")

    # You can also extract text just from table headers
    if headers:
        header = headers[0]
        header_text = header.extract_text()
        print(f"\nHeader text: {header_text}")
else:
    print("\nNo tables detected on this page")
@@ -0,0 +1,56 @@
1
import os
import sys
from pathlib import Path

# Make the in-repo package importable when running from a source checkout
sys.path.insert(0, str(Path(__file__).parent.parent))

from natural_pdf import PDF

# Absolute location of the sample PDF, resolved from this script's directory
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
pdf_path = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")

print(f"Loading PDF: {pdf_path}")
pdf = PDF(pdf_path)

# Make sure there is somewhere to write the rendered image
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Work on one fixed page of the sample document (zero-based index 6)
page = pdf.pages[6]

# Test 1: let the TATR analysis build table cells as part of detection
print("\n-- Testing layout detection with cell creation --")
regions = page.analyze_layout(model='tatr', create_cells=True)

# Count the tables and cells that TATR produced
tables = page.find_all('region[type=table][model=tatr]')
cells = page.find_all('region[type=table-cell][model=tatr]')

print(f"Found {len(tables)} tables")
print(f"Found {len(cells)} table cells")

# Test 2: ask the first table to build its cells explicitly
if tables:
    print("\n-- Testing explicit cell creation from a table --")
    table = tables[0]
    explicit_cells = table.create_cells()
    print(f"Created {len(explicit_cells)} cells explicitly")

    # Mark only the first five cells so the picture stays readable
    for i, cell in enumerate(explicit_cells[:5]):
        cell_label = f"Cell {i+1}"
        cell.highlight(label=cell_label, color=(255, 0, 0, 50))

    # Mark the table itself in a different color
    table.highlight(label="Table", color=(0, 0, 255, 50))

# Render the page with whatever highlights were added
output_path = os.path.join(output_dir, "tatr_cells_test.png")
print(f"\nSaving highlighted image to: {output_path}")
page.to_image(path=output_path, show_labels=True)

print("\nTest completed successfully!")
@@ -0,0 +1,94 @@
1
import os
import sys
from pathlib import Path
import pandas as pd

# Make the in-repo package importable when running from a source checkout
sys.path.insert(0, str(Path(__file__).parent.parent))

from natural_pdf import PDF

# Absolute location of the sample PDF, resolved from this script's directory
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
pdf_path = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")

print(f"Loading PDF: {pdf_path}")
pdf = PDF(pdf_path)

# Make sure the output directory exists
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Work on one fixed page; the index is zero-based
page = pdf.pages[3]

# Detect tables and their structure with the TATR model
print("\n-- Running layout analysis to find tables --")
regions = page.analyze_layout(model='tatr')

# Take the first detected table, or stop with a non-zero exit status
table = page.find('region[type=table][model=tatr]')
if not table:
    print("No tables found.")
    sys.exit(1)

print(f"Found table at coordinates: {table.bbox}")

# Collect every TATR row, column and column-header region on the page
rows = page.find_all('region[type=table-row][model=tatr]')
columns = page.find_all('region[type=table-column][model=tatr]')
headers = page.find_all('region[type=table-column-header][model=tatr]')
42
+
43
# Membership test used to keep only the structure regions of one table
def is_in_table(region, table):
    """Return True when the center point of *region* lies inside *table*.

    Both arguments need ``x0``, ``x1``, ``top`` and ``bottom`` attributes.
    The table's edges count as inside.
    """
    center_x = (region.x0 + region.x1) / 2
    center_y = (region.top + region.bottom) / 2
    if not table.x0 <= center_x <= table.x1:
        return False
    return table.top <= center_y <= table.bottom
49
+
50
# Keep only the structure regions whose center falls inside this table
table_rows = [r for r in rows if is_in_table(r, table)]
table_columns = [c for c in columns if is_in_table(c, table)]
table_headers = [h for h in headers if is_in_table(h, table)]

# Print structure info
print(f"Table has {len(table_rows)} rows, {len(table_columns)} columns, and {len(table_headers)} headers")

# Create cells and check OCR on some of them
cells = table.create_cells()
print(f"Created {len(cells)} cells")

# OCR settings shared by every cell; built once because they never change
# between iterations. Thresholds are deliberately very low for debugging.
ocr_config = {
    "enabled": True,
    "min_confidence": 0.01,
    "detection_params": {
        "text_threshold": 0.001,  # Lower threshold to detect more text (default is 0.7)
        "mag_ratio": 4.0,  # Magnify the image 4x during detection
        "link_threshold": 1,
    },
    "recognition_params": {
        "min_size": 6,
    },
}

# Try OCR on a few individual cells to debug
print("\n-- Testing OCR on individual cells --")
if cells:
    sample_cells = cells[:50]  # First 50 cells

    for cell in sample_cells:
        # Run OCR on just this cell and report anything it detects
        ocr_elements = cell.apply_ocr(**ocr_config)
        if ocr_elements:
            print(f" OCR detected {len(ocr_elements)} text elements:")
            for elem in ocr_elements:
                print(f" '{elem.text}' (conf: {elem.confidence:.2f})")

        # Get regular text
        text = cell.extract_text().strip()
        if text:
            print(f" Regular extraction: '{text}'")

print("\nTest completed successfully!")
@@ -0,0 +1,122 @@
1
+ """
2
+ Example demonstrating enhanced text search capabilities in Natural PDF.
3
+
4
+ This showcases:
5
+ 1. Multi-word searching with keep_spaces enabled (default)
6
+ 2. Case-insensitive and case-sensitive searching
7
+ 3. Regular expression searching
8
+ 4. Turning off keep_spaces to see the difference
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import argparse
14
+ from pathlib import Path
15
+
16
+ # Add parent directory to path for running the example
17
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
18
+
19
+ from natural_pdf import PDF, configure_logging
20
+ import logging
21
+
22
+
23
def _report_matches(results, description, color):
    """Print each match in *results* and highlight it, labelled with *description*."""
    for number, result in enumerate(results, start=1):
        print(f" Result {number}: '{result.text}'")
        result.highlight(label=f"Match {number}: {description}", color=color)


def main(pdf_path=None):
    """Run the text-search demonstration and return the opened PDF.

    Searches the first page for a multi-word phrase (case-insensitive and
    case-sensitive) and for a regular expression, highlights the matches,
    saves the highlighted page as an image, then repeats the phrase search
    on a second PDF object opened with ``keep_spaces=False``.

    Args:
        pdf_path: Path to the PDF to search. When empty or None, the bundled
            ``pdfs/2019 Statistics.pdf`` next to the examples folder is used.

    Returns:
        The PDF object opened with default settings.
    """
    # Use a default PDF if none provided
    if not pdf_path:
        pdf_path = os.path.join(os.path.dirname(__file__), '..', 'pdfs', '2019 Statistics.pdf')

    print(f"Using PDF: {pdf_path}")
    print("-" * 50)

    # Create PDF with default settings (keep_spaces=True)
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Display basic page info
    print(f"Page dimensions: {page.width} x {page.height}")

    # 1. Basic multi-word search with default keep_spaces=True
    print("\nMulti-word search with keep_spaces=True (default):")
    print("-" * 50)

    results = page.find_all('text:contains("annual report")', case=False)
    print(f"Found {len(results)} results for 'annual report' (case-insensitive)")
    _report_matches(results, "'annual report'", (1, 0.7, 0, 0.3))

    # 2. Case-sensitive search
    print("\nCase-sensitive search:")
    print("-" * 50)

    results = page.find_all('text:contains("Annual Report")', case=True)
    print(f"Found {len(results)} results for 'Annual Report' (case-sensitive)")
    _report_matches(results, "'Annual Report'", (0, 0.7, 1, 0.3))

    # 3. Regular expression search
    print("\nRegular expression search:")
    print("-" * 50)

    pattern = r"report\s+\d{4}"  # "report" followed by whitespace and 4 digits
    results = page.find_all(f'text:contains("{pattern}")', regex=True, case=False)
    print(f"Found {len(results)} results for regex pattern '{pattern}'")
    _report_matches(results, f"regex '{pattern}'", (0, 1, 0, 0.3))

    # Save highlighted page as an image. Create the output folder first so
    # the save does not depend on an earlier example having created it.
    output_path = os.path.join(os.path.dirname(__file__), '..', 'output', 'text_search_results.png')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # NOTE(review): sibling examples save with to_image(path=..., show_labels=True);
    # confirm save_image(..., labels=True) is still a supported spelling.
    page.save_image(output_path, labels=True)
    print(f"\nSaved highlighted results to: {output_path}")

    # 4. Create a new PDF with keep_spaces=False to compare
    print("\nComparing with keep_spaces=False (legacy behavior):")
    print("-" * 50)

    pdf_legacy = PDF(pdf_path, keep_spaces=False)
    page_legacy = pdf_legacy.pages[0]

    # Try the same multi-word search
    results_legacy = page_legacy.find_all('text:contains("annual report")', case=False)
    print(f"Found {len(results_legacy)} results for 'annual report' (case-insensitive)")

    # Try regex to find occurrences in separate words
    pattern = r"annual\s+report"  # "annual" followed by whitespace and "report"
    regex_results = page_legacy.find_all(f'text:contains("{pattern}")', regex=True, case=False)
    print(f"With regex '{pattern}': Found {len(regex_results)} results")

    # Show conclusion
    print("\nConclusion:")
    print("-" * 50)
    print("1. With keep_spaces=True (default):")
    print(" - Multi-word phrases can be found directly with :contains()")
    print(" - Text maintains its natural spacing within word elements")
    print("\n2. With keep_spaces=False (legacy):")
    print(" - Words are split at spaces, making multi-word search less effective")
    print(" - Regular expressions with \\s patterns can help bridge words")

    return pdf
108
+
109
+
110
if __name__ == "__main__":
    # Command line: an optional PDF path and a verbosity switch
    parser = argparse.ArgumentParser(description="Demonstrate Natural PDF's enhanced text search capabilities")
    parser.add_argument("--pdf", help="Path to a PDF file to analyze")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    args = parser.parse_args()

    # --verbose switches the library logging from INFO to DEBUG
    if args.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    configure_logging(level=log_level)

    # Run the demonstration; main() substitutes a default PDF when --pdf is absent
    pdf = main(args.pdf)
@@ -0,0 +1,110 @@
1
+ """
2
+ Example demonstrating the text style analysis feature of natural-pdf.
3
+ """
4
+ import os
5
+ import sys
6
+
7
+ # Add the parent directory to the path to import the package
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from natural_pdf import PDF
11
+
12
def _describe_font_style(sample):
    """Return "bold", "italic", "bold, italic" or "regular", judged from the font name."""
    bold = False
    italic = False
    if hasattr(sample, 'fontname') and sample.fontname:
        lowered = sample.fontname.lower()
        bold = ('bold' in lowered or 'black' in lowered or
                sample.fontname.endswith('-B'))
        italic = ('italic' in lowered or 'oblique' in lowered or
                  sample.fontname.endswith('-I'))

    flags = [name for name, active in (("bold", bold), ("italic", italic)) if active]
    return ", ".join(flags) if flags else "regular"


def text_style_example(pdf_path):
    """Walk through text style analysis on the first page of *pdf_path*.

    Prints the style groups found, then writes two highlighted renderings
    (``text_styles.png`` and ``highlight_all_styles.png``) to the ``output``
    folder next to the examples directory.
    """
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")
        print(f"PDF has {len(pdf)} pages")

        # Images are written to <project root>/output
        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # EXAMPLE 1: Analyze text styles
        print("\nEXAMPLE 1: Analyzing text styles")
        print("-" * 60)

        styles = page.analyze_text_styles()

        # One summary line per style, followed by a sample element when present
        print("Text style analysis results:")
        for label, elements in styles.items():
            print(f"- {label}: {len(elements)} elements")

            if len(elements) == 0:
                continue

            sample = elements[0]
            size = getattr(sample, 'size', 'N/A')
            font = getattr(sample, 'fontname', 'N/A')
            style_text = _describe_font_style(sample)

            print(f" Sample: '{sample.text}' (size={size}, {style_text}, font={font})")

        # EXAMPLE 2: Visualize text styles with highlighting
        print("\nEXAMPLE 2: Visualizing text styles")
        print("-" * 60)

        page.highlight_text_styles()

        # Save the image with a legend
        output_file = os.path.join(output_dir, "text_styles.png")
        page.to_image(path=output_file, show_labels=True)
        print(f"Saved text style visualization to: {output_file}")

        # Start the next example from a clean page
        page.clear_highlights()

        # EXAMPLE 3: Using highlight_all with text styles
        print("\nEXAMPLE 3: Using highlight_all with text styles")
        print("-" * 60)

        page.highlight_all(include_text_styles=True)

        # Save the image with a legend
        output_file = os.path.join(output_dir, "highlight_all_styles.png")
        page.to_image(path=output_file, show_labels=True)
        print(f"Saved highlight_all with text styles to: {output_file}")

        print("\nEnd of text style demonstration.")
92
+
93
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        # A path was given on the command line; it must exist
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No argument: fall back to the example PDF in the pdfs directory
        example_parts = (os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
        pdf_path = os.path.abspath(os.path.join(*example_parts))
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python text_style_example.py [path/to/file.pdf]")
            sys.exit(1)

    text_style_example(pdf_path)
examples/tiny-text.py ADDED
@@ -0,0 +1,61 @@
1
import os
import sys
from pathlib import Path

# Make the in-repo package importable when running from a source checkout
sys.path.insert(0, str(Path(__file__).parent.parent))

from natural_pdf import PDF

# Absolute location of the sample PDF, resolved from this script's directory
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
pdf_path = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")

print(f"Loading PDF: {pdf_path}")
# OCR is configured once on the PDF; the very low threshold and small
# minimum size are aimed at tiny text
pdf = PDF(
    pdf_path,
    ocr={
        "enabled": True,
        "engine": "easyocr",
        "languages": ["en"],
        "detection_params": {
            "text_threshold": 0.001,
            "mag_ratio": 3.0,  # Magnify the image 3x during detection
            "canvas_size": 5000,
        },
        "recognition_params": {
            "min_size": 4,
            "contrast_ths": 0.5,
        },
    },
)

# Make sure the output directory exists
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Work on one fixed page of the sample document (zero-based index 6)
page = pdf.pages[6]
# Detect layout regions with the TATR model
regions = page.analyze_layout(model='tatr')

print(f"Found {len(regions)} regions")

# Apply OCR explicitly
ocr_elements = page.apply_ocr()
print(f"Found {len(ocr_elements)} OCR elements")

# Show the first 30 OCR elements as a sample
print("\nSample OCR elements:")
for i, elem in enumerate(ocr_elements[:30]):
    print(f"{i+1}. {elem}")

# Label every OCR element with its confidence
print("\nHighlighting OCR elements...")
for elem in ocr_elements:
    confidence_label = f"OCR ({elem.confidence:.2f})"
    elem.highlight(label=confidence_label)

output_path = os.path.join(output_dir, "ocr_highlight_all_test.png")
print(f"Saving highlight_all image to: {output_path}")
page.to_image(path=output_path, show_labels=True)

print("\nTest completed successfully!")