natural-pdf 0.2.12__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,67 +0,0 @@
1
- """Debug why exclusions aren't working with guides.extract_table()"""
2
- from natural_pdf import PDF
3
- from natural_pdf.analyzers.guides import Guides
4
-
5
- pdf = PDF("pdfs/m27.pdf")
6
- page = pdf.pages[0]
7
-
8
- # Check initial text
9
- print("Initial text:")
10
- print(page.extract_text()[:200])
11
- print()
12
-
13
- # Add exclusions
14
- pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
15
- pdf.add_exclusion(lambda page: page.find("text:regex(Page \d+ of)").expand())
16
-
17
- # Check text after exclusions
18
- print("Text after exclusions:")
19
- print(page.extract_text()[:100])
20
- print()
21
-
22
- # Debug exclusion regions
23
- print("Checking exclusion regions:")
24
- exclusions = page._get_exclusion_regions(debug=True)
25
- print(f"\nTotal exclusions: {len(exclusions)}")
26
- for i, exc in enumerate(exclusions):
27
- print(f" {i}: {exc.bbox}")
28
- print()
29
-
30
- # Create guides
31
- headers = (
32
- page
33
- .find(text="NUMBER")
34
- .right(include_source=True)
35
- .expand(top=3, bottom=3)
36
- .find_all('text')
37
- )
38
-
39
- guides = Guides(page)
40
- guides.vertical.from_content(headers, align='left')
41
- guides.horizontal.from_stripes()
42
-
43
- # Build grid to see what regions are created
44
- print("\nBuilding grid...")
45
- grid_result = guides.build_grid(include_outer_boundaries=True)
46
- table_region = grid_result["regions"]["table"]
47
- print(f"Table region: {table_region}")
48
- print(f"Table bbox: {table_region.bbox if table_region else 'None'}")
49
-
50
- # Check if table region respects exclusions
51
- if table_region:
52
- print("\nExtracting text from table region directly:")
53
- table_text = table_region.extract_text()[:200]
54
- print(f"Table text: {table_text}")
55
-
56
- # Now extract table
57
- print("\nExtracting table with apply_exclusions=True:")
58
- result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
59
- df = result.to_df()
60
- print(df.head())
61
-
62
- # Check if excluded content is in the table
63
- table_str = df.to_string()
64
- has_feb = "FEBRUARY 2014" in table_str or "FEBR" in table_str
65
- has_alphabetic = "ALPHABETIC LISTING" in table_str
66
- print(f"\nContains 'FEBRUARY': {has_feb}")
67
- print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
temp/debug_extra_guide.py DELETED
@@ -1,41 +0,0 @@
1
- """Debug the extra guide issue."""
2
- from natural_pdf import PDF
3
- from natural_pdf.analyzers.guides import Guides
4
-
5
- pdf = PDF("pdfs/m27.pdf")
6
- page = pdf.pages[0]
7
-
8
- # Get headers
9
- headers = (
10
- page
11
- .find("text:contains(NUMBER)")
12
- .right(include_source=True)
13
- .expand(top=3, bottom=3)
14
- .find_all('text')
15
- )
16
-
17
- print("Headers 3-5:")
18
- for i, h in enumerate(headers[3:5]):
19
- print(f" {i}: '{h.text}' bbox={h.bbox}")
20
-
21
- # Create guides with just these two headers
22
- guides = Guides(page)
23
- guides.vertical.from_content(headers[3:5], align='left', outer=False)
24
-
25
- print(f"\nResulting guides: {guides.vertical}")
26
- print(f"Expected: [328.32012, 539.63316]")
27
-
28
- # Let's also check what happens with each header individually
29
- print("\nTesting each header individually:")
30
- for i, h in enumerate(headers[3:5]):
31
- g = Guides(page)
32
- g.vertical.from_content([h], align='left', outer=False)
33
- print(f" Header {i} guides: {g.vertical}")
34
-
35
- # Check if it's related to the ElementCollection
36
- print("\nTesting with manual list of text:")
37
- text_list = [h.text for h in headers[3:5]]
38
- print(f"Text list: {text_list}")
39
- guides2 = Guides(page)
40
- guides2.vertical.from_content(text_list, align='left', outer=False)
41
- print(f"Guides from text list: {guides2.vertical}")
@@ -1,46 +0,0 @@
1
- """Debug outer boundaries issue with exclusions"""
2
- from natural_pdf import PDF
3
- from natural_pdf.analyzers.guides import Guides
4
-
5
- pdf = PDF("pdfs/m27.pdf")
6
- page = pdf.pages[0]
7
-
8
- # Add exclusions
9
- pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
10
-
11
- # Create guides
12
- headers = (
13
- page
14
- .find(text="NUMBER")
15
- .right(include_source=True)
16
- .expand(top=3, bottom=3)
17
- .find_all('text')
18
- )
19
-
20
- guides = Guides(page)
21
- guides.vertical.from_content(headers, align='left')
22
- guides.horizontal.from_stripes()
23
-
24
- print("Horizontal guides (sorted):")
25
- for i, h in enumerate(sorted(guides.horizontal)):
26
- print(f" {i}: {h:.2f}")
27
-
28
- print(f"\nFirst content guide: {sorted(guides.horizontal)[0]:.2f}")
29
- print(f"Page height: {page.height}")
30
-
31
- # Test without outer boundaries
32
- print("\n\nWithout outer boundaries:")
33
- result1 = guides.extract_table(include_outer_boundaries=False, apply_exclusions=True, header=False)
34
- df1 = result1.to_df()
35
- print(f"Shape: {df1.shape}")
36
- print("First row, first column:", df1.iloc[0, 0] if not df1.empty else "Empty")
37
-
38
- # Test with outer boundaries
39
- print("\n\nWith outer boundaries:")
40
- result2 = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
41
- df2 = result2.to_df()
42
- print(f"Shape: {df2.shape}")
43
- print("First row, first column:", df2.iloc[0, 0] if not df2.empty else "Empty")
44
-
45
- # The issue: include_outer_boundaries adds guides at 0 and 612,
46
- # which creates cells that span into the exclusion zone
temp/debug_st_search.py DELETED
@@ -1,33 +0,0 @@
1
- """Debug searching for 'ST' text."""
2
- from natural_pdf import PDF
3
-
4
- pdf = PDF("pdfs/m27.pdf")
5
- page = pdf.pages[0]
6
-
7
- # Get the original ST element
8
- headers = (
9
- page
10
- .find("text:contains(NUMBER)")
11
- .right(include_source=True)
12
- .expand(top=3, bottom=3)
13
- .find_all('text')
14
- )
15
- original_st = headers[4]
16
- print(f"Original 'ST' element: '{original_st.text}' at {original_st.bbox}")
17
-
18
- # Search for 'ST' using find
19
- found_st = page.find('text:contains("ST")')
20
- print(f"\nFound 'ST' using find: '{found_st.text}' at {found_st.bbox}")
21
-
22
- # Find all elements containing 'ST'
23
- all_st = page.find_all('text:contains("ST")')
24
- print(f"\nAll elements containing 'ST':")
25
- for i, elem in enumerate(all_st[:10]): # First 10
26
- print(f" {i}: '{elem.text}' at x={elem.x0:.2f}, bbox={elem.bbox}")
27
-
28
- # Check what's at position 332.88
29
- print(f"\nLooking for element at x≈332.88:")
30
- all_text = page.find_all('text')
31
- for elem in all_text:
32
- if 332 < elem.x0 < 334:
33
- print(f" Found: '{elem.text}' at x={elem.x0:.5f}, bbox={elem.bbox}")