natural-pdf 0.2.12__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff compares publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- natural_pdf/core/highlighting_service.py +40 -10
- natural_pdf/elements/base.py +18 -1
- natural_pdf/elements/element_collection.py +153 -15
- natural_pdf/elements/rect.py +34 -0
- natural_pdf/elements/region.py +55 -3
- natural_pdf/elements/text.py +20 -2
- natural_pdf/selectors/parser.py +28 -1
- natural_pdf/vision/__init__.py +1 -2
- natural_pdf/vision/mixin.py +67 -27
- natural_pdf/vision/results.py +49 -5
- natural_pdf/vision/similarity.py +195 -23
- natural_pdf/vision/template_matching.py +209 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/RECORD +24 -23
- temp/test_draw_guides.py +25 -0
- temp/test_draw_guides_interactive.py +30 -0
- temp/test_guide_draw_notebook.py +47 -0
- temp/test_inline_js.py +22 -0
- temp/test_widget_functionality.py +68 -0
- temp/test_widget_simple.py +41 -0
- temp/debug_cell_extraction.py +0 -42
- temp/debug_exclusion_overlap.py +0 -43
- temp/debug_exclusions_guides.py +0 -67
- temp/debug_extra_guide.py +0 -41
- temp/debug_outer_boundaries.py +0 -46
- temp/debug_st_search.py +0 -33
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/top_level.txt +0 -0
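The deleted temp/debug_*.py scripts listed above all exercise the same workflow: guides-based table extraction combined with PDF-level exclusions. A minimal sketch of that workflow, assembled only from calls that appear in those scripts (the pdfs/m27.pdf sample path also comes from them), looks like this:

from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")  # sample file used by the removed debug scripts
page = pdf.pages[0]

# Exclude everything above the "PREMISE" header so it stays out of extraction results
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())

# Column guides from the header row, row guides from row stripes
headers = (
    page.find(text="NUMBER")
    .right(include_source=True)
    .expand(top=3, bottom=3)
    .find_all("text")
)
guides = Guides(page)
guides.vertical.from_content(headers, align="left")
guides.horizontal.from_stripes()

# Extract the table while honoring the PDF-level exclusions
df = guides.extract_table(apply_exclusions=True, header=False).to_df()
print(df.head())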
temp/debug_exclusions_guides.py
DELETED
@@ -1,67 +0,0 @@
-"""Debug why exclusions aren't working with guides.extract_table()"""
-from natural_pdf import PDF
-from natural_pdf.analyzers.guides import Guides
-
-pdf = PDF("pdfs/m27.pdf")
-page = pdf.pages[0]
-
-# Check initial text
-print("Initial text:")
-print(page.extract_text()[:200])
-print()
-
-# Add exclusions
-pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
-pdf.add_exclusion(lambda page: page.find("text:regex(Page \d+ of)").expand())
-
-# Check text after exclusions
-print("Text after exclusions:")
-print(page.extract_text()[:100])
-print()
-
-# Debug exclusion regions
-print("Checking exclusion regions:")
-exclusions = page._get_exclusion_regions(debug=True)
-print(f"\nTotal exclusions: {len(exclusions)}")
-for i, exc in enumerate(exclusions):
-    print(f" {i}: {exc.bbox}")
-print()
-
-# Create guides
-headers = (
-    page
-    .find(text="NUMBER")
-    .right(include_source=True)
-    .expand(top=3, bottom=3)
-    .find_all('text')
-)
-
-guides = Guides(page)
-guides.vertical.from_content(headers, align='left')
-guides.horizontal.from_stripes()
-
-# Build grid to see what regions are created
-print("\nBuilding grid...")
-grid_result = guides.build_grid(include_outer_boundaries=True)
-table_region = grid_result["regions"]["table"]
-print(f"Table region: {table_region}")
-print(f"Table bbox: {table_region.bbox if table_region else 'None'}")
-
-# Check if table region respects exclusions
-if table_region:
-    print("\nExtracting text from table region directly:")
-    table_text = table_region.extract_text()[:200]
-    print(f"Table text: {table_text}")
-
-# Now extract table
-print("\nExtracting table with apply_exclusions=True:")
-result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
-df = result.to_df()
-print(df.head())
-
-# Check if excluded content is in the table
-table_str = df.to_string()
-has_feb = "FEBRUARY 2014" in table_str or "FEBR" in table_str
-has_alphabetic = "ALPHABETIC LISTING" in table_str
-print(f"\nContains 'FEBRUARY': {has_feb}")
-print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
temp/debug_extra_guide.py
DELETED
@@ -1,41 +0,0 @@
-"""Debug the extra guide issue."""
-from natural_pdf import PDF
-from natural_pdf.analyzers.guides import Guides
-
-pdf = PDF("pdfs/m27.pdf")
-page = pdf.pages[0]
-
-# Get headers
-headers = (
-    page
-    .find("text:contains(NUMBER)")
-    .right(include_source=True)
-    .expand(top=3, bottom=3)
-    .find_all('text')
-)
-
-print("Headers 3-5:")
-for i, h in enumerate(headers[3:5]):
-    print(f" {i}: '{h.text}' bbox={h.bbox}")
-
-# Create guides with just these two headers
-guides = Guides(page)
-guides.vertical.from_content(headers[3:5], align='left', outer=False)
-
-print(f"\nResulting guides: {guides.vertical}")
-print(f"Expected: [328.32012, 539.63316]")
-
-# Let's also check what happens with each header individually
-print("\nTesting each header individually:")
-for i, h in enumerate(headers[3:5]):
-    g = Guides(page)
-    g.vertical.from_content([h], align='left', outer=False)
-    print(f" Header {i} guides: {g.vertical}")
-
-# Check if it's related to the ElementCollection
-print("\nTesting with manual list of text:")
-text_list = [h.text for h in headers[3:5]]
-print(f"Text list: {text_list}")
-guides2 = Guides(page)
-guides2.vertical.from_content(text_list, align='left', outer=False)
-print(f"Guides from text list: {guides2.vertical}")
temp/debug_outer_boundaries.py
DELETED
@@ -1,46 +0,0 @@
-"""Debug outer boundaries issue with exclusions"""
-from natural_pdf import PDF
-from natural_pdf.analyzers.guides import Guides
-
-pdf = PDF("pdfs/m27.pdf")
-page = pdf.pages[0]
-
-# Add exclusions
-pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
-
-# Create guides
-headers = (
-    page
-    .find(text="NUMBER")
-    .right(include_source=True)
-    .expand(top=3, bottom=3)
-    .find_all('text')
-)
-
-guides = Guides(page)
-guides.vertical.from_content(headers, align='left')
-guides.horizontal.from_stripes()
-
-print("Horizontal guides (sorted):")
-for i, h in enumerate(sorted(guides.horizontal)):
-    print(f" {i}: {h:.2f}")
-
-print(f"\nFirst content guide: {sorted(guides.horizontal)[0]:.2f}")
-print(f"Page height: {page.height}")
-
-# Test without outer boundaries
-print("\n\nWithout outer boundaries:")
-result1 = guides.extract_table(include_outer_boundaries=False, apply_exclusions=True, header=False)
-df1 = result1.to_df()
-print(f"Shape: {df1.shape}")
-print("First row, first column:", df1.iloc[0, 0] if not df1.empty else "Empty")
-
-# Test with outer boundaries
-print("\n\nWith outer boundaries:")
-result2 = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
-df2 = result2.to_df()
-print(f"Shape: {df2.shape}")
-print("First row, first column:", df2.iloc[0, 0] if not df2.empty else "Empty")
-
-# The issue: include_outer_boundaries adds guides at 0 and 612,
-# which creates cells that span into the exclusion zone
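The closing comment identifies the failure mode: with include_outer_boundaries=True, extra guides are added at the page edges (x = 0 and x = 612 here), so the edge cells extend into the excluded header band. A small follow-up check in the same style, reusing only calls these scripts already make (the "FEBR" marker string comes from debug_exclusions_guides.py above; pdf, page, and guides are assumed to be set up as in this script):

# Compare extraction with and without outer-boundary guides and report whether
# text from the excluded header band leaks into the resulting DataFrame.
for outer in (False, True):
    df = guides.extract_table(
        include_outer_boundaries=outer, apply_exclusions=True, header=False
    ).to_df()
    leaked = "FEBR" in df.to_string()
    print(f"include_outer_boundaries={outer}: shape={df.shape}, header leaked={leaked}")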
temp/debug_st_search.py
DELETED
@@ -1,33 +0,0 @@
-"""Debug searching for 'ST' text."""
-from natural_pdf import PDF
-
-pdf = PDF("pdfs/m27.pdf")
-page = pdf.pages[0]
-
-# Get the original ST element
-headers = (
-    page
-    .find("text:contains(NUMBER)")
-    .right(include_source=True)
-    .expand(top=3, bottom=3)
-    .find_all('text')
-)
-original_st = headers[4]
-print(f"Original 'ST' element: '{original_st.text}' at {original_st.bbox}")
-
-# Search for 'ST' using find
-found_st = page.find('text:contains("ST")')
-print(f"\nFound 'ST' using find: '{found_st.text}' at {found_st.bbox}")
-
-# Find all elements containing 'ST'
-all_st = page.find_all('text:contains("ST")')
-print(f"\nAll elements containing 'ST':")
-for i, elem in enumerate(all_st[:10]):  # First 10
-    print(f" {i}: '{elem.text}' at x={elem.x0:.2f}, bbox={elem.bbox}")
-
-# Check what's at position 332.88
-print(f"\nLooking for element at x≈332.88:")
-all_text = page.find_all('text')
-for elem in all_text:
-    if 332 < elem.x0 < 334:
-        print(f" Found: '{elem.text}' at x={elem.x0:.5f}, bbox={elem.bbox}")
{natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/WHEEL
File without changes
{natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/entry_points.txt
File without changes
{natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/licenses/LICENSE
File without changes
{natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/top_level.txt
File without changes