natural-pdf 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +318 -73
- natural_pdf/core/page.py +56 -8
- natural_pdf/elements/region.py +5 -3
- {natural_pdf-0.2.10.dist-info → natural_pdf-0.2.12.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.10.dist-info → natural_pdf-0.2.12.dist-info}/RECORD +29 -9
- {natural_pdf-0.2.10.dist-info → natural_pdf-0.2.12.dist-info}/top_level.txt +1 -0
- temp/debug_cell_extraction.py +42 -0
- temp/debug_exclusion_overlap.py +43 -0
- temp/debug_exclusions_guides.py +67 -0
- temp/debug_extra_guide.py +41 -0
- temp/debug_outer_boundaries.py +46 -0
- temp/debug_st_search.py +33 -0
- temp/fix_page_exclusions.py +42 -0
- temp/test_exclusion_with_debug.py +30 -0
- temp/test_find_exclusions_fix.py +53 -0
- temp/test_find_exclusions_fix_no_recursion.py +97 -0
- temp/test_fix_real_pdf.py +48 -0
- temp/test_fix_working.py +55 -0
- temp/test_fixed_pdf_exclusions.py +67 -0
- temp/test_horizontal_top_bottom.py +53 -0
- temp/test_marker_order.py +45 -0
- temp/test_original_exclusions_now_work.py +56 -0
- temp/test_pdf_exclusions_with_guides.py +84 -0
- temp/test_region_exclusions_detailed.py +25 -0
- temp/test_stripes_real_pdf.py +62 -0
- temp/test_vertical_stripes.py +55 -0
- {natural_pdf-0.2.10.dist-info → natural_pdf-0.2.12.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.10.dist-info → natural_pdf-0.2.12.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.10.dist-info → natural_pdf-0.2.12.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@ natural_pdf/__init__.py,sha256=N4pR0LbuPEnUYFZqbdVqc_FGKldgwPQc1wjJhYKTBBM,3417
|
|
2
2
|
natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
|
3
3
|
natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
|
4
4
|
natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
|
5
|
-
natural_pdf/analyzers/guides.py,sha256=
|
5
|
+
natural_pdf/analyzers/guides.py,sha256=B2_Etb0o-lOku-FQw-T1Fo1qxbcAXT4FB0hdp-5kXRs,188171
|
6
6
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
|
7
7
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
8
8
|
natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
|
@@ -27,7 +27,7 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
|
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
|
30
|
-
natural_pdf/core/page.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=Pid5hqVjcyX-gcCzxCJ62k6AQhNbUMNM_5QmEcylIjM,155264
|
31
31
|
natural_pdf/core/page_collection.py,sha256=IjdFq9q0D0P6ZKWInf0H25rLzxfMb7RsUXucogkhNkU,63169
|
32
32
|
natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
|
33
33
|
natural_pdf/core/pdf.py,sha256=ovdeu9TRPnVYyMltD7QpcdcFYBLZFXh3LlfC5ifj6RY,104227
|
@@ -44,7 +44,7 @@ natural_pdf/elements/element_collection.py,sha256=idM_BUWEfbCJ5Sq0Ae_KfbVHy8TdkN
|
|
44
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
46
|
natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
47
|
+
natural_pdf/elements/region.py,sha256=HF6KzeuudO9upVLIrPsp3omcziLcILE3nnzl1a-LvK0,165400
|
48
48
|
natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
|
49
49
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
50
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
@@ -107,12 +107,32 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
|
|
107
107
|
natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
|
108
108
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
109
109
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
110
|
-
natural_pdf-0.2.
|
110
|
+
natural_pdf-0.2.12.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
111
111
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
112
112
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
113
113
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
114
114
|
optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
|
115
115
|
optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
|
116
|
+
temp/debug_cell_extraction.py,sha256=nE0Z470P40v8xZfWO1V3qgNaejs_pernEQaUOFeOJ1U,1527
|
117
|
+
temp/debug_exclusion_overlap.py,sha256=RptJXwqBXy5gsvMF037KEx1o2QgjwEDkMB6TD5aJdqA,1644
|
118
|
+
temp/debug_exclusions_guides.py,sha256=s8siep9te1KRJ2j0vH1tvDQnBlz7PKbHeCiYMrZL8jE,2096
|
119
|
+
temp/debug_extra_guide.py,sha256=95Tim-YnmAR4kICw2XDKVDvlW5WsjK_51cv5-EV11rc,1236
|
120
|
+
temp/debug_outer_boundaries.py,sha256=uJUJwojTxOU4VtbGUouuhV65IYzS6NDIVKxnS7o64nU,1456
|
121
|
+
temp/debug_st_search.py,sha256=F4c_mUVi_d5AKaKIpQ0AnW1amDqAwALoQQj7wZj--J0,1021
|
122
|
+
temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
|
123
|
+
temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
|
124
|
+
temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
|
125
|
+
temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
|
126
|
+
temp/test_fix_real_pdf.py,sha256=uuylxmpeAEbIix9wjl0Gri1sZlN61dBWTq6ZCyfvzF8,1454
|
127
|
+
temp/test_fix_working.py,sha256=-Ryre1rXYA2EG_lmPZGYEGi8yz0slhHEXPJMYexZW84,1750
|
128
|
+
temp/test_fixed_pdf_exclusions.py,sha256=Q5zxooKDvtTXo-dDsx3nsQw1ZVHX3TW47iZ_dXpFdrY,2168
|
129
|
+
temp/test_horizontal_top_bottom.py,sha256=Mb3tjt9Z3wOTpzFOgK7i0K-j-_ynNh4vDu2x1L3nu-s,2163
|
130
|
+
temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,1417
|
131
|
+
temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
|
132
|
+
temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
|
133
|
+
temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
|
134
|
+
temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
|
135
|
+
temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
|
116
136
|
tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
117
137
|
tools/bad_pdf_eval/analyser.py,sha256=oqSTo3NLyignp_XdCO9_SRCUUXMU8lfgDavKYZYNxws,13690
|
118
138
|
tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
|
@@ -124,8 +144,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
124
144
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
125
145
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
126
146
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
127
|
-
natural_pdf-0.2.
|
128
|
-
natural_pdf-0.2.
|
129
|
-
natural_pdf-0.2.
|
130
|
-
natural_pdf-0.2.
|
131
|
-
natural_pdf-0.2.
|
147
|
+
natural_pdf-0.2.12.dist-info/METADATA,sha256=jRNM0JxYvPDuqzD63earjbaUwQgXCjPYPLC5pLl49Uk,6960
|
148
|
+
natural_pdf-0.2.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
149
|
+
natural_pdf-0.2.12.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
150
|
+
natural_pdf-0.2.12.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
|
151
|
+
natural_pdf-0.2.12.dist-info/RECORD,,
|
@@ -0,0 +1,42 @@
|
|
"""Debug cell text extraction with exclusions.

Builds a guide grid on the first page of ``pdfs/m27.pdf`` and prints, for
the topmost cells, the text extracted with and without exclusions, then
prints the first row of the table extracted through the same guides.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Register a PDF-level exclusion labelled "header": the region returned by
# .above() on the "PREMISE" text match.
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")

# Resolve the exclusion regions for this page with debug=True.
print("Exclusions on page:")
exclusions = page._get_exclusion_regions(debug=True)

# Vertical guides come from the header texts, horizontal ones from stripes.
number_header = page.find(text="NUMBER")
header_band = number_header.right(include_source=True).expand(top=3, bottom=3)
headers = header_band.find_all('text')
guides = Guides(page)
guides.vertical.from_content(headers, align='left')
guides.horizontal.from_stripes()

# Build the grid and pull out the cell regions.
grid_result = guides.build_grid(include_outer_boundaries=True)
cells = grid_result["regions"]["cells"]

print(f"\nTotal cells: {len(cells)}")

# Keep the cells whose top edge (bbox[1]) is above y = 90; these should sit
# in the excluded area.
first_row_cells = []
for candidate in cells:
    if candidate.bbox[1] < 90:
        first_row_cells.append(candidate)
print(f"\nFirst row cells: {len(first_row_cells)}")

for idx, cell in enumerate(first_row_cells[:3]):
    print(f"\nCell {idx}:")
    print(f" Bbox: {cell.bbox}")
    print(f" Raw text: {cell.extract_text(apply_exclusions=False)!r}")
    print(f" With exclusions: {cell.extract_text(apply_exclusions=True)!r}")

# Full table extraction through the same guides.
print("\n\nFull table extraction:")
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
df = result.to_df()
print("\nFirst row of dataframe:")
if df.empty:
    print("Empty")
else:
    print(df.iloc[0].to_dict())
@@ -0,0 +1,43 @@
|
|
"""Debug how exclusions work with overlapping regions.

Compares text extraction for a hand-built cell region that straddles the
bottom edge of the header exclusion on the first page of ``pdfs/m27.pdf``.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Register the header exclusion.
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")

# Take the first resolved exclusion region.
exclusions = page._get_exclusion_regions()
excl_region = exclusions[0]
print(f"Exclusion region: {excl_region.bbox}")
print(f"Exclusion bottom: {excl_region.bbox[3]}")

# A test cell that overlaps the exclusion.
# Cell 1 from before: (32.06, 0.5, 73.18288, 79.53999999999996)
test_cell = page.region(32.06, 0.5, 73.18288, 79.53999999999996)

print(f"\nTest cell: {test_cell.bbox}")
print(f"Cell overlaps exclusion: top={test_cell.bbox[1]} < excl_bottom={excl_region.bbox[3]}")

print("\nText in different parts of the cell:")

# Slice above the exclusion line (should be empty).
upper_part = page.region(32.06, 0.5, 73.18288, 59.12)
upper_text = upper_part.extract_text(apply_exclusions=True)
print(f"Upper part (0.5 to 59.12): '{upper_text}'")

# Slice below the exclusion line (should have text).
lower_part = page.region(32.06, 59.12, 73.18288, 79.54)
lower_text = lower_part.extract_text()
print(f"Lower part (59.12 to 79.54): '{lower_text}'")

# The whole cell, with and without exclusions.
with_exclusions = test_cell.extract_text(apply_exclusions=True)
print(f"Whole cell with exclusions: '{with_exclusions}'")
without_exclusions = test_cell.extract_text(apply_exclusions=False)
print(f"Whole cell without exclusions: '{without_exclusions}'")

# List the first five text elements found inside the cell.
print("\nText elements in cell:")
cell_texts = test_cell.find_all('text')
for element in cell_texts[:5]:
    print(f" '{element.text}' at y={element.top:.2f}-{element.bottom:.2f}")
@@ -0,0 +1,67 @@
|
|
"""Debug why exclusions aren't working with guides.extract_table().

Registers a header and a footer exclusion on ``pdfs/m27.pdf``, prints the
resolved exclusion regions, builds a guide grid, and reports whether header
text still appears in the table extracted with ``apply_exclusions=True``.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Page text before any exclusion is registered.
print("Initial text:")
print(page.extract_text()[:200])
print()

# Add exclusions: the header region, and the "Page N of" footer match.
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
# Raw string: "\d" is not a valid escape in a normal string literal and
# triggers a DeprecationWarning/SyntaxWarning. The selector text that
# reaches find() is unchanged.
pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)").expand())

# Page text once the exclusions are registered.
print("Text after exclusions:")
print(page.extract_text()[:100])
print()

# Dump the exclusion regions resolved for this page.
print("Checking exclusion regions:")
exclusions = page._get_exclusion_regions(debug=True)
print(f"\nTotal exclusions: {len(exclusions)}")
for i, exc in enumerate(exclusions):
    print(f" {i}: {exc.bbox}")
print()

# Vertical guides come from the header texts, horizontal ones from stripes.
headers = (
    page
    .find(text="NUMBER")
    .right(include_source=True)
    .expand(top=3, bottom=3)
    .find_all('text')
)

guides = Guides(page)
guides.vertical.from_content(headers, align='left')
guides.horizontal.from_stripes()

# Build the grid to see what regions are created.
print("\nBuilding grid...")
grid_result = guides.build_grid(include_outer_boundaries=True)
table_region = grid_result["regions"]["table"]
print(f"Table region: {table_region}")
print(f"Table bbox: {table_region.bbox if table_region else 'None'}")

# Check whether the table region itself respects exclusions.
if table_region:
    print("\nExtracting text from table region directly:")
    table_text = table_region.extract_text()[:200]
    print(f"Table text: {table_text}")

# Now extract the table through the guides.
print("\nExtracting table with apply_exclusions=True:")
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
df = result.to_df()
print(df.head())

# Check if excluded content leaked into the table.
table_str = df.to_string()
# "FEBR" is a prefix of "FEBRUARY 2014", so this one test covers both the
# full and the truncated form that the original checked separately.
has_feb = "FEBR" in table_str
has_alphabetic = "ALPHABETIC LISTING" in table_str
print(f"\nContains 'FEBRUARY': {has_feb}")
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
@@ -0,0 +1,41 @@
|
|
"""Debug the extra guide issue.

Builds vertical guides from headers[3:5] of ``pdfs/m27.pdf`` in three ways
(as a slice, one element at a time, and as plain text strings) and prints
the resulting guides for comparison.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Collect the header texts to the right of (and including) "NUMBER".
number_header = page.find("text:contains(NUMBER)")
header_band = number_header.right(include_source=True).expand(top=3, bottom=3)
headers = header_band.find_all('text')

print("Headers 3-5:")
for position, header in enumerate(headers[3:5]):
    print(f" {position}: '{header.text}' bbox={header.bbox}")

# Guides from just these two headers.
guides = Guides(page)
guides.vertical.from_content(headers[3:5], align='left', outer=False)

print(f"\nResulting guides: {guides.vertical}")
print("Expected: [328.32012, 539.63316]")

# Same call, but feeding each header on its own.
print("\nTesting each header individually:")
for position, header in enumerate(headers[3:5]):
    single = Guides(page)
    single.vertical.from_content([header], align='left', outer=False)
    print(f" Header {position} guides: {single.vertical}")

# Check whether the ElementCollection input matters by passing bare strings.
print("\nTesting with manual list of text:")
text_list = []
for header in headers[3:5]:
    text_list.append(header.text)
print(f"Text list: {text_list}")
guides2 = Guides(page)
guides2.vertical.from_content(text_list, align='left', outer=False)
print(f"Guides from text list: {guides2.vertical}")
@@ -0,0 +1,46 @@
|
|
"""Debug outer boundaries issue with exclusions.

Extracts the table from the first page of ``pdfs/m27.pdf`` with and without
``include_outer_boundaries`` and prints the shape and first cell of each.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Register the header exclusion.
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())

# Vertical guides come from the header texts, horizontal ones from stripes.
number_header = page.find(text="NUMBER")
header_band = number_header.right(include_source=True).expand(top=3, bottom=3)
headers = header_band.find_all('text')

guides = Guides(page)
guides.vertical.from_content(headers, align='left')
guides.horizontal.from_stripes()

print("Horizontal guides (sorted):")
for position, y in enumerate(sorted(guides.horizontal)):
    print(f" {position}: {y:.2f}")

print(f"\nFirst content guide: {sorted(guides.horizontal)[0]:.2f}")
print(f"Page height: {page.height}")

# Without outer boundaries.
print("\n\nWithout outer boundaries:")
result1 = guides.extract_table(include_outer_boundaries=False, apply_exclusions=True, header=False)
df1 = result1.to_df()
print(f"Shape: {df1.shape}")
first_cell_1 = "Empty" if df1.empty else df1.iloc[0, 0]
print("First row, first column:", first_cell_1)

# With outer boundaries.
print("\n\nWith outer boundaries:")
result2 = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
df2 = result2.to_df()
print(f"Shape: {df2.shape}")
first_cell_2 = "Empty" if df2.empty else df2.iloc[0, 0]
print("First row, first column:", first_cell_2)

# The issue: include_outer_boundaries adds guides at 0 and 612,
# which creates cells that span into the exclusion zone
temp/debug_st_search.py
ADDED
@@ -0,0 +1,33 @@
|
|
"""Debug searching for 'ST' text.

Prints the header element at index 4, what ``find``/``find_all`` return for
the selector ``text:contains("ST")``, and every text element whose x0 lies
strictly between 332 and 334 on the first page of ``pdfs/m27.pdf``.
"""
from natural_pdf import PDF

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# The original ST element is the header at index 4.
number_header = page.find("text:contains(NUMBER)")
header_band = number_header.right(include_source=True).expand(top=3, bottom=3)
headers = header_band.find_all('text')
original_st = headers[4]
print(f"Original 'ST' element: '{original_st.text}' at {original_st.bbox}")

# What a single find() for 'ST' returns.
found_st = page.find('text:contains("ST")')
print(f"\nFound 'ST' using find: '{found_st.text}' at {found_st.bbox}")

# Every element containing 'ST' (first ten shown).
all_st = page.find_all('text:contains("ST")')
print("\nAll elements containing 'ST':")
for position, match in enumerate(all_st[:10]):
    print(f" {position}: '{match.text}' at x={match.x0:.2f}, bbox={match.bbox}")

# Check what sits at position 332.88.
print("\nLooking for element at x≈332.88:")
all_text = page.find_all('text')
for candidate in all_text:
    if not (332 < candidate.x0 < 334):
        continue
    print(f" Found: '{candidate.text}' at x={candidate.x0:.5f}, bbox={candidate.bbox}")
@@ -0,0 +1,42 @@
|
|
"""Script to fix the remaining exclusion bugs in page.py

Rewrites the exclusion guards in page.py so they no longer also require
``self._exclusions`` to be truthy. Run as a script; an optional first
command-line argument overrides the default target path.
"""

import re
import sys

# Default target: the original hard-coded checkout location.
DEFAULT_PAGE_PATH = '/Users/soma/Development/natural-pdf/natural_pdf/core/page.py'

# (pattern, replacement) pairs. The leading (\s+) group requires whitespace
# before "if", so "elif ..." lines do not match, and is restored via \1.
_REWRITES = (
    # find() and find_all(): both use the same guard, and re.sub replaces
    # every occurrence, so one pass covers both methods.
    (
        r'(\s+)if apply_exclusions and self\._exclusions and results_collection:',
        r'\1if apply_exclusions and results_collection:',
    ),
    # get_elements()
    (
        r'(\s+)if apply_exclusions and self\._exclusions:',
        r'\1if apply_exclusions:',
    ),
)


def fix_exclusion_checks(content):
    """Return *content* with the ``self._exclusions`` guards removed.

    Args:
        content: Source text of page.py.

    Returns:
        The rewritten source text; text without the guards is returned
        unchanged.
    """
    for pattern, replacement in _REWRITES:
        content = re.sub(pattern, replacement, content)
    return content


def main(path=DEFAULT_PAGE_PATH):
    """Rewrite the file at *path* in place and print a summary."""
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    content = fix_exclusion_checks(content)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

    print("Fixed exclusion checks in page.py")
    print("- find() method: removed self._exclusions check")
    print("- find_all() method: removed self._exclusions check")
    print("- get_elements() method: removed self._exclusions check")


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PAGE_PATH)
@@ -0,0 +1,30 @@
|
|
"""Test exclusion with detailed debugging.

Prints text extracted with and without exclusions for the whole first page
of ``pdfs/m27.pdf`` and for two hand-built regions, then checks which page
object the cell region refers to.
"""
from natural_pdf import PDF

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Register the header exclusion.
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")


def _report_region(region):
    """Print the repr of *region*'s text without, then with, exclusions."""
    print("Without exclusions:", repr(region.extract_text(apply_exclusions=False)))
    print("With exclusions:", repr(region.extract_text(apply_exclusions=True)))


# First, verify the exclusion on the page itself (first 100 characters).
print("Page-level text extraction:")
print("Without exclusions:", page.extract_text(apply_exclusions=False)[:100])
print("With exclusions:", page.extract_text(apply_exclusions=True)[:100])

# A region that should lie inside the excluded area.
print("\n\nRegion in excluded area (0, 0, 200, 50):")
excluded_region = page.region(0, 0, 200, 50)
_report_region(excluded_region)

# The actual problematic cell region.
print("\n\nProblematic cell region (32.06, 0.5, 73.18, 79.54):")
cell_region = page.region(32.06, 0.5, 73.18288, 79.54)
_report_region(cell_region)

# Check if the region inherits the page.
print(f"\nCell region's page: {cell_region.page}")
print(f"Cell region's _page: {getattr(cell_region, '_page', 'Not found')}")
if hasattr(cell_region, 'page'):
    same_page = cell_region.page is page
else:
    same_page = 'N/A'
print(f"Same as original page: {same_page}")
@@ -0,0 +1,53 @@
|
|
"""Test that the find methods now work with PDF-level exclusions.

Prints a SUCCESS/FAILED line for find(), find_all() and get_elements() on
the first page of ``pdfs/m27.pdf`` after a header exclusion is registered.
"""

from natural_pdf import PDF

# Load a test PDF
pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Add PDF-level exclusion for the header
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())


def _report_counts(method_name, unfiltered, filtered):
    """Print both element counts and whether filtering removed anything."""
    print(f"Without exclusions: {len(unfiltered)} elements")
    print(f"With exclusions: {len(filtered)} elements")
    if len(filtered) >= len(unfiltered):
        print(f"❌ FAILED: {method_name} didn't exclude any elements")
    else:
        print(f"✅ SUCCESS: {method_name} excluded some elements")


# Test 1: find() should now exclude the header text
print("Test 1: find() with PDF-level exclusions")
result = page.find("text:contains(FEBRUARY)", apply_exclusions=True)
if result is not None:
    print(f"❌ FAILED: find() returned {result.text}")
else:
    print("✅ SUCCESS: find() correctly excluded header text")

# Test 2: find_all() should exclude header elements
print("\nTest 2: find_all() with PDF-level exclusions")
all_text = page.find_all("text", apply_exclusions=False)
filtered_text = page.find_all("text", apply_exclusions=True)
_report_counts("find_all()", all_text, filtered_text)

# Test 3: get_elements() should exclude header elements
print("\nTest 3: get_elements() with PDF-level exclusions")
all_elements = page.get_elements(apply_exclusions=False)
filtered_elements = page.get_elements(apply_exclusions=True)
_report_counts("get_elements()", all_elements, filtered_elements)

# Test that excluded text is not in the filtered results
print("\nChecking excluded text...")
excluded_texts = ["FEBRUARY", "ALPHABETIC LISTING", "M27"]
for text in excluded_texts:
    # Elements without a .text attribute count as "not containing" the text.
    found_in_filtered = any(
        hasattr(el, 'text') and text in str(el.text)
        for el in filtered_text
    )
    if found_in_filtered:
        print(f"❌ '{text}' still present in filtered results")
    else:
        print(f"✅ '{text}' correctly excluded")
@@ -0,0 +1,97 @@
|
|
"""Test that the find methods now work with PDF-level exclusions (without recursion).

Same checks as the lambda-based variant, but the exclusions are registered
as Region objects looked up with ``apply_exclusions=False``. Ends with a
table extraction through guides and a check for leaked header text.
"""

from natural_pdf import PDF

# Load a test PDF
pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Add PDF-level exclusion using a Region directly to avoid recursion
# First, get the header region without exclusions
header_element = page.find(text="PREMISE", apply_exclusions=False)
if not header_element:
    print("WARNING: Could not find PREMISE text for exclusion")
else:
    header_region = header_element.above()
    pdf.add_exclusion(header_region)


def _report_counts(method_name, unfiltered, filtered):
    """Print both element counts and whether filtering removed anything."""
    print(f"Without exclusions: {len(unfiltered)} elements")
    print(f"With exclusions: {len(filtered)} elements")
    if len(filtered) >= len(unfiltered):
        print(f"❌ FAILED: {method_name} didn't exclude any elements")
    else:
        print(f"✅ SUCCESS: {method_name} excluded some elements")


# Test 1: find() should now exclude the header text
print("Test 1: find() with PDF-level exclusions")
result = page.find("text:contains(FEBRUARY)", apply_exclusions=True)
if result is not None:
    print(f"❌ FAILED: find() returned {result.text}")
else:
    print("✅ SUCCESS: find() correctly excluded header text")

# Test 2: find_all() should exclude header elements
print("\nTest 2: find_all() with PDF-level exclusions")
all_text = page.find_all("text", apply_exclusions=False)
filtered_text = page.find_all("text", apply_exclusions=True)
_report_counts("find_all()", all_text, filtered_text)

# Test 3: get_elements() should exclude header elements
print("\nTest 3: get_elements() with PDF-level exclusions")
all_elements = page.get_elements(apply_exclusions=False)
filtered_elements = page.get_elements(apply_exclusions=True)
_report_counts("get_elements()", all_elements, filtered_elements)

# Test that excluded text is not in the filtered results
print("\nChecking excluded text...")
excluded_texts = ["FEBRUARY", "ALPHABETIC LISTING", "M27"]
for text in excluded_texts:
    # Elements without a .text attribute count as "not containing" the text.
    found_in_filtered = any(
        hasattr(el, 'text') and text in str(el.text)
        for el in filtered_text
    )
    if found_in_filtered:
        print(f"❌ '{text}' still present in filtered results")
    else:
        print(f"✅ '{text}' correctly excluded")

# Also test that the original table extraction issue is fixed
print("\n\nTesting original table extraction issue...")
from natural_pdf.analyzers.guides import Guides

# Add exclusion for footer too
footer_element = page.find("text:regex(Page \\d+ of)", apply_exclusions=False)
if footer_element:
    pdf.add_exclusion(footer_element.expand())

# Vertical guides come from the header texts, horizontal ones from stripes.
number_header = page.find(text="NUMBER", apply_exclusions=False)
header_band = number_header.right(include_source=True).expand(top=3, bottom=3)
headers = header_band.find_all('text')

guides = Guides(page)
guides.vertical.from_content(headers, align='left')
guides.horizontal.from_stripes()

result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
df = result.to_df()

# Check if excluded content is in the table
table_str = df.to_string()
has_feb = "FEBRUARY" in table_str
has_alphabetic = "ALPHABETIC LISTING" in table_str

print("\nTable extraction with exclusions:")
print(f"Contains 'FEBRUARY': {has_feb}")
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")

if has_feb or has_alphabetic:
    print("❌ FAILED: Exclusions not working in table extraction")
else:
    print("✅ SUCCESS: Table extraction correctly excludes header/footer!")
@@ -0,0 +1,48 @@
|
|
"""Test the fix with the actual PDF from the user's example.

Builds vertical guides from the header texts of ``pdfs/m27.pdf`` and checks
that building from headers[3:5] no longer yields an extra guide near
x = 332.88.
"""
import math

from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

# x position of the unwanted guide seen before the fix.
EXTRA_GUIDE_X = 332.88095999999996
# The expected guides (328.32 and 539.63) are more than 4 units away, so a
# small absolute tolerance cannot confuse them with the extra guide.
GUIDE_TOLERANCE = 1e-3

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Get headers using the user's exact code
headers = (
    page
    .find("text:contains(NUMBER)")
    .right(include_source=True)
    .expand(top=3, bottom=3)
    .find_all('text')
)

print("Headers found:")
for i, h in enumerate(headers):
    print(f" {i}: '{h.text}' at x={h.x0:.2f}")

# Create guides using ElementCollection
guides = Guides(page)
guides.vertical.from_content(headers, align='left', outer=False)

print(f"\nResulting vertical guides: {sorted(guides.vertical)}")

# Check specific headers that were problematic
print("\nChecking headers 3-5:")
for i, h in enumerate(headers[3:5], start=3):
    print(f" Header {i}: '{h.text}' at x={h.x0:.5f}")

# Test with just headers 3-5
guides2 = Guides(page)
guides2.vertical.from_content(headers[3:5], align='left', outer=False)

print(f"\nGuides from headers[3:5]: {guides2.vertical}")
print("Expected: [328.32012, 539.63316]")

# Verify the fix. Compare with a tolerance: an exact float membership test
# would report SUCCESS if the extra guide differed only in its last bits.
extra_guide_present = any(
    math.isclose(x, EXTRA_GUIDE_X, abs_tol=GUIDE_TOLERANCE)
    for x in guides2.vertical
)
if extra_guide_present:
    print("\n❌ FAILED: Extra guide at 332.88 is still present")
else:
    print("\n✅ SUCCESS: Extra guide at 332.88 is not present")

# Test that outer guides work correctly too
guides3 = Guides(page)
guides3.vertical.from_content(headers[3:5], align='left', outer=True)
print(f"\nWith outer=True: {guides3.vertical}")