natural-pdf 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@ natural_pdf/__init__.py,sha256=N4pR0LbuPEnUYFZqbdVqc_FGKldgwPQc1wjJhYKTBBM,3417
2
2
  natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
3
3
  natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
4
4
  natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
5
- natural_pdf/analyzers/guides.py,sha256=O3MaeVLgH5l1qbj2fpJCUsDKF44350CiUb_T1J6HTiQ,178025
5
+ natural_pdf/analyzers/guides.py,sha256=B2_Etb0o-lOku-FQw-T1Fo1qxbcAXT4FB0hdp-5kXRs,188171
6
6
  natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
7
7
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
8
8
  natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
@@ -27,7 +27,7 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
29
29
  natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
30
- natural_pdf/core/page.py,sha256=XmXii652iM-JVKgzpbKQ8f59U0TvDLD5iAfdtx92gis,152675
30
+ natural_pdf/core/page.py,sha256=Pid5hqVjcyX-gcCzxCJ62k6AQhNbUMNM_5QmEcylIjM,155264
31
31
  natural_pdf/core/page_collection.py,sha256=IjdFq9q0D0P6ZKWInf0H25rLzxfMb7RsUXucogkhNkU,63169
32
32
  natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
33
33
  natural_pdf/core/pdf.py,sha256=ovdeu9TRPnVYyMltD7QpcdcFYBLZFXh3LlfC5ifj6RY,104227
@@ -44,7 +44,7 @@ natural_pdf/elements/element_collection.py,sha256=idM_BUWEfbCJ5Sq0Ae_KfbVHy8TdkN
44
44
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
45
45
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
46
46
  natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
47
- natural_pdf/elements/region.py,sha256=hCpbKg0R5TGfWEskZ6P-o_ZXPKhU4keaYjWIVX0Y7F4,165244
47
+ natural_pdf/elements/region.py,sha256=HF6KzeuudO9upVLIrPsp3omcziLcILE3nnzl1a-LvK0,165400
48
48
  natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
49
49
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
50
50
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -107,12 +107,32 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
107
107
  natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
108
108
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
109
109
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
110
- natural_pdf-0.2.10.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
110
+ natural_pdf-0.2.12.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
111
111
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
112
112
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
113
113
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
114
114
  optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
115
115
  optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
116
+ temp/debug_cell_extraction.py,sha256=nE0Z470P40v8xZfWO1V3qgNaejs_pernEQaUOFeOJ1U,1527
117
+ temp/debug_exclusion_overlap.py,sha256=RptJXwqBXy5gsvMF037KEx1o2QgjwEDkMB6TD5aJdqA,1644
118
+ temp/debug_exclusions_guides.py,sha256=s8siep9te1KRJ2j0vH1tvDQnBlz7PKbHeCiYMrZL8jE,2096
119
+ temp/debug_extra_guide.py,sha256=95Tim-YnmAR4kICw2XDKVDvlW5WsjK_51cv5-EV11rc,1236
120
+ temp/debug_outer_boundaries.py,sha256=uJUJwojTxOU4VtbGUouuhV65IYzS6NDIVKxnS7o64nU,1456
121
+ temp/debug_st_search.py,sha256=F4c_mUVi_d5AKaKIpQ0AnW1amDqAwALoQQj7wZj--J0,1021
122
+ temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
123
+ temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
124
+ temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
125
+ temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
126
+ temp/test_fix_real_pdf.py,sha256=uuylxmpeAEbIix9wjl0Gri1sZlN61dBWTq6ZCyfvzF8,1454
127
+ temp/test_fix_working.py,sha256=-Ryre1rXYA2EG_lmPZGYEGi8yz0slhHEXPJMYexZW84,1750
128
+ temp/test_fixed_pdf_exclusions.py,sha256=Q5zxooKDvtTXo-dDsx3nsQw1ZVHX3TW47iZ_dXpFdrY,2168
129
+ temp/test_horizontal_top_bottom.py,sha256=Mb3tjt9Z3wOTpzFOgK7i0K-j-_ynNh4vDu2x1L3nu-s,2163
130
+ temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,1417
131
+ temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
132
+ temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
133
+ temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
134
+ temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
135
+ temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
116
136
  tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
117
137
  tools/bad_pdf_eval/analyser.py,sha256=oqSTo3NLyignp_XdCO9_SRCUUXMU8lfgDavKYZYNxws,13690
118
138
  tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
@@ -124,8 +144,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
124
144
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
125
145
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
126
146
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
127
- natural_pdf-0.2.10.dist-info/METADATA,sha256=DnjO1O3cHjhT-sXzLjIHQ286xqBAzeSJdPR50yol4D4,6960
128
- natural_pdf-0.2.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
129
- natural_pdf-0.2.10.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
130
- natural_pdf-0.2.10.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
131
- natural_pdf-0.2.10.dist-info/RECORD,,
147
+ natural_pdf-0.2.12.dist-info/METADATA,sha256=jRNM0JxYvPDuqzD63earjbaUwQgXCjPYPLC5pLl49Uk,6960
148
+ natural_pdf-0.2.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
149
+ natural_pdf-0.2.12.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
150
+ natural_pdf-0.2.12.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
151
+ natural_pdf-0.2.12.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  natural_pdf
2
2
  optimization
3
+ temp
3
4
  todo
4
5
  tools
@@ -0,0 +1,42 @@
1
+ """Debug cell text extraction with exclusions"""
2
+ from natural_pdf import PDF
3
+ from natural_pdf.analyzers.guides import Guides
4
+
5
+ pdf = PDF("pdfs/m27.pdf")
6
+ page = pdf.pages[0]
7
+
8
+ # Add exclusions
9
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
10
+
11
+ # Check exclusions are registered
12
+ print("Exclusions on page:")
13
+ exclusions = page._get_exclusion_regions(debug=True)
14
+
15
+ # Create guides and build grid
16
+ headers = page.find(text="NUMBER").right(include_source=True).expand(top=3, bottom=3).find_all('text')
17
+ guides = Guides(page)
18
+ guides.vertical.from_content(headers, align='left')
19
+ guides.horizontal.from_stripes()
20
+
21
+ # Build grid and get cells
22
+ grid_result = guides.build_grid(include_outer_boundaries=True)
23
+ cells = grid_result["regions"]["cells"]
24
+
25
+ print(f"\nTotal cells: {len(cells)}")
26
+
27
+ # Check first row cells (these should be in excluded area)
28
+ first_row_cells = [c for c in cells if c.bbox[1] < 90] # y < 90
29
+ print(f"\nFirst row cells: {len(first_row_cells)}")
30
+
31
+ for i, cell in enumerate(first_row_cells[:3]):
32
+ print(f"\nCell {i}:")
33
+ print(f" Bbox: {cell.bbox}")
34
+ print(f" Raw text: {repr(cell.extract_text(apply_exclusions=False))}")
35
+ print(f" With exclusions: {repr(cell.extract_text(apply_exclusions=True))}")
36
+
37
+ # Now test the full table extraction
38
+ print("\n\nFull table extraction:")
39
+ result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
40
+ df = result.to_df()
41
+ print("\nFirst row of dataframe:")
42
+ print(df.iloc[0].to_dict() if not df.empty else "Empty")
@@ -0,0 +1,43 @@
1
+ """Debug how exclusions work with overlapping regions"""
2
+ from natural_pdf import PDF
3
+ from natural_pdf.analyzers.guides import Guides
4
+
5
+ pdf = PDF("pdfs/m27.pdf")
6
+ page = pdf.pages[0]
7
+
8
+ # Add exclusion
9
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
10
+
11
+ # Get the exclusion region
12
+ exclusions = page._get_exclusion_regions()
13
+ excl_region = exclusions[0]
14
+ print(f"Exclusion region: {excl_region.bbox}")
15
+ print(f"Exclusion bottom: {excl_region.bbox[3]}")
16
+
17
+ # Create a test cell that overlaps the exclusion
18
+ # Cell 1 from before: (32.06, 0.5, 73.18288, 79.53999999999996)
19
+ test_cell = page.region(32.06, 0.5, 73.18288, 79.53999999999996)
20
+
21
+ print(f"\nTest cell: {test_cell.bbox}")
22
+ print(f"Cell overlaps exclusion: top={test_cell.bbox[1]} < excl_bottom={excl_region.bbox[3]}")
23
+
24
+ # Extract text from different y-ranges
25
+ print("\nText in different parts of the cell:")
26
+
27
+ # Part above exclusion line (should be empty)
28
+ upper_part = page.region(32.06, 0.5, 73.18288, 59.12)
29
+ print(f"Upper part (0.5 to 59.12): '{upper_part.extract_text(apply_exclusions=True)}'")
30
+
31
+ # Part below exclusion line (should have text)
32
+ lower_part = page.region(32.06, 59.12, 73.18288, 79.54)
33
+ print(f"Lower part (59.12 to 79.54): '{lower_part.extract_text()}'")
34
+
35
+ # The whole cell
36
+ print(f"Whole cell with exclusions: '{test_cell.extract_text(apply_exclusions=True)}'")
37
+ print(f"Whole cell without exclusions: '{test_cell.extract_text(apply_exclusions=False)}'")
38
+
39
+ # Check what text elements are in this region
40
+ print("\nText elements in cell:")
41
+ cell_texts = test_cell.find_all('text')
42
+ for t in cell_texts[:5]:
43
+ print(f" '{t.text}' at y={t.top:.2f}-{t.bottom:.2f}")
@@ -0,0 +1,67 @@
1
+ """Debug why exclusions aren't working with guides.extract_table()"""
2
+ from natural_pdf import PDF
3
+ from natural_pdf.analyzers.guides import Guides
4
+
5
+ pdf = PDF("pdfs/m27.pdf")
6
+ page = pdf.pages[0]
7
+
8
+ # Check initial text
9
+ print("Initial text:")
10
+ print(page.extract_text()[:200])
11
+ print()
12
+
13
+ # Add exclusions
14
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
15
+ pdf.add_exclusion(lambda page: page.find("text:regex(Page \d+ of)").expand())
16
+
17
+ # Check text after exclusions
18
+ print("Text after exclusions:")
19
+ print(page.extract_text()[:100])
20
+ print()
21
+
22
+ # Debug exclusion regions
23
+ print("Checking exclusion regions:")
24
+ exclusions = page._get_exclusion_regions(debug=True)
25
+ print(f"\nTotal exclusions: {len(exclusions)}")
26
+ for i, exc in enumerate(exclusions):
27
+ print(f" {i}: {exc.bbox}")
28
+ print()
29
+
30
+ # Create guides
31
+ headers = (
32
+ page
33
+ .find(text="NUMBER")
34
+ .right(include_source=True)
35
+ .expand(top=3, bottom=3)
36
+ .find_all('text')
37
+ )
38
+
39
+ guides = Guides(page)
40
+ guides.vertical.from_content(headers, align='left')
41
+ guides.horizontal.from_stripes()
42
+
43
+ # Build grid to see what regions are created
44
+ print("\nBuilding grid...")
45
+ grid_result = guides.build_grid(include_outer_boundaries=True)
46
+ table_region = grid_result["regions"]["table"]
47
+ print(f"Table region: {table_region}")
48
+ print(f"Table bbox: {table_region.bbox if table_region else 'None'}")
49
+
50
+ # Check if table region respects exclusions
51
+ if table_region:
52
+ print("\nExtracting text from table region directly:")
53
+ table_text = table_region.extract_text()[:200]
54
+ print(f"Table text: {table_text}")
55
+
56
+ # Now extract table
57
+ print("\nExtracting table with apply_exclusions=True:")
58
+ result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
59
+ df = result.to_df()
60
+ print(df.head())
61
+
62
+ # Check if excluded content is in the table
63
+ table_str = df.to_string()
64
+ has_feb = "FEBRUARY 2014" in table_str or "FEBR" in table_str
65
+ has_alphabetic = "ALPHABETIC LISTING" in table_str
66
+ print(f"\nContains 'FEBRUARY': {has_feb}")
67
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
@@ -0,0 +1,41 @@
1
+ """Debug the extra guide issue."""
2
+ from natural_pdf import PDF
3
+ from natural_pdf.analyzers.guides import Guides
4
+
5
+ pdf = PDF("pdfs/m27.pdf")
6
+ page = pdf.pages[0]
7
+
8
+ # Get headers
9
+ headers = (
10
+ page
11
+ .find("text:contains(NUMBER)")
12
+ .right(include_source=True)
13
+ .expand(top=3, bottom=3)
14
+ .find_all('text')
15
+ )
16
+
17
+ print("Headers 3-5:")
18
+ for i, h in enumerate(headers[3:5]):
19
+ print(f" {i}: '{h.text}' bbox={h.bbox}")
20
+
21
+ # Create guides with just these two headers
22
+ guides = Guides(page)
23
+ guides.vertical.from_content(headers[3:5], align='left', outer=False)
24
+
25
+ print(f"\nResulting guides: {guides.vertical}")
26
+ print(f"Expected: [328.32012, 539.63316]")
27
+
28
+ # Let's also check what happens with each header individually
29
+ print("\nTesting each header individually:")
30
+ for i, h in enumerate(headers[3:5]):
31
+ g = Guides(page)
32
+ g.vertical.from_content([h], align='left', outer=False)
33
+ print(f" Header {i} guides: {g.vertical}")
34
+
35
+ # Check if it's related to the ElementCollection
36
+ print("\nTesting with manual list of text:")
37
+ text_list = [h.text for h in headers[3:5]]
38
+ print(f"Text list: {text_list}")
39
+ guides2 = Guides(page)
40
+ guides2.vertical.from_content(text_list, align='left', outer=False)
41
+ print(f"Guides from text list: {guides2.vertical}")
@@ -0,0 +1,46 @@
1
+ """Debug outer boundaries issue with exclusions"""
2
+ from natural_pdf import PDF
3
+ from natural_pdf.analyzers.guides import Guides
4
+
5
+ pdf = PDF("pdfs/m27.pdf")
6
+ page = pdf.pages[0]
7
+
8
+ # Add exclusions
9
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
10
+
11
+ # Create guides
12
+ headers = (
13
+ page
14
+ .find(text="NUMBER")
15
+ .right(include_source=True)
16
+ .expand(top=3, bottom=3)
17
+ .find_all('text')
18
+ )
19
+
20
+ guides = Guides(page)
21
+ guides.vertical.from_content(headers, align='left')
22
+ guides.horizontal.from_stripes()
23
+
24
+ print("Horizontal guides (sorted):")
25
+ for i, h in enumerate(sorted(guides.horizontal)):
26
+ print(f" {i}: {h:.2f}")
27
+
28
+ print(f"\nFirst content guide: {sorted(guides.horizontal)[0]:.2f}")
29
+ print(f"Page height: {page.height}")
30
+
31
+ # Test without outer boundaries
32
+ print("\n\nWithout outer boundaries:")
33
+ result1 = guides.extract_table(include_outer_boundaries=False, apply_exclusions=True, header=False)
34
+ df1 = result1.to_df()
35
+ print(f"Shape: {df1.shape}")
36
+ print("First row, first column:", df1.iloc[0, 0] if not df1.empty else "Empty")
37
+
38
+ # Test with outer boundaries
39
+ print("\n\nWith outer boundaries:")
40
+ result2 = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
41
+ df2 = result2.to_df()
42
+ print(f"Shape: {df2.shape}")
43
+ print("First row, first column:", df2.iloc[0, 0] if not df2.empty else "Empty")
44
+
45
+ # The issue: include_outer_boundaries adds guides at 0 and 612,
46
+ # which creates cells that span into the exclusion zone
@@ -0,0 +1,33 @@
1
+ """Debug searching for 'ST' text."""
2
+ from natural_pdf import PDF
3
+
4
+ pdf = PDF("pdfs/m27.pdf")
5
+ page = pdf.pages[0]
6
+
7
+ # Get the original ST element
8
+ headers = (
9
+ page
10
+ .find("text:contains(NUMBER)")
11
+ .right(include_source=True)
12
+ .expand(top=3, bottom=3)
13
+ .find_all('text')
14
+ )
15
+ original_st = headers[4]
16
+ print(f"Original 'ST' element: '{original_st.text}' at {original_st.bbox}")
17
+
18
+ # Search for 'ST' using find
19
+ found_st = page.find('text:contains("ST")')
20
+ print(f"\nFound 'ST' using find: '{found_st.text}' at {found_st.bbox}")
21
+
22
+ # Find all elements containing 'ST'
23
+ all_st = page.find_all('text:contains("ST")')
24
+ print(f"\nAll elements containing 'ST':")
25
+ for i, elem in enumerate(all_st[:10]): # First 10
26
+ print(f" {i}: '{elem.text}' at x={elem.x0:.2f}, bbox={elem.bbox}")
27
+
28
+ # Check what's at position 332.88
29
+ print(f"\nLooking for element at x≈332.88:")
30
+ all_text = page.find_all('text')
31
+ for elem in all_text:
32
+ if 332 < elem.x0 < 334:
33
+ print(f" Found: '{elem.text}' at x={elem.x0:.5f}, bbox={elem.bbox}")
@@ -0,0 +1,42 @@
1
+ """Script to fix the remaining exclusion bugs in page.py"""
2
+
3
+ import re
4
+
5
+ # Read the file
6
+ with open('/Users/soma/Development/natural-pdf/natural_pdf/core/page.py', 'r') as f:
7
+ content = f.read()
8
+
9
+ # Fix 1: Line 1132 in find() method
10
+ # Change: if apply_exclusions and self._exclusions and results_collection:
11
+ # To: if apply_exclusions and results_collection:
12
+ content = re.sub(
13
+ r'(\s+)if apply_exclusions and self\._exclusions and results_collection:',
14
+ r'\1if apply_exclusions and results_collection:',
15
+ content
16
+ )
17
+
18
+ # Fix 2: Line 1227 in find_all() method
19
+ # Same change pattern
20
+ content = re.sub(
21
+ r'(\s+)if apply_exclusions and self\._exclusions and results_collection:',
22
+ r'\1if apply_exclusions and results_collection:',
23
+ content
24
+ )
25
+
26
+ # Fix 3: Line 1599 in get_elements() method
27
+ # Change: if apply_exclusions and self._exclusions:
28
+ # To: if apply_exclusions:
29
+ content = re.sub(
30
+ r'(\s+)if apply_exclusions and self\._exclusions:',
31
+ r'\1if apply_exclusions:',
32
+ content
33
+ )
34
+
35
+ # Write the fixed content back
36
+ with open('/Users/soma/Development/natural-pdf/natural_pdf/core/page.py', 'w') as f:
37
+ f.write(content)
38
+
39
+ print("Fixed exclusion checks in page.py")
40
+ print("- find() method: removed self._exclusions check")
41
+ print("- find_all() method: removed self._exclusions check")
42
+ print("- get_elements() method: removed self._exclusions check")
@@ -0,0 +1,30 @@
1
+ """Test exclusion with detailed debugging"""
2
+ from natural_pdf import PDF
3
+
4
+ pdf = PDF("pdfs/m27.pdf")
5
+ page = pdf.pages[0]
6
+
7
+ # Add exclusion
8
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
9
+
10
+ # First, verify the exclusion works on the page itself
11
+ print("Page-level text extraction:")
12
+ print("Without exclusions:", page.extract_text(apply_exclusions=False)[:100])
13
+ print("With exclusions:", page.extract_text(apply_exclusions=True)[:100])
14
+
15
+ # Now test on a specific region that should be excluded
16
+ print("\n\nRegion in excluded area (0, 0, 200, 50):")
17
+ excluded_region = page.region(0, 0, 200, 50)
18
+ print("Without exclusions:", repr(excluded_region.extract_text(apply_exclusions=False)))
19
+ print("With exclusions:", repr(excluded_region.extract_text(apply_exclusions=True)))
20
+
21
+ # Test the actual problematic cell region
22
+ print("\n\nProblematic cell region (32.06, 0.5, 73.18, 79.54):")
23
+ cell_region = page.region(32.06, 0.5, 73.18288, 79.54)
24
+ print("Without exclusions:", repr(cell_region.extract_text(apply_exclusions=False)))
25
+ print("With exclusions:", repr(cell_region.extract_text(apply_exclusions=True)))
26
+
27
+ # Check if the region inherits the page
28
+ print(f"\nCell region's page: {cell_region.page}")
29
+ print(f"Cell region's _page: {getattr(cell_region, '_page', 'Not found')}")
30
+ print(f"Same as original page: {cell_region.page is page if hasattr(cell_region, 'page') else 'N/A'}")
@@ -0,0 +1,53 @@
1
+ """Test that the find methods now work with PDF-level exclusions."""
2
+
3
+ from natural_pdf import PDF
4
+
5
+ # Load a test PDF
6
+ pdf = PDF("pdfs/m27.pdf")
7
+ page = pdf.pages[0]
8
+
9
+ # Add PDF-level exclusion for the header
10
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
11
+
12
+ # Test 1: find() should now exclude the header text
13
+ print("Test 1: find() with PDF-level exclusions")
14
+ result = page.find("text:contains(FEBRUARY)", apply_exclusions=True)
15
+ if result is None:
16
+ print("✅ SUCCESS: find() correctly excluded header text")
17
+ else:
18
+ print(f"❌ FAILED: find() returned {result.text}")
19
+
20
+ # Test 2: find_all() should exclude header elements
21
+ print("\nTest 2: find_all() with PDF-level exclusions")
22
+ all_text = page.find_all("text", apply_exclusions=False)
23
+ filtered_text = page.find_all("text", apply_exclusions=True)
24
+ print(f"Without exclusions: {len(all_text)} elements")
25
+ print(f"With exclusions: {len(filtered_text)} elements")
26
+ if len(filtered_text) < len(all_text):
27
+ print("✅ SUCCESS: find_all() excluded some elements")
28
+ else:
29
+ print("❌ FAILED: find_all() didn't exclude any elements")
30
+
31
+ # Test 3: get_elements() should exclude header elements
32
+ print("\nTest 3: get_elements() with PDF-level exclusions")
33
+ all_elements = page.get_elements(apply_exclusions=False)
34
+ filtered_elements = page.get_elements(apply_exclusions=True)
35
+ print(f"Without exclusions: {len(all_elements)} elements")
36
+ print(f"With exclusions: {len(filtered_elements)} elements")
37
+ if len(filtered_elements) < len(all_elements):
38
+ print("✅ SUCCESS: get_elements() excluded some elements")
39
+ else:
40
+ print("❌ FAILED: get_elements() didn't exclude any elements")
41
+
42
+ # Test that excluded text is not in the filtered results
43
+ print("\nChecking excluded text...")
44
+ excluded_texts = ["FEBRUARY", "ALPHABETIC LISTING", "M27"]
45
+ for text in excluded_texts:
46
+ found_in_filtered = any(
47
+ text in str(el.text) if hasattr(el, 'text') else False
48
+ for el in filtered_text
49
+ )
50
+ if not found_in_filtered:
51
+ print(f"✅ '{text}' correctly excluded")
52
+ else:
53
+ print(f"❌ '{text}' still present in filtered results")
@@ -0,0 +1,97 @@
1
+ """Test that the find methods now work with PDF-level exclusions (without recursion)."""
2
+
3
+ from natural_pdf import PDF
4
+
5
+ # Load a test PDF
6
+ pdf = PDF("pdfs/m27.pdf")
7
+ page = pdf.pages[0]
8
+
9
+ # Add PDF-level exclusion using a Region directly to avoid recursion
10
+ # First, get the header region without exclusions
11
+ header_element = page.find(text="PREMISE", apply_exclusions=False)
12
+ if header_element:
13
+ header_region = header_element.above()
14
+ pdf.add_exclusion(header_region)
15
+ else:
16
+ print("WARNING: Could not find PREMISE text for exclusion")
17
+
18
+ # Test 1: find() should now exclude the header text
19
+ print("Test 1: find() with PDF-level exclusions")
20
+ result = page.find("text:contains(FEBRUARY)", apply_exclusions=True)
21
+ if result is None:
22
+ print("✅ SUCCESS: find() correctly excluded header text")
23
+ else:
24
+ print(f"❌ FAILED: find() returned {result.text}")
25
+
26
+ # Test 2: find_all() should exclude header elements
27
+ print("\nTest 2: find_all() with PDF-level exclusions")
28
+ all_text = page.find_all("text", apply_exclusions=False)
29
+ filtered_text = page.find_all("text", apply_exclusions=True)
30
+ print(f"Without exclusions: {len(all_text)} elements")
31
+ print(f"With exclusions: {len(filtered_text)} elements")
32
+ if len(filtered_text) < len(all_text):
33
+ print("✅ SUCCESS: find_all() excluded some elements")
34
+ else:
35
+ print("❌ FAILED: find_all() didn't exclude any elements")
36
+
37
+ # Test 3: get_elements() should exclude header elements
38
+ print("\nTest 3: get_elements() with PDF-level exclusions")
39
+ all_elements = page.get_elements(apply_exclusions=False)
40
+ filtered_elements = page.get_elements(apply_exclusions=True)
41
+ print(f"Without exclusions: {len(all_elements)} elements")
42
+ print(f"With exclusions: {len(filtered_elements)} elements")
43
+ if len(filtered_elements) < len(all_elements):
44
+ print("✅ SUCCESS: get_elements() excluded some elements")
45
+ else:
46
+ print("❌ FAILED: get_elements() didn't exclude any elements")
47
+
48
+ # Test that excluded text is not in the filtered results
49
+ print("\nChecking excluded text...")
50
+ excluded_texts = ["FEBRUARY", "ALPHABETIC LISTING", "M27"]
51
+ for text in excluded_texts:
52
+ found_in_filtered = any(
53
+ text in str(el.text) if hasattr(el, 'text') else False
54
+ for el in filtered_text
55
+ )
56
+ if not found_in_filtered:
57
+ print(f"✅ '{text}' correctly excluded")
58
+ else:
59
+ print(f"❌ '{text}' still present in filtered results")
60
+
61
+ # Also test that the original table extraction issue is fixed
62
+ print("\n\nTesting original table extraction issue...")
63
+ from natural_pdf.analyzers.guides import Guides
64
+
65
+ # Add exclusion for footer too
66
+ footer_element = page.find("text:regex(Page \\d+ of)", apply_exclusions=False)
67
+ if footer_element:
68
+ pdf.add_exclusion(footer_element.expand())
69
+
70
+ headers = (
71
+ page
72
+ .find(text="NUMBER", apply_exclusions=False)
73
+ .right(include_source=True)
74
+ .expand(top=3, bottom=3)
75
+ .find_all('text')
76
+ )
77
+
78
+ guides = Guides(page)
79
+ guides.vertical.from_content(headers, align='left')
80
+ guides.horizontal.from_stripes()
81
+
82
+ result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
83
+ df = result.to_df()
84
+
85
+ # Check if excluded content is in the table
86
+ table_str = df.to_string()
87
+ has_feb = "FEBRUARY" in table_str
88
+ has_alphabetic = "ALPHABETIC LISTING" in table_str
89
+
90
+ print(f"\nTable extraction with exclusions:")
91
+ print(f"Contains 'FEBRUARY': {has_feb}")
92
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
93
+
94
+ if not has_feb and not has_alphabetic:
95
+ print("✅ SUCCESS: Table extraction correctly excludes header/footer!")
96
+ else:
97
+ print("❌ FAILED: Exclusions not working in table extraction")
@@ -0,0 +1,48 @@
1
+ """Test the fix with the actual PDF from the user's example."""
2
+ from natural_pdf import PDF
3
+ from natural_pdf.analyzers.guides import Guides
4
+
5
+ pdf = PDF("pdfs/m27.pdf")
6
+ page = pdf.pages[0]
7
+
8
+ # Get headers using the user's exact code
9
+ headers = (
10
+ page
11
+ .find("text:contains(NUMBER)")
12
+ .right(include_source=True)
13
+ .expand(top=3, bottom=3)
14
+ .find_all('text')
15
+ )
16
+
17
+ print("Headers found:")
18
+ for i, h in enumerate(headers):
19
+ print(f" {i}: '{h.text}' at x={h.x0:.2f}")
20
+
21
+ # Create guides using ElementCollection
22
+ guides = Guides(page)
23
+ guides.vertical.from_content(headers, align='left', outer=False)
24
+
25
+ print(f"\nResulting vertical guides: {sorted(guides.vertical)}")
26
+
27
+ # Check specific headers that were problematic
28
+ print("\nChecking headers 3-5:")
29
+ for i, h in enumerate(headers[3:5]):
30
+ print(f" Header {i+3}: '{h.text}' at x={h.x0:.5f}")
31
+
32
+ # Test with just headers 3-5
33
+ guides2 = Guides(page)
34
+ guides2.vertical.from_content(headers[3:5], align='left', outer=False)
35
+
36
+ print(f"\nGuides from headers[3:5]: {guides2.vertical}")
37
+ print(f"Expected: [328.32012, 539.63316]")
38
+
39
+ # Verify the fix
40
+ if 332.88095999999996 in guides2.vertical:
41
+ print("\n❌ FAILED: Extra guide at 332.88 is still present")
42
+ else:
43
+ print("\n✅ SUCCESS: Extra guide at 332.88 is not present")
44
+
45
+ # Test that outer guides work correctly too
46
+ guides3 = Guides(page)
47
+ guides3.vertical.from_content(headers[3:5], align='left', outer=True)
48
+ print(f"\nWith outer=True: {guides3.vertical}")