natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -44,6 +44,7 @@ class TableResult(Sequence):
44
44
  header: Union[str, int, List[int], None] = "first",
45
45
  index_col=None,
46
46
  skip_repeating_headers=None,
47
+ keep_blank: bool = False,
47
48
  **kwargs,
48
49
  ):
49
50
  """Convert to *pandas* DataFrame.
@@ -52,11 +53,22 @@ class TableResult(Sequence):
52
53
  ----------
53
54
  header : "first" | int | list[int] | None, default "first"
54
55
  • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
56
+
57
+ Note: If the header row has a different number of columns than the
58
+ widest body row, the method automatically falls back to header=None
59
+ and keeps every row (including the header row) as data to prevent pandas
60
+ errors. This commonly occurs when headers are merged into a single cell during PDF extraction.
61
+
55
62
  index_col : same semantics as pandas, forwarded.
56
63
  skip_repeating_headers : bool, optional
57
64
  Whether to remove body rows that exactly match the header row(s).
58
65
  Defaults to True when header is truthy, False otherwise.
59
66
  Useful for PDFs where headers repeat throughout the table body.
67
+ keep_blank : bool, default False
68
+ Whether to preserve empty strings ('') as-is in the DataFrame.
69
+ When False (default), empty cells become pd.NA for better pandas integration
70
+ with numerical operations and missing data functions (.dropna(), .fillna(), etc.).
71
+ When True, empty strings are preserved as empty strings.
60
72
  **kwargs : forwarded to :pyclass:`pandas.DataFrame`.
61
73
  """
62
74
  try:
@@ -112,7 +124,32 @@ class TableResult(Sequence):
112
124
  # Could add logging here if desired
113
125
  pass
114
126
 
127
+ # Check for header/body column count mismatch and fallback to no header
128
+ if hdr is not None and body:
129
+ # Get the maximum number of columns from all body rows
130
+ # This handles cases where some rows have different column counts
131
+ max_cols = max(len(row) for row in body) if body else 0
132
+
133
+ # Check if header matches the maximum column count
134
+ header_cols = 0
135
+ if isinstance(hdr, list) and not isinstance(hdr[0], list):
136
+ # Single header row
137
+ header_cols = len(hdr)
138
+ elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
139
+ # Multi-row header - check first header row
140
+ header_cols = len(hdr[0])
141
+
142
+ if header_cols != max_cols:
143
+ # Column count mismatch - fallback to no header
144
+ hdr = None
145
+ body = self._rows # Use all rows as body
146
+
115
147
  df = pd.DataFrame(body, columns=hdr)
148
+
149
+ # Convert empty strings to pd.NA unless keep_blank is set
150
+ if not keep_blank:
151
+ df = df.replace("", pd.NA)
152
+
116
153
  if index_col is not None and not df.empty:
117
154
  df.set_index(
118
155
  df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.1.dev0
3
+ Version: 0.2.3
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -14,7 +14,7 @@ License-File: LICENSE
14
14
  Requires-Dist: scikit-learn
15
15
  Requires-Dist: markdown
16
16
  Requires-Dist: pandas
17
- Requires-Dist: pdfplumber
17
+ Requires-Dist: pdfplumber>=0.11.7
18
18
  Requires-Dist: colormath2
19
19
  Requires-Dist: pillow
20
20
  Requires-Dist: colour
@@ -2,7 +2,7 @@ natural_pdf/__init__.py,sha256=N4pR0LbuPEnUYFZqbdVqc_FGKldgwPQc1wjJhYKTBBM,3417
2
2
  natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
3
3
  natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
4
4
  natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
5
- natural_pdf/analyzers/guides.py,sha256=N8fetR3jrDXzeHtIlbxg8BEbthB_lS0L8yhzVXHqiGQ,143245
5
+ natural_pdf/analyzers/guides.py,sha256=9FUbxk4XBOyktXgq9q5-bB949JFrzT1kBPikg2ENoIw,150032
6
6
  natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
7
7
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
8
8
  natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
@@ -23,28 +23,29 @@ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kN
23
23
  natural_pdf/classification/manager.py,sha256=BaqBL9GeMvYgoJsiQeI2J8aUKQ5Qxu_ELRvmCWquld8,22172
24
24
  natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
25
25
  natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
26
- natural_pdf/collections/mixins.py,sha256=u4KtnlUZZYQ74e0OXAniOv9RtuA6FhwBxsLMJLjdbpQ,5169
26
+ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
29
- natural_pdf/core/highlighting_service.py,sha256=k_SMCINeK4aUwfQLmaiyipCPL8vv33ibrCyqtlni8Bc,67921
30
- natural_pdf/core/page.py,sha256=nQDUR4eKsUhPmEnofjmJRPITQ1RJoK3ITC0Lrtt4AHw,135510
31
- natural_pdf/core/page_collection.py,sha256=9ff7IfO04bUkJCBZv__Z9G8A-NY7mR3OujVl54lH-FE,50985
29
+ natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
30
+ natural_pdf/core/page.py,sha256=4-il2WPMVX4hNSgQ5P6yLc1-3jXfi73WCrpF9912ct4,142472
31
+ natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
32
+ natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
32
33
  natural_pdf/core/pdf.py,sha256=q54DyhXwAS_zAmsBd3PsCezu1wyQOYmGmB3iKfP8gAM,101884
33
34
  natural_pdf/core/pdf_collection.py,sha256=8tM0qVWS1L5Hwv5cXuZ2X8znAYOjKmlERX62bksDlJU,30144
34
- natural_pdf/core/render_spec.py,sha256=j77UrHA_g_e0RbAyn-4hkjFtqm_oaTe5KRd_Ii9izf4,12243
35
+ natural_pdf/core/render_spec.py,sha256=3GTfnlv8JKzePrruLq_dNr3HFeWMVcZT2fwWmJN44NI,14456
35
36
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
36
37
  natural_pdf/describe/base.py,sha256=Of9WVo9XuShXoeyJr0RN2CpLhF_CeiOjazl-or53RKU,18173
37
38
  natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
38
39
  natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
39
40
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
40
41
  natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
41
- natural_pdf/elements/base.py,sha256=jEBw5cq4mzgOYeEBrWPml2RBuVmOnwBNA4nTd7pLmMI,52292
42
- natural_pdf/elements/element_collection.py,sha256=av2YKTxEB5lHYqw1A6aYoN-Uef2qzT9z6ibBAbJMPo4,101322
42
+ natural_pdf/elements/base.py,sha256=xXdNV1_gt4T_V_4m6qJDieWiysvJxUBhSEEAJzMOzqo,55094
43
+ natural_pdf/elements/element_collection.py,sha256=slCUnOT04sNOTjSGgmhjcCKKPVPtdDPwU7PX1ebzGMw,101342
43
44
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
44
45
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
45
46
  natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
46
- natural_pdf/elements/region.py,sha256=PoT4e2s0gPkMa2Px0LjkThi-Jc8O0_ebl6U7UYADAQk,155289
47
- natural_pdf/elements/text.py,sha256=IyyU3G4F3OzNZ4Oo0BTK_Wq0p0xFj5EYBWNVL4SZ-BQ,20492
47
+ natural_pdf/elements/region.py,sha256=Onok5VzmF1CvMCa3UGLUszCuhL-CCGk_IgtSUDva-Cc,155314
48
+ natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
48
49
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
49
50
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
50
51
  natural_pdf/exporters/base.py,sha256=379sioW_hbkGb21sEVuJhbkkDO5MFsFtTUNO5TgG2YU,2101
@@ -56,8 +57,8 @@ natural_pdf/exporters/searchable_pdf.py,sha256=7RDNTV2jK5b5PhZz-v-kpYGTDCXu8FBgX
56
57
  natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
58
  natural_pdf/exporters/data/pdf.ttf,sha256=x4RUIJJaI9iO2DCmOVe4r4Wmao2vjZ_JDoQ2c7LvGlk,572
58
59
  natural_pdf/exporters/data/sRGB.icc,sha256=KpLUuuRQt22LCqQhk9-XTXX2Jzjs6_dPAcXnWxKpV5Y,6922
59
- natural_pdf/extraction/manager.py,sha256=sASPJZ5cWFsl8A4PyTjg2yqkyC00tRl6glfoFA6HcsM,4979
60
- natural_pdf/extraction/mixin.py,sha256=ck2e48BYZg5RNderNE0QST6RSn2D6mIZYBw91nMSgp8,24970
60
+ natural_pdf/extraction/manager.py,sha256=R-wGe9PGky6r4BTSUPMXf3N2l12kycku3GJKEd45eFU,4701
61
+ natural_pdf/extraction/mixin.py,sha256=dBcp96R8zMQqaRHiB8vpyad8GR89gv5RPXlr8Mt0ais,25427
61
62
  natural_pdf/extraction/result.py,sha256=PDaCCN2LQBbHsZy0_lrQ0ROeMsnmH1WRoXWOjk9M2o4,1825
62
63
  natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
63
64
  natural_pdf/flows/collections.py,sha256=ErkHWdX6W_y1SjkcA_bGM0uUYRGPWWpRkHip6LHpej0,25740
@@ -84,9 +85,9 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
84
85
  natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
85
86
  natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
86
87
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
87
- natural_pdf/selectors/parser.py,sha256=uWo0K4uWJFbD4kTXz9fOcPwEjs7cGR9Mfpm1jm7qKUM,38824
88
+ natural_pdf/selectors/parser.py,sha256=pw0M8ICKPMOzZPzWpLsQMG_lnl8PewGIdIG3ciukabk,38877
88
89
  natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
89
- natural_pdf/tables/result.py,sha256=lfhLs5OxZ2IRLNndb8zjOQBk1SPjHx4KePzI7GlRkMg,5478
90
+ natural_pdf/tables/result.py,sha256=1pcelNZvOb6Anlwj08Z1XU-YK1ihlCsLpYMRA3Zc4JM,7242
90
91
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
91
92
  natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
92
93
  natural_pdf/utils/bidi_mirror.py,sha256=jJEES0xDrMfo5Me8kHMxHv4COS51PitnYi2EvKv3HCE,1151
@@ -101,14 +102,14 @@ natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO
101
102
  natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
102
103
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
103
104
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
104
- natural_pdf-0.2.1.dev0.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
105
+ natural_pdf-0.2.3.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
105
106
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
106
107
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
107
- optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
108
+ optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
108
109
  optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
109
110
  optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
110
111
  tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
- tools/bad_pdf_eval/analyser.py,sha256=bKUT3muP3ESE5i1D8sGyAS5tMzFMcq-i-xD_ZeUxYhY,13692
112
+ tools/bad_pdf_eval/analyser.py,sha256=oqSTo3NLyignp_XdCO9_SRCUUXMU8lfgDavKYZYNxws,13690
112
113
  tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
113
114
  tools/bad_pdf_eval/compile_attempts_markdown.py,sha256=ArFDZaSa9dz0ez0lsNlbUSK4hbvB3___DlfwqPEAZpY,4359
114
115
  tools/bad_pdf_eval/eval_suite.py,sha256=zcapsGwO-VJ2OupJnPYKbrkzvzdGdoh2DZPK19bfkQg,4450
@@ -118,8 +119,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
118
119
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
119
120
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
120
121
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
121
- natural_pdf-0.2.1.dev0.dist-info/METADATA,sha256=A8hOXH7KhQgMTCKN0keud9u2m9V-_RnWPWjaSBo7Luc,6956
122
- natural_pdf-0.2.1.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
123
- natural_pdf-0.2.1.dev0.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
124
- natural_pdf-0.2.1.dev0.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
125
- natural_pdf-0.2.1.dev0.dist-info/RECORD,,
122
+ natural_pdf-0.2.3.dist-info/METADATA,sha256=lyx6Cx1xPGhy-p1m0wRfTvv4zSJ4ZJnNo7DeGQZ99yU,6959
123
+ natural_pdf-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
124
+ natural_pdf-0.2.3.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
125
+ natural_pdf-0.2.3.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
126
+ natural_pdf-0.2.3.dist-info/RECORD,,
@@ -211,7 +211,7 @@ class PDFPerformanceTester:
211
211
 
212
212
  for resolution in resolutions:
213
213
  try:
214
- img = page.to_image(resolution=resolution)
214
+ img = page.render(resolution=resolution)
215
215
 
216
216
  self.profiler.take_snapshot(
217
217
  f"image_{resolution}dpi_{i+1}",
@@ -39,7 +39,7 @@ class BadPDFAnalyzer:
39
39
  # ---------------------------------------------------------------------
40
40
  def _save_page_image(self, page, page_num: int) -> Path:
41
41
  """Render and save page image as high-quality JPG."""
42
- img: Image.Image = page.to_image(resolution=self.resolution)
42
+ img: Image.Image = page.render(resolution=self.resolution)
43
43
  if img.mode != "RGB":
44
44
  img = img.convert("RGB")
45
45
  img_path = self.output_dir / f"page_{page_num:04d}.jpg"