natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +159 -3
- natural_pdf/collections/mixins.py +16 -3
- natural_pdf/core/highlighting_service.py +33 -9
- natural_pdf/core/page.py +138 -7
- natural_pdf/core/page_collection.py +51 -14
- natural_pdf/core/page_groupby.py +229 -0
- natural_pdf/core/render_spec.py +62 -4
- natural_pdf/elements/base.py +102 -20
- natural_pdf/elements/element_collection.py +11 -10
- natural_pdf/elements/region.py +21 -21
- natural_pdf/elements/text.py +5 -0
- natural_pdf/extraction/manager.py +8 -14
- natural_pdf/extraction/mixin.py +35 -21
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +37 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/METADATA +2 -2
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/RECORD +23 -22
- optimization/performance_analysis.py +1 -1
- tools/bad_pdf_eval/analyser.py +1 -1
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/top_level.txt +0 -0
natural_pdf/tables/result.py
CHANGED
@@ -44,6 +44,7 @@ class TableResult(Sequence):
|
|
44
44
|
header: Union[str, int, List[int], None] = "first",
|
45
45
|
index_col=None,
|
46
46
|
skip_repeating_headers=None,
|
47
|
+
keep_blank: bool = False,
|
47
48
|
**kwargs,
|
48
49
|
):
|
49
50
|
"""Convert to *pandas* DataFrame.
|
@@ -52,11 +53,22 @@ class TableResult(Sequence):
|
|
52
53
|
----------
|
53
54
|
header : "first" | int | list[int] | None, default "first"
|
54
55
|
• "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
|
56
|
+
|
57
|
+
Note: If the header row has a different number of columns than the
|
58
|
+
body rows, the method will automatically fall back to header=None
|
59
|
+
to prevent pandas errors. This commonly occurs when headers are
|
60
|
+
merged into a single cell during PDF extraction.
|
61
|
+
|
55
62
|
index_col : same semantics as pandas, forwarded.
|
56
63
|
skip_repeating_headers : bool, optional
|
57
64
|
Whether to remove body rows that exactly match the header row(s).
|
58
65
|
Defaults to True when header is truthy, False otherwise.
|
59
66
|
Useful for PDFs where headers repeat throughout the table body.
|
67
|
+
keep_blank : bool, default False
|
68
|
+
Whether to preserve empty strings ('') as-is in the DataFrame.
|
69
|
+
When False (default), empty cells become pd.NA for better pandas integration
|
70
|
+
with numerical operations and missing data functions (.dropna(), .fillna(), etc.).
|
71
|
+
When True, empty strings are preserved as empty strings.
|
60
72
|
**kwargs : forwarded to :pyclass:`pandas.DataFrame`.
|
61
73
|
"""
|
62
74
|
try:
|
@@ -112,7 +124,32 @@ class TableResult(Sequence):
|
|
112
124
|
# Could add logging here if desired
|
113
125
|
pass
|
114
126
|
|
127
|
+
# Check for header/body column count mismatch and fallback to no header
|
128
|
+
if hdr is not None and body:
|
129
|
+
# Get the maximum number of columns from all body rows
|
130
|
+
# This handles cases where some rows have different column counts
|
131
|
+
max_cols = max(len(row) for row in body) if body else 0
|
132
|
+
|
133
|
+
# Check if header matches the maximum column count
|
134
|
+
header_cols = 0
|
135
|
+
if isinstance(hdr, list) and not isinstance(hdr[0], list):
|
136
|
+
# Single header row
|
137
|
+
header_cols = len(hdr)
|
138
|
+
elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
|
139
|
+
# Multi-row header - check first header row
|
140
|
+
header_cols = len(hdr[0])
|
141
|
+
|
142
|
+
if header_cols != max_cols:
|
143
|
+
# Column count mismatch - fallback to no header
|
144
|
+
hdr = None
|
145
|
+
body = self._rows # Use all rows as body
|
146
|
+
|
115
147
|
df = pd.DataFrame(body, columns=hdr)
|
148
|
+
|
149
|
+
# Convert empty strings to NaN by default
|
150
|
+
if not keep_blank:
|
151
|
+
df = df.replace("", pd.NA)
|
152
|
+
|
116
153
|
if index_col is not None and not df.empty:
|
117
154
|
df.set_index(
|
118
155
|
df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.2.1.dev0
|
3
|
+
Version: 0.2.3
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -14,7 +14,7 @@ License-File: LICENSE
|
|
14
14
|
Requires-Dist: scikit-learn
|
15
15
|
Requires-Dist: markdown
|
16
16
|
Requires-Dist: pandas
|
17
|
-
Requires-Dist: pdfplumber
|
17
|
+
Requires-Dist: pdfplumber>=0.11.7
|
18
18
|
Requires-Dist: colormath2
|
19
19
|
Requires-Dist: pillow
|
20
20
|
Requires-Dist: colour
|
@@ -2,7 +2,7 @@ natural_pdf/__init__.py,sha256=N4pR0LbuPEnUYFZqbdVqc_FGKldgwPQc1wjJhYKTBBM,3417
|
|
2
2
|
natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
|
3
3
|
natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
|
4
4
|
natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
|
5
|
-
natural_pdf/analyzers/guides.py,sha256=
|
5
|
+
natural_pdf/analyzers/guides.py,sha256=9FUbxk4XBOyktXgq9q5-bB949JFrzT1kBPikg2ENoIw,150032
|
6
6
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
|
7
7
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
8
8
|
natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
|
@@ -23,28 +23,29 @@ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kN
|
|
23
23
|
natural_pdf/classification/manager.py,sha256=BaqBL9GeMvYgoJsiQeI2J8aUKQ5Qxu_ELRvmCWquld8,22172
|
24
24
|
natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
|
25
25
|
natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
|
26
|
-
natural_pdf/collections/mixins.py,sha256=
|
26
|
+
natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
|
29
|
-
natural_pdf/core/highlighting_service.py,sha256=
|
30
|
-
natural_pdf/core/page.py,sha256=
|
31
|
-
natural_pdf/core/page_collection.py,sha256=
|
29
|
+
natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
|
30
|
+
natural_pdf/core/page.py,sha256=4-il2WPMVX4hNSgQ5P6yLc1-3jXfi73WCrpF9912ct4,142472
|
31
|
+
natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
|
32
|
+
natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
|
32
33
|
natural_pdf/core/pdf.py,sha256=q54DyhXwAS_zAmsBd3PsCezu1wyQOYmGmB3iKfP8gAM,101884
|
33
34
|
natural_pdf/core/pdf_collection.py,sha256=8tM0qVWS1L5Hwv5cXuZ2X8znAYOjKmlERX62bksDlJU,30144
|
34
|
-
natural_pdf/core/render_spec.py,sha256=
|
35
|
+
natural_pdf/core/render_spec.py,sha256=3GTfnlv8JKzePrruLq_dNr3HFeWMVcZT2fwWmJN44NI,14456
|
35
36
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
36
37
|
natural_pdf/describe/base.py,sha256=Of9WVo9XuShXoeyJr0RN2CpLhF_CeiOjazl-or53RKU,18173
|
37
38
|
natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
|
38
39
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
39
40
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
40
41
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
41
|
-
natural_pdf/elements/base.py,sha256=
|
42
|
-
natural_pdf/elements/element_collection.py,sha256=
|
42
|
+
natural_pdf/elements/base.py,sha256=xXdNV1_gt4T_V_4m6qJDieWiysvJxUBhSEEAJzMOzqo,55094
|
43
|
+
natural_pdf/elements/element_collection.py,sha256=slCUnOT04sNOTjSGgmhjcCKKPVPtdDPwU7PX1ebzGMw,101342
|
43
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
44
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
45
46
|
natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
|
46
|
-
natural_pdf/elements/region.py,sha256=
|
47
|
-
natural_pdf/elements/text.py,sha256=
|
47
|
+
natural_pdf/elements/region.py,sha256=Onok5VzmF1CvMCa3UGLUszCuhL-CCGk_IgtSUDva-Cc,155314
|
48
|
+
natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
|
48
49
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
49
50
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
50
51
|
natural_pdf/exporters/base.py,sha256=379sioW_hbkGb21sEVuJhbkkDO5MFsFtTUNO5TgG2YU,2101
|
@@ -56,8 +57,8 @@ natural_pdf/exporters/searchable_pdf.py,sha256=7RDNTV2jK5b5PhZz-v-kpYGTDCXu8FBgX
|
|
56
57
|
natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
57
58
|
natural_pdf/exporters/data/pdf.ttf,sha256=x4RUIJJaI9iO2DCmOVe4r4Wmao2vjZ_JDoQ2c7LvGlk,572
|
58
59
|
natural_pdf/exporters/data/sRGB.icc,sha256=KpLUuuRQt22LCqQhk9-XTXX2Jzjs6_dPAcXnWxKpV5Y,6922
|
59
|
-
natural_pdf/extraction/manager.py,sha256=
|
60
|
-
natural_pdf/extraction/mixin.py,sha256=
|
60
|
+
natural_pdf/extraction/manager.py,sha256=R-wGe9PGky6r4BTSUPMXf3N2l12kycku3GJKEd45eFU,4701
|
61
|
+
natural_pdf/extraction/mixin.py,sha256=dBcp96R8zMQqaRHiB8vpyad8GR89gv5RPXlr8Mt0ais,25427
|
61
62
|
natural_pdf/extraction/result.py,sha256=PDaCCN2LQBbHsZy0_lrQ0ROeMsnmH1WRoXWOjk9M2o4,1825
|
62
63
|
natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
|
63
64
|
natural_pdf/flows/collections.py,sha256=ErkHWdX6W_y1SjkcA_bGM0uUYRGPWWpRkHip6LHpej0,25740
|
@@ -84,9 +85,9 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
84
85
|
natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
|
85
86
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
86
87
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
87
|
-
natural_pdf/selectors/parser.py,sha256=
|
88
|
+
natural_pdf/selectors/parser.py,sha256=pw0M8ICKPMOzZPzWpLsQMG_lnl8PewGIdIG3ciukabk,38877
|
88
89
|
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
89
|
-
natural_pdf/tables/result.py,sha256=
|
90
|
+
natural_pdf/tables/result.py,sha256=1pcelNZvOb6Anlwj08Z1XU-YK1ihlCsLpYMRA3Zc4JM,7242
|
90
91
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
91
92
|
natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
|
92
93
|
natural_pdf/utils/bidi_mirror.py,sha256=jJEES0xDrMfo5Me8kHMxHv4COS51PitnYi2EvKv3HCE,1151
|
@@ -101,14 +102,14 @@ natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO
|
|
101
102
|
natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
|
102
103
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
103
104
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
104
|
-
natural_pdf-0.2.
|
105
|
+
natural_pdf-0.2.3.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
105
106
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
106
107
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
107
|
-
optimization/performance_analysis.py,sha256=
|
108
|
+
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
108
109
|
optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
|
109
110
|
optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
|
110
111
|
tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
111
|
-
tools/bad_pdf_eval/analyser.py,sha256=
|
112
|
+
tools/bad_pdf_eval/analyser.py,sha256=oqSTo3NLyignp_XdCO9_SRCUUXMU8lfgDavKYZYNxws,13690
|
112
113
|
tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
|
113
114
|
tools/bad_pdf_eval/compile_attempts_markdown.py,sha256=ArFDZaSa9dz0ez0lsNlbUSK4hbvB3___DlfwqPEAZpY,4359
|
114
115
|
tools/bad_pdf_eval/eval_suite.py,sha256=zcapsGwO-VJ2OupJnPYKbrkzvzdGdoh2DZPK19bfkQg,4450
|
@@ -118,8 +119,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
118
119
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
119
120
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
120
121
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
121
|
-
natural_pdf-0.2.
|
122
|
-
natural_pdf-0.2.
|
123
|
-
natural_pdf-0.2.
|
124
|
-
natural_pdf-0.2.
|
125
|
-
natural_pdf-0.2.
|
122
|
+
natural_pdf-0.2.3.dist-info/METADATA,sha256=lyx6Cx1xPGhy-p1m0wRfTvv4zSJ4ZJnNo7DeGQZ99yU,6959
|
123
|
+
natural_pdf-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
124
|
+
natural_pdf-0.2.3.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
125
|
+
natural_pdf-0.2.3.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
|
126
|
+
natural_pdf-0.2.3.dist-info/RECORD,,
|
tools/bad_pdf_eval/analyser.py
CHANGED
@@ -39,7 +39,7 @@ class BadPDFAnalyzer:
|
|
39
39
|
# ---------------------------------------------------------------------
|
40
40
|
def _save_page_image(self, page, page_num: int) -> Path:
|
41
41
|
"""Render and save page image as high-quality JPG."""
|
42
|
-
img: Image.Image = page.
|
42
|
+
img: Image.Image = page.render(resolution=self.resolution)
|
43
43
|
if img.mode != "RGB":
|
44
44
|
img = img.convert("RGB")
|
45
45
|
img_path = self.output_dir / f"page_{page_num:04d}.jpg"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|