doctra 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/engines/image_restoration/docres_engine.py +4 -4
- doctra/exporters/html_writer.py +206 -1
- doctra/parsers/enhanced_pdf_parser.py +107 -18
- doctra/parsers/structured_pdf_parser.py +52 -15
- doctra/parsers/table_chart_extractor.py +290 -290
- doctra/ui/app.py +39 -954
- doctra/ui/docres_ui.py +338 -0
- doctra/ui/docres_wrapper.py +120 -0
- doctra/ui/enhanced_parser_ui.py +483 -0
- doctra/ui/full_parse_ui.py +539 -0
- doctra/ui/tables_charts_ui.py +445 -0
- doctra/ui/ui_helpers.py +435 -0
- doctra/utils/progress.py +7 -7
- doctra/version.py +1 -1
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/METADATA +331 -74
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/RECORD +20 -13
- doctra-0.4.3.dist-info/entry_points.txt +2 -0
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/WHEEL +0 -0
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/top_level.txt +0 -0
@@ -1,291 +1,291 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import os
|
4
|
-
import sys
|
5
|
-
from typing import List, Dict, Any
|
6
|
-
from contextlib import ExitStack
|
7
|
-
from pathlib import Path
|
8
|
-
|
9
|
-
from PIL import Image
|
10
|
-
from tqdm import tqdm
|
11
|
-
|
12
|
-
from doctra.utils.pdf_io import render_pdf_to_images
|
13
|
-
from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
|
14
|
-
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
15
|
-
from doctra.engines.layout.layout_models import LayoutPage
|
16
|
-
|
17
|
-
from doctra.parsers.layout_order import reading_order_key
|
18
|
-
from doctra.exporters.image_saver import save_box_image
|
19
|
-
from doctra.utils.file_ops import ensure_output_dirs
|
20
|
-
|
21
|
-
from doctra.engines.vlm.service import VLMStructuredExtractor
|
22
|
-
from doctra.exporters.excel_writer import write_structured_excel
|
23
|
-
from doctra.utils.structured_utils import to_structured_dict
|
24
|
-
from doctra.exporters.markdown_table import render_markdown_table
|
25
|
-
from doctra.exporters.markdown_writer import write_markdown
|
26
|
-
from doctra.exporters.html_writer import write_structured_html
|
27
|
-
import json
|
28
|
-
|
29
|
-
|
30
|
-
class ChartTablePDFParser:
|
31
|
-
"""
|
32
|
-
Specialized PDF parser for extracting charts and tables.
|
33
|
-
|
34
|
-
Focuses specifically on chart and table extraction from PDF documents,
|
35
|
-
with optional VLM (Vision Language Model) processing to convert visual
|
36
|
-
elements into structured data.
|
37
|
-
|
38
|
-
:param extract_charts: Whether to extract charts from the document (default: True)
|
39
|
-
:param extract_tables: Whether to extract tables from the document (default: True)
|
40
|
-
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
41
|
-
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
42
|
-
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
43
|
-
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
44
|
-
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
45
|
-
:param dpi: DPI for PDF rendering (default: 200)
|
46
|
-
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
47
|
-
"""
|
48
|
-
|
49
|
-
def __init__(
|
50
|
-
self,
|
51
|
-
*,
|
52
|
-
extract_charts: bool = True,
|
53
|
-
extract_tables: bool = True,
|
54
|
-
use_vlm: bool = False,
|
55
|
-
vlm_provider: str = "gemini",
|
56
|
-
vlm_model: str | None = None,
|
57
|
-
vlm_api_key: str | None = None,
|
58
|
-
layout_model_name: str = "PP-DocLayout_plus-L",
|
59
|
-
dpi: int = 200,
|
60
|
-
min_score: float = 0.0,
|
61
|
-
):
|
62
|
-
"""
|
63
|
-
Initialize the ChartTablePDFParser with extraction configuration.
|
64
|
-
|
65
|
-
:param extract_charts: Whether to extract charts from the document (default: True)
|
66
|
-
:param extract_tables: Whether to extract tables from the document (default: True)
|
67
|
-
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
68
|
-
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
69
|
-
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
70
|
-
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
71
|
-
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
72
|
-
:param dpi: DPI for PDF rendering (default: 200)
|
73
|
-
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
74
|
-
"""
|
75
|
-
if not extract_charts and not extract_tables:
|
76
|
-
raise ValueError("At least one of extract_charts or extract_tables must be True")
|
77
|
-
|
78
|
-
self.extract_charts = extract_charts
|
79
|
-
self.extract_tables = extract_tables
|
80
|
-
self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
|
81
|
-
self.dpi = dpi
|
82
|
-
self.min_score = min_score
|
83
|
-
|
84
|
-
self.use_vlm = use_vlm
|
85
|
-
self.vlm = None
|
86
|
-
if self.use_vlm:
|
87
|
-
self.vlm = VLMStructuredExtractor(
|
88
|
-
vlm_provider=vlm_provider,
|
89
|
-
vlm_model=vlm_model,
|
90
|
-
api_key=vlm_api_key,
|
91
|
-
)
|
92
|
-
|
93
|
-
def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
|
94
|
-
"""
|
95
|
-
Parse a PDF document and extract charts and/or tables.
|
96
|
-
|
97
|
-
:param pdf_path: Path to the input PDF file
|
98
|
-
:param output_base_dir: Base directory for output files (default: "outputs")
|
99
|
-
:return: None
|
100
|
-
"""
|
101
|
-
pdf_name = Path(pdf_path).stem
|
102
|
-
out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
|
103
|
-
os.makedirs(out_dir, exist_ok=True)
|
104
|
-
|
105
|
-
charts_dir = None
|
106
|
-
tables_dir = None
|
107
|
-
|
108
|
-
if self.extract_charts:
|
109
|
-
charts_dir = os.path.join(out_dir, "charts")
|
110
|
-
os.makedirs(charts_dir, exist_ok=True)
|
111
|
-
|
112
|
-
if self.extract_tables:
|
113
|
-
tables_dir = os.path.join(out_dir, "tables")
|
114
|
-
os.makedirs(tables_dir, exist_ok=True)
|
115
|
-
|
116
|
-
pages: List[LayoutPage] = self.layout_engine.predict_pdf(
|
117
|
-
pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
|
118
|
-
)
|
119
|
-
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
120
|
-
|
121
|
-
target_labels = []
|
122
|
-
if self.extract_charts:
|
123
|
-
target_labels.append("chart")
|
124
|
-
if self.extract_tables:
|
125
|
-
target_labels.append("table")
|
126
|
-
|
127
|
-
chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages) if self.extract_charts else 0
|
128
|
-
table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages) if self.extract_tables else 0
|
129
|
-
|
130
|
-
if self.use_vlm:
|
131
|
-
md_lines: List[str] = ["# Extracted Charts and Tables\n"]
|
132
|
-
structured_items: List[Dict[str, Any]] = []
|
133
|
-
vlm_items: List[Dict[str, Any]] = []
|
134
|
-
|
135
|
-
charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
|
136
|
-
tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
|
137
|
-
|
138
|
-
chart_counter = 1
|
139
|
-
table_counter = 1
|
140
|
-
|
141
|
-
with ExitStack() as stack:
|
142
|
-
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
143
|
-
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
144
|
-
|
145
|
-
if is_notebook:
|
146
|
-
charts_bar = stack.enter_context(
|
147
|
-
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
148
|
-
tables_bar = stack.enter_context(
|
149
|
-
create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
|
150
|
-
else:
|
151
|
-
charts_bar = stack.enter_context(
|
152
|
-
create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
|
153
|
-
tables_bar = stack.enter_context(
|
154
|
-
create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
|
155
|
-
|
156
|
-
for p in pages:
|
157
|
-
page_num = p.page_index
|
158
|
-
page_img: Image.Image = pil_pages[page_num - 1]
|
159
|
-
|
160
|
-
target_items = [box for box in p.boxes if box.label in target_labels]
|
161
|
-
|
162
|
-
if target_items and self.use_vlm:
|
163
|
-
md_lines.append(f"\n## Page {page_num}\n")
|
164
|
-
|
165
|
-
for box in sorted(target_items, key=reading_order_key):
|
166
|
-
if box.label == "chart" and self.extract_charts:
|
167
|
-
chart_filename = f"chart_{chart_counter:03d}.png"
|
168
|
-
chart_path = os.path.join(charts_dir, chart_filename)
|
169
|
-
|
170
|
-
cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
|
171
|
-
cropped_img.save(chart_path)
|
172
|
-
|
173
|
-
if self.use_vlm and self.vlm:
|
174
|
-
rel_path = os.path.join("charts", chart_filename)
|
175
|
-
wrote_table = False
|
176
|
-
|
177
|
-
try:
|
178
|
-
extracted_chart = self.vlm.extract_chart(chart_path)
|
179
|
-
structured_item = to_structured_dict(extracted_chart)
|
180
|
-
if structured_item:
|
181
|
-
# Add page and type information to structured item
|
182
|
-
structured_item["page"] = page_num
|
183
|
-
structured_item["type"] = "Chart"
|
184
|
-
structured_items.append(structured_item)
|
185
|
-
vlm_items.append({
|
186
|
-
"kind": "chart",
|
187
|
-
"page": page_num,
|
188
|
-
"image_rel_path": rel_path,
|
189
|
-
"title": structured_item.get("title"),
|
190
|
-
"headers": structured_item.get("headers"),
|
191
|
-
"rows": structured_item.get("rows"),
|
192
|
-
})
|
193
|
-
md_lines.append(
|
194
|
-
render_markdown_table(
|
195
|
-
structured_item.get("headers"),
|
196
|
-
structured_item.get("rows"),
|
197
|
-
title=structured_item.get(
|
198
|
-
"title") or f"Chart {chart_counter} — page {page_num}"
|
199
|
-
)
|
200
|
-
)
|
201
|
-
wrote_table = True
|
202
|
-
except Exception:
|
203
|
-
pass
|
204
|
-
|
205
|
-
if not wrote_table:
|
206
|
-
md_lines.append(f"\n")
|
207
|
-
|
208
|
-
chart_counter += 1
|
209
|
-
if charts_bar:
|
210
|
-
charts_bar.update(1)
|
211
|
-
|
212
|
-
elif box.label == "table" and self.extract_tables:
|
213
|
-
table_filename = f"table_{table_counter:03d}.png"
|
214
|
-
table_path = os.path.join(tables_dir, table_filename)
|
215
|
-
|
216
|
-
cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
|
217
|
-
cropped_img.save(table_path)
|
218
|
-
|
219
|
-
if self.use_vlm and self.vlm:
|
220
|
-
rel_path = os.path.join("tables", table_filename)
|
221
|
-
wrote_table = False
|
222
|
-
|
223
|
-
try:
|
224
|
-
extracted_table = self.vlm.extract_table(table_path)
|
225
|
-
structured_item = to_structured_dict(extracted_table)
|
226
|
-
if structured_item:
|
227
|
-
# Add page and type information to structured item
|
228
|
-
structured_item["page"] = page_num
|
229
|
-
structured_item["type"] = "Table"
|
230
|
-
structured_items.append(structured_item)
|
231
|
-
vlm_items.append({
|
232
|
-
"kind": "table",
|
233
|
-
"page": page_num,
|
234
|
-
"image_rel_path": rel_path,
|
235
|
-
"title": structured_item.get("title"),
|
236
|
-
"headers": structured_item.get("headers"),
|
237
|
-
"rows": structured_item.get("rows"),
|
238
|
-
})
|
239
|
-
md_lines.append(
|
240
|
-
render_markdown_table(
|
241
|
-
structured_item.get("headers"),
|
242
|
-
structured_item.get("rows"),
|
243
|
-
title=structured_item.get(
|
244
|
-
"title") or f"Table {table_counter} — page {page_num}"
|
245
|
-
)
|
246
|
-
)
|
247
|
-
wrote_table = True
|
248
|
-
except Exception:
|
249
|
-
pass
|
250
|
-
|
251
|
-
if not wrote_table:
|
252
|
-
md_lines.append(f"\n")
|
253
|
-
|
254
|
-
table_counter += 1
|
255
|
-
if tables_bar:
|
256
|
-
tables_bar.update(1)
|
257
|
-
|
258
|
-
excel_path = None
|
259
|
-
|
260
|
-
if self.use_vlm:
|
261
|
-
|
262
|
-
if structured_items:
|
263
|
-
if self.extract_charts and self.extract_tables:
|
264
|
-
excel_filename = "parsed_tables_charts.xlsx"
|
265
|
-
elif self.extract_charts:
|
266
|
-
excel_filename = "parsed_charts.xlsx"
|
267
|
-
elif self.extract_tables:
|
268
|
-
excel_filename = "parsed_tables.xlsx"
|
269
|
-
else:
|
270
|
-
excel_filename = "parsed_data.xlsx" # fallback
|
271
|
-
|
272
|
-
|
273
|
-
excel_path = os.path.join(out_dir, excel_filename)
|
274
|
-
write_structured_excel(excel_path, structured_items)
|
275
|
-
|
276
|
-
html_filename = excel_filename.replace('.xlsx', '.html')
|
277
|
-
html_path = os.path.join(out_dir, html_filename)
|
278
|
-
write_structured_html(html_path, structured_items)
|
279
|
-
|
280
|
-
if 'vlm_items' in locals() and vlm_items:
|
281
|
-
with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
|
282
|
-
json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
|
283
|
-
|
284
|
-
extraction_types = []
|
285
|
-
if self.extract_charts:
|
286
|
-
extraction_types.append("charts")
|
287
|
-
if self.extract_tables:
|
288
|
-
extraction_types.append("tables")
|
289
|
-
|
290
|
-
print(f"✅ Parsing completed successfully!")
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import sys
|
5
|
+
from typing import List, Dict, Any
|
6
|
+
from contextlib import ExitStack
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
from PIL import Image
|
10
|
+
from tqdm import tqdm
|
11
|
+
|
12
|
+
from doctra.utils.pdf_io import render_pdf_to_images
|
13
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
|
14
|
+
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
15
|
+
from doctra.engines.layout.layout_models import LayoutPage
|
16
|
+
|
17
|
+
from doctra.parsers.layout_order import reading_order_key
|
18
|
+
from doctra.exporters.image_saver import save_box_image
|
19
|
+
from doctra.utils.file_ops import ensure_output_dirs
|
20
|
+
|
21
|
+
from doctra.engines.vlm.service import VLMStructuredExtractor
|
22
|
+
from doctra.exporters.excel_writer import write_structured_excel
|
23
|
+
from doctra.utils.structured_utils import to_structured_dict
|
24
|
+
from doctra.exporters.markdown_table import render_markdown_table
|
25
|
+
from doctra.exporters.markdown_writer import write_markdown
|
26
|
+
from doctra.exporters.html_writer import write_structured_html
|
27
|
+
import json
|
28
|
+
|
29
|
+
|
30
|
+
class ChartTablePDFParser:
|
31
|
+
"""
|
32
|
+
Specialized PDF parser for extracting charts and tables.
|
33
|
+
|
34
|
+
Focuses specifically on chart and table extraction from PDF documents,
|
35
|
+
with optional VLM (Vision Language Model) processing to convert visual
|
36
|
+
elements into structured data.
|
37
|
+
|
38
|
+
:param extract_charts: Whether to extract charts from the document (default: True)
|
39
|
+
:param extract_tables: Whether to extract tables from the document (default: True)
|
40
|
+
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
41
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
42
|
+
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
43
|
+
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
44
|
+
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
45
|
+
:param dpi: DPI for PDF rendering (default: 200)
|
46
|
+
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
47
|
+
"""
|
48
|
+
|
49
|
+
def __init__(
|
50
|
+
self,
|
51
|
+
*,
|
52
|
+
extract_charts: bool = True,
|
53
|
+
extract_tables: bool = True,
|
54
|
+
use_vlm: bool = False,
|
55
|
+
vlm_provider: str = "gemini",
|
56
|
+
vlm_model: str | None = None,
|
57
|
+
vlm_api_key: str | None = None,
|
58
|
+
layout_model_name: str = "PP-DocLayout_plus-L",
|
59
|
+
dpi: int = 200,
|
60
|
+
min_score: float = 0.0,
|
61
|
+
):
|
62
|
+
"""
|
63
|
+
Initialize the ChartTablePDFParser with extraction configuration.
|
64
|
+
|
65
|
+
:param extract_charts: Whether to extract charts from the document (default: True)
|
66
|
+
:param extract_tables: Whether to extract tables from the document (default: True)
|
67
|
+
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
68
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
69
|
+
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
70
|
+
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
71
|
+
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
72
|
+
:param dpi: DPI for PDF rendering (default: 200)
|
73
|
+
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
74
|
+
"""
|
75
|
+
if not extract_charts and not extract_tables:
|
76
|
+
raise ValueError("At least one of extract_charts or extract_tables must be True")
|
77
|
+
|
78
|
+
self.extract_charts = extract_charts
|
79
|
+
self.extract_tables = extract_tables
|
80
|
+
self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
|
81
|
+
self.dpi = dpi
|
82
|
+
self.min_score = min_score
|
83
|
+
|
84
|
+
self.use_vlm = use_vlm
|
85
|
+
self.vlm = None
|
86
|
+
if self.use_vlm:
|
87
|
+
self.vlm = VLMStructuredExtractor(
|
88
|
+
vlm_provider=vlm_provider,
|
89
|
+
vlm_model=vlm_model,
|
90
|
+
api_key=vlm_api_key,
|
91
|
+
)
|
92
|
+
|
93
|
+
def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
|
94
|
+
"""
|
95
|
+
Parse a PDF document and extract charts and/or tables.
|
96
|
+
|
97
|
+
:param pdf_path: Path to the input PDF file
|
98
|
+
:param output_base_dir: Base directory for output files (default: "outputs")
|
99
|
+
:return: None
|
100
|
+
"""
|
101
|
+
pdf_name = Path(pdf_path).stem
|
102
|
+
out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
|
103
|
+
os.makedirs(out_dir, exist_ok=True)
|
104
|
+
|
105
|
+
charts_dir = None
|
106
|
+
tables_dir = None
|
107
|
+
|
108
|
+
if self.extract_charts:
|
109
|
+
charts_dir = os.path.join(out_dir, "charts")
|
110
|
+
os.makedirs(charts_dir, exist_ok=True)
|
111
|
+
|
112
|
+
if self.extract_tables:
|
113
|
+
tables_dir = os.path.join(out_dir, "tables")
|
114
|
+
os.makedirs(tables_dir, exist_ok=True)
|
115
|
+
|
116
|
+
pages: List[LayoutPage] = self.layout_engine.predict_pdf(
|
117
|
+
pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
|
118
|
+
)
|
119
|
+
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
120
|
+
|
121
|
+
target_labels = []
|
122
|
+
if self.extract_charts:
|
123
|
+
target_labels.append("chart")
|
124
|
+
if self.extract_tables:
|
125
|
+
target_labels.append("table")
|
126
|
+
|
127
|
+
chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages) if self.extract_charts else 0
|
128
|
+
table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages) if self.extract_tables else 0
|
129
|
+
|
130
|
+
if self.use_vlm:
|
131
|
+
md_lines: List[str] = ["# Extracted Charts and Tables\n"]
|
132
|
+
structured_items: List[Dict[str, Any]] = []
|
133
|
+
vlm_items: List[Dict[str, Any]] = []
|
134
|
+
|
135
|
+
charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
|
136
|
+
tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
|
137
|
+
|
138
|
+
chart_counter = 1
|
139
|
+
table_counter = 1
|
140
|
+
|
141
|
+
with ExitStack() as stack:
|
142
|
+
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
143
|
+
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
144
|
+
|
145
|
+
if is_notebook:
|
146
|
+
charts_bar = stack.enter_context(
|
147
|
+
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
148
|
+
tables_bar = stack.enter_context(
|
149
|
+
create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
|
150
|
+
else:
|
151
|
+
charts_bar = stack.enter_context(
|
152
|
+
create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
|
153
|
+
tables_bar = stack.enter_context(
|
154
|
+
create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
|
155
|
+
|
156
|
+
for p in pages:
|
157
|
+
page_num = p.page_index
|
158
|
+
page_img: Image.Image = pil_pages[page_num - 1]
|
159
|
+
|
160
|
+
target_items = [box for box in p.boxes if box.label in target_labels]
|
161
|
+
|
162
|
+
if target_items and self.use_vlm:
|
163
|
+
md_lines.append(f"\n## Page {page_num}\n")
|
164
|
+
|
165
|
+
for box in sorted(target_items, key=reading_order_key):
|
166
|
+
if box.label == "chart" and self.extract_charts:
|
167
|
+
chart_filename = f"chart_{chart_counter:03d}.png"
|
168
|
+
chart_path = os.path.join(charts_dir, chart_filename)
|
169
|
+
|
170
|
+
cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
|
171
|
+
cropped_img.save(chart_path)
|
172
|
+
|
173
|
+
if self.use_vlm and self.vlm:
|
174
|
+
rel_path = os.path.join("charts", chart_filename)
|
175
|
+
wrote_table = False
|
176
|
+
|
177
|
+
try:
|
178
|
+
extracted_chart = self.vlm.extract_chart(chart_path)
|
179
|
+
structured_item = to_structured_dict(extracted_chart)
|
180
|
+
if structured_item:
|
181
|
+
# Add page and type information to structured item
|
182
|
+
structured_item["page"] = page_num
|
183
|
+
structured_item["type"] = "Chart"
|
184
|
+
structured_items.append(structured_item)
|
185
|
+
vlm_items.append({
|
186
|
+
"kind": "chart",
|
187
|
+
"page": page_num,
|
188
|
+
"image_rel_path": rel_path,
|
189
|
+
"title": structured_item.get("title"),
|
190
|
+
"headers": structured_item.get("headers"),
|
191
|
+
"rows": structured_item.get("rows"),
|
192
|
+
})
|
193
|
+
md_lines.append(
|
194
|
+
render_markdown_table(
|
195
|
+
structured_item.get("headers"),
|
196
|
+
structured_item.get("rows"),
|
197
|
+
title=structured_item.get(
|
198
|
+
"title") or f"Chart {chart_counter} — page {page_num}"
|
199
|
+
)
|
200
|
+
)
|
201
|
+
wrote_table = True
|
202
|
+
except Exception:
|
203
|
+
pass
|
204
|
+
|
205
|
+
if not wrote_table:
|
206
|
+
md_lines.append(f"\n")
|
207
|
+
|
208
|
+
chart_counter += 1
|
209
|
+
if charts_bar:
|
210
|
+
charts_bar.update(1)
|
211
|
+
|
212
|
+
elif box.label == "table" and self.extract_tables:
|
213
|
+
table_filename = f"table_{table_counter:03d}.png"
|
214
|
+
table_path = os.path.join(tables_dir, table_filename)
|
215
|
+
|
216
|
+
cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
|
217
|
+
cropped_img.save(table_path)
|
218
|
+
|
219
|
+
if self.use_vlm and self.vlm:
|
220
|
+
rel_path = os.path.join("tables", table_filename)
|
221
|
+
wrote_table = False
|
222
|
+
|
223
|
+
try:
|
224
|
+
extracted_table = self.vlm.extract_table(table_path)
|
225
|
+
structured_item = to_structured_dict(extracted_table)
|
226
|
+
if structured_item:
|
227
|
+
# Add page and type information to structured item
|
228
|
+
structured_item["page"] = page_num
|
229
|
+
structured_item["type"] = "Table"
|
230
|
+
structured_items.append(structured_item)
|
231
|
+
vlm_items.append({
|
232
|
+
"kind": "table",
|
233
|
+
"page": page_num,
|
234
|
+
"image_rel_path": rel_path,
|
235
|
+
"title": structured_item.get("title"),
|
236
|
+
"headers": structured_item.get("headers"),
|
237
|
+
"rows": structured_item.get("rows"),
|
238
|
+
})
|
239
|
+
md_lines.append(
|
240
|
+
render_markdown_table(
|
241
|
+
structured_item.get("headers"),
|
242
|
+
structured_item.get("rows"),
|
243
|
+
title=structured_item.get(
|
244
|
+
"title") or f"Table {table_counter} — page {page_num}"
|
245
|
+
)
|
246
|
+
)
|
247
|
+
wrote_table = True
|
248
|
+
except Exception:
|
249
|
+
pass
|
250
|
+
|
251
|
+
if not wrote_table:
|
252
|
+
md_lines.append(f"\n")
|
253
|
+
|
254
|
+
table_counter += 1
|
255
|
+
if tables_bar:
|
256
|
+
tables_bar.update(1)
|
257
|
+
|
258
|
+
excel_path = None
|
259
|
+
|
260
|
+
if self.use_vlm:
|
261
|
+
|
262
|
+
if structured_items:
|
263
|
+
if self.extract_charts and self.extract_tables:
|
264
|
+
excel_filename = "parsed_tables_charts.xlsx"
|
265
|
+
elif self.extract_charts:
|
266
|
+
excel_filename = "parsed_charts.xlsx"
|
267
|
+
elif self.extract_tables:
|
268
|
+
excel_filename = "parsed_tables.xlsx"
|
269
|
+
else:
|
270
|
+
excel_filename = "parsed_data.xlsx" # fallback
|
271
|
+
|
272
|
+
|
273
|
+
excel_path = os.path.join(out_dir, excel_filename)
|
274
|
+
write_structured_excel(excel_path, structured_items)
|
275
|
+
|
276
|
+
html_filename = excel_filename.replace('.xlsx', '.html')
|
277
|
+
html_path = os.path.join(out_dir, html_filename)
|
278
|
+
write_structured_html(html_path, structured_items)
|
279
|
+
|
280
|
+
if 'vlm_items' in locals() and vlm_items:
|
281
|
+
with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
|
282
|
+
json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
|
283
|
+
|
284
|
+
extraction_types = []
|
285
|
+
if self.extract_charts:
|
286
|
+
extraction_types.append("charts")
|
287
|
+
if self.extract_tables:
|
288
|
+
extraction_types.append("tables")
|
289
|
+
|
290
|
+
print(f"✅ Parsing completed successfully!")
|
291
291
|
print(f"📁 Output directory: {out_dir}")
|