doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,285 +1,291 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import sys
5
- from typing import List, Dict, Any
6
- from contextlib import ExitStack
7
- from pathlib import Path
8
-
9
- from PIL import Image
10
- from tqdm import tqdm
11
-
12
- from doctra.utils.pdf_io import render_pdf_to_images
13
- from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
14
- from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
15
- from doctra.engines.layout.layout_models import LayoutPage
16
-
17
- from doctra.parsers.layout_order import reading_order_key
18
- from doctra.exporters.image_saver import save_box_image
19
- from doctra.utils.file_ops import ensure_output_dirs
20
-
21
- from doctra.engines.vlm.service import VLMStructuredExtractor
22
- from doctra.exporters.excel_writer import write_structured_excel
23
- from doctra.utils.structured_utils import to_structured_dict
24
- from doctra.exporters.markdown_table import render_markdown_table
25
- from doctra.exporters.markdown_writer import write_markdown
26
- from doctra.exporters.html_writer import write_structured_html
27
- import json
28
-
29
-
30
- class ChartTablePDFParser:
31
- """
32
- Specialized PDF parser for extracting charts and tables.
33
-
34
- Focuses specifically on chart and table extraction from PDF documents,
35
- with optional VLM (Vision Language Model) processing to convert visual
36
- elements into structured data.
37
-
38
- :param extract_charts: Whether to extract charts from the document (default: True)
39
- :param extract_tables: Whether to extract tables from the document (default: True)
40
- :param use_vlm: Whether to use VLM for structured data extraction (default: False)
41
- :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
42
- :param vlm_model: Model name to use (defaults to provider-specific defaults)
43
- :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
44
- :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
45
- :param dpi: DPI for PDF rendering (default: 200)
46
- :param min_score: Minimum confidence score for layout detection (default: 0.0)
47
- """
48
-
49
- def __init__(
50
- self,
51
- *,
52
- extract_charts: bool = True,
53
- extract_tables: bool = True,
54
- use_vlm: bool = False,
55
- vlm_provider: str = "gemini",
56
- vlm_model: str | None = None,
57
- vlm_api_key: str | None = None,
58
- layout_model_name: str = "PP-DocLayout_plus-L",
59
- dpi: int = 200,
60
- min_score: float = 0.0,
61
- ):
62
- """
63
- Initialize the ChartTablePDFParser with extraction configuration.
64
-
65
- :param extract_charts: Whether to extract charts from the document (default: True)
66
- :param extract_tables: Whether to extract tables from the document (default: True)
67
- :param use_vlm: Whether to use VLM for structured data extraction (default: False)
68
- :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
69
- :param vlm_model: Model name to use (defaults to provider-specific defaults)
70
- :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
71
- :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
72
- :param dpi: DPI for PDF rendering (default: 200)
73
- :param min_score: Minimum confidence score for layout detection (default: 0.0)
74
- """
75
- if not extract_charts and not extract_tables:
76
- raise ValueError("At least one of extract_charts or extract_tables must be True")
77
-
78
- self.extract_charts = extract_charts
79
- self.extract_tables = extract_tables
80
- self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
81
- self.dpi = dpi
82
- self.min_score = min_score
83
-
84
- self.use_vlm = use_vlm
85
- self.vlm = None
86
- if self.use_vlm:
87
- self.vlm = VLMStructuredExtractor(
88
- vlm_provider=vlm_provider,
89
- vlm_model=vlm_model,
90
- api_key=vlm_api_key,
91
- )
92
-
93
- def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
94
- """
95
- Parse a PDF document and extract charts and/or tables.
96
-
97
- :param pdf_path: Path to the input PDF file
98
- :param output_base_dir: Base directory for output files (default: "outputs")
99
- :return: None
100
- """
101
- pdf_name = Path(pdf_path).stem
102
- out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
103
- os.makedirs(out_dir, exist_ok=True)
104
-
105
- charts_dir = None
106
- tables_dir = None
107
-
108
- if self.extract_charts:
109
- charts_dir = os.path.join(out_dir, "charts")
110
- os.makedirs(charts_dir, exist_ok=True)
111
-
112
- if self.extract_tables:
113
- tables_dir = os.path.join(out_dir, "tables")
114
- os.makedirs(tables_dir, exist_ok=True)
115
-
116
- pages: List[LayoutPage] = self.layout_engine.predict_pdf(
117
- pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
118
- )
119
- pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
120
-
121
- target_labels = []
122
- if self.extract_charts:
123
- target_labels.append("chart")
124
- if self.extract_tables:
125
- target_labels.append("table")
126
-
127
- chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages) if self.extract_charts else 0
128
- table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages) if self.extract_tables else 0
129
-
130
- if self.use_vlm:
131
- md_lines: List[str] = ["# Extracted Charts and Tables\n"]
132
- structured_items: List[Dict[str, Any]] = []
133
- vlm_items: List[Dict[str, Any]] = []
134
-
135
- charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
136
- tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
137
-
138
- chart_counter = 1
139
- table_counter = 1
140
-
141
- with ExitStack() as stack:
142
- is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
143
- is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
144
-
145
- if is_notebook:
146
- charts_bar = stack.enter_context(
147
- create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
148
- tables_bar = stack.enter_context(
149
- create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
150
- else:
151
- charts_bar = stack.enter_context(
152
- create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
153
- tables_bar = stack.enter_context(
154
- create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
155
-
156
- for p in pages:
157
- page_num = p.page_index
158
- page_img: Image.Image = pil_pages[page_num - 1]
159
-
160
- target_items = [box for box in p.boxes if box.label in target_labels]
161
-
162
- if target_items and self.use_vlm:
163
- md_lines.append(f"\n## Page {page_num}\n")
164
-
165
- for box in sorted(target_items, key=reading_order_key):
166
- if box.label == "chart" and self.extract_charts:
167
- chart_filename = f"chart_{chart_counter:03d}.png"
168
- chart_path = os.path.join(charts_dir, chart_filename)
169
-
170
- cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
171
- cropped_img.save(chart_path)
172
-
173
- if self.use_vlm and self.vlm:
174
- rel_path = os.path.join("charts", chart_filename)
175
- wrote_table = False
176
-
177
- try:
178
- extracted_chart = self.vlm.extract_chart(chart_path)
179
- structured_item = to_structured_dict(extracted_chart)
180
- if structured_item:
181
- structured_items.append(structured_item)
182
- vlm_items.append({
183
- "kind": "chart",
184
- "page": page_num,
185
- "image_rel_path": rel_path,
186
- "title": structured_item.get("title"),
187
- "headers": structured_item.get("headers"),
188
- "rows": structured_item.get("rows"),
189
- })
190
- md_lines.append(
191
- render_markdown_table(
192
- structured_item.get("headers"),
193
- structured_item.get("rows"),
194
- title=structured_item.get(
195
- "title") or f"Chart {chart_counter} — page {page_num}"
196
- )
197
- )
198
- wrote_table = True
199
- except Exception:
200
- pass
201
-
202
- if not wrote_table:
203
- md_lines.append(f"![Chart {chart_counter} — page {page_num}]({rel_path})\n")
204
-
205
- chart_counter += 1
206
- if charts_bar:
207
- charts_bar.update(1)
208
-
209
- elif box.label == "table" and self.extract_tables:
210
- table_filename = f"table_{table_counter:03d}.png"
211
- table_path = os.path.join(tables_dir, table_filename)
212
-
213
- cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
214
- cropped_img.save(table_path)
215
-
216
- if self.use_vlm and self.vlm:
217
- rel_path = os.path.join("tables", table_filename)
218
- wrote_table = False
219
-
220
- try:
221
- extracted_table = self.vlm.extract_table(table_path)
222
- structured_item = to_structured_dict(extracted_table)
223
- if structured_item:
224
- structured_items.append(structured_item)
225
- vlm_items.append({
226
- "kind": "table",
227
- "page": page_num,
228
- "image_rel_path": rel_path,
229
- "title": structured_item.get("title"),
230
- "headers": structured_item.get("headers"),
231
- "rows": structured_item.get("rows"),
232
- })
233
- md_lines.append(
234
- render_markdown_table(
235
- structured_item.get("headers"),
236
- structured_item.get("rows"),
237
- title=structured_item.get(
238
- "title") or f"Table {table_counter} — page {page_num}"
239
- )
240
- )
241
- wrote_table = True
242
- except Exception:
243
- pass
244
-
245
- if not wrote_table:
246
- md_lines.append(f"![Table {table_counter} — page {page_num}]({rel_path})\n")
247
-
248
- table_counter += 1
249
- if tables_bar:
250
- tables_bar.update(1)
251
-
252
- excel_path = None
253
-
254
- if self.use_vlm:
255
-
256
- if structured_items:
257
- if self.extract_charts and self.extract_tables:
258
- excel_filename = "parsed_tables_charts.xlsx"
259
- elif self.extract_charts:
260
- excel_filename = "parsed_charts.xlsx"
261
- elif self.extract_tables:
262
- excel_filename = "parsed_tables.xlsx"
263
- else:
264
- excel_filename = "parsed_data.xlsx" # fallback
265
-
266
-
267
- excel_path = os.path.join(out_dir, excel_filename)
268
- write_structured_excel(excel_path, structured_items)
269
-
270
- html_filename = excel_filename.replace('.xlsx', '.html')
271
- html_path = os.path.join(out_dir, html_filename)
272
- write_structured_html(html_path, structured_items)
273
-
274
- if 'vlm_items' in locals() and vlm_items:
275
- with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
276
- json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
277
-
278
- extraction_types = []
279
- if self.extract_charts:
280
- extraction_types.append("charts")
281
- if self.extract_tables:
282
- extraction_types.append("tables")
283
-
284
- print(f"✅ Parsing completed successfully!")
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from typing import List, Dict, Any
6
+ from contextlib import ExitStack
7
+ from pathlib import Path
8
+
9
+ from PIL import Image
10
+ from tqdm import tqdm
11
+
12
+ from doctra.utils.pdf_io import render_pdf_to_images
13
+ from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
14
+ from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
15
+ from doctra.engines.layout.layout_models import LayoutPage
16
+
17
+ from doctra.parsers.layout_order import reading_order_key
18
+ from doctra.exporters.image_saver import save_box_image
19
+ from doctra.utils.file_ops import ensure_output_dirs
20
+
21
+ from doctra.engines.vlm.service import VLMStructuredExtractor
22
+ from doctra.exporters.excel_writer import write_structured_excel
23
+ from doctra.utils.structured_utils import to_structured_dict
24
+ from doctra.exporters.markdown_table import render_markdown_table
25
+ from doctra.exporters.markdown_writer import write_markdown
26
+ from doctra.exporters.html_writer import write_structured_html
27
+ import json
28
+
29
+
30
class ChartTablePDFParser:
    """
    Specialized PDF parser for extracting charts and tables.

    Focuses specifically on chart and table extraction from PDF documents,
    with optional VLM (Vision Language Model) processing to convert visual
    elements into structured data.

    :param extract_charts: Whether to extract charts from the document (default: True)
    :param extract_tables: Whether to extract tables from the document (default: True)
    :param use_vlm: Whether to use VLM for structured data extraction (default: False)
    :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
    :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
    :param dpi: DPI for PDF rendering (default: 200)
    :param min_score: Minimum confidence score for layout detection (default: 0.0)
    """

    def __init__(
        self,
        *,
        extract_charts: bool = True,
        extract_tables: bool = True,
        use_vlm: bool = False,
        vlm_provider: str = "gemini",
        vlm_model: str | None = None,
        vlm_api_key: str | None = None,
        layout_model_name: str = "PP-DocLayout_plus-L",
        dpi: int = 200,
        min_score: float = 0.0,
    ):
        """
        Initialize the ChartTablePDFParser with extraction configuration.

        :param extract_charts: Whether to extract charts from the document (default: True)
        :param extract_tables: Whether to extract tables from the document (default: True)
        :param use_vlm: Whether to use VLM for structured data extraction (default: False)
        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
        :param vlm_model: Model name to use (defaults to provider-specific defaults)
        :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
        :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
        :param dpi: DPI for PDF rendering (default: 200)
        :param min_score: Minimum confidence score for layout detection (default: 0.0)
        :raises ValueError: If both extract_charts and extract_tables are False
        """
        # Validate before constructing the (potentially heavy) layout engine.
        if not extract_charts and not extract_tables:
            raise ValueError("At least one of extract_charts or extract_tables must be True")

        self.extract_charts = extract_charts
        self.extract_tables = extract_tables
        self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
        self.dpi = dpi
        self.min_score = min_score

        self.use_vlm = use_vlm
        self.vlm = None
        if self.use_vlm:
            self.vlm = VLMStructuredExtractor(
                vlm_provider=vlm_provider,
                vlm_model=vlm_model,
                api_key=vlm_api_key,
            )

    def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
        """
        Parse a PDF document and extract charts and/or tables.

        Cropped images are saved under
        ``<output_base_dir>/<pdf stem>/structured_parsing/{charts,tables}``.
        When VLM extraction is enabled and yields structured items, an Excel
        workbook, an HTML file and ``vlm_items.json`` are written next to them.

        :param pdf_path: Path to the input PDF file
        :param output_base_dir: Base directory for output files (default: "outputs")
        :return: None
        """
        pdf_name = Path(pdf_path).stem
        out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
        os.makedirs(out_dir, exist_ok=True)

        charts_dir = None
        tables_dir = None
        target_labels = []

        if self.extract_charts:
            charts_dir = os.path.join(out_dir, "charts")
            os.makedirs(charts_dir, exist_ok=True)
            target_labels.append("chart")

        if self.extract_tables:
            tables_dir = os.path.join(out_dir, "tables")
            os.makedirs(tables_dir, exist_ok=True)
            target_labels.append("table")

        pages: List[LayoutPage] = self.layout_engine.predict_pdf(
            pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
        )
        # Only the first element of each rendered tuple (the page image) is needed.
        pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]

        chart_count = self._count_boxes(pages, "chart") if self.extract_charts else 0
        table_count = self._count_boxes(pages, "table") if self.extract_tables else 0

        # Always initialised, so later code never has to probe locals().
        # NOTE(review): md_lines is accumulated but never written to disk in
        # this method - confirm whether a markdown export is intended.
        md_lines: List[str] = ["# Extracted Charts and Tables\n"]
        structured_items: List[Dict[str, Any]] = []
        vlm_items: List[Dict[str, Any]] = []

        charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
        tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"

        chart_counter = 1
        table_counter = 1

        # Notebook kernels get the notebook-friendly bar; everything else the terminal one.
        is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
        if is_notebook:
            make_bar = create_notebook_friendly_bar
            bar_kwargs: Dict[str, Any] = {}
        else:
            make_bar = create_beautiful_progress_bar
            bar_kwargs = {"leave": True}

        with ExitStack() as stack:
            # A bar is only created when there is something to count.
            charts_bar = (
                stack.enter_context(make_bar(total=chart_count, desc=charts_desc, **bar_kwargs))
                if chart_count else None
            )
            tables_bar = (
                stack.enter_context(make_bar(total=table_count, desc=tables_desc, **bar_kwargs))
                if table_count else None
            )

            for p in pages:
                page_num = p.page_index
                # NOTE(review): assumes page_index is 1-based and aligned with
                # the rendered page list - confirm against the layout engine.
                page_img: Image.Image = pil_pages[page_num - 1]

                target_items = [box for box in p.boxes if box.label in target_labels]

                if target_items and self.use_vlm:
                    md_lines.append(f"\n## Page {page_num}\n")

                for box in sorted(target_items, key=reading_order_key):
                    if box.label == "chart" and self.extract_charts:
                        self._process_box(
                            box,
                            kind="chart",
                            index=chart_counter,
                            page_num=page_num,
                            page_img=page_img,
                            out_subdir=charts_dir,
                            md_lines=md_lines,
                            structured_items=structured_items,
                            vlm_items=vlm_items,
                        )
                        chart_counter += 1
                        if charts_bar is not None:
                            charts_bar.update(1)

                    elif box.label == "table" and self.extract_tables:
                        self._process_box(
                            box,
                            kind="table",
                            index=table_counter,
                            page_num=page_num,
                            page_img=page_img,
                            out_subdir=tables_dir,
                            md_lines=md_lines,
                            structured_items=structured_items,
                            vlm_items=vlm_items,
                        )
                        table_counter += 1
                        if tables_bar is not None:
                            tables_bar.update(1)

        if self.use_vlm:
            if structured_items:
                excel_filename = self._excel_filename()
                write_structured_excel(os.path.join(out_dir, excel_filename), structured_items)

                # The HTML export shares the workbook's base name.
                html_filename = excel_filename.replace('.xlsx', '.html')
                write_structured_html(os.path.join(out_dir, html_filename), structured_items)

            if vlm_items:
                with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
                    json.dump(vlm_items, jf, ensure_ascii=False, indent=2)

        print("✅ Parsing completed successfully!")
        print(f"📁 Output directory: {out_dir}")

    @staticmethod
    def _count_boxes(pages, label: str) -> int:
        """Return how many boxes across all pages carry the given label."""
        return sum(1 for page in pages for box in page.boxes if box.label == label)

    def _excel_filename(self) -> str:
        """Return the workbook file name matching the enabled extraction types."""
        if self.extract_charts and self.extract_tables:
            return "parsed_tables_charts.xlsx"
        if self.extract_charts:
            return "parsed_charts.xlsx"
        if self.extract_tables:
            return "parsed_tables.xlsx"
        return "parsed_data.xlsx"  # fallback

    def _process_box(
        self,
        box,
        *,
        kind: str,
        index: int,
        page_num: int,
        page_img,
        out_subdir: str,
        md_lines: List[str],
        structured_items: List[Dict[str, Any]],
        vlm_items: List[Dict[str, Any]],
    ) -> None:
        """
        Crop one detected chart/table, save it, and optionally run VLM extraction.

        :param box: Layout box exposing x1, y1, x2, y2 coordinates
        :param kind: Either "chart" or "table"; selects file prefix and extractor
        :param index: Running number of this element within its kind
        :param page_num: Page number recorded with the extracted item
        :param page_img: Page image the box is cropped from
        :param out_subdir: Directory the cropped PNG is saved into
        :param md_lines: Markdown accumulator, appended to in place
        :param structured_items: Structured-item accumulator, appended to in place
        :param vlm_items: JSON-export accumulator, appended to in place
        :return: None
        """
        filename = f"{kind}_{index:03d}.png"
        image_path = os.path.join(out_subdir, filename)
        page_img.crop((box.x1, box.y1, box.x2, box.y2)).save(image_path)

        if not (self.use_vlm and self.vlm):
            return

        label = kind.capitalize()  # "Chart" / "Table"
        rel_path = os.path.join(f"{kind}s", filename)
        fallback_title = f"{label} {index} — page {page_num}"
        extract = self.vlm.extract_chart if kind == "chart" else self.vlm.extract_table

        wrote_table = False
        try:
            structured_item = to_structured_dict(extract(image_path))
            if structured_item:
                # Add page and type information to structured item
                structured_item["page"] = page_num
                structured_item["type"] = label
                structured_items.append(structured_item)
                vlm_items.append({
                    "kind": kind,
                    "page": page_num,
                    "image_rel_path": rel_path,
                    "title": structured_item.get("title"),
                    "headers": structured_item.get("headers"),
                    "rows": structured_item.get("rows"),
                })
                md_lines.append(
                    render_markdown_table(
                        structured_item.get("headers"),
                        structured_item.get("rows"),
                        title=structured_item.get("title") or fallback_title,
                    )
                )
                wrote_table = True
        except Exception as exc:
            # Best effort: one failing element must not abort the whole
            # document, but the failure is reported instead of being hidden.
            print(
                f"⚠️ VLM extraction failed for {kind} {index} (page {page_num}): {exc}",
                file=sys.stderr,
            )

        if not wrote_table:
            # Fall back to linking the cropped image.
            md_lines.append(f"![{fallback_title}]({rel_path})\n")