doctra 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +21 -18
- doctra/cli/main.py +3 -0
- doctra/engines/layout/paddle_layout.py +11 -77
- doctra/engines/vlm/provider.py +85 -85
- doctra/engines/vlm/service.py +6 -13
- doctra/exporters/html_writer.py +1235 -0
- doctra/parsers/structured_pdf_parser.py +12 -7
- doctra/parsers/table_chart_extractor.py +47 -22
- doctra/ui/__init__.py +5 -0
- doctra/ui/app.py +1012 -0
- doctra/utils/progress.py +200 -49
- doctra/utils/structured_utils.py +49 -49
- doctra/version.py +1 -1
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/METADATA +38 -1
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/RECORD +18 -15
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/WHEEL +0 -0
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
|
|
20
20
|
from doctra.utils.structured_utils import to_structured_dict
|
21
21
|
from doctra.exporters.markdown_table import render_markdown_table
|
22
22
|
from doctra.exporters.markdown_writer import write_markdown
|
23
|
+
from doctra.exporters.html_writer import write_html, write_structured_html
|
23
24
|
from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
|
24
25
|
|
25
26
|
|
@@ -109,7 +110,7 @@ class StructuredPDFParser:
|
|
109
110
|
"""
|
110
111
|
# Extract filename without extension and create output directory
|
111
112
|
pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
112
|
-
out_dir = f"outputs/{pdf_filename}"
|
113
|
+
out_dir = f"outputs/{pdf_filename}/full_parse"
|
113
114
|
|
114
115
|
os.makedirs(out_dir, exist_ok=True)
|
115
116
|
ensure_output_dirs(out_dir, IMAGE_SUBDIRS)
|
@@ -182,7 +183,7 @@ class StructuredPDFParser:
|
|
182
183
|
title=item.get("title"))
|
183
184
|
)
|
184
185
|
wrote_table = True
|
185
|
-
except Exception:
|
186
|
+
except Exception as e:
|
186
187
|
pass
|
187
188
|
if not wrote_table:
|
188
189
|
md_lines.append(f"\n")
|
@@ -204,7 +205,7 @@ class StructuredPDFParser:
|
|
204
205
|
title=item.get("title"))
|
205
206
|
)
|
206
207
|
wrote_table = True
|
207
|
-
except Exception:
|
208
|
+
except Exception as e:
|
208
209
|
pass
|
209
210
|
if not wrote_table:
|
210
211
|
md_lines.append(f"\n")
|
@@ -218,15 +219,19 @@ class StructuredPDFParser:
|
|
218
219
|
md_lines.append(self.box_separator if self.box_separator else "")
|
219
220
|
|
220
221
|
md_path = write_markdown(md_lines, out_dir)
|
222
|
+
html_path = write_html(md_lines, out_dir)
|
223
|
+
|
221
224
|
excel_path = None
|
225
|
+
html_structured_path = None
|
222
226
|
if self.use_vlm and structured_items:
|
223
227
|
excel_path = os.path.join(out_dir, "tables.xlsx")
|
224
228
|
write_structured_excel(excel_path, structured_items)
|
229
|
+
html_structured_path = os.path.join(out_dir, "tables.html")
|
230
|
+
write_structured_html(html_structured_path, structured_items)
|
225
231
|
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
print(f"Parsing completed successfully.\n- Markdown: {md_path}")
|
232
|
+
# Print completion message with output directory
|
233
|
+
print(f"✅ Parsing completed successfully!")
|
234
|
+
print(f"📁 Output directory: {out_dir}")
|
230
235
|
|
231
236
|
def display_pages_with_boxes(self, pdf_path: str, num_pages: int = 3, cols: int = 2,
|
232
237
|
page_width: int = 800, spacing: int = 40, save_path: str = None) -> None:
|
@@ -23,6 +23,8 @@ from doctra.exporters.excel_writer import write_structured_excel
|
|
23
23
|
from doctra.utils.structured_utils import to_structured_dict
|
24
24
|
from doctra.exporters.markdown_table import render_markdown_table
|
25
25
|
from doctra.exporters.markdown_writer import write_markdown
|
26
|
+
from doctra.exporters.html_writer import write_structured_html
|
27
|
+
import json
|
26
28
|
|
27
29
|
|
28
30
|
class ChartTablePDFParser:
|
@@ -105,9 +107,9 @@ class ChartTablePDFParser:
|
|
105
107
|
:param output_base_dir: Base directory for output files (default: "outputs")
|
106
108
|
:return: None
|
107
109
|
"""
|
108
|
-
# Create output directory structure: outputs
|
110
|
+
# Create output directory structure: outputs/<filename>/structured_parsing/
|
109
111
|
pdf_name = Path(pdf_path).stem
|
110
|
-
out_dir = os.path.join(output_base_dir, pdf_name)
|
112
|
+
out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
|
111
113
|
os.makedirs(out_dir, exist_ok=True)
|
112
114
|
|
113
115
|
# Create subdirectories based on what we're extracting
|
@@ -142,6 +144,7 @@ class ChartTablePDFParser:
|
|
142
144
|
if self.use_vlm:
|
143
145
|
md_lines: List[str] = ["# Extracted Charts and Tables\n"]
|
144
146
|
structured_items: List[Dict[str, Any]] = []
|
147
|
+
vlm_items: List[Dict[str, Any]] = []
|
145
148
|
|
146
149
|
# Progress bar descriptions
|
147
150
|
charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
|
@@ -197,6 +200,14 @@ class ChartTablePDFParser:
|
|
197
200
|
structured_item = to_structured_dict(extracted_chart)
|
198
201
|
if structured_item:
|
199
202
|
structured_items.append(structured_item)
|
203
|
+
vlm_items.append({
|
204
|
+
"kind": "chart",
|
205
|
+
"page": page_num,
|
206
|
+
"image_rel_path": rel_path,
|
207
|
+
"title": structured_item.get("title"),
|
208
|
+
"headers": structured_item.get("headers"),
|
209
|
+
"rows": structured_item.get("rows"),
|
210
|
+
})
|
200
211
|
md_lines.append(
|
201
212
|
render_markdown_table(
|
202
213
|
structured_item.get("headers"),
|
@@ -235,6 +246,14 @@ class ChartTablePDFParser:
|
|
235
246
|
structured_item = to_structured_dict(extracted_table)
|
236
247
|
if structured_item:
|
237
248
|
structured_items.append(structured_item)
|
249
|
+
vlm_items.append({
|
250
|
+
"kind": "table",
|
251
|
+
"page": page_num,
|
252
|
+
"image_rel_path": rel_path,
|
253
|
+
"title": structured_item.get("title"),
|
254
|
+
"headers": structured_item.get("headers"),
|
255
|
+
"rows": structured_item.get("rows"),
|
256
|
+
})
|
238
257
|
md_lines.append(
|
239
258
|
render_markdown_table(
|
240
259
|
structured_item.get("headers"),
|
@@ -266,8 +285,29 @@ class ChartTablePDFParser:
|
|
266
285
|
|
267
286
|
# Write Excel file if we have structured data
|
268
287
|
if structured_items:
|
269
|
-
|
288
|
+
# Determine Excel filename based on extraction target
|
289
|
+
if self.extract_charts and self.extract_tables:
|
290
|
+
excel_filename = "parsed_tables_charts.xlsx"
|
291
|
+
elif self.extract_charts:
|
292
|
+
excel_filename = "parsed_charts.xlsx"
|
293
|
+
elif self.extract_tables:
|
294
|
+
excel_filename = "parsed_tables.xlsx"
|
295
|
+
else:
|
296
|
+
excel_filename = "parsed_data.xlsx" # fallback
|
297
|
+
|
298
|
+
|
299
|
+
excel_path = os.path.join(out_dir, excel_filename)
|
270
300
|
write_structured_excel(excel_path, structured_items)
|
301
|
+
|
302
|
+
# Also create HTML version
|
303
|
+
html_filename = excel_filename.replace('.xlsx', '.html')
|
304
|
+
html_path = os.path.join(out_dir, html_filename)
|
305
|
+
write_structured_html(html_path, structured_items)
|
306
|
+
|
307
|
+
# Write VLM items mapping for UI linkage
|
308
|
+
if 'vlm_items' in locals() and vlm_items:
|
309
|
+
with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
|
310
|
+
json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
|
271
311
|
|
272
312
|
# Print results
|
273
313
|
extraction_types = []
|
@@ -275,22 +315,7 @@ class ChartTablePDFParser:
|
|
275
315
|
extraction_types.append("charts")
|
276
316
|
if self.extract_tables:
|
277
317
|
extraction_types.append("tables")
|
278
|
-
|
279
|
-
|
280
|
-
print(f"
|
281
|
-
|
282
|
-
if charts_dir and self.extract_charts:
|
283
|
-
print(f"- Charts directory: {charts_dir}")
|
284
|
-
print(f"- Charts extracted: {chart_counter - 1}")
|
285
|
-
|
286
|
-
if tables_dir and self.extract_tables:
|
287
|
-
print(f"- Tables directory: {tables_dir}")
|
288
|
-
print(f"- Tables extracted: {table_counter - 1}")
|
289
|
-
|
290
|
-
if md_path:
|
291
|
-
print(f"- Markdown file: {md_path}")
|
292
|
-
if excel_path:
|
293
|
-
print(f"- Excel file: {excel_path}")
|
294
|
-
|
295
|
-
if not self.use_vlm:
|
296
|
-
print("- Note: VLM disabled - only cropped images saved")
|
318
|
+
|
319
|
+
# Print completion message with output directory
|
320
|
+
print(f"✅ Parsing completed successfully!")
|
321
|
+
print(f"📁 Output directory: {out_dir}")
|