doctra 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +21 -18
- doctra/cli/main.py +5 -2
- doctra/cli/utils.py +12 -3
- doctra/engines/layout/paddle_layout.py +13 -78
- doctra/engines/vlm/provider.py +86 -58
- doctra/engines/vlm/service.py +10 -14
- doctra/exporters/html_writer.py +1235 -0
- doctra/parsers/structured_pdf_parser.py +35 -15
- doctra/parsers/table_chart_extractor.py +66 -28
- doctra/ui/__init__.py +5 -0
- doctra/ui/app.py +1012 -0
- doctra/utils/progress.py +428 -0
- doctra/utils/structured_utils.py +49 -49
- doctra/version.py +1 -1
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/METADATA +45 -6
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/RECORD +19 -15
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/WHEEL +0 -0
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
import os
|
3
3
|
import re
|
4
|
+
import sys
|
4
5
|
from typing import List, Dict, Any
|
5
6
|
from contextlib import ExitStack
|
6
7
|
from PIL import Image, ImageDraw, ImageFont
|
@@ -19,6 +20,8 @@ from doctra.exporters.excel_writer import write_structured_excel
|
|
19
20
|
from doctra.utils.structured_utils import to_structured_dict
|
20
21
|
from doctra.exporters.markdown_table import render_markdown_table
|
21
22
|
from doctra.exporters.markdown_writer import write_markdown
|
23
|
+
from doctra.exporters.html_writer import write_html, write_structured_html
|
24
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
|
22
25
|
|
23
26
|
|
24
27
|
class StructuredPDFParser:
|
@@ -30,7 +33,7 @@ class StructuredPDFParser:
|
|
30
33
|
converting visual elements into structured data.
|
31
34
|
|
32
35
|
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
33
|
-
:param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
|
36
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
34
37
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
35
38
|
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
36
39
|
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
@@ -66,7 +69,7 @@ class StructuredPDFParser:
|
|
66
69
|
the VLM service for comprehensive document processing.
|
67
70
|
|
68
71
|
:param use_vlm: Whether to use VLM for structured data extraction
|
69
|
-
:param vlm_provider: VLM provider to use ("gemini" or "openai")
|
72
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
|
70
73
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
71
74
|
:param vlm_api_key: API key for VLM provider
|
72
75
|
:param layout_model_name: Layout detection model name
|
@@ -107,7 +110,7 @@ class StructuredPDFParser:
|
|
107
110
|
"""
|
108
111
|
# Extract filename without extension and create output directory
|
109
112
|
pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
110
|
-
out_dir = f"outputs/{pdf_filename}"
|
113
|
+
out_dir = f"outputs/{pdf_filename}/full_parse"
|
111
114
|
|
112
115
|
os.makedirs(out_dir, exist_ok=True)
|
113
116
|
ensure_output_dirs(out_dir, IMAGE_SUBDIRS)
|
@@ -130,12 +133,25 @@ class StructuredPDFParser:
|
|
130
133
|
figures_desc = "Figures (cropped)"
|
131
134
|
|
132
135
|
with ExitStack() as stack:
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
136
|
+
# Enhanced environment detection
|
137
|
+
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
138
|
+
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
139
|
+
|
140
|
+
# Use appropriate progress bars based on environment
|
141
|
+
if is_notebook:
|
142
|
+
charts_bar = stack.enter_context(
|
143
|
+
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
144
|
+
tables_bar = stack.enter_context(
|
145
|
+
create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
|
146
|
+
figures_bar = stack.enter_context(
|
147
|
+
create_notebook_friendly_bar(total=fig_count, desc=figures_desc)) if fig_count else None
|
148
|
+
else:
|
149
|
+
charts_bar = stack.enter_context(
|
150
|
+
create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
|
151
|
+
tables_bar = stack.enter_context(
|
152
|
+
create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
|
153
|
+
figures_bar = stack.enter_context(
|
154
|
+
create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
|
139
155
|
|
140
156
|
for p in pages:
|
141
157
|
page_num = p.page_index
|
@@ -167,7 +183,7 @@ class StructuredPDFParser:
|
|
167
183
|
title=item.get("title"))
|
168
184
|
)
|
169
185
|
wrote_table = True
|
170
|
-
except Exception:
|
186
|
+
except Exception as e:
|
171
187
|
pass
|
172
188
|
if not wrote_table:
|
173
189
|
md_lines.append(f"\n")
|
@@ -189,7 +205,7 @@ class StructuredPDFParser:
|
|
189
205
|
title=item.get("title"))
|
190
206
|
)
|
191
207
|
wrote_table = True
|
192
|
-
except Exception:
|
208
|
+
except Exception as e:
|
193
209
|
pass
|
194
210
|
if not wrote_table:
|
195
211
|
md_lines.append(f"\n")
|
@@ -203,15 +219,19 @@ class StructuredPDFParser:
|
|
203
219
|
md_lines.append(self.box_separator if self.box_separator else "")
|
204
220
|
|
205
221
|
md_path = write_markdown(md_lines, out_dir)
|
222
|
+
html_path = write_html(md_lines, out_dir)
|
223
|
+
|
206
224
|
excel_path = None
|
225
|
+
html_structured_path = None
|
207
226
|
if self.use_vlm and structured_items:
|
208
227
|
excel_path = os.path.join(out_dir, "tables.xlsx")
|
209
228
|
write_structured_excel(excel_path, structured_items)
|
229
|
+
html_structured_path = os.path.join(out_dir, "tables.html")
|
230
|
+
write_structured_html(html_structured_path, structured_items)
|
210
231
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
print(f"Parsing completed successfully.\n- Markdown: {md_path}")
|
232
|
+
# Print completion message with output directory
|
233
|
+
print(f"✅ Parsing completed successfully!")
|
234
|
+
print(f"📁 Output directory: {out_dir}")
|
215
235
|
|
216
236
|
def display_pages_with_boxes(self, pdf_path: str, num_pages: int = 3, cols: int = 2,
|
217
237
|
page_width: int = 800, spacing: int = 40, save_path: str = None) -> None:
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import os
|
4
|
+
import sys
|
4
5
|
from typing import List, Dict, Any
|
5
6
|
from contextlib import ExitStack
|
6
7
|
from pathlib import Path
|
@@ -9,6 +10,7 @@ from PIL import Image
|
|
9
10
|
from tqdm import tqdm
|
10
11
|
|
11
12
|
from doctra.utils.pdf_io import render_pdf_to_images
|
13
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
|
12
14
|
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
13
15
|
from doctra.engines.layout.layout_models import LayoutPage
|
14
16
|
|
@@ -21,6 +23,8 @@ from doctra.exporters.excel_writer import write_structured_excel
|
|
21
23
|
from doctra.utils.structured_utils import to_structured_dict
|
22
24
|
from doctra.exporters.markdown_table import render_markdown_table
|
23
25
|
from doctra.exporters.markdown_writer import write_markdown
|
26
|
+
from doctra.exporters.html_writer import write_structured_html
|
27
|
+
import json
|
24
28
|
|
25
29
|
|
26
30
|
class ChartTablePDFParser:
|
@@ -34,7 +38,7 @@ class ChartTablePDFParser:
|
|
34
38
|
:param extract_charts: Whether to extract charts from the document (default: True)
|
35
39
|
:param extract_tables: Whether to extract tables from the document (default: True)
|
36
40
|
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
37
|
-
:param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
|
41
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
38
42
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
39
43
|
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
40
44
|
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
@@ -64,7 +68,7 @@ class ChartTablePDFParser:
|
|
64
68
|
:param extract_charts: Whether to extract charts from the document
|
65
69
|
:param extract_tables: Whether to extract tables from the document
|
66
70
|
:param use_vlm: Whether to use VLM for structured data extraction
|
67
|
-
:param vlm_provider: VLM provider to use ("gemini" or "openai")
|
71
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
|
68
72
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
69
73
|
:param vlm_api_key: API key for VLM provider
|
70
74
|
:param layout_model_name: Layout detection model name
|
@@ -103,9 +107,9 @@ class ChartTablePDFParser:
|
|
103
107
|
:param output_base_dir: Base directory for output files (default: "outputs")
|
104
108
|
:return: None
|
105
109
|
"""
|
106
|
-
# Create output directory structure: outputs
|
110
|
+
# Create output directory structure: outputs/<filename>/structured_parsing/
|
107
111
|
pdf_name = Path(pdf_path).stem
|
108
|
-
out_dir = os.path.join(output_base_dir, pdf_name)
|
112
|
+
out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
|
109
113
|
os.makedirs(out_dir, exist_ok=True)
|
110
114
|
|
111
115
|
# Create subdirectories based on what we're extracting
|
@@ -140,6 +144,7 @@ class ChartTablePDFParser:
|
|
140
144
|
if self.use_vlm:
|
141
145
|
md_lines: List[str] = ["# Extracted Charts and Tables\n"]
|
142
146
|
structured_items: List[Dict[str, Any]] = []
|
147
|
+
vlm_items: List[Dict[str, Any]] = []
|
143
148
|
|
144
149
|
# Progress bar descriptions
|
145
150
|
charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
|
@@ -149,10 +154,21 @@ class ChartTablePDFParser:
|
|
149
154
|
table_counter = 1
|
150
155
|
|
151
156
|
with ExitStack() as stack:
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
157
|
+
# Enhanced environment detection
|
158
|
+
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
159
|
+
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
160
|
+
|
161
|
+
# Use appropriate progress bars based on environment
|
162
|
+
if is_notebook:
|
163
|
+
charts_bar = stack.enter_context(
|
164
|
+
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
165
|
+
tables_bar = stack.enter_context(
|
166
|
+
create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
|
167
|
+
else:
|
168
|
+
charts_bar = stack.enter_context(
|
169
|
+
create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
|
170
|
+
tables_bar = stack.enter_context(
|
171
|
+
create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
|
156
172
|
|
157
173
|
for p in pages:
|
158
174
|
page_num = p.page_index
|
@@ -184,6 +200,14 @@ class ChartTablePDFParser:
|
|
184
200
|
structured_item = to_structured_dict(extracted_chart)
|
185
201
|
if structured_item:
|
186
202
|
structured_items.append(structured_item)
|
203
|
+
vlm_items.append({
|
204
|
+
"kind": "chart",
|
205
|
+
"page": page_num,
|
206
|
+
"image_rel_path": rel_path,
|
207
|
+
"title": structured_item.get("title"),
|
208
|
+
"headers": structured_item.get("headers"),
|
209
|
+
"rows": structured_item.get("rows"),
|
210
|
+
})
|
187
211
|
md_lines.append(
|
188
212
|
render_markdown_table(
|
189
213
|
structured_item.get("headers"),
|
@@ -222,6 +246,14 @@ class ChartTablePDFParser:
|
|
222
246
|
structured_item = to_structured_dict(extracted_table)
|
223
247
|
if structured_item:
|
224
248
|
structured_items.append(structured_item)
|
249
|
+
vlm_items.append({
|
250
|
+
"kind": "table",
|
251
|
+
"page": page_num,
|
252
|
+
"image_rel_path": rel_path,
|
253
|
+
"title": structured_item.get("title"),
|
254
|
+
"headers": structured_item.get("headers"),
|
255
|
+
"rows": structured_item.get("rows"),
|
256
|
+
})
|
225
257
|
md_lines.append(
|
226
258
|
render_markdown_table(
|
227
259
|
structured_item.get("headers"),
|
@@ -253,8 +285,29 @@ class ChartTablePDFParser:
|
|
253
285
|
|
254
286
|
# Write Excel file if we have structured data
|
255
287
|
if structured_items:
|
256
|
-
|
288
|
+
# Determine Excel filename based on extraction target
|
289
|
+
if self.extract_charts and self.extract_tables:
|
290
|
+
excel_filename = "parsed_tables_charts.xlsx"
|
291
|
+
elif self.extract_charts:
|
292
|
+
excel_filename = "parsed_charts.xlsx"
|
293
|
+
elif self.extract_tables:
|
294
|
+
excel_filename = "parsed_tables.xlsx"
|
295
|
+
else:
|
296
|
+
excel_filename = "parsed_data.xlsx" # fallback
|
297
|
+
|
298
|
+
|
299
|
+
excel_path = os.path.join(out_dir, excel_filename)
|
257
300
|
write_structured_excel(excel_path, structured_items)
|
301
|
+
|
302
|
+
# Also create HTML version
|
303
|
+
html_filename = excel_filename.replace('.xlsx', '.html')
|
304
|
+
html_path = os.path.join(out_dir, html_filename)
|
305
|
+
write_structured_html(html_path, structured_items)
|
306
|
+
|
307
|
+
# Write VLM items mapping for UI linkage
|
308
|
+
if 'vlm_items' in locals() and vlm_items:
|
309
|
+
with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
|
310
|
+
json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
|
258
311
|
|
259
312
|
# Print results
|
260
313
|
extraction_types = []
|
@@ -262,22 +315,7 @@ class ChartTablePDFParser:
|
|
262
315
|
extraction_types.append("charts")
|
263
316
|
if self.extract_tables:
|
264
317
|
extraction_types.append("tables")
|
265
|
-
|
266
|
-
|
267
|
-
print(f"
|
268
|
-
|
269
|
-
if charts_dir and self.extract_charts:
|
270
|
-
print(f"- Charts directory: {charts_dir}")
|
271
|
-
print(f"- Charts extracted: {chart_counter - 1}")
|
272
|
-
|
273
|
-
if tables_dir and self.extract_tables:
|
274
|
-
print(f"- Tables directory: {tables_dir}")
|
275
|
-
print(f"- Tables extracted: {table_counter - 1}")
|
276
|
-
|
277
|
-
if md_path:
|
278
|
-
print(f"- Markdown file: {md_path}")
|
279
|
-
if excel_path:
|
280
|
-
print(f"- Excel file: {excel_path}")
|
281
|
-
|
282
|
-
if not self.use_vlm:
|
283
|
-
print("- Note: VLM disabled - only cropped images saved")
|
318
|
+
|
319
|
+
# Print completion message with output directory
|
320
|
+
print(f"✅ Parsing completed successfully!")
|
321
|
+
print(f"📁 Output directory: {out_dir}")
|