doctra 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
  import os
3
3
  import re
4
+ import sys
4
5
  from typing import List, Dict, Any
5
6
  from contextlib import ExitStack
6
7
  from PIL import Image, ImageDraw, ImageFont
@@ -19,6 +20,8 @@ from doctra.exporters.excel_writer import write_structured_excel
19
20
  from doctra.utils.structured_utils import to_structured_dict
20
21
  from doctra.exporters.markdown_table import render_markdown_table
21
22
  from doctra.exporters.markdown_writer import write_markdown
23
+ from doctra.exporters.html_writer import write_html, write_structured_html
24
+ from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
22
25
 
23
26
 
24
27
  class StructuredPDFParser:
@@ -30,7 +33,7 @@ class StructuredPDFParser:
30
33
  converting visual elements into structured data.
31
34
 
32
35
  :param use_vlm: Whether to use VLM for structured data extraction (default: False)
33
- :param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
36
+ :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
34
37
  :param vlm_model: Model name to use (defaults to provider-specific defaults)
35
38
  :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
36
39
  :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
@@ -66,7 +69,7 @@ class StructuredPDFParser:
66
69
  the VLM service for comprehensive document processing.
67
70
 
68
71
  :param use_vlm: Whether to use VLM for structured data extraction
69
- :param vlm_provider: VLM provider to use ("gemini" or "openai")
72
+ :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
70
73
  :param vlm_model: Model name to use (defaults to provider-specific defaults)
71
74
  :param vlm_api_key: API key for VLM provider
72
75
  :param layout_model_name: Layout detection model name
@@ -107,7 +110,7 @@ class StructuredPDFParser:
107
110
  """
108
111
  # Extract filename without extension and create output directory
109
112
  pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
110
- out_dir = f"outputs/{pdf_filename}"
113
+ out_dir = f"outputs/{pdf_filename}/full_parse"
111
114
 
112
115
  os.makedirs(out_dir, exist_ok=True)
113
116
  ensure_output_dirs(out_dir, IMAGE_SUBDIRS)
@@ -130,12 +133,25 @@ class StructuredPDFParser:
130
133
  figures_desc = "Figures (cropped)"
131
134
 
132
135
  with ExitStack() as stack:
133
- charts_bar = stack.enter_context(
134
- tqdm(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
135
- tables_bar = stack.enter_context(
136
- tqdm(total=table_count, desc=tables_desc, leave=True)) if table_count else None
137
- figures_bar = stack.enter_context(
138
- tqdm(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
136
+ # Enhanced environment detection
137
+ is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
138
+ is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
139
+
140
+ # Use appropriate progress bars based on environment
141
+ if is_notebook:
142
+ charts_bar = stack.enter_context(
143
+ create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
144
+ tables_bar = stack.enter_context(
145
+ create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
146
+ figures_bar = stack.enter_context(
147
+ create_notebook_friendly_bar(total=fig_count, desc=figures_desc)) if fig_count else None
148
+ else:
149
+ charts_bar = stack.enter_context(
150
+ create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
151
+ tables_bar = stack.enter_context(
152
+ create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
153
+ figures_bar = stack.enter_context(
154
+ create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
139
155
 
140
156
  for p in pages:
141
157
  page_num = p.page_index
@@ -167,7 +183,7 @@ class StructuredPDFParser:
167
183
  title=item.get("title"))
168
184
  )
169
185
  wrote_table = True
170
- except Exception:
186
+ except Exception as e:
171
187
  pass
172
188
  if not wrote_table:
173
189
  md_lines.append(f"![Chart — page {page_num}]({rel})\n")
@@ -189,7 +205,7 @@ class StructuredPDFParser:
189
205
  title=item.get("title"))
190
206
  )
191
207
  wrote_table = True
192
- except Exception:
208
+ except Exception as e:
193
209
  pass
194
210
  if not wrote_table:
195
211
  md_lines.append(f"![Table — page {page_num}]({rel})\n")
@@ -203,15 +219,19 @@ class StructuredPDFParser:
203
219
  md_lines.append(self.box_separator if self.box_separator else "")
204
220
 
205
221
  md_path = write_markdown(md_lines, out_dir)
222
+ html_path = write_html(md_lines, out_dir)
223
+
206
224
  excel_path = None
225
+ html_structured_path = None
207
226
  if self.use_vlm and structured_items:
208
227
  excel_path = os.path.join(out_dir, "tables.xlsx")
209
228
  write_structured_excel(excel_path, structured_items)
229
+ html_structured_path = os.path.join(out_dir, "tables.html")
230
+ write_structured_html(html_structured_path, structured_items)
210
231
 
211
- if excel_path:
212
- print(f"Parsing completed successfully.\n- Markdown: {md_path}\n- Excel: {excel_path}")
213
- else:
214
- print(f"Parsing completed successfully.\n- Markdown: {md_path}")
232
+ # Print completion message with output directory
233
+ print(f"Parsing completed successfully!")
234
+ print(f"📁 Output directory: {out_dir}")
215
235
 
216
236
  def display_pages_with_boxes(self, pdf_path: str, num_pages: int = 3, cols: int = 2,
217
237
  page_width: int = 800, spacing: int = 40, save_path: str = None) -> None:
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import os
4
+ import sys
4
5
  from typing import List, Dict, Any
5
6
  from contextlib import ExitStack
6
7
  from pathlib import Path
@@ -9,6 +10,7 @@ from PIL import Image
9
10
  from tqdm import tqdm
10
11
 
11
12
  from doctra.utils.pdf_io import render_pdf_to_images
13
+ from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
12
14
  from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
13
15
  from doctra.engines.layout.layout_models import LayoutPage
14
16
 
@@ -21,6 +23,8 @@ from doctra.exporters.excel_writer import write_structured_excel
21
23
  from doctra.utils.structured_utils import to_structured_dict
22
24
  from doctra.exporters.markdown_table import render_markdown_table
23
25
  from doctra.exporters.markdown_writer import write_markdown
26
+ from doctra.exporters.html_writer import write_structured_html
27
+ import json
24
28
 
25
29
 
26
30
  class ChartTablePDFParser:
@@ -34,7 +38,7 @@ class ChartTablePDFParser:
34
38
  :param extract_charts: Whether to extract charts from the document (default: True)
35
39
  :param extract_tables: Whether to extract tables from the document (default: True)
36
40
  :param use_vlm: Whether to use VLM for structured data extraction (default: False)
37
- :param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
41
+ :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
38
42
  :param vlm_model: Model name to use (defaults to provider-specific defaults)
39
43
  :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
40
44
  :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
@@ -64,7 +68,7 @@ class ChartTablePDFParser:
64
68
  :param extract_charts: Whether to extract charts from the document
65
69
  :param extract_tables: Whether to extract tables from the document
66
70
  :param use_vlm: Whether to use VLM for structured data extraction
67
- :param vlm_provider: VLM provider to use ("gemini" or "openai")
71
+ :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
68
72
  :param vlm_model: Model name to use (defaults to provider-specific defaults)
69
73
  :param vlm_api_key: API key for VLM provider
70
74
  :param layout_model_name: Layout detection model name
@@ -103,9 +107,9 @@ class ChartTablePDFParser:
103
107
  :param output_base_dir: Base directory for output files (default: "outputs")
104
108
  :return: None
105
109
  """
106
- # Create output directory structure: outputs/structured_doc/<filename>/
110
+ # Create output directory structure: outputs/<filename>/structured_parsing/
107
111
  pdf_name = Path(pdf_path).stem
108
- out_dir = os.path.join(output_base_dir, pdf_name)
112
+ out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
109
113
  os.makedirs(out_dir, exist_ok=True)
110
114
 
111
115
  # Create subdirectories based on what we're extracting
@@ -140,6 +144,7 @@ class ChartTablePDFParser:
140
144
  if self.use_vlm:
141
145
  md_lines: List[str] = ["# Extracted Charts and Tables\n"]
142
146
  structured_items: List[Dict[str, Any]] = []
147
+ vlm_items: List[Dict[str, Any]] = []
143
148
 
144
149
  # Progress bar descriptions
145
150
  charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
@@ -149,10 +154,21 @@ class ChartTablePDFParser:
149
154
  table_counter = 1
150
155
 
151
156
  with ExitStack() as stack:
152
- charts_bar = stack.enter_context(
153
- tqdm(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
154
- tables_bar = stack.enter_context(
155
- tqdm(total=table_count, desc=tables_desc, leave=True)) if table_count else None
157
+ # Enhanced environment detection
158
+ is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
159
+ is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
160
+
161
+ # Use appropriate progress bars based on environment
162
+ if is_notebook:
163
+ charts_bar = stack.enter_context(
164
+ create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
165
+ tables_bar = stack.enter_context(
166
+ create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
167
+ else:
168
+ charts_bar = stack.enter_context(
169
+ create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
170
+ tables_bar = stack.enter_context(
171
+ create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
156
172
 
157
173
  for p in pages:
158
174
  page_num = p.page_index
@@ -184,6 +200,14 @@ class ChartTablePDFParser:
184
200
  structured_item = to_structured_dict(extracted_chart)
185
201
  if structured_item:
186
202
  structured_items.append(structured_item)
203
+ vlm_items.append({
204
+ "kind": "chart",
205
+ "page": page_num,
206
+ "image_rel_path": rel_path,
207
+ "title": structured_item.get("title"),
208
+ "headers": structured_item.get("headers"),
209
+ "rows": structured_item.get("rows"),
210
+ })
187
211
  md_lines.append(
188
212
  render_markdown_table(
189
213
  structured_item.get("headers"),
@@ -222,6 +246,14 @@ class ChartTablePDFParser:
222
246
  structured_item = to_structured_dict(extracted_table)
223
247
  if structured_item:
224
248
  structured_items.append(structured_item)
249
+ vlm_items.append({
250
+ "kind": "table",
251
+ "page": page_num,
252
+ "image_rel_path": rel_path,
253
+ "title": structured_item.get("title"),
254
+ "headers": structured_item.get("headers"),
255
+ "rows": structured_item.get("rows"),
256
+ })
225
257
  md_lines.append(
226
258
  render_markdown_table(
227
259
  structured_item.get("headers"),
@@ -253,8 +285,29 @@ class ChartTablePDFParser:
253
285
 
254
286
  # Write Excel file if we have structured data
255
287
  if structured_items:
256
- excel_path = os.path.join(out_dir, "charts.xlsx")
288
+ # Determine Excel filename based on extraction target
289
+ if self.extract_charts and self.extract_tables:
290
+ excel_filename = "parsed_tables_charts.xlsx"
291
+ elif self.extract_charts:
292
+ excel_filename = "parsed_charts.xlsx"
293
+ elif self.extract_tables:
294
+ excel_filename = "parsed_tables.xlsx"
295
+ else:
296
+ excel_filename = "parsed_data.xlsx" # fallback
297
+
298
+
299
+ excel_path = os.path.join(out_dir, excel_filename)
257
300
  write_structured_excel(excel_path, structured_items)
301
+
302
+ # Also create HTML version
303
+ html_filename = excel_filename.replace('.xlsx', '.html')
304
+ html_path = os.path.join(out_dir, html_filename)
305
+ write_structured_html(html_path, structured_items)
306
+
307
+ # Write VLM items mapping for UI linkage
308
+ if 'vlm_items' in locals() and vlm_items:
309
+ with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
310
+ json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
258
311
 
259
312
  # Print results
260
313
  extraction_types = []
@@ -262,22 +315,7 @@ class ChartTablePDFParser:
262
315
  extraction_types.append("charts")
263
316
  if self.extract_tables:
264
317
  extraction_types.append("tables")
265
-
266
- print(f"{' and '.join(extraction_types).title()} extraction completed successfully.")
267
- print(f"- Output directory: {out_dir}")
268
-
269
- if charts_dir and self.extract_charts:
270
- print(f"- Charts directory: {charts_dir}")
271
- print(f"- Charts extracted: {chart_counter - 1}")
272
-
273
- if tables_dir and self.extract_tables:
274
- print(f"- Tables directory: {tables_dir}")
275
- print(f"- Tables extracted: {table_counter - 1}")
276
-
277
- if md_path:
278
- print(f"- Markdown file: {md_path}")
279
- if excel_path:
280
- print(f"- Excel file: {excel_path}")
281
-
282
- if not self.use_vlm:
283
- print("- Note: VLM disabled - only cropped images saved")
318
+
319
+ # Print completion message with output directory
320
+ print(f" Parsing completed successfully!")
321
+ print(f"📁 Output directory: {out_dir}")
doctra/ui/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .app import build_demo, launch_ui
2
+
3
+ __all__ = ["build_demo", "launch_ui"]
4
+
5
+