doctra 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -20,6 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
20
20
  from doctra.utils.structured_utils import to_structured_dict
21
21
  from doctra.exporters.markdown_table import render_markdown_table
22
22
  from doctra.exporters.markdown_writer import write_markdown
23
+ from doctra.exporters.html_writer import write_html, write_structured_html
23
24
  from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
24
25
 
25
26
 
@@ -109,7 +110,7 @@ class StructuredPDFParser:
109
110
  """
110
111
  # Extract filename without extension and create output directory
111
112
  pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
112
- out_dir = f"outputs/{pdf_filename}"
113
+ out_dir = f"outputs/{pdf_filename}/full_parse"
113
114
 
114
115
  os.makedirs(out_dir, exist_ok=True)
115
116
  ensure_output_dirs(out_dir, IMAGE_SUBDIRS)
@@ -182,7 +183,7 @@ class StructuredPDFParser:
182
183
  title=item.get("title"))
183
184
  )
184
185
  wrote_table = True
185
- except Exception:
186
+ except Exception as e:
186
187
  pass
187
188
  if not wrote_table:
188
189
  md_lines.append(f"![Chart — page {page_num}]({rel})\n")
@@ -204,7 +205,7 @@ class StructuredPDFParser:
204
205
  title=item.get("title"))
205
206
  )
206
207
  wrote_table = True
207
- except Exception:
208
+ except Exception as e:
208
209
  pass
209
210
  if not wrote_table:
210
211
  md_lines.append(f"![Table — page {page_num}]({rel})\n")
@@ -218,15 +219,19 @@ class StructuredPDFParser:
218
219
  md_lines.append(self.box_separator if self.box_separator else "")
219
220
 
220
221
  md_path = write_markdown(md_lines, out_dir)
222
+ html_path = write_html(md_lines, out_dir)
223
+
221
224
  excel_path = None
225
+ html_structured_path = None
222
226
  if self.use_vlm and structured_items:
223
227
  excel_path = os.path.join(out_dir, "tables.xlsx")
224
228
  write_structured_excel(excel_path, structured_items)
229
+ html_structured_path = os.path.join(out_dir, "tables.html")
230
+ write_structured_html(html_structured_path, structured_items)
225
231
 
226
- if excel_path:
227
- print(f"Parsing completed successfully.\n- Markdown: {md_path}\n- Excel: {excel_path}")
228
- else:
229
- print(f"Parsing completed successfully.\n- Markdown: {md_path}")
232
+ # Print completion message with output directory
233
+ print(f"Parsing completed successfully!")
234
+ print(f"📁 Output directory: {out_dir}")
230
235
 
231
236
  def display_pages_with_boxes(self, pdf_path: str, num_pages: int = 3, cols: int = 2,
232
237
  page_width: int = 800, spacing: int = 40, save_path: str = None) -> None:
@@ -23,6 +23,8 @@ from doctra.exporters.excel_writer import write_structured_excel
23
23
  from doctra.utils.structured_utils import to_structured_dict
24
24
  from doctra.exporters.markdown_table import render_markdown_table
25
25
  from doctra.exporters.markdown_writer import write_markdown
26
+ from doctra.exporters.html_writer import write_structured_html
27
+ import json
26
28
 
27
29
 
28
30
  class ChartTablePDFParser:
@@ -105,9 +107,9 @@ class ChartTablePDFParser:
105
107
  :param output_base_dir: Base directory for output files (default: "outputs")
106
108
  :return: None
107
109
  """
108
- # Create output directory structure: outputs/structured_doc/<filename>/
110
+ # Create output directory structure: outputs/<filename>/structured_parsing/
109
111
  pdf_name = Path(pdf_path).stem
110
- out_dir = os.path.join(output_base_dir, pdf_name)
112
+ out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
111
113
  os.makedirs(out_dir, exist_ok=True)
112
114
 
113
115
  # Create subdirectories based on what we're extracting
@@ -142,6 +144,7 @@ class ChartTablePDFParser:
142
144
  if self.use_vlm:
143
145
  md_lines: List[str] = ["# Extracted Charts and Tables\n"]
144
146
  structured_items: List[Dict[str, Any]] = []
147
+ vlm_items: List[Dict[str, Any]] = []
145
148
 
146
149
  # Progress bar descriptions
147
150
  charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
@@ -197,6 +200,14 @@ class ChartTablePDFParser:
197
200
  structured_item = to_structured_dict(extracted_chart)
198
201
  if structured_item:
199
202
  structured_items.append(structured_item)
203
+ vlm_items.append({
204
+ "kind": "chart",
205
+ "page": page_num,
206
+ "image_rel_path": rel_path,
207
+ "title": structured_item.get("title"),
208
+ "headers": structured_item.get("headers"),
209
+ "rows": structured_item.get("rows"),
210
+ })
200
211
  md_lines.append(
201
212
  render_markdown_table(
202
213
  structured_item.get("headers"),
@@ -235,6 +246,14 @@ class ChartTablePDFParser:
235
246
  structured_item = to_structured_dict(extracted_table)
236
247
  if structured_item:
237
248
  structured_items.append(structured_item)
249
+ vlm_items.append({
250
+ "kind": "table",
251
+ "page": page_num,
252
+ "image_rel_path": rel_path,
253
+ "title": structured_item.get("title"),
254
+ "headers": structured_item.get("headers"),
255
+ "rows": structured_item.get("rows"),
256
+ })
238
257
  md_lines.append(
239
258
  render_markdown_table(
240
259
  structured_item.get("headers"),
@@ -266,8 +285,29 @@ class ChartTablePDFParser:
266
285
 
267
286
  # Write Excel file if we have structured data
268
287
  if structured_items:
269
- excel_path = os.path.join(out_dir, "charts.xlsx")
288
+ # Determine Excel filename based on extraction target
289
+ if self.extract_charts and self.extract_tables:
290
+ excel_filename = "parsed_tables_charts.xlsx"
291
+ elif self.extract_charts:
292
+ excel_filename = "parsed_charts.xlsx"
293
+ elif self.extract_tables:
294
+ excel_filename = "parsed_tables.xlsx"
295
+ else:
296
+ excel_filename = "parsed_data.xlsx" # fallback
297
+
298
+
299
+ excel_path = os.path.join(out_dir, excel_filename)
270
300
  write_structured_excel(excel_path, structured_items)
301
+
302
+ # Also create HTML version
303
+ html_filename = excel_filename.replace('.xlsx', '.html')
304
+ html_path = os.path.join(out_dir, html_filename)
305
+ write_structured_html(html_path, structured_items)
306
+
307
+ # Write VLM items mapping for UI linkage
308
+ if 'vlm_items' in locals() and vlm_items:
309
+ with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
310
+ json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
271
311
 
272
312
  # Print results
273
313
  extraction_types = []
@@ -275,22 +315,7 @@ class ChartTablePDFParser:
275
315
  extraction_types.append("charts")
276
316
  if self.extract_tables:
277
317
  extraction_types.append("tables")
278
-
279
- print(f"{' and '.join(extraction_types).title()} extraction completed successfully.")
280
- print(f"- Output directory: {out_dir}")
281
-
282
- if charts_dir and self.extract_charts:
283
- print(f"- Charts directory: {charts_dir}")
284
- print(f"- Charts extracted: {chart_counter - 1}")
285
-
286
- if tables_dir and self.extract_tables:
287
- print(f"- Tables directory: {tables_dir}")
288
- print(f"- Tables extracted: {table_counter - 1}")
289
-
290
- if md_path:
291
- print(f"- Markdown file: {md_path}")
292
- if excel_path:
293
- print(f"- Excel file: {excel_path}")
294
-
295
- if not self.use_vlm:
296
- print("- Note: VLM disabled - only cropped images saved")
318
+
319
+ # Print completion message with output directory
320
+ print(f" Parsing completed successfully!")
321
+ print(f"📁 Output directory: {out_dir}")
doctra/ui/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .app import build_demo, launch_ui
2
+
3
+ __all__ = ["build_demo", "launch_ui"]
4
+
5
+