doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@ capabilities with DocRes image restoration for improved document processing.
8
8
  from __future__ import annotations
9
9
  import os
10
10
  import sys
11
+ import numpy as np
11
12
  from typing import List, Dict, Any, Optional, Union
12
13
  from contextlib import ExitStack
13
14
  from PIL import Image
@@ -16,9 +17,17 @@ from tqdm import tqdm
16
17
  from doctra.parsers.structured_pdf_parser import StructuredPDFParser
17
18
  from doctra.engines.image_restoration import DocResEngine
18
19
  from doctra.utils.pdf_io import render_pdf_to_images
19
- from doctra.utils.constants import IMAGE_SUBDIRS
20
+ from doctra.utils.constants import IMAGE_SUBDIRS, EXCLUDE_LABELS
20
21
  from doctra.utils.file_ops import ensure_output_dirs
21
22
  from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
23
+ from doctra.parsers.layout_order import reading_order_key
24
+ from doctra.utils.ocr_utils import ocr_box_text
25
+ from doctra.exporters.image_saver import save_box_image
26
+ from doctra.exporters.markdown_writer import write_markdown
27
+ from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
28
+ from doctra.exporters.excel_writer import write_structured_excel
29
+ from doctra.utils.structured_utils import to_structured_dict
30
+ from doctra.exporters.markdown_table import render_markdown_table
22
31
 
23
32
 
24
33
  class EnhancedPDFParser(StructuredPDFParser):
@@ -132,6 +141,13 @@ class EnhancedPDFParser(StructuredPDFParser):
132
141
  if self.use_image_restoration and self.docres_engine:
133
142
  print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
134
143
  enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
144
+
145
+ # Create enhanced PDF file using the already processed enhanced pages
146
+ enhanced_pdf_path = os.path.join(out_dir, f"{pdf_filename}_enhanced.pdf")
147
+ try:
148
+ self._create_enhanced_pdf_from_pages(enhanced_pages, enhanced_pdf_path)
149
+ except Exception as e:
150
+ print(f"⚠️ Failed to create enhanced PDF: {e}")
135
151
  else:
136
152
  print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
137
153
  enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
@@ -146,7 +162,7 @@ class EnhancedPDFParser(StructuredPDFParser):
146
162
  pil_pages = enhanced_pages
147
163
 
148
164
  # Continue with standard parsing logic
149
- self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename)
165
+ self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename, pdf_path)
150
166
 
151
167
  def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
152
168
  """
@@ -168,12 +184,12 @@ class EnhancedPDFParser(StructuredPDFParser):
168
184
  if is_notebook:
169
185
  progress_bar = create_notebook_friendly_bar(
170
186
  total=len(original_pages),
171
- desc=f"🔄 DocRes {self.restoration_task}"
187
+ desc=f"DocRes {self.restoration_task}"
172
188
  )
173
189
  else:
174
190
  progress_bar = create_beautiful_progress_bar(
175
191
  total=len(original_pages),
176
- desc=f"🔄 DocRes {self.restoration_task}",
192
+ desc=f"DocRes {self.restoration_task}",
177
193
  leave=True
178
194
  )
179
195
 
@@ -186,7 +202,6 @@ class EnhancedPDFParser(StructuredPDFParser):
186
202
  for i, page_img in enumerate(original_pages):
187
203
  try:
188
204
  # Convert PIL to numpy array
189
- import numpy as np
190
205
  img_array = np.array(page_img)
191
206
 
192
207
  # Apply DocRes restoration
@@ -216,31 +231,22 @@ class EnhancedPDFParser(StructuredPDFParser):
216
231
  if hasattr(progress_bar, 'close'):
217
232
  progress_bar.close()
218
233
 
219
- print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
220
234
  return enhanced_pages
221
235
 
222
- def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename):
236
+ def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
223
237
  """
224
238
  Process the parsing logic with enhanced pages.
225
239
  This is extracted from the parent class to allow customization.
226
240
  """
227
- from doctra.utils.constants import EXCLUDE_LABELS
228
- from doctra.parsers.layout_order import reading_order_key
229
- from doctra.utils.ocr_utils import ocr_box_text
230
- from doctra.exporters.image_saver import save_box_image
231
- from doctra.exporters.markdown_writer import write_markdown
232
- from doctra.exporters.html_writer import write_html
233
- from doctra.exporters.excel_writer import write_structured_excel
234
- from doctra.exporters.html_writer import write_structured_html
235
- from doctra.utils.structured_utils import to_structured_dict
236
- from doctra.exporters.markdown_table import render_markdown_table
237
241
 
238
242
  fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
239
243
  chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
240
244
  table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
241
245
 
242
246
  md_lines: List[str] = ["# Enhanced Document Content\n"]
247
+ html_lines: List[str] = ["<h1>Enhanced Document Content</h1>"] # For direct HTML generation
243
248
  structured_items: List[Dict[str, Any]] = []
249
+ page_content: Dict[int, List[str]] = {} # Store content by page
244
250
 
245
251
  charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
246
252
  tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
@@ -263,10 +269,15 @@ class EnhancedPDFParser(StructuredPDFParser):
263
269
  figures_bar = stack.enter_context(
264
270
  create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
265
271
 
272
+ # Initialize page content for all pages first
273
+ for page_num in range(1, len(pil_pages) + 1):
274
+ page_content[page_num] = [f"# Page {page_num} Content\n"]
275
+
266
276
  for p in pages:
267
277
  page_num = p.page_index
268
278
  page_img: Image.Image = pil_pages[page_num - 1]
269
279
  md_lines.append(f"\n## Page {page_num}\n")
280
+ html_lines.append(f"<h2>Page {page_num}</h2>")
270
281
 
271
282
  for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
272
283
  if box.label in EXCLUDE_LABELS:
@@ -275,7 +286,11 @@ class EnhancedPDFParser(StructuredPDFParser):
275
286
  rel = os.path.relpath(abs_img_path, out_dir)
276
287
 
277
288
  if box.label == "figure":
278
- md_lines.append(f"![Figure — page {page_num}]({rel})\n")
289
+ figure_md = f"![Figure — page {page_num}]({rel})\n"
290
+ figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
291
+ md_lines.append(figure_md)
292
+ html_lines.append(figure_html)
293
+ page_content[page_num].append(figure_md)
279
294
  if figures_bar: figures_bar.update(1)
280
295
 
281
296
  elif box.label == "chart":
@@ -285,18 +300,35 @@ class EnhancedPDFParser(StructuredPDFParser):
285
300
  chart = self.vlm.extract_chart(abs_img_path)
286
301
  item = to_structured_dict(chart)
287
302
  if item:
303
+ # Add page and type information to structured item
304
+ item["page"] = page_num
305
+ item["type"] = "Chart"
288
306
  structured_items.append(item)
289
- md_lines.append(
290
- render_markdown_table(item.get("headers"), item.get("rows"),
291
- title=item.get("title"))
292
- )
307
+
308
+ # Generate both markdown and HTML tables
309
+ table_md = render_markdown_table(item.get("headers"), item.get("rows"),
310
+ title=item.get("title"))
311
+ table_html = render_html_table(item.get("headers"), item.get("rows"),
312
+ title=item.get("title"))
313
+
314
+ md_lines.append(table_md)
315
+ html_lines.append(table_html)
316
+ page_content[page_num].append(table_md)
293
317
  wrote_table = True
294
318
  except Exception as e:
295
319
  pass
296
320
  if not wrote_table:
297
- md_lines.append(f"![Chart — page {page_num}]({rel})\n")
321
+ chart_md = f"![Chart — page {page_num}]({rel})\n"
322
+ chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
323
+ md_lines.append(chart_md)
324
+ html_lines.append(chart_html)
325
+ page_content[page_num].append(chart_md)
298
326
  else:
299
- md_lines.append(f"![Chart — page {page_num}]({rel})\n")
327
+ chart_md = f"![Chart — page {page_num}]({rel})\n"
328
+ chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
329
+ md_lines.append(chart_md)
330
+ html_lines.append(chart_html)
331
+ page_content[page_num].append(chart_md)
300
332
  if charts_bar: charts_bar.update(1)
301
333
 
302
334
  elif box.label == "table":
@@ -306,27 +338,64 @@ class EnhancedPDFParser(StructuredPDFParser):
306
338
  table = self.vlm.extract_table(abs_img_path)
307
339
  item = to_structured_dict(table)
308
340
  if item:
341
+ # Add page and type information to structured item
342
+ item["page"] = page_num
343
+ item["type"] = "Table"
309
344
  structured_items.append(item)
310
- md_lines.append(
311
- render_markdown_table(item.get("headers"), item.get("rows"),
312
- title=item.get("title"))
313
- )
345
+
346
+ # Generate both markdown and HTML tables
347
+ table_md = render_markdown_table(item.get("headers"), item.get("rows"),
348
+ title=item.get("title"))
349
+ table_html = render_html_table(item.get("headers"), item.get("rows"),
350
+ title=item.get("title"))
351
+
352
+ md_lines.append(table_md)
353
+ html_lines.append(table_html)
354
+ page_content[page_num].append(table_md)
314
355
  wrote_table = True
315
356
  except Exception as e:
316
357
  pass
317
358
  if not wrote_table:
318
- md_lines.append(f"![Table — page {page_num}]({rel})\n")
359
+ table_md = f"![Table — page {page_num}]({rel})\n"
360
+ table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
361
+ md_lines.append(table_md)
362
+ html_lines.append(table_html)
363
+ page_content[page_num].append(table_md)
319
364
  else:
320
- md_lines.append(f"![Table — page {page_num}]({rel})\n")
365
+ table_md = f"![Table — page {page_num}]({rel})\n"
366
+ table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
367
+ md_lines.append(table_md)
368
+ html_lines.append(table_html)
369
+ page_content[page_num].append(table_md)
321
370
  if tables_bar: tables_bar.update(1)
322
371
  else:
323
372
  text = ocr_box_text(self.ocr_engine, page_img, box)
324
373
  if text:
325
374
  md_lines.append(text)
326
375
  md_lines.append(self.box_separator if self.box_separator else "")
376
+ # Convert text to HTML (basic conversion)
377
+ html_text = text.replace('\n', '<br>')
378
+ html_lines.append(f"<p>{html_text}</p>")
379
+ if self.box_separator:
380
+ html_lines.append("<br>")
381
+ page_content[page_num].append(text)
382
+ page_content[page_num].append(self.box_separator if self.box_separator else "")
327
383
 
328
384
  md_path = write_markdown(md_lines, out_dir)
329
- html_path = write_html(md_lines, out_dir)
385
+
386
+ # Use HTML lines if VLM is enabled for better table formatting
387
+ if self.use_vlm and html_lines:
388
+ html_path = write_html_from_lines(html_lines, out_dir)
389
+ else:
390
+ html_path = write_html(md_lines, out_dir)
391
+
392
+ # Create pages folder and save individual page markdown files
393
+ pages_dir = os.path.join(out_dir, "pages")
394
+ os.makedirs(pages_dir, exist_ok=True)
395
+
396
+ for page_num, content_lines in page_content.items():
397
+ page_md_path = os.path.join(pages_dir, f"page_{page_num:03d}.md")
398
+ write_markdown(content_lines, os.path.dirname(page_md_path), os.path.basename(page_md_path))
330
399
 
331
400
  excel_path = None
332
401
  html_structured_path = None
@@ -339,6 +408,30 @@ class EnhancedPDFParser(StructuredPDFParser):
339
408
  print(f"✅ Enhanced parsing completed successfully!")
340
409
  print(f"📁 Output directory: {out_dir}")
341
410
 
411
+ def _create_enhanced_pdf_from_pages(self, enhanced_pages: List[Image.Image], output_path: str) -> None:
412
+ """
413
+ Create an enhanced PDF from already processed enhanced pages.
414
+
415
+ :param enhanced_pages: List of enhanced PIL images
416
+ :param output_path: Path for the enhanced PDF
417
+ """
418
+ if not enhanced_pages:
419
+ raise ValueError("No enhanced pages provided")
420
+
421
+ try:
422
+ # Create enhanced PDF from the processed pages
423
+ enhanced_pages[0].save(
424
+ output_path,
425
+ "PDF",
426
+ resolution=100.0,
427
+ save_all=True,
428
+ append_images=enhanced_pages[1:] if len(enhanced_pages) > 1 else []
429
+ )
430
+ print(f"✅ Enhanced PDF saved from processed pages: {output_path}")
431
+ except Exception as e:
432
+ print(f"❌ Error creating enhanced PDF from pages: {e}")
433
+ raise
434
+
342
435
  def restore_pdf_only(self, pdf_path: str, output_path: str = None, task: str = None) -> str:
343
436
  """
344
437
  Apply DocRes restoration to a PDF without parsing.
@@ -20,7 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
20
20
  from doctra.utils.structured_utils import to_structured_dict
21
21
  from doctra.exporters.markdown_table import render_markdown_table
22
22
  from doctra.exporters.markdown_writer import write_markdown
23
- from doctra.exporters.html_writer import write_html, write_structured_html
23
+ from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
24
24
  from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
25
25
 
26
26
 
@@ -117,6 +117,7 @@ class StructuredPDFParser:
117
117
  table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
118
118
 
119
119
  md_lines: List[str] = ["# Extracted Content\n"]
120
+ html_lines: List[str] = ["<h1>Extracted Content</h1>"] # For direct HTML generation
120
121
  structured_items: List[Dict[str, Any]] = []
121
122
 
122
123
  charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
@@ -145,6 +146,7 @@ class StructuredPDFParser:
145
146
  page_num = p.page_index
146
147
  page_img: Image.Image = pil_pages[page_num - 1]
147
148
  md_lines.append(f"\n## Page {page_num}\n")
149
+ html_lines.append(f"<h2>Page {page_num}</h2>")
148
150
 
149
151
  for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
150
152
  if box.label in EXCLUDE_LABELS:
@@ -153,7 +155,10 @@ class StructuredPDFParser:
153
155
  rel = os.path.relpath(abs_img_path, out_dir)
154
156
 
155
157
  if box.label == "figure":
156
- md_lines.append(f"![Figure — page {page_num}]({rel})\n")
158
+ figure_md = f"![Figure — page {page_num}]({rel})\n"
159
+ figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
160
+ md_lines.append(figure_md)
161
+ html_lines.append(figure_html)
157
162
  if figures_bar: figures_bar.update(1)
158
163
 
159
164
  elif box.label == "chart":
@@ -163,18 +168,32 @@ class StructuredPDFParser:
163
168
  chart = self.vlm.extract_chart(abs_img_path)
164
169
  item = to_structured_dict(chart)
165
170
  if item:
171
+ # Add page and type information to structured item
172
+ item["page"] = page_num
173
+ item["type"] = "Chart"
166
174
  structured_items.append(item)
167
- md_lines.append(
168
- render_markdown_table(item.get("headers"), item.get("rows"),
169
- title=item.get("title"))
170
- )
175
+
176
+ # Generate both markdown and HTML tables
177
+ table_md = render_markdown_table(item.get("headers"), item.get("rows"),
178
+ title=item.get("title"))
179
+ table_html = render_html_table(item.get("headers"), item.get("rows"),
180
+ title=item.get("title"))
181
+
182
+ md_lines.append(table_md)
183
+ html_lines.append(table_html)
171
184
  wrote_table = True
172
185
  except Exception as e:
173
186
  pass
174
187
  if not wrote_table:
175
- md_lines.append(f"![Chart — page {page_num}]({rel})\n")
188
+ chart_md = f"![Chart — page {page_num}]({rel})\n"
189
+ chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
190
+ md_lines.append(chart_md)
191
+ html_lines.append(chart_html)
176
192
  else:
177
- md_lines.append(f"![Chart — page {page_num}]({rel})\n")
193
+ chart_md = f"![Chart — page {page_num}]({rel})\n"
194
+ chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
195
+ md_lines.append(chart_md)
196
+ html_lines.append(chart_html)
178
197
  if charts_bar: charts_bar.update(1)
179
198
 
180
199
  elif box.label == "table":
@@ -184,27 +203,51 @@ class StructuredPDFParser:
184
203
  table = self.vlm.extract_table(abs_img_path)
185
204
  item = to_structured_dict(table)
186
205
  if item:
206
+ # Add page and type information to structured item
207
+ item["page"] = page_num
208
+ item["type"] = "Table"
187
209
  structured_items.append(item)
188
- md_lines.append(
189
- render_markdown_table(item.get("headers"), item.get("rows"),
190
- title=item.get("title"))
191
- )
210
+
211
+ # Generate both markdown and HTML tables
212
+ table_md = render_markdown_table(item.get("headers"), item.get("rows"),
213
+ title=item.get("title"))
214
+ table_html = render_html_table(item.get("headers"), item.get("rows"),
215
+ title=item.get("title"))
216
+
217
+ md_lines.append(table_md)
218
+ html_lines.append(table_html)
192
219
  wrote_table = True
193
220
  except Exception as e:
194
221
  pass
195
222
  if not wrote_table:
196
- md_lines.append(f"![Table — page {page_num}]({rel})\n")
223
+ table_md = f"![Table — page {page_num}]({rel})\n"
224
+ table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
225
+ md_lines.append(table_md)
226
+ html_lines.append(table_html)
197
227
  else:
198
- md_lines.append(f"![Table — page {page_num}]({rel})\n")
228
+ table_md = f"![Table — page {page_num}]({rel})\n"
229
+ table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
230
+ md_lines.append(table_md)
231
+ html_lines.append(table_html)
199
232
  if tables_bar: tables_bar.update(1)
200
233
  else:
201
234
  text = ocr_box_text(self.ocr_engine, page_img, box)
202
235
  if text:
203
236
  md_lines.append(text)
204
237
  md_lines.append(self.box_separator if self.box_separator else "")
238
+ # Convert text to HTML (basic conversion)
239
+ html_text = text.replace('\n', '<br>')
240
+ html_lines.append(f"<p>{html_text}</p>")
241
+ if self.box_separator:
242
+ html_lines.append("<br>")
205
243
 
206
244
  md_path = write_markdown(md_lines, out_dir)
207
- html_path = write_html(md_lines, out_dir)
245
+
246
+ # Use HTML lines if VLM is enabled for better table formatting
247
+ if self.use_vlm and html_lines:
248
+ html_path = write_html_from_lines(html_lines, out_dir)
249
+ else:
250
+ html_path = write_html(md_lines, out_dir)
208
251
 
209
252
  excel_path = None
210
253
  html_structured_path = None