doctra 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,435 @@
+ """
+ Shared utilities for Doctra Gradio UI components
+
+ This module contains common functions, constants, and utilities used across
+ all UI modules to ensure consistency and reduce code duplication.
+ """
+
+ import os
+ import shutil
+ import tempfile
+ import re
+ import html as _html
+ import base64
+ import json
+ from pathlib import Path
+ from typing import Optional, Tuple, List, Dict, Any
+
+ import gradio as gr
+ import pandas as pd
+
+
+ # UI Theme and Styling Constants
+ THEME = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
+
+ CUSTOM_CSS = """
+ .gradio-container {max-width: 100% !important; padding-left: 24px; padding-right: 24px}
+ .container {max-width: 100% !important}
+ .app {max-width: 100% !important}
+ .header {margin-bottom: 8px}
+ .subtitle {color: var(--body-text-color-subdued)}
+ .card {border:1px solid var(--border-color); border-radius:12px; padding:8px}
+ .status-ok {color: var(--color-success)}
+
+ /* Page content styling */
+ .page-content img {
+     max-width: 100% !important;
+     height: auto !important;
+     display: block !important;
+     margin: 10px auto !important;
+     border: 1px solid #ddd !important;
+     border-radius: 8px !important;
+     box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
+ }
+
+ .page-content {
+     max-height: none !important;
+     overflow: visible !important;
+ }
+
+ /* Table styling */
+ .page-content table.doc-table {
+     width: 100% !important;
+     border-collapse: collapse !important;
+     margin: 12px 0 !important;
+ }
+ .page-content table.doc-table th,
+ .page-content table.doc-table td {
+     border: 1px solid #e5e7eb !important;
+     padding: 8px 10px !important;
+     text-align: left !important;
+ }
+ .page-content table.doc-table thead th {
+     background: #f9fafb !important;
+     font-weight: 600 !important;
+ }
+ .page-content table.doc-table tbody tr:nth-child(even) td {
+     background: #fafafa !important;
+ }
+
+ /* Clickable image buttons */
+ .image-button {
+     background: #0066cc !important;
+     color: white !important;
+     border: none !important;
+     padding: 5px 10px !important;
+     border-radius: 4px !important;
+     cursor: pointer !important;
+     margin: 2px !important;
+     font-size: 14px !important;
+ }
+
+ .image-button:hover {
+     background: #0052a3 !important;
+ }
+ """
+
+
+ def gather_outputs(
+     out_dir: Path,
+     allowed_kinds: Optional[List[str]] = None,
+     zip_filename: Optional[str] = None,
+     is_structured_parsing: bool = False
+ ) -> Tuple[List[tuple[str, str]], List[str], str]:
+     """
+     Gather output files and create a ZIP archive for download.
+
+     Args:
+         out_dir: Output directory path
+         allowed_kinds: List of allowed file kinds (tables, charts, figures)
+         zip_filename: Name for the ZIP file
+         is_structured_parsing: Whether this is structured parsing output
+
+     Returns:
+         Tuple of (gallery_items, file_paths, zip_path)
+     """
+     gallery_items: List[tuple[str, str]] = []
+     file_paths: List[str] = []
+
+     if out_dir.exists():
+         if is_structured_parsing:
+             # For structured parsing, include all files
+             for file_path in sorted(out_dir.rglob("*")):
+                 if file_path.is_file():
+                     file_paths.append(str(file_path))
+         else:
+             # For full parsing, include specific main files
+             main_files = [
+                 "result.html",
+                 "result.md",
+                 "tables.html",
+                 "tables.xlsx"
+             ]
+
+             for main_file in main_files:
+                 file_path = out_dir / main_file
+                 if file_path.exists():
+                     file_paths.append(str(file_path))
+
+             # Include images based on allowed kinds
+             if allowed_kinds:
+                 for kind in allowed_kinds:
+                     p = out_dir / kind
+                     if p.exists():
+                         for img in sorted(p.glob("*.png")):
+                             file_paths.append(str(img))
+
+                     images_dir = out_dir / "images" / kind
+                     if images_dir.exists():
+                         for img in sorted(images_dir.glob("*.jpg")):
+                             file_paths.append(str(img))
+             else:
+                 # Include all images if no specific kinds specified
+                 for p in (out_dir / "charts").glob("*.png"):
+                     file_paths.append(str(p))
+                 for p in (out_dir / "tables").glob("*.png"):
+                     file_paths.append(str(p))
+                 for p in (out_dir / "images").rglob("*.jpg"):
+                     file_paths.append(str(p))
+
+             # Include Excel files based on allowed kinds
+             if allowed_kinds:
+                 if "charts" in allowed_kinds and "tables" in allowed_kinds:
+                     excel_files = ["parsed_tables_charts.xlsx"]
+                 elif "charts" in allowed_kinds:
+                     excel_files = ["parsed_charts.xlsx"]
+                 elif "tables" in allowed_kinds:
+                     excel_files = ["parsed_tables.xlsx"]
+                 else:
+                     excel_files = []
+
+                 for excel_file in excel_files:
+                     excel_path = out_dir / excel_file
+                     if excel_path.exists():
+                         file_paths.append(str(excel_path))
+
+     # Build gallery items for image display
+     kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]
+     for sub in kinds:
+         p = out_dir / sub
+         if p.exists():
+             for img in sorted(p.glob("*.png")):
+                 gallery_items.append((str(img), f"{sub}: {img.name}"))
+
+         images_dir = out_dir / "images" / sub
+         if images_dir.exists():
+             for img in sorted(images_dir.glob("*.jpg")):
+                 gallery_items.append((str(img), f"{sub}: {img.name}"))
+
+     # Create ZIP archive
+     tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))
+
+     if zip_filename:
+         safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename)
+         zip_base = tmp_zip_dir / safe_filename
+     else:
+         zip_base = tmp_zip_dir / "doctra_outputs"
+
+     filtered_dir = tmp_zip_dir / "filtered_outputs"
+     shutil.copytree(out_dir, filtered_dir, ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'))
+
+     zip_path = shutil.make_archive(str(zip_base), 'zip', root_dir=str(filtered_dir))
+
+     return gallery_items, file_paths, zip_path
+
+
+ def parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
+     """
+     Parse markdown content and organize it by pages.
+
+     Args:
+         md_content: Raw markdown content string
+
+     Returns:
+         List of page dictionaries with content, tables, charts, and figures
+     """
+     pages = []
+     current_page = None
+
+     lines = md_content.split('\n')
+     i = 0
+
+     while i < len(lines):
+         line = lines[i].strip()
+
+         # Detect page headers
+         if line.startswith('## Page '):
+             if current_page:
+                 pages.append(current_page)
+
+             page_num = line.replace('## Page ', '').strip()
+             current_page = {
+                 'page_num': page_num,
+                 'content': [],
+                 'tables': [],
+                 'charts': [],
+                 'figures': [],
+                 'images': [],
+                 'full_content': []  # Store full content with inline images
+             }
+             i += 1
+             continue
+
+         # Detect image references (skip any that appear before the first page header)
+         if current_page and line.startswith('![') and '](images/' in line:
+             match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
+             if match:
+                 caption = match.group(1)
+                 img_path = match.group(2)
+
+                 # Categorize images by type
+                 if 'Table' in caption:
+                     current_page['tables'].append({'caption': caption, 'path': img_path})
+                 elif 'Chart' in caption:
+                     current_page['charts'].append({'caption': caption, 'path': img_path})
+                 elif 'Figure' in caption:
+                     current_page['figures'].append({'caption': caption, 'path': img_path})
+
+                 current_page['images'].append({'caption': caption, 'path': img_path})
+                 current_page['full_content'].append(f"![{caption}]({img_path})")
+
+         elif current_page:
+             if line:
+                 current_page['content'].append(line)
+                 current_page['full_content'].append(line)
+
+         i += 1
+
+     if current_page:
+         pages.append(current_page)
+
+     return pages
+
+
+ def validate_vlm_config(use_vlm: bool, vlm_api_key: str) -> Optional[str]:
+     """
+     Validate VLM configuration parameters.
+
+     Args:
+         use_vlm: Whether VLM is enabled
+         vlm_api_key: API key for VLM provider
+
+     Returns:
+         Error message if validation fails, None if valid
+     """
+     if use_vlm and not vlm_api_key:
+         return "❌ Error: VLM API key is required when using VLM"
+
+     if use_vlm and vlm_api_key:
+         # Basic API key validation
+         if len(vlm_api_key.strip()) < 10:
+             return "❌ Error: VLM API key appears to be too short or invalid"
+         if vlm_api_key.strip().startswith('sk-') and len(vlm_api_key.strip()) < 20:
+             return "❌ Error: OpenAI API key appears to be invalid (too short)"
+
+     return None
+
+
+ def render_markdown_table(lines: List[str]) -> str:
+     """
+     Render markdown table lines to HTML table.
+
+     Args:
+         lines: List of markdown table lines
+
+     Returns:
+         HTML table string
+     """
+     rows = [l.strip().strip('|').split('|') for l in lines]
+     rows = [[_html.escape(c.strip()) for c in r] for r in rows]
+     if len(rows) < 2:
+         return ""
+
+     header = rows[0]
+     body = rows[2:] if len(rows) > 2 else []
+     thead = '<thead><tr>' + ''.join(f'<th>{c}</th>' for c in header) + '</tr></thead>'
+     tbody = '<tbody>' + ''.join('<tr>' + ''.join(f'<td>{c}</td>' for c in r) + '</tr>' for r in body) + '</tbody>'
+     return f'<table class="doc-table">{thead}{tbody}</table>'
+
+
+ def is_markdown_table_header(s: str) -> bool:
+     """
+     Check if a line is a markdown table header.
+
+     Args:
+         s: Line string to check
+
+     Returns:
+         True if line is a table header
+     """
+     return '|' in s and ('---' in s or '—' in s)
+
+
+ def create_page_html_content(page_content: List[str], base_dir: Optional[Path] = None) -> str:
+     """
+     Convert page content lines to HTML with inline images and proper formatting.
+
+     Args:
+         page_content: List of content lines for the page
+         base_dir: Base directory for resolving image paths
+
+     Returns:
+         HTML content string
+     """
+     processed_content = []
+     paragraph_buffer = []
+
+     def flush_paragraph():
+         """Flush accumulated paragraph content to HTML"""
+         nonlocal paragraph_buffer
+         if paragraph_buffer:
+             joined = '<br/>'.join(_html.escape(l) for l in paragraph_buffer)
+             processed_content.append(f'<p>{joined}</p>')
+             paragraph_buffer = []
+
+     i = 0
+     n = len(page_content)
+
+     while i < n:
+         raw_line = page_content[i]
+         line = raw_line.rstrip('\r\n')
+         stripped = line.strip()
+
+         # Handle image references
+         if stripped.startswith('![') and ('](images/' in stripped or '](images\\' in stripped):
+             flush_paragraph()
+             match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
+             if match and base_dir is not None:
+                 caption = match.group(1)
+                 rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
+                 abs_path = (base_dir / rel_path).resolve()
+                 try:
+                     with open(abs_path, 'rb') as f:
+                         b64 = base64.b64encode(f.read()).decode('ascii')
+                     processed_content.append(f'<figure><img src="data:image/jpeg;base64,{b64}" alt="{_html.escape(caption)}"/><figcaption>{_html.escape(caption)}</figcaption></figure>')
+                 except Exception as e:
+                     print(f"❌ Failed to embed image {rel_path}: {e}")
+                     print(f"📁 File exists: {abs_path.exists()}")
+                     if abs_path.exists():
+                         print(f"📁 File size: {abs_path.stat().st_size} bytes")
+                     processed_content.append(f'<div>{_html.escape(caption)} (image not found)</div>')
+             else:
+                 # If no match or no base_dir, just add the raw markdown
+                 print(f"⚠️ Image reference not processed: {stripped}")
+                 processed_content.append(f'<div>{_html.escape(stripped)}</div>')
+             i += 1
+             continue
+
+         # Handle markdown tables
+         if (stripped.startswith('|') or stripped.count('|') >= 2) and i + 1 < n and is_markdown_table_header(page_content[i + 1]):
+             flush_paragraph()
+             table_block = [stripped]
+             i += 1
+             table_block.append(page_content[i].strip())
+             i += 1
+             while i < n:
+                 nxt = page_content[i].rstrip('\r\n')
+                 if nxt.strip() == '' or (not nxt.strip().startswith('|') and nxt.count('|') < 2):
+                     break
+                 table_block.append(nxt.strip())
+                 i += 1
+             html_table = render_markdown_table(table_block)
+             if html_table:
+                 processed_content.append(html_table)
+             else:
+                 for tl in table_block:
+                     paragraph_buffer.append(tl)
+             continue
+
+         # Handle headers and content
+         if stripped.startswith('## '):
+             flush_paragraph()
+             processed_content.append(f'<h3>{_html.escape(stripped[3:])}</h3>')
+         elif stripped.startswith('# '):
+             flush_paragraph()
+             processed_content.append(f'<h2>{_html.escape(stripped[2:])}</h2>')
+         elif stripped == '':
+             flush_paragraph()
+             processed_content.append('<br/>')
+         else:
+             paragraph_buffer.append(raw_line)
+         i += 1
+
+     flush_paragraph()
+     return "\n".join(processed_content)
+
+
+ def create_tips_markdown() -> str:
+     """
+     Create the tips section markdown for the UI.
+
+     Returns:
+         Tips markdown content with helpful usage information
+     """
+     return """
+     <div class="card">
+       <b>Tips</b>
+       <ul>
+         <li>On Spaces, set a secret <code>VLM_API_KEY</code> to enable VLM features.</li>
+         <li>Use <strong>Enhanced Parser</strong> for documents that need image restoration before parsing (scanned docs, low-quality PDFs).</li>
+         <li>Use <strong>DocRes Image Restoration</strong> for standalone image enhancement without parsing.</li>
+         <li>DocRes tasks: <code>appearance</code> (default), <code>dewarping</code>, <code>deshadowing</code>, <code>deblurring</code>, <code>binarization</code>, <code>end2end</code>.</li>
+         <li>Outputs are saved under <code>outputs/&lt;pdf_stem&gt;/</code>.</li>
+       </ul>
+     </div>
+     """
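
Taken together, the helpers added above are what a Gradio tab needs end to end: VLM config validation, per-page markdown parsing, HTML rendering with inline images, and output/ZIP collection. Below is a minimal usage sketch, not part of the diff: the hunk header above does not show the new file's path, so the ui_utils import name and the build_preview wrapper are hypothetical, and "outputs/sample" stands in for a real parser output directory.

from pathlib import Path

import ui_utils  # hypothetical import name for the module added in this diff

def build_preview(out_dir: str, use_vlm: bool, vlm_api_key: str):
    # Fail fast if VLM is requested without a plausible API key
    err = ui_utils.validate_vlm_config(use_vlm, vlm_api_key)
    if err:
        return err, [], [], None

    # gather_outputs expects the directory to exist; it also zips a filtered copy of it
    out_path = Path(out_dir)
    gallery, file_paths, zip_path = ui_utils.gather_outputs(
        out_path, allowed_kinds=["tables", "charts"], zip_filename="doctra_outputs"
    )

    # Render per-page HTML from result.md, embedding referenced images as data URIs
    html_pages = []
    md_file = out_path / "result.md"
    if md_file.exists():
        for page in ui_utils.parse_markdown_by_pages(md_file.read_text(encoding="utf-8")):
            html_pages.append(ui_utils.create_page_html_content(page["full_content"], base_dir=out_path))

    return "\n".join(html_pages), gallery, file_paths, zip_path

# Example call (assumes a Doctra parser already wrote outputs/sample):
# html, gallery, files, zip_path = build_preview("outputs/sample", use_vlm=False, vlm_api_key="")
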
doctra/utils/progress.py CHANGED
@@ -68,13 +68,13 @@ def _select_emoji(key: str) -> str:
          "processing": "⚙️",
      }
      safe_map = {
-         "loading": "",
-         "charts": "",
-         "tables": "",
-         "figures": "",
-         "ocr": "🔎",
-         "vlm": "",
-         "processing": "",
+         "loading": "🔄",
+         "charts": "📊",
+         "tables": "📋",
+         "figures": "🖼️",
+         "ocr": "🔍",
+         "vlm": "🤖",
+         "processing": "⚙️",
      }
      ascii_map = {
          "loading": "[loading]",
doctra/version.py CHANGED
@@ -1,2 +1,2 @@
  """Version information for Doctra."""
- __version__ = '0.4.1'
+ __version__ = '0.4.3'