doctra 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +21 -18
- doctra/cli/main.py +5 -2
- doctra/cli/utils.py +12 -3
- doctra/engines/layout/paddle_layout.py +13 -78
- doctra/engines/vlm/provider.py +86 -58
- doctra/engines/vlm/service.py +10 -14
- doctra/exporters/html_writer.py +1235 -0
- doctra/parsers/structured_pdf_parser.py +35 -15
- doctra/parsers/table_chart_extractor.py +66 -28
- doctra/ui/__init__.py +5 -0
- doctra/ui/app.py +1012 -0
- doctra/utils/progress.py +428 -0
- doctra/utils/structured_utils.py +49 -49
- doctra/version.py +1 -1
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/METADATA +45 -6
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/RECORD +19 -15
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/WHEEL +0 -0
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/top_level.txt +0 -0
doctra/utils/progress.py
ADDED
@@ -0,0 +1,428 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import sys
|
5
|
+
from typing import Optional, Dict, Any, Iterable, Iterator, Tuple
|
6
|
+
from tqdm import tqdm
|
7
|
+
from tqdm.auto import tqdm as tqdm_auto
|
8
|
+
|
9
|
+
|
10
|
+
class ProgressConfig:
    """
    Central configuration for progress behavior, overridable via environment.

    Every value is read from the environment once, when the instance is created.

    Env vars:
    - DOCTRA_PROGRESS_DISABLE: "1" to disable progress entirely
    - DOCTRA_PROGRESS_ASCII: "1" to force ASCII bars
    - DOCTRA_PROGRESS_EMOJI: "0" to disable emoji prefixing (only "1" or unset keeps it on)
    - DOCTRA_PROGRESS_NCOLS: positive integer width for bars
    - DOCTRA_PROGRESS_EMOJI_MODE: one of {default, safe, ascii, none}
    """

    # Accepted values for DOCTRA_PROGRESS_EMOJI_MODE.
    _EMOJI_MODES = ("default", "safe", "ascii", "none")

    def __init__(self) -> None:
        # Boolean switches are on only for the exact string "1".
        self.disable: bool = os.getenv("DOCTRA_PROGRESS_DISABLE", "0") == "1"
        self.force_ascii: bool = os.getenv("DOCTRA_PROGRESS_ASCII", "0") == "1"
        self.use_emoji: bool = os.getenv("DOCTRA_PROGRESS_EMOJI", "1") == "1"

        # Normalize the mode; an unrecognized value is treated as "default"
        # instead of being carried around verbatim.
        mode = os.getenv("DOCTRA_PROGRESS_EMOJI_MODE", "default").strip().lower()
        self.emoji_mode: str = mode if mode in self._EMOJI_MODES else "default"

        # Optional fixed bar width; None means "no override".
        self.ncols_env: Optional[int] = self._parse_ncols(
            os.getenv("DOCTRA_PROGRESS_NCOLS")
        )

    @staticmethod
    def _parse_ncols(raw: Optional[str]) -> Optional[int]:
        """
        Parse the DOCTRA_PROGRESS_NCOLS value.

        :param raw: Raw environment value, or None when the variable is unset
        :return: The width as a positive int, or None when the value is unset,
                 empty, not an integer, or not positive
        """
        if not raw:
            return None
        try:
            width = int(raw)
        except ValueError:
            # A malformed width must never break progress reporting.
            return None
        return width if width > 0 else None
|
33
|
+
|
34
|
+
|
35
|
+
# Shared configuration instance; the environment is read here, at import time.
_PROGRESS_CONFIG = ProgressConfig()
|
36
|
+
|
37
|
+
|
38
|
+
def _detect_environment() -> Tuple[bool, bool, bool]:
    """
    Inspect the running process and report how output should be rendered.

    A notebook is assumed when any of the Jupyter, Colab or Kaggle helper
    modules has already been imported into this process.

    :return: Tuple ``(is_notebook, is_tty, is_windows)``
    """
    notebook_modules = (
        "ipykernel",
        "jupyter",
        # Colab/Kaggle specifics
        "google.colab",
        "kaggle_secrets",
        "kaggle_web_client",
    )
    is_notebook = any(name in sys.modules for name in notebook_modules)

    stream = sys.stdout
    try:
        is_tty = bool(
            stream is not None and hasattr(stream, "isatty") and stream.isatty()
        )
    except ValueError:
        # isatty() raises ValueError on a closed stream; treat that as "not a TTY".
        is_tty = False

    is_windows = sys.platform.startswith("win")
    return is_notebook, is_tty, is_windows
|
51
|
+
|
52
|
+
|
53
|
+
def _select_emoji(key: str) -> str:
    """
    Choose an emoji/symbol for a given key according to env and config.

    Modes:
    - default: rich emoji
    - safe: simpler symbols intended to render with a stable width
    - ascii: ASCII text tokens
    - none: empty prefix

    Emoji disabled in the configuration always yields the empty prefix. In
    "default" mode the safe symbols are used on Windows and when a Colab or
    Kaggle module is loaded. An unknown ``key`` falls back to the
    "processing" symbol of the selected mode.

    :param key: Operation category, e.g. "loading", "charts", "ocr"
    :return: Prefix string for the progress description (may be empty)
    """
    default_map = {
        "loading": "🔄",
        "charts": "📊",
        "tables": "📋",
        "figures": "🖼️",
        "ocr": "🔍",
        "vlm": "🤖",
        "processing": "⚙️",
    }
    safe_map = {
        # Mostly BMP / geometric symbols.
        # NOTE(review): the "ocr" glyph lies outside the BMP - confirm it renders
        # acceptably in the terminals this mode is meant for.
        "loading": "⏳",
        "charts": "▦",
        "tables": "▤",
        "figures": "▧",
        "ocr": "🔎",
        "vlm": "★",
        "processing": "⚙",  # no variation selector
    }
    ascii_map = {
        "loading": "[loading]",
        "charts": "[charts]",
        "tables": "[tables]",
        "figures": "[figures]",
        "ocr": "[ocr]",
        "vlm": "[vlm]",
        "processing": "[processing]",
    }

    # Determine effective mode
    mode = _PROGRESS_CONFIG.emoji_mode
    if not _PROGRESS_CONFIG.use_emoji:
        mode = "none"
    elif mode == "default":
        # Heuristics: prefer safe in Colab/Kaggle notebooks and Windows terminals.
        # The module names mirror the ones _detect_environment() checks.
        _, _, is_windows = _detect_environment()
        hosted_notebook = any(
            name in sys.modules
            for name in ("google.colab", "kaggle_secrets", "kaggle_web_client")
        )
        if is_windows or hosted_notebook:
            mode = "safe"

    if mode == "none":
        return ""
    if mode == "ascii":
        table = ascii_map
    elif mode == "safe":
        table = safe_map
    else:
        table = default_map
    # Same fallback rule for every mode: unknown keys use "processing".
    return table.get(key, table["processing"])
|
110
|
+
|
111
|
+
|
112
|
+
def _supports_unicode_output() -> bool:
    """
    Guess whether stdout can display Unicode/emoji (best effort).

    Checks, in order: a stdout encoding containing "utf", environment
    variables set by common notebook hosts, and - on Windows only - the
    force-ASCII setting followed by environment variables set by terminals
    that usually handle Unicode.

    :return: True when Unicode output is likely to work, False otherwise
    """
    try:
        encoding_name = (getattr(sys.stdout, "encoding", None) or "").lower()
        utf_stream = "utf" in encoding_name
    except Exception:
        utf_stream = False
    if utf_stream:
        return True

    # Notebook hosts that support emoji regardless of the reported encoding.
    notebook_markers = (
        "COLAB_GPU",
        "GCE_METADATA_HOST",
        "KAGGLE_KERNEL_RUN_TYPE",
        "JPY_PARENT_PID",
    )
    for marker in notebook_markers:
        if marker in os.environ:
            return True

    if not sys.platform.startswith("win"):
        return False

    # Windows: an explicit force-ASCII request wins over terminal detection.
    if _PROGRESS_CONFIG.force_ascii:
        return False

    # WT/Terminal/VSCode usually handle Unicode.
    terminal_markers = ("WT_SESSION", "TERM_PROGRAM", "VSCODE_PID")
    return any(marker in os.environ for marker in terminal_markers)
|
137
|
+
|
138
|
+
|
139
|
+
def create_beautiful_progress_bar(
    total: int,
    desc: str,
    leave: bool = True,
    position: Optional[int] = None,
    **kwargs
) -> tqdm:
    """
    Create a styled tqdm progress bar.

    The description is matched against a fixed, ordered list of keywords to
    pick one operation category; that category decides both the bar colour
    and the emoji prefix. Colour is applied only on a TTY outside notebooks.
    In notebooks ``tqdm.auto`` and a simpler bar format are used.

    :param total: Total number of items to process
    :param desc: Description text for the progress bar
    :param leave: Whether to leave the progress bar after completion
    :param position: Position of the progress bar (for multiple bars)
    :param kwargs: Additional tqdm parameters; they override the defaults set here
    :return: Configured tqdm progress bar instance
    """
    is_notebook, is_terminal, _ = _detect_environment()

    if is_notebook:
        # Simpler format for notebooks to avoid display issues
        bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
    else:
        # Full format for terminal
        bar_format = (
            "{l_bar}{bar:30}| {n_fmt}/{total_fmt} "
            "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
        )

    # (category, keywords, colour), checked in this order; first match wins.
    # An ordered tuple keeps the result deterministic: iterating a set of
    # strings would depend on the per-process hash seed.
    categories = (
        ("loading", ("loading", "model"), "cyan"),
        ("charts", ("chart",), "green"),
        ("tables", ("table",), "blue"),
        ("figures", ("figure",), "magenta"),
        ("ocr", ("ocr",), "yellow"),
        ("vlm", ("vlm",), "red"),
    )
    default_ncols = 100

    desc_lower = desc.lower()
    category, colour = "processing", "white"
    for name, keywords, name_colour in categories:
        if any(word in desc_lower for word in keywords):
            category, colour = name, name_colour
            break

    # Add appropriate emoji to description (can be disabled)
    if _PROGRESS_CONFIG.use_emoji:
        prefix = _select_emoji(category)
        if prefix:
            desc = f"{prefix} {desc}"

    tqdm_config = {
        "total": total,
        "desc": desc,
        "leave": leave,
        "bar_format": bar_format,
        "ncols": _PROGRESS_CONFIG.ncols_env or default_ncols,
        # Prefer Unicode unless user forces ASCII or environment lacks Unicode support
        "ascii": _PROGRESS_CONFIG.force_ascii or not _supports_unicode_output(),
        # Responsive width, but only when no width was configured: tqdm's
        # dynamic sizing would otherwise replace the configured ncols.
        "dynamic_ncols": _PROGRESS_CONFIG.ncols_env is None,
        "smoothing": 0.3,  # Smooth progress updates
        "mininterval": 0.1,  # Minimum update interval
        "maxinterval": 1.0,  # Maximum update interval
        "position": position,
        **kwargs
    }

    # Add color only for terminal environments (not notebooks)
    if not is_notebook and is_terminal:
        tqdm_config["colour"] = colour

    # Respect global disable
    if _PROGRESS_CONFIG.disable:
        tqdm_config["disable"] = True

    if is_notebook:
        # Also drops a colour passed in through kwargs.
        tqdm_config.pop("colour", None)
        bar_factory = tqdm_auto
    else:
        bar_factory = tqdm

    # Try creating the progress bar with Unicode, fallback to ASCII on failure
    # (e.g., Windows code page). Deliberately broad: this is a best-effort retry.
    try:
        return bar_factory(**tqdm_config)
    except Exception:
        tqdm_config["ascii"] = True
        return bar_factory(**tqdm_config)
|
256
|
+
|
257
|
+
|
258
|
+
def create_multi_progress_bars(
    descriptions: list[str],
    totals: list[int],
    positions: Optional[list[int]] = None
) -> list[tqdm]:
    """
    Build one progress bar per (description, total, position) triple.

    The three lists are combined with ``zip``, so the result has as many bars
    as the shortest list.

    :param descriptions: Description for each progress bar
    :param totals: Total for each progress bar
    :param positions: Optional position for each bar; defaults to 0, 1, 2, ...
                      (one per description)
    :return: List of configured tqdm progress bar instances
    """
    slots = positions if positions is not None else list(range(len(descriptions)))
    return [
        create_beautiful_progress_bar(
            total=bar_total,
            desc=label,
            position=slot,
            leave=True
        )
        for label, bar_total, slot in zip(descriptions, totals, slots)
    ]
|
285
|
+
|
286
|
+
|
287
|
+
def update_progress_with_info(
    bar: tqdm,
    increment: int = 1,
    info: Optional[Dict[str, Any]] = None
) -> None:
    """
    Advance a progress bar, optionally showing extra key/value details.

    When ``info`` is non-empty it is rendered as ``"key: value"`` pairs joined
    by ``", "`` (floats with two decimals) and set as the bar's postfix before
    the bar is advanced.

    :param bar: tqdm progress bar instance
    :param increment: Number to increment the progress
    :param info: Optional dictionary of information to display
    """
    if info:
        rendered = [
            f"{label}: {val:.2f}" if isinstance(val, float) else f"{label}: {val}"
            for label, val in info.items()
        ]
        bar.set_postfix_str(", ".join(rendered))

    bar.update(increment)
|
311
|
+
|
312
|
+
|
313
|
+
def create_loading_bar(desc: str = "Loading", **kwargs) -> tqdm:
    """
    Create a single-step progress bar (total of 1) that stays visible when done.

    :param desc: Description for the loading operation
    :param kwargs: Additional parameters forwarded to create_beautiful_progress_bar
    :return: Configured loading progress bar
    """
    loading_bar = create_beautiful_progress_bar(total=1, desc=desc, leave=True, **kwargs)
    return loading_bar
|
327
|
+
|
328
|
+
|
329
|
+
def create_processing_bar(
    total: int,
    operation: str,
    **kwargs
) -> tqdm:
    """
    Create a progress bar labelled ``"<Operation> (processing)"``.

    The operation name is title-cased for the label, and the bar stays
    visible after completion.

    :param total: Total number of items to process
    :param operation: Type of operation (charts, tables, figures, etc.)
    :param kwargs: Additional parameters forwarded to create_beautiful_progress_bar
    :return: Configured processing progress bar
    """
    label = f"{operation.title()} (processing)"
    processing_bar = create_beautiful_progress_bar(total=total, desc=label, leave=True, **kwargs)
    return processing_bar
|
349
|
+
|
350
|
+
|
351
|
+
def create_notebook_friendly_bar(
    total: int,
    desc: str,
    **kwargs
) -> tqdm:
    """
    Create a notebook-friendly progress bar with minimal formatting.

    Uses ``tqdm.auto`` with a short bar format, a fixed width and no colour.
    The global disable setting always wins; otherwise an explicit ``disable``
    passed by the caller is kept.

    :param total: Total number of items to process
    :param desc: Description text for the progress bar
    :param kwargs: Additional tqdm parameters; they override the defaults set here
    :return: Configured notebook-friendly progress bar
    """
    if _PROGRESS_CONFIG.disable:
        kwargs["disable"] = True
    else:
        # Default to an enabled bar, but do not overwrite the caller's choice.
        kwargs.setdefault("disable", False)

    # Prefer Unicode in notebooks if supported
    if "ascii" not in kwargs:
        kwargs["ascii"] = _PROGRESS_CONFIG.force_ascii or not _supports_unicode_output()

    # (category, keywords), checked in this order; first match wins. An
    # ordered tuple keeps the choice deterministic: iterating a set of strings
    # would depend on the per-process hash seed. The keywords are the same
    # ones create_beautiful_progress_bar uses to pick a colour.
    categories = (
        ("loading", ("loading", "model")),
        ("charts", ("chart",)),
        ("tables", ("table",)),
        ("figures", ("figure",)),
        ("ocr", ("ocr",)),
        ("vlm", ("vlm",)),
    )

    # Add appropriate emoji to description
    desc_lower = desc.lower()
    if _PROGRESS_CONFIG.use_emoji:
        prefix_key = next(
            (
                name
                for name, keywords in categories
                if any(word in desc_lower for word in keywords)
            ),
            "processing",
        )
        prefix = _select_emoji(prefix_key)
        if prefix:
            desc = f"{prefix} {desc}"

    # Simple format for notebooks
    bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt}"

    tqdm_config = {
        "total": total,
        "desc": desc,
        "leave": True,
        "bar_format": bar_format,
        "ncols": _PROGRESS_CONFIG.ncols_env or 80,
        "dynamic_ncols": False,  # Fixed width for notebooks
        "smoothing": 0.1,  # Faster updates
        "mininterval": 0.05,
        "maxinterval": 0.5,
        # Carries "ascii" and "disable" (set above) plus any caller overrides.
        **kwargs
    }

    return tqdm_auto(**tqdm_config)
|
405
|
+
|
406
|
+
|
407
|
+
def progress_for(iterable: Iterable[Any], desc: str, total: Optional[int] = None, leave: bool = True, **kwargs) -> Iterator[Any]:
    """
    Wrap an iterable with a configured progress bar.

    Respects env config and auto-detects notebook vs terminal. When progress
    is globally disabled the items are yielded unchanged and no bar is
    created. The bar advances by one after the consumer has finished with
    each item.

    :param iterable: Items to iterate over
    :param desc: Description text for the progress bar
    :param total: Number of items; when omitted it is taken from
                  ``len(iterable)`` if the iterable has a length
    :param leave: Whether to leave the progress bar after completion
    :param kwargs: Additional parameters forwarded to the bar factory
    :return: Iterator over the items of ``iterable``
    """
    if _PROGRESS_CONFIG.disable:
        for item in iterable:
            yield item
        return

    if total is None:
        # Sized inputs (lists, tuples, ...) can report their own length;
        # generators and other unsized iterables cannot.
        try:
            total = len(iterable)
        except (TypeError, AttributeError):
            total = None

    is_notebook, _, _ = _detect_environment()
    bar_factory = create_notebook_friendly_bar if is_notebook else create_beautiful_progress_bar
    # An unknown length is passed on as 0.
    with bar_factory(total=total if total is not None else 0, desc=desc, leave=leave, **kwargs) as bar:
        for item in iterable:
            yield item
            bar.update(1)
|
doctra/utils/structured_utils.py
CHANGED
@@ -1,49 +1,49 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
from typing import Any, Dict, Optional
|
3
|
-
import json
|
4
|
-
|
5
|
-
try:
|
6
|
-
from pydantic import BaseModel # type: ignore
|
7
|
-
except Exception: # pydantic not strictly required for normalization
|
8
|
-
class BaseModel: # fallback stub
|
9
|
-
pass
|
10
|
-
|
11
|
-
def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
    """
    Convert a VLM result into a dict.

    Supported inputs:
      - JSON string (decoded first, then handled like the decoded value)
      - dict (normalized to the keys title, headers, rows)
      - Pydantic BaseModel (returned as dumped by .model_dump() for v2,
        or .dict() for v1)

    Returns None for None, undecodable JSON, a model that cannot be dumped,
    headers/rows that are not lists, or any other input type.
    """
    if obj is None:
        return None

    payload = obj

    # JSON string from VLM
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except Exception:
            return None

    # Pydantic model: try the v2 dump method first, then the v1 one
    if isinstance(payload, BaseModel):
        for dump_name in ("model_dump", "dict"):
            try:
                return getattr(payload, dump_name)()
            except Exception:
                continue
        return None

    if not isinstance(payload, dict):
        return None

    # Plain dict: fill in defaults, then apply basic shape checks
    title = payload.get("title") or "Untitled"
    headers = payload.get("headers") or []
    rows = payload.get("rows") or []
    if not (isinstance(headers, list) and isinstance(rows, list)):
        return None
    return {"title": title, "headers": headers, "rows": rows}
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Any, Dict, Optional
|
3
|
+
import json
|
4
|
+
|
5
|
+
try:
|
6
|
+
from pydantic import BaseModel # type: ignore
|
7
|
+
except Exception: # pydantic not strictly required for normalization
|
8
|
+
class BaseModel: # fallback stub
|
9
|
+
pass
|
10
|
+
|
11
|
+
def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
    """
    Normalize a VLM result into a dict with keys: title, headers, rows.

    Accepts a VLM result that might be:
      - JSON string
      - dict
      - Pydantic BaseModel (v1 .dict() or v2 .model_dump())

    Every input form ends in the same normalization step: a missing or empty
    title becomes "Untitled", missing headers/rows become empty lists, and
    any other keys are dropped.

    :param obj: Raw VLM result
    :return: The normalized dict, or None when the input is None, cannot be
             decoded or dumped, is of an unsupported type, or has
             headers/rows that are not lists
    """
    if obj is None:
        return None

    # JSON string from VLM
    if isinstance(obj, str):
        try:
            obj = json.loads(obj)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; malformed output is not an error here.
            return None

    # Pydantic model: dump to a plain dict, then normalize it like any other dict
    if isinstance(obj, BaseModel):
        dumped = None
        for dump_name in ("model_dump", "dict"):  # pydantic v2 first, then v1
            dump = getattr(obj, dump_name, None)
            if not callable(dump):
                continue
            try:
                dumped = dump()
                break
            except Exception:
                # Best effort: a failing dump method falls through to the next one.
                continue
        obj = dumped

    # Plain dict
    if isinstance(obj, dict):
        title = obj.get("title") or "Untitled"
        headers = obj.get("headers") or []
        rows = obj.get("rows") or []
        # Basic shape checks
        if not isinstance(headers, list) or not isinstance(rows, list):
            return None
        return {"title": title, "headers": headers, "rows": rows}

    return None
|
doctra/version.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
"""Version information for Doctra."""
|
2
|
-
__version__ = '0.
|
2
|
+
__version__ = '0.3.0'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.3.0
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -241,6 +241,8 @@ Provides-Extra: openai
|
|
241
241
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
242
242
|
Provides-Extra: gemini
|
243
243
|
Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
|
244
|
+
Provides-Extra: anthropic
|
245
|
+
Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
|
244
246
|
Provides-Extra: dev
|
245
247
|
Requires-Dist: pytest>=6.0; extra == "dev"
|
246
248
|
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
@@ -329,7 +331,7 @@ parser = StructuredPDFParser()
|
|
329
331
|
# Parser with VLM for structured data extraction
|
330
332
|
parser = StructuredPDFParser(
|
331
333
|
use_vlm=True,
|
332
|
-
vlm_provider="openai", # or "gemini"
|
334
|
+
vlm_provider="openai", # or "gemini" or "anthropic" or "openrouter"
|
333
335
|
vlm_api_key="your_api_key_here"
|
334
336
|
)
|
335
337
|
|
@@ -344,7 +346,7 @@ parser = StructuredPDFParser(
|
|
344
346
|
# VLM Settings
|
345
347
|
use_vlm=True,
|
346
348
|
vlm_provider="openai",
|
347
|
-
vlm_model="gpt-
|
349
|
+
vlm_model="gpt-5",
|
348
350
|
vlm_api_key="your_api_key",
|
349
351
|
|
350
352
|
# Layout Detection Settings
|
@@ -406,7 +408,7 @@ parser = ChartTablePDFParser(
|
|
406
408
|
# VLM Settings
|
407
409
|
use_vlm=True,
|
408
410
|
vlm_provider="openai",
|
409
|
-
vlm_model="gpt-
|
411
|
+
vlm_model="gpt-5",
|
410
412
|
vlm_api_key="your_api_key",
|
411
413
|
|
412
414
|
# Layout Detection Settings
|
@@ -545,7 +547,7 @@ parser = StructuredPDFParser(
|
|
545
547
|
use_vlm=True,
|
546
548
|
vlm_provider="openai",
|
547
549
|
vlm_api_key="your_openai_api_key",
|
548
|
-
vlm__model="gpt-
|
550
|
+
vlm_model="gpt-5",
|
549
551
|
layout_model_name="PP-DocLayout_plus-L",
|
550
552
|
dpi=300, # Higher DPI for better quality
|
551
553
|
min_score=0.5, # Higher confidence threshold
|
@@ -623,4 +625,41 @@ parser.display_pages_with_boxes("document.pdf")
|
|
623
625
|
- **Pandas**: Data manipulation
|
624
626
|
- **OpenPyXL**: Excel file generation
|
625
627
|
- **Google Generative AI**: For Gemini VLM integration
|
626
|
-
- **OpenAI**: For GPT-
|
628
|
+
- **OpenAI**: For GPT-5 VLM integration
|
629
|
+
|
630
|
+
## 🖥️ Web Interface (Gradio)
|
631
|
+
|
632
|
+
You can try Doctra in a simple web UI powered by Gradio.
|
633
|
+
|
634
|
+
### Run locally
|
635
|
+
|
636
|
+
```bash
|
637
|
+
pip install -U gradio
|
638
|
+
python gradio_app.py
|
639
|
+
```
|
640
|
+
|
641
|
+
Then open the printed URL (default `http://127.0.0.1:7860`).
|
642
|
+
|
643
|
+
Notes:
|
644
|
+
- If using VLM, set the API key field in the UI or export `VLM_API_KEY`.
|
645
|
+
- Outputs are saved under `outputs/<pdf_stem>/` and previewed in the UI.
|
646
|
+
|
647
|
+
### Deploy on Hugging Face Spaces
|
648
|
+
|
649
|
+
1) Create a new Space (type: Gradio, SDK: Python).
|
650
|
+
|
651
|
+
2) Add these files to the Space repo:
|
652
|
+
- Your package code (or install from PyPI).
|
653
|
+
- `gradio_app.py` (entry point).
|
654
|
+
- `requirements.txt` with at least:
|
655
|
+
|
656
|
+
```text
|
657
|
+
doctra
|
658
|
+
gradio
|
659
|
+
```
|
660
|
+
|
661
|
+
3) Set a secret named `VLM_API_KEY` if you want VLM features.
|
662
|
+
|
663
|
+
4) In Space settings, set `python gradio_app.py` as the run command (or rely on auto-detect).
|
664
|
+
|
665
|
+
The Space will build and expose the same interface for uploads and processing.
|
@@ -1,29 +1,32 @@
|
|
1
|
-
doctra/__init__.py,sha256
|
2
|
-
doctra/version.py,sha256=
|
1
|
+
doctra/__init__.py,sha256=ST_c2GWBoB0y_wpL1qsOeK4bR1RyJhMMn6I5VjVRI6Y,613
|
2
|
+
doctra/version.py,sha256=hnuLMAgAv9rqQndLE3xdEZsa3vwZ4eZ2RVbRJjlJu8Y,60
|
3
3
|
doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
|
4
|
-
doctra/cli/main.py,sha256=
|
5
|
-
doctra/cli/utils.py,sha256=
|
4
|
+
doctra/cli/main.py,sha256=o_W1b5kx3xaTbWK6l4IYi0YLwffKBj5pQKflnlaG2Fw,35611
|
5
|
+
doctra/cli/utils.py,sha256=IghiUZQCOmXODC5-5smHGz2KeV4xqbP4avmA1Mggln0,11800
|
6
6
|
doctra/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
doctra/engines/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
doctra/engines/layout/layout_models.py,sha256=vuTzjWd3FD-SkFPngktmUVhOJ6Xvff6ufwFEq796PQs,3162
|
9
|
-
doctra/engines/layout/paddle_layout.py,sha256=
|
9
|
+
doctra/engines/layout/paddle_layout.py,sha256=P2-Gk8wHpWoA5Jpmo_3OLI59zWq3HeAOBOUKKVdXu8I,6792
|
10
10
|
doctra/engines/ocr/__init__.py,sha256=h6bFiveGXdI59fsKzCqOXki3C74DCndEmvloOtMqnR0,133
|
11
11
|
doctra/engines/ocr/api.py,sha256=YOBKDLExXpvSiOsc_TDJasaMPxzdVx1llQCtYlsruWo,1280
|
12
12
|
doctra/engines/ocr/path_resolver.py,sha256=2_7Nsekt3dCDU3oVsgdr62iMrlAhbGNfYwgh4G7S3pA,1492
|
13
13
|
doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMYZiOdb_6PoQw,2911
|
14
14
|
doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
doctra/engines/vlm/outlines_types.py,sha256=qL-G6MNiA5mxp1qAPVEFhOANp4NqVt_MQKseJCr_xXE,970
|
16
|
-
doctra/engines/vlm/provider.py,sha256=
|
17
|
-
doctra/engines/vlm/service.py,sha256=
|
16
|
+
doctra/engines/vlm/provider.py,sha256=aE8Eo1U-8XqAimakNlT0-T4etIyCV8rZ3DwxdqbFeTc,3131
|
17
|
+
doctra/engines/vlm/service.py,sha256=Jwws2Jw68-IdHyvEWks4UCoP7Olhqt8IpXfCv5Z7Ml4,4724
|
18
18
|
doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
doctra/exporters/excel_writer.py,sha256=U5Eb5SF7_ll1QveUapSWSkCRt3OEoisKEVUQ_7X8Wjo,7762
|
20
|
+
doctra/exporters/html_writer.py,sha256=OlW24Eg5bZcjldRHtd3GDD7RrajuRXj43EJpXIJkYf8,38810
|
20
21
|
doctra/exporters/image_saver.py,sha256=zsPoQ0CwoE643ui4iZMdXk96kv5mU8L_zC2JfF22N1A,1639
|
21
22
|
doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r-b0zw,2030
|
22
23
|
doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
|
23
24
|
doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
|
24
25
|
doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
|
25
|
-
doctra/parsers/structured_pdf_parser.py,sha256=
|
26
|
-
doctra/parsers/table_chart_extractor.py,sha256=
|
26
|
+
doctra/parsers/structured_pdf_parser.py,sha256=fbDIQ6VFv1phFPC3lKgcjtCp0AdNA8Ny1dK0F726Pww,21357
|
27
|
+
doctra/parsers/table_chart_extractor.py,sha256=JuoScqCQbPdQjy4ak77OcZHSPYKGHF4H39fEW6gF3eo,15323
|
28
|
+
doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
|
29
|
+
doctra/ui/app.py,sha256=FYDlEG_2pfp7SSHnA04NRNUhOcI-BJPh3qAf5dw5D6g,45903
|
27
30
|
doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
31
|
doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
|
29
32
|
doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
|
@@ -31,10 +34,11 @@ doctra/utils/file_ops.py,sha256=3IS0EQncs6Kaj27fcg2zxQX3xRSvtItIsyKGLYgeOgw,815
|
|
31
34
|
doctra/utils/io_utils.py,sha256=L1bWV4-ybs2j_3ZEN7GfQVgdC73JKVECVnpwKbP0dy0,219
|
32
35
|
doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
|
33
36
|
doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
|
37
|
+
doctra/utils/progress.py,sha256=sNEjTdN32J1-eXFPqwZRw2EZQ1SXSesXBd5StJvtlmc,14481
|
34
38
|
doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
|
35
|
-
doctra/utils/structured_utils.py,sha256=
|
36
|
-
doctra-0.
|
37
|
-
doctra-0.
|
38
|
-
doctra-0.
|
39
|
-
doctra-0.
|
40
|
-
doctra-0.
|
39
|
+
doctra/utils/structured_utils.py,sha256=J-qTqo8eCjm36FaRJ_I482LFgYCpm3eukZm-gbNnchw,1401
|
40
|
+
doctra-0.3.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
41
|
+
doctra-0.3.0.dist-info/METADATA,sha256=tdfVsN0nDj_WcpptBvJvWF2tzdgp_0SfeeYya7oTqgU,27794
|
42
|
+
doctra-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
43
|
+
doctra-0.3.0.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
|
44
|
+
doctra-0.3.0.dist-info/RECORD,,
|
File without changes
|