doctra 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/cli/main.py +10 -23
- doctra/cli/utils.py +7 -6
- doctra/engines/vlm/provider.py +257 -86
- doctra/parsers/structured_pdf_parser.py +8 -5
- doctra/ui/enhanced_parser_ui.py +2 -2
- doctra/ui/full_parse_ui.py +2 -2
- doctra/ui/tables_charts_ui.py +2 -2
- doctra/ui/ui_helpers.py +5 -4
- doctra/version.py +1 -1
- {doctra-0.4.2.dist-info → doctra-0.5.0.dist-info}/METADATA +332 -74
- {doctra-0.4.2.dist-info → doctra-0.5.0.dist-info}/RECORD +15 -14
- doctra-0.5.0.dist-info/entry_points.txt +2 -0
- {doctra-0.4.2.dist-info → doctra-0.5.0.dist-info}/WHEEL +0 -0
- {doctra-0.4.2.dist-info → doctra-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.4.2.dist-info → doctra-0.5.0.dist-info}/top_level.txt +0 -0
doctra/cli/main.py
CHANGED
@@ -28,6 +28,7 @@ except ImportError:
|
|
28
28
|
|
29
29
|
# Import additional modules
|
30
30
|
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
31
|
+
from doctra.cli.utils import validate_vlm_config, handle_keyboard_interrupt
|
31
32
|
from doctra.engines.image_restoration import DocResEngine
|
32
33
|
|
33
34
|
|
@@ -85,7 +86,7 @@ def vlm_options(func):
|
|
85
86
|
"""
|
86
87
|
func = click.option('--use-vlm/--no-vlm', default=False,
|
87
88
|
help='Use Vision Language Model for table/chart extraction')(func)
|
88
|
-
func = click.option('--vlm-provider', type=click.Choice(['gemini', 'openai']), default='gemini',
|
89
|
+
func = click.option('--vlm-provider', type=click.Choice(['gemini', 'openai', 'anthropic', 'openrouter', 'ollama']), default='gemini',
|
89
90
|
help='VLM provider to use (default: gemini)')(func)
|
90
91
|
func = click.option('--vlm-model', type=str, default=None,
|
91
92
|
help='Model name to use (defaults to provider-specific defaults)')(func)
|
@@ -141,23 +142,6 @@ def ocr_options(func):
|
|
141
142
|
return func
|
142
143
|
|
143
144
|
|
144
|
-
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
|
145
|
-
"""
|
146
|
-
Validate VLM configuration and exit with error if invalid.
|
147
|
-
|
148
|
-
Checks if VLM is enabled but no API key is provided, and exits
|
149
|
-
with an appropriate error message if the configuration is invalid.
|
150
|
-
|
151
|
-
:param use_vlm: Whether VLM processing is enabled
|
152
|
-
:param vlm_api_key: The VLM API key (can be None if VLM is disabled)
|
153
|
-
:return: None
|
154
|
-
:raises SystemExit: If VLM is enabled but no API key is provided
|
155
|
-
"""
|
156
|
-
if use_vlm and not vlm_api_key:
|
157
|
-
click.echo("❌ Error: VLM API key is required when using --use-vlm", err=True)
|
158
|
-
click.echo(" Set the VLM_API_KEY environment variable or use --vlm-api-key", err=True)
|
159
|
-
click.echo(" Example: export VLM_API_KEY=your_api_key", err=True)
|
160
|
-
sys.exit(1)
|
161
145
|
|
162
146
|
|
163
147
|
@cli.command()
|
@@ -212,7 +196,7 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
|
|
212
196
|
:param verbose: Whether to enable verbose output
|
213
197
|
:return: None
|
214
198
|
"""
|
215
|
-
validate_vlm_config(use_vlm, vlm_api_key)
|
199
|
+
validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
|
216
200
|
|
217
201
|
if verbose:
|
218
202
|
click.echo(f"🔍 Starting full PDF parsing...")
|
@@ -350,7 +334,7 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
|
|
350
334
|
:param verbose: Whether to enable verbose output
|
351
335
|
:return: None
|
352
336
|
"""
|
353
|
-
validate_vlm_config(use_vlm, vlm_api_key)
|
337
|
+
validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
|
354
338
|
|
355
339
|
if verbose:
|
356
340
|
click.echo(f"🔍 Starting enhanced PDF parsing with DocRes...")
|
@@ -488,7 +472,7 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
488
472
|
:param verbose: Whether to enable verbose output
|
489
473
|
:return: None
|
490
474
|
"""
|
491
|
-
validate_vlm_config(use_vlm, vlm_api_key)
|
475
|
+
validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
|
492
476
|
|
493
477
|
if verbose:
|
494
478
|
click.echo(f"📊 Starting chart extraction...")
|
@@ -564,7 +548,7 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
564
548
|
:param verbose: Whether to enable verbose output
|
565
549
|
:return: None
|
566
550
|
"""
|
567
|
-
validate_vlm_config(use_vlm, vlm_api_key)
|
551
|
+
validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
|
568
552
|
|
569
553
|
if verbose:
|
570
554
|
click.echo(f"📋 Starting table extraction...")
|
@@ -642,7 +626,7 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
642
626
|
:param verbose: Whether to enable verbose output
|
643
627
|
:return: None
|
644
628
|
"""
|
645
|
-
validate_vlm_config(use_vlm, vlm_api_key)
|
629
|
+
validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
|
646
630
|
|
647
631
|
if verbose:
|
648
632
|
click.echo(f"📊📋 Starting chart and table extraction...")
|
@@ -972,6 +956,9 @@ def info():
|
|
972
956
|
click.echo("\nVLM Providers:")
|
973
957
|
click.echo(" • Gemini (Google) - gemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite, gemini-2.0-flash")
|
974
958
|
click.echo(" • OpenAI - gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini, gpt-4o")
|
959
|
+
click.echo(" • Anthropic - claude-opus-4-1, claude-3.5-sonnet, claude-3-haiku")
|
960
|
+
click.echo(" • OpenRouter - x-ai/grok-4, meta-llama/llama-3.1-405b-instruct")
|
961
|
+
click.echo(" • Ollama (Local) - llava:latest, gemma3:latest, llama3.2-vision:latest")
|
975
962
|
|
976
963
|
# Available layout models
|
977
964
|
click.echo("\nLayout Detection Models:")
|
doctra/cli/utils.py
CHANGED
@@ -13,20 +13,21 @@ from pathlib import Path
|
|
13
13
|
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
14
14
|
|
15
15
|
|
16
|
-
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
|
16
|
+
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str], vlm_provider: str = "gemini") -> None:
|
17
17
|
"""
|
18
18
|
Validate VLM configuration and exit with error if invalid.
|
19
19
|
|
20
|
-
Checks if VLM is enabled but no API key is provided, and exits
|
20
|
+
Checks if VLM is enabled but no API key is provided (except for Ollama), and exits
|
21
21
|
with an appropriate error message if the configuration is invalid.
|
22
22
|
|
23
23
|
:param use_vlm: Whether VLM processing is enabled
|
24
|
-
:param vlm_api_key: The VLM API key (can be None if VLM is disabled)
|
24
|
+
:param vlm_api_key: The VLM API key (can be None if VLM is disabled or using Ollama)
|
25
|
+
:param vlm_provider: VLM provider name (default: "gemini")
|
25
26
|
:return: None
|
26
|
-
:raises SystemExit: If VLM is enabled but no API key is provided
|
27
|
+
:raises SystemExit: If VLM is enabled but no API key is provided (except for Ollama)
|
27
28
|
"""
|
28
|
-
if use_vlm and not vlm_api_key:
|
29
|
-
click.echo("❌ Error: VLM API key is required when using --use-vlm", err=True)
|
29
|
+
if use_vlm and vlm_provider != "ollama" and not vlm_api_key:
|
30
|
+
click.echo("❌ Error: VLM API key is required when using --use-vlm (except for Ollama)", err=True)
|
30
31
|
click.echo(" Set the VLM_API_KEY environment variable or use --vlm-api-key", err=True)
|
31
32
|
click.echo(" Example: export VLM_API_KEY=your_api_key", err=True)
|
32
33
|
sys.exit(1)
|
doctra/engines/vlm/provider.py
CHANGED
@@ -1,86 +1,257 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
# --- keep these imports to match your snippet style ---
|
4
|
-
import io
|
5
|
-
import
|
6
|
-
import
|
7
|
-
import
|
8
|
-
|
9
|
-
from
|
10
|
-
from
|
11
|
-
from
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
:param
|
29
|
-
:
|
30
|
-
:
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
vlm_model = "
|
40
|
-
elif vlm_provider == "
|
41
|
-
vlm_model = "
|
42
|
-
elif vlm_provider == "
|
43
|
-
vlm_model = "
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
)
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
)
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
)
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
api_key
|
80
|
-
)
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
)
|
85
|
-
|
86
|
-
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
# --- keep these imports to match your snippet style ---
|
4
|
+
import io
|
5
|
+
import os
|
6
|
+
import PIL
|
7
|
+
import openai
|
8
|
+
import outlines
|
9
|
+
from pydantic import BaseModel
|
10
|
+
from google.genai import Client
|
11
|
+
from outlines.inputs import Image
|
12
|
+
from anthropic import Anthropic
|
13
|
+
import ollama
|
14
|
+
# ------------------------------------------------------
|
15
|
+
|
16
|
+
def make_model(
|
17
|
+
vlm_provider: str | None = "gemini",
|
18
|
+
vlm_model: str | None = None,
|
19
|
+
*,
|
20
|
+
api_key: str | None = None,
|
21
|
+
):
|
22
|
+
"""
|
23
|
+
Build a callable Outlines model for VLM processing.
|
24
|
+
|
25
|
+
Creates an Outlines model instance configured for Gemini, OpenAI, Anthropic, OpenRouter, or Ollama
|
26
|
+
providers. Only one backend is active at a time, with Gemini as the default.
|
27
|
+
|
28
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", "openrouter", or "ollama", default: "gemini")
|
29
|
+
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
30
|
+
:param api_key: API key for the VLM provider (required for all providers except Ollama)
|
31
|
+
:return: Configured Outlines model instance
|
32
|
+
:raises ValueError: If provider is unsupported or API key is missing
|
33
|
+
"""
|
34
|
+
vlm_provider = (vlm_provider or "gemini").lower()
|
35
|
+
|
36
|
+
# Set default models if not provided
|
37
|
+
if vlm_model is None:
|
38
|
+
if vlm_provider == "gemini":
|
39
|
+
vlm_model = "gemini-2.5-pro"
|
40
|
+
elif vlm_provider == "openai":
|
41
|
+
vlm_model = "gpt-5"
|
42
|
+
elif vlm_provider == "anthropic":
|
43
|
+
vlm_model = "claude-opus-4-1"
|
44
|
+
elif vlm_provider == "openrouter":
|
45
|
+
vlm_model = "x-ai/grok-4"
|
46
|
+
elif vlm_provider == "ollama":
|
47
|
+
vlm_model = "llava:latest"
|
48
|
+
|
49
|
+
if vlm_provider == "gemini":
|
50
|
+
if not api_key:
|
51
|
+
raise ValueError("Gemini provider requires api_key to be passed to make_model(...).")
|
52
|
+
# Create the model (exactly like your snippet)
|
53
|
+
return outlines.from_gemini(
|
54
|
+
Client(api_key=api_key),
|
55
|
+
vlm_model,
|
56
|
+
)
|
57
|
+
|
58
|
+
if vlm_provider == "openai":
|
59
|
+
if not api_key:
|
60
|
+
raise ValueError("OpenAI provider requires api_key to be passed to make_model(...).")
|
61
|
+
# this part is for the openai models (exactly like your snippet)
|
62
|
+
return outlines.from_openai(
|
63
|
+
openai.OpenAI(api_key=api_key),
|
64
|
+
vlm_model,
|
65
|
+
)
|
66
|
+
|
67
|
+
if vlm_provider == "anthropic":
|
68
|
+
if not api_key:
|
69
|
+
raise ValueError("Anthropic provider requires api_key to be passed to make_model(...).")
|
70
|
+
# Create the Anthropic client and model (exactly like your snippet)
|
71
|
+
client = Anthropic(api_key=api_key)
|
72
|
+
return outlines.from_anthropic(
|
73
|
+
client,
|
74
|
+
vlm_model,
|
75
|
+
)
|
76
|
+
|
77
|
+
if vlm_provider == "openrouter":
|
78
|
+
if not api_key:
|
79
|
+
raise ValueError("OpenRouter provider requires api_key to be passed to make_model(...).")
|
80
|
+
# Create an OpenAI-compatible client pointed at the OpenRouter endpoint
|
81
|
+
client = openai.OpenAI(
|
82
|
+
base_url="https://openrouter.ai/api/v1",
|
83
|
+
api_key=api_key,
|
84
|
+
)
|
85
|
+
return outlines.from_openai(
|
86
|
+
client,
|
87
|
+
vlm_model
|
88
|
+
)
|
89
|
+
|
90
|
+
if vlm_provider == "ollama":
|
91
|
+
# Ollama doesn't use Outlines, so we return a custom wrapper
|
92
|
+
return OllamaModelWrapper(vlm_model)
|
93
|
+
|
94
|
+
raise ValueError(f"Unsupported provider: {vlm_provider}. Use 'gemini', 'openai', 'anthropic', 'openrouter', or 'ollama'.")
|
95
|
+
|
96
|
+
|
97
|
+
class OllamaModelWrapper:
|
98
|
+
"""
|
99
|
+
Wrapper class to make Ollama compatible with the Outlines interface.
|
100
|
+
|
101
|
+
This class provides a callable interface that matches the Outlines model
|
102
|
+
signature, allowing Ollama to be used as a drop-in replacement for other
|
103
|
+
VLM providers in the Doctra framework.
|
104
|
+
"""
|
105
|
+
|
106
|
+
def __init__(self, model_name: str):
|
107
|
+
"""
|
108
|
+
Initialize the Ollama model wrapper.
|
109
|
+
|
110
|
+
:param model_name: Name of the Ollama model to use (e.g., "llava:latest", "gemma3:latest")
|
111
|
+
"""
|
112
|
+
self.model_name = model_name
|
113
|
+
|
114
|
+
def __call__(self, prompt, schema):
|
115
|
+
"""
|
116
|
+
Call the Ollama model with the given prompt and schema.
|
117
|
+
|
118
|
+
:param prompt: List containing [text_prompt, Image] - the text prompt and PIL Image
|
119
|
+
:param schema: Pydantic model class for structured output
|
120
|
+
:return: Structured data object matching the provided schema
|
121
|
+
"""
|
122
|
+
if not isinstance(prompt, list) or len(prompt) != 2:
|
123
|
+
raise ValueError("Prompt must be a list with [text, image] format")
|
124
|
+
|
125
|
+
text_prompt, image = prompt
|
126
|
+
|
127
|
+
# Convert Image object to bytes for Ollama
|
128
|
+
# The Image object from Outlines might be a PIL Image or a different type
|
129
|
+
try:
|
130
|
+
# Try to get the PIL Image from the Outlines Image object
|
131
|
+
if hasattr(image, 'image'):
|
132
|
+
pil_image = image.image
|
133
|
+
elif hasattr(image, '_image'):
|
134
|
+
pil_image = image._image
|
135
|
+
else:
|
136
|
+
pil_image = image
|
137
|
+
|
138
|
+
# Convert to bytes
|
139
|
+
img_buffer = io.BytesIO()
|
140
|
+
pil_image.save(img_buffer, format='JPEG')
|
141
|
+
img_bytes = img_buffer.getvalue()
|
142
|
+
except Exception as e:
|
143
|
+
# Try alternative approach - save the image directly to a file
|
144
|
+
import tempfile
|
145
|
+
with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
|
146
|
+
try:
|
147
|
+
if hasattr(image, 'image'):
|
148
|
+
image.image.save(tmp_file.name, format='JPEG')
|
149
|
+
else:
|
150
|
+
image.save(tmp_file.name, format='JPEG')
|
151
|
+
with open(tmp_file.name, 'rb') as f:
|
152
|
+
img_bytes = f.read()
|
153
|
+
os.unlink(tmp_file.name)
|
154
|
+
except Exception as e2:
|
155
|
+
raise
|
156
|
+
|
157
|
+
# Save image to temporary file for Ollama
|
158
|
+
import tempfile
|
159
|
+
with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
|
160
|
+
tmp_file.write(img_bytes)
|
161
|
+
tmp_path = tmp_file.name
|
162
|
+
|
163
|
+
try:
|
164
|
+
# Call Ollama with the image and prompt
|
165
|
+
response = ollama.chat(
|
166
|
+
messages=[{
|
167
|
+
"role": "user",
|
168
|
+
"content": text_prompt,
|
169
|
+
"images": [tmp_path],
|
170
|
+
}],
|
171
|
+
model=self.model_name,
|
172
|
+
format=schema.model_json_schema(), # Use Pydantic schema for structured output
|
173
|
+
)
|
174
|
+
|
175
|
+
# Handle different response formats
|
176
|
+
if 'message' in response and 'content' in response['message']:
|
177
|
+
content = response['message']['content']
|
178
|
+
elif 'response' in response:
|
179
|
+
content = response['response']
|
180
|
+
else:
|
181
|
+
content = str(response)
|
182
|
+
|
183
|
+
# Try to parse as JSON
|
184
|
+
try:
|
185
|
+
result = schema.model_validate_json(content)
|
186
|
+
return result
|
187
|
+
except Exception as json_error:
|
188
|
+
# Try to extract data manually from text response
|
189
|
+
return self._extract_from_text_response(content, schema)
|
190
|
+
|
191
|
+
except Exception as e:
|
192
|
+
# Return a default structure to prevent crashes
|
193
|
+
return schema(
|
194
|
+
title="Extraction Failed",
|
195
|
+
description="Failed to extract data from image",
|
196
|
+
headers=["Error"],
|
197
|
+
rows=[["Could not process image"]]
|
198
|
+
)
|
199
|
+
finally:
|
200
|
+
# Clean up temporary file
|
201
|
+
import os
|
202
|
+
try:
|
203
|
+
os.unlink(tmp_path)
|
204
|
+
except:
|
205
|
+
pass
|
206
|
+
|
207
|
+
def _extract_from_text_response(self, content: str, schema):
|
208
|
+
"""
|
209
|
+
Extract structured data from text response when JSON parsing fails.
|
210
|
+
|
211
|
+
:param content: Text response from Ollama
|
212
|
+
:param schema: Pydantic schema class
|
213
|
+
:return: Structured data object
|
214
|
+
"""
|
215
|
+
try:
|
216
|
+
# Try to find JSON in the response
|
217
|
+
import re
|
218
|
+
import json
|
219
|
+
|
220
|
+
# Look for JSON-like content
|
221
|
+
json_match = re.search(r'\{.*\}', content, re.DOTALL)
|
222
|
+
if json_match:
|
223
|
+
json_str = json_match.group()
|
224
|
+
return schema.model_validate_json(json_str)
|
225
|
+
|
226
|
+
# If no JSON found, create a basic structure
|
227
|
+
lines = content.split('\n')
|
228
|
+
title = "Extracted Data"
|
229
|
+
description = content[:300] if len(content) > 300 else content
|
230
|
+
|
231
|
+
# Try to extract headers and rows from text
|
232
|
+
headers = ["Column 1", "Column 2"] # Default headers
|
233
|
+
rows = [["Data 1", "Data 2"]] # Default row
|
234
|
+
|
235
|
+
# Look for table-like patterns
|
236
|
+
for line in lines:
|
237
|
+
if '|' in line and len(line.split('|')) > 2:
|
238
|
+
# This looks like a table row
|
239
|
+
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
240
|
+
if len(cells) > 1:
|
241
|
+
rows.append(cells)
|
242
|
+
|
243
|
+
return schema(
|
244
|
+
title=title,
|
245
|
+
description=description,
|
246
|
+
headers=headers,
|
247
|
+
rows=rows
|
248
|
+
)
|
249
|
+
|
250
|
+
except Exception as e:
|
251
|
+
# Return minimal structure
|
252
|
+
return schema(
|
253
|
+
title="Text Extraction",
|
254
|
+
description=content[:300] if len(content) > 300 else content,
|
255
|
+
headers=["Content"],
|
256
|
+
rows=[[content[:100]]]
|
257
|
+
)
|
@@ -88,11 +88,14 @@ class StructuredPDFParser:
|
|
88
88
|
self.use_vlm = use_vlm
|
89
89
|
self.vlm = None
|
90
90
|
if self.use_vlm:
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
91
|
+
try:
|
92
|
+
self.vlm = VLMStructuredExtractor(
|
93
|
+
vlm_provider=vlm_provider,
|
94
|
+
vlm_model=vlm_model,
|
95
|
+
api_key=vlm_api_key,
|
96
|
+
)
|
97
|
+
except Exception as e:
|
98
|
+
self.vlm = None
|
96
99
|
|
97
100
|
def parse(self, pdf_path: str) -> None:
|
98
101
|
"""
|
doctra/ui/enhanced_parser_ui.py
CHANGED
@@ -65,7 +65,7 @@ def run_enhanced_parse(
|
|
65
65
|
|
66
66
|
# Validate VLM configuration if VLM is enabled
|
67
67
|
if use_vlm:
|
68
|
-
vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
|
68
|
+
vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
|
69
69
|
if vlm_error:
|
70
70
|
return (vlm_error, None, [], "", None, None, "")
|
71
71
|
|
@@ -358,7 +358,7 @@ def create_enhanced_parser_tab() -> Tuple[gr.Tab, dict]:
|
|
358
358
|
# VLM settings
|
359
359
|
with gr.Row():
|
360
360
|
use_vlm_enhanced = gr.Checkbox(label="Use VLM (optional)", value=False)
|
361
|
-
vlm_provider_enhanced = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
|
361
|
+
vlm_provider_enhanced = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter", "ollama"], value="gemini", label="VLM Provider")
|
362
362
|
vlm_api_key_enhanced = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
|
363
363
|
|
364
364
|
# Advanced settings accordion
|
doctra/ui/full_parse_ui.py
CHANGED
@@ -60,7 +60,7 @@ def run_full_parse(
|
|
60
60
|
return ("No file provided.", None, [], [], "")
|
61
61
|
|
62
62
|
# Validate VLM configuration
|
63
|
-
vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
|
63
|
+
vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
|
64
64
|
if vlm_error:
|
65
65
|
return (vlm_error, None, [], [], "")
|
66
66
|
|
@@ -429,7 +429,7 @@ def create_full_parse_tab() -> Tuple[gr.Tab, dict]:
|
|
429
429
|
with gr.Row():
|
430
430
|
pdf = gr.File(file_types=[".pdf"], label="PDF")
|
431
431
|
use_vlm = gr.Checkbox(label="Use VLM (optional)", value=False)
|
432
|
-
vlm_provider = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
|
432
|
+
vlm_provider = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter", "ollama"], value="gemini", label="VLM Provider")
|
433
433
|
vlm_api_key = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
|
434
434
|
|
435
435
|
# Advanced settings accordion
|
doctra/ui/tables_charts_ui.py
CHANGED
@@ -48,7 +48,7 @@ def run_extract(
|
|
48
48
|
return ("No file provided.", "", [], [], "")
|
49
49
|
|
50
50
|
# Validate VLM configuration
|
51
|
-
vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
|
51
|
+
vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
|
52
52
|
if vlm_error:
|
53
53
|
return (vlm_error, "", [], [], "")
|
54
54
|
|
@@ -334,7 +334,7 @@ def create_tables_charts_tab() -> Tuple[gr.Tab, dict]:
|
|
334
334
|
pdf_e = gr.File(file_types=[".pdf"], label="PDF")
|
335
335
|
target = gr.Dropdown(["tables", "charts", "both"], value="both", label="Target")
|
336
336
|
use_vlm_e = gr.Checkbox(label="Use VLM (optional)", value=False)
|
337
|
-
vlm_provider_e = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
|
337
|
+
vlm_provider_e = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter", "ollama"], value="gemini", label="VLM Provider")
|
338
338
|
vlm_api_key_e = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
|
339
339
|
|
340
340
|
# Advanced settings accordion
|
doctra/ui/ui_helpers.py
CHANGED
@@ -261,21 +261,22 @@ def parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
|
|
261
261
|
return pages
|
262
262
|
|
263
263
|
|
264
|
-
def validate_vlm_config(use_vlm: bool, vlm_api_key: str) -> Optional[str]:
|
264
|
+
def validate_vlm_config(use_vlm: bool, vlm_api_key: str, vlm_provider: str = "gemini") -> Optional[str]:
|
265
265
|
"""
|
266
266
|
Validate VLM configuration parameters.
|
267
267
|
|
268
268
|
Args:
|
269
269
|
use_vlm: Whether VLM is enabled
|
270
270
|
vlm_api_key: API key for VLM provider
|
271
|
+
vlm_provider: VLM provider name (default: "gemini")
|
271
272
|
|
272
273
|
Returns:
|
273
274
|
Error message if validation fails, None if valid
|
274
275
|
"""
|
275
|
-
if use_vlm and not vlm_api_key:
|
276
|
-
return "❌ Error: VLM API key is required when using VLM"
|
276
|
+
if use_vlm and vlm_provider != "ollama" and not vlm_api_key:
|
277
|
+
return "❌ Error: VLM API key is required when using VLM (except for Ollama)"
|
277
278
|
|
278
|
-
if use_vlm and vlm_api_key:
|
279
|
+
if use_vlm and vlm_api_key and vlm_provider != "ollama":
|
279
280
|
# Basic API key validation
|
280
281
|
if len(vlm_api_key.strip()) < 10:
|
281
282
|
return "❌ Error: VLM API key appears to be too short or invalid"
|
doctra/version.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
"""Version information for Doctra."""
|
2
|
-
__version__ = '0.4.2'
|
2
|
+
__version__ = '0.5.0'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.4.2
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -240,6 +240,7 @@ Requires-Dist: anthropic>=0.40.0
|
|
240
240
|
Requires-Dist: outlines>=0.0.34
|
241
241
|
Requires-Dist: tqdm>=4.62.0
|
242
242
|
Requires-Dist: matplotlib>=3.5.0
|
243
|
+
Requires-Dist: click>=8.0.0
|
243
244
|
Provides-Extra: openai
|
244
245
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
245
246
|
Provides-Extra: gemini
|
@@ -259,26 +260,30 @@ Dynamic: requires-python
|
|
259
260
|
|
260
261
|
# 🚀 **Doctra - Document Parser Library** 📑🔎
|
261
262
|
|
262
|
-

|
263
264
|
|
264
265
|
<div align="center">
|
265
266
|
|
266
267
|
[](https://github.com/AdemBoukhris457/Doctra)
|
267
268
|
[](https://github.com/AdemBoukhris457/Doctra)
|
268
269
|
[](https://pypi.org/project/doctra/)
|
270
|
+
[](https://ademboukhris457.github.io/Doctra/index.html)
|
269
271
|
</div>
|
270
272
|
|
271
273
|
## 📋 Table of Contents
|
272
274
|
|
273
|
-
- [Installation](
|
274
|
-
- [Quick Start](
|
275
|
-
- [Core Components](
|
275
|
+
- [Installation](#🛠️-installation)
|
276
|
+
- [Quick Start](#⚡-quick-start)
|
277
|
+
- [Core Components](#🔧-core-components)
|
276
278
|
- [StructuredPDFParser](#structuredpdfparser)
|
279
|
+
- [EnhancedPDFParser](#enhancedpdfparser)
|
277
280
|
- [ChartTablePDFParser](#charttablepdfparser)
|
278
|
-
- [
|
279
|
-
- [
|
280
|
-
- [
|
281
|
-
- [
|
281
|
+
- [DocResEngine](#docresengine)
|
282
|
+
- [Web UI (Gradio)](#🖥️-web-ui-gradio)
|
283
|
+
- [Command Line Interface](#command-line-interface)
|
284
|
+
- [Visualization](#🎨-visualization)
|
285
|
+
- [Usage Examples](#📖-usage-examples)
|
286
|
+
- [Features](#✨-features)
|
282
287
|
|
283
288
|
## 🛠️ Installation
|
284
289
|
|
@@ -391,6 +396,70 @@ parser = StructuredPDFParser(
|
|
391
396
|
)
|
392
397
|
```
|
393
398
|
|
399
|
+
### EnhancedPDFParser
|
400
|
+
|
401
|
+
The `EnhancedPDFParser` extends the `StructuredPDFParser` with advanced image restoration capabilities using DocRes. This parser is ideal for processing scanned documents, low-quality PDFs, or documents with visual distortions that need enhancement before parsing.
|
402
|
+
|
403
|
+
#### Key Features:
|
404
|
+
- **Image Restoration**: Uses DocRes for document enhancement before processing
|
405
|
+
- **Multiple Restoration Tasks**: Supports dewarping, deshadowing, appearance enhancement, deblurring, binarization, and end-to-end restoration
|
406
|
+
- **Enhanced Quality**: Improves document quality for better OCR and layout detection
|
407
|
+
- **All StructuredPDFParser Features**: Inherits all capabilities of the base parser
|
408
|
+
- **Flexible Configuration**: Extensive options for restoration and processing
|
409
|
+
|
410
|
+
#### Basic Usage:
|
411
|
+
|
412
|
+
```python
|
413
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
414
|
+
|
415
|
+
# Basic enhanced parser with image restoration
|
416
|
+
parser = EnhancedPDFParser(
|
417
|
+
use_image_restoration=True,
|
418
|
+
restoration_task="appearance" # Default restoration task
|
419
|
+
)
|
420
|
+
|
421
|
+
# Parse document with enhancement
|
422
|
+
parser.parse("scanned_document.pdf")
|
423
|
+
```
|
424
|
+
|
425
|
+
#### Advanced Configuration:
|
426
|
+
|
427
|
+
```python
|
428
|
+
parser = EnhancedPDFParser(
|
429
|
+
# Image Restoration Settings
|
430
|
+
use_image_restoration=True,
|
431
|
+
restoration_task="dewarping", # Correct perspective distortion
|
432
|
+
restoration_device="cuda", # Use GPU for faster processing
|
433
|
+
restoration_dpi=300, # Higher DPI for better quality
|
434
|
+
|
435
|
+
# VLM Settings
|
436
|
+
use_vlm=True,
|
437
|
+
vlm_provider="openai",
|
438
|
+
vlm_model="gpt-4-vision",
|
439
|
+
vlm_api_key="your_api_key",
|
440
|
+
|
441
|
+
# Layout Detection Settings
|
442
|
+
layout_model_name="PP-DocLayout_plus-L",
|
443
|
+
dpi=200,
|
444
|
+
min_score=0.5,
|
445
|
+
|
446
|
+
# OCR Settings
|
447
|
+
ocr_lang="eng",
|
448
|
+
ocr_psm=6
|
449
|
+
)
|
450
|
+
```
|
451
|
+
|
452
|
+
#### DocRes Restoration Tasks:
|
453
|
+
|
454
|
+
| Task | Description | Best For |
|
455
|
+
|------|-------------|----------|
|
456
|
+
| `appearance` | General appearance enhancement | Most documents (default) |
|
457
|
+
| `dewarping` | Correct perspective distortion | Scanned documents with perspective issues |
|
458
|
+
| `deshadowing` | Remove shadows and lighting artifacts | Documents with shadow problems |
|
459
|
+
| `deblurring` | Reduce blur and improve sharpness | Blurry or low-quality scans |
|
460
|
+
| `binarization` | Convert to black and white | Documents needing clean binarization |
|
461
|
+
| `end2end` | Complete restoration pipeline | Severely degraded documents |
|
462
|
+
|
394
463
|
### ChartTablePDFParser
|
395
464
|
|
396
465
|
The `ChartTablePDFParser` is a specialized parser focused specifically on extracting charts and tables from PDF documents. It's optimized for scenarios where you only need these specific elements, providing faster processing and more targeted output.
|
@@ -444,6 +513,163 @@ parser = ChartTablePDFParser(
|
|
444
513
|
)
|
445
514
|
```
|
446
515
|
|
516
|
+
### DocResEngine
|
517
|
+
|
518
|
+
The `DocResEngine` provides direct access to DocRes image restoration capabilities. This engine is perfect for standalone image restoration tasks or when you need fine-grained control over the restoration process.
|
519
|
+
|
520
|
+
#### Key Features:
|
521
|
+
- **Direct Image Restoration**: Process individual images or entire PDFs
|
522
|
+
- **Multiple Restoration Tasks**: All 6 DocRes restoration tasks available
|
523
|
+
- **GPU Acceleration**: Automatic CUDA detection and optimization
|
524
|
+
- **Flexible Input/Output**: Support for various image formats and PDFs
|
525
|
+
- **Metadata Extraction**: Get detailed information about restoration process
|
526
|
+
|
527
|
+
#### Basic Usage:
|
528
|
+
|
529
|
+
```python
|
530
|
+
from doctra.engines.image_restoration import DocResEngine
|
531
|
+
|
532
|
+
# Initialize DocRes engine
|
533
|
+
docres = DocResEngine(device="cuda") # or "cpu" or None for auto-detect
|
534
|
+
|
535
|
+
# Restore a single image
|
536
|
+
restored_img, metadata = docres.restore_image(
|
537
|
+
image="path/to/image.jpg",
|
538
|
+
task="appearance"
|
539
|
+
)
|
540
|
+
|
541
|
+
# Restore entire PDF
|
542
|
+
enhanced_pdf = docres.restore_pdf(
|
543
|
+
pdf_path="document.pdf",
|
544
|
+
output_path="enhanced_document.pdf",
|
545
|
+
task="appearance"
|
546
|
+
)
|
547
|
+
```
|
548
|
+
|
549
|
+
#### Advanced Usage:
|
550
|
+
|
551
|
+
```python
|
552
|
+
# Initialize with custom settings
|
553
|
+
docres = DocResEngine(
|
554
|
+
device="cuda", # Force GPU usage
|
555
|
+
use_half_precision=True, # Use half precision for faster processing
|
556
|
+
model_path="custom/model.pth", # Custom model path (optional)
|
557
|
+
mbd_path="custom/mbd.pth" # Custom MBD model path (optional)
|
558
|
+
)
|
559
|
+
|
560
|
+
# Process multiple images
|
561
|
+
images = ["doc1.jpg", "doc2.jpg", "doc3.jpg"]
|
562
|
+
for img_path in images:
|
563
|
+
restored_img, metadata = docres.restore_image(
|
564
|
+
image=img_path,
|
565
|
+
task="dewarping"
|
566
|
+
)
|
567
|
+
print(f"Processed {img_path}: {metadata}")
|
568
|
+
|
569
|
+
# Batch PDF processing
|
570
|
+
pdfs = ["report1.pdf", "report2.pdf"]
|
571
|
+
for pdf_path in pdfs:
|
572
|
+
output_path = f"enhanced_{os.path.basename(pdf_path)}"
|
573
|
+
docres.restore_pdf(
|
574
|
+
pdf_path=pdf_path,
|
575
|
+
output_path=output_path,
|
576
|
+
task="end2end" # Complete restoration pipeline
|
577
|
+
)
|
578
|
+
```
|
579
|
+
|
580
|
+
#### Supported Restoration Tasks:
|
581
|
+
|
582
|
+
| Task | Description | Use Case |
|
583
|
+
|------|-------------|----------|
|
584
|
+
| `appearance` | General appearance enhancement | Default choice for most documents |
|
585
|
+
| `dewarping` | Correct document perspective distortion | Scanned documents with perspective issues |
|
586
|
+
| `deshadowing` | Remove shadows and lighting artifacts | Documents with shadow problems |
|
587
|
+
| `deblurring` | Reduce blur and improve sharpness | Blurry or low-quality scans |
|
588
|
+
| `binarization` | Convert to black and white | Documents needing clean binarization |
|
589
|
+
| `end2end` | Complete restoration pipeline | Severely degraded documents |
|
590
|
+
|
591
|
+
## 🖥️ Web UI (Gradio)
|
592
|
+
|
593
|
+
Doctra provides a comprehensive web interface built with Gradio that makes document processing accessible to non-technical users.
|
594
|
+
|
595
|
+
#### Features:
|
596
|
+
- **Drag & Drop Interface**: Upload PDFs by dragging and dropping
|
597
|
+
- **Multiple Parsers**: Choose between full parsing, enhanced parsing, and chart/table extraction
|
598
|
+
- **Real-time Processing**: See progress as documents are processed
|
599
|
+
- **VLM Integration**: Configure API keys for AI features
|
600
|
+
- **Output Preview**: View results directly in the browser
|
601
|
+
- **Download Results**: Download processed files as ZIP archives
|
602
|
+
|
603
|
+
#### Launch the Web UI:
|
604
|
+
|
605
|
+
```python
|
606
|
+
from doctra.ui.app import launch_ui
|
607
|
+
|
608
|
+
# Launch the web interface
|
609
|
+
launch_ui()
|
610
|
+
```
|
611
|
+
|
612
|
+
Or from the command line:
|
613
|
+
```bash
|
614
|
+
python gradio_app.py
|
615
|
+
```
|
616
|
+
|
617
|
+
#### Web UI Components:
|
618
|
+
|
619
|
+
1. **Full Parse Tab**: Complete document processing with page navigation
|
620
|
+
2. **Tables & Charts Tab**: Specialized extraction with VLM integration
|
621
|
+
3. **DocRes Tab**: Image restoration with before/after comparison
|
622
|
+
4. **Enhanced Parser Tab**: Enhanced parsing with DocRes integration
|
623
|
+
|
624
|
+
## Command Line Interface
|
625
|
+
|
626
|
+
Doctra includes a powerful CLI for batch processing and automation.
|
627
|
+
|
628
|
+
#### Available Commands:
|
629
|
+
|
630
|
+
```bash
|
631
|
+
# Full document parsing
|
632
|
+
doctra parse document.pdf
|
633
|
+
|
634
|
+
# Enhanced parsing with image restoration
|
635
|
+
doctra enhance document.pdf --restoration-task appearance
|
636
|
+
|
637
|
+
# Extract only charts and tables
|
638
|
+
doctra extract charts document.pdf
|
639
|
+
doctra extract tables document.pdf
|
640
|
+
doctra extract both document.pdf --use-vlm
|
641
|
+
|
642
|
+
# Visualize layout detection
|
643
|
+
doctra visualize document.pdf
|
644
|
+
|
645
|
+
# Quick document analysis
|
646
|
+
doctra analyze document.pdf
|
647
|
+
|
648
|
+
# System information
|
649
|
+
doctra info
|
650
|
+
```
|
651
|
+
|
652
|
+
#### CLI Examples:
|
653
|
+
|
654
|
+
```bash
|
655
|
+
# Enhanced parsing with custom settings
|
656
|
+
doctra enhance document.pdf \
|
657
|
+
--restoration-task dewarping \
|
658
|
+
--restoration-device cuda \
|
659
|
+
--use-vlm \
|
660
|
+
--vlm-provider openai \
|
661
|
+
--vlm-api-key your_key
|
662
|
+
|
663
|
+
# Extract charts with VLM
|
664
|
+
doctra extract charts document.pdf \
|
665
|
+
--use-vlm \
|
666
|
+
--vlm-provider gemini \
|
667
|
+
--vlm-api-key your_key
|
668
|
+
|
669
|
+
# Batch processing
|
670
|
+
doctra parse *.pdf --output-dir results/
|
671
|
+
```
|
672
|
+
|
447
673
|
## 🎨 Visualization
|
448
674
|
|
449
675
|
Doctra provides powerful visualization capabilities to help you understand how the layout detection works and verify the accuracy of element extraction.
|
@@ -540,7 +766,53 @@ parser.parse("financial_report.pdf")
|
|
540
766
|
# - Markdown file with all content
|
541
767
|
```
|
542
768
|
|
543
|
-
### Example 2:
|
769
|
+
### Example 2: Enhanced Parsing with Image Restoration
|
770
|
+
|
771
|
+
```python
|
772
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
773
|
+
|
774
|
+
# Initialize enhanced parser with image restoration
|
775
|
+
parser = EnhancedPDFParser(
|
776
|
+
use_image_restoration=True,
|
777
|
+
restoration_task="dewarping", # Correct perspective distortion
|
778
|
+
restoration_device="cuda", # Use GPU for faster processing
|
779
|
+
use_vlm=True,
|
780
|
+
vlm_provider="openai",
|
781
|
+
vlm_api_key="your_api_key"
|
782
|
+
)
|
783
|
+
|
784
|
+
# Process scanned document with enhancement
|
785
|
+
parser.parse("scanned_document.pdf")
|
786
|
+
|
787
|
+
# Output will include:
|
788
|
+
# - Enhanced PDF with restored images
|
789
|
+
# - All standard parsing outputs
|
790
|
+
# - Improved OCR accuracy due to restoration
|
791
|
+
```
|
792
|
+
|
793
|
+
### Example 3: Direct Image Restoration
|
794
|
+
|
795
|
+
```python
|
796
|
+
from doctra.engines.image_restoration import DocResEngine
|
797
|
+
|
798
|
+
# Initialize DocRes engine
|
799
|
+
docres = DocResEngine(device="cuda")
|
800
|
+
|
801
|
+
# Restore individual images
|
802
|
+
restored_img, metadata = docres.restore_image(
|
803
|
+
image="blurry_document.jpg",
|
804
|
+
task="deblurring"
|
805
|
+
)
|
806
|
+
|
807
|
+
# Restore entire PDF
|
808
|
+
docres.restore_pdf(
|
809
|
+
pdf_path="low_quality.pdf",
|
810
|
+
output_path="enhanced.pdf",
|
811
|
+
task="appearance"
|
812
|
+
)
|
813
|
+
```
|
814
|
+
|
815
|
+
### Example 4: Chart and Table Extraction with VLM
|
544
816
|
|
545
817
|
```python
|
546
818
|
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
@@ -563,29 +835,42 @@ parser.parse("data_report.pdf", output_base_dir="extracted_data")
|
|
563
835
|
# - Markdown tables with extracted data
|
564
836
|
```
|
565
837
|
|
566
|
-
### Example
|
838
|
+
### Example 5: Web UI Usage
|
567
839
|
|
568
840
|
```python
|
569
|
-
from doctra.
|
841
|
+
from doctra.ui.app import launch_ui
|
570
842
|
|
571
|
-
#
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
ocr_psm=6, # Uniform block of text
|
582
|
-
box_separator="\n\n" # Double line breaks between elements
|
583
|
-
)
|
843
|
+
# Launch the web interface
|
844
|
+
launch_ui()
|
845
|
+
|
846
|
+
# Or build the interface programmatically
|
847
|
+
from doctra.ui.app import build_demo
|
848
|
+
demo = build_demo()
|
849
|
+
demo.launch(share=True) # Share publicly
|
850
|
+
```
|
851
|
+
|
852
|
+
### Example 6: Command Line Usage
|
584
853
|
|
585
|
-
|
854
|
+
```bash
|
855
|
+
# Enhanced parsing with custom settings
|
856
|
+
doctra enhance document.pdf \
|
857
|
+
--restoration-task dewarping \
|
858
|
+
--restoration-device cuda \
|
859
|
+
--use-vlm \
|
860
|
+
--vlm-provider openai \
|
861
|
+
--vlm-api-key your_key
|
862
|
+
|
863
|
+
# Extract charts with VLM
|
864
|
+
doctra extract charts document.pdf \
|
865
|
+
--use-vlm \
|
866
|
+
--vlm-provider gemini \
|
867
|
+
--vlm-api-key your_key
|
868
|
+
|
869
|
+
# Batch processing
|
870
|
+
doctra parse *.pdf --output-dir results/
|
586
871
|
```
|
587
872
|
|
588
|
-
### Example
|
873
|
+
### Example 7: Layout Visualization
|
589
874
|
|
590
875
|
```python
|
591
876
|
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
|
@@ -624,68 +909,41 @@ parser.display_pages_with_boxes("document.pdf")
|
|
624
909
|
- Organized output directory structure
|
625
910
|
- High-resolution image preservation
|
626
911
|
|
912
|
+
### 🔧 Image Restoration (DocRes)
|
913
|
+
- **6 Restoration Tasks**: Dewarping, deshadowing, appearance enhancement, deblurring, binarization, and end-to-end restoration
|
914
|
+
- **GPU Acceleration**: Automatic CUDA detection and optimization
|
915
|
+
- **Enhanced Quality**: Improves document quality for better OCR and layout detection
|
916
|
+
- **Flexible Processing**: Standalone image restoration or integrated with parsing
|
917
|
+
|
627
918
|
### 🤖 VLM Integration
|
628
919
|
- Vision Language Model support for structured data extraction
|
629
|
-
- Multiple provider options (Gemini,
|
920
|
+
- Multiple provider options (OpenAI, Gemini, Anthropic, OpenRouter, Ollama)
|
630
921
|
- Automatic conversion of charts and tables to structured formats
|
631
922
|
|
632
923
|
### 📊 Multiple Output Formats
|
633
924
|
- **Markdown**: Human-readable document with embedded images and tables
|
634
925
|
- **Excel**: Structured data in spreadsheet format
|
635
926
|
- **JSON**: Programmatically accessible structured data
|
927
|
+
- **HTML**: Interactive web-ready documents
|
636
928
|
- **Images**: High-quality cropped visual elements
|
637
929
|
|
930
|
+
### 🖥️ User Interfaces
|
931
|
+
- **Web UI**: Gradio-based interface with drag & drop functionality
|
932
|
+
- **Command Line**: Powerful CLI for batch processing and automation
|
933
|
+
- **Multiple Tabs**: Full parsing, enhanced parsing, chart/table extraction, and image restoration
|
934
|
+
|
638
935
|
### ⚙️ Flexible Configuration
|
639
936
|
- Extensive customization options
|
640
937
|
- Performance tuning parameters
|
641
938
|
- Output format selection
|
939
|
+
- Device selection (CPU/GPU)
|
642
940
|
|
643
|
-
##
|
644
|
-
|
645
|
-
### Core Dependencies
|
646
|
-
- **PaddleOCR**: Document layout detection
|
647
|
-
- **Outlines**: Structured output generation
|
648
|
-
- **Tesseract**: OCR text extraction
|
649
|
-
- **Pillow**: Image processing
|
650
|
-
- **OpenCV**: Computer vision operations
|
651
|
-
- **Pandas**: Data manipulation
|
652
|
-
- **OpenPyXL**: Excel file generation
|
653
|
-
- **Google Generative AI**: For Gemini VLM integration
|
654
|
-
- **OpenAI**: For GPT-5 VLM integration
|
655
|
-
|
656
|
-
## 🖥️ Web Interface (Gradio)
|
657
|
-
|
658
|
-
You can try Doctra in a simple web UI powered by Gradio.
|
659
|
-
|
660
|
-
### Run locally
|
661
|
-
|
662
|
-
```bash
|
663
|
-
pip install -U gradio
|
664
|
-
python gradio_app.py
|
665
|
-
```
|
666
|
-
|
667
|
-
Then open the printed URL (default `http://127.0.0.1:7860`).
|
668
|
-
|
669
|
-
Notes:
|
670
|
-
- If using VLM, set the API key field in the UI or export `VLM_API_KEY`.
|
671
|
-
- Outputs are saved under `outputs/<pdf_stem>/` and previewed in the UI.
|
672
|
-
|
673
|
-
### Deploy on Hugging Face Spaces
|
674
|
-
|
675
|
-
1) Create a new Space (type: Gradio, SDK: Python).
|
676
|
-
|
677
|
-
2) Add these files to the Space repo:
|
678
|
-
- Your package code (or install from PyPI).
|
679
|
-
- `gradio_app.py` (entry point).
|
680
|
-
- `requirements.txt` with at least:
|
681
|
-
|
682
|
-
```text
|
683
|
-
doctra
|
684
|
-
gradio
|
685
|
-
```
|
941
|
+
## 🙏 Acknowledgments
|
686
942
|
|
687
|
-
|
943
|
+
Doctra builds upon several excellent open-source projects:
|
688
944
|
|
689
|
-
|
945
|
+
- **[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)** - Advanced document layout detection and OCR capabilities
|
946
|
+
- **[DocRes](https://github.com/ZZZHANG-jx/DocRes)** - State-of-the-art document image restoration model
|
947
|
+
- **[Outlines](https://github.com/dottxt-ai/outlines)** - Structured output generation for LLMs
|
690
948
|
|
691
|
-
|
949
|
+
We thank the developers and contributors of these projects for their valuable work that makes Doctra possible.
|
@@ -1,8 +1,8 @@
|
|
1
1
|
doctra/__init__.py,sha256=rNLCyODOpaPb_TTP6qmQnuWZJW9JPXrxg1IfKnvb1No,773
|
2
|
-
doctra/version.py,sha256=
|
2
|
+
doctra/version.py,sha256=A8O6Kr44VM50_wm5hKwwcjBUZJFPAk64i5o7DatMRlQ,62
|
3
3
|
doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
|
4
|
-
doctra/cli/main.py,sha256=
|
5
|
-
doctra/cli/utils.py,sha256=
|
4
|
+
doctra/cli/main.py,sha256=UhWTatY3qIeutZzVo9syLG2srbs8MZuGaLo5tk9xC_M,43108
|
5
|
+
doctra/cli/utils.py,sha256=GKSSGi-JjNXufNekqCysSev7St1t32caYMduy0Tq96s,11971
|
6
6
|
doctra/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
doctra/engines/image_restoration/__init__.py,sha256=vzcN6Rw7_U-5jIK2pdo2NlgqdLdXDShigrOGM7QLNEE,263
|
8
8
|
doctra/engines/image_restoration/docres_engine.py,sha256=wbo-FWEb6_Twq5KqzjPgGQwcAuFD98uBAiQBEY8vN2A,21592
|
@@ -15,7 +15,7 @@ doctra/engines/ocr/path_resolver.py,sha256=2_7Nsekt3dCDU3oVsgdr62iMrlAhbGNfYwgh4
|
|
15
15
|
doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMYZiOdb_6PoQw,2911
|
16
16
|
doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
doctra/engines/vlm/outlines_types.py,sha256=fQK6ru7XiXHaa8JPpaTTBaTk_zQ93ZyhFp4SyAnUdVU,1337
|
18
|
-
doctra/engines/vlm/provider.py,sha256=
|
18
|
+
doctra/engines/vlm/provider.py,sha256=lXQJNxDTxBHSxuEMbF37PjETEokR9o7rc2jLWEH9RnU,9943
|
19
19
|
doctra/engines/vlm/service.py,sha256=nygxMe7uTq6Bv70ycBPL59F2a0ESp1Hix4j833p6rUM,4343
|
20
20
|
doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
doctra/exporters/excel_writer.py,sha256=rwyqlH73P7z413BELovQY_pS6IMkkqHEho6mbPrJ2Sk,11857
|
@@ -26,7 +26,7 @@ doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN
|
|
26
26
|
doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
|
27
27
|
doctra/parsers/enhanced_pdf_parser.py,sha256=TG4uM_dK80-69y1C99HhSoVInHGwTb-sGJtmHBpZuMY,23756
|
28
28
|
doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
|
29
|
-
doctra/parsers/structured_pdf_parser.py,sha256=
|
29
|
+
doctra/parsers/structured_pdf_parser.py,sha256=3jPulhR0agnhP1r9j48WvH53-NZVMhePAmNLzy-_fes,22391
|
30
30
|
doctra/parsers/table_chart_extractor.py,sha256=ZD0l2V_8HBdHOAIhMIujfnd5ai3gXsSLL67VMVu3F8A,13905
|
31
31
|
doctra/third_party/docres/inference.py,sha256=krD5EQDiqki-5uTMqqHYivhL38sfSOhYgaihI751070,13576
|
32
32
|
doctra/third_party/docres/utils.py,sha256=N0ZVmOTB3wsinFlYu5hT84C4_MhWGdc98T8LTG-S9dA,14566
|
@@ -52,10 +52,10 @@ doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
|
|
52
52
|
doctra/ui/app.py,sha256=I9pX-U3VASGs4kfL6Tv3nDH2tlU4kSv5WrnsNDfYTbQ,2305
|
53
53
|
doctra/ui/docres_ui.py,sha256=QMTsNUdw2NGlHK-mYwB-j5i2QXEndYv8Zvc8213jXVA,13034
|
54
54
|
doctra/ui/docres_wrapper.py,sha256=BjcY5Xik9UBFPzPL-ONT2GIpTeRrYUXXzuDEq1QE28Q,4498
|
55
|
-
doctra/ui/enhanced_parser_ui.py,sha256=
|
56
|
-
doctra/ui/full_parse_ui.py,sha256=
|
57
|
-
doctra/ui/tables_charts_ui.py,sha256=
|
58
|
-
doctra/ui/ui_helpers.py,sha256=
|
55
|
+
doctra/ui/enhanced_parser_ui.py,sha256=oImlFfpjLGs3CpOIUIx_o-1fK7ddUhUCOYW4NUiuJrA,20778
|
56
|
+
doctra/ui/full_parse_ui.py,sha256=h-bckQq9FRbVA00l4VQXnzdLgNIrIeAtVVdHkihTPjE,18621
|
57
|
+
doctra/ui/tables_charts_ui.py,sha256=ZcRhTbi4iB0tBi3JC-Z3w6AN6dgUOWt9sV_-iJCkaFE,16558
|
58
|
+
doctra/ui/ui_helpers.py,sha256=Wx36d5rbUdRXQg98w45DIxH0Hib0mTMEmv2cH3ejyGI,15753
|
59
59
|
doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
60
60
|
doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
|
61
61
|
doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
|
@@ -66,8 +66,9 @@ doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
|
|
66
66
|
doctra/utils/progress.py,sha256=BD9YZqYLZw6yohQnyUV3w9QsQuiIrXM_EqByOSSJsDU,11912
|
67
67
|
doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
|
68
68
|
doctra/utils/structured_utils.py,sha256=vU84dsD8wIlTyMsA9hitorGH-eroQiVuWEpBTQBUT24,1478
|
69
|
-
doctra-0.
|
70
|
-
doctra-0.
|
71
|
-
doctra-0.
|
72
|
-
doctra-0.
|
73
|
-
doctra-0.
|
69
|
+
doctra-0.5.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
70
|
+
doctra-0.5.0.dist-info/METADATA,sha256=tall4Spu8hFtNARaVVCNl9QedT-4VUubsV4oqrMLxoc,37168
|
71
|
+
doctra-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
72
|
+
doctra-0.5.0.dist-info/entry_points.txt,sha256=4G2RHamA0llCiIXaQQm8EDkVK9JNGKbI7uDnXVFgIaY,47
|
73
|
+
doctra-0.5.0.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
|
74
|
+
doctra-0.5.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|