doctra 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doctra/cli/main.py CHANGED
@@ -28,6 +28,7 @@ except ImportError:
 
 # Import additional modules
 from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
+from doctra.cli.utils import validate_vlm_config, handle_keyboard_interrupt
 from doctra.engines.image_restoration import DocResEngine
 
 
@@ -85,7 +86,7 @@ def vlm_options(func):
     """
    func = click.option('--use-vlm/--no-vlm', default=False,
                        help='Use Vision Language Model for table/chart extraction')(func)
-    func = click.option('--vlm-provider', type=click.Choice(['gemini', 'openai']), default='gemini',
+    func = click.option('--vlm-provider', type=click.Choice(['gemini', 'openai', 'anthropic', 'openrouter', 'ollama']), default='gemini',
                        help='VLM provider to use (default: gemini)')(func)
    func = click.option('--vlm-model', type=str, default=None,
                        help='Model name to use (defaults to provider-specific defaults)')(func)
@@ -141,23 +142,6 @@ def ocr_options(func):
     return func
 
 
-def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
-    """
-    Validate VLM configuration and exit with error if invalid.
-
-    Checks if VLM is enabled but no API key is provided, and exits
-    with an appropriate error message if the configuration is invalid.
-
-    :param use_vlm: Whether VLM processing is enabled
-    :param vlm_api_key: The VLM API key (can be None if VLM is disabled)
-    :return: None
-    :raises SystemExit: If VLM is enabled but no API key is provided
-    """
-    if use_vlm and not vlm_api_key:
-        click.echo("❌ Error: VLM API key is required when using --use-vlm", err=True)
-        click.echo("   Set the VLM_API_KEY environment variable or use --vlm-api-key", err=True)
-        click.echo("   Example: export VLM_API_KEY=your_api_key", err=True)
-        sys.exit(1)
 
 
 @cli.command()
@@ -212,7 +196,7 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
     :param verbose: Whether to enable verbose output
     :return: None
     """
-    validate_vlm_config(use_vlm, vlm_api_key)
+    validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
 
     if verbose:
         click.echo(f"🔍 Starting full PDF parsing...")
@@ -350,7 +334,7 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
     :param verbose: Whether to enable verbose output
     :return: None
     """
-    validate_vlm_config(use_vlm, vlm_api_key)
+    validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
 
     if verbose:
         click.echo(f"🔍 Starting enhanced PDF parsing with DocRes...")
@@ -488,7 +472,7 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
     :param verbose: Whether to enable verbose output
     :return: None
     """
-    validate_vlm_config(use_vlm, vlm_api_key)
+    validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
 
     if verbose:
         click.echo(f"📊 Starting chart extraction...")
@@ -564,7 +548,7 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
     :param verbose: Whether to enable verbose output
     :return: None
     """
-    validate_vlm_config(use_vlm, vlm_api_key)
+    validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
 
     if verbose:
         click.echo(f"📋 Starting table extraction...")
@@ -642,7 +626,7 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
     :param verbose: Whether to enable verbose output
     :return: None
     """
-    validate_vlm_config(use_vlm, vlm_api_key)
+    validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
 
     if verbose:
         click.echo(f"📊📋 Starting chart and table extraction...")
@@ -972,6 +956,9 @@ def info():
     click.echo("\nVLM Providers:")
     click.echo("  • Gemini (Google) - gemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite, gemini-2.0-flash")
     click.echo("  • OpenAI - gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini, gpt-4o")
+    click.echo("  • Anthropic - claude-opus-4-1, claude-3.5-sonnet, claude-3-haiku")
+    click.echo("  • OpenRouter - x-ai/grok-4, meta-llama/llama-3.1-405b-instruct")
+    click.echo("  • Ollama (Local) - llava:latest, gemma3:latest, llama3.2-vision:latest")
 
     # Available layout models
     click.echo("\nLayout Detection Models:")
doctra/cli/utils.py CHANGED
@@ -13,20 +13,21 @@ from pathlib import Path
 from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
 
 
-def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
+def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str], vlm_provider: str = "gemini") -> None:
     """
     Validate VLM configuration and exit with error if invalid.
 
-    Checks if VLM is enabled but no API key is provided, and exits
+    Checks if VLM is enabled but no API key is provided (except for Ollama), and exits
     with an appropriate error message if the configuration is invalid.
 
     :param use_vlm: Whether VLM processing is enabled
-    :param vlm_api_key: The VLM API key (can be None if VLM is disabled)
+    :param vlm_api_key: The VLM API key (can be None if VLM is disabled or using Ollama)
+    :param vlm_provider: VLM provider name (default: "gemini")
     :return: None
-    :raises SystemExit: If VLM is enabled but no API key is provided
+    :raises SystemExit: If VLM is enabled but no API key is provided (except for Ollama)
     """
-    if use_vlm and not vlm_api_key:
-        click.echo("❌ Error: VLM API key is required when using --use-vlm", err=True)
+    if use_vlm and vlm_provider != "ollama" and not vlm_api_key:
+        click.echo("❌ Error: VLM API key is required when using --use-vlm (except for Ollama)", err=True)
         click.echo("   Set the VLM_API_KEY environment variable or use --vlm-api-key", err=True)
         click.echo("   Example: export VLM_API_KEY=your_api_key", err=True)
         sys.exit(1)
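A minimal sketch of the relocated helper's new contract (the second call prints the error and exits with status 1):

    from doctra.cli.utils import validate_vlm_config

    validate_vlm_config(use_vlm=True, vlm_api_key=None, vlm_provider="ollama")  # passes: Ollama needs no key
    validate_vlm_config(use_vlm=True, vlm_api_key=None, vlm_provider="gemini")  # error message + sys.exit(1)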
doctra/engines/vlm/provider.py CHANGED
@@ -1,86 +1,272 @@
-from __future__ import annotations
-
-# --- keep these imports to match your snippet style ---
-import io
-import PIL
-import openai
-import outlines
-from pydantic import BaseModel
-from google.genai import Client
-from outlines.inputs import Image
-from anthropic import Anthropic
-# ------------------------------------------------------
-
-def make_model(
-    vlm_provider: str | None = "gemini",
-    vlm_model: str | None = None,
-    *,
-    api_key: str | None = None,
-):
-    """
-    Build a callable Outlines model for VLM processing.
-
-    Creates an Outlines model instance configured for Gemini, OpenAI, Anthropic, or OpenRouter
-    providers. Only one backend is active at a time, with Gemini as the default.
-
-    :param vlm_provider: VLM provider to use ("gemini", "openai", or "anthropic", default: "gemini")
-    :param vlm_model: Model name to use (defaults to provider-specific defaults)
-    :param api_key: API key for the VLM provider (required for all providers)
-    :return: Configured Outlines model instance
-    :raises ValueError: If provider is unsupported or API key is missing
-    """
-    vlm_provider = (vlm_provider or "gemini").lower()
-
-    # Set default models if not provided
-    if vlm_model is None:
-        if vlm_provider == "gemini":
-            vlm_model = "gemini-2.5-pro"
-        elif vlm_provider == "openai":
-            vlm_model = "gpt-5"
-        elif vlm_provider == "anthropic":
-            vlm_model = "claude-opus-4-1"
-        elif vlm_provider == "openrouter":
-            vlm_model = "x-ai/grok-4"
-
-    if vlm_provider == "gemini":
-        if not api_key:
-            raise ValueError("Gemini provider requires api_key to be passed to make_model(...).")
-        # Create the model (exactly like your snippet)
-        return outlines.from_gemini(
-            Client(api_key=api_key),
-            vlm_model,
-        )
-
-    if vlm_provider == "openai":
-        if not api_key:
-            raise ValueError("OpenAI provider requires api_key to be passed to make_model(...).")
-        # this part is for the openai models (exactly like your snippet)
-        return outlines.from_openai(
-            openai.OpenAI(api_key=api_key),
-            vlm_model,
-        )
-
-    if vlm_provider == "anthropic":
-        if not api_key:
-            raise ValueError("Anthropic provider requires api_key to be passed to make_model(...).")
-        # Create the Anthropic client and model (exactly like your snippet)
-        client = Anthropic(api_key=api_key)
-        return outlines.from_anthropic(
-            client,
-            vlm_model,
-        )
-
-    if vlm_provider == "openrouter":
-        if not api_key:
-            raise ValueError("OpenRouter provider requires api_key to be passed to make_model(...).")
-        # Create the Anthropic client and model (exactly like your snippet)
-        client = openai.OpenAI(
-            base_url="https://openrouter.ai/api/v1",
-            api_key=api_key,
-        )
-        return outlines.from_openai(
-            client,
-            vlm_model
-        )
-
-    raise ValueError(f"Unsupported provider: {vlm_provider}. Use 'gemini', 'openai', or 'anthropic'.")
+from __future__ import annotations
+
+# --- keep these imports to match your snippet style ---
+import io
+import os
+import PIL
+import openai
+import outlines
+from pydantic import BaseModel
+from google.genai import Client
+from outlines.inputs import Image
+from anthropic import Anthropic
+import ollama
+# ------------------------------------------------------
+
+def make_model(
+    vlm_provider: str | None = "gemini",
+    vlm_model: str | None = None,
+    *,
+    api_key: str | None = None,
+):
+    """
+    Build a callable Outlines model for VLM processing.
+
+    Creates an Outlines model instance configured for Gemini, OpenAI, Anthropic, OpenRouter, Qianfan, or Ollama
+    providers. Only one backend is active at a time, with Gemini as the default.
+
+    :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", "openrouter", "qianfan", or "ollama", default: "gemini")
+    :param vlm_model: Model name to use (defaults to provider-specific defaults)
+    :param api_key: API key for the VLM provider (required for all providers except Ollama)
+    :return: Configured Outlines model instance
+    :raises ValueError: If provider is unsupported or API key is missing
+    """
+    vlm_provider = (vlm_provider or "gemini").lower()
+
+    # Set default models if not provided
+    if vlm_model is None:
+        if vlm_provider == "gemini":
+            vlm_model = "gemini-2.5-pro"
+        elif vlm_provider == "openai":
+            vlm_model = "gpt-5"
+        elif vlm_provider == "anthropic":
+            vlm_model = "claude-opus-4-1"
+        elif vlm_provider == "openrouter":
+            vlm_model = "x-ai/grok-4"
+        elif vlm_provider == "qianfan":
+            vlm_model = "ernie-4.5-turbo-vl-32k"
+        elif vlm_provider == "ollama":
+            vlm_model = "llava:latest"
+
+    if vlm_provider == "gemini":
+        if not api_key:
+            raise ValueError("Gemini provider requires api_key to be passed to make_model(...).")
+        # Create the model (exactly like your snippet)
+        return outlines.from_gemini(
+            Client(api_key=api_key),
+            vlm_model,
+        )
+
+    if vlm_provider == "openai":
+        if not api_key:
+            raise ValueError("OpenAI provider requires api_key to be passed to make_model(...).")
+        # this part is for the openai models (exactly like your snippet)
+        return outlines.from_openai(
+            openai.OpenAI(api_key=api_key),
+            vlm_model,
+        )
+
+    if vlm_provider == "anthropic":
+        if not api_key:
+            raise ValueError("Anthropic provider requires api_key to be passed to make_model(...).")
+        # Create the Anthropic client and model (exactly like your snippet)
+        client = Anthropic(api_key=api_key)
+        return outlines.from_anthropic(
+            client,
+            vlm_model,
+        )
+
+    if vlm_provider == "openrouter":
+        if not api_key:
+            raise ValueError("OpenRouter provider requires api_key to be passed to make_model(...).")
+        # Create the Anthropic client and model (exactly like your snippet)
+        client = openai.OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=api_key,
+        )
+        return outlines.from_openai(
+            client,
+            vlm_model
+        )
+
+    if vlm_provider == "qianfan":
+        if not api_key:
+            raise ValueError("Qianfan provider requires api_key to be passed to make_model(...).")
+        # Create the Qianfan client with OpenAI-compatible interface
+        client = openai.OpenAI(
+            base_url="https://qianfan.baidubce.com/v2",
+            api_key=api_key,
+        )
+        return outlines.from_openai(
+            client,
+            vlm_model
+        )
+
+    if vlm_provider == "ollama":
+        # Ollama doesn't use Outlines, so we return a custom wrapper
+        return OllamaModelWrapper(vlm_model)
+
+    raise ValueError(f"Unsupported provider: {vlm_provider}. Use 'gemini', 'openai', 'anthropic', 'openrouter', 'qianfan', or 'ollama'.")
+
+
+class OllamaModelWrapper:
+    """
+    Wrapper class to make Ollama compatible with the Outlines interface.
+
+    This class provides a callable interface that matches the Outlines model
+    signature, allowing Ollama to be used as a drop-in replacement for other
+    VLM providers in the Doctra framework.
+    """
+
+    def __init__(self, model_name: str):
+        """
+        Initialize the Ollama model wrapper.
+
+        :param model_name: Name of the Ollama model to use (e.g., "llava:latest", "gemma3:latest")
+        """
+        self.model_name = model_name
+
+    def __call__(self, prompt, schema):
+        """
+        Call the Ollama model with the given prompt and schema.
+
+        :param prompt: List containing [text_prompt, Image] - the text prompt and PIL Image
+        :param schema: Pydantic model class for structured output
+        :return: Structured data object matching the provided schema
+        """
+        if not isinstance(prompt, list) or len(prompt) != 2:
+            raise ValueError("Prompt must be a list with [text, image] format")
+
+        text_prompt, image = prompt
+
+        # Convert Image object to bytes for Ollama
+        # The Image object from Outlines might be a PIL Image or a different type
+        try:
+            # Try to get the PIL Image from the Outlines Image object
+            if hasattr(image, 'image'):
+                pil_image = image.image
+            elif hasattr(image, '_image'):
+                pil_image = image._image
+            else:
+                pil_image = image
+
+            # Convert to bytes
+            img_buffer = io.BytesIO()
+            pil_image.save(img_buffer, format='JPEG')
+            img_bytes = img_buffer.getvalue()
+        except Exception as e:
+            # Try alternative approach - save the image directly to a file
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
+                try:
+                    if hasattr(image, 'image'):
+                        image.image.save(tmp_file.name, format='JPEG')
+                    else:
+                        image.save(tmp_file.name, format='JPEG')
+                    with open(tmp_file.name, 'rb') as f:
+                        img_bytes = f.read()
+                    os.unlink(tmp_file.name)
+                except Exception as e2:
+                    raise
+
+        # Save image to temporary file for Ollama
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
+            tmp_file.write(img_bytes)
+            tmp_path = tmp_file.name
+
+        try:
+            # Call Ollama with the image and prompt
+            response = ollama.chat(
+                messages=[{
+                    "role": "user",
+                    "content": text_prompt,
+                    "images": [tmp_path],
+                }],
+                model=self.model_name,
+                format=schema.model_json_schema(),  # Use Pydantic schema for structured output
+            )
+
+            # Handle different response formats
+            if 'message' in response and 'content' in response['message']:
+                content = response['message']['content']
+            elif 'response' in response:
+                content = response['response']
+            else:
+                content = str(response)
+
+            # Try to parse as JSON
+            try:
+                result = schema.model_validate_json(content)
+                return result
+            except Exception as json_error:
+                # Try to extract data manually from text response
+                return self._extract_from_text_response(content, schema)
+
+        except Exception as e:
+            # Return a default structure to prevent crashes
+            return schema(
+                title="Extraction Failed",
+                description="Failed to extract data from image",
+                headers=["Error"],
+                rows=[["Could not process image"]]
+            )
+        finally:
+            # Clean up temporary file
+            import os
+            try:
+                os.unlink(tmp_path)
+            except:
+                pass
+
+    def _extract_from_text_response(self, content: str, schema):
+        """
+        Extract structured data from text response when JSON parsing fails.
+
+        :param content: Text response from Ollama
+        :param schema: Pydantic schema class
+        :return: Structured data object
+        """
+        try:
+            # Try to find JSON in the response
+            import re
+            import json
+
+            # Look for JSON-like content
+            json_match = re.search(r'\{.*\}', content, re.DOTALL)
+            if json_match:
+                json_str = json_match.group()
+                return schema.model_validate_json(json_str)
+
+            # If no JSON found, create a basic structure
+            lines = content.split('\n')
+            title = "Extracted Data"
+            description = content[:300] if len(content) > 300 else content
+
+            # Try to extract headers and rows from text
+            headers = ["Column 1", "Column 2"]  # Default headers
+            rows = [["Data 1", "Data 2"]]  # Default row
+
+            # Look for table-like patterns
+            for line in lines:
+                if '|' in line and len(line.split('|')) > 2:
+                    # This looks like a table row
+                    cells = [cell.strip() for cell in line.split('|') if cell.strip()]
+                    if len(cells) > 1:
+                        rows.append(cells)
+
+            return schema(
+                title=title,
+                description=description,
+                headers=headers,
+                rows=rows
+            )
+
+        except Exception as e:
+            # Return minimal structure
+            return schema(
+                title="Text Extraction",
+                description=content[:300] if len(content) > 300 else content,
+                headers=["Content"],
+                rows=[[content[:100]]]
+            )
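A usage sketch for the rewritten factory. The Table schema below is hypothetical, but its fields mirror the fallback structure the wrapper itself constructs (title, description, headers, rows), and the image path is a placeholder:

    import PIL.Image
    from pydantic import BaseModel
    from outlines.inputs import Image
    from doctra.engines.vlm.provider import make_model

    class Table(BaseModel):  # hypothetical schema for illustration
        title: str
        description: str
        headers: list[str]
        rows: list[list[str]]

    model = make_model("ollama", "llava:latest")  # local provider, no api_key
    pil_img = PIL.Image.open("table_crop.png")    # placeholder crop
    table = model(["Extract this table.", Image(pil_img)], Table)

Note that the wrapper never raises on model failure: it returns the schema's fallback structure instead, so callers should treat an "Extraction Failed" title as an error signal.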
doctra/engines/vlm/service.py CHANGED
@@ -32,7 +32,7 @@ class VLMStructuredExtractor:
         """
         Initialize the VLMStructuredExtractor with provider configuration.
 
-        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
+        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", "openrouter", "qianfan", or "ollama", default: "gemini")
         :param vlm_model: Model name to use (defaults to provider-specific defaults)
         :param api_key: API key for the VLM provider (required for all providers)
         """
doctra/parsers/structured_pdf_parser.py CHANGED
@@ -88,11 +88,14 @@ class StructuredPDFParser:
         self.use_vlm = use_vlm
         self.vlm = None
         if self.use_vlm:
-            self.vlm = VLMStructuredExtractor(
-                vlm_provider=vlm_provider,
-                vlm_model=vlm_model,
-                api_key=vlm_api_key,
-            )
+            try:
+                self.vlm = VLMStructuredExtractor(
+                    vlm_provider=vlm_provider,
+                    vlm_model=vlm_model,
+                    api_key=vlm_api_key,
+                )
+            except Exception as e:
+                self.vlm = None
 
     def parse(self, pdf_path: str) -> None:
         """
doctra/ui/enhanced_parser_ui.py CHANGED
@@ -65,7 +65,7 @@ def run_enhanced_parse(
 
     # Validate VLM configuration if VLM is enabled
     if use_vlm:
-        vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
+        vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
         if vlm_error:
             return (vlm_error, None, [], "", None, None, "")
 
@@ -358,7 +358,7 @@ def create_enhanced_parser_tab() -> Tuple[gr.Tab, dict]:
         # VLM settings
         with gr.Row():
             use_vlm_enhanced = gr.Checkbox(label="Use VLM (optional)", value=False)
-            vlm_provider_enhanced = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
+            vlm_provider_enhanced = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter", "ollama"], value="gemini", label="VLM Provider")
             vlm_api_key_enhanced = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
 
         # Advanced settings accordion
doctra/ui/full_parse_ui.py CHANGED
@@ -60,7 +60,7 @@ def run_full_parse(
         return ("No file provided.", None, [], [], "")
 
     # Validate VLM configuration
-    vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
+    vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
     if vlm_error:
         return (vlm_error, None, [], [], "")
 
@@ -429,7 +429,7 @@ def create_full_parse_tab() -> Tuple[gr.Tab, dict]:
         with gr.Row():
             pdf = gr.File(file_types=[".pdf"], label="PDF")
             use_vlm = gr.Checkbox(label="Use VLM (optional)", value=False)
-            vlm_provider = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
+            vlm_provider = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter", "ollama"], value="gemini", label="VLM Provider")
             vlm_api_key = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
 
         # Advanced settings accordion
doctra/ui/tables_charts_ui.py CHANGED
@@ -48,7 +48,7 @@ def run_extract(
         return ("No file provided.", "", [], [], "")
 
     # Validate VLM configuration
-    vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
+    vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
     if vlm_error:
         return (vlm_error, "", [], [], "")
 
@@ -334,7 +334,7 @@ def create_tables_charts_tab() -> Tuple[gr.Tab, dict]:
             pdf_e = gr.File(file_types=[".pdf"], label="PDF")
             target = gr.Dropdown(["tables", "charts", "both"], value="both", label="Target")
             use_vlm_e = gr.Checkbox(label="Use VLM (optional)", value=False)
-            vlm_provider_e = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
+            vlm_provider_e = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter", "ollama"], value="gemini", label="VLM Provider")
             vlm_api_key_e = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
 
         # Advanced settings accordion
doctra/ui/ui_helpers.py CHANGED
@@ -261,21 +261,22 @@ def parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
     return pages
 
 
-def validate_vlm_config(use_vlm: bool, vlm_api_key: str) -> Optional[str]:
+def validate_vlm_config(use_vlm: bool, vlm_api_key: str, vlm_provider: str = "gemini") -> Optional[str]:
     """
     Validate VLM configuration parameters.
 
     Args:
         use_vlm: Whether VLM is enabled
         vlm_api_key: API key for VLM provider
+        vlm_provider: VLM provider name (default: "gemini")
 
     Returns:
         Error message if validation fails, None if valid
     """
-    if use_vlm and not vlm_api_key:
-        return "❌ Error: VLM API key is required when using VLM"
+    if use_vlm and vlm_provider != "ollama" and not vlm_api_key:
+        return "❌ Error: VLM API key is required when using VLM (except for Ollama)"
 
-    if use_vlm and vlm_api_key:
+    if use_vlm and vlm_api_key and vlm_provider != "ollama":
         # Basic API key validation
         if len(vlm_api_key.strip()) < 10:
             return "❌ Error: VLM API key appears to be too short or invalid"
doctra/version.py CHANGED
@@ -1,2 +1,2 @@
 """Version information for Doctra."""
-__version__ = '0.4.3'
+__version__ = '0.5.1'
doctra-0.4.3.dist-info/METADATA → doctra-0.5.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: doctra
-Version: 0.4.3
+Version: 0.5.1
 Summary: Parse, extract, and analyze documents with ease
 Home-page: https://github.com/AdemBoukhris457/Doctra
 Author: Adem Boukhris
@@ -267,6 +267,7 @@ Dynamic: requires-python
 [![stars](https://img.shields.io/github/stars/AdemBoukhris457/Doctra.svg)](https://github.com/AdemBoukhris457/Doctra)
 [![forks](https://img.shields.io/github/forks/AdemBoukhris457/Doctra.svg)](https://github.com/AdemBoukhris457/Doctra)
 [![PyPI version](https://img.shields.io/pypi/v/doctra)](https://pypi.org/project/doctra/)
+[![Documentation](https://img.shields.io/badge/documentation-available-success)](https://ademboukhris457.github.io/Doctra/index.html)
 </div>
 
 ## 📋 Table of Contents
@@ -361,7 +362,7 @@ parser = StructuredPDFParser()
 # Parser with VLM for structured data extraction
 parser = StructuredPDFParser(
     use_vlm=True,
-    vlm_provider="openai",  # or "gemini" or "anthropic" or "openrouter"
+    vlm_provider="openai",  # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
     vlm_api_key="your_api_key_here"
 )
 
@@ -916,7 +917,7 @@ parser.display_pages_with_boxes("document.pdf")
 
 ### 🤖 VLM Integration
 - Vision Language Model support for structured data extraction
-- Multiple provider options (OpenAI, Gemini, Anthropic, OpenRouter)
+- Multiple provider options (OpenAI, Gemini, Anthropic, OpenRouter, Qianfan, Ollama)
 - Automatic conversion of charts and tables to structured formats
 
 ### 📊 Multiple Output Formats
doctra-0.4.3.dist-info/RECORD → doctra-0.5.1.dist-info/RECORD CHANGED
@@ -1,8 +1,8 @@
 doctra/__init__.py,sha256=rNLCyODOpaPb_TTP6qmQnuWZJW9JPXrxg1IfKnvb1No,773
-doctra/version.py,sha256=UtaT-N7wXotEga348278k_4dwz6xpN5W57ulX1lo5vU,62
+doctra/version.py,sha256=b0vGLL2RHYHeqdwkHTZmk2FRb1-xEhcO1auAArG969s,62
 doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
-doctra/cli/main.py,sha256=_gvG8bm-Mn1tIEw6eJUgqz9dYEo9klXGiJDJzjqgPyo,43503
-doctra/cli/utils.py,sha256=w3Bxyzczcbl_cs1Cea8C3ehv7dkGl_wecprYZXrcGhk,11772
+doctra/cli/main.py,sha256=UhWTatY3qIeutZzVo9syLG2srbs8MZuGaLo5tk9xC_M,43108
+doctra/cli/utils.py,sha256=GKSSGi-JjNXufNekqCysSev7St1t32caYMduy0Tq96s,11971
 doctra/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/engines/image_restoration/__init__.py,sha256=vzcN6Rw7_U-5jIK2pdo2NlgqdLdXDShigrOGM7QLNEE,263
 doctra/engines/image_restoration/docres_engine.py,sha256=wbo-FWEb6_Twq5KqzjPgGQwcAuFD98uBAiQBEY8vN2A,21592
@@ -15,8 +15,8 @@ doctra/engines/ocr/path_resolver.py,sha256=2_7Nsekt3dCDU3oVsgdr62iMrlAhbGNfYwgh4
 doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMYZiOdb_6PoQw,2911
 doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/engines/vlm/outlines_types.py,sha256=fQK6ru7XiXHaa8JPpaTTBaTk_zQ93ZyhFp4SyAnUdVU,1337
-doctra/engines/vlm/provider.py,sha256=aE8Eo1U-8XqAimakNlT0-T4etIyCV8rZ3DwxdqbFeTc,3131
-doctra/engines/vlm/service.py,sha256=nygxMe7uTq6Bv70ycBPL59F2a0ESp1Hix4j833p6rUM,4343
+doctra/engines/vlm/provider.py,sha256=QMr-gcbhyXgTQOHPIjIrmsLTNfkbDR69I3uR5Z2QVU0,10521
+doctra/engines/vlm/service.py,sha256=8o3JbNEkAFLNxSyu3KW7srI25PSLY-epzNZquKTxgcU,4364
 doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/exporters/excel_writer.py,sha256=rwyqlH73P7z413BELovQY_pS6IMkkqHEho6mbPrJ2Sk,11857
 doctra/exporters/html_writer.py,sha256=zJPoMiFF9lx9fHpdqk0y8diNNeQVC68wNvUInX918fY,46017
@@ -26,7 +26,7 @@ doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN
 doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
 doctra/parsers/enhanced_pdf_parser.py,sha256=TG4uM_dK80-69y1C99HhSoVInHGwTb-sGJtmHBpZuMY,23756
 doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
-doctra/parsers/structured_pdf_parser.py,sha256=RSduGt7L5HcoB7JE7zbAjlkvEMk2XQnQhHHD8p7QjQ4,22284
+doctra/parsers/structured_pdf_parser.py,sha256=3jPulhR0agnhP1r9j48WvH53-NZVMhePAmNLzy-_fes,22391
 doctra/parsers/table_chart_extractor.py,sha256=ZD0l2V_8HBdHOAIhMIujfnd5ai3gXsSLL67VMVu3F8A,13905
 doctra/third_party/docres/inference.py,sha256=krD5EQDiqki-5uTMqqHYivhL38sfSOhYgaihI751070,13576
 doctra/third_party/docres/utils.py,sha256=N0ZVmOTB3wsinFlYu5hT84C4_MhWGdc98T8LTG-S9dA,14566
@@ -52,10 +52,10 @@ doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
 doctra/ui/app.py,sha256=I9pX-U3VASGs4kfL6Tv3nDH2tlU4kSv5WrnsNDfYTbQ,2305
 doctra/ui/docres_ui.py,sha256=QMTsNUdw2NGlHK-mYwB-j5i2QXEndYv8Zvc8213jXVA,13034
 doctra/ui/docres_wrapper.py,sha256=BjcY5Xik9UBFPzPL-ONT2GIpTeRrYUXXzuDEq1QE28Q,4498
-doctra/ui/enhanced_parser_ui.py,sha256=OVPwv9yErjg1lL-dEVH5KWrc7YqEP7QmFa80WPhaCX0,20754
-doctra/ui/full_parse_ui.py,sha256=19EsprqeegZAj24KhAWKvyR1hW8HC3nE_f4UFpY-dfQ,18597
-doctra/ui/tables_charts_ui.py,sha256=x0YmERDyfkUruAbHqQ-Kc0_cDOuqf64l_fjBvVOULOI,16534
-doctra/ui/ui_helpers.py,sha256=LthpitCrZOpjXcQvpctyNaDz3T26V06TpAy3r_ChLhY,15584
+doctra/ui/enhanced_parser_ui.py,sha256=oImlFfpjLGs3CpOIUIx_o-1fK7ddUhUCOYW4NUiuJrA,20778
+doctra/ui/full_parse_ui.py,sha256=h-bckQq9FRbVA00l4VQXnzdLgNIrIeAtVVdHkihTPjE,18621
+doctra/ui/tables_charts_ui.py,sha256=ZcRhTbi4iB0tBi3JC-Z3w6AN6dgUOWt9sV_-iJCkaFE,16558
+doctra/ui/ui_helpers.py,sha256=Wx36d5rbUdRXQg98w45DIxH0Hib0mTMEmv2cH3ejyGI,15753
 doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
 doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
@@ -66,9 +66,9 @@ doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
 doctra/utils/progress.py,sha256=BD9YZqYLZw6yohQnyUV3w9QsQuiIrXM_EqByOSSJsDU,11912
 doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
 doctra/utils/structured_utils.py,sha256=vU84dsD8wIlTyMsA9hitorGH-eroQiVuWEpBTQBUT24,1478
-doctra-0.4.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-doctra-0.4.3.dist-info/METADATA,sha256=YoaPW5G3wdM9zNCb1M_FTM5JmDnUM4MqgS-aVMOBO-M,37033
-doctra-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-doctra-0.4.3.dist-info/entry_points.txt,sha256=4G2RHamA0llCiIXaQQm8EDkVK9JNGKbI7uDnXVFgIaY,47
-doctra-0.4.3.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
-doctra-0.4.3.dist-info/RECORD,,
+doctra-0.5.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+doctra-0.5.1.dist-info/METADATA,sha256=IInFIxxklcgLQHTvStUSTkqQXwXGly0JbZOSpBQAu0A,37202
+doctra-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+doctra-0.5.1.dist-info/entry_points.txt,sha256=4G2RHamA0llCiIXaQQm8EDkVK9JNGKbI7uDnXVFgIaY,47
+doctra-0.5.1.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
+doctra-0.5.1.dist-info/RECORD,,