vlmparse 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff compares the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in their respective public registries.
vlmparse/converter.py CHANGED
@@ -1,7 +1,6 @@
 import asyncio
 import threading
 import time
-import traceback
 from pathlib import Path
 from typing import Literal
 
@@ -9,6 +8,8 @@ from loguru import logger
 from PIL import Image
 from pydantic import Field
 
+from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME
+
 from .base_model import VLMParseBaseModel
 from .build_doc import convert_specific_page_to_image, get_page_count, resize_image
 from .constants import IMAGE_EXTENSIONS, PDF_EXTENSION
@@ -19,9 +20,20 @@ PDFIUM_LOCK = threading.Lock()
 
 
 class ConverterConfig(VLMParseBaseModel):
+    model_name: str
     aliases: list[str] = Field(default_factory=list)
-    dpi: int = 175
-    max_image_size: int | None = 4000
+    dpi: int = Field(default=175, ge=30, le=600)
+    max_image_size: int | None = Field(default=4000, ge=50)
+    base_url: str | None = None
+    default_model_name: str = DEFAULT_MODEL_NAME
+    conversion_mode: Literal[
+        "ocr",
+        "ocr_layout",
+        "table",
+        "image_description",
+        "formula",
+        "chart",
+    ] = "ocr"
 
     def get_client(self, **kwargs) -> "BaseConverter":
        return BaseConverter(config=self, **kwargs)
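
The `ConverterConfig` change above replaces bare defaults with pydantic `Field` constraints, so an out-of-range `dpi` or `max_image_size` is now rejected at construction time instead of being accepted silently. A minimal, self-contained sketch of that behavior; the class below is a stand-in, not the library's model, and the `protected_namespaces` override is only needed because the field is named `model_name`:

from typing import Literal

from pydantic import BaseModel, Field, ValidationError


class ConfigSketch(BaseModel):
    # silence pydantic v2's protected-namespace warning for "model_name"
    model_config = {"protected_namespaces": ()}

    model_name: str
    dpi: int = Field(default=175, ge=30, le=600)
    max_image_size: int | None = Field(default=4000, ge=50)
    conversion_mode: Literal["ocr", "ocr_layout", "table"] = "ocr"


print(ConfigSketch(model_name="demo", dpi=300).dpi)  # 300, within bounds

try:
    ConfigSketch(model_name="demo", dpi=1200)  # above the le=600 bound
except ValidationError as exc:
    print(exc.error_count(), "validation error")  # 1 validation error
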
@@ -94,14 +106,22 @@ class BaseConverter:
             page = await self.async_call_inside_page(page)
             toc = time.perf_counter()
             page.latency = toc - tic
-            logger.debug(f"Time taken: {page.latency} seconds")
+            logger.debug(
+                "Page {page_idx} processed in {latency:.2f}s",
+                page_idx=page_idx,
+                latency=page.latency,
+            )
         except KeyboardInterrupt:
             raise
         except Exception:
             if self.debug:
                 raise
             else:
-                logger.exception(traceback.format_exc())
+                logger.opt(exception=True).error(
+                    "Error processing page {page_idx} of {file_path}",
+                    page_idx=page_idx,
+                    file_path=str(file_path),
+                )
                 page.error = ProcessingError.from_class(self)
         if not self.save_page_images:
             page.buffer_image = dict(
@@ -122,12 +142,19 @@ class BaseConverter:
             if self.debug:
                 raise
             else:
-                logger.exception(traceback.format_exc())
+                logger.opt(exception=True).error(
+                    "Error processing document {file_path}",
+                    file_path=str(file_path),
+                )
                 document.error = ProcessingError.from_class(self)
                 return document
         toc = time.perf_counter()
         document.latency = toc - tic
-        logger.debug(f"Time taken to process the document: {document.latency} seconds")
+        logger.debug(
+            "Document {file_path} processed in {latency:.2f}s",
+            file_path=str(file_path),
+            latency=document.latency,
+        )
         if self.save_folder is not None:
             self._save_document(document)
 
@@ -169,8 +196,16 @@ class BaseConverter:
         else:
             logger.warning(f"Unknown save_mode: {self.save_mode}, skipping save")
 
+    async def _async_call_with_cleanup(self, file_path: str | Path):
+        """Call async_call and ensure cleanup."""
+        try:
+            return await self.async_call(file_path)
+        finally:
+            if hasattr(self, "aclose"):
+                await self.aclose()
+
     def __call__(self, file_path: str | Path):
-        return asyncio.run(self.async_call(file_path))
+        return asyncio.run(self._async_call_with_cleanup(file_path))
 
     async def async_batch(self, file_paths: list[str | Path]) -> list[Document] | None:
         """Process multiple files concurrently with semaphore limit."""
@@ -184,9 +219,14 @@ class BaseConverter:
                 await self.async_call(file_path)
 
         tasks = [asyncio.create_task(worker(file_path)) for file_path in file_paths]
-        documents = await asyncio.gather(*tasks)
-        if self.return_documents_in_batch_mode:
-            return documents
+        try:
+            documents = await asyncio.gather(*tasks)
+            if self.return_documents_in_batch_mode:
+                return documents
+        finally:
+            # Close async resources before the event loop ends
+            if hasattr(self, "aclose"):
+                await self.aclose()
 
     def batch(self, file_paths: list[str | Path]) -> list[Document] | None:
         """Synchronous wrapper for async_batch."""
@@ -5,10 +5,61 @@ from typing import Literal
 
 from loguru import logger
 
+from vlmparse.constants import DEFAULT_SERVER_PORT
 from vlmparse.servers.utils import get_model_from_uri
 from vlmparse.utils import get_file_paths
 
 
+def start_server(
+    model: str,
+    gpus: str,
+    port: None | int = None,
+    with_vllm_server: bool = True,
+    vllm_args: list[str] = {},
+    forget_predefined_vllm_args: bool = False,
+    auto_stop: bool = False,
+):
+    from vlmparse.registries import docker_config_registry
+
+    base_url = ""
+    container = None
+    docker_config = docker_config_registry.get(model, default=with_vllm_server)
+
+    if port is None:
+        port = DEFAULT_SERVER_PORT
+
+    if docker_config is None:
+        logger.warning(
+            f"No Docker configuration found for model: {model}, using default configuration"
+        )
+        return "", container, None, docker_config
+
+    gpu_device_ids = None
+    if gpus is not None:
+        gpu_device_ids = [g.strip() for g in str(gpus).split(",")]
+
+    if docker_config is not None:
+        if port is not None:
+            docker_config.docker_port = port
+        docker_config.gpu_device_ids = gpu_device_ids
+        docker_config.update_command_args(
+            vllm_args,
+            forget_predefined_vllm_args=forget_predefined_vllm_args,
+        )
+
+    logger.info(
+        f"Deploying VLLM server for {docker_config.model_name} on port {port}..."
+    )
+    server = docker_config.get_server(auto_stop=auto_stop)
+    if server is None:
+        logger.error(f"Model server not found for model: {model}")
+        return "", container, None, docker_config
+
+    base_url, container = server.start()
+
+    return base_url, container, server, docker_config
+
+
 class ConverterWithServer:
     def __init__(
         self,
@@ -18,58 +69,60 @@ class ConverterWithServer:
         port: int | None = None,
         with_vllm_server: bool = False,
         concurrency: int = 10,
-        vllm_kwargs: dict | None = None,
-        forget_predefined_vllm_kwargs: bool = False,
+        vllm_args: dict | None = None,
+        forget_predefined_vllm_args: bool = False,
+        return_documents: bool = False,
     ):
+        if model is None and uri is None:
+            raise ValueError("Either 'model' or 'uri' must be provided")
+
+        if concurrency < 1:
+            raise ValueError("concurrency must be at least 1")
+
         self.model = model
         self.uri = uri
         self.port = port
         self.gpus = gpus
         self.with_vllm_server = with_vllm_server
         self.concurrency = concurrency
-        self.vllm_kwargs = vllm_kwargs
-        self.forget_predefined_vllm_kwargs = forget_predefined_vllm_kwargs
+        self.vllm_args = vllm_args
+        self.forget_predefined_vllm_args = forget_predefined_vllm_args
+        self.return_documents = return_documents
         self.server = None
         self.client = None
 
-        if self.uri is not None and self.model is None:
+        if self.uri is not None:
             self.model = get_model_from_uri(self.uri)
 
     def start_server_and_client(self):
-        from vlmparse.registries import (
-            converter_config_registry,
-            docker_config_registry,
-        )
-
-        gpu_device_ids = None
-        if self.gpus is not None:
-            gpu_device_ids = [g.strip() for g in self.gpus.split(",")]
+        from vlmparse.registries import converter_config_registry
 
         if self.uri is None:
-            docker_config = docker_config_registry.get(
-                self.model, default=self.with_vllm_server
+            _, _, self.server, docker_config = start_server(
+                model=self.model,
+                gpus=self.gpus,
+                port=self.port,
+                with_vllm_server=self.with_vllm_server,
+                vllm_args=self.vllm_args,
+                forget_predefined_vllm_args=self.forget_predefined_vllm_args,
+                auto_stop=True,
             )
 
             if docker_config is not None:
-                if self.port is not None:
-                    docker_config.docker_port = self.port
-                docker_config.gpu_device_ids = gpu_device_ids
-                docker_config.update_command_args(
-                    self.vllm_kwargs,
-                    forget_predefined_vllm_kwargs=self.forget_predefined_vllm_kwargs,
+                self.client = docker_config.get_client(
+                    return_documents_in_batch_mode=self.return_documents
                 )
-                self.server = docker_config.get_server(auto_stop=True)
-
-                self.server.start()
-
-                self.client = docker_config.get_client()
             else:
-                self.client = converter_config_registry.get(self.model).get_client()
+                self.client = converter_config_registry.get(self.model).get_client(
+                    return_documents_in_batch_mode=self.return_documents
+                )
 
         else:
             client_config = converter_config_registry.get(self.model, uri=self.uri)
 
-            self.client = client_config.get_client()
+            self.client = client_config.get_client(
+                return_documents_in_batch_mode=self.return_documents
+            )
 
     def stop_server(self):
         if self.server is not None and self.server.auto_stop:
@@ -80,16 +133,30 @@ class ConverterWithServer:
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        self.stop_server()
+        try:
+            self.stop_server()
+        except Exception as e:
+            logger.warning(f"Error stopping server during cleanup: {e}")
+        return False  # Don't suppress exceptions
 
     def parse(
         self,
         inputs: str | list[str],
         out_folder: str = ".",
         mode: Literal["document", "md", "md_page"] = "document",
+        conversion_mode: Literal[
+            "ocr",
+            "ocr_layout",
+            "table",
+            "image_description",
+            "formula",
+            "chart",
+        ]
+        | None = None,
         dpi: int | None = None,
         debug: bool = False,
         retrylast: bool = False,
+        completion_kwargs: dict | None = None,
    ):
         assert (
             self.client is not None
@@ -126,6 +193,14 @@ class ConverterWithServer:
         if dpi is not None:
             self.client.config.dpi = int(dpi)
 
+        if conversion_mode is not None:
+            self.client.config.conversion_mode = conversion_mode
+
+        if completion_kwargs is not None and hasattr(
+            self.client.config, "completion_kwargs"
+        ):
+            self.client.config.completion_kwargs |= completion_kwargs
+
         if debug:
             self.client.debug = debug
vlmparse/registries.py CHANGED
@@ -1,37 +1,24 @@
 import os
 from collections.abc import Callable
 
-from vlmparse.clients.chandra import ChandraConverterConfig, ChandraDockerServerConfig
-from vlmparse.clients.deepseekocr import (
-    DeepSeekOCRConverterConfig,
-    DeepSeekOCRDockerServerConfig,
-)
-from vlmparse.clients.docling import DoclingConverterConfig, DoclingDockerServerConfig
-from vlmparse.clients.dotsocr import DotsOCRConverterConfig, DotsOCRDockerServerConfig
-from vlmparse.clients.granite_docling import (
-    GraniteDoclingConverterConfig,
-    GraniteDoclingDockerServerConfig,
-)
-from vlmparse.clients.hunyuanocr import (
-    HunyuanOCRConverterConfig,
-    HunyuanOCRDockerServerConfig,
-)
+from vlmparse.clients.chandra import ChandraDockerServerConfig
+from vlmparse.clients.deepseekocr import DeepSeekOCRDockerServerConfig
+from vlmparse.clients.docling import DoclingDockerServerConfig
+from vlmparse.clients.dotsocr import DotsOCRDockerServerConfig
+from vlmparse.clients.granite_docling import GraniteDoclingDockerServerConfig
+from vlmparse.clients.hunyuanocr import HunyuanOCRDockerServerConfig
 from vlmparse.clients.lightonocr import (
-    LightOnOCRConverterConfig,
+    LightonOCR21BServerConfig,
     LightOnOCRDockerServerConfig,
 )
-from vlmparse.clients.mineru import MinerUConverterConfig, MinerUDockerServerConfig
-from vlmparse.clients.nanonetocr import (
-    NanonetOCR2ConverterConfig,
-    NanonetOCR2DockerServerConfig,
-)
-from vlmparse.clients.olmocr import OlmOCRConverterConfig, OlmOCRDockerServerConfig
-from vlmparse.clients.openai_converter import LLMParams, OpenAIConverterConfig
-from vlmparse.clients.paddleocrvl import (
-    PaddleOCRVLConverterConfig,
-    PaddleOCRVLDockerServerConfig,
-)
-from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME, docker_config_registry
+from vlmparse.clients.mineru import MinerUDockerServerConfig
+from vlmparse.clients.mistral_converter import MistralOCRConverterConfig
+from vlmparse.clients.nanonetocr import NanonetOCR2DockerServerConfig
+from vlmparse.clients.olmocr import OlmOCRDockerServerConfig
+from vlmparse.clients.openai_converter import OpenAIConverterConfig
+from vlmparse.clients.paddleocrvl import PaddleOCRVLDockerServerConfig
+from vlmparse.converter import ConverterConfig
+from vlmparse.servers.docker_server import DockerServerConfig, docker_config_registry
 
 
 def get_default(cls, field_name):
@@ -43,7 +30,8 @@ def get_default(cls, field_name):
     return field_info.default
 
 
-for server_config_cls in [
+# All server configs - single source of truth
+SERVER_CONFIGS: list[type[DockerServerConfig]] = [
     ChandraDockerServerConfig,
     LightOnOCRDockerServerConfig,
     DotsOCRDockerServerConfig,
@@ -55,7 +43,11 @@ for server_config_cls in [
     MinerUDockerServerConfig,
     DeepSeekOCRDockerServerConfig,
     GraniteDoclingDockerServerConfig,
-]:
+    LightonOCR21BServerConfig,
+]
+
+# Register docker server configs
+for server_config_cls in SERVER_CONFIGS:
     aliases = get_default(server_config_cls, "aliases") or []
     model_name = get_default(server_config_cls, "model_name")
     names = [n for n in aliases + [model_name] if isinstance(n, str)]
@@ -64,37 +56,81 @@
 
 
 class ConverterConfigRegistry:
-    """Registry for mapping model names to their Docker configurations."""
+    """Registry for mapping model names to their converter configurations.
+
+    Thread-safe registry that maps model names to their converter configuration factories.
+    """
 
     def __init__(self):
-        self._registry = dict()
+        import threading
+
+        self._registry: dict[str, Callable[[str | None], ConverterConfig]] = {}
+        self._lock = threading.RLock()
 
     def register(
         self,
         model_name: str,
-        config_factory: Callable[[str], OpenAIConverterConfig | None],
+        config_factory: Callable[[str | None], ConverterConfig],
     ):
-        """Register a config factory for a model name."""
-        self._registry[model_name] = config_factory
+        """Register a config factory for a model name (thread-safe)."""
+        with self._lock:
+            self._registry[model_name] = config_factory
 
-    def get(self, model_name: str, uri: str | None = None) -> OpenAIConverterConfig:
-        """Get config for a model name. Returns default if not registered."""
-        if model_name in self._registry:
-            return self._registry[model_name](uri=uri)
+    def register_from_server(
+        self,
+        server_config_cls: type[DockerServerConfig],
+    ):
+        """Register converter config derived from a server config class.
+
+        This ensures model_name and default_model_name are consistently
+        passed from server to client config via _create_client_kwargs.
+        """
+        aliases = get_default(server_config_cls, "aliases") or []
+        model_name = get_default(server_config_cls, "model_name")
+        names = [n for n in aliases + [model_name] if isinstance(n, str)]
+        # Also register short name (after last /)
+        if model_name and "/" in model_name:
+            names.append(model_name.split("/")[-1])
+
+        def factory(uri: str | None, cls=server_config_cls) -> ConverterConfig:
+            server = cls()
+            client_config = server.client_config
+            # Override base_url if provided
+            if uri is not None:
+                client_config = client_config.model_copy(update={"base_url": uri})
+            return client_config
+
+        with self._lock:
+            for name in names:
+                self._registry[name] = factory
+
+    def get(self, model_name: str, uri: str | None = None) -> ConverterConfig:
+        """Get config for a model name (thread-safe). Returns default if not registered."""
+        with self._lock:
+            factory = self._registry.get(model_name)
+
+        if factory is not None:
+            return factory(uri)
         # Fallback to OpenAIConverterConfig for unregistered models
         if uri is not None:
-            return OpenAIConverterConfig(
-                llm_params=LLMParams(base_url=uri, model_name=model_name)
-            )
-        return OpenAIConverterConfig(llm_params=LLMParams(model_name=model_name))
+            return OpenAIConverterConfig(base_url=uri)
+        return OpenAIConverterConfig(model_name=model_name)
 
     def list_models(self) -> list[str]:
-        """List all registered model names."""
-        return list(self._registry.keys())
+        """List all registered model names (thread-safe)."""
+        with self._lock:
+            return list(self._registry.keys())
 
 
 # Global registry instance
 converter_config_registry = ConverterConfigRegistry()
+
+# Register all server-backed converters through the server config
+# This ensures model_name and default_model_name are consistently passed
+for server_config_cls in SERVER_CONFIGS:
+    converter_config_registry.register_from_server(server_config_cls)
+
+# External API configs (no server config - these are cloud APIs)
 GOOGLE_API_BASE_URL = (
     os.getenv("GOOGLE_API_BASE_URL")
     or "https://generativelanguage.googleapis.com/v1beta/openai/"
@@ -111,11 +147,10 @@ for gemini_model in [
     converter_config_registry.register(
         gemini_model,
         lambda uri=None, model=gemini_model: OpenAIConverterConfig(
-            llm_params=LLMParams(
-                model_name=model,
-                base_url=GOOGLE_API_BASE_URL if uri is None else uri,
-                api_key=os.getenv("GOOGLE_API_KEY"),
-            )
+            model_name=model,
+            base_url=GOOGLE_API_BASE_URL if uri is None else uri,
+            api_key=os.getenv("GOOGLE_API_KEY"),
+            default_model_name=model,
         ),
     )
 for openai_model in [
@@ -126,45 +161,18 @@ for openai_model in [
     converter_config_registry.register(
         openai_model,
         lambda uri=None, model=openai_model: OpenAIConverterConfig(
-            llm_params=LLMParams(
-                model_name=model,
-                base_url=None,
-                api_key=os.getenv("OPENAI_API_KEY"),
-            )
+            model_name=model,
+            base_url=None,
+            api_key=os.getenv("OPENAI_API_KEY"),
+            default_model_name=model,
         ),
     )
 
-for converter_config_cls in [
-    ChandraConverterConfig,
-    LightOnOCRConverterConfig,
-    DotsOCRConverterConfig,
-    PaddleOCRVLConverterConfig,
-    NanonetOCR2ConverterConfig,
-    HunyuanOCRConverterConfig,
-    DeepSeekOCRConverterConfig,
-    GraniteDoclingConverterConfig,
-    OlmOCRConverterConfig,
-]:
-    aliases = get_default(converter_config_cls, "aliases") or []
-    model_name = get_default(converter_config_cls, "model_name")
-    names = [n for n in aliases + [model_name] if isinstance(n, str)]
-    for name in names:
-        converter_config_registry.register(
-            name,
-            lambda uri, cls=converter_config_cls: cls(
-                llm_params=LLMParams(
-                    base_url=uri,
-                    model_name=DEFAULT_MODEL_NAME,
-                    api_key="",
-                )
-            ),
-        )
-for converter_config_cls in [MinerUConverterConfig, DoclingConverterConfig]:
-    aliases = get_default(converter_config_cls, "aliases") or []
-    model_name = get_default(converter_config_cls, "model_name")
-    names = [n for n in aliases + [model_name] if isinstance(n, str)]
-    for name in names:
-        converter_config_registry.register(
-            name,
-            lambda uri, cls=converter_config_cls: cls(base_url=uri),
-        )
+for mistral_model in ["mistral-ocr-latest", "mistral-ocr"]:
+    converter_config_registry.register(
+        mistral_model,
+        lambda uri=None, model=mistral_model: MistralOCRConverterConfig(
+            base_url="https://api.mistral.ai/v1" if uri is None else uri,
+            api_key=os.getenv("MISTRAL_API_KEY"),
+        ),
+    )
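
The reworked `ConverterConfigRegistry.get` holds its lock only for the dictionary lookup and invokes the factory outside it, so a slow factory (for instance one that instantiates a server config) cannot block other threads, and unknown models fall back to `OpenAIConverterConfig` instead of raising. A generic, self-contained sketch of that locking pattern, with illustrative names rather than vlmparse's classes:

import threading
from collections.abc import Callable


class Registry:
    def __init__(self) -> None:
        self._factories: dict[str, Callable[[str | None], str]] = {}
        self._lock = threading.RLock()

    def register(self, name: str, factory: Callable[[str | None], str]) -> None:
        with self._lock:
            self._factories[name] = factory

    def get(self, name: str, uri: str | None = None) -> str:
        # hold the lock only for the lookup; call the factory outside it
        with self._lock:
            factory = self._factories.get(name)
        if factory is not None:
            return factory(uri)
        # mirrors the OpenAIConverterConfig fallback for unregistered models
        return f"fallback-config({name}, {uri})"


reg = Registry()
reg.register("demo", lambda uri: f"demo-config(uri={uri})")
print(reg.get("demo", "http://localhost:8000"))
print(reg.get("unknown-model"))  # falls back rather than raising
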