docvision 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docvision/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .__version__ import __version__
2
+ from .core import DocumentParser, ParsingMode
3
+
4
+ __all__ = ["__version__", "ParsingMode", "DocumentParser"]
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"
@@ -0,0 +1,27 @@
1
+ from .client import VLMClient
2
+ from .constants import (
3
+ CONTINUE_PROMPT,
4
+ DEFAULT_SYSTEM_PROMPT,
5
+ DEFAULT_USER_PROMPT,
6
+ FIX_PROMPT,
7
+ TRANSCRIPTION,
8
+ )
9
+ from .parser import DocumentParser
10
+ from .types import (
11
+ AgenticParseState,
12
+ ParseResult,
13
+ ParsingMode,
14
+ )
15
+
16
+ __all__ = [
17
+ "VLMClient",
18
+ "DocumentParser",
19
+ "ParsingMode",
20
+ "ParseResult",
21
+ "AgenticParseState",
22
+ "DEFAULT_SYSTEM_PROMPT",
23
+ "DEFAULT_USER_PROMPT",
24
+ "TRANSCRIPTION",
25
+ "CONTINUE_PROMPT",
26
+ "FIX_PROMPT",
27
+ ]
@@ -0,0 +1,163 @@
1
+ import asyncio
2
+ import os
3
+ from typing import Any, Dict, List, Optional, Type
4
+
5
+ from openai import AsyncOpenAI
6
+ from pydantic import BaseModel
7
+
8
+
9
class VLMClient:
    """
    A client for interacting with Vision Language Models (VLMs) via
    OpenAI-compatible APIs.

    Attributes:
        model_name: Name of the model to use.
        max_tokens: Maximum number of tokens for the completion.
        temperature: Sampling temperature.
        timeout: Request timeout in seconds.
        max_retries: Number of retry attempts (always at least 1).
        retry_delay: Base delay between retries in seconds (grows linearly).
    """

    def __init__(
        self,
        base_url: str = "https://api.openai.com/v1",
        api_key: Optional[str] = None,
        model_name: str = "gpt-4o-mini",
        max_tokens: int = 4096,
        temperature: float = 0.1,
        timeout: int = 300,
        max_retries: int = 3,
        retry_delay: float = 2.0,
    ):
        """
        Initialize the VLMClient.

        Args:
            base_url: The base URL for the API.
            api_key: The API key. If not provided, it will look for the
                OPENAI_API_KEY environment variable.
            model_name: The name of the model to use.
            max_tokens: Maximum number of tokens to generate.
            temperature: Sampling temperature.
            timeout: Request timeout in seconds.
            max_retries: Maximum number of retry attempts for failed requests.
                Values < 1 are clamped to 1 so invoke() always makes at least
                one attempt.
            retry_delay: Delay between retries in seconds.
        """
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.timeout = timeout
        # Clamp to >= 1: with max_retries <= 0 the attempt loop in invoke()
        # would never run and the call would silently return None.
        self.max_retries = max(1, max_retries)
        self.retry_delay = retry_delay

        # Ensure we have an API key or a placeholder for local servers
        # (many OpenAI-compatible local endpoints accept any non-empty key).
        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "EMPTY")
        self.async_client = AsyncOpenAI(base_url=base_url, api_key=self.api_key, timeout=timeout)

    async def invoke(
        self,
        image_b64: str,
        mime_type: str,
        system_prompt: Optional[str] = None,
        user_prompt: Optional[str] = None,
        output_schema: Optional[Type[BaseModel]] = None,
    ) -> Any:
        """
        Make an asynchronous call to the VLM, retrying on failure.

        Args:
            image_b64: Base64 encoded image string.
            mime_type: The MIME type of the image (e.g. "image/png").
            system_prompt: Optional system prompt override.
            user_prompt: Optional user prompt override.
            output_schema: Optional Pydantic model for structured output parsing.

        Returns:
            The API response object.

        Raises:
            RuntimeError: If the request fails after all retry attempts; the
                original exception is attached as the cause.
        """
        # Build the payload once; it is identical across retry attempts.
        messages = self._build_message(
            image_b64, mime_type, system_prompt, user_prompt, output_schema
        )

        last_error: Optional[Exception] = None
        for attempt in range(self.max_retries):
            try:
                if output_schema:
                    # Structured output path: the SDK parses/validates the
                    # response against the supplied Pydantic schema.
                    return await self.async_client.chat.completions.parse(
                        model=self.model_name,
                        messages=messages,
                        max_tokens=self.max_tokens,
                        temperature=self.temperature,
                        response_format=output_schema,
                    )
                return await self.async_client.chat.completions.create(
                    model=self.model_name,
                    messages=messages,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                )
            except Exception as e:
                last_error = e
                if attempt < self.max_retries - 1:
                    # Linear backoff: delay, 2*delay, 3*delay, ...
                    await asyncio.sleep(self.retry_delay * (attempt + 1))

        # Loop exhausted without a successful return: surface the last error.
        raise RuntimeError(
            f"Asynchronous VLM call failed after {self.max_retries} attempts: {str(last_error)}"
        ) from last_error

    def _build_message(
        self,
        image_b64: str,
        mime_type: str,
        system_prompt: Optional[str] = None,
        user_prompt: Optional[str] = None,
        output_schema: Optional[Type[BaseModel]] = None,
    ) -> List[Dict[str, Any]]:
        """
        Construct the message payload for the API call.

        Args:
            image_b64: Base64 encoded image string.
            mime_type: The MIME type of the image.
            system_prompt: Optional system prompt override.
            user_prompt: Optional user prompt override.
            output_schema: Optional output schema; when set, an explicit
                system_prompt is mandatory.

        Returns:
            A list of message dictionaries (one system message, one user
            message carrying the text prompt and the inlined image).

        Raises:
            ValueError: If output_schema is given without a system_prompt.
        """
        messages: List[Dict[str, Any]] = []

        if output_schema is not None:
            # The default XML-tag prompt conflicts with structured output,
            # so the caller must supply its own system prompt here.
            if not system_prompt:
                raise ValueError(
                    "When using response_format, you MUST provide a system_prompt explicitly "
                    "(default XML prompt is disabled because it conflicts with structured output)."
                )
            final_system_prompt = system_prompt
        else:
            # Deferred import — presumably avoids an import cycle with the
            # package's __init__; NOTE(review): confirm before hoisting.
            from .constants import DEFAULT_SYSTEM_PROMPT, TRANSCRIPTION

            if system_prompt:
                # Custom prompt still needs the transcription-tag instruction
                # appended so downstream tag extraction keeps working.
                final_system_prompt = f"{system_prompt}\n\n{TRANSCRIPTION}"
            else:
                final_system_prompt = DEFAULT_SYSTEM_PROMPT

        messages.append({"role": "system", "content": final_system_prompt})

        from .constants import DEFAULT_USER_PROMPT

        # Image is passed inline as a base64 data URL.
        user_content = [
            {"type": "text", "text": user_prompt or DEFAULT_USER_PROMPT},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{image_b64}"},
            },
        ]

        messages.append({"role": "user", "content": user_content})
        return messages
@@ -0,0 +1,69 @@
1
# Prompt constants for the VLM document-transcription client.
# NOTE: keep the string content stable — downstream code relies on the
# <transcription></transcription> wrapper and on the {context} /
# {restart_from} placeholders (presumably filled via str.format by the
# caller; verify against the parser module).

# System prompt used when no structured-output schema is supplied; defines
# the full transcription ruleset (accuracy, layout, tables, headings, media
# tags, and content to ignore).
DEFAULT_SYSTEM_PROMPT = """\
You are a document transcription engine. Transcribe only visible text into Markdown.

ACCURACY
- Copy all numbers, dates, codes, and names exactly as written. Never round or reformat.
- If text is unreadable, write [UNCLEAR]. Never guess or infer.

COMPLETENESS
- Transcribe ALL text content without exception. Never skip, summarize, or paraphrase.
- Transcribe in reading order: top to bottom.

LAYOUT
- Single column: transcribe top to bottom.
- Two columns: transcribe full LEFT column first (top to bottom), then full RIGHT column.
- Three or more columns: transcribe left to right, one full column at a time.
- If columns share a spanning header, write the header first before columns begin.

TABLES
- Detect ALL tables, even if borders are faint, dashed, or missing.
- Format every table with pipe syntax:
| Header 1 | Header 2 | Header 3 |
|----------|----------|----------|
| value | value | value |
- Keep column count consistent across all rows.
- Pad missing cells with empty pipes: | |
- For merged/spanned cells, repeat the value in each affected cell.
- Never collapse a table into paragraph text.

HEADINGS
- Use # for the document title only (maximum one per page).
- Use ## for main section headers.
- Use ### for sub-sections.
- Never use #### or deeper.
- Never add headings to regular body text.

IMAGES & CHARTS
- Charts or graphs: <chart>One sentence description of what the chart shows</chart>
- Photos or illustrations: <image_desc>One sentence description</image_desc>
- Logos: <logo>Company or brand name</logo>
- Keep all descriptions to one sentence. Do not elaborate.

IGNORE
- Standalone page numbers (e.g., "1", "- 2 -", "Page 3 of 10").
- Running headers or footers that repeat identically across pages.
- Watermarks and diagonal background text.
"""

# Default user-turn text sent alongside the image when the caller supplies
# no user_prompt override.
DEFAULT_USER_PROMPT = "Transcribe this document into Markdown. Follow all rules. Output only the <transcription> block."

# Continuation prompt for resuming a truncated generation; {context} carries
# the tail of the partial transcript so the model knows where it stopped.
CONTINUE_PROMPT = """\
Continue the transcription from exactly where you stopped. Do not repeat what is already written.

Last content written:
...{context}

Continue from this point. Close with </transcription> when the full page is done.\
"""

# Recovery prompt used after a repetition loop is detected; {restart_from}
# marks the last valid content to resume from.
FIX_PROMPT = """\
You got stuck in a repetition loop. Resume transcription from before the loop started.

Last valid content:
...{restart_from}

Continue from this exact point. Do not repeat anything already written.
Close with </transcription> when the full page is done.\
"""

# Suffix appended to caller-supplied system prompts so output stays wrapped
# in the transcription tags the downstream extractor expects.
TRANSCRIPTION = "IMPORTANT: Wrap ONLY the content in <transcription></transcription> tags."