tinydoc 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinydoc/__init__.py +4 -0
- tinydoc/extractor.py +291 -0
- tinydoc/models.py +23 -0
- tinydoc-0.1.0.dist-info/METADATA +37 -0
- tinydoc-0.1.0.dist-info/RECORD +7 -0
- tinydoc-0.1.0.dist-info/WHEEL +5 -0
- tinydoc-0.1.0.dist-info/top_level.txt +1 -0
tinydoc/__init__.py
ADDED
tinydoc/extractor.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import re
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Union, Dict, Any, Optional, List
|
|
7
|
+
|
|
8
|
+
from PIL import Image
|
|
9
|
+
import torch
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
# Try to import from tinydoc_vlm package
|
|
13
|
+
try:
|
|
14
|
+
from tinydoc_vlm import (
|
|
15
|
+
TinyDocVLMForConditionalGeneration,
|
|
16
|
+
TinyDocVLMProcessor,
|
|
17
|
+
TinyDocVLMConfig
|
|
18
|
+
)
|
|
19
|
+
except ImportError:
|
|
20
|
+
# If running from inside the repository and not installed as package
|
|
21
|
+
import sys
|
|
22
|
+
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|
23
|
+
from tinydoc_vlm import (
|
|
24
|
+
TinyDocVLMForConditionalGeneration,
|
|
25
|
+
TinyDocVLMProcessor,
|
|
26
|
+
TinyDocVLMConfig
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
from .models import ExtractionResult, QAResult, TableResult
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
class TinyDocExtractor:
    """
    Python SDK for TinyDoc-VLM document extraction and VQA.

    Provides simple, one-liner APIs for document understanding tasks.
    Supports PyTorch and ONNX Runtime backends.

    Public entry points are :meth:`extract`, :meth:`ask` and
    :meth:`extract_table`; each loads one page/image, runs a single prompt
    through the model and wraps the decoded text in a result model.
    """

    def __init__(
        self,
        model_path_or_id: str = "eulogik/TinyDoc-VLM-256M",
        device: Optional[str] = None,
        use_onnx: bool = False,
        onnx_model_path: Optional[str] = None
    ):
        """
        Load the processor and the selected inference backend.

        Args:
            model_path_or_id: Identifier or directory handed to the
                ``from_pretrained`` loaders of the processor and model.
            device: Torch device string. When omitted, ``"cuda"`` is used if
                ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.
            use_onnx: Select the ONNX Runtime backend instead of PyTorch.
            onnx_model_path: Path to the ONNX graph. Passing it implies
                ``use_onnx``; when omitted the path defaults to
                ``<model_path_or_id>/model.onnx``.

        Raises:
            ImportError: If the ONNX backend is requested but ``onnxruntime``
                is not installed.
        """
        self.use_onnx = use_onnx or (onnx_model_path is not None)
        self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))

        # Define backend attributes up front so both backends expose the same
        # attribute set regardless of which branch runs below.
        self.model = None
        self.session = None
        self.onnx_path = None

        # Load processor
        logger.info("Loading processor from %s...", model_path_or_id)
        try:
            self.processor = TinyDocVLMProcessor.from_pretrained(model_path_or_id)
        except Exception as exc:
            # Deliberate best-effort: fall back to a default processor so the
            # SDK stays usable for development without published assets.
            logger.warning(
                "Could not load processor from %s (%s). Initialising processor with default configuration.",
                model_path_or_id,
                exc,
            )
            self.processor = TinyDocVLMProcessor()

        if self.use_onnx:
            try:
                import onnxruntime as ort
            except ImportError as exc:
                raise ImportError(
                    "onnxruntime is required for the ONNX backend "
                    "(`pip install tinydoc[onnx]`)."
                ) from exc

            self.onnx_path = onnx_model_path or str(Path(model_path_or_id) / "model.onnx")
            logger.info("Loading ONNX model from %s...", self.onnx_path)
            self.session = ort.InferenceSession(self.onnx_path)
            return

        logger.info("Loading PyTorch model from %s...", model_path_or_id)
        try:
            self.model = TinyDocVLMForConditionalGeneration.from_pretrained(model_path_or_id)
            self.model.to(self.device)
            self.model.eval()
        except Exception as exc:
            # Deliberate best-effort: a randomly initialised model keeps the
            # pipeline runnable, but its outputs are meaningless.
            logger.warning(
                "Could not load PyTorch weights from %s (%s). "
                "Creating a randomly initialized model for development/testing.",
                model_path_or_id,
                exc,
            )
            config = TinyDocVLMConfig()
            self.model = TinyDocVLMForConditionalGeneration(config)
            self.model.decoder.resize_token_embeddings(len(self.processor.tokenizer))
            self.model.to(self.device)
            self.model.eval()

    def _load_image(self, image_or_pdf: Union[str, Path, Image.Image], page: int = 1) -> Image.Image:
        """
        Load the input into an RGB PIL image.

        Args:
            image_or_pdf: An in-memory PIL image, or a path to an image or
                PDF file (detected by a ``.pdf`` suffix).
            page: Page number forwarded to ``pdf2image`` as both the first
                and last page; ignored for non-PDF inputs.

        Returns:
            The image converted to mode ``"RGB"``.

        Raises:
            FileNotFoundError: If a path is given and it does not exist.
            ImportError: If a PDF is given but ``pdf2image`` is not installed.
            ValueError: If ``pdf2image`` returns no image for ``page``.
        """
        if isinstance(image_or_pdf, Image.Image):
            return image_or_pdf.convert("RGB")

        path = Path(image_or_pdf)
        if not path.exists():
            raise FileNotFoundError(f"Input file not found: {path}")

        if path.suffix.lower() == ".pdf":
            # Keep the try body to the import alone so only a missing
            # dependency is translated into the install hint.
            try:
                from pdf2image import convert_from_path
            except ImportError as exc:
                raise ImportError(
                    "Please install pdf2image (`pip install pdf2image`) and ensure poppler is installed to read PDFs."
                ) from exc

            images = convert_from_path(path, first_page=page, last_page=page)
            if not images:
                raise ValueError(f"Could not extract page {page} from PDF: {path}")
            return images[0].convert("RGB")

        # convert() returns a new image, so the file handle can be released
        # as soon as the conversion is done.
        with Image.open(path) as opened:
            return opened.convert("RGB")

    def _generate(
        self,
        image: Image.Image,
        prompt: str,
        max_new_tokens: int = 512,
        temperature: float = 0.2
    ) -> tuple[str, float, int]:
        """
        Run autoregressive text generation using PyTorch or ONNX.

        Args:
            image: Image passed to the processor together with the prompt.
            prompt: Prompt text passed to the processor.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature for the PyTorch backend; values
                ``<= 0`` disable sampling. The ONNX loop is always greedy and
                does not read this value.

        Returns:
            ``(text, latency_ms, generated_tokens)`` where ``text`` is the
            stripped decoded continuation.
        """
        # perf_counter is monotonic, so the latency cannot be skewed by
        # wall-clock adjustments.
        started = time.perf_counter()

        # Preprocess text and image
        inputs = self.processor(text=prompt, images=image, padding=True)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        pixel_values = inputs.get("pixel_values")

        tokenizer = self.processor.tokenizer
        prompt_len = input_ids.shape[1]

        if self.use_onnx:
            # Greedy generation loop using ONNX. The whole sequence is re-fed
            # on every step.
            eos_token_id = tokenizer.eos_token_id
            # The image features do not change between steps: convert once.
            pixel_array = pixel_values.numpy() if pixel_values is not None else None

            generated_tokens = 0
            for _ in range(max_new_tokens):
                feed = {
                    "input_ids": input_ids.numpy(),
                    "attention_mask": attention_mask.numpy(),
                }
                if pixel_array is not None:
                    feed["pixel_values"] = pixel_array

                # First output is indexed as (batch, position, vocab) below.
                logits = self.session.run(None, feed)[0]

                # Next token prediction (greedy)
                next_token = int(np.argmax(logits[0, -1, :]))
                generated_tokens += 1

                if next_token == eos_token_id:
                    break

                # new_tensor/new_ones keep the dtype of the prompt tensors so
                # the arrays fed to the session never change type mid-loop.
                input_ids = torch.cat(
                    [input_ids, input_ids.new_tensor([[next_token]])], dim=-1
                )
                attention_mask = torch.cat(
                    [attention_mask, attention_mask.new_ones((1, 1))], dim=-1
                )

            continuation = input_ids[0, prompt_len:]
        else:
            # PyTorch generation
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            if pixel_values is not None:
                pixel_values = pixel_values.to(self.device)

            sampling = temperature > 0.0
            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=input_ids,
                    pixel_values=pixel_values,
                    attention_mask=attention_mask,
                    max_new_tokens=max_new_tokens,
                    do_sample=sampling,
                    temperature=temperature if sampling else None,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                )

            # The slice below treats everything after the prompt length as
            # the generated continuation.
            generated_tokens = int(outputs.shape[1] - prompt_len)
            continuation = outputs[0, prompt_len:]

        output_text = tokenizer.decode(continuation, skip_special_tokens=True)
        latency_ms = (time.perf_counter() - started) * 1000
        return output_text.strip(), latency_ms, generated_tokens

    @staticmethod
    def _parse_json_fields(output_text: str) -> Dict[str, Any]:
        """
        Best-effort extraction of a JSON object from model output.

        Markdown code fences are removed first. The cleaned text is accepted
        when it parses to a JSON object; otherwise the first parseable
        ``{...}`` object embedded in the text is used. Non-object JSON
        (arrays, numbers, strings) is never returned, because the result
        model requires a mapping.

        Returns:
            The parsed object, or an empty dict when none could be recovered.
        """
        # Clean any markdown block formatting (e.g. ```json ... ```)
        cleaned = re.sub(
            r"```(?:json)?\s*(.*?)\s*```", r"\1", output_text, flags=re.DOTALL
        ).strip()

        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

        # Fallback: decode from each "{" in turn. raw_decode stops at the end
        # of the first complete value, so trailing text cannot break parsing.
        start = cleaned.find("{")
        if start == -1:
            logger.warning("No JSON structure found in model output.")
            return {}

        decoder = json.JSONDecoder()
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(cleaned, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = cleaned.find("{", start + 1)

        logger.warning("Failed to parse JSON structure from model output.")
        return {}

    def extract(
        self,
        image_or_pdf: Union[str, Path, Image.Image],
        output_format: str = "json",
        page: int = 1
    ) -> ExtractionResult:
        """
        Extracts document information as structured JSON fields.

        Args:
            image_or_pdf: PIL image, or path to an image or PDF file.
            output_format: Not read by this implementation; the model is
                always prompted for JSON.
            page: PDF page to process.

        Returns:
            An ``ExtractionResult`` with the raw model text, the parsed
            fields (empty when no JSON object could be recovered), the
            latency and the generated token count.
        """
        img = self._load_image(image_or_pdf, page=page)

        prompt = "<image>\nExtract all fields as JSON."
        output_text, latency_ms, gen_tokens = self._generate(img, prompt)

        return ExtractionResult(
            raw_text=output_text,
            fields=self._parse_json_fields(output_text),
            latency_ms=latency_ms,
            num_tokens_generated=gen_tokens
        )

    def ask(
        self,
        image_or_pdf: Union[str, Path, Image.Image],
        question: str,
        page: int = 1
    ) -> QAResult:
        """
        Asks a question about the document image or PDF.

        Args:
            image_or_pdf: PIL image, or path to an image or PDF file.
            question: Question text, embedded verbatim in the prompt.
            page: PDF page to process.

        Returns:
            A ``QAResult`` holding the question, the model's answer, the
            latency and the generated token count.
        """
        img = self._load_image(image_or_pdf, page=page)
        prompt = f"<image>\nAnswer the following question about this document: {question}"

        output_text, latency_ms, gen_tokens = self._generate(img, prompt)

        return QAResult(
            question=question,
            answer=output_text,
            latency_ms=latency_ms,
            num_tokens_generated=gen_tokens
        )

    def extract_table(
        self,
        image_or_pdf: Union[str, Path, Image.Image],
        page: int = 1,
        format: str = "markdown"
    ) -> TableResult:
        """
        Extracts a table from a document page as HTML plus a Markdown conversion.

        Args:
            image_or_pdf: PIL image, or path to an image or PDF file.
            page: PDF page to process.
            format: Not read by this implementation; the model is always
                prompted for HTML and the result always carries both the raw
                output and its Markdown conversion.

        Returns:
            A ``TableResult`` with the raw model output, the Markdown table,
            the latency and the generated token count.
        """
        img = self._load_image(image_or_pdf, page=page)
        prompt = "<image>\nConvert this table to HTML markup."

        output_text, latency_ms, gen_tokens = self._generate(img, prompt)

        return TableResult(
            raw_table=output_text,
            markdown=self._html_table_to_markdown(output_text),
            latency_ms=latency_ms,
            num_tokens_generated=gen_tokens
        )

    def _html_table_to_markdown(self, html: str) -> str:
        """
        Convert a basic HTML table to a Markdown table.

        The first row that contains cells becomes the header; later rows are
        padded or truncated to the header width. Tags are matched
        case-insensitively and may carry attributes. Inside a cell, nested
        markup is dropped, entities are unescaped, whitespace is collapsed
        and ``|`` is escaped so it cannot split a column.

        Returns:
            The Markdown table, or ``html`` unchanged when no table row with
            cells is found.
        """
        # Local import: the ``html`` parameter shadows the module name here.
        from html import unescape

        flags = re.DOTALL | re.IGNORECASE

        def clean(cell: str) -> str:
            text = re.sub(r"<br\s*/?>", " ", cell, flags=re.IGNORECASE)
            text = re.sub(r"<[^>]+>", "", text)  # drop nested markup
            text = " ".join(unescape(text).split())  # keep each row on one line
            return text.replace("|", "\\|")

        def cells_of(row: str) -> list:
            found = re.findall(r"<t[dh]\b[^>]*>(.*?)</t[dh]\s*>", row, flags)
            return [clean(cell) for cell in found]

        rows = re.findall(r"<tr\b[^>]*>(.*?)</tr\s*>", html, flags)
        table = [cells for cells in map(cells_of, rows) if cells]
        if not table:
            return html

        headers, body = table[0], table[1:]
        width = len(headers)

        md_rows = [
            "| " + " | ".join(headers) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]
        for cells in body:
            # Normalise ragged rows to the header width.
            cells = (cells + [""] * width)[:width]
            md_rows.append("| " + " | ".join(cells) + " |")

        return "\n".join(md_rows)
|
tinydoc/models.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from typing import Dict, Any, List, Optional
|
|
3
|
+
|
|
4
|
+
# Pydantic model returned for JSON / key-value extraction requests.
class ExtractionResult(BaseModel):
    """Result of a general JSON or key-value extraction task."""
    # Model output exactly as decoded, before any JSON parsing.
    raw_text: str = Field(description="The raw string output generated by the model.")
    # Optional: defaults to a fresh empty dict when not supplied.
    fields: Dict[str, Any] = Field(default_factory=dict, description="Parsed JSON fields extracted from the document.")
    # Required; milliseconds.
    latency_ms: float = Field(description="Inference latency in milliseconds.")
    # Required; count of generated tokens.
    num_tokens_generated: int = Field(description="Number of tokens generated by the model.")
|
|
10
|
+
|
|
11
|
+
# Pydantic model returned for document question answering requests.
# All four fields are required (none declares a default).
class QAResult(BaseModel):
    """Result of a question answering task."""
    # The question text as supplied by the caller.
    question: str = Field(description="The question that was asked.")
    # The model's answer text.
    answer: str = Field(description="The answer generated by the model.")
    # Milliseconds.
    latency_ms: float = Field(description="Inference latency in milliseconds.")
    # Count of generated tokens.
    num_tokens_generated: int = Field(description="Number of tokens generated by the model.")
|
|
17
|
+
|
|
18
|
+
# Pydantic model returned for table extraction requests.
# All four fields are required (none declares a default).
class TableResult(BaseModel):
    """Result of a table extraction task."""
    # Table text exactly as produced by the model.
    raw_table: str = Field(description="The raw table output from the model (e.g., HTML structure).")
    # Markdown rendering of the same table.
    markdown: str = Field(description="Markdown representation of the extracted table.")
    # Milliseconds.
    latency_ms: float = Field(description="Inference latency in milliseconds.")
    # Count of generated tokens.
    num_tokens_generated: int = Field(description="Number of tokens generated by the model.")
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tinydoc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for TinyDoc-VLM document understanding — the world's smallest document-specialist VLM
|
|
5
|
+
Home-page: https://github.com/eulogik/TinyDoc-VLM
|
|
6
|
+
Author: eulogik
|
|
7
|
+
Author-email: hello@eulogik.com
|
|
8
|
+
Project-URL: HuggingFace Model, https://huggingface.co/eulogik/TinyDoc-VLM-256M
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/eulogik/TinyDoc-VLM/issues
|
|
10
|
+
Project-URL: Documentation, https://github.com/eulogik/TinyDoc-VLM#readme
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: pydantic>=2.8.0
|
|
20
|
+
Requires-Dist: pillow>=10.0.0
|
|
21
|
+
Requires-Dist: torch>=2.2.0
|
|
22
|
+
Requires-Dist: numpy>=1.26.0
|
|
23
|
+
Requires-Dist: transformers>=4.48.0
|
|
24
|
+
Requires-Dist: sentencepiece>=0.2.0
|
|
25
|
+
Provides-Extra: onnx
|
|
26
|
+
Requires-Dist: onnxruntime>=1.19.0; extra == "onnx"
|
|
27
|
+
Requires-Dist: optimum>=1.22.0; extra == "onnx"
|
|
28
|
+
Dynamic: author
|
|
29
|
+
Dynamic: author-email
|
|
30
|
+
Dynamic: classifier
|
|
31
|
+
Dynamic: description-content-type
|
|
32
|
+
Dynamic: home-page
|
|
33
|
+
Dynamic: project-url
|
|
34
|
+
Dynamic: provides-extra
|
|
35
|
+
Dynamic: requires-dist
|
|
36
|
+
Dynamic: requires-python
|
|
37
|
+
Dynamic: summary
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
tinydoc/__init__.py,sha256=v39M2kwxqJaKdumaFZr1lcEnHyWlmcgUQIWTGXMhG_M,179
|
|
2
|
+
tinydoc/extractor.py,sha256=MfFo0RszmwLtu9UD-l-HKJ6MeBDVfa3crdmoMmiTtBk,11206
|
|
3
|
+
tinydoc/models.py,sha256=chPrSel541ZN4ddSewLgld2NkaweoWdEPRkxsbHb8dI,1399
|
|
4
|
+
tinydoc-0.1.0.dist-info/METADATA,sha256=k49tmy9yX8i6RoJAB6jF_sABt4OX8uQ6WtzwY9kzKwc,1394
|
|
5
|
+
tinydoc-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
tinydoc-0.1.0.dist-info/top_level.txt,sha256=cJfLZqrxpudm7HLN0LH2j2YfPQ6MIUYp9OwE97JLeKI,8
|
|
7
|
+
tinydoc-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tinydoc
|