tinydoc 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tinydoc/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .extractor import TinyDocExtractor
2
+ from .models import ExtractionResult, QAResult, TableResult
3
+
4
+ __all__ = ["TinyDocExtractor", "ExtractionResult", "QAResult", "TableResult"]
tinydoc/extractor.py ADDED
@@ -0,0 +1,291 @@
1
+ import time
2
+ import re
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Union, Dict, Any, Optional, List
7
+
8
+ from PIL import Image
9
+ import torch
10
+ import numpy as np
11
+
12
+ # Try to import from tinydoc_vlm package
13
+ try:
14
+ from tinydoc_vlm import (
15
+ TinyDocVLMForConditionalGeneration,
16
+ TinyDocVLMProcessor,
17
+ TinyDocVLMConfig
18
+ )
19
+ except ImportError:
20
+ # If running from inside the repository and not installed as package
21
+ import sys
22
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
23
+ from tinydoc_vlm import (
24
+ TinyDocVLMForConditionalGeneration,
25
+ TinyDocVLMProcessor,
26
+ TinyDocVLMConfig
27
+ )
28
+
29
+ from .models import ExtractionResult, QAResult, TableResult
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+ class TinyDocExtractor:
34
+ """
35
+ Python SDK for TinyDoc-VLM document extraction and VQA.
36
+ Provides simple, one-liner APIs for document understanding tasks.
37
+ Supports PyTorch and ONNX Runtime backends.
38
+ """
39
+
40
def __init__(
    self,
    model_path_or_id: str = "eulogik/TinyDoc-VLM-256M",
    device: Optional[str] = None,
    use_onnx: bool = False,
    onnx_model_path: Optional[str] = None
):
    """
    Load the processor and exactly one inference backend.

    Args:
        model_path_or_id: Identifier or local directory handed to the
            processor's and the PyTorch model's ``from_pretrained``.
        device: Torch device string. When omitted, ``"cuda"`` is chosen if
            ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.
        use_onnx: Run inference through ONNX Runtime instead of PyTorch.
        onnx_model_path: Path to an ONNX file. Passing it implies
            ``use_onnx=True``. When omitted in ONNX mode, the path
            ``<model_path_or_id>/model.onnx`` is used.

    Raises:
        ImportError: If the ONNX backend is requested but ``onnxruntime``
            cannot be imported.
    """
    # An explicit ONNX file selects the ONNX backend even without the flag.
    self.use_onnx = use_onnx or (onnx_model_path is not None)
    self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))

    # Define every backend attribute up front so both modes expose the same
    # attribute set; the unused backend simply stays None.
    self.model = None
    self.session = None
    self.onnx_path = None

    # Load processor
    logger.info("Loading processor from %s...", model_path_or_id)
    try:
        self.processor = TinyDocVLMProcessor.from_pretrained(model_path_or_id)
    except Exception as e:
        # Deliberate best-effort: fall back to a default-configured processor.
        logger.warning(
            "Could not load processor from %s (%s). "
            "Initialising processor with default configuration.",
            model_path_or_id,
            e,
        )
        self.processor = TinyDocVLMProcessor()

    if self.use_onnx:
        try:
            import onnxruntime as ort
        except ImportError as err:
            raise ImportError(
                "onnxruntime is required for the ONNX backend; "
                "install it with `pip install tinydoc[onnx]`."
            ) from err

        self.onnx_path = onnx_model_path or str(Path(model_path_or_id) / "model.onnx")
        logger.info("Loading ONNX model from %s...", self.onnx_path)
        self.session = ort.InferenceSession(self.onnx_path)
    else:
        logger.info("Loading PyTorch model from %s...", model_path_or_id)
        try:
            self.model = TinyDocVLMForConditionalGeneration.from_pretrained(model_path_or_id)
            self.model.to(self.device)
            self.model.eval()
        except Exception as e:
            # Deliberate best-effort for development/testing: build an
            # untrained model from the default config rather than failing.
            logger.warning(
                "Could not load PyTorch weights from %s (%s). "
                "Creating a randomly initialized model for development/testing.",
                model_path_or_id,
                e,
            )
            config = TinyDocVLMConfig()
            self.model = TinyDocVLMForConditionalGeneration(config)
            # Match the decoder's embedding table to the tokenizer size.
            self.model.decoder.resize_token_embeddings(len(self.processor.tokenizer))
            self.model.to(self.device)
            self.model.eval()
80
+
81
def _load_image(self, image_or_pdf: Union[str, Path, Image.Image], page: int = 1) -> Image.Image:
    """
    Load the input as an RGB PIL image.

    Args:
        image_or_pdf: An already-opened PIL image, or a path to an image
            file or a ``.pdf`` file (suffix matched case-insensitively).
        page: Page number forwarded to ``convert_from_path`` as both
            ``first_page`` and ``last_page``; only used for PDFs.

    Returns:
        The image (or the requested PDF page) converted to ``"RGB"``.

    Raises:
        FileNotFoundError: If a path is given and it does not exist.
        ImportError: If a PDF is given and ``pdf2image`` is not installed.
        ValueError: If the PDF conversion yields no image for ``page``.
    """
    if isinstance(image_or_pdf, Image.Image):
        return image_or_pdf.convert("RGB")

    path = Path(image_or_pdf)
    if not path.exists():
        raise FileNotFoundError(f"Input file not found: {path}")

    if path.suffix.lower() != ".pdf":
        # Image.open is lazy and keeps the file open; convert() produces a
        # loaded copy, so the handle can be released as soon as we return.
        with Image.open(path) as opened:
            return opened.convert("RGB")

    # Guard only the import, so that an ImportError raised while actually
    # converting the PDF is not misreported as "pdf2image is missing".
    try:
        from pdf2image import convert_from_path
    except ImportError as err:
        raise ImportError(
            "Please install pdf2image (`pip install pdf2image`) and ensure poppler is installed to read PDFs."
        ) from err

    images = convert_from_path(path, first_page=page, last_page=page)
    if not images:
        raise ValueError(f"Could not extract page {page} from PDF: {path}")
    return images[0].convert("RGB")
101
+
102
def _generate(
    self,
    image: Image.Image,
    prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.2
) -> tuple[str, float, int]:
    """
    Run autoregressive text generation with the configured backend.

    Args:
        image: Document image passed to the processor.
        prompt: Text prompt passed to the processor.
        max_new_tokens: Upper bound on the number of decoding steps.
        temperature: Sampling temperature for the PyTorch backend; sampling
            is enabled only when it is greater than 0. The ONNX branch
            always decodes greedily and does not consult this value.

    Returns:
        A tuple ``(text, latency_ms, generated_tokens)``: the decoded and
        stripped continuation, the wall-clock duration of preprocessing plus
        generation in milliseconds, and the number of tokens produced.
    """
    # perf_counter is monotonic, so the measured latency cannot be skewed
    # (or turn negative) if the system clock is adjusted mid-call.
    start_time = time.perf_counter()

    # Preprocess text and image
    inputs = self.processor(text=prompt, images=image, padding=True)

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    pixel_values = inputs.get("pixel_values")

    generated_tokens = 0

    if self.use_onnx:
        # Greedy generation loop using ONNX. The whole sequence is fed to the
        # session again at every step.
        eos_token_id = self.processor.tokenizer.eos_token_id
        prompt_length = input_ids.shape[1]

        # The image tensor never changes between steps; convert it once.
        pixel_feed = pixel_values.numpy() if pixel_values is not None else None

        for _ in range(max_new_tokens):
            feed = {
                "input_ids": input_ids.numpy(),
                "attention_mask": attention_mask.numpy()
            }
            if pixel_feed is not None:
                feed["pixel_values"] = pixel_feed

            outputs = self.session.run(None, feed)
            # First output is indexed as [batch, position, vocab] below.
            logits = outputs[0]

            # Next token prediction (greedy): argmax at the last position.
            next_token = int(np.argmax(logits[0, -1, :]))
            generated_tokens += 1

            if next_token == eos_token_id:
                break

            # Append with the tensors' own dtypes so the feed dtype stays
            # stable across steps instead of being promoted by torch.cat.
            input_ids = torch.cat(
                [input_ids, torch.tensor([[next_token]], dtype=input_ids.dtype)],
                dim=-1,
            )
            attention_mask = torch.cat(
                [attention_mask, torch.tensor([[1]], dtype=attention_mask.dtype)],
                dim=-1,
            )

        # Decode only what was appended after the prompt.
        output_text = self.processor.tokenizer.decode(
            input_ids[0, prompt_length:],
            skip_special_tokens=True
        )
    else:
        # PyTorch generation
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        if pixel_values is not None:
            pixel_values = pixel_values.to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=temperature > 0.0,
                temperature=temperature if temperature > 0.0 else None,
                eos_token_id=self.processor.tokenizer.eos_token_id,
                pad_token_id=self.processor.tokenizer.pad_token_id,
            )

        # Everything past the prompt length counts as generated.
        generated_tokens = outputs.shape[1] - input_ids.shape[1]
        output_text = self.processor.tokenizer.decode(
            outputs[0, input_ids.shape[1]:],
            skip_special_tokens=True
        )

    latency_ms = (time.perf_counter() - start_time) * 1000
    return output_text.strip(), latency_ms, generated_tokens
180
+
181
def extract(
    self,
    image_or_pdf: Union[str, Path, Image.Image],
    output_format: str = "json",
    page: int = 1
) -> ExtractionResult:
    """
    Extracts document information as structured JSON fields.

    Args:
        image_or_pdf: PIL image, or path to an image or PDF file.
        output_format: Accepted for API compatibility; this method does not
            consult it and always prompts for JSON.
        page: Page number to use when the input is a PDF.

    Returns:
        An ExtractionResult with the raw model text, the parsed fields
        (empty when no JSON object could be recovered), the latency in
        milliseconds and the number of generated tokens.
    """
    img = self._load_image(image_or_pdf, page=page)

    prompt = "<image>\nExtract all fields as JSON."
    output_text, latency_ms, gen_tokens = self._generate(img, prompt)

    # Clean any markdown block formatting (e.g. ```json ... ```)
    cleaned = re.sub(r"```(?:json)?\s*(.*?)\s*```", r"\1", output_text, flags=re.DOTALL).strip()

    parsed = None
    decoded = False  # separate flag: JSON `null` also decodes to None
    try:
        parsed = json.loads(cleaned)
        decoded = True
    except json.JSONDecodeError:
        # Fallback: take the outermost {...} span and try that alone.
        match = re.search(r"\{.*\}", cleaned, re.DOTALL)
        if match:
            try:
                parsed = json.loads(match.group(0))
                decoded = True
            except json.JSONDecodeError:
                logger.warning("Failed to parse JSON structure from model output.")
        else:
            logger.warning("No JSON structure found in model output.")

    # `fields` must be a mapping; valid JSON that is a list, string, number
    # or null would otherwise fail validation when building the result.
    fields = {}
    if isinstance(parsed, dict):
        fields = parsed
    elif decoded:
        logger.warning("Model output is valid JSON but not an object; returning empty fields.")

    return ExtractionResult(
        raw_text=output_text,
        fields=fields,
        latency_ms=latency_ms,
        num_tokens_generated=gen_tokens
    )
219
+
220
def ask(
    self,
    image_or_pdf: Union[str, Path, Image.Image],
    question: str,
    page: int = 1
) -> QAResult:
    """
    Answer a free-form question about a document image or PDF page.

    The input is loaded as an image, the question is embedded in the
    prompt, and the generated text is returned as the answer together
    with the latency and token count reported by the generation step.
    """
    document_image = self._load_image(image_or_pdf, page=page)

    answer_text, elapsed_ms, token_count = self._generate(
        document_image,
        f"<image>\nAnswer the following question about this document: {question}",
    )

    result = QAResult(
        question=question,
        answer=answer_text,
        latency_ms=elapsed_ms,
        num_tokens_generated=token_count,
    )
    return result
240
+
241
def extract_table(
    self,
    image_or_pdf: Union[str, Path, Image.Image],
    page: int = 1,
    format: str = "markdown"
) -> TableResult:
    """
    Extract a table from a document page.

    The model is prompted for HTML markup; the result carries that raw
    output alongside a Markdown rendering derived from it. The ``format``
    argument is accepted but not consulted here: both representations are
    always returned.
    """
    page_image = self._load_image(image_or_pdf, page=page)

    raw_output, elapsed_ms, token_count = self._generate(
        page_image,
        "<image>\nConvert this table to HTML markup.",
    )

    return TableResult(
        raw_table=raw_output,
        markdown=self._html_table_to_markdown(raw_output),
        latency_ms=elapsed_ms,
        num_tokens_generated=token_count,
    )
263
+
264
def _html_table_to_markdown(self, html: str) -> str:
    """
    Convert a basic HTML table to a Markdown pipe table.

    The first ``<tr>`` supplies the header row; every later row is padded
    with empty cells or truncated to the header width. Tags are matched
    case-insensitively and may carry attributes. Whitespace inside a cell
    (including newlines) is collapsed to single spaces and ``|`` is escaped,
    since either would otherwise break the table layout.

    Args:
        html: Text expected to contain ``<tr>`` / ``<td>`` / ``<th>`` markup.

    Returns:
        The Markdown table, or ``html`` unchanged when no row or no cell
        could be found.
    """
    flags = re.DOTALL | re.IGNORECASE
    # `\b` keeps <tr>/<td>/<th> from matching longer names such as <thead>.
    row_pattern = r"<tr\b[^>]*>(.*?)</tr\s*>"
    cell_pattern = r"<t[dh]\b[^>]*>(.*?)</t[dh]\s*>"

    def clean(cell: str) -> str:
        # Collapse runs of whitespace, then escape the column delimiter.
        return " ".join(cell.split()).replace("|", "\\|")

    def render(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    rows = re.findall(row_pattern, html, flags)
    if not rows:
        return html

    md_rows = []

    # The first row is treated as the header, whether it uses <th> or <td>.
    headers = [clean(cell) for cell in re.findall(cell_pattern, rows[0], flags)]
    if headers:
        md_rows.append(render(headers))
        md_rows.append(render(["---"] * len(headers)))

    width = len(headers)
    for row in rows[1:]:
        cells = [clean(cell) for cell in re.findall(cell_pattern, row, flags)]
        if not cells:
            continue
        if width:
            # Pad short rows and drop surplus cells to match the header.
            cells = (cells + [""] * width)[:width]
        md_rows.append(render(cells))

    # Rows without any cell produce nothing; hand back the input instead of
    # an empty string, mirroring the "no rows" case above.
    return "\n".join(md_rows) if md_rows else html
tinydoc/models.py ADDED
@@ -0,0 +1,23 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import Dict, Any, List, Optional
3
+
4
class ExtractionResult(BaseModel):
    """Result of a general JSON or key-value extraction task."""

    # Model output exactly as generated.
    raw_text: str = Field(
        description="The raw string output generated by the model.",
    )
    # Parsed key/value payload; defaults to a fresh empty dict per instance.
    fields: Dict[str, Any] = Field(
        default_factory=dict,
        description="Parsed JSON fields extracted from the document.",
    )
    # Timing and size of the generation that produced this result.
    latency_ms: float = Field(
        description="Inference latency in milliseconds.",
    )
    num_tokens_generated: int = Field(
        description="Number of tokens generated by the model.",
    )
10
+
11
class QAResult(BaseModel):
    """Result of a question answering task."""

    # The question/answer pair.
    question: str = Field(
        description="The question that was asked.",
    )
    answer: str = Field(
        description="The answer generated by the model.",
    )
    # Timing and size of the generation that produced this result.
    latency_ms: float = Field(
        description="Inference latency in milliseconds.",
    )
    num_tokens_generated: int = Field(
        description="Number of tokens generated by the model.",
    )
17
+
18
class TableResult(BaseModel):
    """Result of a table extraction task."""

    # Two renderings of the same table: model output and its Markdown form.
    raw_table: str = Field(
        description="The raw table output from the model (e.g., HTML structure).",
    )
    markdown: str = Field(
        description="Markdown representation of the extracted table.",
    )
    # Timing and size of the generation that produced this result.
    latency_ms: float = Field(
        description="Inference latency in milliseconds.",
    )
    num_tokens_generated: int = Field(
        description="Number of tokens generated by the model.",
    )
@@ -0,0 +1,37 @@
1
+ Metadata-Version: 2.4
2
+ Name: tinydoc
3
+ Version: 0.1.0
4
+ Summary: Python SDK for TinyDoc-VLM document understanding — the world's smallest document-specialist VLM
5
+ Home-page: https://github.com/eulogik/TinyDoc-VLM
6
+ Author: eulogik
7
+ Author-email: hello@eulogik.com
8
+ Project-URL: HuggingFace Model, https://huggingface.co/eulogik/TinyDoc-VLM-256M
9
+ Project-URL: Bug Tracker, https://github.com/eulogik/TinyDoc-VLM/issues
10
+ Project-URL: Documentation, https://github.com/eulogik/TinyDoc-VLM#readme
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Intended Audience :: Developers
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: pydantic>=2.8.0
20
+ Requires-Dist: pillow>=10.0.0
21
+ Requires-Dist: torch>=2.2.0
22
+ Requires-Dist: numpy>=1.26.0
23
+ Requires-Dist: transformers>=4.48.0
24
+ Requires-Dist: sentencepiece>=0.2.0
25
+ Provides-Extra: onnx
26
+ Requires-Dist: onnxruntime>=1.19.0; extra == "onnx"
27
+ Requires-Dist: optimum>=1.22.0; extra == "onnx"
28
+ Dynamic: author
29
+ Dynamic: author-email
30
+ Dynamic: classifier
31
+ Dynamic: description-content-type
32
+ Dynamic: home-page
33
+ Dynamic: project-url
34
+ Dynamic: provides-extra
35
+ Dynamic: requires-dist
36
+ Dynamic: requires-python
37
+ Dynamic: summary
@@ -0,0 +1,7 @@
1
+ tinydoc/__init__.py,sha256=v39M2kwxqJaKdumaFZr1lcEnHyWlmcgUQIWTGXMhG_M,179
2
+ tinydoc/extractor.py,sha256=MfFo0RszmwLtu9UD-l-HKJ6MeBDVfa3crdmoMmiTtBk,11206
3
+ tinydoc/models.py,sha256=chPrSel541ZN4ddSewLgld2NkaweoWdEPRkxsbHb8dI,1399
4
+ tinydoc-0.1.0.dist-info/METADATA,sha256=k49tmy9yX8i6RoJAB6jF_sABt4OX8uQ6WtzwY9kzKwc,1394
5
+ tinydoc-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
6
+ tinydoc-0.1.0.dist-info/top_level.txt,sha256=cJfLZqrxpudm7HLN0LH2j2YfPQ6MIUYp9OwE97JLeKI,8
7
+ tinydoc-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ tinydoc