sinapsis-deepseek-ocr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sinapsis_deepseek_ocr/__init__.py +0 -0
- sinapsis_deepseek_ocr/helpers/__init__.py +0 -0
- sinapsis_deepseek_ocr/helpers/bbox_utils.py +18 -0
- sinapsis_deepseek_ocr/helpers/grounding_parser.py +38 -0
- sinapsis_deepseek_ocr/helpers/mode_registry.py +33 -0
- sinapsis_deepseek_ocr/helpers/schemas.py +59 -0
- sinapsis_deepseek_ocr/helpers/tags.py +19 -0
- sinapsis_deepseek_ocr/templates/__init__.py +18 -0
- sinapsis_deepseek_ocr/templates/deepseek_ocr_inference.py +218 -0
- sinapsis_deepseek_ocr-0.1.0.dist-info/METADATA +295 -0
- sinapsis_deepseek_ocr-0.1.0.dist-info/RECORD +14 -0
- sinapsis_deepseek_ocr-0.1.0.dist-info/WHEEL +5 -0
- sinapsis_deepseek_ocr-0.1.0.dist-info/licenses/LICENSE +661 -0
- sinapsis_deepseek_ocr-0.1.0.dist-info/top_level.txt +1 -0
sinapsis_deepseek_ocr/__init__.py: File without changes

sinapsis_deepseek_ocr/helpers/__init__.py: File without changes

sinapsis_deepseek_ocr/helpers/bbox_utils.py
@@ -0,0 +1,18 @@
+def denormalize_bbox(coords: tuple[int, int, int, int], width: int, height: int) -> tuple[int, int, int, int]:
+    """Convert normalized 0-999 coordinates to pixel coordinates.
+
+    Args:
+        coords: (x1, y1, x2, y2) in 0-999 range.
+        width: Image width in pixels.
+        height: Image height in pixels.
+
+    Returns:
+        (x1, y1, x2, y2) in pixel coordinates.
+    """
+    x1, y1, x2, y2 = coords
+    return (
+        round(x1 * width / 999),
+        round(y1 * height / 999),
+        round(x2 * width / 999),
+        round(y2 * height / 999),
+    )
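
For orientation, a small standalone check of the coordinate arithmetic above (not part of the package); it assumes sinapsis-deepseek-ocr is installed, and the image size and boxes are made up for illustration:

from sinapsis_deepseek_ocr.helpers.bbox_utils import denormalize_bbox

# DeepSeek-OCR boxes use a 0-999 grid regardless of the actual image size;
# here they are mapped onto a hypothetical 1920x1080 image.
print(denormalize_bbox((0, 0, 999, 999), 1920, 1080))      # (0, 0, 1920, 1080)
print(denormalize_bbox((100, 200, 500, 600), 1920, 1080))  # (192, 216, 961, 649)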

sinapsis_deepseek_ocr/helpers/grounding_parser.py
@@ -0,0 +1,38 @@
+"""Parser for DeepSeek-OCR grounding output."""
+
+import re
+
+from sinapsis_deepseek_ocr.helpers.schemas import GroundingResult
+
+GROUNDING_PATTERN = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
+
+
+def _parse_coordinates(coords_str: str) -> list[tuple[int, int, int, int]] | None:
+    """Parse coordinates string, returning None on failure."""
+    try:
+        coords_list = eval(coords_str)
+        if isinstance(coords_list, list):
+            return [tuple(c) for c in coords_list]
+        return None
+    except (SyntaxError, ValueError):
+        return None
+
+
+def parse_grounding_output(text: str) -> list[GroundingResult]:
+    """Parse grounding tags from model output.
+
+    Args:
+        text: Raw model output containing grounding tags.
+
+    Returns:
+        List of GroundingResult with parsed grounding data.
+    """
+    matches = re.findall(GROUNDING_PATTERN, text, re.DOTALL)
+    results: list[GroundingResult] = []
+
+    for _full_match, label, coords_str in matches:
+        coordinates = _parse_coordinates(coords_str)
+        if coordinates is not None:
+            results.append(GroundingResult(label=label, coordinates=coordinates))
+
+    return results
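
A brief usage sketch of the parser (not part of the package), assuming sinapsis-deepseek-ocr is installed; the sample string is fabricated but follows the <|ref|>label<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|> layout that GROUNDING_PATTERN matches:

from sinapsis_deepseek_ocr.helpers.grounding_parser import parse_grounding_output

# Fabricated model output with one single-box region and one multi-box region.
sample = (
    "<|ref|>title<|/ref|><|det|>[[120, 35, 880, 92]]<|/det|>\n"
    "<|ref|>paragraph<|/ref|><|det|>[[110, 140, 900, 640], [110, 660, 900, 830]]<|/det|>"
)

for result in parse_grounding_output(sample):
    print(result.label, result.coordinates)
# title [(120, 35, 880, 92)]
# paragraph [(110, 140, 900, 640), (110, 660, 900, 830)]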

sinapsis_deepseek_ocr/helpers/mode_registry.py
@@ -0,0 +1,33 @@
+from pydantic import BaseModel
+
+
+class DeepSeekOCRModeConfig(BaseModel):
+    """Configuration for a DeepSeek OCR inference mode.
+
+    Attributes:
+        base_size: The base resolution for image processing.
+        image_size: The target image size for inference.
+        crop_mode: Whether to use crop mode for large images.
+    """
+
+    base_size: int
+    image_size: int
+    crop_mode: bool
+
+
+class DeepSeekOCRModeRegistry:
+    """Registry of predefined DeepSeek OCR mode configurations.
+
+    Attributes:
+        TINY: Configuration for tiny mode (512x512, no crop).
+        SMALL: Configuration for small mode (640x640, no crop).
+        GUNDAM: Configuration for gundam mode (1024 base, 640 image, with crop).
+        BASE: Configuration for base mode (1024x1024, no crop).
+        LARGE: Configuration for large mode (1280x1280, no crop).
+    """
+
+    TINY = DeepSeekOCRModeConfig(base_size=512, image_size=512, crop_mode=False)
+    SMALL = DeepSeekOCRModeConfig(base_size=640, image_size=640, crop_mode=False)
+    GUNDAM = DeepSeekOCRModeConfig(base_size=1024, image_size=640, crop_mode=True)
+    BASE = DeepSeekOCRModeConfig(base_size=1024, image_size=1024, crop_mode=False)
+    LARGE = DeepSeekOCRModeConfig(base_size=1280, image_size=1280, crop_mode=False)
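
A minimal sketch (not part of the package) of how a preset is looked up by name, mirroring the getattr/model_dump pattern used by DeepSeekOCRInference._get_mode_config further down; assumes the package is installed:

from sinapsis_deepseek_ocr.helpers.mode_registry import DeepSeekOCRModeRegistry

# Mode names from the template attributes are upper-cased and resolved on the registry,
# then dumped to a plain dict of keyword arguments for model.infer().
config = getattr(DeepSeekOCRModeRegistry, "gundam".upper())
print(config.model_dump())  # {'base_size': 1024, 'image_size': 640, 'crop_mode': True}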

sinapsis_deepseek_ocr/helpers/schemas.py
@@ -0,0 +1,59 @@
+from typing import Literal
+
+import torch
+from pydantic import BaseModel, ConfigDict, model_validator
+from pydantic.dataclasses import dataclass
+from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
+
+_DTYPE_MAP = {"float16": torch.float16, "bfloat16": torch.bfloat16}
+
+
+class DeepSeekOCRInitArgs(BaseModel):
+    """Initialization arguments for the DeepSeek OCR model.
+
+    Note: This model requires CUDA. CPU inference is not supported as DeepSeek's
+    infer() method internally requires CUDA tensors.
+
+    Attributes:
+        pretrained_model_name_or_path (str): HuggingFace model identifier or local path.
+        cache_dir (str): Directory to cache downloaded models.
+        torch_dtype (Literal["float16", "bfloat16", "auto"] | torch.dtype): Precision for model weights.
+        attn_implementation (Literal["flash_attention_2", "eager"]): Attention implementation.
+        trust_remote_code (Literal[True]): Whether to trust remote code from HuggingFace.
+        use_safetensors (Literal[True]): Whether to use safetensors format.
+    """
+
+    pretrained_model_name_or_path: str = "deepseek-ai/DeepSeek-OCR"
+    cache_dir: str = SINAPSIS_CACHE_DIR
+    torch_dtype: Literal["float16", "bfloat16", "auto"] | torch.dtype = "auto"
+    attn_implementation: Literal["flash_attention_2", "eager"] = "flash_attention_2"
+    trust_remote_code: Literal[True] = True
+    use_safetensors: Literal[True] = True
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    @model_validator(mode="after")
+    def resolve_torch_dtype(self) -> "DeepSeekOCRInitArgs":
+        """Resolve 'auto' torch_dtype to 'float16' or 'bfloat16' based on availability.
+
+        Returns:
+            DeepSeekOCRInitArgs: The validated instance with resolved torch_dtype.
+        """
+        if self.torch_dtype == "auto":
+            self.torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        elif isinstance(self.torch_dtype, str):
+            self.torch_dtype = _DTYPE_MAP.get(self.torch_dtype, self.torch_dtype)
+        return self
+
+
+@dataclass
+class GroundingResult:
+    """A single grounding result with label and bounding box coordinates.
+
+    Attributes:
+        label: The text label for this grounding region.
+        coordinates: List of (x1, y1, x2, y2) tuples in normalized 0-999 range.
+    """
+
+    label: str
+    coordinates: list[tuple[int, int, int, int]]
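
A small sketch (not part of the package) of the dtype resolution performed by resolve_torch_dtype; it assumes the package is installed and is run on a CUDA-capable host, since torch.cuda.is_bf16_supported() is consulted when torch_dtype is "auto":

import torch

from sinapsis_deepseek_ocr.helpers.schemas import DeepSeekOCRInitArgs

# "auto" resolves to bfloat16 when the GPU reports support, otherwise float16.
args = DeepSeekOCRInitArgs(torch_dtype="auto")
print(args.torch_dtype)  # torch.bfloat16 or torch.float16, depending on the hardware

# Explicit string dtypes are mapped to torch dtypes via _DTYPE_MAP.
args = DeepSeekOCRInitArgs(torch_dtype="float16")
print(args.torch_dtype is torch.float16)  # True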

sinapsis_deepseek_ocr/helpers/tags.py
@@ -0,0 +1,19 @@
+from enum import Enum
+
+
+class Tags(Enum):
+    """UI tags for categorizing DeepSeek OCR templates.
+
+    Attributes:
+        DEEPSEEK: Tag for DeepSeek-related templates.
+        IMAGE: Tag for image processing templates.
+        OCR: Tag for optical character recognition templates.
+        TEXT: Tag for text-related templates.
+        TEXT_RECOGNITION: Tag for text recognition templates.
+    """
+
+    DEEPSEEK = "deepseek"
+    IMAGE = "image"
+    OCR = "optical_character_recognition"
+    TEXT = "text"
+    TEXT_RECOGNITION = "text_recognition"

sinapsis_deepseek_ocr/templates/__init__.py
@@ -0,0 +1,18 @@
+import importlib
+from collections.abc import Callable
+
+_root_lib_path = "sinapsis_deepseek_ocr.templates"
+
+_template_lookup = {
+    "DeepSeekOCRInference": f"{_root_lib_path}.deepseek_ocr_inference",
+}
+
+
+def __getattr__(name: str) -> Callable:
+    if name in _template_lookup:
+        module = importlib.import_module(_template_lookup[name])
+        return getattr(module, name)
+    raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
+
+
+__all__ = list(_template_lookup.keys())
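
This module-level __getattr__ (PEP 562) defers importing the heavyweight inference module until the template is actually requested. A short sketch (not part of the package) of what that looks like from the caller's side, assuming the package is installed:

from sinapsis_deepseek_ocr import templates

print(templates.__all__)  # ['DeepSeekOCRInference']

# Attribute access triggers importlib.import_module(...), which is when torch,
# transformers, cv2, etc. are pulled in.
template_cls = templates.DeepSeekOCRInference
print(template_cls.__name__)  # DeepSeekOCRInference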

sinapsis_deepseek_ocr/templates/deepseek_ocr_inference.py
@@ -0,0 +1,218 @@
+import gc
+import os
+import tempfile
+from typing import Any, Literal
+
+import cv2
+import torch
+from pydantic import Field
+from sinapsis_core.data_containers.annotations import BoundingBox, ImageAnnotations
+from sinapsis_core.data_containers.data_packet import DataContainer, ImageColor, ImagePacket, TextPacket
+from sinapsis_core.template_base import Template
+from sinapsis_core.template_base.base_models import (
+    OutputTypes,
+    TemplateAttributes,
+    UIPropertiesMetadata,
+)
+from sinapsis_generic_data_tools.helpers.image_color_space_converter_cv import convert_color_space_cv
+from transformers import AutoModel, AutoTokenizer
+
+from sinapsis_deepseek_ocr.helpers.bbox_utils import denormalize_bbox
+from sinapsis_deepseek_ocr.helpers.grounding_parser import parse_grounding_output
+from sinapsis_deepseek_ocr.helpers.mode_registry import DeepSeekOCRModeRegistry
+from sinapsis_deepseek_ocr.helpers.schemas import DeepSeekOCRInitArgs, GroundingResult
+from sinapsis_deepseek_ocr.helpers.tags import Tags
+
+
+class DeepSeekOCRInferenceAttributes(TemplateAttributes):
+    """Attributes for DeepSeek OCR inference.
+
+    Attributes:
+        prompt: The prompt to send to the model (without <image> or <|grounding|> tags).
+        enable_grounding: Whether to enable grounding for bounding box extraction.
+        mode: The inference mode configuration to use.
+        init_args: Initialization arguments for the model.
+    """
+
+    prompt: str = "OCR the image."
+    enable_grounding: bool = False
+    mode: Literal["tiny", "small", "gundam", "base", "large"] = "base"
+    init_args: DeepSeekOCRInitArgs = Field(default_factory=DeepSeekOCRInitArgs)
+
+
+class DeepSeekOCRInference(Template):
+    """Template for DeepSeek OCR inference.
+
+    This template uses the DeepSeek OCR model to extract text from images.
+
+    Attributes:
+        MODE_REGISTRY: Registry of available inference mode configurations.
+        AttributesBaseModel: The Pydantic model for template attributes.
+        UIProperties: UI metadata for the template.
+    """
+
+    MODE_REGISTRY = DeepSeekOCRModeRegistry
+    AttributesBaseModel = DeepSeekOCRInferenceAttributes
+    UIProperties = UIPropertiesMetadata(
+        category="OCR",
+        output_type=OutputTypes.MULTIMODAL,
+        tags=[
+            Tags.DEEPSEEK,
+            Tags.IMAGE,
+            Tags.OCR,
+            Tags.TEXT,
+            Tags.TEXT_RECOGNITION,
+        ],
+    )
+
+    def __init__(self, attributes: TemplateAttributes) -> None:
+        super().__init__(attributes)
+        self.initialize()
+
+    def initialize(self) -> None:
+        """Load the model and tokenizer from pretrained weights."""
+        self.model = self._initialize_model()
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.attributes.init_args.pretrained_model_name_or_path,
+            cache_dir=self.attributes.init_args.cache_dir,
+            trust_remote_code=self.attributes.init_args.trust_remote_code,
+        )
+        self.mode_config = self._get_mode_config()
+        self.full_prompt = self._build_prompt()
+
+    def _initialize_model(self) -> AutoModel:
+        """Initialize and return the model with appropriate configuration.
+
+        Note: Always uses CUDA as DeepSeek's infer() method requires it.
+
+        Returns:
+            AutoModel: The initialized model ready for inference.
+        """
+        model = AutoModel.from_pretrained(**self.attributes.init_args.model_dump())
+        return model.eval().cuda().to(self.attributes.init_args.torch_dtype)
+
+    def _get_mode_config(self) -> dict[str, Any]:
+        """Get the mode configuration for the current inference mode.
+
+        Returns:
+            dict[str, Any]: The mode configuration as a dictionary.
+        """
+        mode_config = getattr(self.MODE_REGISTRY, self.attributes.mode.upper())
+        return mode_config.model_dump()
+
+    def _build_prompt(self) -> str:
+        """Build the full prompt with appropriate tags.
+
+        Returns:
+            Full prompt with <image> and optional <|grounding|> tags.
+        """
+        if self.attributes.enable_grounding:
+            return f"<image>\n<|grounding|>{self.attributes.prompt}"
+        return f"<image>\n{self.attributes.prompt}"
+
+    def infer(self, image_packet: ImagePacket) -> str:
+        """Run OCR inference on an image packet.
+
+        Args:
+            image_packet: The image packet containing the image to process.
+
+        Returns:
+            str: The raw OCR result from the model.
+        """
+        fd, tmp_path = tempfile.mkstemp(suffix=".jpg")
+        os.close(fd)
+
+        try:
+            image_packet = convert_color_space_cv(image_packet=image_packet, desired_color_space=ImageColor.BGR)
+            cv2.imwrite(tmp_path, image_packet.content)
+
+            raw_result = self.model.infer(
+                tokenizer=self.tokenizer,
+                prompt=self.full_prompt,
+                image_file=tmp_path,
+                output_path=self.attributes.init_args.cache_dir,
+                eval_mode=True,
+                save_results=False,
+                **self.mode_config,
+            )
+
+            return raw_result
+
+        finally:
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+
+    @staticmethod
+    def grounding_to_annotations(results: list[GroundingResult], width: int, height: int) -> list[ImageAnnotations]:
+        """Convert GroundingResult list to ImageAnnotations.
+
+        Args:
+            results: List of parsed grounding results.
+            width: Image width in pixels.
+            height: Image height in pixels.
+
+        Returns:
+            List of ImageAnnotations with denormalized bounding boxes.
+        """
+        annotations: list[ImageAnnotations] = []
+
+        for result in results:
+            for coords in result.coordinates:
+                x1, y1, x2, y2 = denormalize_bbox(coords, width, height)
+                bbox = BoundingBox(x=x1, y=y1, w=x2 - x1, h=y2 - y1)
+                ann = ImageAnnotations(
+                    label_str=result.label,
+                    bbox=bbox,
+                    text=result.label,
+                )
+                annotations.append(ann)
+
+        return annotations
+
+    @staticmethod
+    def clear_memory() -> None:
+        """Clear memory to free up resources.
+
+        Performs garbage collection and clears GPU memory if available.
+        """
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+
+    def reset_state(self, template_name: str | None = None) -> None:
+        """Reset the template state by reinitializing the model.
+
+        Args:
+            template_name: Optional template name (unused, for interface compatibility).
+        """
+        _ = template_name
+
+        if hasattr(self, "model"):
+            del self.model
+
+        self.clear_memory()
+        self.initialize()
+        self.logger.info(f"Reset template instance `{self.instance_name}`")
+
+    def execute(self, container: DataContainer) -> DataContainer:
+        """Execute OCR on all images in the container.
+
+        Args:
+            container: The data container with images to process.
+
+        Returns:
+            DataContainer: The container with annotations or text packets.
+        """
+        for image_packet in container.images:
+            raw_result = self.infer(image_packet)
+
+            if self.attributes.enable_grounding:
+                grounding_results = parse_grounding_output(raw_result)
+                height, width = image_packet.shape[:2]
+                annotations = self.grounding_to_annotations(grounding_results, width, height)
+                image_packet.annotations.extend(annotations)
+            else:
+                container.texts.append(TextPacket(content=raw_result))
+
+        return container
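
To illustrate the grounding branch of execute() without loading the model, a hypothetical sketch (not part of the package) that feeds a fabricated raw result through the same helpers the template uses; grounding_to_annotations is a staticmethod, so no model weights or GPU are needed, although the package dependencies (transformers, cv2, sinapsis-generic-data-tools) must be importable:

from sinapsis_deepseek_ocr.helpers.grounding_parser import parse_grounding_output
from sinapsis_deepseek_ocr.templates.deepseek_ocr_inference import DeepSeekOCRInference

# Fabricated grounding output in the format produced with enable_grounding=True,
# plus an invented page size in pixels.
raw_result = "<|ref|>invoice number<|/ref|><|det|>[[700, 40, 980, 90]]<|/det|>"
width, height = 1240, 1754

results = parse_grounding_output(raw_result)
annotations = DeepSeekOCRInference.grounding_to_annotations(results, width, height)
for ann in annotations:
    print(ann.label_str, ann.bbox)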