sinapsis_deepseek_ocr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ b/sinapsis_deepseek_ocr/helpers/bbox_utils.py
@@ -0,0 +1,18 @@
+def denormalize_bbox(coords: tuple[int, int, int, int], width: int, height: int) -> tuple[int, int, int, int]:
+    """Convert normalized 0-999 coordinates to pixel coordinates.
+
+    Args:
+        coords: (x1, y1, x2, y2) in 0-999 range.
+        width: Image width in pixels.
+        height: Image height in pixels.
+
+    Returns:
+        (x1, y1, x2, y2) in pixel coordinates.
+    """
+    x1, y1, x2, y2 = coords
+    return (
+        round(x1 * width / 999),
+        round(y1 * height / 999),
+        round(x2 * width / 999),
+        round(y2 * height / 999),
+    )
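
For reference, a minimal usage sketch of denormalize_bbox above (illustration only, not part of the packaged code); the 1920x1080 dimensions are arbitrary, chosen only to show the 0-999 to pixel mapping:

    from sinapsis_deepseek_ocr.helpers.bbox_utils import denormalize_bbox

    # A full-range box maps to the full pixel extent of the image.
    print(denormalize_bbox((0, 0, 999, 999), width=1920, height=1080))    # (0, 0, 1920, 1080)
    # A partial box is scaled and rounded to the nearest pixel.
    print(denormalize_bbox((100, 200, 500, 600), width=1920, height=1080))  # (192, 216, 961, 649)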
--- /dev/null
+++ b/sinapsis_deepseek_ocr/helpers/grounding_parser.py
@@ -0,0 +1,39 @@
+"""Parser for DeepSeek-OCR grounding output."""
+
+import ast
+import re
+
+from sinapsis_deepseek_ocr.helpers.schemas import GroundingResult
+
+GROUNDING_PATTERN = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
+
+
+def _parse_coordinates(coords_str: str) -> list[tuple[int, int, int, int]] | None:
+    """Parse a coordinates string, returning None on failure."""
+    try:
+        coords_list = ast.literal_eval(coords_str)
+        if isinstance(coords_list, list):
+            return [tuple(c) for c in coords_list]
+        return None
+    except (SyntaxError, ValueError):
+        return None
+
+
+def parse_grounding_output(text: str) -> list[GroundingResult]:
+    """Parse grounding tags from model output.
+
+    Args:
+        text: Raw model output containing grounding tags.
+
+    Returns:
+        List of GroundingResult with parsed grounding data.
+    """
+    matches = re.findall(GROUNDING_PATTERN, text, re.DOTALL)
+    results: list[GroundingResult] = []
+
+    for _full_match, label, coords_str in matches:
+        coordinates = _parse_coordinates(coords_str)
+        if coordinates is not None:
+            results.append(GroundingResult(label=label, coordinates=coordinates))
+
+    return results
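
A small example of the parser in action (illustration only); the tag layout follows GROUNDING_PATTERN above, and the labels and coordinate values are invented:

    from sinapsis_deepseek_ocr.helpers.grounding_parser import parse_grounding_output

    sample = (
        "<|ref|>Invoice No.<|/ref|><|det|>[[120, 40, 480, 90]]<|/det|>\n"
        "<|ref|>Total<|/ref|><|det|>[[100, 800, 300, 860]]<|/det|>"
    )
    for result in parse_grounding_output(sample):
        print(result.label, result.coordinates)
    # Invoice No. [(120, 40, 480, 90)]
    # Total [(100, 800, 300, 860)]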
--- /dev/null
+++ b/sinapsis_deepseek_ocr/helpers/mode_registry.py
@@ -0,0 +1,33 @@
+from pydantic import BaseModel
+
+
+class DeepSeekOCRModeConfig(BaseModel):
+    """Configuration for a DeepSeek OCR inference mode.
+
+    Attributes:
+        base_size: The base resolution for image processing.
+        image_size: The target image size for inference.
+        crop_mode: Whether to use crop mode for large images.
+    """
+
+    base_size: int
+    image_size: int
+    crop_mode: bool
+
+
+class DeepSeekOCRModeRegistry:
+    """Registry of predefined DeepSeek OCR mode configurations.
+
+    Attributes:
+        TINY: Configuration for tiny mode (512x512, no crop).
+        SMALL: Configuration for small mode (640x640, no crop).
+        GUNDAM: Configuration for gundam mode (1024 base, 640 image, with crop).
+        BASE: Configuration for base mode (1024x1024, no crop).
+        LARGE: Configuration for large mode (1280x1280, no crop).
+    """
+
+    TINY = DeepSeekOCRModeConfig(base_size=512, image_size=512, crop_mode=False)
+    SMALL = DeepSeekOCRModeConfig(base_size=640, image_size=640, crop_mode=False)
+    GUNDAM = DeepSeekOCRModeConfig(base_size=1024, image_size=640, crop_mode=True)
+    BASE = DeepSeekOCRModeConfig(base_size=1024, image_size=1024, crop_mode=False)
+    LARGE = DeepSeekOCRModeConfig(base_size=1280, image_size=1280, crop_mode=False)
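
The registry is consumed by mode name; a minimal sketch (mirroring the lookup done in _get_mode_config further down in this diff):

    from sinapsis_deepseek_ocr.helpers.mode_registry import DeepSeekOCRModeRegistry

    # Resolve the user-facing mode name to the keyword arguments passed to infer().
    config = getattr(DeepSeekOCRModeRegistry, "gundam".upper())
    print(config.model_dump())  # {'base_size': 1024, 'image_size': 640, 'crop_mode': True}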
--- /dev/null
+++ b/sinapsis_deepseek_ocr/helpers/schemas.py
@@ -0,0 +1,59 @@
+from typing import Literal
+
+import torch
+from pydantic import BaseModel, ConfigDict, model_validator
+from pydantic.dataclasses import dataclass
+from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
+
+_DTYPE_MAP = {"float16": torch.float16, "bfloat16": torch.bfloat16}
+
+
+class DeepSeekOCRInitArgs(BaseModel):
+    """Initialization arguments for the DeepSeek OCR model.
+
+    Note: This model requires CUDA. CPU inference is not supported as DeepSeek's
+    infer() method internally requires CUDA tensors.
+
+    Attributes:
+        pretrained_model_name_or_path (str): HuggingFace model identifier or local path.
+        cache_dir (str): Directory to cache downloaded models.
+        torch_dtype (Literal["float16", "bfloat16", "auto"] | torch.dtype): Precision for model weights.
+        attn_implementation (Literal["flash_attention_2", "eager"]): Attention implementation.
+        trust_remote_code (Literal[True]): Whether to trust remote code from HuggingFace.
+        use_safetensors (Literal[True]): Whether to use safetensors format.
+    """
+
+    pretrained_model_name_or_path: str = "deepseek-ai/DeepSeek-OCR"
+    cache_dir: str = SINAPSIS_CACHE_DIR
+    torch_dtype: Literal["float16", "bfloat16", "auto"] | torch.dtype = "auto"
+    attn_implementation: Literal["flash_attention_2", "eager"] = "flash_attention_2"
+    trust_remote_code: Literal[True] = True
+    use_safetensors: Literal[True] = True
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    @model_validator(mode="after")
+    def resolve_torch_dtype(self) -> "DeepSeekOCRInitArgs":
+        """Resolve 'auto' torch_dtype to 'float16' or 'bfloat16' based on availability.
+
+        Returns:
+            DeepSeekOCRInitArgs: The validated instance with resolved torch_dtype.
+        """
+        if self.torch_dtype == "auto":
+            self.torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        elif isinstance(self.torch_dtype, str):
+            self.torch_dtype = _DTYPE_MAP.get(self.torch_dtype, self.torch_dtype)
+        return self
+
+
+@dataclass
+class GroundingResult:
+    """A single grounding result with label and bounding box coordinates.
+
+    Attributes:
+        label: The text label for this grounding region.
+        coordinates: List of (x1, y1, x2, y2) tuples in normalized 0-999 range.
+    """
+
+    label: str
+    coordinates: list[tuple[int, int, int, int]]
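
A minimal sketch of how the validator resolves string dtypes (illustration only); the explicit "bfloat16"/"eager" values are chosen to avoid the CUDA capability check behind "auto":

    from sinapsis_deepseek_ocr.helpers.schemas import DeepSeekOCRInitArgs

    args = DeepSeekOCRInitArgs(torch_dtype="bfloat16", attn_implementation="eager")
    print(args.torch_dtype)                    # torch.bfloat16 (resolved via _DTYPE_MAP)
    print(args.pretrained_model_name_or_path)  # deepseek-ai/DeepSeek-OCR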
--- /dev/null
+++ b/sinapsis_deepseek_ocr/helpers/tags.py
@@ -0,0 +1,19 @@
+from enum import Enum
+
+
+class Tags(Enum):
+    """UI tags for categorizing DeepSeek OCR templates.
+
+    Attributes:
+        DEEPSEEK: Tag for DeepSeek-related templates.
+        IMAGE: Tag for image processing templates.
+        OCR: Tag for optical character recognition templates.
+        TEXT: Tag for text-related templates.
+        TEXT_RECOGNITION: Tag for text recognition templates.
+    """
+
+    DEEPSEEK = "deepseek"
+    IMAGE = "image"
+    OCR = "optical_character_recognition"
+    TEXT = "text"
+    TEXT_RECOGNITION = "text_recognition"
--- /dev/null
+++ b/sinapsis_deepseek_ocr/templates/__init__.py
@@ -0,0 +1,18 @@
+import importlib
+from collections.abc import Callable
+
+_root_lib_path = "sinapsis_deepseek_ocr.templates"
+
+_template_lookup = {
+    "DeepSeekOCRInference": f"{_root_lib_path}.deepseek_ocr_inference",
+}
+
+
+def __getattr__(name: str) -> Callable:
+    if name in _template_lookup:
+        module = importlib.import_module(_template_lookup[name])
+        return getattr(module, name)
+    raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
+
+
+__all__ = list(_template_lookup.keys())
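
The lazy __getattr__ above means the heavy inference module (and its torch/transformers imports) is only loaded on first attribute access, e.g.:

    # Importing the symbol triggers __getattr__, which imports
    # sinapsis_deepseek_ocr.templates.deepseek_ocr_inference on demand.
    from sinapsis_deepseek_ocr.templates import DeepSeekOCRInference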
--- /dev/null
+++ b/sinapsis_deepseek_ocr/templates/deepseek_ocr_inference.py
@@ -0,0 +1,218 @@
+import gc
+import os
+import tempfile
+from typing import Any, Literal
+
+import cv2
+import torch
+from pydantic import Field
+from sinapsis_core.data_containers.annotations import BoundingBox, ImageAnnotations
+from sinapsis_core.data_containers.data_packet import DataContainer, ImageColor, ImagePacket, TextPacket
+from sinapsis_core.template_base import Template
+from sinapsis_core.template_base.base_models import (
+    OutputTypes,
+    TemplateAttributes,
+    UIPropertiesMetadata,
+)
+from sinapsis_generic_data_tools.helpers.image_color_space_converter_cv import convert_color_space_cv
+from transformers import AutoModel, AutoTokenizer
+
+from sinapsis_deepseek_ocr.helpers.bbox_utils import denormalize_bbox
+from sinapsis_deepseek_ocr.helpers.grounding_parser import parse_grounding_output
+from sinapsis_deepseek_ocr.helpers.mode_registry import DeepSeekOCRModeRegistry
+from sinapsis_deepseek_ocr.helpers.schemas import DeepSeekOCRInitArgs, GroundingResult
+from sinapsis_deepseek_ocr.helpers.tags import Tags
+
+
+class DeepSeekOCRInferenceAttributes(TemplateAttributes):
+    """Attributes for DeepSeek OCR inference.
+
+    Attributes:
+        prompt: The prompt to send to the model (without <image> or <|grounding|> tags).
+        enable_grounding: Whether to enable grounding for bounding box extraction.
+        mode: The inference mode configuration to use.
+        init_args: Initialization arguments for the model.
+    """
+
+    prompt: str = "OCR the image."
+    enable_grounding: bool = False
+    mode: Literal["tiny", "small", "gundam", "base", "large"] = "base"
+    init_args: DeepSeekOCRInitArgs = Field(default_factory=DeepSeekOCRInitArgs)
+
+
+class DeepSeekOCRInference(Template):
+    """Template for DeepSeek OCR inference.
+
+    This template uses the DeepSeek OCR model to extract text from images.
+
+    Attributes:
+        MODE_REGISTRY: Registry of available inference mode configurations.
+        AttributesBaseModel: The Pydantic model for template attributes.
+        UIProperties: UI metadata for the template.
+    """
+
+    MODE_REGISTRY = DeepSeekOCRModeRegistry
+    AttributesBaseModel = DeepSeekOCRInferenceAttributes
+    UIProperties = UIPropertiesMetadata(
+        category="OCR",
+        output_type=OutputTypes.MULTIMODAL,
+        tags=[
+            Tags.DEEPSEEK,
+            Tags.IMAGE,
+            Tags.OCR,
+            Tags.TEXT,
+            Tags.TEXT_RECOGNITION,
+        ],
+    )
+
+    def __init__(self, attributes: TemplateAttributes) -> None:
+        super().__init__(attributes)
+        self.initialize()
+
+    def initialize(self) -> None:
+        """Load the model and tokenizer from pretrained weights."""
+        self.model = self._initialize_model()
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.attributes.init_args.pretrained_model_name_or_path,
+            cache_dir=self.attributes.init_args.cache_dir,
+            trust_remote_code=self.attributes.init_args.trust_remote_code,
+        )
+        self.mode_config = self._get_mode_config()
+        self.full_prompt = self._build_prompt()
+
+    def _initialize_model(self) -> AutoModel:
+        """Initialize and return the model with appropriate configuration.
+
+        Note: Always uses CUDA as DeepSeek's infer() method requires it.
+
+        Returns:
+            AutoModel: The initialized model ready for inference.
+        """
+        model = AutoModel.from_pretrained(**self.attributes.init_args.model_dump())
+        return model.eval().cuda().to(self.attributes.init_args.torch_dtype)
+
+    def _get_mode_config(self) -> dict[str, Any]:
+        """Get the mode configuration for the current inference mode.
+
+        Returns:
+            dict[str, Any]: The mode configuration as a dictionary.
+        """
+        mode_config = getattr(self.MODE_REGISTRY, self.attributes.mode.upper())
+        return mode_config.model_dump()
+
+    def _build_prompt(self) -> str:
+        """Build the full prompt with appropriate tags.
+
+        Returns:
+            Full prompt with <image> and optional <|grounding|> tags.
+        """
+        if self.attributes.enable_grounding:
+            return f"<image>\n<|grounding|>{self.attributes.prompt}"
+        return f"<image>\n{self.attributes.prompt}"
+
+    def infer(self, image_packet: ImagePacket) -> str:
+        """Run OCR inference on an image packet.
+
+        Args:
+            image_packet: The image packet containing the image to process.
+
+        Returns:
+            str: The raw OCR result from the model.
+        """
+        fd, tmp_path = tempfile.mkstemp(suffix=".jpg")
+        os.close(fd)
+
+        try:
+            image_packet = convert_color_space_cv(image_packet=image_packet, desired_color_space=ImageColor.BGR)
+            cv2.imwrite(tmp_path, image_packet.content)
+
+            raw_result = self.model.infer(
+                tokenizer=self.tokenizer,
+                prompt=self.full_prompt,
+                image_file=tmp_path,
+                output_path=self.attributes.init_args.cache_dir,
+                eval_mode=True,
+                save_results=False,
+                **self.mode_config,
+            )
+
+            return raw_result
+
+        finally:
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+
+    @staticmethod
+    def grounding_to_annotations(results: list[GroundingResult], width: int, height: int) -> list[ImageAnnotations]:
+        """Convert GroundingResult list to ImageAnnotations.
+
+        Args:
+            results: List of parsed grounding results.
+            width: Image width in pixels.
+            height: Image height in pixels.
+
+        Returns:
+            List of ImageAnnotations with denormalized bounding boxes.
+        """
+        annotations: list[ImageAnnotations] = []
+
+        for result in results:
+            for coords in result.coordinates:
+                x1, y1, x2, y2 = denormalize_bbox(coords, width, height)
+                bbox = BoundingBox(x=x1, y=y1, w=x2 - x1, h=y2 - y1)
+                ann = ImageAnnotations(
+                    label_str=result.label,
+                    bbox=bbox,
+                    text=result.label,
+                )
+                annotations.append(ann)
+
+        return annotations
+
+    @staticmethod
+    def clear_memory() -> None:
+        """Clear memory to free up resources.
+
+        Performs garbage collection and clears GPU memory if available.
+        """
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+
+    def reset_state(self, template_name: str | None = None) -> None:
+        """Reset the template state by reinitializing the model.
+
+        Args:
+            template_name: Optional template name (unused, for interface compatibility).
+        """
+        _ = template_name
+
+        if hasattr(self, "model"):
+            del self.model
+
+        self.clear_memory()
+        self.initialize()
+        self.logger.info(f"Reset template instance `{self.instance_name}`")
+
+    def execute(self, container: DataContainer) -> DataContainer:
+        """Execute OCR on all images in the container.
+
+        Args:
+            container: The data container with images to process.
+
+        Returns:
+            DataContainer: The container with annotations or text packets.
+        """
+        for image_packet in container.images:
+            raw_result = self.infer(image_packet)
+
+            if self.attributes.enable_grounding:
+                grounding_results = parse_grounding_output(raw_result)
+                height, width = image_packet.shape[:2]
+                annotations = self.grounding_to_annotations(grounding_results, width, height)
+                image_packet.annotations.extend(annotations)
+            else:
+                container.texts.append(TextPacket(content=raw_result))
+
+        return container
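
A hedged end-to-end sketch of the grounding path taken by execute(), using only helpers shown in this diff (illustration only); the raw string and the 1280x720 image size are invented:

    from sinapsis_deepseek_ocr.helpers.grounding_parser import parse_grounding_output
    from sinapsis_deepseek_ocr.templates.deepseek_ocr_inference import DeepSeekOCRInference

    raw = "<|ref|>Header<|/ref|><|det|>[[10, 10, 500, 80]]<|/det|>"
    results = parse_grounding_output(raw)

    # grounding_to_annotations denormalizes each box and wraps it in an ImageAnnotations entry.
    annotations = DeepSeekOCRInference.grounding_to_annotations(results, width=1280, height=720)
    for ann in annotations:
        print(ann.label_str, ann.bbox.x, ann.bbox.y, ann.bbox.w, ann.bbox.h)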