xinference 1.11.0.post1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.

Files changed (39)
  1. xinference/__init__.py +8 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/oauth2/utils.py +26 -5
  4. xinference/core/model.py +1 -10
  5. xinference/device_utils.py +11 -1
  6. xinference/model/embedding/model_spec.json +70 -0
  7. xinference/model/image/core.py +20 -10
  8. xinference/model/image/model_spec.json +55 -3
  9. xinference/model/image/ocr/__init__.py +5 -0
  10. xinference/model/image/ocr/deepseek_ocr.py +958 -0
  11. xinference/model/llm/core.py +2 -0
  12. xinference/model/llm/llama_cpp/core.py +2 -0
  13. xinference/model/llm/llm_family.json +319 -6
  14. xinference/model/llm/lmdeploy/core.py +2 -0
  15. xinference/model/llm/sglang/core.py +2 -0
  16. xinference/model/llm/transformers/core.py +2 -0
  17. xinference/model/llm/transformers/multimodal/qwen-omni.py +60 -11
  18. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  19. xinference/model/llm/vllm/core.py +2 -0
  20. xinference/model/rerank/model_spec.json +368 -252
  21. xinference/model/rerank/sentence_transformers/core.py +10 -2
  22. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +71 -5
  23. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +51 -1
  24. xinference/ui/gradio/media_interface.py +469 -4
  25. xinference/ui/gradio/utils/__init__.py +19 -0
  26. xinference/ui/gradio/utils/latex.py +342 -0
  27. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  28. xinference/ui/web/ui/build/index.html +1 -1
  29. xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js → main.87d6859b.js} +3 -3
  30. xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js.map → main.87d6859b.js.map} +1 -1
  31. xinference/ui/web/ui/node_modules/.cache/babel-loader/412a6b414a8267c7a349d9beda4593cdf218abf32edaaf339e6a230df40397b8.json +1 -0
  32. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/METADATA +10 -11
  33. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/RECORD +38 -35
  34. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +0 -1
  35. /xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js.LICENSE.txt → main.87d6859b.js.LICENSE.txt} +0 -0
  36. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/WHEEL +0 -0
  37. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/entry_points.txt +0 -0
  38. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/licenses/LICENSE +0 -0
  39. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/xinference/model/image/ocr/deepseek_ocr.py
@@ -0,0 +1,958 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import re
+import tempfile
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn as nn
+from torchvision import transforms
+
+if TYPE_CHECKING:
+    from ..core import ImageModelFamilyV2
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekOCRModelSize:
+    """DeepSeek-OCR model size configurations."""
+
+    TINY = ("tiny", 512, 512, False)
+    SMALL = ("small", 640, 640, False)
+    BASE = ("base", 1024, 1024, False)
+    LARGE = ("large", 1280, 1280, False)
+    GUNDAM = ("gundam", 1024, 640, True)
+
+    def __init__(self, size_type: str):
+        self.size_type = size_type
+        # Map size type to configuration
+        self._config_map = {
+            "tiny": self.TINY,
+            "small": self.SMALL,
+            "base": self.BASE,
+            "large": self.LARGE,
+            "gundam": self.GUNDAM,
+        }
+
+        if size_type in self._config_map:
+            self.name, self.base_size, self.image_size, self.crop_mode = (
+                self._config_map[size_type]
+            )
+        else:
+            # Default to Gundam
+            self.name, self.base_size, self.image_size, self.crop_mode = self.GUNDAM
+
+    @classmethod
+    def from_string(cls, size_str: str) -> "DeepSeekOCRModelSize":
+        """Get model size from string."""
+        return cls(size_str.lower())
+
+    def __str__(self) -> str:
+        return self.name
+
+
+def load_image(image_path: str) -> Optional[PIL.Image.Image]:
+    """Load image with EXIF correction."""
+    try:
+        image = PIL.Image.open(image_path)
+        # Correct image orientation based on EXIF data
+        corrected_image = PIL.ImageOps.exif_transpose(image)
+        return corrected_image
+    except Exception as e:
+        logger.error(f"Error loading image {image_path}: {e}")
+        try:
+            return PIL.Image.open(image_path)
+        except:
+            return None
+
+
+def find_closest_aspect_ratio(
+    aspect_ratio: float,
+    target_ratios: List[Tuple[int, int]],
+    width: int,
+    height: int,
+    image_size: int,
+) -> Tuple[int, int]:
+    """Find the closest aspect ratio to target."""
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+
+    return best_ratio
+
+
+def dynamic_preprocess(
+    image: PIL.Image.Image,
+    min_num: int = 2,
+    max_num: int = 9,
+    image_size: int = 640,
+    use_thumbnail: bool = False,
+) -> Tuple[List[PIL.Image.Image], Tuple[int, int]]:
+    """Dynamically preprocess image by cropping."""
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # Calculate target ratios
+    target_ratios = [
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    ]
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # Find the closest aspect ratio
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+
+    # Calculate target dimensions
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # Resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+
+    assert len(processed_images) == blocks
+
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+
+    return processed_images, target_aspect_ratio
+
+
+def normalize_transform(
+    mean: Optional[Union[Tuple[float, float, float], List[float]]],
+    std: Optional[Union[Tuple[float, float, float], List[float]]],
+):
+    """Create normalization transform."""
+    if mean is None and std is None:
+        return None
+    elif mean is None and std is not None:
+        mean = [0.0] * len(std)
+        return transforms.Normalize(mean=mean, std=std)
+    elif mean is not None and std is None:
+        std = [1.0] * len(mean)
+        return transforms.Normalize(mean=mean, std=std)
+    else:
+        return transforms.Normalize(mean=mean, std=std)
+
+
+class BasicImageTransform:
+    """Basic image transformation for DeepSeek-OCR."""
+
+    def __init__(
+        self,
+        mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
+        self.mean = mean
+        self.std = std
+
+        transform_pipelines = [transforms.ToTensor()]
+
+        if normalize:
+            normalize_transform_func = normalize_transform(mean, std)
+            if normalize_transform_func is not None:
+                transform_pipelines.append(normalize_transform_func)
+            else:
+                transform_pipelines.append(nn.Identity())
+
+        self.transform = transforms.Compose(transform_pipelines)
+
+    def __call__(self, x: PIL.Image.Image) -> torch.Tensor:
+        return self.transform(x)
+
+
+def re_match(text: str) -> Tuple[List[Tuple], List[str], List[str]]:
+    """Extract references and detections from text."""
+    pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
+    matches = re.findall(pattern, text, re.DOTALL)
+
+    mathes_image = []
+    mathes_other = []
+    for a_match in matches:
+        if "<|ref|>image<|/ref|>" in a_match[0]:
+            mathes_image.append(a_match[0])
+        else:
+            mathes_other.append(a_match[0])
+    return matches, mathes_image, mathes_other
+
+
+def extract_coordinates_and_label(
+    ref_text: Tuple, image_width: int, image_height: int
+) -> Optional[Tuple]:
+    """Extract coordinates and label from reference text."""
+    try:
+        label_type = ref_text[1]
+        cor_list = eval(ref_text[2])
+    except Exception as e:
+        logger.error(f"Error extracting coordinates: {e}")
+        return None
+
+    return (label_type, cor_list)
+
+
+def draw_bounding_boxes(
+    image: PIL.Image.Image, refs: List[Tuple], output_path: str
+) -> PIL.Image.Image:
+    """Draw bounding boxes on image with labels."""
+    image_width, image_height = image.size
+
+    img_draw = image.copy()
+    draw = PIL.ImageDraw.Draw(img_draw)
+
+    overlay = PIL.Image.new("RGBA", img_draw.size, (0, 0, 0, 0))
+    draw2 = PIL.ImageDraw.Draw(overlay)
+
+    # Use default font
+    try:
+        font = PIL.ImageFont.load_default()
+    except:
+        font = None
+
+    img_idx = 0
+
+    for i, ref in enumerate(refs):
+        try:
+            result = extract_coordinates_and_label(ref, image_width, image_height)
+            if result:
+                label_type, points_list = result
+
+                color = (
+                    np.random.randint(0, 200),
+                    np.random.randint(0, 200),
+                    np.random.randint(0, 255),
+                )
+                color_a = color + (20,)
+
+                for points in points_list:
+                    x1, y1, x2, y2 = points
+
+                    # Convert from relative coordinates (0-999) to absolute pixel coordinates
+                    x1 = int(x1 / 999 * image_width)
+                    y1 = int(y1 / 999 * image_height)
+                    x2 = int(x2 / 999 * image_width)
+                    y2 = int(y2 / 999 * image_height)
+
+                    if label_type == "image":
+                        try:
+                            cropped = image.crop((x1, y1, x2, y2))
+                            cropped.save(f"{output_path}/images/{img_idx}.jpg")
+                        except Exception as e:
+                            logger.error(f"Error saving cropped image: {e}")
+                        img_idx += 1
+
+                    try:
+                        if label_type == "title":
+                            draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
+                            draw2.rectangle(
+                                [x1, y1, x2, y2],
+                                fill=color_a,
+                                outline=(0, 0, 0, 0),
+                                width=1,
+                            )
+                        else:
+                            draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
+                            draw2.rectangle(
+                                [x1, y1, x2, y2],
+                                fill=color_a,
+                                outline=(0, 0, 0, 0),
+                                width=1,
+                            )
+
+                        if font:
+                            text_x = x1
+                            text_y = max(0, y1 - 15)
+
+                            text_bbox = draw.textbbox((0, 0), label_type, font=font)
+                            text_width = text_bbox[2] - text_bbox[0]
+                            text_height = text_bbox[3] - text_bbox[1]
+
+                            draw.rectangle(
+                                [
+                                    text_x,
+                                    text_y,
+                                    text_x + text_width,
+                                    text_y + text_height,
+                                ],
+                                fill=(255, 255, 255, 30),
+                            )
+
+                            draw.text(
+                                (text_x, text_y), label_type, font=font, fill=color
+                            )
+                    except Exception as e:
+                        logger.error(f"Error drawing text: {e}")
+                        pass
+        except Exception as e:
+            logger.error(f"Error processing reference: {e}")
+            continue
+
+    img_draw.paste(overlay, (0, 0), overlay)
+    return img_draw
+
+
+def process_image_with_refs(
+    image: PIL.Image.Image, ref_texts: List[Tuple], output_path: str
+) -> PIL.Image.Image:
+    """Process image with reference texts and draw bounding boxes."""
+    result_image = draw_bounding_boxes(image, ref_texts, output_path)
+    return result_image
+
+
+def clean_ocr_annotations(text: str) -> str:
+    """
+    Clean OCR annotations and return plain text.
+
+    Removes <|ref|>...<|/ref|><|det|>...<|/det|> annotations while preserving the text content.
+
+    Args:
+        text: Raw OCR output with annotations
+
+    Returns:
+        Cleaned plain text
+    """
+    if not isinstance(text, str):
+        return str(text)
+
+    # Pattern to match the full annotation blocks
+    annotation_pattern = r"<\|ref\|>.*?<\|/ref\|><\|det\|>\[\[.*?\]\]<\|/det\|>"
+
+    # Remove all annotation blocks
+    cleaned_text = re.sub(annotation_pattern, "", text, flags=re.DOTALL)
+
+    # Clean up extra whitespace and line breaks
+    cleaned_text = re.sub(r"\n\s*\n", "\n", cleaned_text.strip())
+
+    return cleaned_text
+
+
+def extract_text_blocks(text: str) -> List[Dict[str, Any]]:
+    """
+    Extract text blocks with their coordinates from OCR annotations.
+
+    Args:
+        text: Raw OCR output with annotations
+
+    Returns:
+        List of dictionaries containing text and coordinates
+    """
+    if not isinstance(text, str):
+        return []
+
+    # Pattern to extract text and coordinates
+    block_pattern = (
+        r"<\|ref\|>(.*?)<\|/ref\|><\|det\|>\[\[(.*?)\]\]<\|/det\|>(.*?)(?=<\|ref\|>|$)"
+    )
+
+    blocks = []
+    for match in re.finditer(block_pattern, text, re.DOTALL):
+        label_type = match.group(1).strip()
+        coords_str = match.group(2).strip()
+        content = match.group(3).strip()
+
+        try:
+            coords = eval(f"[{coords_str}]")  # Convert string coordinates to list
+            if isinstance(coords, list) and len(coords) > 0:
+                blocks.append(
+                    {
+                        "label_type": label_type,
+                        "coordinates": coords,
+                        "text": content,
+                        "bbox": coords[0] if len(coords) == 1 else coords,
+                    }
+                )
+        except:
+            # Skip if coordinates can't be parsed
+            continue
+
+    return blocks
+
+
+class DeepSeekOCRModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: Optional[str] = None,
+        device: Optional[str] = None,
+        model_spec: Optional["ImageModelFamilyV2"] = None,
+        **kwargs,
+    ):
+        self.model_family = model_spec
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._device = device
+        # model info when loading
+        self._model = None
+        self._tokenizer = None
+        # info
+        self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._abilities
+
+    def load(self):
+        from transformers import AutoModel, AutoTokenizer
+
+        logger.info(f"Loading DeepSeek-OCR model from {self._model_path}")
+
+        try:
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                self._model_path,
+                trust_remote_code=True,
+                use_fast=False,
+            )
+            if self._device != "cpu":
+                # Use CUDA if available
+                model = AutoModel.from_pretrained(
+                    self._model_path,
+                    trust_remote_code=True,
+                    low_cpu_mem_usage=True,
+                    device_map="auto",
+                    use_safetensors=True,
+                    pad_token_id=self._tokenizer.eos_token_id,
+                )
+                self._model = model.eval()
+            else:
+                # Force CPU-only execution
+                model = AutoModel.from_pretrained(
+                    self._model_path,
+                    trust_remote_code=True,
+                    low_cpu_mem_usage=True,
+                    device_map="cpu",
+                    use_safetensors=True,
+                    pad_token_id=self._tokenizer.eos_token_id,
+                    torch_dtype=torch.float32,  # Use float32 for CPU
+                )
+                self._model = model.eval()
+            logger.info("DeepSeek-OCR model loaded successfully")
+        except Exception as e:
+            logger.error(f"Failed to load DeepSeek-OCR model: {e}")
+            raise
+
+    def ocr(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        **kwargs,
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Perform OCR on single or multiple images with enhanced features.
+
+        Args:
+            image: PIL Image or list of PIL Images
+            **kwargs: Additional parameters including:
+                - prompt: OCR prompt (default: "<image>\nFree OCR.")
+                - model_size: Model size (default: "gundam")
+                - test_compress: Whether to test compression ratio (default: False)
+                - save_results: Whether to save results (default: False)
+                - save_dir: Directory to save results
+                - eval_mode: Whether to use evaluation mode (default: False)
+
+        Returns:
+            OCR results as dict or list of dicts
+        """
+        logger.info("DeepSeek-OCR kwargs: %s", kwargs)
+
+        # Set default values for DeepSeek-OCR specific parameters
+        prompt = kwargs.get("prompt", "<image>\nFree OCR.")
+        model_size = kwargs.get("model_size", "gundam")
+        test_compress = kwargs.get("test_compress", False)
+        save_results = kwargs.get("save_results", False)
+        save_dir = kwargs.get("save_dir", None)
+        eval_mode = kwargs.get("eval_mode", False)
+
+        # Smart detection: Check if this should be a visualization request
+        # Visualization is triggered when:
+        # 1. prompt contains grounding keywords
+        # 2. save_results is True (default behavior for visualization)
+        # 3. Explicit visualization parameters are provided
+        is_visualization_request = (
+            "grounding" in prompt.lower()
+            or "convert" in prompt.lower()
+            or "markdown" in prompt.lower()
+            or save_results
+            or any(
+                key in kwargs
+                for key in ["save_results", "output_format", "annotations", "visualize"]
+            )
+        )
+
+        if is_visualization_request:
+            logger.info("Detected visualization request, delegating to visualize_ocr")
+            # Delegate to visualize_ocr for visualization functionality
+            # Pass all parameters through kwargs to avoid duplication
+            return self.visualize_ocr(image=image, **kwargs)
+
+        if self._model is None or self._tokenizer is None:
+            raise RuntimeError("Model not loaded. Please call load() first.")
+
+        # Validate parameters
+        if save_results and not save_dir:
+            raise ValueError("save_dir must be provided when save_results=True")
+
+        # Handle single image input
+        if isinstance(image, PIL.Image.Image):
+            return self._ocr_single(
+                image,
+                prompt,
+                model_size,
+                test_compress,
+                save_results,
+                save_dir,
+                eval_mode,
+                **kwargs,
+            )
+        # Handle batch image input
+        elif isinstance(image, list):
+            return [
+                self._ocr_single(
+                    img,
+                    prompt,
+                    model_size,
+                    test_compress,
+                    save_results,
+                    save_dir,
+                    eval_mode,
+                    **kwargs,
+                )
+                for img in image
+            ]
+        else:
+            raise ValueError("Input must be a PIL Image or list of PIL Images")
+
+    def visualize_ocr(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        prompt: str = "<image>\n<|grounding|>Convert the document to markdown.",
+        model_size: str = "gundam",
+        save_results: bool = True,
+        save_dir: Optional[str] = None,
+        eval_mode: bool = False,
+        **kwargs,
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Perform OCR with visualization (bounding boxes and annotations).
+
+        Args:
+            image: PIL Image or list of PIL Images
+            prompt: OCR prompt with grounding, defaults to document conversion
+            model_size: Model size configuration
+            save_results: Whether to save results with annotations
+            save_dir: Directory to save results
+            eval_mode: Whether to use evaluation mode
+            **kwargs: Additional parameters
+
+        Returns:
+            OCR results with visualization information
+        """
+        if self._model is None or self._tokenizer is None:
+            raise RuntimeError("Model not loaded. Please call load() first.")
+
+        # Handle single image input
+        if isinstance(image, PIL.Image.Image):
+            result = self._visualize_single(
+                image, prompt, model_size, save_results, save_dir, eval_mode, **kwargs
+            )
+
+            # Apply LaTeX post-processing using unified function
+            try:
+                from ...ui.gradio.utils.latex import process_ocr_result_with_latex
+
+                result = process_ocr_result_with_latex(
+                    result, output_format="markdown", debug_info=True
+                )
+            except ImportError:
+                # Fallback: no LaTeX processing if import fails
+                pass
+
+            return result
+        # Handle batch image input
+        elif isinstance(image, list):
+            results = []
+            for img in image:
+                result = self._visualize_single(
+                    img, prompt, model_size, save_results, save_dir, eval_mode, **kwargs
+                )
+
+                # Apply LaTeX post-processing using unified function
+                try:
+                    from ...ui.gradio.utils.latex import process_ocr_result_with_latex
+
+                    result = process_ocr_result_with_latex(
+                        result, output_format="markdown", debug_info=False
+                    )
+                except ImportError:
+                    # Fallback: no LaTeX processing if import fails
+                    pass
+
+                results.append(result)
+            return results
+        else:
+            raise ValueError("Input must be a PIL Image or list of PIL Images")
+
+    def _visualize_single(
+        self,
+        image: PIL.Image.Image,
+        prompt: str,
+        model_size: str,
+        save_results: bool,
+        save_dir: Optional[str],
+        eval_mode: bool,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Perform OCR with visualization for a single image."""
+        # Convert image to RGB if needed
+        if image.mode in ["RGBA", "CMYK"]:
+            image = image.convert("RGB")
+
+        # Get model configuration
+        model_config = DeepSeekOCRModelSize.from_string(model_size)
+
+        # Create save directory if needed
+        if save_results and save_dir:
+            os.makedirs(save_dir, exist_ok=True)
+            os.makedirs(f"{save_dir}/images", exist_ok=True)
+
+        if self._model is None:
+            raise RuntimeError("Model is not loaded. Call load() method first.")
+
+        try:
+            # Save image to temporary file
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+                image.save(temp_file.name, "JPEG")
+                temp_image_path = temp_file.name
+
+            # Create output directory
+            output_path = tempfile.mkdtemp() if not save_dir else save_dir
+
+            try:
+                # Use DeepSeek-OCR's infer method with save_results enabled
+                result = self._model.infer(
+                    tokenizer=self._tokenizer,
+                    prompt=prompt,
+                    image_file=temp_image_path,
+                    output_path=output_path,
+                    base_size=model_config.base_size,
+                    image_size=model_config.image_size,
+                    crop_mode=model_config.crop_mode,
+                    save_results=save_results,
+                    eval_mode=eval_mode,
+                )
+
+                # Process visualization if save_results is enabled
+                visualization_info = {}
+                if save_results and save_dir and isinstance(result, str):
+                    try:
+                        # Extract references from result
+                        matches_ref, matches_images, matches_other = re_match(result)
+
+                        # Process image with references
+                        if matches_ref:
+                            result_image = process_image_with_refs(
+                                image.copy(), matches_ref, save_dir
+                            )
+                            result_image.save(f"{save_dir}/result_with_boxes.jpg")
+
+                            # Process image references in text
+                            processed_text = result
+                            for idx, match_image in enumerate(matches_images):
+                                processed_text = processed_text.replace(
+                                    match_image, f"![](images/{idx}.jpg)\n"
+                                )
+
+                            # Remove other reference markers
+                            for idx, match_other in enumerate(matches_other):
+                                processed_text = processed_text.replace(match_other, "")
+
+                            # Save processed text as markdown
+                            with open(
+                                f"{save_dir}/result.mmd", "w", encoding="utf-8"
+                            ) as f:
+                                f.write(processed_text)
+
+                            visualization_info = {
+                                "has_annotations": True,
+                                "num_bounding_boxes": len(matches_ref),
+                                "num_extracted_images": len(matches_images),
+                                "annotated_image_path": f"{save_dir}/result_with_boxes.jpg",
+                                "markdown_path": f"{save_dir}/result.mmd",
+                                "extracted_images_dir": f"{save_dir}/images/",
+                            }
+                        else:
+                            visualization_info = {
+                                "has_annotations": False,
+                                "message": "No annotations found in OCR result",
+                            }
+                    except Exception as e:
+                        logger.error(f"Error processing visualization: {e}")
+                        visualization_info = {"error": str(e)}
+
+                # Prepare response
+                response = {
+                    "text": result,
+                    "model": "deepseek-ocr",
+                    "success": True,
+                    "model_size": model_size,
+                    "base_size": model_config.base_size,
+                    "image_size": model_config.image_size,
+                    "crop_mode": model_config.crop_mode,
+                    "visualization": visualization_info,
+                }
+
+                # Add file info if saved
+                if save_results and save_dir:
+                    response["saved_files"] = {
+                        "output_dir": save_dir,
+                        "result_file": (
+                            f"{save_dir}/result.mmd"
+                            if os.path.exists(f"{save_dir}/result.mmd")
+                            else None
+                        ),
+                        "annotated_image": (
+                            f"{save_dir}/result_with_boxes.jpg"
+                            if os.path.exists(f"{save_dir}/result_with_boxes.jpg")
+                            else None
+                        ),
+                    }
+
+                return response
+
+            finally:
+                # Clean up temporary file
+                os.unlink(temp_image_path)
+
+        except Exception as e:
+            logger.error(f"OCR visualization failed: {e}")
+            return {
+                "text": "",
+                "model": "deepseek-ocr",
+                "success": False,
+                "error": str(e),
+                "model_size": model_size,
+                "visualization": {"error": str(e)},
+            }
+
+    def _ocr_single(
+        self,
+        image: PIL.Image.Image,
+        prompt: str,
+        model_size: str = "gundam",
+        test_compress: bool = False,
+        save_results: bool = False,
+        save_dir: Optional[str] = None,
+        eval_mode: bool = False,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Perform OCR on a single image with all enhanced features."""
+        # Convert image to RGB if needed
+        if image.mode in ["RGBA", "CMYK"]:
+            image = image.convert("RGB")
+
+        if self._model is None or self._tokenizer is None:
+            raise RuntimeError("Model not loaded. Please call load() first.")
+
+        # Get model configuration
+        model_config = DeepSeekOCRModelSize.from_string(model_size)
+
+        # Create save directory if needed
+        if save_results and save_dir:
+            os.makedirs(save_dir, exist_ok=True)
+            os.makedirs(f"{save_dir}/images", exist_ok=True)
+
+        try:
+            # Save image to temporary file for DeepSeek-OCR's infer method
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+                image.save(temp_file.name, "JPEG")
+                temp_image_path = temp_file.name
+
+            # Create output directory
+            output_path = tempfile.mkdtemp() if not save_dir else save_dir
+
+            try:
+                # Use DeepSeek-OCR's infer method with all parameters
+                result = self._model.infer(
+                    tokenizer=self._tokenizer,
+                    prompt=prompt,
+                    image_file=temp_image_path,
+                    output_path=output_path,
+                    base_size=model_config.base_size,
+                    image_size=model_config.image_size,
+                    crop_mode=model_config.crop_mode,
+                    test_compress=test_compress,
+                    save_results=save_results,
+                    eval_mode=eval_mode,
+                )
+
+                # Apply LaTeX post-processing using unified function
+                try:
+                    from ...ui.gradio.utils.latex import process_ocr_result_with_latex
+
+                    # Process the result and extract LaTeX info
+                    processed_result = process_ocr_result_with_latex(
+                        result, output_format="markdown", debug_info=True
+                    )
+
+                    # Extract text and LaTeX info
+                    if isinstance(processed_result, dict):
+                        latex_info = processed_result.get("latex_processing")
+                        processed_result = processed_result.get("text", result)
+                    else:
+                        processed_result = (
+                            processed_result if processed_result else result
+                        )
+                        latex_info = None
+
+                except ImportError:
+                    processed_result = result
+                    latex_info = None
+
+                # Prepare response
+                response = {
+                    "text": processed_result,
+                    "model": "deepseek-ocr",
+                    "success": True,
+                    "model_size": model_size,
+                    "base_size": model_config.base_size,
+                    "image_size": model_config.image_size,
+                    "crop_mode": model_config.crop_mode,
+                }
+
+                # Include LaTeX processing info in response
+                if latex_info:
+                    response["latex_processing"] = latex_info
+
+                # Add compression info if tested
+                if test_compress:
+                    # Calculate compression ratio (simplified version)
+                    if hasattr(self._model, "_last_compression_info"):
+                        response.update(self._model._last_compression_info)
+
+                # Add file info if saved
+                if save_results and save_dir:
+                    response["saved_files"] = {
+                        "output_dir": save_dir,
+                        "result_file": (
+                            f"{save_dir}/result.mmd"
+                            if os.path.exists(f"{save_dir}/result.mmd")
+                            else None
+                        ),
+                        "annotated_image": (
+                            f"{save_dir}/result_with_boxes.jpg"
+                            if os.path.exists(f"{save_dir}/result_with_boxes.jpg")
+                            else None
+                        ),
+                    }
+
+                return response
+
+            finally:
+                # Clean up temporary file
+                os.unlink(temp_image_path)
+
+        except Exception as e:
+            logger.error(f"OCR processing failed: {e}")
+            return {
+                "text": "",
+                "model": "deepseek-ocr",
+                "success": False,
+                "error": str(e),
+                "model_size": model_size,
+            }
+
+    def infer(
+        self,
+        image_paths: Union[str, List[str]],
+        prompt: str = "<image>\nFree OCR.",
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Inference method for compatibility with Xinference interface.
+
+        Args:
+            image_paths: Single path or list of paths to images
+            prompt: OCR prompt
+            **kwargs: Additional parameters
+
+        Returns:
+            Dictionary containing OCR results
+        """
+        from PIL import Image
+
+        # Convert string input to list
+        if isinstance(image_paths, str):
+            image_paths = [image_paths]
+
+        # Load images
+        images = []
+        for path in image_paths:
+            try:
+                img = Image.open(path)
+                images.append(img)
+            except Exception as e:
+                logger.error(f"Failed to load image {path}: {e}")
+                images.append(None)
+
+        # Process images
+        results = []
+        for i, img in enumerate(images):
+            if img is None:
+                results.append(
+                    {
+                        "image": image_paths[i],
+                        "text": "",
+                        "success": False,
+                        "error": "Failed to load image",
+                    }
+                )
+            else:
+                text_result = self._ocr_single(img, prompt, **kwargs)
+                results.append(
+                    {"image": image_paths[i], "text": text_result, "success": True}
+                )
+
+        return {"results": results}
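For orientation, below is a minimal usage sketch (not part of the diff) of the two pure text helpers this file introduces, clean_ocr_annotations and extract_text_blocks. The sample annotation string and its coordinates are invented for illustration; the <|ref|>/<|det|> format follows the regular expressions defined in the module, and the import assumes xinference 1.12.0 is installed with its torch/torchvision dependencies.

# Usage sketch only: exercises the annotation-parsing helpers on a made-up string.
from xinference.model.image.ocr.deepseek_ocr import (
    clean_ocr_annotations,
    extract_text_blocks,
)

# Hypothetical raw OCR output in the <|ref|>label<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|> format.
sample = (
    "<|ref|>title<|/ref|><|det|>[[80, 40, 920, 120]]<|/det|>Quarterly Report\n"
    "<|ref|>text<|/ref|><|det|>[[60, 160, 940, 400]]<|/det|>Revenue grew 12% year over year."
)

# Strips the annotation blocks and keeps only the recognized text.
print(clean_ocr_annotations(sample))

# Returns one dict per block with the label, the 0-999 relative coordinates, and the text.
for block in extract_text_blocks(sample):
    print(block["label_type"], block["bbox"], block["text"])

Full OCR on an image goes through DeepSeekOCRModel.load() and ocr()/visualize_ocr(), which require the downloaded DeepSeek-OCR weights and a populated model spec, so they are not shown here.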