xinference 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (43)
  1. xinference/__init__.py +8 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/oauth2/utils.py +26 -5
  4. xinference/core/model.py +1 -10
  5. xinference/device_utils.py +11 -1
  6. xinference/model/embedding/model_spec.json +70 -0
  7. xinference/model/image/core.py +20 -10
  8. xinference/model/image/model_spec.json +55 -3
  9. xinference/model/image/ocr/__init__.py +5 -0
  10. xinference/model/image/ocr/deepseek_ocr.py +958 -0
  11. xinference/model/llm/core.py +2 -0
  12. xinference/model/llm/llama_cpp/core.py +2 -0
  13. xinference/model/llm/llm_family.json +319 -6
  14. xinference/model/llm/lmdeploy/core.py +2 -0
  15. xinference/model/llm/sglang/core.py +2 -0
  16. xinference/model/llm/transformers/core.py +22 -36
  17. xinference/model/llm/transformers/multimodal/qwen-omni.py +60 -11
  18. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  19. xinference/model/llm/transformers/utils.py +0 -20
  20. xinference/model/llm/vllm/core.py +2 -0
  21. xinference/model/rerank/model_spec.json +368 -252
  22. xinference/model/rerank/sentence_transformers/core.py +10 -2
  23. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +71 -5
  24. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +51 -1
  25. xinference/ui/gradio/media_interface.py +469 -4
  26. xinference/ui/gradio/utils/__init__.py +19 -0
  27. xinference/ui/gradio/utils/latex.py +342 -0
  28. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  29. xinference/ui/web/ui/build/index.html +1 -1
  30. xinference/ui/web/ui/build/static/js/{main.45e78536.js → main.87d6859b.js} +3 -3
  31. xinference/ui/web/ui/build/static/js/main.87d6859b.js.map +1 -0
  32. xinference/ui/web/ui/node_modules/.cache/babel-loader/412a6b414a8267c7a349d9beda4593cdf218abf32edaaf339e6a230df40397b8.json +1 -0
  33. xinference/ui/web/ui/node_modules/.cache/babel-loader/e6770a05771952175c9fbf48fce283c9bb1bc8b5763e39edc36d099d1fe16b4a.json +1 -0
  34. {xinference-1.11.0.dist-info → xinference-1.12.0.dist-info}/METADATA +11 -11
  35. {xinference-1.11.0.dist-info → xinference-1.12.0.dist-info}/RECORD +40 -37
  36. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +0 -1
  37. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +0 -1
  38. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +0 -1
  39. /xinference/ui/web/ui/build/static/js/{main.45e78536.js.LICENSE.txt → main.87d6859b.js.LICENSE.txt} +0 -0
  40. {xinference-1.11.0.dist-info → xinference-1.12.0.dist-info}/WHEEL +0 -0
  41. {xinference-1.11.0.dist-info → xinference-1.12.0.dist-info}/entry_points.txt +0 -0
  42. {xinference-1.11.0.dist-info → xinference-1.12.0.dist-info}/licenses/LICENSE +0 -0
  43. {xinference-1.11.0.dist-info → xinference-1.12.0.dist-info}/top_level.txt +0 -0
@@ -63,9 +63,7 @@ class MediaInterface:
63
63
  )
64
64
 
65
65
  def build(self) -> gr.Blocks:
66
- if self.model_type == "image":
67
- assert "stable_diffusion" in self.model_family
68
-
66
+ # Remove the stable_diffusion restriction to support OCR models
69
67
  interface = self.build_main_interface()
70
68
  interface.queue()
71
69
  # Gradio initiates the queue during a startup event, but since the app has already been
@@ -1233,9 +1231,392 @@ class MediaInterface:
1233
1231
 
1234
1232
  return tts_ui
1235
1233
 
1234
def ocr_interface(self) -> "gr.Blocks":
    """Build the Gradio UI used to run OCR against the current model.

    The layout has an image upload plus option widgets (model size, output
    format, visualization, compression test, save results) on the left and a
    Markdown result panel with two initially hidden info textboxes on the
    right. Clicking "Extract Text" sends the image to the model identified by
    ``self.model_uid`` through a ``RESTfulClient`` bound to ``self.endpoint``.

    Returns:
        The ``gr.Blocks`` container holding the OCR UI.
    """
    # Maximum number of characters of a model response echoed into the log.
    log_preview_chars = 200

    def build_prompt(ocr_type: str) -> str:
        """Return the prompt sent to the model for the selected output format."""
        if ocr_type == "markdown":
            return (
                "<image>\nConvert this document to clean markdown format. "
                "Extract the text content and format it properly using markdown syntax. "
                "Do not include any coordinate annotations or special formatting markers."
            )
        if ocr_type == "format":
            return (
                "<image>\n<|grounding|>Convert the document to markdown with structure annotations. "
                "Include coordinate information for text regions and maintain the document structure."
            )
        # Any other value is treated as plain "ocr".
        return "<image>\nFree OCR. Extract all text content from the image."

    def log_response(label: str, response) -> None:
        """Log the type and a truncated preview of a model response."""
        logger.info(f"{label} response type: {type(response)}")
        logger.info(
            f"{label} response content: {str(response)[:log_preview_chars]}..."
        )

    def unpack_response(response, default_error: str):
        """Split a model response into ``(text, error_markdown)``.

        ``error_markdown`` is ``None`` unless the response is a dict whose
        ``"success"`` entry is falsy.
        """
        if isinstance(response, dict):
            if response.get("success"):
                return response.get("text", "No text extracted"), None
            return None, f"**Error**: {response.get('error', default_error)}"
        # Plain-string responses pass through; anything else is stringified.
        return str(response), None

    def apply_latex(text: str, log_suffix: str) -> str:
        """Rewrite LaTeX formulas for Markdown rendering (best effort)."""
        try:
            from .utils.latex import process_ocr_latex

            if "\\" in text and ("\\[" in text or "\\(" in text or "$" in text):
                text = process_ocr_latex(text, output_format="markdown")
                logger.info(
                    f"Applied LaTeX processing for Markdown rendering{log_suffix}"
                )
        except ImportError:
            # Deliberate best effort: fall back to the unprocessed text.
            logger.warning("LaTeX processing utils not available, using raw text")
        return text

    def format_text(text, ocr_type: str, fence_open: str, log_suffix: str):
        """Post-process extracted text according to the selected output format."""
        if not isinstance(text, str):
            return text
        if ocr_type == "markdown":
            return apply_latex(text, log_suffix)
        if ocr_type == "format" and "<|ref|>" in text:
            # Keep the grounding annotations readable by fencing them as code.
            return f"{fence_open}\n{text}\n```"
        # Plain "ocr" output is shown unchanged.
        return text

    def compression_summary(response, test_compress: bool) -> str:
        """Return the compression report to append, or ``""`` when not applicable."""
        if not (
            isinstance(response, dict)
            and test_compress
            and "compression_ratio" in response
        ):
            return ""
        return (
            "\n\n--- Compression Ratio Information ---\n"
            f"Compression Ratio: {response.get('compression_ratio', 'N/A')}\n"
            f"Valid Image Tokens: {response.get('valid_image_tokens', 'N/A')}\n"
            f"Output Text Tokens: {response.get('output_text_tokens', 'N/A')}\n"
        )

    def empty_result_message(image, response: dict) -> str:
        """Build the Markdown shown when the model succeeded but found no text."""
        return "\n".join(
            [
                "**OCR Recognition Complete, No Text Detected**",
                "",
                "**Possible Reasons:**",
                "- Text in image is unclear or insufficient resolution",
                "- Image format not supported",
                "- Model unable to recognize text in image",
                "",
                "**Suggestions:**",
                "- Try uploading a clearer image",
                "- Ensure text in image is clear and legible",
                "- Handwritten text may have poor results",
                "",
                "**Technical Information:**",
                "- Model Status: Normal",
                f"- Image Size: Original {image.size}, "
                f"Processed {response.get('image_size', 'Unknown')}",
                f"- Processing Mode: {response.get('model_size', 'Unknown')}",
            ]
        )

    def extract_text_from_image(
        image: "PIL.Image.Image",
        ocr_type: str = "ocr",
        model_size: str = "gundam",
        test_compress: bool = False,
        enable_visualization: bool = False,
        save_results: bool = False,
        progress=gr.Progress(),
    ) -> Tuple[str, str, str]:
        """Run OCR on ``image`` and return the three UI outputs.

        Returns:
            ``(markdown_text, visualization_info, saved_files_info)``. The last
            two are empty strings unless the visualization path was taken.

        Raises:
            RuntimeError: if the model handle has no ``ocr`` method.
        """
        import io

        from ...client import RESTfulClient

        # The button can be clicked before anything is uploaded; report that
        # instead of failing on ``image.mode`` below.
        if image is None:
            return "**Error**: Please upload an image before running OCR.", "", ""

        client = RESTfulClient(self.endpoint)
        client._set_token(self.access_token)
        model = client.get_model(self.model_uid)
        # Explicit check rather than ``assert`` so it survives ``python -O``.
        if not hasattr(model, "ocr"):
            raise RuntimeError(f"Model {self.model_uid} does not support OCR")

        # Serialize the PIL image to PNG bytes for the request.
        if image.mode in ("RGBA", "CMYK"):
            image = image.convert("RGB")
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        image_bytes = buffered.getvalue()

        progress(0.1, desc="Processing image for OCR")
        prompt = build_prompt(ocr_type)

        try:
            logger.info(
                f"Starting OCR processing - Type: {ocr_type}, Model Size: {model_size}"
            )
            logger.info(f"Image info: {image.size}, Mode: {image.mode}")

            if enable_visualization and hasattr(model, "visualize_ocr"):
                logger.info("Using visualization method")
                response = model.visualize_ocr(
                    image=image_bytes,
                    prompt=prompt,
                    model_size=model_size,
                    save_results=save_results,
                    eval_mode=True,
                )
                progress(0.8, desc="Processing visualization")
                log_response("Visualization", response)

                text_result, error_md = unpack_response(
                    response, "OCR visualization failed"
                )
                if error_md is not None:
                    return error_md, "", ""

                text_result = format_text(
                    text_result, ocr_type, "```", " (visualization)"
                )
                summary = compression_summary(response, test_compress)
                if summary:
                    text_result += summary

                viz_info = {}
                saved_files = {}
                if isinstance(response, dict):
                    # ``or {}`` guards against an explicit ``None`` value.
                    viz_info = response.get("visualization") or {}
                    if viz_info.get("has_annotations"):
                        text_result += (
                            "\n\n--- Visualization Information ---\n"
                            f"Number of Bounding Boxes: {viz_info.get('num_bounding_boxes', 0)}\n"
                            f"Number of Extracted Images: {viz_info.get('num_extracted_images', 0)}\n"
                        )
                    saved_files = response.get("saved_files", {})

                return text_result, str(viz_info), str(saved_files)

            logger.info("Using standard OCR branch (not visualization)")
            response = model.ocr(
                image=image_bytes,
                prompt=prompt,
                model_size=model_size,
                test_compress=test_compress,
                save_results=save_results,
                eval_mode=True,
            )
            progress(0.8, desc="Extracting text")
            log_response("Standard OCR", response)

            text_result, error_md = unpack_response(response, "OCR failed")
            if error_md is not None:
                return error_md, "", ""

            is_blank = not text_result or (
                isinstance(text_result, str) and not text_result.strip()
            )
            if isinstance(response, dict) and is_blank:
                logger.warning("OCR returned empty text")
                logger.warning(f"Full response: {response}")
                # Show guidance instead of an empty result panel.
                text_result = empty_result_message(image, response)

            text_result = format_text(text_result, ocr_type, "```text", "")
            summary = compression_summary(response, test_compress)
            if summary:
                text_result += summary

            return text_result, "", ""

        except Exception as e:
            # UI boundary: log with traceback and render the failure as
            # Markdown in the result panel.
            logger.exception(f"OCR processing error: {e}")
            error_msg = "\n".join(
                [
                    "**OCR Processing Error**",
                    "",
                    "```",
                    str(e),
                    "```",
                    "",
                    "**Debug Info:**",
                    f"- OCR Type: {ocr_type}",
                    f"- Model Size: {model_size}",
                    f"- Image Mode: {image.mode}",
                    f"- Image Size: {image.size}",
                    "",
                ]
            )
            return error_msg, "", ""

        finally:
            progress(1.0, desc="OCR complete")

    with gr.Blocks() as ocr_ui:
        gr.Markdown(f"### Enhanced OCR Text Extraction with {self.model_name}")

        with gr.Row():
            with gr.Column(scale=1):
                image_input = gr.Image(
                    type="pil",
                    label="Upload Image for OCR",
                    interactive=True,
                    height=400,
                )

                gr.Markdown(f"**Current OCR Model:** {self.model_name}")

                # Model configuration options
                model_size = gr.Dropdown(
                    choices=["tiny", "small", "base", "large", "gundam"],
                    value="gundam",
                    label="Model Size",
                    info="Choose model size configuration",
                )

                ocr_type = gr.Dropdown(
                    choices=["ocr", "format", "markdown"],
                    value="ocr",
                    label="Output Format",
                    info="ocr: Plain text extraction, format: Structured document (with annotations), markdown: Standard Markdown format",
                )

                enable_visualization = gr.Checkbox(
                    label="Enable Visualization",
                    value=False,
                    info="Generate bounding boxes and annotations (only applicable to format mode)",
                )

                test_compress = gr.Checkbox(
                    label="Test Compression Ratio",
                    value=False,
                    info="Analyze image compression performance",
                )

                save_results = gr.Checkbox(
                    label="Save Results",
                    value=False,
                    info="Save OCR results to files (if supported)",
                )

                extract_btn = gr.Button("Extract Text", variant="primary")

            with gr.Column(scale=1):
                # Bordered container for the output (styled via "output-container").
                with gr.Group(elem_classes="output-container"):
                    gr.Markdown("### 📄 Extraction Results")

                    text_output = gr.Markdown(
                        value="Extracted text will be displayed here...",
                        elem_classes="output-text",
                        container=False,
                    )

                # Additional info outputs (hidden until visualization is enabled)
                viz_info_output = gr.Textbox(
                    label="Visualization Info",
                    lines=5,
                    visible=False,
                    interactive=False,
                )

                file_info_output = gr.Textbox(
                    label="File Info",
                    lines=3,
                    visible=False,
                    interactive=False,
                )

        def toggle_additional_outputs(enable_viz):
            """Show or hide both info textboxes with the visualization checkbox."""
            return {
                viz_info_output: gr.update(visible=enable_viz),
                file_info_output: gr.update(visible=enable_viz),
            }

        enable_visualization.change(
            fn=toggle_additional_outputs,
            inputs=[enable_visualization],
            outputs=[viz_info_output, file_info_output],
        )

        # Examples section
        gr.Markdown("### Examples")
        gr.Examples(
            examples=[
                # You can add example image paths here if needed
            ],
            inputs=[image_input],
            label="Example Images",
        )

        # Extract button click event
        extract_btn.click(
            fn=extract_text_from_image,
            inputs=[
                image_input,
                ocr_type,
                model_size,
                test_compress,
                enable_visualization,
                save_results,
            ],
            outputs=[text_output, viz_info_output, file_info_output],
        )

    return ocr_ui
1613
+
1236
1614
  def build_main_interface(self) -> "gr.Blocks":
1237
1615
  if self.model_type == "image":
1238
- title = f"🎨 Xinference Stable Diffusion: {self.model_name} 🎨"
1616
+ if "ocr" in self.model_ability:
1617
+ title = f"🔍 Xinference OCR: {self.model_name} 🔍"
1618
+ else:
1619
+ title = f"🎨 Xinference Stable Diffusion: {self.model_name} 🎨"
1239
1620
  elif self.model_type == "video":
1240
1621
  title = f"🎨 Xinference Video Generation: {self.model_name} 🎨"
1241
1622
  else:
@@ -1251,6 +1632,87 @@ class MediaInterface:
1251
1632
  padding: 0px;
1252
1633
  color: #9ea4b0 !important;
1253
1634
  }
1635
+
1636
+ .output-container {
1637
+ border: 1px solid #e0e0e0;
1638
+ border-radius: 8px;
1639
+ padding: 16px;
1640
+ background-color: #f8f9fa;
1641
+ margin: 8px 0;
1642
+ }
1643
+
1644
+ .output-text {
1645
+ background-color: white;
1646
+ border: 1px solid #dee2e6;
1647
+ border-radius: 6px;
1648
+ padding: 16px;
1649
+ min-height: 200px;
1650
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
1651
+ line-height: 1.6;
1652
+ }
1653
+
1654
+ .output-text h1, .output-text h2, .output-text h3,
1655
+ .output-text h4, .output-text h5, .output-text h6 {
1656
+ margin-top: 0.5em !important;
1657
+ margin-bottom: 0.5em !important;
1658
+ color: #2d3748 !important;
1659
+ }
1660
+
1661
+ .output-text p {
1662
+ margin: 0.5em 0 !important;
1663
+ }
1664
+
1665
+ .output-text pre {
1666
+ background-color: #f6f8fa !important;
1667
+ border: 1px solid #e9ecef !important;
1668
+ border-radius: 4px !important;
1669
+ padding: 12px !important;
1670
+ margin: 8px 0 !important;
1671
+ }
1672
+
1673
+ .output-text code {
1674
+ background-color: #e9ecef !important;
1675
+ padding: 2px 4px !important;
1676
+ border-radius: 3px !important;
1677
+ font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace !important;
1678
+ }
1679
+
1680
+ .output-text ul, .output-text ol {
1681
+ margin: 0.5em 0 !important;
1682
+ padding-left: 20px !important;
1683
+ }
1684
+
1685
+ .output-text blockquote {
1686
+ border-left: 4px solid #6c757d !important;
1687
+ padding-left: 16px !important;
1688
+ margin: 0.5em 0 !important;
1689
+ color: #6c757d !important;
1690
+ background-color: #f8f9fa !important;
1691
+ }
1692
+
1693
+ .output-text table {
1694
+ border-collapse: collapse !important;
1695
+ width: 100% !important;
1696
+ margin: 8px 0 !important;
1697
+ }
1698
+
1699
+ .output-text th, .output-text td {
1700
+ border: 1px solid #dee2e6 !important;
1701
+ padding: 8px 12px !important;
1702
+ text-align: left !important;
1703
+ }
1704
+
1705
+ .output-text th {
1706
+ background-color: #f8f9fa !important;
1707
+ font-weight: bold !important;
1708
+ }
1709
+
1710
+ /* Ensure Markdown displays correctly */
1711
+ .output-text .katex-display {
1712
+ display: block !important;
1713
+ text-align: center !important;
1714
+ margin: 1em 0 !important;
1715
+ }
1254
1716
  """,
1255
1717
  analytics_enabled=False,
1256
1718
  ) as app:
@@ -1266,6 +1728,9 @@ class MediaInterface:
1266
1728
  </div>
1267
1729
  """
1268
1730
  )
1731
+ if "ocr" in self.model_ability:
1732
+ with gr.Tab("OCR"):
1733
+ self.ocr_interface()
1269
1734
  if "text2image" in self.model_ability:
1270
1735
  with gr.Tab("Text to Image"):
1271
1736
  self.text2image_interface()
@@ -0,0 +1,19 @@
1
+ # Copyright 2022-2025 XProbe Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utilities for Gradio UI components."""
16
+
17
+ from .latex import clean_latex_syntax, process_latex_formulas, process_ocr_latex
18
+
19
+ __all__ = ["process_latex_formulas", "clean_latex_syntax", "process_ocr_latex"]