xinference 0.14.1__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +44 -9
  5. xinference/core/model.py +4 -4
  6. xinference/core/scheduler.py +1 -2
  7. xinference/core/worker.py +1 -1
  8. xinference/deploy/cmdline.py +2 -2
  9. xinference/deploy/test/test_cmdline.py +7 -7
  10. xinference/model/llm/__init__.py +20 -27
  11. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  12. xinference/model/llm/llm_family.json +448 -1153
  13. xinference/model/llm/llm_family.py +14 -139
  14. xinference/model/llm/llm_family_modelscope.json +230 -313
  15. xinference/model/llm/memory.py +9 -9
  16. xinference/model/llm/sglang/core.py +2 -2
  17. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  18. xinference/model/llm/{pytorch → transformers}/core.py +2 -10
  19. xinference/model/llm/transformers/intern_vl.py +457 -0
  20. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  21. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
  22. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  23. xinference/model/llm/utils.py +76 -70
  24. xinference/model/llm/vllm/core.py +110 -11
  25. xinference/model/utils.py +1 -95
  26. xinference/thirdparty/internvl/__init__.py +0 -0
  27. xinference/thirdparty/internvl/conversation.py +393 -0
  28. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  29. xinference/web/ui/build/asset-manifest.json +3 -3
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
  32. xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  45. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/METADATA +12 -15
  46. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
  47. xinference/locale/utils.py +0 -39
  48. xinference/locale/zh_CN.json +0 -26
  49. xinference/model/llm/ggml/tools/__init__.py +0 -15
  50. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  51. xinference/model/llm/ggml/tools/gguf.py +0 -884
  52. xinference/model/llm/pytorch/__init__.py +0 -13
  53. xinference/model/llm/pytorch/baichuan.py +0 -81
  54. xinference/model/llm/pytorch/falcon.py +0 -138
  55. xinference/model/llm/pytorch/intern_vl.py +0 -352
  56. xinference/model/llm/pytorch/vicuna.py +0 -69
  57. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  58. xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  71. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  72. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  73. /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
  74. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  75. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  76. /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
  77. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  78. /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
  79. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  80. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  81. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  82. /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
  83. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
  84. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
  85. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
  86. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
  87. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-08-09T18:03:26+0800",
+ "date": "2024-08-16T18:10:38+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "3e7ed865c3b3de601c92edbd9744f2bff9054051",
- "version": "0.14.1"
+ "full-revisionid": "e4d225774dc7a9a9260396bf833e03a1df8e8a92",
+ "version": "0.14.2"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -1682,18 +1682,9 @@ class RESTfulAPI:
 
         model_family = desc.get("model_family", "")
         function_call_models = (
-            ["chatglm3", "gorilla-openfunctions-v1"]
-            + QWEN_TOOL_CALL_FAMILY
-            + GLM4_TOOL_CALL_FAMILY
+            ["gorilla-openfunctions-v1"] + QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY
         )
 
-        is_qwen = desc.get("model_format") == "ggmlv3" and "qwen-chat" == model_family
-
-        if is_qwen and system_prompt is not None:
-            raise HTTPException(
-                status_code=400, detail="Qwen ggml does not have system prompt"
-            )
-
         if model_family not in function_call_models:
             if body.tools:
                 raise HTTPException(
@@ -1724,18 +1715,13 @@
                 iterator = None
                 try:
                     try:
-                        if is_qwen:
-                            iterator = await model.chat(
-                                prompt, chat_history, kwargs, raw_params=raw_kwargs
-                            )
-                        else:
-                            iterator = await model.chat(
-                                prompt,
-                                system_prompt,
-                                chat_history,
-                                kwargs,
-                                raw_params=raw_kwargs,
-                            )
+                        iterator = await model.chat(
+                            prompt,
+                            system_prompt,
+                            chat_history,
+                            kwargs,
+                            raw_params=raw_kwargs,
+                        )
                     except RuntimeError as re:
                         await self._report_error_event(model_uid, str(re))
                         self.handle_request_limit_error(re)
@@ -1763,18 +1749,13 @@
             return EventSourceResponse(stream_results())
         else:
             try:
-                if is_qwen:
-                    data = await model.chat(
-                        prompt, chat_history, kwargs, raw_params=raw_kwargs
-                    )
-                else:
-                    data = await model.chat(
-                        prompt,
-                        system_prompt,
-                        chat_history,
-                        kwargs,
-                        raw_params=raw_kwargs,
-                    )
+                data = await model.chat(
+                    prompt,
+                    system_prompt,
+                    chat_history,
+                    kwargs,
+                    raw_params=raw_kwargs,
+                )
                 return Response(content=data, media_type="application/json")
             except Exception as e:
                 logger.error(e, exc_info=True)
xinference/client/restful/restful_client.py CHANGED
@@ -426,7 +426,7 @@ class RESTfulGenerateModelHandle(RESTfulModelHandle):
             The user's message or user's input.
         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
             Additional configuration for the chat generation.
-            "LlamaCppGenerateConfig" -> Configuration for ggml model
+            "LlamaCppGenerateConfig" -> Configuration for llama-cpp-python model
             "PytorchGenerateConfig" -> Configuration for pytorch model
 
         Returns
@@ -493,7 +493,7 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
             A tool list.
         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
             Additional configuration for the chat generation.
-            "LlamaCppGenerateConfig" -> configuration for ggml model
+            "LlamaCppGenerateConfig" -> configuration for llama-cpp-python model
             "PytorchGenerateConfig" -> configuration for pytorch model
 
         Returns
xinference/core/chat_interface.py CHANGED
@@ -236,8 +236,8 @@ class GradioInterface:
             bot[-1][1] = history[-1]["content"]
             yield history, bot
 
-        def add_text(history, bot, text, image):
-            logger.debug("Add text, text: %s, image: %s", text, image)
+        def add_text(history, bot, text, image, video):
+            logger.debug("Add text, text: %s, image: %s, video: %s", text, image, video)
             if image:
                 buffered = BytesIO()
                 with PIL.Image.open(image) as img:
@@ -257,16 +257,47 @@
                         },
                     ],
                 }
+            elif video:
+
+                def video_to_base64(video_path):
+                    with open(video_path, "rb") as video_file:
+                        encoded_string = base64.b64encode(video_file.read()).decode(
+                            "utf-8"
+                        )
+                    return encoded_string
+
+                def generate_html_video(video_path):
+                    base64_video = video_to_base64(video_path)
+                    video_format = video_path.split(".")[-1]
+                    html_code = f"""
+                    <video controls>
+                        <source src="data:video/{video_format};base64,{base64_video}" type="video/{video_format}">
+                        Your browser does not support the video tag.
+                    </video>
+                    """
+                    return html_code
+
+                display_content = f"{generate_html_video(video)}\n{text}"
+                message = {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": text},
+                        {
+                            "type": "video_url",
+                            "video_url": {"url": video},
+                        },
+                    ],
+                }
             else:
                 display_content = text
                 message = {"role": "user", "content": text}
             history = history + [message]
             bot = bot + [[display_content, None]]
-            return history, bot, "", None
+            return history, bot, "", None, None
 
         def clear_history():
             logger.debug("Clear history.")
-            return [], None, "", None
+            return [], None, "", None, None
 
         def update_button(text):
             return gr.update(interactive=bool(text))
@@ -313,6 +344,7 @@
                     )
                 with gr.Column(scale=3):
                     imagebox = gr.Image(type="filepath")
+                    videobox = gr.Video()
                     textbox = gr.Textbox(
                         show_label=False,
                         placeholder="Enter text and press ENTER",
@@ -340,8 +372,8 @@
 
             textbox.submit(
                 add_text,
-                [state, chatbot, textbox, imagebox],
-                [state, chatbot, textbox, imagebox],
+                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox],
                 queue=False,
             ).then(
                 predict,
@@ -351,8 +383,8 @@
 
             submit_btn.click(
                 add_text,
-                [state, chatbot, textbox, imagebox],
-                [state, chatbot, textbox, imagebox],
+                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox],
                 queue=False,
             ).then(
                 predict,
@@ -361,7 +393,10 @@
             )
 
             clear_btn.click(
-                clear_history, None, [state, chatbot, textbox, imagebox], queue=False
+                clear_history,
+                None,
+                [state, chatbot, textbox, imagebox, videobox],
+                queue=False,
             )
 
             return chat_vl_interface
xinference/core/model.py CHANGED
@@ -132,8 +132,8 @@ class ModelActor(xo.StatelessActor):
 
     async def __pre_destroy__(self):
         from ..model.embedding.core import EmbeddingModel
-        from ..model.llm.pytorch.core import PytorchModel as LLMPytorchModel
         from ..model.llm.sglang.core import SGLANGModel
+        from ..model.llm.transformers.core import PytorchModel as LLMPytorchModel
         from ..model.llm.vllm.core import VLLMModel as LLMVLLMModel
 
         if self.allow_batching():
@@ -177,8 +177,8 @@ class ModelActor(xo.StatelessActor):
         request_limits: Optional[int] = None,
     ):
         super().__init__()
-        from ..model.llm.pytorch.core import PytorchModel
         from ..model.llm.sglang.core import SGLANGModel
+        from ..model.llm.transformers.core import PytorchModel
         from ..model.llm.vllm.core import VLLMModel
 
         self._worker_address = worker_address
@@ -272,7 +272,7 @@ class ModelActor(xo.StatelessActor):
         return isinstance(self._model, VLLMModel)
 
     def allow_batching(self) -> bool:
-        from ..model.llm.pytorch.core import PytorchModel
+        from ..model.llm.transformers.core import PytorchModel
 
         model_ability = self._model_description.get("model_ability", [])
 
@@ -415,7 +415,7 @@ class ModelActor(xo.StatelessActor):
             ret = await asyncio.to_thread(fn, *args, **kwargs)
 
         if self._lock is not None and self._current_generator():
-            raise Exception("Parallel generation is not supported by ggml.")
+            raise Exception("Parallel generation is not supported by llama-cpp-python.")
 
         if inspect.isgenerator(ret):
             gen = self._to_generator(output_type, ret)
xinference/core/scheduler.py CHANGED
@@ -24,7 +24,6 @@ import xoscar as xo
 
 logger = logging.getLogger(__name__)
 
-XINFERENCE_BATCHING_CLEAN_CACHE_INTERVAL = 5
 XINFERENCE_STREAMING_DONE_FLAG = "<XINFERENCE_STREAMING_DONE>"
 XINFERENCE_STREAMING_ERROR_FLAG = "<XINFERENCE_STREAMING_ERROR>"
 XINFERENCE_STREAMING_ABORT_FLAG = "<XINFERENCE_STREAMING_ABORT>"
@@ -359,7 +358,7 @@ class SchedulerActor(xo.StatelessActor):
 
     @staticmethod
     def _empty_cache():
-        from ..model.llm.pytorch.utils import empty_cache
+        from ..model.llm.transformers.utils import empty_cache
 
         empty_cache()
 
xinference/core/worker.py CHANGED
@@ -830,7 +830,7 @@ class WorkerActor(xo.StatelessActor):
             raise ValueError(
                 f"PEFT adaptors cannot be applied to embedding or rerank models."
             )
-        if model_type == "LLM" and model_format in ("ggufv2", "ggmlv3"):
+        if model_type == "LLM" and model_format in ("ggufv2",):
             raise ValueError(
                 f"PEFT adaptors can only be applied to pytorch-like models"
             )
xinference/deploy/cmdline.py CHANGED
@@ -750,7 +750,7 @@ def remove_cache(
     "-f",
     default=None,
     type=str,
-    help="Specify the format of the model, e.g. pytorch, ggmlv3, etc.",
+    help="Specify the format of the model, e.g. pytorch, ggufv2, etc.",
 )
 @click.option(
     "--quantization",
@@ -1516,7 +1516,7 @@ def query_engine_by_model_name(
     "-f",
     type=str,
     required=True,
-    help="Specify the format of the model, e.g. pytorch, ggmlv3, etc.",
+    help="Specify the format of the model, e.g. pytorch, ggufv2, etc.",
 )
 @click.option(
     "--quantization",
xinference/deploy/test/test_cmdline.py CHANGED
@@ -66,10 +66,10 @@ def test_cmdline(setup, stream, model_uid):
     replica = 1
     original_model_uid = model_uid
     model_uid = client.launch_model(
-        model_name="orca",
+        model_name="qwen1.5-chat",
         model_engine="llama.cpp",
         model_uid=model_uid,
-        model_size_in_billions=3,
+        model_size_in_billions="0_5",
         quantization="q4_0",
         replica=replica,
     )
@@ -249,10 +249,10 @@ def test_rotate_logs(setup_with_file_logging):
     runner = CliRunner()
     replica = 1 if os.name == "nt" else 2
     model_uid = client.launch_model(
-        model_name="orca",
+        model_name="qwen1.5-chat",
         model_engine="llama.cpp",
         model_uid=None,
-        model_size_in_billions=3,
+        model_size_in_billions="0_5",
         quantization="q4_0",
         replica=replica,
     )
@@ -288,7 +288,7 @@ def test_list_cached_models(setup):
 
     result = runner.invoke(
         list_cached_models,
-        ["--endpoint", endpoint, "--model_name", "orca"],
+        ["--endpoint", endpoint, "--model_name", "qwen1.5-chat"],
     )
     assert "model_name" in result.stdout
     assert "model_format" in result.stdout
@@ -305,9 +305,9 @@ def test_remove_cache(setup):
 
     result = runner.invoke(
         remove_cache,
-        ["--endpoint", endpoint, "--model_version", "orca"],
+        ["--endpoint", endpoint, "--model_version", "qwen1.5-chat"],
         input="y\n",
     )
 
     assert result.exit_code == 0
-    assert "Cache directory orca has been deleted."
+    assert "Cache directory qwen1.5-chat has been deleted."
xinference/model/llm/__init__.py CHANGED
@@ -40,7 +40,7 @@ from .llm_family import (
     TRANSFORMERS_CLASSES,
     VLLM_CLASSES,
     CustomLLMFamilyV1,
-    GgmlLLMSpecV1,
+    LlamaCppLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
     MLXLLMSpecV1,
@@ -55,10 +55,10 @@ from .llm_family import (
 
 
 def check_format_with_engine(model_format, engine):
-    # only llama-cpp-python support and only support ggufv2 and ggmlv3
-    if model_format in ["ggufv2", "ggmlv3"] and engine != "llama.cpp":
+    # only llama-cpp-python support and only support ggufv2
+    if model_format in ["ggufv2"] and engine != "llama.cpp":
         return False
-    if model_format not in ["ggufv2", "ggmlv3"] and engine == "llama.cpp":
+    if model_format not in ["ggufv2"] and engine == "llama.cpp":
         return False
     return True
 
@@ -112,28 +112,25 @@ def generate_engine_config_by_model_family(model_family):
 
 
 def _install():
-    from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
     from .mlx.core import MLXChatModel, MLXModel
-    from .pytorch.baichuan import BaichuanPytorchChatModel
-    from .pytorch.chatglm import ChatglmPytorchChatModel
-    from .pytorch.cogvlm2 import CogVLM2Model
-    from .pytorch.core import PytorchChatModel, PytorchModel
-    from .pytorch.deepseek_vl import DeepSeekVLChatModel
-    from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
-    from .pytorch.glm4v import Glm4VModel
-    from .pytorch.intern_vl import InternVLChatModel
-    from .pytorch.internlm2 import Internlm2PytorchChatModel
-    from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
-    from .pytorch.minicpmv25 import MiniCPMV25Model
-    from .pytorch.minicpmv26 import MiniCPMV26Model
-    from .pytorch.qwen_vl import QwenVLChatModel
-    from .pytorch.vicuna import VicunaPytorchChatModel
-    from .pytorch.yi_vl import YiVLChatModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
-    from .vllm.core import VLLMChatModel, VLLMModel
+    from .transformers.chatglm import ChatglmPytorchChatModel
+    from .transformers.cogvlm2 import CogVLM2Model
+    from .transformers.core import PytorchChatModel, PytorchModel
+    from .transformers.deepseek_vl import DeepSeekVLChatModel
+    from .transformers.glm4v import Glm4VModel
+    from .transformers.intern_vl import InternVLChatModel
+    from .transformers.internlm2 import Internlm2PytorchChatModel
+    from .transformers.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
+    from .transformers.minicpmv25 import MiniCPMV25Model
+    from .transformers.minicpmv26 import MiniCPMV26Model
+    from .transformers.qwen_vl import QwenVLChatModel
+    from .transformers.yi_vl import YiVLChatModel
+    from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
     try:
-        from .pytorch.omnilmm import OmniLMMModel
+        from .transformers.omnilmm import OmniLMMModel
     except ImportError as e:
         # For quite old transformers version,
         # import will generate error
@@ -148,18 +145,14 @@ def _install():
         ]
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
-    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
    MLX_CLASSES.extend([MLXModel, MLXChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
-            BaichuanPytorchChatModel,
-            VicunaPytorchChatModel,
-            FalconPytorchChatModel,
             ChatglmPytorchChatModel,
             LlamaPytorchModel,
             LlamaPytorchChatModel,
             PytorchChatModel,
-            FalconPytorchModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
             YiVLChatModel,
xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} CHANGED
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import datetime
 import logging
 import os
 import time
@@ -104,35 +103,6 @@ class LlamaCppModel(LLM):
         generate_config.pop("lora_name", None)  # type: ignore
         return generate_config
 
-    def _convert_ggml_to_gguf(self, model_path: str) -> str:
-        from .tools import convert
-
-        root_dir = os.path.dirname(os.path.dirname(model_path))
-        gguf_dir = os.path.join(
-            root_dir,
-            "{}-ggufv2-{}b".format(
-                self.model_family.model_name, self.model_spec.model_size_in_billions
-            ),
-        )
-        os.makedirs(gguf_dir, exist_ok=True)
-        gguf_path = os.path.join(
-            gguf_dir,
-            "{}.{}.ggufv2".format(self.model_family.model_name, self.quantization),
-        )
-        # trick for validation, use a mark file to make sure the gguf file is converted
-        mark_file = os.path.join(gguf_dir, f"__valid_{self.quantization}")
-        if os.path.exists(mark_file):
-            return gguf_path
-        else:
-            logger.warning(
-                "You are using a model with ggmlv3, "
-                "and it will take some time to convert to ggufv2"
-            )
-            convert(model_path, gguf_path)
-            with open(mark_file, "w") as f:
-                f.write(str(datetime.datetime.now()))
-            return gguf_path
-
     def load(self):
         try:
             import llama_cpp
@@ -167,9 +137,6 @@ class LlamaCppModel(LLM):
         if os.path.exists(legacy_model_file_path):
             model_path = legacy_model_file_path
 
-        if self.model_spec.model_format == "ggmlv3":
-            model_path = self._convert_ggml_to_gguf(model_path)
-
         try:
             self._llm = Llama(
                 model_path=model_path,
@@ -183,7 +150,7 @@ class LlamaCppModel(LLM):
     def match(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
+        if llm_spec.model_format not in ["ggufv2"]:
            return False
         if "qwen" in llm_family.model_name:
             return False
@@ -285,7 +252,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     def match(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
+        if llm_spec.model_format not in ["ggufv2"]:
            return False
         if "chat" not in llm_family.model_ability:
             return False