xinference 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (70)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +123 -3
  3. xinference/client/restful/restful_client.py +131 -2
  4. xinference/core/model.py +93 -24
  5. xinference/core/supervisor.py +132 -15
  6. xinference/core/worker.py +165 -8
  7. xinference/deploy/cmdline.py +5 -0
  8. xinference/model/audio/chattts.py +46 -14
  9. xinference/model/audio/core.py +23 -15
  10. xinference/model/core.py +12 -3
  11. xinference/model/embedding/core.py +25 -16
  12. xinference/model/flexible/__init__.py +40 -0
  13. xinference/model/flexible/core.py +228 -0
  14. xinference/model/flexible/launchers/__init__.py +15 -0
  15. xinference/model/flexible/launchers/transformers_launcher.py +63 -0
  16. xinference/model/flexible/utils.py +33 -0
  17. xinference/model/image/core.py +21 -14
  18. xinference/model/image/custom.py +1 -1
  19. xinference/model/image/model_spec.json +14 -0
  20. xinference/model/image/stable_diffusion/core.py +43 -6
  21. xinference/model/llm/__init__.py +0 -2
  22. xinference/model/llm/core.py +3 -2
  23. xinference/model/llm/ggml/llamacpp.py +1 -10
  24. xinference/model/llm/llm_family.json +292 -36
  25. xinference/model/llm/llm_family.py +97 -52
  26. xinference/model/llm/llm_family_modelscope.json +220 -27
  27. xinference/model/llm/pytorch/core.py +0 -80
  28. xinference/model/llm/sglang/core.py +7 -2
  29. xinference/model/llm/utils.py +4 -2
  30. xinference/model/llm/vllm/core.py +3 -0
  31. xinference/model/rerank/core.py +24 -25
  32. xinference/types.py +0 -1
  33. xinference/web/ui/build/asset-manifest.json +3 -3
  34. xinference/web/ui/build/index.html +1 -1
  35. xinference/web/ui/build/static/js/{main.0fb6f3ab.js → main.95c1d652.js} +3 -3
  36. xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
  43. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/METADATA +9 -11
  44. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/RECORD +49 -58
  45. xinference/model/llm/ggml/chatglm.py +0 -457
  46. xinference/thirdparty/ChatTTS/__init__.py +0 -1
  47. xinference/thirdparty/ChatTTS/core.py +0 -200
  48. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  49. xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
  50. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  51. xinference/thirdparty/ChatTTS/infer/api.py +0 -125
  52. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  53. xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
  54. xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
  55. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  56. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
  57. xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
  58. xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
  59. xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +0 -1
  66. /xinference/web/ui/build/static/js/{main.0fb6f3ab.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
  67. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/LICENSE +0 -0
  68. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/WHEEL +0 -0
  69. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/entry_points.txt +0 -0
  70. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,6 @@
 
 import logging
 import os
-import platform
 import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -541,25 +540,50 @@ def _get_cache_dir_for_model_mem(
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
     create_if_not_exist=True,
 ):
     # If the model id contains quantization, then we should give each
     # quantization a dedicated cache dir.
     quant_suffix = ""
-    for q in llm_spec.quantizations:
-        if llm_spec.model_id and q in llm_spec.model_id:
-            quant_suffix = q
-            break
-    cache_dir_name = (
+    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+        quant_suffix = quantization
+    else:
+        for q in llm_spec.quantizations:
+            if llm_spec.model_id and q in llm_spec.model_id:
+                quant_suffix = q
+                break
+
+    # some model name includes ".", e.g. qwen1.5-chat
+    # if the model does not require trust_remote_code, it's OK
+    # because no need to import modeling_xxx.py from the path
+    # but when the model need to trust_remote_code,
+    # e.g. internlm2.5-chat, the import will fail,
+    # but before the model may have been downloaded,
+    # thus we check it first, if exist, return it,
+    # otherwise, we replace the "." with "_" in model name
+    old_cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"
     )
     if quant_suffix:
-        cache_dir_name += f"-{quant_suffix}"
-    cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name))
-    if create_if_not_exist and not os.path.exists(cache_dir):
-        os.makedirs(cache_dir, exist_ok=True)
-    return cache_dir
+        old_cache_dir_name += f"-{quant_suffix}"
+    old_cache_dir = os.path.realpath(
+        os.path.join(XINFERENCE_CACHE_DIR, old_cache_dir_name)
+    )
+    if os.path.exists(old_cache_dir):
+        return old_cache_dir
+    else:
+        cache_dir_name = (
+            f"{llm_family.model_name.replace('.', '_')}-{llm_spec.model_format}"
+            f"-{llm_spec.model_size_in_billions}b"
+        )
+        if quant_suffix:
+            cache_dir_name += f"-{quant_suffix}"
+        cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name))
+        if create_if_not_exist and not os.path.exists(cache_dir):
+            os.makedirs(cache_dir, exist_ok=True)
+        return cache_dir
 
 
 def _get_meta_path(
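The practical effect of this change is easiest to see in isolation. Below is a minimal, standalone sketch of the new naming rule, not the xinference implementation itself; the cache root path and the helper name are assumptions made for the example.

# Sketch of the 0.13.2 cache-directory naming rule (hypothetical helper).
import os

CACHE_ROOT = os.path.expanduser("~/.xinference/cache")  # assumed stand-in for XINFERENCE_CACHE_DIR


def cache_dir_for(model_name: str, model_format: str, size_b: int, quant: str = "") -> str:
    suffix = f"-{quant}" if quant else ""
    # Old naming keeps the ".", so directories downloaded by <= 0.13.0 stay valid.
    old_dir = os.path.join(CACHE_ROOT, f"{model_name}-{model_format}-{size_b}b{suffix}")
    if os.path.exists(old_dir):
        return old_dir
    # New naming replaces "." with "_" so trust_remote_code imports do not break.
    safe_name = model_name.replace(".", "_")
    return os.path.join(CACHE_ROOT, f"{safe_name}-{model_format}-{size_b}b{suffix}")


# e.g. a fresh internlm2.5-chat download lands in .../internlm2_5-chat-pytorch-7b
print(cache_dir_for("internlm2.5-chat", "pytorch", 7))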
@@ -900,6 +924,7 @@ def _check_revision(
     llm_spec: "LLMSpecV1",
     builtin: list,
     meta_path: str,
+    quantization: Optional[str] = None,
 ) -> bool:
     for family in builtin:
         if llm_family.model_name == family.model_name:
@@ -908,59 +933,63 @@ def _check_revision(
                 if (
                     spec.model_format == "pytorch"
                     and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                    and (quantization is None or quantization in spec.quantizations)
                 ):
                     return valid_model_revision(meta_path, spec.model_revision)
     return False
 
 
 def get_cache_status(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
 ) -> Union[bool, List[bool]]:
     """
-    When calling this function from above, `llm_family` is constructed only from BUILTIN_LLM_FAMILIES,
-    so we should check both huggingface and modelscope cache files.
+    Checks if a model's cache status is available based on the model format and quantization.
+    Supports different directories and model formats.
     """
-    cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-    # check revision for pytorch model
-    if llm_spec.model_format == "pytorch":
-        hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-        ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-        revisions = [
-            _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-            _check_revision(
-                llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-            ),
-        ]
-        return any(revisions)
-    # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
-        ret = []
-        for q in llm_spec.quantizations:
-            assert q is not None
-            hf_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "huggingface", q
-            )
-            ms_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "modelscope", q
-            )
-            results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-            ret.append(any(results))
-        return ret
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-
 
-def _is_linux():
-    return platform.system() == "Linux"
+    def check_file_status(meta_path: str) -> bool:
+        return os.path.exists(meta_path)
 
+    def check_revision_status(
+        meta_path: str, families: list, quantization: Optional[str] = None
+    ) -> bool:
+        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)
 
-def _has_cuda_device():
-    # `cuda_count` method already contains the logic for the
-    # number of GPUs specified by `CUDA_VISIBLE_DEVICES`.
-    from ...utils import cuda_count
+    def handle_quantization(q: Union[str, None]) -> bool:
+        specific_cache_dir = _get_cache_dir(
+            llm_family, llm_spec, q, create_if_not_exist=False
+        )
+        meta_paths = {
+            "huggingface": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "huggingface", q
+            ),
+            "modelscope": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "modelscope", q
+            ),
+        }
+        if llm_spec.model_format == "pytorch":
+            return check_revision_status(
+                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+            ) or check_revision_status(
+                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+            )
+        else:
+            return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                meta_paths["modelscope"]
+            )
 
-    return cuda_count() > 0
+    if llm_spec.model_id and "{" in llm_spec.model_id:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if quantization is None
+            else handle_quantization(quantization)
+        )
+    else:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if llm_spec.model_format != "pytorch"
+            else handle_quantization(None)
+        )
 
 
 def get_user_defined_llm_families():
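Note that the return type stays Union[bool, List[bool]]: pytorch specs yield a single flag, while formats with per-quantization files yield one flag per entry in spec.quantizations. A hedged sketch of how a caller might consume that shape (mirroring the zip pattern used by get_file_location later in this diff); the helper name is hypothetical:

# Sketch of consuming get_cache_status()'s Union[bool, List[bool]] result.
from typing import List, Union


def cached_quantizations(quantizations: List[str],
                         cache_status: Union[bool, List[bool]]) -> List[str]:
    if isinstance(cache_status, bool):
        # pytorch format: one flag covers the whole spec
        return list(quantizations) if cache_status else []
    # ggufv2 / gptq / awq / mlx: flags are parallel to spec.quantizations
    return [q for q, cached in zip(quantizations, cache_status) if cached]


print(cached_quantizations(["Q4_K_M", "Q8_0"], [True, False]))  # ['Q4_K_M']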
@@ -1006,6 +1035,7 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -1029,7 +1059,22 @@
             spec.model_id = spec.model_id.format(quantization=q)
         return spec
 
-    if download_from_modelscope():
+    # priority: download_hub > download_from_modelscope() and download_from_csghub()
+    if download_hub == "modelscope":
+        all_families = (
+            BUILTIN_MODELSCOPE_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "csghub":
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "huggingface":
+        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    elif download_from_modelscope():
         all_families = (
             BUILTIN_MODELSCOPE_LLM_FAMILIES
             + BUILTIN_LLM_FAMILIES
@@ -304,21 +304,6 @@
     ],
     "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_hub": "modelscope",
-        "model_id": "Xorbits/chatglm2-6B-GGML",
-        "model_revision": "v1.0.0",
-        "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -392,17 +377,6 @@
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0"
-        ],
-        "model_hub": "modelscope",
-        "model_id": "Xorbits/chatglm3-ggml",
-        "model_revision": "v1.0.0",
-        "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -547,6 +521,33 @@
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/glm-4-9b-chat",
         "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/glm-4-9b-chat-GGUF",
+        "model_revision": "master"
       }
     ],
     "prompt_style": {
@@ -593,6 +594,33 @@
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/glm-4-9b-chat-1m",
         "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/glm-4-9b-chat-1m-GGUF",
+        "model_revision": "master"
       }
     ],
     "prompt_style": {
@@ -660,6 +688,66 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "codegeex4",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "the open-source version of the latest CodeGeeX4 model series",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/codegeex4-all-9b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K_L",
+          "Q8_0"
+        ],
+        "model_file_name_template": "codegeex4-all-9b-{quantization}.gguf",
+        "model_id": "ZhipuAI/codegeex4-all-9b-GGUF",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -900,6 +988,88 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "internlm2.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "InternLM2.5 series of the InternLM model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 262144,
+    "model_name": "internlm2.5-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "InternLM2.5 series of the InternLM model supports 1M long-context",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat-1m",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 100000,
@@ -3771,6 +3941,29 @@
         ],
         "model_id": "AI-ModelScope/gemma-2-27b-it",
         "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "f32"
+        ],
+        "model_id": "LLM-Research/gemma-2-9b-it-GGUF",
+        "model_file_name_template": "gemma-2-9b-it-{quantization}.gguf",
+        "model_hub": "modelscope"
       }
     ],
     "prompt_style": {
@@ -4115,7 +4308,7 @@
       "zh"
     ],
     "model_ability": [
-      "generate"
+      "chat"
     ],
     "model_description": "Aquila2-chat series models are the chat models",
     "model_specs": [
@@ -34,9 +34,6 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CreateCompletionTorch,
-    Embedding,
-    EmbeddingData,
-    EmbeddingUsage,
     LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,
@@ -673,83 +670,6 @@ class PytorchModel(LLM):
         )
         self.handle_batch_inference_results(req_list)
 
-    def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
-        try:
-            import torch
-            import torch.nn.functional as F
-        except ImportError as e:
-            raise ImportError(
-                "Could not import torch. Please install it with `pip install torch`."
-            ) from e
-
-        if isinstance(input, str):
-            inputs = [input]
-        else:
-            inputs = input
-
-        tokenizer = self._tokenizer
-        tokenizer.pad_token = tokenizer.eos_token
-        is_llama = "llama" in str(type(self._model))  # llama supports batch inference
-        is_chatglm = "chatglm" in str(type(self._model))
-        if is_llama:
-            encoding = tokenizer.batch_encode_plus(
-                inputs, padding=True, return_tensors="pt"
-            )
-            input_ids = encoding["input_ids"].to(self._device)
-            attention_mask = encoding["attention_mask"].to(self._device)
-            model_output = self._model(
-                input_ids, attention_mask, output_hidden_states=True
-            )
-            data = model_output.hidden_states[-1]
-            mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
-            masked_embeddings = data * mask
-            sum_embeddings = torch.sum(masked_embeddings, dim=1)
-            seq_length = torch.sum(mask, dim=1)
-            embedding = sum_embeddings / seq_length
-            normalized_embeddings = F.normalize(embedding, p=2, dim=1)
-            normalized_embeddings = normalized_embeddings.tolist()
-            token_num = torch.sum(attention_mask).item()
-
-            embedding_list = []
-            for index, data in enumerate(normalized_embeddings):
-                embedding_list.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-
-            ret = Embedding(
-                object="list",
-                model=self.model_uid,
-                data=embedding_list,
-                usage=usage,
-            )
-
-        else:
-            embedding = []
-            token_num = 0
-            for index, text in enumerate(inputs):
-                input_ids = tokenizer.encode(text, return_tensors="pt").to(self._device)
-                model_output = self._model(input_ids, output_hidden_states=True)
-                if is_chatglm:
-                    data = (model_output.hidden_states[-1].transpose(0, 1))[0]
-                else:
-                    data = model_output.hidden_states[-1][0]
-                data = F.normalize(torch.mean(data, dim=0), p=2, dim=0)
-                data = data.tolist()
-
-                embedding.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-                token_num += len(input_ids[0])
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-            ret = Embedding(
-                object="list", model=self.model_uid, data=embedding, usage=usage
-            )
-
-        return ret
-
 
 class PytorchChatModel(PytorchModel, ChatModelMixin):
@@ -269,8 +269,13 @@ class SGLANGModel(LLM):
         )
         stream = sanitized_generate_config.pop("stream")
         stream_options = sanitized_generate_config.pop("stream_options")
-        if isinstance(stream_options, dict):
-            include_usage = stream_options.pop("include_usage", False)
+
+        include_usage = (
+            stream_options.pop("include_usage")
+            if isinstance(stream_options, dict)
+            else False
+        )
+
         request_id = str(uuid.uuid1())
         state = pipeline.run(
             question=prompt,
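The rewrite makes include_usage always defined: previously it was only assigned inside the isinstance check, so a non-dict stream_options left the name unbound further down. A tiny sketch of the new shape using plain dicts (the helper is hypothetical, not the SGLANG code path):

# Sketch of the include_usage handling after this change.
def resolve_include_usage(stream_options):
    # mirrors the new conditional expression: always yields a value,
    # even when stream_options is None
    return (
        stream_options.pop("include_usage")
        if isinstance(stream_options, dict)
        else False
    )


print(resolve_include_usage({"include_usage": True}))  # True
print(resolve_include_usage(None))                     # False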
@@ -779,8 +779,10 @@ Begin!"""
 def get_file_location(
     llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
 ) -> Tuple[str, bool]:
-    cache_dir = _get_cache_dir(llm_family, spec, create_if_not_exist=False)
-    cache_status = get_cache_status(llm_family, spec)
+    cache_dir = _get_cache_dir(
+        llm_family, spec, quantization, create_if_not_exist=False
+    )
+    cache_status = get_cache_status(llm_family, spec, quantization)
     if isinstance(cache_status, list):
         is_cached = None
         for q, cs in zip(spec.quantizations, cache_status):
@@ -112,6 +112,8 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm-chat-8k",
     "internlm-chat-20b",
     "internlm2-chat",
+    "internlm2.5-chat",
+    "internlm2.5-chat-1m",
     "qwen-chat",
     "Yi-chat",
     "Yi-1.5-chat",
@@ -127,6 +129,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "chatglm3-128k",
     "glm4-chat",
     "glm4-chat-1m",
+    "codegeex4",
     "deepseek-chat",
     "deepseek-coder-instruct",
 ]
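Adding a model name to VLLM_SUPPORTED_CHAT_MODELS is what allows the vLLM backend to be selected for it. A simplified, hypothetical sketch of that kind of gate (not the actual xinference match() implementation), with the list reduced to a few entries from this hunk:

# Hypothetical gate illustrating how a supported-models list is typically used.
VLLM_SUPPORTED_CHAT_MODELS = [
    "internlm2-chat",
    "internlm2.5-chat",
    "internlm2.5-chat-1m",
    "glm4-chat",
    "codegeex4",
]


def vllm_can_serve(model_name: str) -> bool:
    return model_name in VLLM_SUPPORTED_CHAT_MODELS


print(vllm_can_serve("codegeex4"))   # True once the 0.13.2 entry is present
print(vllm_can_serve("chatglm2"))    # False: not in this reduced list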