xinference 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only and reflects the changes between those versions.

Note: this release of xinference has been flagged as potentially problematic.

@@ -103,6 +103,86 @@
         "model_ability": "audio-to-text",
         "multilingual": false
     },
+    {
+        "model_name": "whisper-tiny-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-tiny",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-tiny.en-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-tiny.en-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": false,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-base-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-base-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-base.en-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-base.en-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": false,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-small-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-small-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-small.en-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-small.en-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": false,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-medium-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-medium-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-medium.en-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-medium.en-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": false,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-large-v3-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-large-v3-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-large-v3-turbo-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-large-v3-turbo",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
     {
         "model_name": "SenseVoiceSmall",
         "model_family": "funasr",
@@ -0,0 +1,208 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import itertools
+import logging
+import tempfile
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class WhisperMLXModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+        self._use_lighting = False
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        use_lightning = self._kwargs.get("use_lightning", "auto")
+        if use_lightning not in ("auto", True, False, None):
+            raise ValueError("use_lightning can only be True, False, None or auto")
+
+        if use_lightning == "auto" or use_lightning is True:
+            try:
+                import mlx.core as mx
+                from lightning_whisper_mlx.transcribe import ModelHolder
+            except ImportError:
+                if use_lightning == "auto":
+                    use_lightning = False
+                else:
+                    error_message = "Failed to import module 'lightning_whisper_mlx'"
+                    installation_guide = [
+                        "Please make sure 'lightning_whisper_mlx' is installed.\n",
+                    ]
+
+                    raise ImportError(
+                        f"{error_message}\n\n{''.join(installation_guide)}"
+                    )
+            else:
+                use_lightning = True
+        if not use_lightning:
+            try:
+                import mlx.core as mx  # noqa: F811
+                from mlx_whisper.transcribe import ModelHolder  # noqa: F811
+            except ImportError:
+                error_message = "Failed to import module 'mlx_whisper'"
+                installation_guide = [
+                    "Please make sure 'mlx_whisper' is installed.\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+            else:
+                use_lightning = False
+
+        logger.info(
+            "Loading MLX whisper from %s, use lightning: %s",
+            self._model_path,
+            use_lightning,
+        )
+        self._use_lighting = use_lightning
+        self._model = ModelHolder.get_model(self._model_path, mx.float16)
+
+    def transcriptions(
+        self,
+        audio: bytes,
+        language: Optional[str] = None,
+        prompt: Optional[str] = None,
+        response_format: str = "json",
+        temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
+    ):
+        return self._call(
+            audio,
+            language=language,
+            prompt=prompt,
+            response_format=response_format,
+            temperature=temperature,
+            timestamp_granularities=timestamp_granularities,
+            task="transcribe",
+        )
+
+    def translations(
+        self,
+        audio: bytes,
+        language: Optional[str] = None,
+        prompt: Optional[str] = None,
+        response_format: str = "json",
+        temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
+    ):
+        if not self._model_spec.multilingual:
+            raise RuntimeError(
+                f"Model {self._model_spec.model_name} is not suitable for translations."
+            )
+        return self._call(
+            audio,
+            language=language,
+            prompt=prompt,
+            response_format=response_format,
+            temperature=temperature,
+            timestamp_granularities=timestamp_granularities,
+            task="translate",
+        )
+
+    def _call(
+        self,
+        audio: bytes,
+        language: Optional[str] = None,
+        prompt: Optional[str] = None,
+        response_format: str = "json",
+        temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
+        task: str = "transcribe",
+    ):
+        if self._use_lighting:
+            from lightning_whisper_mlx.transcribe import transcribe_audio
+
+            transcribe = functools.partial(
+                transcribe_audio, batch_size=self._kwargs.get("batch_size", 12)
+            )
+        else:
+            from mlx_whisper import transcribe  # type: ignore
+
+        with tempfile.NamedTemporaryFile(delete=True) as f:
+            f.write(audio)
+
+            kwargs = {"task": task}
+            if response_format == "verbose_json":
+                if timestamp_granularities == ["word"]:
+                    kwargs["word_timestamps"] = True  # type: ignore
+
+            result = transcribe(
+                f.name,
+                path_or_hf_repo=self._model_path,
+                language=language,
+                temperature=temperature,
+                initial_prompt=prompt,
+                **kwargs,
+            )
+            text = result["text"]
+            segments = result["segments"]
+            language = result["language"]
+
+            if response_format == "json":
+                return {"text": text}
+            elif response_format == "verbose_json":
+                if not timestamp_granularities or timestamp_granularities == [
+                    "segment"
+                ]:
+                    return {
+                        "task": task,
+                        "language": language,
+                        "duration": segments[-1]["end"] if segments else 0,
+                        "text": text,
+                        "segments": segments,
+                    }
+                else:
+                    assert timestamp_granularities == ["word"]
+
+                    def _extract_word(word: dict) -> dict:
+                        return {
+                            "start": word["start"].item(),
+                            "end": word["end"].item(),
+                            "word": word["word"],
+                        }
+
+                    words = [
+                        _extract_word(w)
+                        for w in itertools.chain(*[s["words"] for s in segments])
+                    ]
+                    return {
+                        "task": task,
+                        "language": language,
+                        "duration": words[-1]["end"] if words else 0,
+                        "text": text,
+                        "words": words,
+                    }
+            else:
+                raise ValueError(f"Unsupported response format: {response_format}")
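The new WhisperMLXModel reads two optional launch-time kwargs: use_lightning (default "auto", preferring lightning_whisper_mlx and falling back to mlx_whisper when it is not installed) and batch_size (used only on the lightning path). Word-level timestamps are returned only for response_format "verbose_json" with timestamp_granularities ["word"]. A hedged sketch of exercising those options through the client; whether the REST client forwards these exact kwargs end to end is an assumption:

# Hypothetical sketch: MLX whisper kwargs and word timestamps (not part of the diff).
from xinference.client import Client

client = Client("http://localhost:9997")
uid = client.launch_model(
    model_name="whisper-small-mlx",
    model_type="audio",
    use_lightning=False,  # force the plain mlx_whisper backend
)
model = client.get_model(uid)

with open("speech.wav", "rb") as f:
    result = model.transcriptions(
        f.read(),
        response_format="verbose_json",
        timestamp_granularities=["word"],  # per-word start/end times
    )
for w in result["words"]:
    print(f'{w["start"]:.2f}-{w["end"]:.2f} {w["word"]}')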
@@ -21,6 +21,7 @@ from typing import Dict, List, Literal, Optional, Tuple, Union, no_type_check
 import numpy as np
 import torch
 
+from ..._compat import ROOT_KEY, ErrorWrapper, ValidationError
 from ...device_utils import empty_cache
 from ...types import Embedding, EmbeddingData, EmbeddingUsage
 from ..core import CacheableModelSpec, ModelDescription
@@ -193,6 +194,27 @@ class EmbeddingModel:
                 device=self._device,
                 model_kwargs=model_kwargs,
             )
+        elif (
+            self._kwargs.get("hybrid_mode")
+            and "m3" in self._model_spec.model_name.lower()
+        ):
+            try:
+                from FlagEmbedding import BGEM3FlagModel
+            except ImportError:
+                error_message = "Failed to import module 'BGEM3FlagModel'"
+                installation_guide = [
+                    "Please make sure 'FlagEmbedding' is installed. ",
+                    "You can install it by `pip install FlagEmbedding`\n",
+                ]
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
+            self._model = BGEM3FlagModel(
+                self._model_path,
+                device=self._device,
+                model_kwargs=model_kwargs,
+                trust_remote_code=True,
+            )
         else:
             model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
             self._model = SentenceTransformer(
@@ -202,11 +224,192 @@ class EmbeddingModel:
                 trust_remote_code=True,
             )
 
+    def _fix_langchain_openai_inputs(self, sentences: Union[str, List[str]]):
+        # Check if sentences is a two-dimensional list of integers
+        if (
+            isinstance(sentences, list)
+            and len(sentences) > 0
+            and isinstance(sentences[0], list)
+            and len(sentences[0]) > 0
+            and isinstance(sentences[0][0], int)
+        ):
+            # List[List[int]] stands for encoded inputs
+            import tiktoken
+
+            enc = tiktoken.get_encoding("cl100k_base")
+            lines_decoded = []
+
+            for line in sentences:
+                try:
+                    # Decode each token into bytes, then join them into a complete string
+                    output = b"".join(
+                        enc.decode_single_token_bytes(token) for token in line
+                    )
+                    # Convert the byte sequence into a UTF-8 encoded string
+                    decoded_line = output.decode("utf-8")
+                    lines_decoded.append(decoded_line)
+                except (ValueError, TypeError, UnicodeDecodeError) as e:
+                    raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], self)
+
+            # Update sentences to be the list of decoded strings
+            if len(lines_decoded) == 1:
+                sentences = lines_decoded[0]
+            else:
+                sentences = lines_decoded
+        return sentences
+
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+        sentences = self._fix_langchain_openai_inputs(sentences)
+
+        from FlagEmbedding import BGEM3FlagModel
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
 
+        @no_type_check
+        def _encode_bgem3(
+            model: Union[SentenceTransformer, BGEM3FlagModel],
+            sentences: Union[str, List[str]],
+            batch_size: int = 32,
+            show_progress_bar: bool = None,
+            output_value: str = "sparse_embedding",
+            convert_to_numpy: bool = True,
+            convert_to_tensor: bool = False,
+            device: str = None,
+            normalize_embeddings: bool = False,
+            **kwargs,
+        ):
+            """
+            Computes sentence embeddings with bge-m3 model
+            Nothing special here, just replace sentence-transformer with FlagEmbedding
+            TODO: think about how to solve the redundant code of encode method in the future
+
+            :param sentences: the sentences to embed
+            :param batch_size: the batch size used for the computation
+            :param show_progress_bar: Output a progress bar when encode sentences
+            :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
+            :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
+            :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
+            :param device: Which torch.device to use for the computation
+            :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+
+            :return:
+               By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
+            """
+            import torch
+            from tqdm.autonotebook import trange
+
+            if show_progress_bar is None:
+                show_progress_bar = (
+                    logger.getEffectiveLevel() == logging.INFO
+                    or logger.getEffectiveLevel() == logging.DEBUG
+                )
+
+            if convert_to_tensor:
+                convert_to_numpy = False
+
+            if output_value != "sparse_embedding":
+                convert_to_tensor = False
+                convert_to_numpy = False
+
+            input_was_string = False
+            if isinstance(sentences, str) or not hasattr(
+                sentences, "__len__"
+            ):  # Cast an individual sentence to a list with length 1
+                sentences = [sentences]
+                input_was_string = True
+
+            if device is None:
+                # Same as SentenceTransformer.py
+                from sentence_transformers.util import get_device_name
+
+                device = get_device_name()
+                logger.info(f"Use pytorch device_name: {device}")
+
+            all_embeddings = []
+            all_token_nums = 0
+
+            # The original code does not support other inference engines
+            def _text_length(text):
+                if isinstance(text, dict):  # {key: value} case
+                    return len(next(iter(text.values())))
+                elif not hasattr(text, "__len__"):  # Object has no len() method
+                    return 1
+                elif len(text) == 0 or isinstance(
+                    text[0], int
+                ):  # Empty string or list of ints
+                    return len(text)
+                else:
+                    return sum(
+                        [len(t) for t in text]
+                    )  # Sum of length of individual strings
+
+            length_sorted_idx = np.argsort([-_text_length(sen) for sen in sentences])
+            sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
+
+            for start_index in trange(
+                0,
+                len(sentences),
+                batch_size,
+                desc="Batches",
+                disable=not show_progress_bar,
+            ):
+                sentences_batch = sentences_sorted[
+                    start_index : start_index + batch_size
+                ]
+
+                with torch.no_grad():
+                    out_features = model.encode(sentences_batch, **kwargs)
+
+                if output_value == "token_embeddings":
+                    embeddings = []
+                    for token_emb, attention in zip(
+                        out_features[output_value], out_features["attention_mask"]
+                    ):
+                        last_mask_id = len(attention) - 1
+                        while (
+                            last_mask_id > 0 and attention[last_mask_id].item() == 0
+                        ):
+                            last_mask_id -= 1
+
+                        embeddings.append(token_emb[0 : last_mask_id + 1])
+                elif output_value is None:  # Return all outputs
+                    embeddings = []
+                    for sent_idx in range(len(out_features["sentence_embedding"])):
+                        row = {
+                            name: out_features[name][sent_idx]
+                            for name in out_features
+                        }
+                        embeddings.append(row)
+                # for sparse embedding
+                else:
+                    if kwargs.get("return_sparse"):
+                        embeddings = out_features["lexical_weights"]
+                    else:
+                        embeddings = out_features["dense_vecs"]
+
+                    if convert_to_numpy:
+                        embeddings = embeddings.cpu()
+
+                all_embeddings.extend(embeddings)
+
+            all_embeddings = [
+                all_embeddings[idx] for idx in np.argsort(length_sorted_idx)
+            ]
+
+            if convert_to_tensor:
+                if len(all_embeddings):
+                    all_embeddings = torch.stack(all_embeddings)
+                else:
+                    all_embeddings = torch.Tensor()
+            elif convert_to_numpy:
+                all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+
+            if input_was_string:
+                all_embeddings = all_embeddings[0]
+
+            return all_embeddings, all_token_nums
+
     # copied from sentence-transformers, and modify it to return tokens num
     @no_type_check
     def encode(
@@ -390,6 +593,10 @@ class EmbeddingModel:
                 convert_to_numpy=False,
                 **kwargs,
             )
+        elif isinstance(self._model, BGEM3FlagModel):
+            all_embeddings, all_token_nums = _encode_bgem3(
+                self._model, sentences, convert_to_numpy=False, **kwargs
+            )
         else:
             all_embeddings, all_token_nums = encode(
                 self._model,
@@ -401,14 +608,30 @@ class EmbeddingModel:
            all_embeddings = [all_embeddings]
        embedding_list = []
        for index, data in enumerate(all_embeddings):
-            embedding_list.append(
-                EmbeddingData(index=index, object="embedding", embedding=data.tolist())
-            )
+            if kwargs.get("return_sparse") and isinstance(self._model, BGEM3FlagModel):
+                embedding_list.append(
+                    EmbeddingData(
+                        index=index,
+                        object="embedding",
+                        embedding={k: float(v) for k, v in data.items()},
+                    )
+                )
+            else:
+                embedding_list.append(
+                    EmbeddingData(
+                        index=index, object="embedding", embedding=data.tolist()
+                    )
+                )
        usage = EmbeddingUsage(
            prompt_tokens=all_token_nums, total_tokens=all_token_nums
        )
        result = Embedding(
-            object="list",
+            object=(
+                "list"  # type: ignore
+                if not isinstance(self._model, BGEM3FlagModel)
+                and not kwargs.get("return_sparse")
+                else "dict"
+            ),
            model=self._model_uid,
            data=embedding_list,
            usage=usage,
@@ -430,6 +653,38 @@
 
         return result
 
+    def convert_ids_to_tokens(
+        self,
+        batch_token_ids: Union[List[Union[int, str]], List[List[Union[int, str]]]],
+        **kwargs,
+    ) -> Union[List[str]]:
+        batch_decoded_texts: List[str] = []
+
+        assert self._model is not None
+
+        if isinstance(batch_token_ids, (int, str)):
+            return self._model.tokenizer.convert_ids_to_tokens(
+                [int(str(batch_token_ids))]
+            )[0]
+
+        # check if it's a nested list
+        if (
+            isinstance(batch_token_ids, list)
+            and batch_token_ids
+            and isinstance(batch_token_ids[0], list)
+        ):
+            for token_ids in batch_token_ids:
+                token_ids = [int(token_id) for token_id in token_ids]
+                batch_decoded_texts.append(
+                    self._model.tokenizer.convert_ids_to_tokens(token_ids)
+                )
+        else:
+            batch_token_ids = [int(token_id) for token_id in batch_token_ids]
+            batch_decoded_texts = self._model.tokenizer.convert_ids_to_tokens(
+                batch_token_ids
+            )
+        return batch_decoded_texts
+
 
 def match_embedding(
     model_name: str,
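Taken together, the embedding changes add a FlagEmbedding-backed path for bge-m3: passing hybrid_mode=True at launch loads BGEM3FlagModel instead of SentenceTransformer, passing return_sparse=True at request time returns lexical weights (a token-id-to-weight mapping, with the result object reported as "dict" instead of "list"), and the new convert_ids_to_tokens method on the model maps those token ids back to token strings. A minimal sketch of that flow, assuming the REST client forwards these kwargs and that the built-in bge-m3 embedding model is used:

# Hypothetical sketch: bge-m3 hybrid/sparse embeddings (not part of the diff).
from xinference.client import Client

client = Client("http://localhost:9997")
uid = client.launch_model(model_name="bge-m3", model_type="embedding", hybrid_mode=True)
model = client.get_model(uid)

dense = model.create_embedding("what is BGE M3?")                       # dense vectors, as before
sparse = model.create_embedding("what is BGE M3?", return_sparse=True)  # lexical weights

# Each sparse embedding maps token ids to float weights.
print(sparse["data"][0]["embedding"])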
@@ -233,7 +233,7 @@
     },
     {
         "model_name": "gte-Qwen2",
-        "dimensions": 4096,
+        "dimensions": 3584,
         "max_tokens": 32000,
         "language": ["zh", "en"],
         "model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
@@ -235,7 +235,7 @@
     },
     {
         "model_name": "gte-Qwen2",
-        "dimensions": 4096,
+        "dimensions": 3584,
         "max_tokens": 32000,
         "language": ["zh", "en"],
         "model_id": "iic/gte_Qwen2-7B-instruct",
@@ -143,6 +143,7 @@ def _install():
     )
     from .transformers.deepseek_vl import DeepSeekVLChatModel
     from .transformers.glm4v import Glm4VModel
+    from .transformers.glm_edge_v import GlmEdgeVModel
     from .transformers.intern_vl import InternVLChatModel
     from .transformers.internlm2 import Internlm2PytorchChatModel
     from .transformers.minicpmv25 import MiniCPMV25Model
@@ -193,6 +194,7 @@ def _install():
             DeepSeekV2PytorchModel,
             DeepSeekV2PytorchChatModel,
             OptPytorchModel,
+            GlmEdgeVModel,
         ]
     )
     if OmniLMMModel:  # type: ignore