nexaai 1.0.4rc13__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nexaai might be problematic. Click here for more details.
- nexaai/__init__.py +71 -0
- nexaai/_stub.cp310-win_amd64.pyd +0 -0
- nexaai/_version.py +4 -0
- nexaai/asr.py +60 -0
- nexaai/asr_impl/__init__.py +0 -0
- nexaai/asr_impl/mlx_asr_impl.py +91 -0
- nexaai/asr_impl/pybind_asr_impl.py +43 -0
- nexaai/base.py +39 -0
- nexaai/binds/__init__.py +3 -0
- nexaai/binds/common_bind.cp310-win_amd64.pyd +0 -0
- nexaai/binds/embedder_bind.cp310-win_amd64.pyd +0 -0
- nexaai/binds/llm_bind.cp310-win_amd64.pyd +0 -0
- nexaai/binds/nexa_bridge.dll +0 -0
- nexaai/binds/nexa_llama_cpp/ggml-base.dll +0 -0
- nexaai/binds/nexa_llama_cpp/ggml-cpu.dll +0 -0
- nexaai/binds/nexa_llama_cpp/ggml-cuda.dll +0 -0
- nexaai/binds/nexa_llama_cpp/ggml-vulkan.dll +0 -0
- nexaai/binds/nexa_llama_cpp/ggml.dll +0 -0
- nexaai/binds/nexa_llama_cpp/llama.dll +0 -0
- nexaai/binds/nexa_llama_cpp/mtmd.dll +0 -0
- nexaai/binds/nexa_llama_cpp/nexa_plugin.dll +0 -0
- nexaai/common.py +61 -0
- nexaai/cv.py +87 -0
- nexaai/cv_impl/__init__.py +0 -0
- nexaai/cv_impl/mlx_cv_impl.py +88 -0
- nexaai/cv_impl/pybind_cv_impl.py +31 -0
- nexaai/embedder.py +68 -0
- nexaai/embedder_impl/__init__.py +0 -0
- nexaai/embedder_impl/mlx_embedder_impl.py +114 -0
- nexaai/embedder_impl/pybind_embedder_impl.py +91 -0
- nexaai/image_gen.py +136 -0
- nexaai/image_gen_impl/__init__.py +0 -0
- nexaai/image_gen_impl/mlx_image_gen_impl.py +291 -0
- nexaai/image_gen_impl/pybind_image_gen_impl.py +84 -0
- nexaai/llm.py +89 -0
- nexaai/llm_impl/__init__.py +0 -0
- nexaai/llm_impl/mlx_llm_impl.py +249 -0
- nexaai/llm_impl/pybind_llm_impl.py +207 -0
- nexaai/rerank.py +51 -0
- nexaai/rerank_impl/__init__.py +0 -0
- nexaai/rerank_impl/mlx_rerank_impl.py +91 -0
- nexaai/rerank_impl/pybind_rerank_impl.py +42 -0
- nexaai/runtime.py +64 -0
- nexaai/tts.py +70 -0
- nexaai/tts_impl/__init__.py +0 -0
- nexaai/tts_impl/mlx_tts_impl.py +93 -0
- nexaai/tts_impl/pybind_tts_impl.py +42 -0
- nexaai/utils/avatar_fetcher.py +104 -0
- nexaai/utils/decode.py +18 -0
- nexaai/utils/model_manager.py +1195 -0
- nexaai/utils/progress_tracker.py +372 -0
- nexaai/vlm.py +120 -0
- nexaai/vlm_impl/__init__.py +0 -0
- nexaai/vlm_impl/mlx_vlm_impl.py +205 -0
- nexaai/vlm_impl/pybind_vlm_impl.py +228 -0
- nexaai-1.0.4rc13.dist-info/METADATA +26 -0
- nexaai-1.0.4rc13.dist-info/RECORD +59 -0
- nexaai-1.0.4rc13.dist-info/WHEEL +5 -0
- nexaai-1.0.4rc13.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
from typing import Generator, Optional, Any
|
|
2
|
+
|
|
3
|
+
from nexaai.common import ModelConfig, GenerationConfig, ChatMessage
|
|
4
|
+
from nexaai.llm import LLM
|
|
5
|
+
from nexaai.mlx_backend.llm.interface import LLM as MLXLLMInterface
|
|
6
|
+
from nexaai.mlx_backend.ml import ModelConfig as MLXModelConfig, SamplerConfig as MLXSamplerConfig, GenerationConfig as MLXGenerationConfig, EmbeddingConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MLXLLMImpl(LLM):
    """LLM implementation that delegates to the MLX backend interface.

    The wrapped backend object is stored in ``self._mlx_llm``; it is ``None``
    until ``_load_from`` succeeds and again after ``eject``.
    """

    def __init__(self, m_cfg: ModelConfig = ModelConfig()):
        """Initialize MLX LLM implementation."""
        super().__init__(m_cfg)
        self._mlx_llm = None

    @classmethod
    def _load_from(cls,
                   local_path: str,
                   tokenizer_path: Optional[str] = None,
                   m_cfg: ModelConfig = ModelConfig(),
                   plugin_id: str = "mlx",
                   device_id: Optional[str] = None
                   ) -> 'MLXLLMImpl':
        """Load model from local path using MLX backend.

        Args:
            local_path: Path passed to the backend as ``model_path``.
            tokenizer_path: Tokenizer location; falls back to ``local_path``
                when empty or ``None``.
            m_cfg: Model configuration copied field by field onto the MLX
                model config.
            plugin_id: Not used by this implementation.
            device_id: Passed to the backend as ``device``.

        Returns:
            A new instance whose ``_mlx_llm`` holds the loaded backend.

        Raises:
            RuntimeError: If any step of the load fails (original error is
                chained as ``__cause__``).
        """
        try:
            # Convert our ModelConfig to MLX ModelConfig
            mlx_config = MLXModelConfig()
            for field_name in (
                "n_ctx",
                "n_threads",
                "n_threads_batch",
                "n_batch",
                "n_ubatch",
                "n_seq_max",
                "chat_template_path",
                "chat_template_content",
            ):
                setattr(mlx_config, field_name, getattr(m_cfg, field_name))

            # Create instance and load MLX model
            instance = cls(m_cfg)
            instance._mlx_llm = MLXLLMInterface(
                model_path=local_path,
                tokenizer_path=tokenizer_path or local_path,
                config=mlx_config,
                device=device_id
            )
            return instance
        except Exception as e:
            raise RuntimeError(f"Failed to load MLX LLM: {str(e)}") from e

    def eject(self):
        """Release the model from memory."""
        if self._mlx_llm:
            self._mlx_llm.destroy()
            self._mlx_llm = None

    def apply_chat_template(self, messages: list[ChatMessage]) -> str:
        """Apply the chat template to messages.

        Args:
            messages: Mappings with ``"role"`` and ``"content"`` keys.

        Returns:
            Whatever the backend's ``apply_chat_template`` returns.

        Raises:
            RuntimeError: If the model is not loaded or templating fails.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            # Simple object exposing role/content as attributes. Defined once
            # per call rather than once per message.
            class MLXChatMessage:
                def __init__(self, role, content):
                    self.role = role
                    self.content = content

            mlx_messages = [
                MLXChatMessage(msg["role"], msg["content"]) for msg in messages
            ]
            return self._mlx_llm.apply_chat_template(mlx_messages)
        except Exception as e:
            raise RuntimeError(f"Failed to apply chat template: {str(e)}") from e

    @staticmethod
    def _to_mlx_generation_config(g_cfg: GenerationConfig):
        """Convert GenerationConfig to the MLX generation config format."""
        mlx_gen_config = MLXGenerationConfig()
        mlx_gen_config.max_tokens = g_cfg.max_tokens
        mlx_gen_config.stop = g_cfg.stop_words
        mlx_gen_config.image_paths = g_cfg.image_paths
        mlx_gen_config.audio_paths = g_cfg.audio_paths

        source_sampler = g_cfg.sampler_config
        if source_sampler:
            mlx_sampler_config = MLXSamplerConfig()
            for field_name in (
                "temperature",
                "top_p",
                "top_k",
                "repetition_penalty",
                "presence_penalty",
                "frequency_penalty",
                "seed",
                "grammar_path",
                "grammar_string",
            ):
                setattr(mlx_sampler_config, field_name,
                        getattr(source_sampler, field_name))
            mlx_gen_config.sampler_config = mlx_sampler_config

        return mlx_gen_config

    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
        """Generate text with streaming.

        Generation runs in a worker thread that pushes tokens onto a queue;
        this generator yields them as they arrive. If the consumer stops
        iterating early, the cancel flag is set so the worker stops at its
        next token callback, and the worker is joined before returning.

        Args:
            prompt: The prompt to generate text from.
            g_cfg: Generation configuration.

        Yields:
            Tokens produced by the backend, in order.

        Raises:
            RuntimeError: If the model is not loaded or generation fails.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            import queue
            import threading

            mlx_gen_config = self._to_mlx_generation_config(g_cfg)

            # Create a queue for streaming tokens
            token_queue = queue.Queue()
            exception_container = [None]
            self.reset_cancel()  # Reset cancel flag before generation

            def token_callback(token: str, user_data: Any = None) -> bool:
                if self._cancel_event.is_set():
                    token_queue.put(('end', None))
                    return False
                try:
                    token_queue.put(('token', token))
                    return True
                except Exception as e:
                    exception_container[0] = e
                    return False

            # Run generation in a separate thread
            def generate():
                try:
                    self._mlx_llm.generate_stream(prompt, mlx_gen_config, token_callback)
                except Exception as e:
                    exception_container[0] = e
                finally:
                    # Always signal the end so the consumer loop terminates.
                    token_queue.put(('end', None))

            thread = threading.Thread(target=generate)
            thread.start()

            finished = False
            try:
                # Yield tokens as they come from the queue
                while True:
                    if exception_container[0]:
                        raise exception_container[0]

                    try:
                        msg_type, token = token_queue.get(timeout=0.1)
                    except queue.Empty:
                        if not thread.is_alive():
                            finished = True
                            break
                        continue

                    if msg_type == 'end':
                        finished = True
                        break
                    if msg_type == 'token':
                        yield token
            finally:
                # Reached on normal completion, on error, and when the
                # consumer closes the generator early. In the latter cases
                # ask the worker to stop instead of leaving it running.
                if not finished:
                    self._cancel_event.set()
                thread.join()

            if exception_container[0]:
                raise exception_container[0]

        except Exception as e:
            raise RuntimeError(f"Failed to generate streaming text: {str(e)}") from e

    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
        """
        Generate text without streaming.

        Args:
            prompt (str): The prompt to generate text from.
            g_cfg (GenerationConfig): Generation configuration.

        Returns:
            str: The generated text.

        Raises:
            RuntimeError: If the model is not loaded or generation fails.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            mlx_gen_config = self._to_mlx_generation_config(g_cfg)

            # Clear any cancel flag left over from an earlier generation;
            # otherwise the callback below would stop this call immediately.
            self.reset_cancel()

            # Simple token callback that just continues until cancelled
            def token_callback(token: str, user_data: Any = None) -> bool:
                return not self._cancel_event.is_set()

            # Use MLX streaming generation and return the full result
            return self._mlx_llm.generate_stream(prompt, mlx_gen_config, token_callback)

        except Exception as e:
            raise RuntimeError(f"Failed to generate text: {str(e)}") from e

    def save_kv_cache(self, path: str):
        """
        Save the key-value cache to the file.

        Args:
            path (str): The path to the file.

        Raises:
            RuntimeError: If the model is not loaded, the backend raises, or
                the backend reports failure.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            success = self._mlx_llm.save_kv_cache(path)
        except Exception as e:
            raise RuntimeError(f"Failed to save KV cache: {str(e)}") from e

        # Checked outside the try so this error is not re-wrapped into a
        # duplicated message.
        if not success:
            raise RuntimeError("Failed to save KV cache")

    def load_kv_cache(self, path: str):
        """
        Load the key-value cache from the file.

        Args:
            path (str): The path to the file.

        Raises:
            RuntimeError: If the model is not loaded, the backend raises, or
                the backend reports failure.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            success = self._mlx_llm.load_kv_cache(path)
        except Exception as e:
            raise RuntimeError(f"Failed to load KV cache: {str(e)}") from e

        # Checked outside the try so this error is not re-wrapped into a
        # duplicated message.
        if not success:
            raise RuntimeError("Failed to load KV cache")

    def reset(self):
        """
        Reset the LLM model context and KV cache.

        Raises:
            RuntimeError: If the model is not loaded or the backend reset fails.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            self._mlx_llm.reset()
        except Exception as e:
            raise RuntimeError(f"Failed to reset MLX LLM: {str(e)}") from e
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
from typing import Generator, Optional
|
|
2
|
+
import queue
|
|
3
|
+
import threading
|
|
4
|
+
|
|
5
|
+
from nexaai.common import ModelConfig, GenerationConfig, ChatMessage
|
|
6
|
+
from nexaai.binds import llm_bind, common_bind
|
|
7
|
+
from nexaai.runtime import _ensure_runtime
|
|
8
|
+
from nexaai.llm import LLM
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PyBindLLMImpl(LLM):
    """LLM implementation that calls the native ``llm_bind`` bindings.

    The native model is referenced through ``self._handle``, which is set to
    ``None`` by ``eject``.
    """

    def __init__(self, handle: object, m_cfg: ModelConfig = ModelConfig()):
        """Private constructor, should not be called directly."""
        super().__init__(m_cfg)
        self._handle = handle  # This is a py::capsule

    @classmethod
    def _load_from(cls,
                   local_path: str,
                   tokenizer_path: Optional[str] = None,
                   m_cfg: ModelConfig = ModelConfig(),
                   plugin_id: str = "llama_cpp",
                   device_id: Optional[str] = None
                   ) -> 'PyBindLLMImpl':
        """Load model from local path.

        Args:
            local_path: Path passed to the binding as ``model_path``.
            tokenizer_path: Passed to the binding unchanged (may be ``None``).
            m_cfg: Model configuration; optional fields are copied only when
                they are not ``None``, template fields only when non-empty.
            plugin_id: Plugin identifier forwarded to the binding.
            device_id: Device identifier forwarded to the binding.

        Returns:
            A new instance wrapping the created native handle.
        """
        _ensure_runtime()

        config = common_bind.ModelConfig()
        config.n_ctx = m_cfg.n_ctx

        # Leave the binding's own defaults in place for unset fields.
        for field_name in (
            "n_threads",
            "n_threads_batch",
            "n_batch",
            "n_ubatch",
            "n_seq_max",
            "n_gpu_layers",
        ):
            value = getattr(m_cfg, field_name)
            if value is not None:
                setattr(config, field_name, value)

        # handle chat template strings
        if m_cfg.chat_template_path:
            config.chat_template_path = m_cfg.chat_template_path

        if m_cfg.chat_template_content:
            config.chat_template_content = m_cfg.chat_template_content

        # Create handle : returns py::capsule with automatic cleanup
        handle = llm_bind.ml_llm_create(
            model_path=local_path,
            tokenizer_path=tokenizer_path,
            model_config=config,
            plugin_id=plugin_id,
            device_id=device_id
        )
        return cls(handle, m_cfg)

    def eject(self):
        """Release the model from memory."""
        # py::capsule handles cleanup automatically once the reference is dropped
        self._handle = None

    def apply_chat_template(self, messages: list[ChatMessage]) -> str:
        """Apply the chat template to messages."""
        # Convert TypedDict to list of dicts for binding
        message_dicts = [
            {"role": m["role"], "content": m["content"]}
            for m in messages
        ]
        return llm_bind.ml_llm_apply_chat_template(self._handle, message_dicts)

    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
        """Generate text with streaming.

        Generation runs in a worker thread that pushes tokens onto a queue;
        this generator yields them as they arrive. If the consumer stops
        iterating early, the cancel flag is set so the worker stops at its
        next token callback instead of running to completion before the
        join returns.

        Args:
            prompt: The prompt to generate text from.
            g_cfg: Generation configuration.

        Yields:
            Tokens delivered to the ``on_token`` callback, in order.

        Raises:
            Exception: Re-raises whatever the worker thread raised.
        """
        token_queue = queue.Queue()
        exception_container = [None]
        self.reset_cancel()  # Reset cancel flag before generation

        def on_token(token: str, user_data) -> bool:
            if self._cancel_event.is_set():
                token_queue.put(('end', None))
                return False  # Stop generation
            try:
                token_queue.put(('token', token))
                return True  # Continue generation
            except Exception as e:
                exception_container[0] = e
                return False  # Stop generation

        config = self._convert_generation_config(g_cfg)

        # Run generation in thread
        def generate():
            try:
                llm_bind.ml_llm_generate(
                    handle=self._handle,
                    prompt=prompt,
                    config=config,
                    on_token=on_token,
                    user_data=None
                )
            except Exception as e:
                exception_container[0] = e
            finally:
                # Always signal the end so the consumer loop terminates.
                token_queue.put(('end', None))

        thread = threading.Thread(target=generate)
        thread.start()

        # Yield tokens as they come
        finished = False
        try:
            while True:
                msg_type, token = token_queue.get()
                if msg_type == 'token':
                    yield token
                elif msg_type in ('error', 'end'):
                    finished = True
                    break
        finally:
            # If we are leaving before the end marker arrived (consumer
            # closed the generator), ask the worker to stop first.
            if not finished:
                self._cancel_event.set()
            thread.join()

        if exception_container[0]:
            raise exception_container[0]

    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
        """
        Generate text without streaming.

        Args:
            prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
            g_cfg (GenerationConfig): Generation configuration.

        Returns:
            str: The generated text.
        """
        config = self._convert_generation_config(g_cfg)
        result = llm_bind.ml_llm_generate(
            handle=self._handle,
            prompt=prompt,
            config=config,
            on_token=None,  # No callback for non-streaming
            user_data=None
        )
        return result.get("text", "")

    def save_kv_cache(self, path: str):
        """
        Save the key-value cache to the file.

        Args:
            path (str): The path to the file.
        """
        llm_bind.ml_llm_save_kv_cache(self._handle, path)

    def load_kv_cache(self, path: str):
        """
        Load the key-value cache from the file.

        Args:
            path (str): The path to the file.
        """
        llm_bind.ml_llm_load_kv_cache(self._handle, path)

    def reset(self):
        """
        Reset the LLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
        If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
        """
        llm_bind.ml_llm_reset(self._handle)

    def _convert_generation_config(self, g_cfg: GenerationConfig):
        """Convert GenerationConfig to binding format."""
        config = common_bind.GenerationConfig()

        # Set basic generation parameters
        config.max_tokens = g_cfg.max_tokens

        # List-valued fields are copied only when non-empty.
        if g_cfg.stop_words:
            config.stop = g_cfg.stop_words

        if g_cfg.image_paths:
            config.image_paths = g_cfg.image_paths

        if g_cfg.audio_paths:
            config.audio_paths = g_cfg.audio_paths

        source_sampler = g_cfg.sampler_config
        if source_sampler:
            sampler = common_bind.SamplerConfig()
            sampler.temperature = source_sampler.temperature
            sampler.top_p = source_sampler.top_p
            sampler.top_k = source_sampler.top_k
            sampler.repetition_penalty = source_sampler.repetition_penalty
            sampler.presence_penalty = source_sampler.presence_penalty
            sampler.frequency_penalty = source_sampler.frequency_penalty
            sampler.seed = source_sampler.seed

            if source_sampler.grammar_path:
                sampler.grammar_path = source_sampler.grammar_path

            if source_sampler.grammar_string:
                sampler.grammar_string = source_sampler.grammar_string

            config.sampler_config = sampler

        return config
|
nexaai/rerank.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from typing import List, Optional, Sequence
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from nexaai.base import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class RerankConfig:
    """Options passed to a reranker's ``rerank`` call."""

    # Batch size setting; defaults to 1.
    batch_size: int = 1
    # Whether normalization is requested; defaults to True.
    normalize: bool = True
    # Name of the normalization method: "softmax" | "min-max" | "none"
    normalize_method: str = "softmax"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Reranker(BaseModel):
    """Abstract interface shared by all reranker backends."""

    def __init__(self):
        """Set up the base Reranker; there is no state to initialize here."""
        pass

    @classmethod
    def _load_from(cls,
                   model_path: str,
                   tokenizer_file: str = "tokenizer.json",
                   plugin_id: str = "llama_cpp",
                   device_id: Optional[str] = None
                   ) -> 'Reranker':
        """Load a reranker from a local path via the backend chosen by ``plugin_id``.

        ``"mlx"`` selects the MLX implementation; any other value selects the
        PyBind implementation. The backend module is imported lazily so only
        the selected one is loaded.
        """
        if plugin_id != "mlx":
            from nexaai.rerank_impl.pybind_rerank_impl import PyBindRerankImpl
            return PyBindRerankImpl._load_from(model_path, tokenizer_file, plugin_id, device_id)

        from nexaai.rerank_impl.mlx_rerank_impl import MLXRerankImpl
        return MLXRerankImpl._load_from(model_path, tokenizer_file, plugin_id, device_id)

    @abstractmethod
    def load_model(self, model_path: str, extra_data: Optional[str] = None) -> bool:
        """Load the model found at ``model_path``; implemented by subclasses."""
        pass

    @abstractmethod
    def rerank(
        self,
        query: str,
        documents: Sequence[str],
        config: Optional[RerankConfig] = None,
    ) -> List[float]:
        """Score ``documents`` against ``query``; implemented by subclasses."""
        pass
|
|
File without changes
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Note: This code is generated by Cursor, not tested yet.
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional, Sequence
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from nexaai.rerank import Reranker, RerankConfig
|
|
7
|
+
from nexaai.mlx_backend.rerank.interface import Reranker as MLXRerankInterface, create_reranker
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MLXRerankImpl(Reranker):
    """Reranker implementation that delegates to the MLX backend.

    The backend object is stored in ``self._mlx_reranker``; it is ``None``
    until ``_load_from`` succeeds and again after ``eject``.
    """

    def __init__(self):
        """Initialize MLX Rerank implementation."""
        super().__init__()
        self._mlx_reranker = None

    @classmethod
    def _load_from(cls,
                   model_path: str,
                   tokenizer_file: str = "tokenizer.json",
                   plugin_id: str = "mlx",
                   device_id: Optional[str] = None
                   ) -> 'MLXRerankImpl':
        """Load reranker model from local path using MLX backend.

        Args:
            model_path: Path passed to ``create_reranker`` and ``load_model``.
            tokenizer_file: Passed to ``create_reranker`` as ``tokenizer_path``.
            plugin_id: Not used by this implementation.
            device_id: Passed to ``create_reranker`` as ``device``.

        Returns:
            A new instance holding the loaded backend reranker.

        Raises:
            RuntimeError: If the backend raises or reports a failed load.
        """
        try:
            # Create instance and load MLX reranker
            instance = cls()
            instance._mlx_reranker = create_reranker(
                model_path=model_path,
                tokenizer_path=tokenizer_file,
                device=device_id
            )

            # Load the model
            success = instance._mlx_reranker.load_model(model_path)
        except Exception as e:
            raise RuntimeError(f"Failed to load MLX Reranker: {str(e)}") from e

        # Checked outside the try so this error is not re-wrapped into a
        # second "Failed to load ..." prefix.
        if not success:
            raise RuntimeError("Failed to load MLX reranker model")

        return instance

    def eject(self):
        """Destroy the model and free resources."""
        if self._mlx_reranker:
            self._mlx_reranker.destroy()
            self._mlx_reranker = None

    def load_model(self, model_path: str, extra_data: Optional[str] = None) -> bool:
        """Load model from path.

        Returns:
            The backend's ``load_model`` result.

        Raises:
            RuntimeError: If no backend reranker exists or the backend raises.
        """
        if not self._mlx_reranker:
            raise RuntimeError("MLX Reranker not initialized")

        try:
            return self._mlx_reranker.load_model(model_path, extra_data)
        except Exception as e:
            raise RuntimeError(f"Failed to load reranker model: {str(e)}") from e

    def rerank(
        self,
        query: str,
        documents: Sequence[str],
        config: Optional[RerankConfig] = None,
    ) -> List[float]:
        """Rerank documents given a query.

        Args:
            query: The query text.
            documents: Candidate documents to score.
            config: Optional settings; converted to the MLX config type when
                given, otherwise ``None`` is passed to the backend.

        Returns:
            The backend scores converted with ``tolist()``.

        Raises:
            RuntimeError: If the reranker is not loaded or reranking fails.
        """
        if not self._mlx_reranker:
            raise RuntimeError("MLX Reranker not loaded")

        try:
            # Convert our config to MLX format if provided
            mlx_config = None
            if config is not None:
                from nexaai.mlx_backend.rerank.interface import RerankConfig as MLXRerankConfig

                mlx_config = MLXRerankConfig(
                    batch_size=config.batch_size,
                    normalize=config.normalize,
                    normalize_method=config.normalize_method
                )

            # Use MLX reranking
            scores = self._mlx_reranker.rerank(query, documents, mlx_config)

            # Convert mx.array to Python list of floats
            return scores.tolist()

        except Exception as e:
            raise RuntimeError(f"Failed to rerank documents: {str(e)}") from e
|
|
90
|
+
|
|
91
|
+
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from typing import List, Optional, Sequence
|
|
2
|
+
|
|
3
|
+
from nexaai.rerank import Reranker, RerankConfig
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PyBindRerankImpl(Reranker):
    """Placeholder PyBind reranker backend.

    Loading and reranking are not implemented yet; both raise
    ``NotImplementedError``.
    """

    def __init__(self):
        """Initialize PyBind Rerank implementation."""
        super().__init__()
        # TODO: Add PyBind-specific initialization

    @classmethod
    def _load_from(cls,
                   model_path: str,
                   tokenizer_file: str = "tokenizer.json",
                   plugin_id: str = "llama_cpp",
                   device_id: Optional[str] = None
                   ) -> 'PyBindRerankImpl':
        """Return a fresh instance; none of the arguments are used yet."""
        # TODO: Implement PyBind reranker loading
        return cls()

    def eject(self):
        """Destroy the model and free resources (currently a no-op)."""
        # TODO: Implement PyBind reranker cleanup
        pass

    def load_model(self, model_path: str, extra_data: Optional[str] = None) -> bool:
        """Load model from path; always raises until implemented."""
        # TODO: Implement PyBind reranker model loading
        raise NotImplementedError("PyBind reranker model loading not yet implemented")

    def rerank(
        self,
        query: str,
        documents: Sequence[str],
        config: Optional[RerankConfig] = None,
    ) -> List[float]:
        """Rerank documents given a query; always raises until implemented."""
        # TODO: Implement PyBind reranking
        raise NotImplementedError("PyBind reranking not yet implemented")
|