nexaai 1.0.16rc13__cp310-cp310-macosx_13_0_x86_64.whl → 1.0.17__cp310-cp310-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nexaai might be problematic.
Binary file
nexaai/_version.py CHANGED
@@ -1,4 +1,4 @@
  # This file is generated by CMake from _version.py.in
  # Do not modify this file manually - it will be overwritten

- __version__ = "1.0.16-rc13"
+ __version__ = "1.0.17"
Binary file
nexaai/mlx_backend/vlm/generate_qwen3_vl.py CHANGED
@@ -41,7 +41,12 @@ except ImportError:
  from ml import ChatMessage
  from dataclasses import dataclass
  from typing import Any, Generator, List, Optional, Sequence, Tuple, Union
- from .generate import GenerationResult
+ from .generate import GenerationResult
+
+ # Custom exception for context length exceeded
+ class ContextLengthExceededError(Exception):
+ """Raised when input context length exceeds model's maximum context size"""
+ pass

  @dataclass
  class Qwen3VLBundledModel:
@@ -67,6 +72,7 @@ def load_qwen3_vl(

  Parameters are aligned with .generate.load for compatibility.
  """
+
  model_path = Path(path_or_repo)
  if not model_path.exists():
  if "/" in path_or_repo:
@@ -154,7 +160,6 @@ def load_qwen3_vl(
  if quantization_bits in [4, 8]:
  nn.quantize(llm_model, bits=quantization_bits, group_size=64,
  class_predicate=quant_predicate)
- # For f32 (32-bit), no quantization needed

  llm_model.load_weights(str(llm_weights_path), strict=True)

@@ -166,11 +171,15 @@ def load_qwen3_vl(

  def apply_chat_template_qwen3_vl(messages: Sequence[ChatMessage], num_images: int = 0, num_audios: int = 0, tools: Optional[str] = None, enable_thinking: bool = False) -> str:
  """Apply chat template: serialize messages with content as a list of typed items."""
+
  messages_dict = []
- for msg in messages:
+ for i, msg in enumerate(messages):
  content_items = [{"type": "text", "text": msg.content}]
  messages_dict.append({"role": msg.role, "content": content_items})
- return json.dumps(messages_dict)
+
+ result = json.dumps(messages_dict)
+
+ return result


  def stream_generate_qwen3_vl(
@@ -184,15 +193,22 @@ def stream_generate_qwen3_vl(

  ) -> Generator[Any, None, None]:
  """Stream generation yielding .generate.GenerationResult-compatible chunks."""
- messages = json.loads(prompt)
+
+ try:
+ messages = json.loads(prompt)
+ except json.JSONDecodeError as e:
+ raise
+
  if image is not None:
  image_list = image if isinstance(image, list) else [image]
  pil_images = []
- for p in image_list:
+ for i, p in enumerate(image_list):
  try:
- pil_images.append(Image.open(p))
- except Exception:
+ img = Image.open(p)
+ pil_images.append(img)
+ except Exception as e:
  continue
+
  contents = [{"type": "image", "image": img} for img in pil_images]
  if messages:
  if "content" not in messages[-1] or not isinstance(messages[-1]["content"], list):
@@ -201,6 +217,7 @@ def stream_generate_qwen3_vl(

  raw_text, processed_images = processor.messages_to_text(
  messages, add_generation_prompt=True)
+

  inputs = processor.text_to_input_ids(
  raw_text, images=processed_images, return_tensors="mlx")
@@ -208,10 +225,18 @@ def stream_generate_qwen3_vl(
  input_ids = inputs["input_ids"]
  pixel_values = inputs.get("pixel_values")
  image_grid_thw = inputs.get("image_grid_thw")
+
+
+ # Check if input context exceeds KV cache size and raise error
+ max_kv_size = 4096 # This should match the max_kv_size used in make_prompt_cache and nexa_generate_step
+ if input_ids.size > max_kv_size:
+ error_msg = f"Input context length ({input_ids.size} tokens) exceeds maximum supported context size ({max_kv_size} tokens). Please reduce the input length."
+ raise ContextLengthExceededError(error_msg)

  inputs_embeds, deepstack_visual_embeds, visual_pos_masks, cos, sin, rope_deltas = handle_multimodal_embeds(
  model.vision_model, model.llm_model, input_ids, pixel_values, image_grid_thw
  )
+

  prompt_cache = make_prompt_cache(model.llm_model, max_kv_size=4096)
  tokenizer = processor.tokenizer
@@ -222,37 +247,45 @@ def stream_generate_qwen3_vl(

  gen_count = 0
  tic = time.perf_counter()
+

- for token, logprobs in nexa_generate_step(
- model=model.llm_model,
- prompt=None,
- input_embeddings=inputs_embeds,
- max_tokens=max_tokens,
- max_kv_size=4096,
- prompt_cache=prompt_cache,
- visual_pos_masks=visual_pos_masks,
- deepstack_visual_embeds=deepstack_visual_embeds,
- cos=cos,
- sin=sin,
- rope_deltas=rope_deltas,
- ):
- if token == tokenizer.eos_token_id:
- break
-
- text_piece = tokenizer.decode([token])
- gen_count += 1
-
- yield GenerationResult(
- text=text_piece,
- token=token,
- logprobs=logprobs,
- prompt_tokens=int(input_ids.size),
- generation_tokens=gen_count,
- prompt_tps=float(prompt_tps),
- generation_tps=float(
- gen_count / max(1e-6, (time.perf_counter() - tic))),
- peak_memory=float(mx.get_peak_memory() / 1e9),
- )
+ try:
+ for token, logprobs in nexa_generate_step(
+ model=model.llm_model,
+ prompt=None,
+ input_embeddings=inputs_embeds,
+ max_tokens=max_tokens,
+ max_kv_size=4096,
+ prompt_cache=prompt_cache,
+ visual_pos_masks=visual_pos_masks,
+ deepstack_visual_embeds=deepstack_visual_embeds,
+ cos=cos,
+ sin=sin,
+ rope_deltas=rope_deltas,
+ ):
+ if token == tokenizer.eos_token_id:
+ break
+
+ text_piece = tokenizer.decode([token])
+ gen_count += 1
+
+ current_tps = gen_count / max(1e-6, (time.perf_counter() - tic))
+
+ yield GenerationResult(
+ text=text_piece,
+ token=token,
+ logprobs=logprobs,
+ prompt_tokens=int(input_ids.size),
+ generation_tokens=gen_count,
+ prompt_tps=float(prompt_tps),
+ generation_tps=float(current_tps),
+ peak_memory=float(mx.get_peak_memory() / 1e9),
+ )
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ raise
+

  def quant_predicate(path: str, mod: nn.Module) -> bool:
  """Quantization predicate to exclude certain layers from quantization."""
nexaai/mlx_backend/vlm/interface.py CHANGED
@@ -25,7 +25,7 @@ from profiling import ProfilingMixin, ProfilingData, StopReason

  # Import from the actual mlx_vlm structure
  from .generate import generate, stream_generate, load
- from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl
+ from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl, ContextLengthExceededError

  from .modeling.prompt_utils import apply_chat_template

@@ -80,6 +80,9 @@ class VLM(ProfilingMixin):

  # Init deafutl sampler config with defualt.
  self.sampler_config = SamplerConfig()
+
+ # Track global character position for incremental processing
+ self.global_n_past_chars = 0

  def destroy(self) -> None:
  """Destroy the model and free resources."""
@@ -89,6 +92,7 @@ class VLM(ProfilingMixin):
  def reset(self) -> None:
  """Reset the model state."""
  self._reset_cache()
+ self.global_n_past_chars = 0

  def _reset_cache(self) -> None:
  """Reset the KV cache."""
@@ -120,7 +124,7 @@ class VLM(ProfilingMixin):
  prompt: str,
  config: Optional[GenerationConfig] = None,
  ) -> GenerationResult:
- """Generate text from prompt."""
+ """Generate text from prompt."""
  # Start profiling
  self._start_profiling()

@@ -141,6 +145,19 @@ class VLM(ProfilingMixin):
  image_list = [str(path) for path in image_paths] if image_paths else None
  audio_list = [str(path) for path in audio_paths] if audio_paths else None

+ # Extract incremental portion of the prompt (similar to llama.cpp VLM)
+ full_prompt_len = len(prompt)
+ incremental_prompt = prompt
+
+ # Apply incremental processing only for non-qwen3vl models
+ # qwen3vl requires complete JSON conversation structure
+ if self.model_name != "qwen3vl":
+ if self.global_n_past_chars < full_prompt_len:
+ incremental_prompt = prompt[self.global_n_past_chars:]
+ else:
+ # No new text to process
+ incremental_prompt = ""
+
  # End prompt processing, start decode
  self._prompt_end()
  self._decode_start()
@@ -152,7 +169,7 @@ class VLM(ProfilingMixin):
  text, stats = generate(
  self.model,
  self.processor,
- prompt,
+ incremental_prompt, # Use incremental prompt instead of full prompt
  image=image_list,
  audio=audio_list,
  **gen_kwargs,
@@ -181,10 +198,16 @@ class VLM(ProfilingMixin):
  self._update_prompt_tokens(prompt_tokens)
  self._update_generated_tokens(generated_tokens)
  self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
+
+ # Update global character position (not needed for qwen3vl JSON processing)
+ if self.model_name != "qwen3vl":
+ old_pos = self.global_n_past_chars
+ self.global_n_past_chars = full_prompt_len + len(text)
+
  self._decode_end()
  self._end_profiling()

- return GenerationResult(
+ result = GenerationResult(
  text=text,
  prompt_tokens=prompt_tokens,
  generation_tokens=generated_tokens,
@@ -193,7 +216,18 @@ class VLM(ProfilingMixin):
  generation_tps=stats.get("generation_tps", 0.0),
  peak_memory=stats.get("peak_memory", 0.0),
  )
+
+ return result
+
+ except ContextLengthExceededError as e:
+ self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
+ self._decode_end()
+ self._end_profiling()
+ # Re-raise the original exception without wrapping it
+ raise e
  except Exception as e:
+ import traceback
+ traceback.print_exc()
  self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
  self._decode_end()
  self._end_profiling()
@@ -206,6 +240,7 @@ class VLM(ProfilingMixin):
  on_token: Optional[TokenCallback],
  ) -> GenerationResult:
  """Generate text with streaming callback. Unified method for both text and multimodal generation."""
+
  # Start profiling
  self._start_profiling()

@@ -218,6 +253,7 @@ class VLM(ProfilingMixin):
  if self.sampler_config is not None:
  gen_kwargs.update(self.sampler_config.__dict__)

+
  # Get image and audio paths from config
  image_paths = config.image_paths if config else None
  audio_paths = config.audio_paths if config else None
@@ -226,6 +262,21 @@ class VLM(ProfilingMixin):
  image_list = [str(path) for path in image_paths] if image_paths else None
  audio_list = [str(path) for path in audio_paths] if audio_paths else None

+
+ # Extract incremental portion of the prompt (similar to llama.cpp VLM)
+ full_prompt_len = len(prompt)
+ incremental_prompt = prompt
+
+
+ # Apply incremental processing only for non-qwen3vl models
+ # qwen3vl requires complete JSON conversation structure
+ if self.model_name != "qwen3vl":
+ if self.global_n_past_chars < full_prompt_len:
+ incremental_prompt = prompt[self.global_n_past_chars:]
+ else:
+ # No new text to process
+ incremental_prompt = ""
+
  # End prompt processing, start decode
  self._prompt_end()
  self._decode_start()
@@ -236,15 +287,19 @@ class VLM(ProfilingMixin):
  stream_generate_impl = stream_generate_qwen3_vl if self.model_name == "qwen3vl" else stream_generate

  try:
+ token_count = 0
+
  for result in stream_generate_impl(
  self.model,
  self.processor,
- prompt,
+ incremental_prompt, # Use incremental prompt instead of full prompt
  image=image_list,
  audio=audio_list,
  **gen_kwargs,
  ):
- # Record TTFT on first token
+ token_count += 1
+
+ # Record TTFT on first token
  if first_token:
  self._record_ttft()
  first_token = False
@@ -257,6 +312,7 @@ class VLM(ProfilingMixin):
  text += result.text
  last_result = result

+
  # Set stop reason if not user stop
  if self._profiling_context.stop_reason != StopReason.ML_STOP_REASON_USER:
  self._set_stop_reason(StopReason.ML_STOP_REASON_EOS)
@@ -266,10 +322,15 @@ class VLM(ProfilingMixin):
  self._update_prompt_tokens(last_result.prompt_tokens)
  self._update_generated_tokens(last_result.generation_tokens)

+ # Update global character position (not needed for qwen3vl JSON processing)
+ if self.model_name != "qwen3vl":
+ old_pos = self.global_n_past_chars
+ self.global_n_past_chars = full_prompt_len + len(text)
+
  self._decode_end()
  self._end_profiling()

- return GenerationResult(
+ result = GenerationResult(
  text=text,
  token=last_result.token if last_result else None,
  logprobs=last_result.logprobs if last_result else None,
@@ -280,7 +341,18 @@ class VLM(ProfilingMixin):
  generation_tps=last_result.generation_tps if last_result else 0.0,
  peak_memory=last_result.peak_memory if last_result else 0.0,
  )
+
+ return result
+
+ except ContextLengthExceededError as e:
+ self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
+ self._decode_end()
+ self._end_profiling()
+ # Re-raise the original exception without wrapping it
+ raise e
  except Exception as e:
+ import traceback
+ traceback.print_exc()
  self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
  self._decode_end()
  self._end_profiling()
nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py CHANGED
@@ -232,7 +232,7 @@ def generate_step(
  prompt_progress_callback(prompt_processed_tokens, total_prompt_tokens)
  prompt_processed_tokens += prefill_step_size
  y = y[prefill_step_size:]
- mx.metal.clear_cache()
+ mx.clear_cache()

  y, logprobs = _step(y)

@@ -249,7 +249,7 @@ def generate_step(
  break
  yield y.item(), logprobs
  if n % 256 == 0:
- mx.metal.clear_cache()
+ mx.clear_cache()
  y, logprobs = next_y, next_logprobs
  n += 1

@@ -371,7 +371,7 @@ def nexa_generate_step(
  prompt_progress_callback(prompt_processed_tokens, total_prompt_tokens)
  prompt_processed_tokens += prefill_step_size
  y = y[prefill_step_size:]
- mx.metal.clear_cache()
+ mx.clear_cache()

  y, logprobs = _step(y)

@@ -388,7 +388,7 @@ def nexa_generate_step(
  break
  yield y.item(), logprobs
  if n % 256 == 0:
- mx.metal.clear_cache()
+ mx.clear_cache()
  y, logprobs = next_y, next_logprobs
  n += 1

@@ -507,7 +507,7 @@ def nexa_multimodal_generate_step(
  prompt_progress_callback(prompt_processed_tokens, total_prompt_tokens)
  prompt_processed_tokens += prefill_step_size
  y = y[prefill_step_size:]
- mx.metal.clear_cache()
+ mx.clear_cache()

  y, logprobs = _step(y)

@@ -524,7 +524,7 @@ def nexa_multimodal_generate_step(
  break
  yield y.item(), logprobs
  if n % 256 == 0:
- mx.metal.clear_cache()
+ mx.clear_cache()
  y, logprobs = next_y, next_logprobs
  n += 1

@@ -632,7 +632,7 @@ def speculative_generate_step(
  quantize_cache_fn(cache)
  mx.eval([c.state for c in cache])
  y = y[prefill_step_size:]
- mx.metal.clear_cache()
+ mx.clear_cache()
  return y

  def _rewind_cache(num_draft, num_accept):
nexaai/utils/manifest_utils.py CHANGED
@@ -35,15 +35,8 @@ def process_manifest_metadata(manifest: Dict[str, Any], repo_id: str) -> Dict[st
  # Handle download_time - keep as null if missing
  download_time = manifest.get('download_time')

- # Handle avatar_url - fetch on-the-fly if missing/null
+ # Handle avatar_url - leave it null if missing/null
  avatar_url = manifest.get('avatar_url')
- if not avatar_url:
- try:
- from .avatar_fetcher import get_avatar_url_for_repo
- avatar_url = get_avatar_url_for_repo(repo_id)
- except Exception:
- # If fetching fails, leave as None
- avatar_url = None

  # Return processed metadata
  processed_manifest = manifest.copy()
@@ -171,9 +164,9 @@ def create_gguf_manifest(repo_id: str, files: List[str], directory_path: str, ol
  },
  "ExtraFiles": None,
  # Preserve old metadata fields
- "pipeline_tag": old_metadata.get('pipeline_tag'),
- "download_time": old_metadata.get('download_time'),
- "avatar_url": old_metadata.get('avatar_url')
+ "pipeline_tag": old_metadata.get('pipeline_tag') if old_metadata.get('pipeline_tag') else existing_manifest.get('pipeline_tag'),
+ "download_time": old_metadata.get('download_time') if old_metadata.get('download_time') else existing_manifest.get('download_time'),
+ "avatar_url": old_metadata.get('avatar_url') if old_metadata.get('avatar_url') else existing_manifest.get('avatar_url')
  }

  return manifest
@@ -182,6 +175,9 @@ def create_gguf_manifest(repo_id: str, files: List[str], directory_path: str, ol
  def create_mlx_manifest(repo_id: str, files: List[str], directory_path: str, old_metadata: Dict[str, Any], is_mmproj: bool = False, file_name: Optional[Union[str, List[str]]] = None) -> Dict[str, Any]:
  """Create MLX format manifest."""

+ # Load existing manifest to merge MLX files if it exists
+ existing_manifest = load_nexa_manifest(directory_path)
+
  model_files = {}
  extra_files = []

@@ -250,9 +246,9 @@ def create_mlx_manifest(repo_id: str, files: List[str], directory_path: str, old
  },
  "ExtraFiles": extra_files if extra_files else None,
  # Preserve old metadata fields
- "pipeline_tag": old_metadata.get('pipeline_tag'),
- "download_time": old_metadata.get('download_time'),
- "avatar_url": old_metadata.get('avatar_url')
+ "pipeline_tag": old_metadata.get('pipeline_tag') if old_metadata.get('pipeline_tag') else existing_manifest.get('pipeline_tag'),
+ "download_time": old_metadata.get('download_time') if old_metadata.get('download_time') else existing_manifest.get('download_time'),
+ "avatar_url": old_metadata.get('avatar_url') if old_metadata.get('avatar_url') else existing_manifest.get('avatar_url')
  }

  return manifest
nexaai/utils/model_manager.py CHANGED
@@ -11,7 +11,6 @@ from huggingface_hub import HfApi
  from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError

  from .progress_tracker import CustomProgressTqdm, DownloadProgressTracker
- from .avatar_fetcher import get_avatar_url_for_repo
  from .manifest_utils import (
  load_download_metadata,
  save_download_metadata,
@@ -790,7 +789,7 @@ class HuggingFaceDownloader:
  # If no expected size, just check that file is not empty
  return os.path.getsize(file_path) > 0

- def _fetch_and_save_metadata(self, repo_id: str, local_dir: str, is_mmproj: bool = False, file_name: Optional[Union[str, List[str]]] = None) -> None:
+ def _fetch_and_save_metadata(self, repo_id: str, local_dir: str, is_mmproj: bool = False, file_name: Optional[Union[str, List[str]]] = None, **kwargs) -> None:
  """Fetch model info and save metadata after successful download."""
  # Initialize metadata with defaults to ensure manifest is always created
  old_metadata = {
@@ -809,14 +808,9 @@ class HuggingFaceDownloader:
  # Log the error but continue with manifest creation
  print(f"Warning: Could not fetch model info for {repo_id}: {e}")

- try:
- # Get avatar URL
- avatar_url = get_avatar_url_for_repo(repo_id, custom_endpoint=self.endpoint)
- if avatar_url:
- old_metadata['avatar_url'] = avatar_url
- except Exception as e:
- # Log the error but continue with manifest creation
- print(f"Warning: Could not fetch avatar URL for {repo_id}: {e}")
+ # Use input avater url if provided
+ old_metadata['avatar_url'] = kwargs.get('avatar_url')
+

  # CRITICAL: Always create the manifest file, regardless of metadata fetch failures
  try:
@@ -850,7 +844,8 @@ class HuggingFaceDownloader:
  file_name: str,
  local_dir: str,
  progress_tracker: Optional[DownloadProgressTracker],
- force_download: bool = False
+ force_download: bool = False,
+ **kwargs
  ) -> str:
  """Download a single file from the repository using HuggingFace Hub API."""
  # Create repo-specific directory for the single file
@@ -882,7 +877,7 @@ class HuggingFaceDownloader:
  progress_tracker.stop_tracking()

  # Save metadata after successful download
- self._fetch_and_save_metadata(repo_id, file_local_dir, self._current_is_mmproj, self._current_file_name)
+ self._fetch_and_save_metadata(repo_id, file_local_dir, self._current_is_mmproj, self._current_file_name, **kwargs)

  return downloaded_path

@@ -901,7 +896,8 @@ class HuggingFaceDownloader:
  repo_id: str,
  local_dir: str,
  progress_tracker: Optional[DownloadProgressTracker],
- force_download: bool = False
+ force_download: bool = False,
+ **kwargs
  ) -> str:
  """Download the entire repository."""
  # Create a subdirectory for this specific repo
@@ -927,7 +923,7 @@ class HuggingFaceDownloader:
  progress_tracker.stop_tracking()

  # Save metadata after successful download
- self._fetch_and_save_metadata(repo_id, repo_local_dir, self._current_is_mmproj, self._current_file_name)
+ self._fetch_and_save_metadata(repo_id, repo_local_dir, self._current_is_mmproj, self._current_file_name, **kwargs)

  return downloaded_path

@@ -944,7 +940,8 @@ class HuggingFaceDownloader:
  file_names: List[str],
  local_dir: str,
  progress_tracker: Optional[DownloadProgressTracker],
- force_download: bool = False
+ force_download: bool = False,
+ **kwargs
  ) -> str:
  """Download multiple specific files from HuggingFace Hub."""
  # Create repo-specific directory
@@ -989,7 +986,7 @@ class HuggingFaceDownloader:
  progress_tracker.stop_tracking()

  # Save metadata after successful download
- self._fetch_and_save_metadata(repo_id, repo_local_dir, self._current_is_mmproj, self._current_file_name)
+ self._fetch_and_save_metadata(repo_id, repo_local_dir, self._current_is_mmproj, self._current_file_name, **kwargs)

  return repo_local_dir

@@ -1015,7 +1012,8 @@ class HuggingFaceDownloader:
  progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
  show_progress: bool = True,
  force_download: bool = False,
- is_mmproj: bool = False
+ is_mmproj: bool = False,
+ **kwargs
  ) -> str:
  """
  Main download method that handles all download scenarios.
@@ -1062,13 +1060,13 @@ class HuggingFaceDownloader:
  if file_name is None:
  # Download entire repository
  return self._download_entire_repository(
- repo_id, local_dir, progress_tracker, force_download
+ repo_id, local_dir, progress_tracker, force_download, **kwargs
  )
  elif isinstance(file_name, str):
  # Download specific single file
  self._validate_file_exists_in_repo(file_name, info, repo_id, progress_tracker)
  return self._download_single_file(
- repo_id, file_name, local_dir, progress_tracker, force_download
+ repo_id, file_name, local_dir, progress_tracker, force_download, **kwargs
  )
  else: # file_name is a list
  # Download multiple specific files
@@ -1077,7 +1075,7 @@ class HuggingFaceDownloader:
  self._validate_file_exists_in_repo(fname, info, repo_id, progress_tracker)

  return self._download_multiple_files_from_hf(
- repo_id, file_name, local_dir, progress_tracker, force_download
+ repo_id, file_name, local_dir, progress_tracker, force_download, **kwargs
  )

  except Exception as e:
@@ -1107,7 +1105,8 @@ def download_from_huggingface(
  token: Union[bool, str, None] = None,
  custom_endpoint: Optional[str] = None,
  force_download: bool = False,
- is_mmproj: Optional[bool] = None
+ is_mmproj: Optional[bool] = None,
+ **kwargs
  ) -> str:
  """
  Download models or files from HuggingFace Hub or custom mirror endpoints.
@@ -1197,7 +1196,8 @@ def download_from_huggingface(
  progress_callback=progress_callback,
  show_progress=show_progress,
  force_download=force_download,
- is_mmproj=is_mmproj
+ is_mmproj=is_mmproj,
+ **kwargs
  )


@@ -1211,7 +1211,8 @@ def _download_model_if_needed(
  param_name: str,
  progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
  token: Union[bool, str, None] = None,
- is_mmproj: bool = False
+ is_mmproj: bool = False,
+ **kwargs
  ) -> str:
  """
  Helper function to download a model from HuggingFace if it doesn't exist locally.
@@ -1247,7 +1248,8 @@ def _download_model_if_needed(
  progress_callback=progress_callback,
  show_progress=True,
  token=token,
- is_mmproj=is_mmproj
+ is_mmproj=is_mmproj,
+ **kwargs
  )

  return downloaded_path
@@ -1320,7 +1322,7 @@ def auto_download_model(func: Callable) -> Callable:
  if name_or_path is not None:
  try:
  downloaded_name_path = _download_model_if_needed(
- name_or_path, 'name_or_path', progress_callback, token
+ name_or_path, 'name_or_path', progress_callback, token, **kwargs
  )

  # Replace name_or_path with downloaded path
@@ -1338,7 +1340,7 @@ def auto_download_model(func: Callable) -> Callable:
  if mmproj_path is not None:
  try:
  downloaded_mmproj_path = _download_model_if_needed(
- mmproj_path, 'mmproj_path', progress_callback, token, is_mmproj=True
+ mmproj_path, 'mmproj_path', progress_callback, token, is_mmproj=True, **kwargs
  )

  # Replace mmproj_path with downloaded path
nexaai-1.0.16rc13.dist-info/METADATA → nexaai-1.0.17.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nexaai
- Version: 1.0.16rc13
+ Version: 1.0.17
  Summary: Python bindings for NexaSDK C-lib backend
  Author-email: "Nexa AI, Inc." <dev@nexa.ai>
  Project-URL: Homepage, https://github.com/NexaAI/nexasdk-bridge
nexaai-1.0.16rc13.dist-info/RECORD → nexaai-1.0.17.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
  nexaai/__init__.py,sha256=L8oB7GFZZMGnUpCg0PecDbI_ycKuQak-ZEJ4Y12_QIw,2184
- nexaai/_stub.cpython-310-darwin.so,sha256=Bd-r6O9pG8m0SL3rkS3PQF8Z1ie_WD7uqaFPRyitb9E,49832
- nexaai/_version.py,sha256=w1I23pLkLt0xrD0hMhWK5fW9rqbLqnW1ii4yYp9UCTo,144
+ nexaai/_stub.cpython-310-darwin.so,sha256=HjqUYc8SyajzyySZk1eBJdO7Rc_db2F-kS3KdPSPB5o,49832
+ nexaai/_version.py,sha256=eaXF_gF6uNVz9AglXCAwIyseTDCCAGEhr3CCnSfr3tY,139
  nexaai/asr.py,sha256=NljMXDErwPNMOPaRkJZMEDka9Nk8xyur7L8i924TStY,2054
  nexaai/base.py,sha256=N8PRgDFA-XPku2vWnQIofQ7ipz3pPlO6f8YZGnuhquE,982
  nexaai/common.py,sha256=Y0NJNLTi4Nq4x1WL6PQsSvGUto0eGmWhjpsC6jcekfA,3444
@@ -17,9 +17,9 @@ nexaai/asr_impl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nexaai/asr_impl/mlx_asr_impl.py,sha256=eosd8-TIWAOwV0HltmoFrLwzXHcU4jyxtncvuZE9pgA,3257
  nexaai/asr_impl/pybind_asr_impl.py,sha256=pE9Hb_hMi5yAc4MF83bLVOb8zDtreCkB3_u7XED9YpA,1516
  nexaai/binds/__init__.py,sha256=eYuay_8DDXeOUWz2_R9HFSabohxs6hvZn391t2L0Po0,104
- nexaai/binds/common_bind.cpython-310-darwin.so,sha256=km1TU5WOJHVjvyM4l5mgAkS_omxuKt8pM92E9Wv0VqM,235488
+ nexaai/binds/common_bind.cpython-310-darwin.so,sha256=BoXByRlNGDaNS1YyZyCF-s7h0vXP9NLPlJMQQ5pqusU,235488
  nexaai/binds/embedder_bind.cpython-310-darwin.so,sha256=b2NoXFAJvPLi_P1X7lXLKmAUU0v2HJI3Zwa10gfqHdw,202032
- nexaai/binds/libnexa_bridge.dylib,sha256=SLP_DHAJeSl5gJMSs2fZtPLv-VgNyojZTK0auqDXSpo,250408
+ nexaai/binds/libnexa_bridge.dylib,sha256=e6uFx8ENEdCWk8whKyoVvX-e9-Bk_35kqIDV3kRDuXU,250408
  nexaai/binds/llm_bind.cpython-310-darwin.so,sha256=p1ZTGMolEkWywkmwzOUjTr3RpSEH21BHZAggVzo89Ks,183088
  nexaai/binds/vlm_bind.cpython-310-darwin.so,sha256=LGd-tykePnQFfGca25HnPIBfXsfrMzbwyx6d5Ld3xps,183000
  nexaai/binds/nexa_llama_cpp/libggml-base.dylib,sha256=GyOkHOM-5uHp7NUZ4Sr9BWak6BYpcc9aqI9A-zPnQp4,629528
@@ -246,8 +246,8 @@ nexaai/mlx_backend/tts/__init__.py,sha256=fuT_9_xpYJ28m4yjly5L2jChUrzlSQz-b_S7nu
  nexaai/mlx_backend/tts/interface.py,sha256=0FvZbIyOvg8jERZEQ6bygbv7v02O9xHO4-TPUlar0b4,9568
  nexaai/mlx_backend/vlm/__init__.py,sha256=_25kvMEviX16Hg3bro8Ws70V0eeIEqYKV8ZDXqYzKew,73
  nexaai/mlx_backend/vlm/generate.py,sha256=DqHFEAuqk-nko8ho6U9GAXTDAWz4d8GTe_hCt-XFyCw,19071
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py,sha256=undjso1mfxqpd6FMTksSA5qagRttxAGbOBj1x7cqI1s,9211
- nexaai/mlx_backend/vlm/interface.py,sha256=vFTzJCbqq55ybv_tbDBC9NVn1_sXgCfqXdsV-3ia8vo,16177
+ nexaai/mlx_backend/vlm/generate_qwen3_vl.py,sha256=eeizW18u6dHPZOOnJtQUJkiqMAIIpOSS-IOjacXGsz4,10240
+ nexaai/mlx_backend/vlm/interface.py,sha256=HOPzWNMs6QaHO6x0Z83kW1xkRRmb8_xo6xQLKsOWqAo,19013
  nexaai/mlx_backend/vlm/main.py,sha256=nPcg25jupeDD74uvRoxpWp3Dsulw7WddI7vll6zejak,10664
  nexaai/mlx_backend/vlm/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nexaai/mlx_backend/vlm/modeling/convert.py,sha256=ia5i9cgTufFGmKyhkYUaW0nfNqT_bMo8i-Hg_zy5JC4,1863
@@ -362,7 +362,7 @@ nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py,sha256=LArnNtI98B_GJO
  nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py,sha256=4RlZwgz8YX2ngmJNaymxFFpw9hJu-0EMw9xwXpngW9o,3496
  nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py,sha256=NMOB6x-RT6svF4H-Ymo5WqnP7ptAal3aaKjWZXWGMsM,17671
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py,sha256=Mw7Btz0_t7erQOrfWzCXT-ktEwZl61OODcmDMIo3VS0,26719
+ nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py,sha256=bchCpnlewysWQss5TQKxdKPXYd5VA7ySUDfRt8Xj_H4,26677
  nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py,sha256=ty0dA3SsEUFtFbHo16tKdnKymrNKKsUO3KMYapMajbY,8704
  nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py,sha256=8SEeVwgjuvaYy-4ALAU0RHQMuRr2k7EkXba_csxk498,10673
  nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py,sha256=Gqanx4hBDcon_k5ClhUsS4YpMbZNiee8jvImGS9h43s,13229
@@ -378,17 +378,16 @@ nexaai/rerank_impl/pybind_rerank_impl.py,sha256=CtwkG7YrW58GPMDERJSnISGTVCXWNju5
  nexaai/tts_impl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nexaai/tts_impl/mlx_tts_impl.py,sha256=i_uNPdvlXYtL3e01oKjDlP9jgkWCRt1bBHsExaaiJi8,3101
  nexaai/tts_impl/pybind_tts_impl.py,sha256=mpn44r6pfYLIl-NrEy2dXHjGtWtNCmM7HRyxiANxUI4,1444
- nexaai/utils/avatar_fetcher.py,sha256=bWy8ujgbOiTHFCjFxTwkn3uXbZ84PgEGUkXkR3MH4bI,3821
  nexaai/utils/decode.py,sha256=61n4Zf6c5QLyqGoctEitlI9BX3tPlP2a5aaKNHbw3T4,404
- nexaai/utils/manifest_utils.py,sha256=sR9Nme4GbD3Cb3fMd55yLvGZpqxb71vd6b2XZTsrIGM,12328
- nexaai/utils/model_manager.py,sha256=p2kJKK63Zk-rEUucFsgY0T5PyXi_IvJY0gKewUVcAV4,56081
+ nexaai/utils/manifest_utils.py,sha256=PA84obFP7W1dlneURlIHIzJjWIF5dbDHGdNeHouUy68,12659
+ nexaai/utils/model_manager.py,sha256=_WKJP7YVk7q587OoOWwDNWVR-8tbKZkmHKjcCZN8Q4M,55979
  nexaai/utils/model_types.py,sha256=-DER8L4lAUR_iLS99F0r57avwqWtuN21ug5pX2p24_E,1369
  nexaai/utils/progress_tracker.py,sha256=jdUqtmPqyhwC9uSKvQcJEYETwSt-OhP4oitdJ94614o,15394
  nexaai/utils/quantization_utils.py,sha256=FYcNSAKGlBqFDUTx3jSKOr2lnq4nyiyC0ZG8oSxFwiU,7825
  nexaai/vlm_impl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nexaai/vlm_impl/mlx_vlm_impl.py,sha256=pLtWm_ckz8a0U-AtAOMVseFDO4OVPvHyYO2KlfBaGYk,10833
  nexaai/vlm_impl/pybind_vlm_impl.py,sha256=FAbhpRJzHgI78r0mUvKybO97R1szvNhH0aTn_I52oT4,8597
- nexaai-1.0.16rc13.dist-info/METADATA,sha256=eqPLK_7JBryWiB7qvdppmdEoHd42jZohyBHi0j1Lges,1202
- nexaai-1.0.16rc13.dist-info/WHEEL,sha256=0KYp5feZ1CMUhsfFXKpSQTbSmQbXy4mv6yPPVBXg2EM,110
- nexaai-1.0.16rc13.dist-info/top_level.txt,sha256=LRE2YERlrZk2vfuygnSzsEeqSknnZbz3Z1MHyNmBU4w,7
- nexaai-1.0.16rc13.dist-info/RECORD,,
+ nexaai-1.0.17.dist-info/METADATA,sha256=BMYxa8SkZYJx_zRraC8kS32fkBpXFsrKthZBJxISykc,1198
+ nexaai-1.0.17.dist-info/WHEEL,sha256=0KYp5feZ1CMUhsfFXKpSQTbSmQbXy4mv6yPPVBXg2EM,110
+ nexaai-1.0.17.dist-info/top_level.txt,sha256=LRE2YERlrZk2vfuygnSzsEeqSknnZbz3Z1MHyNmBU4w,7
+ nexaai-1.0.17.dist-info/RECORD,,
nexaai/utils/avatar_fetcher.py DELETED
@@ -1,104 +0,0 @@
- """Utility for fetching avatar URLs from HuggingFace."""
-
- import logging
- from typing import Dict, Optional
- import httpx
-
- logger = logging.getLogger(__name__)
-
-
- def fetch_avatar_urls_from_hf_api(query: str, custom_endpoint: Optional[str] = None) -> Dict[str, str]:
- """
- Fetch avatar URLs from HuggingFace models-json endpoint.
-
- Args:
- query: Search query to fetch models for
- custom_endpoint: Optional custom HuggingFace endpoint
-
- Returns:
- Dictionary mapping author names to avatar URLs
- """
- avatar_map = {}
- try:
- # Use the base URL from the configured endpoint
- base_url = custom_endpoint if custom_endpoint else "https://huggingface.co"
-
- # Build the URL with query parameter
- url = f"{base_url}/models-json?sort=trending&search={query}&withCount=true"
-
- # Make the HTTP request with a timeout
- with httpx.Client(timeout=2.0) as client:
- response = client.get(url)
-
- if response.status_code == 200:
- data = response.json()
- models = data.get("models", [])
-
- # Build a map of author names to avatar URLs
- for model in models:
- author = model.get("author")
- author_data = model.get("authorData", {})
- avatar_url = author_data.get("avatarUrl")
-
- if author and avatar_url:
- # Handle relative URLs by prepending appropriate base URL
- if avatar_url.startswith("/"):
- avatar_url = f"{base_url}{avatar_url}"
- avatar_map[author] = avatar_url
-
- logger.debug(f"Fetched {len(avatar_map)} avatar URLs from HuggingFace API")
- else:
- logger.warning(f"Failed to fetch avatar URLs: HTTP {response.status_code}")
-
- except Exception as e:
- logger.warning(f"Error fetching avatar URLs from HuggingFace API: {e}")
- # Return empty map on error - we'll fall back to default behavior
-
- return avatar_map
-
-
- def get_avatar_url_for_repo(repo_id: str, search_query: Optional[str] = None,
- custom_endpoint: Optional[str] = None) -> Optional[str]:
- """
- Get avatar URL for a repository ID.
-
- This method tries multiple strategies:
- 1. If search_query is provided, fetch from HuggingFace API with that query
- 2. Try fetching with the full repo_id as query
- 3. Try fetching with just the organization name as query
- 4. Fall back to CDN URL pattern
-
- Args:
- repo_id: Repository ID in format "owner/repo"
- search_query: Optional search query to use for fetching avatars
- custom_endpoint: Optional custom HuggingFace endpoint
-
- Returns:
- Avatar URL or None if not found
- """
- if "/" not in repo_id:
- return None
-
- org_name = repo_id.split("/")[0]
-
- # Try with search query if provided
- if search_query:
- avatar_map = fetch_avatar_urls_from_hf_api(search_query, custom_endpoint)
- avatar_url = avatar_map.get(org_name)
- if avatar_url:
- return avatar_url
-
- # Try with full repo_id
- avatar_map = fetch_avatar_urls_from_hf_api(repo_id, custom_endpoint)
- avatar_url = avatar_map.get(org_name)
- if avatar_url:
- return avatar_url
-
- # Try with just organization name
- avatar_map = fetch_avatar_urls_from_hf_api(org_name, custom_endpoint)
- avatar_url = avatar_map.get(org_name)
- if avatar_url:
- return avatar_url
-
- # Fallback to CDN URL pattern
- return f"https://cdn-thumbnails.huggingface.co/social-thumbnails/{org_name}.png"