nexaai 1.0.16rc14__cp310-cp310-macosx_13_0_x86_64.whl → 1.0.17__cp310-cp310-macosx_13_0_x86_64.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of nexaai might be problematic.

Binary file
nexaai/_version.py CHANGED
@@ -1,4 +1,4 @@
 # This file is generated by CMake from _version.py.in
 # Do not modify this file manually - it will be overwritten
 
-__version__ = "1.0.16-rc14"
+__version__ = "1.0.17"
Binary file
nexaai/mlx_backend/vlm/generate_qwen3_vl.py CHANGED

@@ -41,7 +41,12 @@ except ImportError:
 from ml import ChatMessage
 from dataclasses import dataclass
 from typing import Any, Generator, List, Optional, Sequence, Tuple, Union
-from .generate import GenerationResult
+from .generate import GenerationResult
+
+# Custom exception for context length exceeded
+class ContextLengthExceededError(Exception):
+    """Raised when input context length exceeds model's maximum context size"""
+    pass
 
 @dataclass
 class Qwen3VLBundledModel:
@@ -67,6 +72,7 @@ def load_qwen3_vl(
 
     Parameters are aligned with .generate.load for compatibility.
     """
+
     model_path = Path(path_or_repo)
     if not model_path.exists():
         if "/" in path_or_repo:
@@ -154,7 +160,6 @@ def load_qwen3_vl(
     if quantization_bits in [4, 8]:
         nn.quantize(llm_model, bits=quantization_bits, group_size=64,
                     class_predicate=quant_predicate)
-    # For f32 (32-bit), no quantization needed
 
     llm_model.load_weights(str(llm_weights_path), strict=True)
 
@@ -166,11 +171,15 @@ def load_qwen3_vl(
 
 def apply_chat_template_qwen3_vl(messages: Sequence[ChatMessage], num_images: int = 0, num_audios: int = 0, tools: Optional[str] = None, enable_thinking: bool = False) -> str:
     """Apply chat template: serialize messages with content as a list of typed items."""
+
     messages_dict = []
-    for msg in messages:
+    for i, msg in enumerate(messages):
         content_items = [{"type": "text", "text": msg.content}]
         messages_dict.append({"role": msg.role, "content": content_items})
-    return json.dumps(messages_dict)
+
+    result = json.dumps(messages_dict)
+
+    return result
 
 
 def stream_generate_qwen3_vl(
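For reference, here is a minimal sketch of the JSON string this template produces for a one-turn conversation. The ChatMessage stand-in below is hypothetical; only the role and content attributes read in the hunk above are assumed:

    import json

    # Hypothetical stand-in for ml.ChatMessage; the real class may differ,
    # but only .role and .content are read by apply_chat_template_qwen3_vl.
    class ChatMessage:
        def __init__(self, role, content):
            self.role = role
            self.content = content

    messages = [ChatMessage(role="user", content="Describe this image.")]

    # Mirrors the serialization loop above: each message's text becomes a typed item.
    payload = [{"role": m.role, "content": [{"type": "text", "text": m.content}]}
               for m in messages]
    print(json.dumps(payload))
    # [{"role": "user", "content": [{"type": "text", "text": "Describe this image."}]}]

stream_generate_qwen3_vl later parses this string back with json.loads, which is why the prompt handed to it must be the complete JSON conversation rather than an incremental text fragment.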
@@ -184,15 +193,22 @@ def stream_generate_qwen3_vl(
 
 ) -> Generator[Any, None, None]:
     """Stream generation yielding .generate.GenerationResult-compatible chunks."""
-    messages = json.loads(prompt)
+
+    try:
+        messages = json.loads(prompt)
+    except json.JSONDecodeError as e:
+        raise
+
     if image is not None:
         image_list = image if isinstance(image, list) else [image]
         pil_images = []
-        for p in image_list:
+        for i, p in enumerate(image_list):
             try:
-                pil_images.append(Image.open(p))
-            except Exception:
+                img = Image.open(p)
+                pil_images.append(img)
+            except Exception as e:
                 continue
+
         contents = [{"type": "image", "image": img} for img in pil_images]
         if messages:
             if "content" not in messages[-1] or not isinstance(messages[-1]["content"], list):
@@ -201,6 +217,7 @@ def stream_generate_qwen3_vl(
 
     raw_text, processed_images = processor.messages_to_text(
         messages, add_generation_prompt=True)
+
 
     inputs = processor.text_to_input_ids(
         raw_text, images=processed_images, return_tensors="mlx")
@@ -208,10 +225,18 @@
     input_ids = inputs["input_ids"]
     pixel_values = inputs.get("pixel_values")
    image_grid_thw = inputs.get("image_grid_thw")
+
+
+    # Check if input context exceeds KV cache size and raise error
+    max_kv_size = 4096  # This should match the max_kv_size used in make_prompt_cache and nexa_generate_step
+    if input_ids.size > max_kv_size:
+        error_msg = f"Input context length ({input_ids.size} tokens) exceeds maximum supported context size ({max_kv_size} tokens). Please reduce the input length."
+        raise ContextLengthExceededError(error_msg)
 
     inputs_embeds, deepstack_visual_embeds, visual_pos_masks, cos, sin, rope_deltas = handle_multimodal_embeds(
         model.vision_model, model.llm_model, input_ids, pixel_values, image_grid_thw
     )
+
 
     prompt_cache = make_prompt_cache(model.llm_model, max_kv_size=4096)
     tokenizer = processor.tokenizer
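Because this check runs before any tokens are generated, callers can trap an over-long prompt separately from other failures. A minimal sketch of caller-side handling, assuming the module is importable as nexaai.mlx_backend.vlm.generate_qwen3_vl and that the bundle, processor, and prompt come from load_qwen3_vl and apply_chat_template_qwen3_vl; the argument order mirrors the call site in interface.py but is otherwise illustrative:

    from nexaai.mlx_backend.vlm.generate_qwen3_vl import (
        ContextLengthExceededError,
        stream_generate_qwen3_vl,
    )

    def run_stream(bundle, processor, prompt, image_path):
        # bundle/processor/prompt are assumed to be prepared elsewhere; this
        # sketch only illustrates handling the new exception.
        try:
            for chunk in stream_generate_qwen3_vl(bundle, processor, prompt,
                                                  image=[image_path]):
                print(chunk.text, end="", flush=True)
        except ContextLengthExceededError as exc:
            # Raised before generation when the tokenized input would not fit
            # into the 4096-token KV cache set up above.
            print(f"\nPrompt too long: {exc}")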
@@ -222,37 +247,45 @@
 
     gen_count = 0
     tic = time.perf_counter()
+
 
-    for token, logprobs in nexa_generate_step(
-        model=model.llm_model,
-        prompt=None,
-        input_embeddings=inputs_embeds,
-        max_tokens=max_tokens,
-        max_kv_size=4096,
-        prompt_cache=prompt_cache,
-        visual_pos_masks=visual_pos_masks,
-        deepstack_visual_embeds=deepstack_visual_embeds,
-        cos=cos,
-        sin=sin,
-        rope_deltas=rope_deltas,
-    ):
-        if token == tokenizer.eos_token_id:
-            break
-
-        text_piece = tokenizer.decode([token])
-        gen_count += 1
-
-        yield GenerationResult(
-            text=text_piece,
-            token=token,
-            logprobs=logprobs,
-            prompt_tokens=int(input_ids.size),
-            generation_tokens=gen_count,
-            prompt_tps=float(prompt_tps),
-            generation_tps=float(
-                gen_count / max(1e-6, (time.perf_counter() - tic))),
-            peak_memory=float(mx.get_peak_memory() / 1e9),
-        )
+    try:
+        for token, logprobs in nexa_generate_step(
+            model=model.llm_model,
+            prompt=None,
+            input_embeddings=inputs_embeds,
+            max_tokens=max_tokens,
+            max_kv_size=4096,
+            prompt_cache=prompt_cache,
+            visual_pos_masks=visual_pos_masks,
+            deepstack_visual_embeds=deepstack_visual_embeds,
+            cos=cos,
+            sin=sin,
+            rope_deltas=rope_deltas,
+        ):
+            if token == tokenizer.eos_token_id:
+                break
+
+            text_piece = tokenizer.decode([token])
+            gen_count += 1
+
+            current_tps = gen_count / max(1e-6, (time.perf_counter() - tic))
+
+            yield GenerationResult(
+                text=text_piece,
+                token=token,
+                logprobs=logprobs,
+                prompt_tokens=int(input_ids.size),
+                generation_tokens=gen_count,
+                prompt_tps=float(prompt_tps),
+                generation_tps=float(current_tps),
+                peak_memory=float(mx.get_peak_memory() / 1e9),
+            )
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        raise
+
 
 def quant_predicate(path: str, mod: nn.Module) -> bool:
     """Quantization predicate to exclude certain layers from quantization."""
nexaai/mlx_backend/vlm/interface.py CHANGED

@@ -25,7 +25,7 @@ from profiling import ProfilingMixin, ProfilingData, StopReason
 
 # Import from the actual mlx_vlm structure
 from .generate import generate, stream_generate, load
-from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl
+from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl, ContextLengthExceededError
 
 from .modeling.prompt_utils import apply_chat_template
 
@@ -124,7 +124,7 @@ class VLM(ProfilingMixin):
         prompt: str,
         config: Optional[GenerationConfig] = None,
     ) -> GenerationResult:
-        """Generate text from prompt."""
+        """Generate text from prompt."""
         # Start profiling
         self._start_profiling()
 
@@ -148,12 +151,15 @@ class VLM(ProfilingMixin):
         # Extract incremental portion of the prompt (similar to llama.cpp VLM)
         full_prompt_len = len(prompt)
         incremental_prompt = prompt
-
-        if self.global_n_past_chars < full_prompt_len:
-            incremental_prompt = prompt[self.global_n_past_chars:]
-        else:
-            # No new text to process
-            incremental_prompt = ""
+
+        # Apply incremental processing only for non-qwen3vl models
+        # qwen3vl requires complete JSON conversation structure
+        if self.model_name != "qwen3vl":
+            if self.global_n_past_chars < full_prompt_len:
+                incremental_prompt = prompt[self.global_n_past_chars:]
+            else:
+                # No new text to process
+                incremental_prompt = ""
 
         # End prompt processing, start decode
         self._prompt_end()
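A standalone sketch of the character-offset bookkeeping that is now skipped for qwen3vl: global_n_past_chars records how much of the cumulative transcript (prompt plus previously generated text, as updated further down) has already been fed to the model, so only the new suffix is reprocessed. The transcript strings below are invented for illustration:

    def incremental_slice(prompt: str, global_n_past_chars: int) -> str:
        # Return only the portion of the transcript that has not been processed yet.
        if global_n_past_chars < len(prompt):
            return prompt[global_n_past_chars:]
        return ""  # no new text to process

    turn1 = "User: hi\nAssistant:"
    reply1 = " hello"
    # After turn 1 the tracked offset is len(prompt) + len(generated text).
    offset = len(turn1) + len(reply1)

    turn2 = turn1 + reply1 + "\nUser: bye\nAssistant:"
    assert incremental_slice(turn2, offset) == "\nUser: bye\nAssistant:"

For qwen3vl this slicing would hand apply_chat_template_qwen3_vl a fragment of JSON, which is why the branch above bypasses it and always sends the complete serialized conversation.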
@@ -196,13 +199,15 @@ class VLM(ProfilingMixin):
             self._update_generated_tokens(generated_tokens)
             self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
 
-            # Update global character position
-            self.global_n_past_chars = full_prompt_len + len(text)
+            # Update global character position (not needed for qwen3vl JSON processing)
+            if self.model_name != "qwen3vl":
+                old_pos = self.global_n_past_chars
+                self.global_n_past_chars = full_prompt_len + len(text)
 
             self._decode_end()
             self._end_profiling()
 
-            return GenerationResult(
+            result = GenerationResult(
                 text=text,
                 prompt_tokens=prompt_tokens,
                 generation_tokens=generated_tokens,
@@ -211,7 +216,18 @@ class VLM(ProfilingMixin):
                 generation_tps=stats.get("generation_tps", 0.0),
                 peak_memory=stats.get("peak_memory", 0.0),
             )
+
+            return result
+
+        except ContextLengthExceededError as e:
+            self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
+            self._decode_end()
+            self._end_profiling()
+            # Re-raise the original exception without wrapping it
+            raise e
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
             self._decode_end()
             self._end_profiling()
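With this change a too-long prompt reaches the caller as ContextLengthExceededError instead of being folded into the generic failure path, while profiling is still closed out first. A minimal caller-side sketch; the method name generate is assumed from the signature and docstring shown earlier in this diff, and the VLM construction and GenerationConfig contents happen elsewhere:

    from nexaai.mlx_backend.vlm.generate_qwen3_vl import ContextLengthExceededError

    def generate_or_report(vlm, prompt, config):
        # vlm is assumed to be an initialized VLM and config a GenerationConfig.
        try:
            return vlm.generate(prompt, config)
        except ContextLengthExceededError as exc:
            # Surface an actionable message instead of an opaque failure.
            print(f"Input exceeds the model's context window: {exc}")
            return None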
@@ -224,6 +240,7 @@ class VLM(ProfilingMixin):
         on_token: Optional[TokenCallback],
     ) -> GenerationResult:
         """Generate text with streaming callback. Unified method for both text and multimodal generation."""
+
         # Start profiling
         self._start_profiling()
 
@@ -236,6 +253,7 @@ class VLM(ProfilingMixin):
         if self.sampler_config is not None:
             gen_kwargs.update(self.sampler_config.__dict__)
 
+
         # Get image and audio paths from config
         image_paths = config.image_paths if config else None
         audio_paths = config.audio_paths if config else None
@@ -244,15 +262,20 @@ class VLM(ProfilingMixin):
         image_list = [str(path) for path in image_paths] if image_paths else None
         audio_list = [str(path) for path in audio_paths] if audio_paths else None
 
+
         # Extract incremental portion of the prompt (similar to llama.cpp VLM)
         full_prompt_len = len(prompt)
         incremental_prompt = prompt
 
-        if self.global_n_past_chars < full_prompt_len:
-            incremental_prompt = prompt[self.global_n_past_chars:]
-        else:
-            # No new text to process
-            incremental_prompt = ""
+
+        # Apply incremental processing only for non-qwen3vl models
+        # qwen3vl requires complete JSON conversation structure
+        if self.model_name != "qwen3vl":
+            if self.global_n_past_chars < full_prompt_len:
+                incremental_prompt = prompt[self.global_n_past_chars:]
+            else:
+                # No new text to process
+                incremental_prompt = ""
 
         # End prompt processing, start decode
         self._prompt_end()
@@ -264,6 +287,8 @@ class VLM(ProfilingMixin):
         stream_generate_impl = stream_generate_qwen3_vl if self.model_name == "qwen3vl" else stream_generate
 
         try:
+            token_count = 0
+
             for result in stream_generate_impl(
                 self.model,
                 self.processor,
@@ -272,7 +297,9 @@ class VLM(ProfilingMixin):
                 audio=audio_list,
                 **gen_kwargs,
             ):
-                # Record TTFT on first token
+                token_count += 1
+
+                # Record TTFT on first token
                 if first_token:
                     self._record_ttft()
                     first_token = False
@@ -285,6 +312,7 @@ class VLM(ProfilingMixin):
                 text += result.text
                 last_result = result
 
+
             # Set stop reason if not user stop
             if self._profiling_context.stop_reason != StopReason.ML_STOP_REASON_USER:
                 self._set_stop_reason(StopReason.ML_STOP_REASON_EOS)
@@ -294,13 +322,15 @@ class VLM(ProfilingMixin):
             self._update_prompt_tokens(last_result.prompt_tokens)
             self._update_generated_tokens(last_result.generation_tokens)
 
-            # Update global character position
-            self.global_n_past_chars = full_prompt_len + len(text)
+            # Update global character position (not needed for qwen3vl JSON processing)
+            if self.model_name != "qwen3vl":
+                old_pos = self.global_n_past_chars
+                self.global_n_past_chars = full_prompt_len + len(text)
 
             self._decode_end()
             self._end_profiling()
 
-            return GenerationResult(
+            result = GenerationResult(
                 text=text,
                 token=last_result.token if last_result else None,
                 logprobs=last_result.logprobs if last_result else None,
@@ -311,7 +341,18 @@ class VLM(ProfilingMixin):
                 generation_tps=last_result.generation_tps if last_result else 0.0,
                 peak_memory=last_result.peak_memory if last_result else 0.0,
             )
+
+            return result
+
+        except ContextLengthExceededError as e:
+            self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
+            self._decode_end()
+            self._end_profiling()
+            # Re-raise the original exception without wrapping it
+            raise e
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
             self._decode_end()
             self._end_profiling()
nexaai/utils/manifest_utils.py CHANGED

@@ -35,15 +35,8 @@ def process_manifest_metadata(manifest: Dict[str, Any], repo_id: str) -> Dict[st
     # Handle download_time - keep as null if missing
     download_time = manifest.get('download_time')
 
-    # Handle avatar_url - fetch on-the-fly if missing/null
+    # Handle avatar_url - leave it null if missing/null
     avatar_url = manifest.get('avatar_url')
-    if not avatar_url:
-        try:
-            from .avatar_fetcher import get_avatar_url_for_repo
-            avatar_url = get_avatar_url_for_repo(repo_id)
-        except Exception:
-            # If fetching fails, leave as None
-            avatar_url = None
 
     # Return processed metadata
     processed_manifest = manifest.copy()
@@ -171,9 +164,9 @@ def create_gguf_manifest(repo_id: str, files: List[str], directory_path: str, ol
         },
         "ExtraFiles": None,
         # Preserve old metadata fields
-        "pipeline_tag": old_metadata.get('pipeline_tag'),
-        "download_time": old_metadata.get('download_time'),
-        "avatar_url": old_metadata.get('avatar_url')
+        "pipeline_tag": old_metadata.get('pipeline_tag') if old_metadata.get('pipeline_tag') else existing_manifest.get('pipeline_tag'),
+        "download_time": old_metadata.get('download_time') if old_metadata.get('download_time') else existing_manifest.get('download_time'),
+        "avatar_url": old_metadata.get('avatar_url') if old_metadata.get('avatar_url') else existing_manifest.get('avatar_url')
     }
 
     return manifest
@@ -182,6 +175,9 @@ def create_gguf_manifest(repo_id: str, files: List[str], directory_path: str, ol
 
 def create_mlx_manifest(repo_id: str, files: List[str], directory_path: str, old_metadata: Dict[str, Any], is_mmproj: bool = False, file_name: Optional[Union[str, List[str]]] = None) -> Dict[str, Any]:
     """Create MLX format manifest."""
 
+    # Load existing manifest to merge MLX files if it exists
+    existing_manifest = load_nexa_manifest(directory_path)
+
     model_files = {}
     extra_files = []
@@ -250,9 +246,9 @@ def create_mlx_manifest(repo_id: str, files: List[str], directory_path: str, old
         },
         "ExtraFiles": extra_files if extra_files else None,
         # Preserve old metadata fields
-        "pipeline_tag": old_metadata.get('pipeline_tag'),
-        "download_time": old_metadata.get('download_time'),
-        "avatar_url": old_metadata.get('avatar_url')
+        "pipeline_tag": old_metadata.get('pipeline_tag') if old_metadata.get('pipeline_tag') else existing_manifest.get('pipeline_tag'),
+        "download_time": old_metadata.get('download_time') if old_metadata.get('download_time') else existing_manifest.get('download_time'),
+        "avatar_url": old_metadata.get('avatar_url') if old_metadata.get('avatar_url') else existing_manifest.get('avatar_url')
     }
 
     return manifest
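Both manifest builders now apply the same precedence: prefer the freshly fetched metadata, and only fall back to what an existing manifest on disk already recorded when a field is missing or null. A compact sketch of that rule with invented dictionaries:

    def preserve_field(old_metadata: dict, existing_manifest: dict, key: str):
        # Prefer freshly fetched metadata; fall back to the manifest already on disk.
        return old_metadata.get(key) if old_metadata.get(key) else existing_manifest.get(key)

    old_metadata = {"pipeline_tag": None, "download_time": 1700000000, "avatar_url": None}
    existing_manifest = {"pipeline_tag": "image-text-to-text",
                         "avatar_url": "https://example.com/avatar.png"}

    merged = {k: preserve_field(old_metadata, existing_manifest, k)
              for k in ("pipeline_tag", "download_time", "avatar_url")}
    print(merged)
    # {'pipeline_tag': 'image-text-to-text', 'download_time': 1700000000,
    #  'avatar_url': 'https://example.com/avatar.png'}

Note that falsy but valid values (for example a download_time of 0) would also trigger the fallback under this rule.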
nexaai/utils/model_manager.py CHANGED

@@ -11,7 +11,6 @@ from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
 
 from .progress_tracker import CustomProgressTqdm, DownloadProgressTracker
-from .avatar_fetcher import get_avatar_url_for_repo
 from .manifest_utils import (
     load_download_metadata,
     save_download_metadata,
@@ -790,7 +789,7 @@ class HuggingFaceDownloader:
             # If no expected size, just check that file is not empty
             return os.path.getsize(file_path) > 0
 
-    def _fetch_and_save_metadata(self, repo_id: str, local_dir: str, is_mmproj: bool = False, file_name: Optional[Union[str, List[str]]] = None) -> None:
+    def _fetch_and_save_metadata(self, repo_id: str, local_dir: str, is_mmproj: bool = False, file_name: Optional[Union[str, List[str]]] = None, **kwargs) -> None:
         """Fetch model info and save metadata after successful download."""
         # Initialize metadata with defaults to ensure manifest is always created
         old_metadata = {
@@ -809,14 +808,9 @@ class HuggingFaceDownloader:
             # Log the error but continue with manifest creation
             print(f"Warning: Could not fetch model info for {repo_id}: {e}")
 
-        try:
-            # Get avatar URL
-            avatar_url = get_avatar_url_for_repo(repo_id, custom_endpoint=self.endpoint)
-            if avatar_url:
-                old_metadata['avatar_url'] = avatar_url
-        except Exception as e:
-            # Log the error but continue with manifest creation
-            print(f"Warning: Could not fetch avatar URL for {repo_id}: {e}")
+        # Use input avater url if provided
+        old_metadata['avatar_url'] = kwargs.get('avatar_url')
+
 
         # CRITICAL: Always create the manifest file, regardless of metadata fetch failures
         try:
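Since the on-the-fly avatar lookup is gone, an avatar URL now only ends up in the manifest if the caller supplies one; it rides along in the extra keyword arguments threaded down to _fetch_and_save_metadata in the hunks below. A hedged sketch of what that can look like from the top-level download helper; the module path, repo identifier, and URL are assumptions for illustration:

    from nexaai.utils.model_manager import download_from_huggingface

    # avatar_url is not a named parameter; it travels via **kwargs and is read
    # with kwargs.get('avatar_url') when the manifest is written.
    local_path = download_from_huggingface(
        "some-org/some-model",
        avatar_url="https://example.com/some-org-avatar.png",
    )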
@@ -850,7 +844,8 @@ class HuggingFaceDownloader:
         file_name: str,
         local_dir: str,
         progress_tracker: Optional[DownloadProgressTracker],
-        force_download: bool = False
+        force_download: bool = False,
+        **kwargs
     ) -> str:
         """Download a single file from the repository using HuggingFace Hub API."""
         # Create repo-specific directory for the single file
@@ -882,7 +877,7 @@ class HuggingFaceDownloader:
             progress_tracker.stop_tracking()
 
         # Save metadata after successful download
-        self._fetch_and_save_metadata(repo_id, file_local_dir, self._current_is_mmproj, self._current_file_name)
+        self._fetch_and_save_metadata(repo_id, file_local_dir, self._current_is_mmproj, self._current_file_name, **kwargs)
 
         return downloaded_path
 
@@ -901,7 +896,8 @@ class HuggingFaceDownloader:
         repo_id: str,
         local_dir: str,
         progress_tracker: Optional[DownloadProgressTracker],
-        force_download: bool = False
+        force_download: bool = False,
+        **kwargs
     ) -> str:
         """Download the entire repository."""
         # Create a subdirectory for this specific repo
@@ -927,7 +923,7 @@ class HuggingFaceDownloader:
             progress_tracker.stop_tracking()
 
         # Save metadata after successful download
-        self._fetch_and_save_metadata(repo_id, repo_local_dir, self._current_is_mmproj, self._current_file_name)
+        self._fetch_and_save_metadata(repo_id, repo_local_dir, self._current_is_mmproj, self._current_file_name, **kwargs)
 
         return downloaded_path
 
@@ -944,7 +940,8 @@ class HuggingFaceDownloader:
         file_names: List[str],
         local_dir: str,
         progress_tracker: Optional[DownloadProgressTracker],
-        force_download: bool = False
+        force_download: bool = False,
+        **kwargs
     ) -> str:
         """Download multiple specific files from HuggingFace Hub."""
         # Create repo-specific directory
@@ -989,7 +986,7 @@ class HuggingFaceDownloader:
             progress_tracker.stop_tracking()
 
         # Save metadata after successful download
-        self._fetch_and_save_metadata(repo_id, repo_local_dir, self._current_is_mmproj, self._current_file_name)
+        self._fetch_and_save_metadata(repo_id, repo_local_dir, self._current_is_mmproj, self._current_file_name, **kwargs)
 
         return repo_local_dir
 
@@ -1015,7 +1012,8 @@ class HuggingFaceDownloader:
         progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
         show_progress: bool = True,
         force_download: bool = False,
-        is_mmproj: bool = False
+        is_mmproj: bool = False,
+        **kwargs
     ) -> str:
         """
         Main download method that handles all download scenarios.
@@ -1062,13 +1060,13 @@ class HuggingFaceDownloader:
             if file_name is None:
                 # Download entire repository
                 return self._download_entire_repository(
-                    repo_id, local_dir, progress_tracker, force_download
+                    repo_id, local_dir, progress_tracker, force_download, **kwargs
                 )
             elif isinstance(file_name, str):
                 # Download specific single file
                 self._validate_file_exists_in_repo(file_name, info, repo_id, progress_tracker)
                 return self._download_single_file(
-                    repo_id, file_name, local_dir, progress_tracker, force_download
+                    repo_id, file_name, local_dir, progress_tracker, force_download, **kwargs
                 )
             else: # file_name is a list
                 # Download multiple specific files
@@ -1077,7 +1075,7 @@ class HuggingFaceDownloader:
                     self._validate_file_exists_in_repo(fname, info, repo_id, progress_tracker)
 
                 return self._download_multiple_files_from_hf(
-                    repo_id, file_name, local_dir, progress_tracker, force_download
+                    repo_id, file_name, local_dir, progress_tracker, force_download, **kwargs
                 )
 
         except Exception as e:
@@ -1107,7 +1105,8 @@ def download_from_huggingface(
     token: Union[bool, str, None] = None,
     custom_endpoint: Optional[str] = None,
     force_download: bool = False,
-    is_mmproj: Optional[bool] = None
+    is_mmproj: Optional[bool] = None,
+    **kwargs
 ) -> str:
     """
     Download models or files from HuggingFace Hub or custom mirror endpoints.
@@ -1197,7 +1196,8 @@ def download_from_huggingface(
         progress_callback=progress_callback,
         show_progress=show_progress,
         force_download=force_download,
-        is_mmproj=is_mmproj
+        is_mmproj=is_mmproj,
+        **kwargs
     )
 
 
@@ -1211,7 +1211,8 @@ def _download_model_if_needed(
     param_name: str,
     progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
    token: Union[bool, str, None] = None,
-    is_mmproj: bool = False
+    is_mmproj: bool = False,
+    **kwargs
 ) -> str:
     """
     Helper function to download a model from HuggingFace if it doesn't exist locally.
@@ -1247,7 +1248,8 @@ def _download_model_if_needed(
         progress_callback=progress_callback,
         show_progress=True,
         token=token,
-        is_mmproj=is_mmproj
+        is_mmproj=is_mmproj,
+        **kwargs
     )
 
     return downloaded_path
@@ -1320,7 +1322,7 @@ def auto_download_model(func: Callable) -> Callable:
         if name_or_path is not None:
             try:
                 downloaded_name_path = _download_model_if_needed(
-                    name_or_path, 'name_or_path', progress_callback, token
+                    name_or_path, 'name_or_path', progress_callback, token, **kwargs
                 )
 
                 # Replace name_or_path with downloaded path
@@ -1338,7 +1340,7 @@ def auto_download_model(func: Callable) -> Callable:
         if mmproj_path is not None:
             try:
                 downloaded_mmproj_path = _download_model_if_needed(
-                    mmproj_path, 'mmproj_path', progress_callback, token, is_mmproj=True
+                    mmproj_path, 'mmproj_path', progress_callback, token, is_mmproj=True, **kwargs
                 )
 
                 # Replace mmproj_path with downloaded path
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nexaai
-Version: 1.0.16rc14
+Version: 1.0.17
 Summary: Python bindings for NexaSDK C-lib backend
 Author-email: "Nexa AI, Inc." <dev@nexa.ai>
 Project-URL: Homepage, https://github.com/NexaAI/nexasdk-bridge
@@ -1,6 +1,6 @@
 nexaai/__init__.py,sha256=L8oB7GFZZMGnUpCg0PecDbI_ycKuQak-ZEJ4Y12_QIw,2184
-nexaai/_stub.cpython-310-darwin.so,sha256=KljcA21kmHR-5BpCJJJMrHFMwI2Wgixalw80FJSNBe8,49832
-nexaai/_version.py,sha256=bFprtDX2rUyZwPnP3h4-J3LGlRF9uZAd4KOCazdh12I,144
+nexaai/_stub.cpython-310-darwin.so,sha256=HjqUYc8SyajzyySZk1eBJdO7Rc_db2F-kS3KdPSPB5o,49832
+nexaai/_version.py,sha256=eaXF_gF6uNVz9AglXCAwIyseTDCCAGEhr3CCnSfr3tY,139
 nexaai/asr.py,sha256=NljMXDErwPNMOPaRkJZMEDka9Nk8xyur7L8i924TStY,2054
 nexaai/base.py,sha256=N8PRgDFA-XPku2vWnQIofQ7ipz3pPlO6f8YZGnuhquE,982
 nexaai/common.py,sha256=Y0NJNLTi4Nq4x1WL6PQsSvGUto0eGmWhjpsC6jcekfA,3444
@@ -17,9 +17,9 @@ nexaai/asr_impl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nexaai/asr_impl/mlx_asr_impl.py,sha256=eosd8-TIWAOwV0HltmoFrLwzXHcU4jyxtncvuZE9pgA,3257
 nexaai/asr_impl/pybind_asr_impl.py,sha256=pE9Hb_hMi5yAc4MF83bLVOb8zDtreCkB3_u7XED9YpA,1516
 nexaai/binds/__init__.py,sha256=eYuay_8DDXeOUWz2_R9HFSabohxs6hvZn391t2L0Po0,104
-nexaai/binds/common_bind.cpython-310-darwin.so,sha256=km1TU5WOJHVjvyM4l5mgAkS_omxuKt8pM92E9Wv0VqM,235488
+nexaai/binds/common_bind.cpython-310-darwin.so,sha256=BoXByRlNGDaNS1YyZyCF-s7h0vXP9NLPlJMQQ5pqusU,235488
 nexaai/binds/embedder_bind.cpython-310-darwin.so,sha256=b2NoXFAJvPLi_P1X7lXLKmAUU0v2HJI3Zwa10gfqHdw,202032
-nexaai/binds/libnexa_bridge.dylib,sha256=v770dZQxEZvtXZN8drsqLrrCPfblKSBJQOIbu96YUUY,250408
+nexaai/binds/libnexa_bridge.dylib,sha256=e6uFx8ENEdCWk8whKyoVvX-e9-Bk_35kqIDV3kRDuXU,250408
 nexaai/binds/llm_bind.cpython-310-darwin.so,sha256=p1ZTGMolEkWywkmwzOUjTr3RpSEH21BHZAggVzo89Ks,183088
 nexaai/binds/vlm_bind.cpython-310-darwin.so,sha256=LGd-tykePnQFfGca25HnPIBfXsfrMzbwyx6d5Ld3xps,183000
 nexaai/binds/nexa_llama_cpp/libggml-base.dylib,sha256=GyOkHOM-5uHp7NUZ4Sr9BWak6BYpcc9aqI9A-zPnQp4,629528
@@ -246,8 +246,8 @@ nexaai/mlx_backend/tts/__init__.py,sha256=fuT_9_xpYJ28m4yjly5L2jChUrzlSQz-b_S7nu
 nexaai/mlx_backend/tts/interface.py,sha256=0FvZbIyOvg8jERZEQ6bygbv7v02O9xHO4-TPUlar0b4,9568
 nexaai/mlx_backend/vlm/__init__.py,sha256=_25kvMEviX16Hg3bro8Ws70V0eeIEqYKV8ZDXqYzKew,73
 nexaai/mlx_backend/vlm/generate.py,sha256=DqHFEAuqk-nko8ho6U9GAXTDAWz4d8GTe_hCt-XFyCw,19071
-nexaai/mlx_backend/vlm/generate_qwen3_vl.py,sha256=undjso1mfxqpd6FMTksSA5qagRttxAGbOBj1x7cqI1s,9211
-nexaai/mlx_backend/vlm/interface.py,sha256=0BLfodbYOU71jFvAvv01FuLBE_KBtyB-8Cd7LqzzRHY,17450
+nexaai/mlx_backend/vlm/generate_qwen3_vl.py,sha256=eeizW18u6dHPZOOnJtQUJkiqMAIIpOSS-IOjacXGsz4,10240
+nexaai/mlx_backend/vlm/interface.py,sha256=HOPzWNMs6QaHO6x0Z83kW1xkRRmb8_xo6xQLKsOWqAo,19013
 nexaai/mlx_backend/vlm/main.py,sha256=nPcg25jupeDD74uvRoxpWp3Dsulw7WddI7vll6zejak,10664
 nexaai/mlx_backend/vlm/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nexaai/mlx_backend/vlm/modeling/convert.py,sha256=ia5i9cgTufFGmKyhkYUaW0nfNqT_bMo8i-Hg_zy5JC4,1863
@@ -378,17 +378,16 @@ nexaai/rerank_impl/pybind_rerank_impl.py,sha256=CtwkG7YrW58GPMDERJSnISGTVCXWNju5
 nexaai/tts_impl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nexaai/tts_impl/mlx_tts_impl.py,sha256=i_uNPdvlXYtL3e01oKjDlP9jgkWCRt1bBHsExaaiJi8,3101
 nexaai/tts_impl/pybind_tts_impl.py,sha256=mpn44r6pfYLIl-NrEy2dXHjGtWtNCmM7HRyxiANxUI4,1444
-nexaai/utils/avatar_fetcher.py,sha256=bWy8ujgbOiTHFCjFxTwkn3uXbZ84PgEGUkXkR3MH4bI,3821
 nexaai/utils/decode.py,sha256=61n4Zf6c5QLyqGoctEitlI9BX3tPlP2a5aaKNHbw3T4,404
-nexaai/utils/manifest_utils.py,sha256=sR9Nme4GbD3Cb3fMd55yLvGZpqxb71vd6b2XZTsrIGM,12328
-nexaai/utils/model_manager.py,sha256=p2kJKK63Zk-rEUucFsgY0T5PyXi_IvJY0gKewUVcAV4,56081
+nexaai/utils/manifest_utils.py,sha256=PA84obFP7W1dlneURlIHIzJjWIF5dbDHGdNeHouUy68,12659
+nexaai/utils/model_manager.py,sha256=_WKJP7YVk7q587OoOWwDNWVR-8tbKZkmHKjcCZN8Q4M,55979
 nexaai/utils/model_types.py,sha256=-DER8L4lAUR_iLS99F0r57avwqWtuN21ug5pX2p24_E,1369
 nexaai/utils/progress_tracker.py,sha256=jdUqtmPqyhwC9uSKvQcJEYETwSt-OhP4oitdJ94614o,15394
 nexaai/utils/quantization_utils.py,sha256=FYcNSAKGlBqFDUTx3jSKOr2lnq4nyiyC0ZG8oSxFwiU,7825
 nexaai/vlm_impl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nexaai/vlm_impl/mlx_vlm_impl.py,sha256=pLtWm_ckz8a0U-AtAOMVseFDO4OVPvHyYO2KlfBaGYk,10833
 nexaai/vlm_impl/pybind_vlm_impl.py,sha256=FAbhpRJzHgI78r0mUvKybO97R1szvNhH0aTn_I52oT4,8597
-nexaai-1.0.16rc14.dist-info/METADATA,sha256=rD9zD2HduPUSrlSkUZUQ4Ut2g6nvwQ-PN0kgODO7TEU,1202
-nexaai-1.0.16rc14.dist-info/WHEEL,sha256=0KYp5feZ1CMUhsfFXKpSQTbSmQbXy4mv6yPPVBXg2EM,110
-nexaai-1.0.16rc14.dist-info/top_level.txt,sha256=LRE2YERlrZk2vfuygnSzsEeqSknnZbz3Z1MHyNmBU4w,7
-nexaai-1.0.16rc14.dist-info/RECORD,,
+nexaai-1.0.17.dist-info/METADATA,sha256=BMYxa8SkZYJx_zRraC8kS32fkBpXFsrKthZBJxISykc,1198
+nexaai-1.0.17.dist-info/WHEEL,sha256=0KYp5feZ1CMUhsfFXKpSQTbSmQbXy4mv6yPPVBXg2EM,110
+nexaai-1.0.17.dist-info/top_level.txt,sha256=LRE2YERlrZk2vfuygnSzsEeqSknnZbz3Z1MHyNmBU4w,7
+nexaai-1.0.17.dist-info/RECORD,,
nexaai/utils/avatar_fetcher.py DELETED

@@ -1,104 +0,0 @@
-"""Utility for fetching avatar URLs from HuggingFace."""
-
-import logging
-from typing import Dict, Optional
-import httpx
-
-logger = logging.getLogger(__name__)
-
-
-def fetch_avatar_urls_from_hf_api(query: str, custom_endpoint: Optional[str] = None) -> Dict[str, str]:
-    """
-    Fetch avatar URLs from HuggingFace models-json endpoint.
-
-    Args:
-        query: Search query to fetch models for
-        custom_endpoint: Optional custom HuggingFace endpoint
-
-    Returns:
-        Dictionary mapping author names to avatar URLs
-    """
-    avatar_map = {}
-    try:
-        # Use the base URL from the configured endpoint
-        base_url = custom_endpoint if custom_endpoint else "https://huggingface.co"
-
-        # Build the URL with query parameter
-        url = f"{base_url}/models-json?sort=trending&search={query}&withCount=true"
-
-        # Make the HTTP request with a timeout
-        with httpx.Client(timeout=2.0) as client:
-            response = client.get(url)
-
-            if response.status_code == 200:
-                data = response.json()
-                models = data.get("models", [])
-
-                # Build a map of author names to avatar URLs
-                for model in models:
-                    author = model.get("author")
-                    author_data = model.get("authorData", {})
-                    avatar_url = author_data.get("avatarUrl")
-
-                    if author and avatar_url:
-                        # Handle relative URLs by prepending appropriate base URL
-                        if avatar_url.startswith("/"):
-                            avatar_url = f"{base_url}{avatar_url}"
-                        avatar_map[author] = avatar_url
-
-                logger.debug(f"Fetched {len(avatar_map)} avatar URLs from HuggingFace API")
-            else:
-                logger.warning(f"Failed to fetch avatar URLs: HTTP {response.status_code}")
-
-    except Exception as e:
-        logger.warning(f"Error fetching avatar URLs from HuggingFace API: {e}")
-        # Return empty map on error - we'll fall back to default behavior
-
-    return avatar_map
-
-
-def get_avatar_url_for_repo(repo_id: str, search_query: Optional[str] = None,
-                            custom_endpoint: Optional[str] = None) -> Optional[str]:
-    """
-    Get avatar URL for a repository ID.
-
-    This method tries multiple strategies:
-    1. If search_query is provided, fetch from HuggingFace API with that query
-    2. Try fetching with the full repo_id as query
-    3. Try fetching with just the organization name as query
-    4. Fall back to CDN URL pattern
-
-    Args:
-        repo_id: Repository ID in format "owner/repo"
-        search_query: Optional search query to use for fetching avatars
-        custom_endpoint: Optional custom HuggingFace endpoint
-
-    Returns:
-        Avatar URL or None if not found
-    """
-    if "/" not in repo_id:
-        return None
-
-    org_name = repo_id.split("/")[0]
-
-    # Try with search query if provided
-    if search_query:
-        avatar_map = fetch_avatar_urls_from_hf_api(search_query, custom_endpoint)
-        avatar_url = avatar_map.get(org_name)
-        if avatar_url:
-            return avatar_url
-
-    # Try with full repo_id
-    avatar_map = fetch_avatar_urls_from_hf_api(repo_id, custom_endpoint)
-    avatar_url = avatar_map.get(org_name)
-    if avatar_url:
-        return avatar_url
-
-    # Try with just organization name
-    avatar_map = fetch_avatar_urls_from_hf_api(org_name, custom_endpoint)
-    avatar_url = avatar_map.get(org_name)
-    if avatar_url:
-        return avatar_url
-
-    # Fallback to CDN URL pattern
-    return f"https://cdn-thumbnails.huggingface.co/social-thumbnails/{org_name}.png"