ltcai 0.3.2 → 0.5.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/llm_router.py CHANGED
@@ -227,6 +227,18 @@ def ensure_mlx_runtime() -> None:
227
227
  except Exception as e:
228
228
  raise RuntimeError(f"MLX runtime is not available after install: {e}") from e
229
229
 
230
+ def _mlx_sampler(temperature: float):
231
+ """Build an MLX sampler callable for the given temperature.
232
+
233
+ mlx_lm >= 0.20 removed the ``temp`` keyword from generate_step in favour of a
234
+ ``sampler`` callable, and mlx_vlm follows the same convention. Passing
235
+ ``temp=`` to generate/stream_generate now raises
236
+ ``generate_step() got an unexpected keyword argument 'temp'``. Both libraries
237
+ accept ``sampler=`` and share make_sampler from mlx_lm.sample_utils.
238
+ """
239
+ from mlx_lm.sample_utils import make_sampler
240
+ return make_sampler(temp=temperature)
241
+
230
242
  class LLMRouter:
231
243
  def __init__(self):
232
244
  self._cache: Dict[str, Tuple] = {}
@@ -514,10 +526,10 @@ class LLMRouter:
514
526
  is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
515
527
  if is_gemma4 and VLM_AVAILABLE:
516
528
  from mlx_vlm import generate as vlm_gen
517
- return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
529
+ return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
518
530
  else:
519
531
  from mlx_lm import generate as lm_gen
520
- return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
532
+ return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
521
533
  result = await loop.run_in_executor(executor, _gen)
522
534
  # mlx-vlm might return a GenerationResult object; extract the text
523
535
  if hasattr(result, "text"):
@@ -571,10 +583,10 @@ class LLMRouter:
571
583
  is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
572
584
  if is_gemma4 and VLM_AVAILABLE:
573
585
  from mlx_vlm import stream_generate as vlm_stream
574
- gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
586
+ gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
575
587
  else:
576
588
  from mlx_lm import stream_generate as lm_stream
577
- gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
589
+ gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
578
590
 
579
591
  for chunk in gen:
580
592
  text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
@@ -666,10 +678,10 @@ class LLMRouter:
666
678
  is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
667
679
  if is_gemma4 and VLM_AVAILABLE:
668
680
  from mlx_vlm import generate as vlm_gen
669
- return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
681
+ return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
670
682
  else:
671
683
  from mlx_lm import generate as lm_gen
672
- return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
684
+ return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
673
685
  result = await loop.run_in_executor(executor, _gen)
674
686
  if hasattr(result, "text"):
675
687
  return normalize_branding(result.text)
@@ -733,10 +745,10 @@ class LLMRouter:
733
745
  is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
734
746
  if is_gemma4 and VLM_AVAILABLE:
735
747
  from mlx_vlm import stream_generate as vlm_stream
736
- gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
748
+ gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
737
749
  else:
738
750
  from mlx_lm import stream_generate as lm_stream
739
- gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
751
+ gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
740
752
  for chunk in gen:
741
753
  text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
742
754
  loop.call_soon_threadsafe(queue.put_nowait, text)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ltcai",
3
- "version": "0.3.2",
3
+ "version": "0.5.0",
4
4
  "description": "Lattice AI local MLX/cloud LLM workspace server",
5
5
  "homepage": "https://github.com/TaeSooPark-PTS/LatticeAI#readme",
6
6
  "repository": {