ltcai 0.3.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +285 -224
- package/docs/CHANGELOG.md +60 -0
- package/kg_schema.py +42 -0
- package/knowledge_graph.py +232 -36
- package/latticeai/core/agent.py +453 -0
- package/latticeai/core/config.py +178 -0
- package/llm_router.py +20 -8
- package/package.json +1 -1
- package/server.py +92 -436
- package/tools.py +87 -115
package/llm_router.py
CHANGED
|
@@ -227,6 +227,18 @@ def ensure_mlx_runtime() -> None:
|
|
|
227
227
|
except Exception as e:
|
|
228
228
|
raise RuntimeError(f"MLX runtime is not available after install: {e}") from e
|
|
229
229
|
|
|
230
|
+
def _mlx_sampler(temperature: float):
|
|
231
|
+
"""Build an MLX sampler callable for the given temperature.
|
|
232
|
+
|
|
233
|
+
mlx_lm >= 0.20 removed the ``temp`` keyword from generate_step in favour of a
|
|
234
|
+
``sampler`` callable, and mlx_vlm follows the same convention. Passing
|
|
235
|
+
``temp=`` to generate/stream_generate now raises
|
|
236
|
+
``generate_step() got an unexpected keyword argument 'temp'``. Both libraries
|
|
237
|
+
accept ``sampler=`` and share make_sampler from mlx_lm.sample_utils.
|
|
238
|
+
"""
|
|
239
|
+
from mlx_lm.sample_utils import make_sampler
|
|
240
|
+
return make_sampler(temp=temperature)
|
|
241
|
+
|
|
230
242
|
class LLMRouter:
|
|
231
243
|
def __init__(self):
|
|
232
244
|
self._cache: Dict[str, Tuple] = {}
|
|
@@ -514,10 +526,10 @@ class LLMRouter:
|
|
|
514
526
|
is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
|
|
515
527
|
if is_gemma4 and VLM_AVAILABLE:
|
|
516
528
|
from mlx_vlm import generate as vlm_gen
|
|
517
|
-
return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens,
|
|
529
|
+
return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
|
|
518
530
|
else:
|
|
519
531
|
from mlx_lm import generate as lm_gen
|
|
520
|
-
return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens,
|
|
532
|
+
return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
|
|
521
533
|
result = await loop.run_in_executor(executor, _gen)
|
|
522
534
|
# mlx-vlm might return a GenerationResult object; extract the text
|
|
523
535
|
if hasattr(result, "text"):
|
|
@@ -571,10 +583,10 @@ class LLMRouter:
|
|
|
571
583
|
is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
|
|
572
584
|
if is_gemma4 and VLM_AVAILABLE:
|
|
573
585
|
from mlx_vlm import stream_generate as vlm_stream
|
|
574
|
-
gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens,
|
|
586
|
+
gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
|
|
575
587
|
else:
|
|
576
588
|
from mlx_lm import stream_generate as lm_stream
|
|
577
|
-
gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens,
|
|
589
|
+
gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
|
|
578
590
|
|
|
579
591
|
for chunk in gen:
|
|
580
592
|
text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
|
|
@@ -666,10 +678,10 @@ class LLMRouter:
|
|
|
666
678
|
is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
|
|
667
679
|
if is_gemma4 and VLM_AVAILABLE:
|
|
668
680
|
from mlx_vlm import generate as vlm_gen
|
|
669
|
-
return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens,
|
|
681
|
+
return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
|
|
670
682
|
else:
|
|
671
683
|
from mlx_lm import generate as lm_gen
|
|
672
|
-
return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens,
|
|
684
|
+
return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
|
|
673
685
|
result = await loop.run_in_executor(executor, _gen)
|
|
674
686
|
if hasattr(result, "text"):
|
|
675
687
|
return normalize_branding(result.text)
|
|
@@ -733,10 +745,10 @@ class LLMRouter:
|
|
|
733
745
|
is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
|
|
734
746
|
if is_gemma4 and VLM_AVAILABLE:
|
|
735
747
|
from mlx_vlm import stream_generate as vlm_stream
|
|
736
|
-
gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens,
|
|
748
|
+
gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
|
|
737
749
|
else:
|
|
738
750
|
from mlx_lm import stream_generate as lm_stream
|
|
739
|
-
gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens,
|
|
751
|
+
gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
|
|
740
752
|
for chunk in gen:
|
|
741
753
|
text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
|
|
742
754
|
loop.call_soon_threadsafe(queue.put_nowait, text)
|