ltcai 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/CHANGELOG.md +22 -0
- package/llm_router.py +20 -8
- package/package.json +1 -1
- package/server.py +2 -2
package/docs/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,27 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.5.0] - 2026-05-31
|
|
4
|
+
|
|
5
|
+
> MLX 샘플링 API 호환성 버그 수정 + 릴리스 워크플로 build-only 전환.
|
|
6
|
+
|
|
7
|
+
### Fixed
|
|
8
|
+
|
|
9
|
+
- **MLX `temp` kwarg 제거 대응** — `llm_router.py`의 로컬 MLX 추론 경로(텍스트/
|
|
10
|
+
비전, 동기/스트리밍, 문서 생성 4계열·총 8개 호출부)가 `mlx_lm.generate` /
|
|
11
|
+
`mlx_vlm.generate`에 `temp=temperature`를 직접 넘기다가
|
|
12
|
+
`generate_step() got an unexpected keyword argument 'temp'`로 실패하던 문제
|
|
13
|
+
수정. mlx_lm ≥ 0.20 / mlx_vlm는 `temp` 키워드를 제거하고 `sampler` 콜러블을
|
|
14
|
+
받도록 API가 바뀌었으므로, `make_sampler(temp=...)`로 만든 sampler를
|
|
15
|
+
`sampler=`로 전달하도록 `_mlx_sampler()` 헬퍼를 도입.
|
|
16
|
+
|
|
17
|
+
### Changed
|
|
18
|
+
|
|
19
|
+
- **릴리스 워크플로 build-only 전환** — `.github/workflows/release.yml`이 v* 태그
|
|
20
|
+
push 시 단위 테스트와 빌드 산출물 생성(`python -m build`, `twine check`,
|
|
21
|
+
`npm pack`, `vsce package`)까지만 수행. `publish-pypi`/`publish-npm`/
|
|
22
|
+
`publish-vscode`/`publish-ovsx` job과 GitHub Secrets 의존(`if: secrets.*`)을
|
|
23
|
+
제거. 배포는 로컬에서 수동 인증 후 진행.
|
|
24
|
+
|
|
3
25
|
## [0.4.0] - 2026-05-31
|
|
4
26
|
|
|
5
27
|
> Knowledge Graph v2 read/write cutover — legacy/v2 동등성 보장, dual-write
|
package/llm_router.py
CHANGED
|
@@ -227,6 +227,18 @@ def ensure_mlx_runtime() -> None:
|
|
|
227
227
|
except Exception as e:
|
|
228
228
|
raise RuntimeError(f"MLX runtime is not available after install: {e}") from e
|
|
229
229
|
|
|
230
|
+
def _mlx_sampler(temperature: float):
|
|
231
|
+
"""Build an MLX sampler callable for the given temperature.
|
|
232
|
+
|
|
233
|
+
mlx_lm >= 0.20 removed the ``temp`` keyword from generate_step in favour of a
|
|
234
|
+
``sampler`` callable, and mlx_vlm follows the same convention. Passing
|
|
235
|
+
``temp=`` to generate/stream_generate now raises
|
|
236
|
+
``generate_step() got an unexpected keyword argument 'temp'``. Both libraries
|
|
237
|
+
accept ``sampler=`` and share make_sampler from mlx_lm.sample_utils.
|
|
238
|
+
"""
|
|
239
|
+
from mlx_lm.sample_utils import make_sampler
|
|
240
|
+
return make_sampler(temp=temperature)
|
|
241
|
+
|
|
230
242
|
class LLMRouter:
|
|
231
243
|
def __init__(self):
|
|
232
244
|
self._cache: Dict[str, Tuple] = {}
|
|
@@ -514,10 +526,10 @@ class LLMRouter:
|
|
|
514
526
|
is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
|
|
515
527
|
if is_gemma4 and VLM_AVAILABLE:
|
|
516
528
|
from mlx_vlm import generate as vlm_gen
|
|
517
|
-
return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
|
|
529
|
+
return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
|
|
518
530
|
else:
|
|
519
531
|
from mlx_lm import generate as lm_gen
|
|
520
|
-
return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
|
|
532
|
+
return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
|
|
521
533
|
result = await loop.run_in_executor(executor, _gen)
|
|
522
534
|
# mlx-vlm might return a GenerationResult object; extract the text
|
|
523
535
|
if hasattr(result, "text"):
|
|
@@ -571,10 +583,10 @@ class LLMRouter:
|
|
|
571
583
|
is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
|
|
572
584
|
if is_gemma4 and VLM_AVAILABLE:
|
|
573
585
|
from mlx_vlm import stream_generate as vlm_stream
|
|
574
|
-
gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
|
|
586
|
+
gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
|
|
575
587
|
else:
|
|
576
588
|
from mlx_lm import stream_generate as lm_stream
|
|
577
|
-
gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
|
|
589
|
+
gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
|
|
578
590
|
|
|
579
591
|
for chunk in gen:
|
|
580
592
|
text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
|
|
@@ -666,10 +678,10 @@ class LLMRouter:
|
|
|
666
678
|
is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
|
|
667
679
|
if is_gemma4 and VLM_AVAILABLE:
|
|
668
680
|
from mlx_vlm import generate as vlm_gen
|
|
669
|
-
return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
|
|
681
|
+
return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
|
|
670
682
|
else:
|
|
671
683
|
from mlx_lm import generate as lm_gen
|
|
672
|
-
return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
|
|
684
|
+
return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
|
|
673
685
|
result = await loop.run_in_executor(executor, _gen)
|
|
674
686
|
if hasattr(result, "text"):
|
|
675
687
|
return normalize_branding(result.text)
|
|
@@ -733,10 +745,10 @@ class LLMRouter:
|
|
|
733
745
|
is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
|
|
734
746
|
if is_gemma4 and VLM_AVAILABLE:
|
|
735
747
|
from mlx_vlm import stream_generate as vlm_stream
|
|
736
|
-
gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
|
|
748
|
+
gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
|
|
737
749
|
else:
|
|
738
750
|
from mlx_lm import stream_generate as lm_stream
|
|
739
|
-
gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
|
|
751
|
+
gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
|
|
740
752
|
for chunk in gen:
|
|
741
753
|
text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
|
|
742
754
|
loop.call_soon_threadsafe(queue.put_nowait, text)
|
package/package.json
CHANGED
package/server.py
CHANGED
|
@@ -1121,7 +1121,7 @@ async def lifespan(app: FastAPI):
|
|
|
1121
1121
|
except Exception:
|
|
1122
1122
|
pass
|
|
1123
1123
|
|
|
1124
|
-
app = FastAPI(title=f"Lattice AI Server ({APP_MODE})", version="0.4.0", lifespan=lifespan)
|
|
1124
|
+
app = FastAPI(title=f"Lattice AI Server ({APP_MODE})", version="0.5.0", lifespan=lifespan)
|
|
1125
1125
|
|
|
1126
1126
|
CORS_ALLOWED_ORIGINS = [
|
|
1127
1127
|
f"http://localhost:{DEFAULT_PORT}",
|
|
@@ -3466,7 +3466,7 @@ async def verify_cloud_models(force: bool = False, provider_filter: Optional[str
|
|
|
3466
3466
|
|
|
3467
3467
|
@app.get("/health")
|
|
3468
3468
|
async def health(request: Request):
|
|
3469
|
-
base = {"status": "ok", "version": "0.4.0", "mode": APP_MODE}
|
|
3469
|
+
base = {"status": "ok", "version": "0.5.0", "mode": APP_MODE}
|
|
3470
3470
|
if not get_current_user(request) and REQUIRE_AUTH:
|
|
3471
3471
|
return base
|
|
3472
3472
|
engines = await asyncio.to_thread(engine_status)
|