ltcai 0.4.0 → 0.5.0

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/docs/CHANGELOG.md CHANGED
@@ -1,5 +1,27 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.5.0] - 2026-05-31
4
+
5
+ > MLX 샘플링 API 호환성 버그 수정 + 릴리스 워크플로 build-only 전환.
6
+
7
+ ### Fixed
8
+
9
+ - **MLX `temp` kwarg 제거 대응** — `llm_router.py`의 로컬 MLX 추론 경로(텍스트/
10
+ 비전, 동기/스트리밍, 문서 생성 4계열·총 8개 호출부)가 `mlx_lm.generate` /
11
+ `mlx_vlm.generate`에 `temp=temperature`를 직접 넘기다가
12
+ `generate_step() got an unexpected keyword argument 'temp'`로 실패하던 문제
13
+ 수정. mlx_lm ≥ 0.20 / mlx_vlm는 `temp` 키워드를 제거하고 `sampler` 콜러블을
14
+ 받도록 API가 바뀌었으므로, `make_sampler(temp=...)`로 만든 sampler를
15
+ `sampler=`로 전달하도록 `_mlx_sampler()` 헬퍼를 도입.
16
+
17
+ ### Changed
18
+
19
+ - **릴리스 워크플로 build-only 전환** — `.github/workflows/release.yml`이 v* 태그
20
+ push 시 단위 테스트와 빌드 산출물 생성(`python -m build`, `twine check`,
21
+ `npm pack`, `vsce package`)까지만 수행. `publish-pypi`/`publish-npm`/
22
+ `publish-vscode`/`publish-ovsx` job과 GitHub Secrets 의존(`if: secrets.*`)을
23
+ 제거. 배포는 로컬에서 수동 인증 후 진행.
24
+
3
25
  ## [0.4.0] - 2026-05-31
4
26
 
5
27
  > Knowledge Graph v2 read/write cutover — legacy/v2 동등성 보장, dual-write
package/llm_router.py CHANGED
@@ -227,6 +227,18 @@ def ensure_mlx_runtime() -> None:
227
227
  except Exception as e:
228
228
  raise RuntimeError(f"MLX runtime is not available after install: {e}") from e
229
229
 
230
+ def _mlx_sampler(temperature: float):
231
+ """Build an MLX sampler callable for the given temperature.
232
+
233
+ mlx_lm >= 0.20 removed the ``temp`` keyword from generate_step in favour of a
234
+ ``sampler`` callable, and mlx_vlm follows the same convention. Passing
235
+ ``temp=`` to generate/stream_generate now raises
236
+ ``generate_step() got an unexpected keyword argument 'temp'``. Both libraries
237
+ accept ``sampler=`` and share make_sampler from mlx_lm.sample_utils.
238
+ """
239
+ from mlx_lm.sample_utils import make_sampler
240
+ return make_sampler(temp=temperature)
241
+
230
242
  class LLMRouter:
231
243
  def __init__(self):
232
244
  self._cache: Dict[str, Tuple] = {}
@@ -514,10 +526,10 @@ class LLMRouter:
514
526
  is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
515
527
  if is_gemma4 and VLM_AVAILABLE:
516
528
  from mlx_vlm import generate as vlm_gen
517
- return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
529
+ return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
518
530
  else:
519
531
  from mlx_lm import generate as lm_gen
520
- return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
532
+ return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
521
533
  result = await loop.run_in_executor(executor, _gen)
522
534
  # mlx-vlm might return a GenerationResult object; extract the text
523
535
  if hasattr(result, "text"):
@@ -571,10 +583,10 @@ class LLMRouter:
571
583
  is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
572
584
  if is_gemma4 and VLM_AVAILABLE:
573
585
  from mlx_vlm import stream_generate as vlm_stream
574
- gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
586
+ gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data), max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
575
587
  else:
576
588
  from mlx_lm import stream_generate as lm_stream
577
- gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
589
+ gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
578
590
 
579
591
  for chunk in gen:
580
592
  text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
@@ -666,10 +678,10 @@ class LLMRouter:
666
678
  is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
667
679
  if is_gemma4 and VLM_AVAILABLE:
668
680
  from mlx_vlm import generate as vlm_gen
669
- return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
681
+ return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
670
682
  else:
671
683
  from mlx_lm import generate as lm_gen
672
- return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
684
+ return lm_gen(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
673
685
  result = await loop.run_in_executor(executor, _gen)
674
686
  if hasattr(result, "text"):
675
687
  return normalize_branding(result.text)
@@ -733,10 +745,10 @@ class LLMRouter:
733
745
  is_gemma4 = "gemma-4" in self._current.lower() or "gemma4" in self._current.lower()
734
746
  if is_gemma4 and VLM_AVAILABLE:
735
747
  from mlx_vlm import stream_generate as vlm_stream
736
- gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, temp=temperature, draft_model=draft_model, draft_kind="mtp")
748
+ gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
737
749
  else:
738
750
  from mlx_lm import stream_generate as lm_stream
739
- gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, temp=temperature, draft_model=draft_model)
751
+ gen = lm_stream(model, tokenizer, prompt=prompt, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model)
740
752
  for chunk in gen:
741
753
  text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
742
754
  loop.call_soon_threadsafe(queue.put_nowait, text)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ltcai",
3
- "version": "0.4.0",
3
+ "version": "0.5.0",
4
4
  "description": "Lattice AI local MLX/cloud LLM workspace server",
5
5
  "homepage": "https://github.com/TaeSooPark-PTS/LatticeAI#readme",
6
6
  "repository": {
package/server.py CHANGED
@@ -1121,7 +1121,7 @@ async def lifespan(app: FastAPI):
1121
1121
  except Exception:
1122
1122
  pass
1123
1123
 
1124
- app = FastAPI(title=f"Lattice AI Server ({APP_MODE})", version="0.4.0", lifespan=lifespan)
1124
+ app = FastAPI(title=f"Lattice AI Server ({APP_MODE})", version="0.5.0", lifespan=lifespan)
1125
1125
 
1126
1126
  CORS_ALLOWED_ORIGINS = [
1127
1127
  f"http://localhost:{DEFAULT_PORT}",
@@ -3466,7 +3466,7 @@ async def verify_cloud_models(force: bool = False, provider_filter: Optional[str
3466
3466
 
3467
3467
  @app.get("/health")
3468
3468
  async def health(request: Request):
3469
- base = {"status": "ok", "version": "0.4.0", "mode": APP_MODE}
3469
+ base = {"status": "ok", "version": "0.5.0", "mode": APP_MODE}
3470
3470
  if not get_current_user(request) and REQUIRE_AUTH:
3471
3471
  return base
3472
3472
  engines = await asyncio.to_thread(engine_status)