ltcai 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/server.py CHANGED
@@ -46,8 +46,10 @@ from pydantic import BaseModel
46
46
  from PIL import Image
47
47
 
48
48
  from llm_router import AsyncOpenAI, LLMRouter, OPENAI_COMPATIBLE_PROVIDERS, HF_MODELS_ROOT, ensure_mlx_runtime, hf_model_dir, parse_model_ref, mx, normalize_branding
49
- from knowledge_graph import KnowledgeGraphStore
49
+ from knowledge_graph import KnowledgeGraphStore, set_llm_router
50
50
  from knowledge_graph_api import create_knowledge_graph_router
51
+ from latticeai.core.context_builder import retrieve_context_for_generation, format_sources_footnote
52
+ from latticeai.core.document_generator import detect_document_intent, DocumentGenerationSession
51
53
  from local_knowledge_api import LocalKnowledgeWatcher, create_local_knowledge_router
52
54
  from latticeai.core.security import (
53
55
  hash_password as _hash_password,
@@ -1001,7 +1003,9 @@ def build_admin_audit_report(users: Dict) -> Dict:
1001
1003
  )
1002
1004
 
1003
1005
  router = LLMRouter()
1006
+ set_llm_router(router)
1004
1007
  gardener = PReinforceGardener()
1008
+ _doc_gen_sessions: dict = {} # conversation_id → DocumentGenerationSession
1005
1009
 
1006
1010
  async def autoload_default_model() -> None:
1007
1011
  if not AUTOLOAD_MODELS:
@@ -1103,7 +1107,7 @@ async def lifespan(app: FastAPI):
1103
1107
  except Exception:
1104
1108
  pass
1105
1109
 
1106
- app = FastAPI(title=f"Lattice AI Server ({APP_MODE})", version="2.1.0", lifespan=lifespan)
1110
+ app = FastAPI(title=f"Lattice AI Server ({APP_MODE})", version="0.2.2", lifespan=lifespan)
1107
1111
 
1108
1112
  CORS_ALLOWED_ORIGINS = [
1109
1113
  f"http://localhost:{DEFAULT_PORT}",
@@ -1620,6 +1624,9 @@ ENGINE_MODEL_CATALOG = {
1620
1624
  {"id": "mlx-community/Qwen3-VL-30B-A3B-Instruct-4bit", "name": "Qwen3-VL 30B A3B", "family": "Qwen3-VL", "tag": "local-vlm", "size": "18GB", "pullable": True},
1621
1625
  {"id": "mlx-community/gemma-3-27b-it-4bit", "name": "Gemma 3 27B", "family": "Gemma 3", "tag": "local-vlm", "size": "17GB", "pullable": True},
1622
1626
  {"id": "mlx-community/gemma-4-26b-a4b-it-4bit", "name": "Gemma 4 26B A4B Instruct", "family": "Gemma 4", "tag": "local-vlm", "size": "15.6GB", "pullable": True},
1627
+ {"id": "mlx-community/gemma-4-31b-it-4bit", "name": "Gemma 4 31B Instruct", "family": "Gemma 4", "tag": "local-vlm", "size": "18.4GB", "pullable": True},
1628
+ {"id": "mlx-community/gpt-oss-20b-MXFP4-Q8", "name": "GPT-OSS 20B", "family": "GPT-OSS", "tag": "local-reasoning", "size": "12.1GB", "pullable": True},
1629
+ {"id": "mlx-community/gpt-oss-120b-MXFP4-Q4", "name": "GPT-OSS 120B", "family": "GPT-OSS", "tag": "local-large", "size": "62.3GB", "pullable": True},
1623
1630
  {"id": "mlx-community/Llama-3.3-70B-Instruct-4bit", "name": "Llama 3.3 70B", "family": "Llama 3.x", "tag": "local-general", "size": "40GB+", "pullable": True},
1624
1631
  {"id": "mlx-community/Llama-3.1-70B-Instruct-4bit", "name": "Llama 3.1 70B", "family": "Llama 3.1", "tag": "local-general", "size": "40GB+", "pullable": True},
1625
1632
  ],
@@ -1627,6 +1634,9 @@ ENGINE_MODEL_CATALOG = {
1627
1634
  {"id": "ollama:qwen3-vl:4b", "name": "Qwen3-VL 4B via Ollama", "family": "Qwen3-VL", "tag": "local-vlm", "size": "pull required", "pullable": True},
1628
1635
  {"id": "ollama:qwen3-vl:8b", "name": "Qwen3-VL 8B via Ollama", "family": "Qwen3-VL", "tag": "local-vlm", "size": "pull required", "pullable": True},
1629
1636
  {"id": "ollama:qwen3-vl:30b", "name": "Qwen3-VL 30B via Ollama", "family": "Qwen3-VL", "tag": "local-vlm", "size": "pull required", "pullable": True},
1637
+ {"id": "ollama:gpt-oss:20b", "name": "GPT-OSS 20B via Ollama", "family": "GPT-OSS", "tag": "local-reasoning", "size": "pull required", "pullable": True},
1638
+ {"id": "ollama:gpt-oss:120b", "name": "GPT-OSS 120B via Ollama", "family": "GPT-OSS", "tag": "local-large", "size": "pull required", "pullable": True},
1639
+ {"id": "ollama:hf.co/ggml-org/gemma-4-31B-it-GGUF:Q4_K_M", "name": "Gemma 4 31B Q4 via Ollama", "family": "Gemma 4", "tag": "local-vlm", "size": "18.7GB", "pullable": True},
1630
1640
  {"id": "ollama:qwen3:8b", "name": "Qwen3 8B via Ollama", "family": "Qwen", "tag": "local-server", "size": "pull required", "pullable": True},
1631
1641
  {"id": "ollama:qwen2.5-coder:14b", "name": "Qwen2.5 Coder 14B via Ollama", "family": "Qwen", "tag": "local-coding", "size": "pull required", "pullable": True},
1632
1642
  {"id": "ollama:gemma3:1b", "name": "Gemma 3 1B via Ollama", "family": "Gemma", "tag": "local-light", "size": "pull required", "pullable": True},
@@ -1649,6 +1659,8 @@ ENGINE_MODEL_CATALOG = {
1649
1659
  {"id": "ollama:smollm2:1.7b", "name": "SmolLM2 1.7B via Ollama", "family": "SmolLM", "tag": "local-light", "size": "pull required", "pullable": True},
1650
1660
  ],
1651
1661
  "vllm": [
1662
+ {"id": "vllm:openai/gpt-oss-20b", "name": "GPT-OSS 20B via vLLM", "family": "GPT-OSS", "tag": "local-reasoning", "size": "server model", "pullable": True},
1663
+ {"id": "vllm:openai/gpt-oss-120b", "name": "GPT-OSS 120B via vLLM", "family": "GPT-OSS", "tag": "local-large", "size": "server model", "pullable": True},
1652
1664
  {"id": "vllm:Qwen/Qwen3-VL-4B-Instruct", "name": "Qwen3-VL 4B via vLLM", "family": "Qwen3-VL", "tag": "local-vlm", "size": "server model", "pullable": True},
1653
1665
  {"id": "vllm:Qwen/Qwen3-VL-8B-Instruct", "name": "Qwen3-VL 8B via vLLM", "family": "Qwen3-VL", "tag": "local-vlm", "size": "server model", "pullable": True},
1654
1666
  {"id": "vllm:Qwen/Qwen3-VL-30B-A3B-Instruct", "name": "Qwen3-VL 30B A3B via vLLM", "family": "Qwen3-VL", "tag": "local-vlm", "size": "server model", "pullable": True},
@@ -1671,6 +1683,9 @@ ENGINE_MODEL_CATALOG = {
1671
1683
  {"id": "vllm:meta-llama/Llama-3.1-70B-Instruct", "name": "Llama 3.1 70B via vLLM", "family": "Llama 3.1", "tag": "local-server", "size": "server model", "pullable": True},
1672
1684
  ],
1673
1685
  "lmstudio": [
1686
+ {"id": "lmstudio:openai/gpt-oss-20b", "name": "GPT-OSS 20B via LM Studio", "family": "GPT-OSS", "tag": "local-reasoning", "size": "server model", "pullable": True},
1687
+ {"id": "lmstudio:openai/gpt-oss-120b", "name": "GPT-OSS 120B via LM Studio", "family": "GPT-OSS", "tag": "local-large", "size": "server model", "pullable": True},
1688
+ {"id": "lmstudio:ggml-org/gemma-4-31B-it-GGUF", "name": "Gemma 4 31B 4-bit via LM Studio", "family": "Gemma 4", "tag": "local-vlm", "size": "server model", "pullable": True},
1674
1689
  {"id": "lmstudio:Qwen/Qwen3-VL-4B-Instruct", "name": "Qwen3-VL 4B via LM Studio", "family": "Qwen3-VL", "tag": "local-vlm", "size": "server model", "pullable": True},
1675
1690
  {"id": "lmstudio:Qwen/Qwen3-VL-8B-Instruct", "name": "Qwen3-VL 8B via LM Studio", "family": "Qwen3-VL", "tag": "local-vlm", "size": "server model", "pullable": True},
1676
1691
  {"id": "lmstudio:Qwen/Qwen3-VL-30B-A3B-Instruct", "name": "Qwen3-VL 30B A3B via LM Studio", "family": "Qwen3-VL", "tag": "local-vlm", "size": "server model", "pullable": True},
@@ -1691,6 +1706,9 @@ ENGINE_MODEL_CATALOG = {
1691
1706
  {"id": "lmstudio:meta-llama/Llama-3.1-70B-Instruct", "name": "Llama 3.1 70B via LM Studio", "family": "Llama 3.1", "tag": "local-server", "size": "server model", "pullable": True},
1692
1707
  ],
1693
1708
  "llamacpp": [
1709
+ {"id": "llamacpp:ggml-org/gpt-oss-20b-GGUF", "name": "GPT-OSS 20B GGUF via llama.cpp", "family": "GPT-OSS", "tag": "gguf-q4", "size": "gguf", "pullable": True},
1710
+ {"id": "llamacpp:ggml-org/gpt-oss-120b-GGUF", "name": "GPT-OSS 120B GGUF via llama.cpp", "family": "GPT-OSS", "tag": "gguf-q4", "size": "gguf", "pullable": True},
1711
+ {"id": "llamacpp:ggml-org/gemma-4-31B-it-GGUF", "name": "Gemma 4 31B GGUF via llama.cpp", "family": "Gemma 4", "tag": "gguf-q4", "size": "gguf", "pullable": True},
1694
1712
  {"id": "llamacpp:Qwen/Qwen3-VL-4B-Instruct-GGUF", "name": "Qwen3-VL 4B GGUF via llama.cpp", "family": "Qwen3-VL", "tag": "gguf-vlm", "size": "gguf", "pullable": True},
1695
1713
  {"id": "llamacpp:Qwen/Qwen3-VL-8B-Instruct-GGUF", "name": "Qwen3-VL 8B GGUF via llama.cpp", "family": "Qwen3-VL", "tag": "gguf-vlm", "size": "gguf", "pullable": True},
1696
1714
  {"id": "llamacpp:unsloth/gemma-2-2b-it-GGUF", "name": "Gemma 2 2B GGUF via llama.cpp", "family": "Gemma", "tag": "gguf-q4", "size": "gguf", "pullable": True},
@@ -1706,6 +1724,97 @@ ENGINE_MODEL_CATALOG = {
1706
1724
  ],
1707
1725
  }
1708
1726
 
1727
# Engine-specific identifiers for models that may be requested under several
# names. Each group pairs the accepted aliases (all lower-case) with the
# identifier to use on every supported engine. Keeping one table per model
# prevents the per-alias copies from drifting apart.
_MODEL_ALIAS_GROUPS = (
    (
        ("gpt-oss-20b", "openai/gpt-oss-20b"),
        {
            "local_mlx": "mlx-community/gpt-oss-20b-MXFP4-Q8",
            "ollama": "gpt-oss:20b",
            "vllm": "openai/gpt-oss-20b",
            "lmstudio": "openai/gpt-oss-20b",
            "llamacpp": "ggml-org/gpt-oss-20b-GGUF",
        },
    ),
    (
        ("gpt-oss-120b", "openai/gpt-oss-120b"),
        {
            "local_mlx": "mlx-community/gpt-oss-120b-MXFP4-Q4",
            "ollama": "gpt-oss:120b",
            "vllm": "openai/gpt-oss-120b",
            "lmstudio": "openai/gpt-oss-120b",
            "llamacpp": "ggml-org/gpt-oss-120b-GGUF",
        },
    ),
    (
        (
            "gemma-4-31b-it-4bit",
            "suitch/gemma-4-31b-it-4bit",
            "mlx-community/gemma-4-31b-it-4bit",
        ),
        {
            "local_mlx": "mlx-community/gemma-4-31b-it-4bit",
            "ollama": "hf.co/ggml-org/gemma-4-31B-it-GGUF:Q4_K_M",
            "vllm": "suitch/gemma-4-31B-it-4bit",
            "lmstudio": "ggml-org/gemma-4-31B-it-GGUF",
            "llamacpp": "ggml-org/gemma-4-31B-it-GGUF",
        },
    ),
)

# alias -> {engine -> model identifier}. Every alias receives its own copy of
# the engine table, so mutating one entry cannot affect its sibling aliases.
MODEL_ENGINE_ALIASES: Dict[str, Dict[str, str]] = {
    alias: dict(targets)
    for aliases, targets in _MODEL_ALIAS_GROUPS
    for alias in aliases
}
1778
+
1779
# Family-name patterns used to read a version number out of a catalog entry.
# Each regex captures the "<major>[.<minor>]" that follows the family name,
# optionally separated by a hyphen or whitespace (e.g. "gemma-3", "Qwen2.5").
# NOTE(review): a parameter-size suffix placed directly after the family name
# (e.g. "Llama-8B") is also captured as a version -- confirm that no catalog
# entry has that shape.
_VERSIONED_MODEL_PATTERNS = (
    ("gemma", re.compile(r"\bgemma[-\s]?(\d+(?:\.\d+)?)", re.IGNORECASE)),
    ("qwen", re.compile(r"\bqwen[-\s]?(\d+(?:\.\d+)?)", re.IGNORECASE)),
    ("llama", re.compile(r"\bllama[-\s]?(\d+(?:\.\d+)?)", re.IGNORECASE)),
    ("phi", re.compile(r"\bphi[-\s]?(\d+(?:\.\d+)?)", re.IGNORECASE)),
)


def _version_tuple(raw: str) -> tuple[int, ...]:
    """Convert a dotted version string such as ``"3.3"`` into ``(3, 3)``.

    Components that are not made of decimal digits are skipped, so the result
    can be shorter than the input, or empty.
    """
    # isdecimal() rather than isdigit(): the latter also accepts characters
    # such as superscript digits, which int() rejects with ValueError.
    return tuple(int(part) for part in raw.split(".") if part.isdecimal())


def _model_family_version(model: Dict[str, object]) -> Optional[tuple[str, tuple[int, ...]]]:
    """Return ``(family, version)`` for a catalog entry, or ``None``.

    The entry's ``family``, ``name`` and ``id`` values are searched with each
    pattern in ``_VERSIONED_MODEL_PATTERNS``; the first family that yields a
    version wins.
    """
    text = " ".join(str(model.get(key) or "") for key in ("family", "name", "id"))
    for family, pattern in _VERSIONED_MODEL_PATTERNS:
        # Use the most specific version mentioned anywhere in the entry. A
        # wildcard family label such as "Llama 3.x" only yields (3,), which
        # would otherwise rank "Llama 3.3" below "Llama 3.1".
        candidates = [_version_tuple(hit.group(1)) for hit in pattern.finditer(text)]
        best = max(candidates, default=())
        if best:
            return family, best
    return None


def filter_lower_family_versions(models: List[Dict[str, object]]) -> List[Dict[str, object]]:
    """Drop entries superseded by a newer version of the same model family.

    Entries whose family/version cannot be detected are always kept. For every
    detected family only the entries carrying that family's highest version
    survive. Input order is preserved and the returned list holds the same
    dict objects that were passed in.
    """
    detected = [(model, _model_family_version(model)) for model in models]

    # First pass: highest version seen per family.
    newest: Dict[str, tuple[int, ...]] = {}
    for _, info in detected:
        if info is None:
            continue
        family, version = info
        if version > newest.get(family, (0,)):
            newest[family] = version

    # Second pass: keep unversioned entries and those at the family maximum.
    return [
        model
        for model, info in detected
        if info is None or info[1] >= newest.get(info[0], info[1])
    ]
1817
+
1709
1818
  def _update_env_file(env_file: Path, key: str, value: str) -> None:
1710
1819
  lines = []
1711
1820
  found = False
@@ -2525,17 +2634,20 @@ def engine_status() -> List[Dict]:
2525
2634
  for m in ENGINE_MODEL_CATALOG["ollama"]:
2526
2635
  pull_name = m["id"].removeprefix("ollama:")
2527
2636
  ollama_models.append({**m, "pulled": pull_name in pulled})
2637
+ ollama_models = filter_lower_family_versions(ollama_models)
2528
2638
 
2529
2639
  HF_MODELS_ROOT.mkdir(parents=True, exist_ok=True)
2530
2640
  mlx_models = []
2531
2641
  for m in ENGINE_MODEL_CATALOG.get("local_mlx", []):
2532
2642
  repo_id = m["id"]
2533
2643
  mlx_models.append({**m, "pulled": hf_model_ready(repo_id, "local_mlx")})
2644
+ mlx_models = filter_lower_family_versions(mlx_models)
2534
2645
 
2535
2646
  vllm_models = []
2536
2647
  for m in ENGINE_MODEL_CATALOG.get("vllm", []):
2537
2648
  repo_id = m["id"].removeprefix("vllm:")
2538
2649
  vllm_models.append({**m, "pulled": hf_model_ready(repo_id, "vllm")})
2650
+ vllm_models = filter_lower_family_versions(vllm_models)
2539
2651
 
2540
2652
  lmstudio_models = []
2541
2653
  downloaded_lmstudio = get_lmstudio_models()
@@ -2567,11 +2679,13 @@ def engine_status() -> List[Dict]:
2567
2679
  repo_id = m["id"].removeprefix("lmstudio:")
2568
2680
  if f"lmstudio:{repo_id}" not in known_ids and repo_id not in downloaded_by_key:
2569
2681
  lmstudio_models.append({**m, "pulled": False})
2682
+ lmstudio_models = filter_lower_family_versions(lmstudio_models)
2570
2683
 
2571
2684
  llamacpp_models = []
2572
2685
  for m in ENGINE_MODEL_CATALOG.get("llamacpp", []):
2573
2686
  repo_id = m["id"].removeprefix("llamacpp:")
2574
2687
  llamacpp_models.append({**m, "pulled": hf_model_ready(repo_id, "llamacpp")})
2688
+ llamacpp_models = filter_lower_family_versions(llamacpp_models)
2575
2689
 
2576
2690
  local_server_specs = [
2577
2691
  {
@@ -2768,8 +2882,29 @@ def install_engine(engine: str) -> Dict:
2768
2882
  return result
2769
2883
 
2770
2884
 
2885
def _resolve_model_alias(model_id: str, engine: Optional[str] = None) -> str:
    """Translate a model alias into the identifier used by the target engine.

    A recognised ``<engine>:`` prefix on ``model_id`` selects the engine;
    otherwise ``engine`` is used, with an empty/``mlx`` hint meaning
    ``local_mlx``. The remaining name is looked up case-insensitively in
    ``MODEL_ENGINE_ALIASES``.

    Returns the stripped ``model_id`` unchanged when no alias (or no entry for
    the chosen engine) exists. Otherwise returns the mapped identifier, bare
    for ``local_mlx`` and prefixed with ``"<engine>:"`` for every other engine.
    """
    requested = model_id.strip()

    head, colon, tail = requested.partition(":")
    head = head.strip().lower()
    if colon and head in {"ollama", "vllm", "lmstudio", "llamacpp", "local_mlx", "mlx"}:
        # An explicit engine prefix takes precedence over the engine hint.
        target = "local_mlx" if head in {"local_mlx", "mlx"} else head
        bare_name = tail.strip()
    else:
        hint = (engine or "").strip().lower()
        target = "local_mlx" if hint in {"", "local_mlx", "mlx"} else hint
        bare_name = requested

    per_engine = MODEL_ENGINE_ALIASES.get(bare_name.lower())
    resolved = per_engine.get(target) if per_engine else None
    if not resolved:
        return requested
    if target == "local_mlx":
        return resolved
    return f"{target}:{resolved}"
2904
+
2905
+
2771
2906
  def normalize_local_model_request(model_id: str, engine: Optional[str] = None) -> str:
2772
- model_id = model_id.strip()
2907
+ model_id = _resolve_model_alias(model_id, engine)
2773
2908
  engine = (engine or "").strip().lower()
2774
2909
  if engine in {"local_mlx", "mlx"} and model_id.startswith(("local_mlx:", "mlx:")):
2775
2910
  return model_id.split(":", 1)[1].strip()
@@ -3165,7 +3300,7 @@ async def verify_cloud_models(force: bool = False, provider_filter: Optional[str
3165
3300
 
3166
3301
  @app.get("/health")
3167
3302
  async def health(request: Request):
3168
- base = {"status": "ok", "version": "2.1.0", "mode": APP_MODE}
3303
+ base = {"status": "ok", "version": "0.2.2", "mode": APP_MODE}
3169
3304
  if not get_current_user(request) and REQUIRE_AUTH:
3170
3305
  return base
3171
3306
  engines = await asyncio.to_thread(engine_status)
@@ -3206,7 +3341,7 @@ async def engines_verify_cloud(req: VerifyCloudRequest, request: Request):
3206
3341
  @app.post("/engines/pull-model")
3207
3342
  async def pull_ollama_model(req: PullModelRequest, request: Request):
3208
3343
  require_user(request)
3209
- model_ref = req.model.strip()
3344
+ model_ref = normalize_local_model_request(req.model, None)
3210
3345
  if not model_ref:
3211
3346
  raise HTTPException(status_code=400, detail="λͺ¨λΈ μ‹λ³„μžκ°€ λΉ„μ–΄ μžˆμŠ΅λ‹ˆλ‹€.")
3212
3347
 
@@ -3324,23 +3459,8 @@ async def set_api_key(req: SetApiKeyRequest, request: Request):
3324
3459
  async def list_models():
3325
3460
  """HuggingFace 추천 모델 목록 및 로드 상태 반환"""
3326
3461
  recommended = [
3327
- {"id": "mlx-community/Qwen3-VL-4B-Instruct-4bit", "name": "Qwen3-VL 4B", "tag": "multimodal", "size": "2.7GB"},
3328
- {"id": "mlx-community/Qwen3-VL-8B-Instruct-4bit", "name": "Qwen3-VL 8B", "tag": "multimodal", "size": "4.8GB"},
3329
- {"id": "mlx-community/Qwen3-VL-30B-A3B-Instruct-4bit", "name": "Qwen3-VL 30B A3B","tag": "multimodal", "size": "18GB"},
3330
- {"id": "mlx-community/SmolLM-1.7B-Instruct-4bit", "name": "SmolLM 1.7B", "tag": "ultra-light", "size": "963MB"},
3331
- {"id": "mlx-community/gemma-3-1b-it-4bit", "name": "Gemma 3 1B", "tag": "ultra-light", "size": "733MB"},
3332
- {"id": "mlx-community/Llama-3.2-1B-Instruct-4bit", "name": "Llama 3.2 1B", "tag": "light", "size": "1.3GB"},
3333
- {"id": "mlx-community/Llama-3.2-3B-Instruct-4bit", "name": "Llama 3.2 3B", "tag": "light", "size": "2.0GB"},
3334
- {"id": "mlx-community/Phi-4-mini-instruct-4bit", "name": "Phi 4 Mini", "tag": "coding", "size": "2.2GB"},
3335
- {"id": "mlx-community/Qwen2.5-VL-7B-Instruct-4bit", "name": "Qwen2.5-VL 7B", "tag": "multimodal", "size": "4.4GB"},
3336
- {"id": "mlx-community/Mistral-7B-Instruct-v0.3-4bit", "name": "Mistral 7B v0.3", "tag": "general", "size": "4.1GB"},
3337
- {"id": "mlx-community/Llama-3.1-8B-Instruct-4bit", "name": "Llama 3.1 8B", "tag": "general", "size": "4.7GB"},
3338
- {"id": "mlx-community/gemma-4-e4b-it-4bit", "name": "Gemma 4 E4B", "tag": "multimodal", "size": "5.2GB"},
3339
- {"id": "mlx-community/gemma-3-12b-it-4bit", "name": "Gemma 3 12B", "tag": "balanced", "size": "8.0GB"},
3340
- {"id": "mlx-community/phi-4-4bit", "name": "Phi 4", "tag": "coding", "size": "8.3GB"},
3341
- {"id": "mlx-community/Mistral-Small-24B-Instruct-2501-4bit", "name": "Mistral Small 24B", "tag": "large", "size": "13.3GB"},
3342
- {"id": "mlx-community/Qwen2.5-Coder-32B-Instruct-4bit", "name": "Qwen2.5 Coder 32B","tag": "coding", "size": "18.5GB"},
3343
- {"id": "mlx-community/gemma-4-26b-a4b-it-4bit", "name": "Gemma 4 26B A4B", "tag": "multimodal", "size": "15.6GB"},
3462
+ {"id": item["id"], "name": item["name"], "tag": item["tag"], "size": item["size"]}
3463
+ for item in filter_lower_family_versions(ENGINE_MODEL_CATALOG.get("local_mlx", []))
3344
3464
  ]
3345
3465
  return {
3346
3466
  "recommended": recommended,
@@ -3520,12 +3640,24 @@ async def chat(req: ChatRequest, request: Request):
3520
3640
  except Exception as e:
3521
3641
  logging.warning("Knowledge reinforcement skipped: %s", e)
3522
3642
 
3643
+ is_doc_gen = detect_document_intent(req.message)
3644
+ doc_gen_context_result = None
3645
+
3523
3646
  try:
3524
3647
  if ENABLE_GRAPH and KNOWLEDGE_GRAPH:
3525
- graph_context = KNOWLEDGE_GRAPH.context_for_query(req.message)
3526
- if graph_context:
3527
- context += f"\n\n[KNOWLEDGE GRAPH]\n{graph_context}"
3528
- print("πŸ•ΈοΈ Context reinforced with knowledge graph.")
3648
+ if is_doc_gen:
3649
+ doc_gen_context_result = retrieve_context_for_generation(
3650
+ KNOWLEDGE_GRAPH, req.message, max_results=10, max_hops=2,
3651
+ )
3652
+ graph_md = doc_gen_context_result.get("context_markdown", "")
3653
+ if graph_md:
3654
+ context += f"\n\n[KNOWLEDGE GRAPH β€” Document Generation Context]\n{graph_md}"
3655
+ print("πŸ“ Document generation context retrieved from knowledge graph.")
3656
+ else:
3657
+ graph_context = KNOWLEDGE_GRAPH.context_for_query(req.message)
3658
+ if graph_context:
3659
+ context += f"\n\n[KNOWLEDGE GRAPH]\n{graph_context}"
3660
+ print("πŸ•ΈοΈ Context reinforced with knowledge graph.")
3529
3661
  except Exception as e:
3530
3662
  logging.warning("Knowledge graph reinforcement skipped: %s", e)
3531
3663
 
@@ -3535,7 +3667,6 @@ async def chat(req: ChatRequest, request: Request):
3535
3667
  context += f"\n\n{screenshot_context}"
3536
3668
 
3537
3669
  if env_bool("LATTICEAI_AUTO_READ_CHAT_PATHS", default=False):
3538
- # Off by default: automatic local-file injection can leak files to cloud models.
3539
3670
  _file_path_re = re.compile(r'(?:^|[\s\'\"(])((~|/[\w.])[^\s\'")\]]*)', re.MULTILINE)
3540
3671
  for _m in _file_path_re.finditer(req.message or ""):
3541
3672
  _fpath = _m.group(1).strip()
@@ -3553,6 +3684,55 @@ async def chat(req: ChatRequest, request: Request):
3553
3684
  if req.source != "telegram":
3554
3685
  asyncio.create_task(broadcast_web_chat("user", req.message))
3555
3686
 
3687
+ if is_doc_gen and ENABLE_GRAPH and KNOWLEDGE_GRAPH:
3688
+ conv_key = req.conversation_id or "default"
3689
+ session = _doc_gen_sessions.get(conv_key)
3690
+ if session is None:
3691
+ session = DocumentGenerationSession()
3692
+ _doc_gen_sessions[conv_key] = session
3693
+ graph_md = (doc_gen_context_result or {}).get("context_markdown", "")
3694
+ system_prompt = session.get_system_prompt(graph_md)
3695
+ sources = (doc_gen_context_result or {}).get("sources", [])
3696
+ footnote = format_sources_footnote(sources)
3697
+
3698
+ if req.stream:
3699
+ async def _stream_doc_gen():
3700
+ collected = []
3701
+ async for chunk in router.stream_generate_document(
3702
+ req.message, system_prompt,
3703
+ max_tokens=req.max_tokens or 8192,
3704
+ temperature=req.temperature or 0.3,
3705
+ ):
3706
+ collected.append(chunk)
3707
+ yield f"data: {json.dumps({'text': chunk}, ensure_ascii=False)}\n\n"
3708
+ full_text = "".join(collected)
3709
+ if footnote:
3710
+ yield f"data: {json.dumps({'text': footnote}, ensure_ascii=False)}\n\n"
3711
+ full_text += footnote
3712
+ session.update(graph_md, full_text, req.conversation_id)
3713
+ save_to_history("assistant", full_text, source=req.source or "web", conversation_id=req.conversation_id, **history_user)
3714
+ if req.source != "telegram":
3715
+ asyncio.create_task(broadcast_web_chat("assistant", full_text))
3716
+ yield "data: [DONE]\n\n"
3717
+ return StreamingResponse(
3718
+ _stream_doc_gen(),
3719
+ media_type="text/event-stream",
3720
+ headers={"X-Model": router.current_model_id, "X-Doc-Gen": "true"},
3721
+ )
3722
+ else:
3723
+ result = await router.generate_document(
3724
+ req.message, system_prompt,
3725
+ max_tokens=req.max_tokens or 8192,
3726
+ temperature=req.temperature or 0.3,
3727
+ )
3728
+ if footnote:
3729
+ result += footnote
3730
+ session.update(graph_md, result, req.conversation_id)
3731
+ save_to_history("assistant", str(result), source=req.source or "web", conversation_id=req.conversation_id, **history_user)
3732
+ if req.source != "telegram":
3733
+ asyncio.create_task(broadcast_web_chat("assistant", str(result)))
3734
+ return JSONResponse(content={"response": str(result)})
3735
+
3556
3736
  if req.stream:
3557
3737
  recent_context = build_recent_chat_context(user_email=effective_email, conversation_id=req.conversation_id)
3558
3738
  stream_context = context
@@ -159,6 +159,32 @@
159
159
  --radius-sm: var(--lt-radius-sm);
160
160
  }
161
161
 
162
+ /* ── Global polish ─────────────────────────────────────────── */
163
+ ::selection {
164
+ background: rgba(110, 74, 230, 0.18);
165
+ color: var(--lt-color-ink-900);
166
+ }
167
+
168
+ :focus-visible {
169
+ outline: 2px solid rgba(110, 74, 230, 0.40);
170
+ outline-offset: 2px;
171
+ }
172
+
173
+ ::-webkit-scrollbar {
174
+ width: 6px;
175
+ height: 6px;
176
+ }
177
+ ::-webkit-scrollbar-track {
178
+ background: transparent;
179
+ }
180
+ ::-webkit-scrollbar-thumb {
181
+ background: rgba(110, 74, 230, 0.16);
182
+ border-radius: 99px;
183
+ }
184
+ ::-webkit-scrollbar-thumb:hover {
185
+ background: rgba(110, 74, 230, 0.28);
186
+ }
187
+
162
188
  /* ── Reduced motion (a11y) ─────────────────────────────────── */
163
189
  @media (prefers-reduced-motion: reduce) {
164
190
  :root {