PyPI - fusefable - Versions diffs - 0.1.9__tar.gz → 0.3.0__tar.gz - Mend

fusefable 0.1.9tar.gz → 0.3.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

{fusefable-0.1.9 → fusefable-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fusefable
-Version: 0.1.9
+Version: 0.3.0
 Summary: Fuse multiple AI models and judge the best answer for coding
 Author: proultrax9
 License: MIT
@@ -145,6 +145,46 @@ Exposes a tool `fuse_ask(question, models?, cheap?)` for any MCP client.
 > Requires `pip install "fusefable[mcp]"` and a completed `fusefable config`.
 > If `fusefable` isn't on the app's PATH, use a full path such as `python -m fusefable.cli`.
+## Ensemble, cache & budget
+```bash
+fusefable ask --ensemble "..."     # merge all answers into one (vs picking one)
+fusefable ask --cache "..."        # reuse the answer for an identical question
+fusefable ask --no-cache "..."     # force a fresh run
+```
+- **Ensemble mode** (`--ensemble`, config `fusion_mode: ensemble`): instead of the judge
+  picking one answer, a model synthesizes a single answer combining the strengths of all
+  candidates (anonymized). Falls back to the first answer if synthesis fails.
+- **Cache** (`--cache`, config `cache: true`, `cache_ttl_seconds`): identical question +
+  same models/mode/compression returns the stored answer instantly with no API calls
+  (`cached, $0`). Stored in `~/.fusefable/cache/`. `cache_ttl_seconds: 0` = never expires.
+- **Budget cap** (config `budget_cap_usd`, `budget_action: warn|stop`): before firing,
+  the run estimates cost. If it exceeds the cap — `warn` prints a warning and continues,
+  `stop` aborts before spending anything.
+## Prompt compression (save tokens)
+Reduce token usage while keeping answer quality — useful when you pay per-provider
+directly. Two tiers, opt-in via `--compress`:
+```bash
+fusefable ask --compress "<long prompt or pasted code>"
+# [compressed: 5200→1800 chars, ~65% saved via llm]
+```
+- **Tier 1 (lossless):** trims trailing whitespace, collapses blank lines, strips
+  zero-width chars — keeps indentation and inner spacing intact (safe for code).
+- **Tier 2 (LLM):** for prompts above `compress_min_chars` (default 2000), a cheap
+  model compresses semantically — **once**, then the compressed prompt is sent to all
+  models, so you save `tokens × number-of-models`.
+- **Quality guards:** prompts under the threshold skip the LLM; if the compressed
+  result is empty, longer, or under 30% of the original, it falls back to the lossless
+  text. The judge always sees the **original** question.
+Config (`~/.fusefable/config.yaml`): `compress`, `compress_min_chars`, `compress_model`
+(empty = reuse the judge model).
 ## Architecture
 ```

{fusefable-0.1.9 → fusefable-0.3.0}/README.md RENAMED Viewed

@@ -117,6 +117,46 @@ Exposes a tool `fuse_ask(question, models?, cheap?)` for any MCP client.
 > Requires `pip install "fusefable[mcp]"` and a completed `fusefable config`.
 > If `fusefable` isn't on the app's PATH, use a full path such as `python -m fusefable.cli`.
+## Ensemble, cache & budget
+```bash
+fusefable ask --ensemble "..."     # merge all answers into one (vs picking one)
+fusefable ask --cache "..."        # reuse the answer for an identical question
+fusefable ask --no-cache "..."     # force a fresh run
+```
+- **Ensemble mode** (`--ensemble`, config `fusion_mode: ensemble`): instead of the judge
+  picking one answer, a model synthesizes a single answer combining the strengths of all
+  candidates (anonymized). Falls back to the first answer if synthesis fails.
+- **Cache** (`--cache`, config `cache: true`, `cache_ttl_seconds`): identical question +
+  same models/mode/compression returns the stored answer instantly with no API calls
+  (`cached, $0`). Stored in `~/.fusefable/cache/`. `cache_ttl_seconds: 0` = never expires.
+- **Budget cap** (config `budget_cap_usd`, `budget_action: warn|stop`): before firing,
+  the run estimates cost. If it exceeds the cap — `warn` prints a warning and continues,
+  `stop` aborts before spending anything.
+## Prompt compression (save tokens)
+Reduce token usage while keeping answer quality — useful when you pay per-provider
+directly. Two tiers, opt-in via `--compress`:
+```bash
+fusefable ask --compress "<long prompt or pasted code>"
+# [compressed: 5200→1800 chars, ~65% saved via llm]
+```
+- **Tier 1 (lossless):** trims trailing whitespace, collapses blank lines, strips
+  zero-width chars — keeps indentation and inner spacing intact (safe for code).
+- **Tier 2 (LLM):** for prompts above `compress_min_chars` (default 2000), a cheap
+  model compresses semantically — **once**, then the compressed prompt is sent to all
+  models, so you save `tokens × number-of-models`.
+- **Quality guards:** prompts under the threshold skip the LLM; if the compressed
+  result is empty, longer, or under 30% of the original, it falls back to the lossless
+  text. The judge always sees the **original** question.
+Config (`~/.fusefable/config.yaml`): `compress`, `compress_min_chars`, `compress_model`
+(empty = reuse the judge model).
 ## Architecture
 ```

fusefable-0.3.0/fusefable/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.3.0"

fusefable-0.3.0/fusefable/cache.py ADDED Viewed

@@ -0,0 +1,74 @@
+"""Cache คำตอบ — คำถามซ้ำ (config เดิม) ไม่ต้องยิงใหม่.
+เก็บเป็นไฟล์ JSON ใน ~/.fusefable/cache/<sha256>.json
+key มาจาก question + รายชื่อโมเดล + flags ที่กระทบผลลัพธ์
+"""
+from __future__ import annotations
+import hashlib
+import json
+import time
+from pathlib import Path
+from typing import Optional, Sequence
+from fusefable.models import Completion, FinalAnswer
+def cache_dir() -> Path:
+    """Return the directory that holds cached answers (``~/.fusefable/cache``)."""
+    home = Path.home()
+    return home.joinpath(".fusefable", "cache")
+def make_key(question: str, models: Sequence[str], *, compress: bool,
+             mode: str, judge_model: str) -> str:
+    """Build the cache key: SHA-256 hex digest of the inputs that affect the answer.
+
+    The model names are sorted first, so their order does not change the key.
+    """
+    fields = dict(q=question, models=sorted(models), compress=compress,
+                  mode=mode, judge=judge_model)
+    serialized = json.dumps(fields, sort_keys=True, ensure_ascii=False)
+    digest = hashlib.sha256(serialized.encode("utf-8"))
+    return digest.hexdigest()
+def _path(key: str) -> Path:
+    """Map a cache key to its ``<key>.json`` file inside the cache directory."""
+    filename = f"{key}.json"
+    return cache_dir() / filename
+def load_cached(key: str, ttl_seconds: int, *, now: float) -> Optional[FinalAnswer]:
+    """Return the cached answer for ``key`` (marked ``cached=True``), or None on a miss.
+
+    A miss is any of: no readable JSON file for ``key``; an entry whose ``ts``
+    is more than ``ttl_seconds`` older than ``now``; or a payload that does not
+    have the expected structure. A ``ttl_seconds`` of 0 (or less) disables expiry.
+    """
+    try:
+        data = json.loads(_path(key).read_text(encoding="utf-8"))
+    except (ValueError, OSError):
+        # Missing file (FileNotFoundError is an OSError), unreadable file, or
+        # invalid JSON/encoding: all are ordinary cache misses.
+        return None
+    try:
+        if ttl_seconds > 0 and now - data.get("ts", 0) > ttl_seconds:
+            return None
+        d = data["answer"]
+        return FinalAnswer(
+            text=d["text"],
+            chosen_model=d["chosen_model"],
+            reason=d.get("reason", ""),
+            cost_usd=d.get("cost_usd", 0.0),
+            all_completions=[Completion(model=c["model"], text=c["text"])
+                             for c in d.get("candidates", [])],
+            cached=True,
+        )
+    except (AttributeError, KeyError, TypeError):
+        # A truncated or hand-edited entry must not crash the run; treating it
+        # as a miss lets the caller fetch a fresh answer instead.
+        return None
+def save_cached(key: str, answer: FinalAnswer, *, now: float) -> None:
+    """Write ``answer`` to the cache file for ``key``, stamped with ``now``.
+
+    The cache directory is created if it does not exist yet.
+    """
+    candidates = [{"model": comp.model, "text": comp.text}
+                  for comp in answer.all_completions]
+    record = {
+        "ts": now,
+        "answer": {
+            "text": answer.text,
+            "chosen_model": answer.chosen_model,
+            "reason": answer.reason,
+            "cost_usd": answer.cost_usd,
+            "candidates": candidates,
+        },
+    }
+    cache_dir().mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(record, ensure_ascii=False)
+    _path(key).write_text(serialized, encoding="utf-8")

{fusefable-0.1.9 → fusefable-0.3.0}/fusefable/cli.py RENAMED Viewed

@@ -48,6 +48,12 @@ def ask(
     models: Optional[str] = typer.Option(None, "--models",
         help="จำกัดเฉพาะโมเดลที่ระบุ คั่นด้วย comma"),
     cheap: bool = typer.Option(False, "--cheap", help="ใช้ cheap_models ใน config"),
+    compress: Optional[bool] = typer.Option(None, "--compress/--no-compress",
+        help="บีบ prompt ก่อนส่งเพื่อลด token (default ตาม config)"),
+    ensemble: Optional[bool] = typer.Option(None, "--ensemble/--judge",
+        help="รวมคำตอบหลายตัวเป็นหนึ่ง (ensemble) แทนเลือกตัวเดียว (judge)"),
+    use_cache: Optional[bool] = typer.Option(None, "--cache/--no-cache",
+        help="ใช้ cache คำตอบ (default ตาม config)"),
     json_out: bool = typer.Option(False, "--json", help="output เป็น JSON"),
     quiet: bool = typer.Option(False, "--quiet", "-q",
         help="พิมพ์เฉพาะคำตอบ (เหมาะกับ pipe/subagent)"),
@@ -58,33 +64,55 @@ def ask(
     model_list = [m.strip() for m in models.split(",")] if models else None
     try:
-        result = asyncio.run(fuse(cfg, q, models=model_list, cheap=cheap))
+        result = asyncio.run(fuse(cfg, q, models=model_list, cheap=cheap,
+                                  compress=compress, ensemble=ensemble,
+                                  use_cache=use_cache))
     except RuntimeError as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1)
+    comp = result.compression
     if json_out:
-        typer.echo(json.dumps({
+        out = {
             "answer": result.text,
             "chosen_model": result.chosen_model,
             "reason": result.reason,
             "cost_usd": result.cost_usd,
             "candidates": [{"model": c.model, "text": c.text}
                            for c in result.all_completions],
-        }, ensure_ascii=False, indent=2))
+        }
+        if comp is not None:
+            out["compression"] = {
+                "original_chars": comp.original_chars,
+                "final_chars": comp.final_chars,
+                "saved_pct": round(comp.saved_pct, 1),
+                "method": comp.method,
+            }
+        out["cached"] = result.cached
+        if result.budget_warning:
+            out["budget_warning"] = result.budget_warning
+        typer.echo(json.dumps(out, ensure_ascii=False, indent=2))
         return
     if quiet:
         typer.echo(result.text)
         return
+    if result.budget_warning:
+        typer.echo(f"⚠️  {result.budget_warning}", err=True)
     if show_all:
         for c in result.all_completions:
             typer.echo(f"\n--- {c.model} ---\n{c.text}")
         typer.echo(f"\n=== Judge reason ===\n{result.reason}")
-    typer.echo(f"\n=== Best answer (from {result.chosen_model}) ===")
+    label = "ensemble" if result.chosen_model == "ensemble" else f"from {result.chosen_model}"
+    typer.echo(f"\n=== Best answer ({label}) ===")
     typer.echo(result.text)
-    typer.echo(f"\n[estimated cost: ${result.cost_usd:.4f}]")
+    if comp is not None:
+        typer.echo(f"\n[compressed: {comp.original_chars}→{comp.final_chars} chars, "
+                   f"~{comp.saved_pct:.0f}% saved via {comp.method}]")
+    cost_note = "cached, $0" if result.cached else f"${result.cost_usd:.4f}"
+    typer.echo(f"[estimated cost: {cost_note}]")
 @app.command()

fusefable-0.3.0/fusefable/compressor.py ADDED Viewed

@@ -0,0 +1,74 @@
+"""Prompt compressor — ลด token แต่คงความหมาย (2 ชั้น).
+ชั้น 1 (lossless): normalize whitespace/บรรทัดว่าง/zero-width — ปลอดภัย ไม่เสียความหมาย
+ชั้น 2 (LLM): ให้โมเดลถูกบีบเชิงความหมาย เฉพาะ prompt ยาวเกิน threshold
+มี guard: ถ้าผลบีบ ว่าง/ยาวกว่าเดิม/สั้นเกินไป → fallback ใช้ lossless
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from fusefable.client import call_model
+from fusefable.providers.base import Provider
+_BLANKS = re.compile(r"\n{3,}")
+_ZEROWIDTH = re.compile(r"[‌‍]")
+COMPRESS_SYSTEM = (
+    "You compress prompts to save tokens while preserving meaning EXACTLY. "
+    "Keep ALL technical details, code, numbers, names, constraints, and requirements. "
+    "Remove only filler words, redundancy, and repetition. "
+    "Output ONLY the compressed prompt itself — no preamble, no explanation, no quotes."
+)
+@dataclass
+class CompressionResult:
+    """Outcome of compressing a prompt: the text to send plus size statistics."""
+    text: str
+    original_chars: int
+    final_chars: int
+    method: str  # "lossless" | "llm"
+    @property
+    def saved_pct(self) -> float:
+        """Percentage of characters removed; 0.0 when the original was empty."""
+        if self.original_chars == 0:
+            return 0.0
+        kept_ratio = self.final_chars / self.original_chars
+        return (1 - kept_ratio) * 100
+def normalize_lossless(text: str) -> str:
+    """Tier 1: strip zero-width characters, trailing spaces and repeated blank lines.
+
+    Indentation and spacing inside each line are kept, so code stays intact.
+    """
+    without_zero_width = _ZEROWIDTH.sub("", text)
+    trimmed = "\n".join(line.rstrip() for line in without_zero_width.split("\n"))
+    collapsed = _BLANKS.sub("\n\n", trimmed)
+    return collapsed.strip()
+async def compress_prompt(provider: Provider, model: str, text: str, *,
+                          min_chars: int, timeout_s: float,
+                          min_ratio: float = 0.3) -> CompressionResult:
+    """Compress a prompt in two tiers and return the text that should be sent.
+
+    The lossless tier always runs. The LLM tier runs only when the lossless
+    text has at least ``min_chars`` characters, and its output is used only
+    when it is non-empty, shorter than the lossless text, and not below
+    ``min_ratio`` of the lossless length. Otherwise the lossless text is returned.
+    """
+    original = len(text)
+    lossless = normalize_lossless(text)
+    lossless_len = len(lossless)
+    fallback = CompressionResult(lossless, original, lossless_len, "lossless")
+    # Short prompt: skip the LLM tier.
+    if lossless_len < min_chars:
+        return fallback
+    # Tier 2: ask the model for a semantic compression.
+    instruction = f"{COMPRESS_SYSTEM}\n\n---\n{lossless}"
+    reply = await call_model(provider, model, instruction, timeout_s)
+    if reply.is_error:
+        return fallback
+    compressed = reply.text.strip()
+    # Quality guards: empty, not shorter, or suspiciously short output is rejected.
+    if not compressed:
+        return fallback
+    if len(compressed) >= lossless_len:
+        return fallback
+    if len(compressed) < lossless_len * min_ratio:
+        return fallback
+    return CompressionResult(compressed, original, len(compressed), "llm")

{fusefable-0.1.9 → fusefable-0.3.0}/fusefable/config.py RENAMED Viewed

@@ -27,6 +27,13 @@ class Config:
     min_responses: int = 1
     budget_cap_usd: float | None = None
     cheap_models: list[str] = field(default_factory=list)
+    compress: bool = False              # บีบ prompt ก่อนส่ง (opt-in)
+    compress_min_chars: int = 2000      # ต่ำกว่านี้ไม่เรียก LLM บีบ
+    compress_model: str = ""            # ว่าง = ใช้ judge_model
+    fusion_mode: str = "judge"          # "judge" (เลือกตัวดีสุด) | "ensemble" (รวมคำตอบ)
+    cache: bool = False                 # cache คำตอบ (opt-in)
+    cache_ttl_seconds: int = 0          # 0 = ไม่หมดอายุ
+    budget_action: str = "warn"         # "warn" | "stop" เมื่อประเมินเกิน budget_cap_usd
     def resolve_api_key(self) -> str:
         return os.environ.get(self.api_key_env, "")

fusefable-0.3.0/fusefable/core.py ADDED Viewed

@@ -0,0 +1,88 @@
+from __future__ import annotations
+import time
+from typing import Optional, Sequence
+import httpx
+from fusefable.config import Config
+from fusefable.routing import build_routes, build_judge_provider
+from fusefable.fusion import run_fusion
+from fusefable.compressor import compress_prompt
+from fusefable.cost import estimate_prefire_cost
+from fusefable import cache as cache_mod
+from fusefable.models import FinalAnswer
+def select_models(cfg: Config, models: Optional[Sequence[str]] = None,
+                  cheap: bool = False) -> Optional[set[str]]:
+    """Decide which model set to use; None means "every model from the config".
+
+    An explicit ``models`` list wins. Otherwise ``cheap`` selects
+    ``cfg.cheap_models`` when that list is non-empty.
+    """
+    if models:
+        chosen = set(models)
+    elif cheap and cfg.cheap_models:
+        chosen = set(cfg.cheap_models)
+    else:
+        chosen = None
+    return chosen
+async def fuse(cfg: Config, question: str,
+               models: Optional[Sequence[str]] = None,
+               cheap: bool = False,
+               compress: Optional[bool] = None,
+               ensemble: Optional[bool] = None,
+               use_cache: Optional[bool] = None) -> FinalAnswer:
+    """Central entry point, shared by the CLI and the MCP server.
+
+    models: restrict the run to the listed models
+    cheap: use cfg.cheap_models
+    compress: compress the prompt (None = cfg.compress)
+    ensemble: merge the answers instead of picking one (None = cfg.fusion_mode)
+    use_cache: use the answer cache (None = cfg.cache)
+
+    Raises RuntimeError when no route is left after filtering, or when the
+    estimated cost exceeds cfg.budget_cap_usd and cfg.budget_action is "stop".
+    """
+    only = select_models(cfg, models, cheap)
+    # Explicit arguments override the config; None falls back to the config value.
+    do_compress = cfg.compress if compress is None else compress
+    mode = cfg.fusion_mode if ensemble is None else ("ensemble" if ensemble else "judge")
+    do_cache = cfg.cache if use_cache is None else use_cache
+    effective_models = sorted(only) if only is not None else sorted(cfg.models)
+    # The key is always computed but only used when caching is enabled.
+    key = cache_mod.make_key(question, effective_models, compress=do_compress,
+                             mode=mode, judge_model=cfg.judge_model)
+    if do_cache:
+        hit = cache_mod.load_cached(key, cfg.cache_ttl_seconds, now=time.time())
+        if hit is not None:
+            # Cache hit: return before any HTTP client or provider is created.
+            return hit
+    async with httpx.AsyncClient(timeout=None) as http:
+        routes = build_routes(cfg, http)
+        if only is not None:
+            routes = [(p, m) for (p, m) in routes if m in only]
+        if not routes:
+            raise RuntimeError("ไม่มีโมเดลให้ใช้ (ตรวจ --models / config)")
+        judge_prov = build_judge_provider(cfg, http)
+        # Compress the prompt once and send the compressed text to every model
+        # (the judge still receives the original question).
+        model_prompt = question
+        comp = None
+        if do_compress:
+            comp = await compress_prompt(
+                judge_prov, cfg.compress_model or cfg.judge_model, question,
+                min_chars=cfg.compress_min_chars, timeout_s=cfg.timeout_seconds)
+            model_prompt = comp.text
+        # Budget cap, estimated before firing: "stop" aborts, anything else
+        # only records a warning and continues.
+        budget_warning = ""
+        if cfg.budget_cap_usd is not None:
+            est = estimate_prefire_cost(model_prompt, len(routes))
+            if est > cfg.budget_cap_usd:
+                if cfg.budget_action == "stop":
+                    raise RuntimeError(
+                        f"ประเมินค่าใช้จ่าย ~${est:.4f} เกิน budget "
+                        f"${cfg.budget_cap_usd} (budget_action=stop) — ยกเลิกก่อนยิง")
+                budget_warning = (f"ประเมิน ~${est:.4f} เกิน budget "
+                                  f"${cfg.budget_cap_usd} (budget_action=warn)")
+        result = await run_fusion(routes, judge_prov, cfg.judge_model,
+                                  model_prompt, cfg.timeout_seconds,
+                                  judge_question=question, mode=mode)
+        result.compression = comp
+        result.budget_warning = budget_warning
+    if do_cache:
+        cache_mod.save_cached(key, result, now=time.time())
+    return result

fusefable-0.3.0/fusefable/cost.py ADDED Viewed

@@ -0,0 +1,25 @@
+from __future__ import annotations
+from typing import Sequence
+from fusefable.models import Completion
+def estimate_cost(comps: Sequence[Completion],
+                  default_in: float = 1.0, default_out: float = 3.0) -> float:
+    """Estimate the total cost (USD) from token usage; rates are $ per 1M tokens."""
+    prompt_total = sum([comp.prompt_tokens for comp in comps])
+    completion_total = sum([comp.completion_tokens for comp in comps])
+    input_cost = prompt_total / 1_000_000 * default_in
+    output_cost = completion_total / 1_000_000 * default_out
+    return input_cost + output_cost
+def estimate_prefire_cost(prompt: str, n_models: int,
+                          default_in: float = 1.0, default_out: float = 3.0,
+                          assumed_out_tokens: int = 600) -> float:
+    """Roughly estimate the cost of a run before firing (used for the budget cap).
+
+    Coarse by design: input tokens are taken as len(prompt)/4 per model and the
+    output as ``assumed_out_tokens``. One extra call is counted for the
+    judge/synthesize step. This is a guard, not an exact figure.
+    """
+    in_tokens = len(prompt) / 4
+    input_cost = in_tokens / 1_000_000 * default_in
+    output_cost = assumed_out_tokens / 1_000_000 * default_out
+    calls = n_models + 1
+    return (input_cost + output_cost) * calls

fusefable-0.3.0/fusefable/ensemble.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""Ensemble mode — รวมจุดเด่นหลายคำตอบเป็นคำตอบเดียว (แทนการเลือกตัวเดียว).
+ปกปิดชื่อโมเดลเหมือน judge เพื่อให้ synthesize ที่เนื้อหาล้วน.
+"""
+from __future__ import annotations
+from typing import Sequence
+from fusefable.client import call_model
+from fusefable.models import Completion
+from fusefable.providers.base import Provider
+_LABELS = "ABCDEFGHIJ"
+def _ensemble_label(index: int) -> str:
+    """Return an anonymous label for position ``index``: A..Z, then AA, AB, ..."""
+    label = ""
+    remaining = index + 1
+    while remaining > 0:
+        remaining, offset = divmod(remaining - 1, 26)
+        label = chr(ord("A") + offset) + label
+    return label
+def build_ensemble_prompt(question: str,
+                          comps: Sequence[Completion]) -> str:
+    """Build the synthesis prompt from the question and the candidate answers.
+
+    Each candidate appears under a letter label instead of its model name.
+    Labels are generated on demand, so any number of candidates is accepted
+    (a fixed ten-letter table would raise IndexError on the eleventh).
+    """
+    blocks = [f"### Answer {_ensemble_label(i)}\n{c.text}"
+              for i, c in enumerate(comps)]
+    body = "\n\n".join(blocks)
+    return (
+        "You are merging multiple coding answers into ONE superior answer.\n"
+        "Combine correct and complementary parts, fix mistakes, drop redundancy.\n"
+        "Output ONLY the final merged answer — no commentary about the sources.\n\n"
+        f"## Question\n{question}\n\n"
+        f"## Candidate Answers\n{body}"
+    )
+async def synthesize(provider: Provider, model: str, question: str,
+                     comps: Sequence[Completion], timeout_s: float) -> str:
+    """Return the merged answer text; fall back to the first candidate on failure.
+
+    Failure means the model call reported an error or returned only whitespace.
+    """
+    merged = await call_model(provider, model,
+                              build_ensemble_prompt(question, comps), timeout_s)
+    usable = not merged.is_error and merged.text.strip()
+    return merged.text if usable else comps[0].text

fusefable-0.3.0/fusefable/fusion.py ADDED Viewed

@@ -0,0 +1,40 @@
+from __future__ import annotations
+from typing import Sequence, Tuple
+from fusefable.fanout import fan_out
+from fusefable.judge import judge
+from fusefable.ensemble import synthesize
+from fusefable.cost import estimate_cost
+from fusefable.models import FinalAnswer
+from fusefable.providers.base import Provider
+Route = Tuple[Provider, str]
+async def run_fusion(routes: Sequence[Route], judge_provider: Provider,
+                     judge_model: str, prompt: str, timeout_s: float,
+                     judge_question: str | None = None,
+                     mode: str = "judge") -> FinalAnswer:
+    """Fan out, then judge or merge, and return a FinalAnswer.
+
+    prompt: text sent to the models (may already be compressed)
+    judge_question: question used for judging/synthesis; defaults to ``prompt``
+        (pass the original question to preserve quality)
+    mode: "judge" (pick the best answer) | "ensemble" (merge the answers)
+
+    Raises RuntimeError when no model produced a completion.
+    """
+    completions = await fan_out(routes, prompt, timeout_s)
+    if not completions:
+        raise RuntimeError("no successful completions from any model")
+    decision_question = prompt if judge_question is None else judge_question
+    cost = estimate_cost(completions)
+    if mode != "ensemble":
+        winner, why = await judge(judge_provider, judge_model, decision_question,
+                                  completions, timeout_s)
+        return FinalAnswer(text=winner.text, chosen_model=winner.model,
+                           reason=why, cost_usd=cost,
+                           all_completions=list(completions))
+    merged_text = await synthesize(judge_provider, judge_model, decision_question,
+                                   completions, timeout_s)
+    return FinalAnswer(text=merged_text, chosen_model="ensemble",
+                       reason=f"synthesized from {len(completions)} answers",
+                       cost_usd=cost, all_completions=list(completions))

{fusefable-0.1.9 → fusefable-0.3.0}/fusefable/models.py RENAMED Viewed

@@ -37,3 +37,6 @@ class FinalAnswer:
     reason: str = ""
     cost_usd: float = 0.0
     all_completions: list = field(default_factory=list)
+    compression: object = None   # CompressionResult | None (กัน import วน)
+    cached: bool = False          # มาจาก cache หรือไม่
+    budget_warning: str = ""      # ข้อความเตือนงบ (ถ้ามี)

{fusefable-0.1.9 → fusefable-0.3.0}/fusefable/wizard.py RENAMED Viewed

@@ -21,6 +21,10 @@ KNOWN_GATEWAYS = {
 def build_config_from_answers(answers: dict) -> Config:
     """แปลงคำตอบจาก wizard เป็น Config (logic ล้วน — แยกจาก I/O เพื่อ test ได้)."""
+    extra = dict(
+        compress=answers.get("compress", False),
+        compress_min_chars=answers.get("compress_min_chars", 2000),
+    )
     if answers["mode"] == "gateway":
         return Config(
             mode="gateway",
@@ -30,6 +34,7 @@ def build_config_from_answers(answers: dict) -> Config:
             models=answers["models"],
             judge_model=answers["judge_model"],
             timeout_seconds=answers["timeout_seconds"],
+            **extra,
         )
     providers = [SingleProvider(**p) for p in answers["providers"]]
     all_models = [m for p in providers for m in p.models]
@@ -39,9 +44,19 @@ def build_config_from_answers(answers: dict) -> Config:
         models=all_models,
         judge_model=answers["judge_model"],
         timeout_seconds=answers["timeout_seconds"],
+        **extra,
     )
+def _ask_compression(prompt) -> dict:
+    """Ask for the compression settings and return them as a dict for ``answers``.
+
+    ``prompt`` is the input callable (injected for tests). Any first answer
+    other than "y"/"yes" disables compression. A blank or non-numeric
+    threshold falls back to 2000 instead of raising ValueError and aborting
+    the wizard.
+    """
+    ans = prompt("เปิดการบีบ prompt เพื่อลด token? [y/N]: ").strip().lower()
+    if ans not in ("y", "yes"):
+        return {"compress": False}
+    raw = prompt("  บีบเมื่อ prompt ยาวเกินกี่ตัวอักษร? [2000]: ").strip()
+    try:
+        min_chars = int(raw) if raw else 2000
+    except ValueError:
+        # Unparseable input: keep the documented default rather than crash.
+        min_chars = 2000
+    return {"compress": True, "compress_min_chars": min_chars}
 def run_wizard(prompt=input) -> Config:
     """ถาม interactive แล้วคืน Config. `prompt` ฉีดเข้าได้เพื่อ test."""
     print("=== Fuse Fable setup ===")
@@ -65,10 +80,11 @@ def run_wizard(prompt=input) -> Config:
             if m:
                 models.append(m)
         judge = prompt("judge model: ").strip()
+        comp = _ask_compression(prompt)
         return build_config_from_answers({
             "mode": "gateway", "gateway_name": gw, "gateway_base_url": base,
             "api_key_env": key_env, "models": models, "judge_model": judge,
-            "timeout_seconds": 90,
+            "timeout_seconds": 90, **comp,
         })
     n = int(prompt("จะใช้กี่เจ้า?: ").strip())
@@ -89,7 +105,8 @@ def run_wizard(prompt=input) -> Config:
         providers.append({"name": name, "base_url": base, "kind": kind,
                           "api_key_env": key_env, "models": models})
     judge = prompt("judge model: ").strip()
+    comp = _ask_compression(prompt)
     return build_config_from_answers({
         "mode": "single", "providers": providers,
-        "judge_model": judge, "timeout_seconds": 90,
+        "judge_model": judge, "timeout_seconds": 90, **comp,
     })

{fusefable-0.1.9 → fusefable-0.3.0}/fusefable.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fusefable
-Version: 0.1.9
+Version: 0.3.0
 Summary: Fuse multiple AI models and judge the best answer for coding
 Author: proultrax9
 License: MIT
@@ -145,6 +145,46 @@ Exposes a tool `fuse_ask(question, models?, cheap?)` for any MCP client.
 > Requires `pip install "fusefable[mcp]"` and a completed `fusefable config`.
 > If `fusefable` isn't on the app's PATH, use a full path such as `python -m fusefable.cli`.
+## Ensemble, cache & budget
+```bash
+fusefable ask --ensemble "..."     # merge all answers into one (vs picking one)
+fusefable ask --cache "..."        # reuse the answer for an identical question
+fusefable ask --no-cache "..."     # force a fresh run
+```
+- **Ensemble mode** (`--ensemble`, config `fusion_mode: ensemble`): instead of the judge
+  picking one answer, a model synthesizes a single answer combining the strengths of all
+  candidates (anonymized). Falls back to the first answer if synthesis fails.
+- **Cache** (`--cache`, config `cache: true`, `cache_ttl_seconds`): identical question +
+  same models/mode/compression returns the stored answer instantly with no API calls
+  (`cached, $0`). Stored in `~/.fusefable/cache/`. `cache_ttl_seconds: 0` = never expires.
+- **Budget cap** (config `budget_cap_usd`, `budget_action: warn|stop`): before firing,
+  the run estimates cost. If it exceeds the cap — `warn` prints a warning and continues,
+  `stop` aborts before spending anything.
+## Prompt compression (save tokens)
+Reduce token usage while keeping answer quality — useful when you pay per-provider
+directly. Two tiers, opt-in via `--compress`:
+```bash
+fusefable ask --compress "<long prompt or pasted code>"
+# [compressed: 5200→1800 chars, ~65% saved via llm]
+```
+- **Tier 1 (lossless):** trims trailing whitespace, collapses blank lines, strips
+  zero-width chars — keeps indentation and inner spacing intact (safe for code).
+- **Tier 2 (LLM):** for prompts above `compress_min_chars` (default 2000), a cheap
+  model compresses semantically — **once**, then the compressed prompt is sent to all
+  models, so you save `tokens × number-of-models`.
+- **Quality guards:** prompts under the threshold skip the LLM; if the compressed
+  result is empty, longer, or under 30% of the original, it falls back to the lossless
+  text. The judge always sees the **original** question.
+Config (`~/.fusefable/config.yaml`): `compress`, `compress_min_chars`, `compress_model`
+(empty = reuse the judge model).
 ## Architecture
 ```

{fusefable-0.1.9 → fusefable-0.3.0}/fusefable.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,11 +2,14 @@ LICENSE
 README.md
 pyproject.toml
 fusefable/__init__.py
+fusefable/cache.py
 fusefable/cli.py
 fusefable/client.py
+fusefable/compressor.py
 fusefable/config.py
 fusefable/core.py
 fusefable/cost.py
+fusefable/ensemble.py
 fusefable/fanout.py
 fusefable/fusion.py
 fusefable/judge.py
@@ -26,11 +29,14 @@ fusefable/providers/base.py
 fusefable/providers/factory.py
 fusefable/providers/google.py
 fusefable/providers/openai_compat.py
+tests/test_cache.py
 tests/test_cli.py
 tests/test_client.py
+tests/test_compressor.py
 tests/test_config.py
 tests/test_core.py
 tests/test_cost.py
+tests/test_ensemble.py
 tests/test_fanout.py
 tests/test_fusion.py
 tests/test_judge.py

{fusefable-0.1.9 → fusefable-0.3.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "fusefable"
-version = "0.1.9"
+version = "0.3.0"
 description = "Fuse multiple AI models and judge the best answer for coding"
 readme = "README.md"
 license = { text = "MIT" }

fusefable-0.3.0/tests/test_cache.py ADDED Viewed

@@ -0,0 +1,48 @@
+import fusefable.cache as cache_mod
+from fusefable.cache import make_key, load_cached, save_cached
+from fusefable.models import FinalAnswer, Completion
+def _use_tmp(monkeypatch, tmp_path):
+    """Point the cache module at a per-test temporary directory."""
+    target = tmp_path / "cache"
+    monkeypatch.setattr(cache_mod, "cache_dir", lambda: target)
+def test_make_key_stable_and_order_independent():
+    """The key ignores model order but changes with the model set."""
+    common = dict(compress=False, mode="judge", judge_model="j")
+    forward = make_key("q", ["a", "b"], **common)
+    backward = make_key("q", ["b", "a"], **common)
+    assert forward == backward               # model order has no effect
+    subset = make_key("q", ["a"], **common)
+    assert forward != subset                 # different model set = different key
+def test_make_key_differs_by_mode_and_compress():
+    """Changing either the compress flag or the mode changes the key."""
+    def key_for(**flags):
+        return make_key("q", models=["a"], judge_model="j", **flags)
+    reference = key_for(compress=False, mode="judge")
+    assert reference != key_for(compress=True, mode="judge")
+    assert reference != key_for(compress=False, mode="ensemble")
+def test_save_and_load_roundtrip(monkeypatch, tmp_path):
+    """A saved answer loads back with its fields and is flagged as cached."""
+    _use_tmp(monkeypatch, tmp_path)
+    candidate = Completion(model="gpt", text="best")
+    original = FinalAnswer(text="best", chosen_model="gpt", reason="r",
+                           cost_usd=0.02, all_completions=[candidate])
+    save_cached("k1", original, now=1000.0)
+    loaded = load_cached("k1", ttl_seconds=0, now=2000.0)
+    assert loaded is not None
+    assert loaded.text == "best"
+    assert loaded.cached is True             # marked as coming from the cache
+    assert loaded.all_completions[0].model == "gpt"
+def test_load_miss_returns_none(monkeypatch, tmp_path):
+    """Loading a key that was never saved yields None."""
+    _use_tmp(monkeypatch, tmp_path)
+    missing = load_cached("nope", ttl_seconds=0, now=1.0)
+    assert missing is None
+def test_ttl_expiry(monkeypatch, tmp_path):
+    """An entry is served inside its TTL and dropped once the TTL has passed."""
+    _use_tmp(monkeypatch, tmp_path)
+    save_cached("k", FinalAnswer(text="x", chosen_model="m"), now=1000.0)
+    fresh = load_cached("k", ttl_seconds=60, now=1030.0)
+    assert fresh is not None                 # within the TTL
+    stale = load_cached("k", ttl_seconds=60, now=1100.0)
+    assert stale is None                     # past the TTL

fusefable-0.3.0/tests/test_compressor.py ADDED Viewed

@@ -0,0 +1,69 @@
+import pytest
+from fusefable.compressor import normalize_lossless, compress_prompt
+from fusefable.models import Completion
+def test_normalize_lossless_trims_safely_keeps_indent():  # normalize_lossless collapses blank-line runs and trailing whitespace only.
+    raw = "def  f():\n\n\n\n    return   1   \n"
+    out = normalize_lossless(raw)
+    assert "\n\n\n" not in out                 # repeated blank lines are collapsed
+    assert out == "def  f():\n\n    return   1"  # keeps indent + inner spaces, trims only trailing
+def test_normalize_strips_zero_width():  # An invisible zero-width character in the input is removed.
+    assert normalize_lossless("ab‌") == "ab"  # the input literal ends with an invisible zero-width character after "ab"
+class FakeProvider:  # Stub provider: returns a canned text or raises, and records whether complete() was called.
+    def __init__(self, text=None, error=False):  # text: canned reply; error: make complete() raise instead
+        self.text, self.error = text, error
+        self.called = False  # flipped to True by complete()
+    async def complete(self, model, prompt):  # `prompt` is ignored; only `model` is echoed back
+        self.called = True
+        if self.error:
+            raise RuntimeError("boom")  # simulates a failing LLM call
+        return Completion(model=model, text=self.text)
+@pytest.mark.asyncio
+async def test_short_prompt_skips_llm():  # A prompt shorter than min_chars is handled losslessly without calling the provider.
+    prov = FakeProvider(text="should not be used")
+    r = await compress_prompt(prov, "m", "short text", min_chars=2000, timeout_s=5)
+    assert r.method == "lossless"
+    assert prov.called is False          # LLM is not called
+@pytest.mark.asyncio
+async def test_long_prompt_uses_llm_when_shorter():  # A long prompt takes the LLM result when it is acceptably shorter.
+    big = "word " * 1000                 # ~5000 chars (lossless ~4999)
+    prov = FakeProvider(text="C" * 2000) # within the 30%-100% range → passes the guard
+    r = await compress_prompt(prov, "m", big, min_chars=2000, timeout_s=5)
+    assert r.method == "llm"
+    assert r.final_chars == 2000  # length of the provider's canned reply
+    assert r.final_chars < r.original_chars
+    assert r.saved_pct > 0
+@pytest.mark.asyncio
+async def test_llm_failure_falls_back_to_lossless():  # A provider error must not propagate; the lossless result is used.
+    big = "word " * 1000
+    prov = FakeProvider(error=True)  # complete() raises RuntimeError("boom")
+    r = await compress_prompt(prov, "m", big, min_chars=2000, timeout_s=5)
+    assert r.method == "lossless"        # LLM failed → use lossless
+@pytest.mark.asyncio
+async def test_guard_rejects_too_short_compression():  # An implausibly short LLM result is rejected in favour of lossless.
+    big = "word " * 1000                 # ~5000 chars
+    prov = FakeProvider(text="x")        # too short (< 30%)
+    r = await compress_prompt(prov, "m", big, min_chars=2000, timeout_s=5)
+    assert r.method == "lossless"        # guards against the model dropping content
+@pytest.mark.asyncio
+async def test_guard_rejects_longer_result():  # An LLM result longer than the input is rejected in favour of lossless.
+    big = "word " * 1000
+    prov = FakeProvider(text="y" * 99999)  # longer than the original
+    r = await compress_prompt(prov, "m", big, min_chars=2000, timeout_s=5)
+    assert r.method == "lossless"

{fusefable-0.1.9 → fusefable-0.3.0}/tests/test_cost.py RENAMED Viewed

@@ -1,7 +1,14 @@
-from fusefable.cost import estimate_cost
+from fusefable.cost import estimate_cost, estimate_prefire_cost
 from fusefable.models import Completion
+def test_estimate_prefire_scales_with_models():  # The pre-fire estimate is positive and grows with the number of models.
+    one = estimate_prefire_cost("x" * 4000, n_models=1)
+    five = estimate_prefire_cost("x" * 4000, n_models=5)  # same prompt, more models
+    assert five > one              # the more models, the more expensive
+    assert one > 0
 def test_estimate_cost_sums_tokens():
     comps = [
         Completion(model="a", text="x", prompt_tokens=1000, completion_tokens=500),

fusefable-0.3.0/tests/test_ensemble.py ADDED Viewed

@@ -0,0 +1,36 @@
+import pytest
+from fusefable.ensemble import build_ensemble_prompt, synthesize
+from fusefable.models import Completion
+def test_build_ensemble_prompt_anonymizes():  # The ensemble prompt hides model names behind "Answer A"/"Answer B" labels.
+    comps = [Completion(model="claude", text="a1"),
+             Completion(model="gpt", text="a2")]
+    p = build_ensemble_prompt("q?", comps)
+    assert "claude" not in p and "gpt" not in p  # real model names must not leak into the prompt
+    assert "Answer A" in p and "Answer B" in p  # candidates appear under letter labels instead
+    assert "merg" in p.lower()  # prompt wording contains "merg" (e.g. "merge"/"merging")
+@pytest.mark.asyncio
+async def test_synthesize_returns_merged_text():  # synthesize returns the text produced by the provider.
+    comps = [Completion(model="a", text="x"), Completion(model="b", text="y")]
+    class P:  # stub provider that always answers "MERGED"
+        async def complete(self, model, prompt):
+            return Completion(model=model, text="MERGED")
+    out = await synthesize(P(), "judge", "q?", comps, timeout_s=5)
+    assert out == "MERGED"
+@pytest.mark.asyncio
+async def test_synthesize_fallback_on_error():  # When the provider raises, synthesize returns the first candidate's text.
+    comps = [Completion(model="a", text="first"), Completion(model="b", text="y")]
+    class P:  # stub provider whose complete() always fails
+        async def complete(self, model, prompt):
+            raise RuntimeError("boom")
+    out = await synthesize(P(), "judge", "q?", comps, timeout_s=5)
+    assert out == "first"          # falls back to the first answer

{fusefable-0.1.9 → fusefable-0.3.0}/tests/test_fusion.py RENAMED Viewed

@@ -26,6 +26,26 @@ async def test_run_fusion_end_to_end():
     assert len(result.all_completions) == 2
+@pytest.mark.asyncio
+async def test_run_fusion_uses_judge_question_for_judging():  # The judge prompt is built from `judge_question`, not from the fan-out prompt.
+    seen = {}  # captures the prompt the judge model receives
+    class FakeProvider:  # one stub serves both the candidate model and the judge
+        async def complete(self, model, prompt):
+            if model == "judge":
+                seen["judge_prompt"] = prompt
+                return Completion(model=model, text="I choose A")
+            return Completion(model=model, text="ans")
+    prov = FakeProvider()
+    routes = [(prov, "m1")]  # a single (provider, model) route
+    await run_fusion(routes, prov, "judge", "COMPRESSED", timeout_s=5,
+                     judge_question="ORIGINAL QUESTION")
+    # the judge must see the original question, not the compressed one
+    assert "ORIGINAL QUESTION" in seen["judge_prompt"]
+    assert "COMPRESSED" not in seen["judge_prompt"]
 @pytest.mark.asyncio
 async def test_run_fusion_raises_when_all_fail():
     class DeadProvider:

{fusefable-0.1.9 → fusefable-0.3.0}/tests/test_wizard.py RENAMED Viewed

@@ -22,9 +22,11 @@ def test_run_wizard_gateway_asks_how_many_then_each_model():
         "anthropic/claude-opus-4.1",  # โมเดลที่ 2
         "qwen/qwen3-coder",        # โมเดลที่ 3
         "deepseek/deepseek-chat",  # judge
+        "n",                       # ไม่เปิด compression
     ])
     cfg = run_wizard(prompt=answers)
     assert cfg.mode == "gateway"
+    assert cfg.compress is False
     assert cfg.gateway_name == "openrouter"
     assert cfg.gateway_base_url == "https://openrouter.ai/api/v1"  # เติมอัตโนมัติ
     assert len(cfg.models) == 3
@@ -36,16 +38,26 @@ def test_run_wizard_gateway_asks_how_many_then_each_model():
 def test_run_wizard_gateway_autofills_other_known_gateway():
     # groq is a known gateway → base_url is auto-filled, the URL is not asked
     answers = _scripted([
-        "1", "groq", "GROQ_API_KEY", "1", "llama-3.3-70b", "llama-3.3-70b",
+        "1", "groq", "GROQ_API_KEY", "1", "llama-3.3-70b", "llama-3.3-70b", "n",  # trailing "n" presumably declines the new compression prompt, as in the sibling tests
     ])
     cfg = run_wizard(prompt=answers)
     assert cfg.gateway_base_url == "https://api.groq.com/openai/v1"
+def test_run_wizard_enables_compression_when_yes():  # Answering "y" plus a number turns compression on with that min-chars value.
+    answers = _scripted([
+        "1", "openrouter", "OR_KEY", "1", "m1", "judge",
+        "y", "3000",               # enable compression, min 3000
+    ])
+    cfg = run_wizard(prompt=answers)
+    assert cfg.compress is True
+    assert cfg.compress_min_chars == 3000  # the scripted "3000" answer is stored as an int
 def test_run_wizard_gateway_unknown_asks_base_url():
     # gateway ที่ไม่รู้จัก → ถาม base_url เอง (รองรับทุกเจ้า)
     answers = _scripted([
-        "1", "mygw", "https://my.gateway/v1", "MY_KEY", "1", "m1", "m1",
+        "1", "mygw", "https://my.gateway/v1", "MY_KEY", "1", "m1", "m1", "n",
     ])
     cfg = run_wizard(prompt=answers)
     assert cfg.gateway_name == "mygw"
@@ -79,6 +91,7 @@ def test_run_wizard_single_mode_native_autofills_base_url():
         # เจ้าที่ 2: openai_compat
         "ds", "openai_compat", "https://api.deepseek.com/v1", "DS_KEY", "deepseek-chat",
         "deepseek-chat",           # judge
+        "n",                       # ไม่เปิด compression
     ])
     cfg = run_wizard(prompt=answers)
     assert cfg.mode == "single"

fusefable-0.1.9/fusefable/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- __version__ = "0.1.9"

fusefable-0.1.9/fusefable/core.py DELETED Viewed

@@ -1,37 +0,0 @@
-from __future__ import annotations
-from typing import Optional, Sequence
-import httpx
-from fusefable.config import Config
-from fusefable.routing import build_routes, build_judge_provider
-from fusefable.fusion import run_fusion
-from fusefable.models import FinalAnswer
-def select_models(cfg: Config, models: Optional[Sequence[str]] = None,
-                  cheap: bool = False) -> Optional[set[str]]:
-    """Decide which set of models to use. Returns None = use every model from config."""
-    if models:  # an explicit model list takes priority over `cheap`
-        return set(models)
-    if cheap and cfg.cheap_models:  # `cheap` only applies when cfg.cheap_models is non-empty
-        return set(cfg.cheap_models)
-    return None
-async def fuse(cfg: Config, question: str,
-               models: Optional[Sequence[str]] = None,
-               cheap: bool = False) -> FinalAnswer:
-    """Central entry point — shared by both the CLI and the MCP server.
-    models: restrict to the given models (e.g. from --models)
-    cheap: use cfg.cheap_models if present
-    """
-    only = select_models(cfg, models, cheap)  # None means "no filtering"
-    async with httpx.AsyncClient(timeout=None) as http:  # httpx timeouts disabled; cfg.timeout_seconds is passed to run_fusion instead
-        routes = build_routes(cfg, http)
-        if only is not None:
-            routes = [(p, m) for (p, m) in routes if m in only]  # keep only routes whose model name was selected
-        if not routes:
-            raise RuntimeError("ไม่มีโมเดลให้ใช้ (ตรวจ --models / config)")  # message: "no models to use (check --models / config)"
-        judge_prov = build_judge_provider(cfg, http)
-        return await run_fusion(routes, judge_prov, cfg.judge_model,
-                                question, cfg.timeout_seconds)

fusefable-0.1.9/fusefable/cost.py DELETED Viewed

@@ -1,11 +0,0 @@
-from __future__ import annotations
-from typing import Sequence
-from fusefable.models import Completion
-def estimate_cost(comps: Sequence[Completion],
-                  default_in: float = 1.0, default_out: float = 3.0) -> float:
-    """Estimate the total cost (USD) from usage tokens. rate = $/1M tokens."""
-    total_in = sum(c.prompt_tokens for c in comps)  # input tokens summed over all completions
-    total_out = sum(c.completion_tokens for c in comps)  # output tokens summed over all completions
-    return total_in / 1_000_000 * default_in + total_out / 1_000_000 * default_out  # the same two rates apply to every model

fusefable-0.1.9/fusefable/fusion.py DELETED Viewed

@@ -1,23 +0,0 @@
-from __future__ import annotations
-from typing import Sequence, Tuple
-from fusefable.fanout import fan_out
-from fusefable.judge import judge
-from fusefable.cost import estimate_cost
-from fusefable.models import FinalAnswer
-from fusefable.providers.base import Provider
-Route = Tuple[Provider, str]
-async def run_fusion(routes: Sequence[Route], judge_provider: Provider,
-                     judge_model: str, prompt: str, timeout_s: float) -> FinalAnswer:
-    """fan-out → judge → FinalAnswer. Raises RuntimeError if no model succeeds."""
-    completions = await fan_out(routes, prompt, timeout_s)
-    if not completions:  # an empty fan_out result is treated as total failure
-        raise RuntimeError("no successful completions from any model")
-    chosen, reason = await judge(judge_provider, judge_model, prompt,
-                                 completions, timeout_s)  # the judge receives the same prompt the models answered
-    cost = estimate_cost(completions)  # covers the fan-out completions only; the judge call is not passed in
-    return FinalAnswer(text=chosen.text, chosen_model=chosen.model,
-                       reason=reason, cost_usd=cost,
-                       all_completions=list(completions))