PyPI - llmbuffet - Versions diffs - 0.2.0__py3-none-any.whl - Mend

llmbuffet 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

llmbuffet/__init__.py +27 -0
llmbuffet/__main__.py +4 -0
llmbuffet/cli.py +239 -0
llmbuffet/client.py +249 -0
llmbuffet/config.py +81 -0
llmbuffet/errors.py +37 -0
llmbuffet/models.py +76 -0
llmbuffet/providers.toml +218 -0
llmbuffet/proxy.py +261 -0
llmbuffet/quota.py +101 -0
llmbuffet/router.py +215 -0
llmbuffet-0.2.0.dist-info/METADATA +240 -0
llmbuffet-0.2.0.dist-info/RECORD +16 -0
llmbuffet-0.2.0.dist-info/WHEEL +4 -0
llmbuffet-0.2.0.dist-info/entry_points.txt +3 -0
llmbuffet-0.2.0.dist-info/licenses/LICENSE +21 -0

llmbuffet/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""llmbuffet — pool free-tier LLM APIs behind one OpenAI-compatible endpoint.
+Public API:
+    from llmbuffet import Buffet
+    buffet = Buffet.from_default_config()
+    reply = buffet.ask("Explain CAP theorem in one sentence.")
+    print(reply.text)
+"""
+from .errors import AllProvidersExhausted, BuffetError, NoProvidersConfigured
+from .models import Model, Provider, Reply
+from .router import Buffet
+__version__ = "0.2.0"
+# Names re-exported as the package's public API (see the imports above).
+__all__ = [
+    "Buffet",
+    "Provider",
+    "Model",
+    "Reply",
+    "BuffetError",
+    "NoProvidersConfigured",
+    "AllProvidersExhausted",
+    "__version__",
+]

llmbuffet/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .cli import main
+# Package entry point: run the CLI and use its return value as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())

llmbuffet/cli.py ADDED Viewed

@@ -0,0 +1,239 @@
+"""Command-line interface for llmbuffet.
+llmbuffet ask "question"        one-shot completion (reads stdin too)
+llmbuffet providers            list configured / available providers
+llmbuffet quota                show today's per-provider usage
+llmbuffet proxy                run the OpenAI-compatible proxy server
+"""
+from __future__ import annotations
+import argparse
+import os
+import sys
+from . import __version__
+from .config import configured_providers, load_catalog
+from .errors import AllProvidersExhausted, NoProvidersConfigured
+from .quota import QuotaStore
+from .router import Buffet
+def _read_stdin() -> str:
+    if sys.stdin is None or sys.stdin.isatty():
+        return ""
+    return sys.stdin.read()
+def cmd_ask(args: argparse.Namespace) -> int:
+    stdin = _read_stdin()
+    prompt = args.prompt or ""
+    if stdin:
+        prompt = f"{stdin}\n\n{prompt}".strip() if prompt else stdin
+    if not prompt.strip():
+        print("llmbuffet: no prompt provided (pass text or pipe stdin)", file=sys.stderr)
+        return 3
+    # Support `--model provider/model` as a shorthand for picking an exact
+    # model on an exact provider (in addition to `--providers` + bare `--model`).
+    model_filter = args.model
+    provider_filter = args.providers.split(",") if args.providers else None
+    if model_filter and "/" in model_filter:
+        prov, _, mdl = model_filter.partition("/")
+        provider_filter = [prov]
+        model_filter = mdl
+    system = args.system
+    if args.json:
+        json_rule = "Respond with a single valid JSON value and nothing else — no prose, no markdown fences."
+        system = f"{system}\n{json_rule}" if system else json_rule
+    buffet = Buffet.from_default_config()
+    try:
+        reply = buffet.ask(
+            prompt,
+            system=system,
+            model=model_filter,
+            providers=provider_filter,
+            max_tokens=args.max_tokens,
+            temperature=args.temperature,
+        )
+    except NoProvidersConfigured as exc:
+        print(f"llmbuffet: {exc}", file=sys.stderr)
+        return 3
+    except AllProvidersExhausted as exc:
+        print(f"llmbuffet: {exc}", file=sys.stderr)
+        return 4
+    text = reply.text
+    if args.json:
+        text = _strip_fences(text)
+    print(text)
+    if args.verbose:
+        print(f"\n[served by {reply.provider_id}/{reply.model}]", file=sys.stderr)
+    return 0
+def _strip_fences(text: str) -> str:
+    """Remove a leading ```json / ``` fence and trailing ``` if present."""
+    t = text.strip()
+    if t.startswith("```"):
+        t = t.split("\n", 1)[1] if "\n" in t else t[3:]
+        if t.rstrip().endswith("```"):
+            t = t.rstrip()[:-3]
+    return t.strip()
+def cmd_providers(args: argparse.Namespace) -> int:
+    catalog = load_catalog()
+    configured = {p.id for p in configured_providers(catalog)}
+    n_models = sum(len(p.models) for p in catalog)
+    print(f"llmbuffet catalog: {len(catalog)} providers, {n_models} models\n")
+    for p in catalog:
+        mark = "✓" if p.id in configured else "·"
+        status = "configured" if p.id in configured else f"set {p.key_env}"
+        print(f"  {mark} {p.id:<12} {p.label:<28} {len(p.models):>2} models   [{status}]")
+    if not configured:
+        print("\nNo providers configured yet. See .env.example for the env vars to set.")
+    return 0
+def cmd_models(args: argparse.Namespace) -> int:
+    catalog = load_catalog()
+    configured = {p.id for p in configured_providers(catalog)}
+    only = set(args.providers.split(",")) if args.providers else None
+    shown = 0
+    for p in catalog:
+        if only is not None and p.id not in only:
+            continue
+        if args.configured_only and p.id not in configured:
+            continue
+        mark = "✓" if p.id in configured else "·"
+        keyless = "  (keyless)" if p.keyless and p.id in configured else ""
+        print(f"\n{mark} {p.id}  —  {p.label}{keyless}")
+        for m in p.models:
+            shown += 1
+            print(f"    {p.id}/{m.name}")
+    if shown == 0:
+        print("No models match. Try `llmbuffet providers` to see configuration status.")
+        return 0
+    print(
+        f"\nPass any id above to `--model`, e.g. "
+        f'`llmbuffet ask -m {catalog[0].id}/{catalog[0].models[0].name} "hi"`,'
+    )
+    print("or just `--model <model-name>` to use that model on any provider that has it.")
+    return 0
+def cmd_quota(args: argparse.Namespace) -> int:
+    store = QuotaStore()
+    snap = store.snapshot()
+    if not snap:
+        print("No usage recorded today (UTC).")
+        return 0
+    print("Today's usage (UTC):")
+    for key, count in sorted(snap.items(), key=lambda kv: -kv[1]):
+        print(f"  {count:>6}  {key}")
+    return 0
+def cmd_proxy(args: argparse.Namespace) -> int:
+    """Start the proxy server and block until interrupted.
+
+    Returns 3 without starting when no provider is configured; otherwise
+    serves until Ctrl-C and returns 0.
+    """
+    from .proxy import serve  # lazy: avoids http.server import on other paths
+    buffet = Buffet.from_default_config()
+    if not buffet.providers:
+        print(
+            "llmbuffet: no providers configured; set at least one API key "
+            "(see .env.example) before starting the proxy.",
+            file=sys.stderr,
+        )
+        return 3
+    # The --api-key flag wins over the environment; an empty value means "no key".
+    proxy_key = args.api_key or os.environ.get("LLMBUFFET_PROXY_KEY") or None
+    loopback = args.host in {"127.0.0.1", "localhost", "::1"}
+    # Warn (but still start) when reachable from other hosts without a key.
+    if not loopback and not proxy_key:
+        print(
+            f"llmbuffet: WARNING — binding to {args.host} (not loopback) with NO proxy key "
+            "exposes all your configured providers to the network. Set --api-key or "
+            "LLMBUFFET_PROXY_KEY, or bind to 127.0.0.1.",
+            file=sys.stderr,
+        )
+    httpd = serve(buffet, host=args.host, port=args.port, api_key=proxy_key)
+    n_models = sum(len(p.models) for p in buffet.providers)
+    auth_note = "  auth: Bearer key required\n" if proxy_key else ""
+    # The startup banner goes to stderr.
+    print(
+        f"llmbuffet proxy on http://{args.host}:{args.port}/v1  "
+        f"({len(buffet.providers)} providers, {n_models} models)\n"
+        f"{auth_note}"
+        f"  point your OpenAI client at:  OPENAI_BASE_URL=http://{args.host}:{args.port}/v1\n"
+        "  press Ctrl-C to stop",
+        file=sys.stderr,
+    )
+    try:
+        httpd.serve_forever()
+    except KeyboardInterrupt:
+        print("\nllmbuffet: shutting down", file=sys.stderr)
+    finally:
+        # Always close the server, whether interrupted or failed.
+        httpd.server_close()
+    return 0
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="llmbuffet",
+        description="Pool free-tier LLM APIs behind one OpenAI-compatible endpoint.",
+    )
+    parser.add_argument("--version", action="version", version=f"llmbuffet {__version__}")
+    sub = parser.add_subparsers(dest="command", required=True)
+    p_ask = sub.add_parser("ask", help="one-shot completion")
+    p_ask.add_argument("prompt", nargs="?", default="", help="prompt text (stdin is appended)")
+    p_ask.add_argument("-s", "--system", help="system prompt")
+    p_ask.add_argument(
+        "-m", "--model", help="model name, or provider/model (e.g. groq/llama-3.3-70b-versatile)"
+    )
+    p_ask.add_argument("-p", "--providers", help="comma-separated provider ids to allow")
+    p_ask.add_argument("--max-tokens", type=int, default=1024)
+    p_ask.add_argument("--temperature", type=float, default=0.0)
+    p_ask.add_argument(
+        "--json", action="store_true", help="ask for JSON output and strip code fences"
+    )
+    p_ask.add_argument("-v", "--verbose", action="store_true", help="report which provider served")
+    p_ask.set_defaults(func=cmd_ask)
+    p_prov = sub.add_parser("providers", help="list providers and configuration status")
+    p_prov.set_defaults(func=cmd_providers)
+    p_models = sub.add_parser("models", help="list every available provider/model id")
+    p_models.add_argument("-p", "--providers", help="comma-separated provider ids to filter")
+    p_models.add_argument(
+        "-c", "--configured-only", action="store_true", help="only show configured providers"
+    )
+    p_models.set_defaults(func=cmd_models)
+    p_quota = sub.add_parser("quota", help="show today's per-provider usage")
+    p_quota.set_defaults(func=cmd_quota)
+    p_proxy = sub.add_parser("proxy", help="run the OpenAI-compatible proxy server")
+    p_proxy.add_argument("--host", default="127.0.0.1")
+    p_proxy.add_argument("--port", type=int, default=8080)
+    p_proxy.add_argument(
+        "--api-key",
+        default=None,
+        help="require this Bearer token on requests (or set LLMBUFFET_PROXY_KEY)",
+    )
+    p_proxy.set_defaults(func=cmd_proxy)
+    return parser
+def main(argv: list[str] | None = None) -> int:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    return args.func(args)
+# Allow running this module directly as a script; exit with main()'s return code.
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())

llmbuffet/client.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""HTTP client and per-adapter request/response shaping.
+Three adapters cover every provider in the catalog:
+* ``openai``     — standard ``/chat/completions`` (Groq, Cerebras, OpenRouter,
+                   GitHub Models, Mistral, Cohere, SambaNova, ...).
+* ``cloudflare`` — Cloudflare Workers AI, which exposes an OpenAI-compatible
+                   route once ``{account_id}`` is substituted into the URL.
+* ``gemini``     — Google Generative Language API (different body shape).
+All network access goes through a single injectable ``post`` callable so the
+router and adapters can be unit-tested without touching the network.
+"""
+from __future__ import annotations
+import json
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+from .errors import ProviderHTTPError
+from .models import Provider, Reply
+# One chat message; the code in this module reads its "role" and "content" keys.
+Message = dict[str, str]
+# Matches a <think>...</think> span, non-greedy and across newlines.
+_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
+# Reasoning models burn output budget on hidden reasoning; give them headroom
+# if the caller left max_tokens at a small default.
+# Matched as case-insensitive substrings of the model name (see _is_thinking).
+# NOTE(review): "deepseek-r1" is already covered by "-r1"; kept as in the original.
+_THINKING_HINTS = (
+    "glm-4.7",
+    "-r1",
+    "reasoning",
+    "thinking",
+    "magistral",
+    "deepseek-r1",
+    "nemotron",
+)
+# Minimum max_tokens applied to models that match a hint above.
+_THINKING_FLOOR = 8192
+def _is_thinking(model: str) -> bool:
+    m = model.lower()
+    return any(h in m for h in _THINKING_HINTS)
+def _strip_think(text: str) -> str:
+    return _THINK_RE.sub("", text).strip()
+@dataclass
+class HTTPResult:
+    """Outcome of one HTTP POST, as returned by a ``PostFn``."""
+    # HTTP status code of the response.
+    status: int
+    # Decoded JSON body; default_post stores {} when the body is not valid JSON.
+    body: dict
+    # Raw response text, used as a fallback when building error messages.
+    text: str
+# Signature of the injectable POST callable: (url, headers, json_body, timeout) -> HTTPResult.
+PostFn = Callable[[str, dict, dict, float], HTTPResult]
+# Sent as the User-Agent header by default_post unless the caller overrides it.
+_USER_AGENT = "llmbuffet/0.2 (+https://github.com/0xzr/llmbuffet)"
+def default_post(url: str, headers: dict, json_body: dict, timeout: float) -> HTTPResult:
+    """Real network POST via httpx. Imported lazily so tests need no httpx."""
+    import httpx
+    headers = {"User-Agent": _USER_AGENT, **headers}
+    resp = httpx.post(url, headers=headers, json=json_body, timeout=timeout)
+    try:
+        body = resp.json()
+    except (json.JSONDecodeError, ValueError):
+        body = {}
+    return HTTPResult(status=resp.status_code, body=body, text=resp.text)
+def _retryable(status: int) -> bool:
+    # 429 (rate limit) and 5xx are worth trying another provider for.
+    # 408 request timeout too. 4xx config errors are not retryable per-call but
+    # the router still advances to a different provider regardless.
+    return status == 429 or status == 408 or 500 <= status < 600
+def _err_message(result: HTTPResult) -> str:
+    err = result.body.get("error")
+    if isinstance(err, dict):
+        return str(err.get("message") or err)
+    if isinstance(err, str):
+        return err
+    return (result.text or "").strip()[:200] or "no body"
+def _to_gemini_contents(messages: list[Message]) -> tuple[dict | None, list[dict]]:
+    """Split OpenAI-style messages into (systemInstruction, contents)."""
+    system: str | None = None
+    contents: list[dict] = []
+    for msg in messages:
+        role = msg.get("role", "user")
+        text = msg.get("content", "")
+        if role == "system":
+            system = f"{system}\n{text}" if system else text
+            continue
+        gem_role = "model" if role == "assistant" else "user"
+        contents.append({"role": gem_role, "parts": [{"text": text}]})
+    system_instruction = {"parts": [{"text": system}]} if system else None
+    return system_instruction, contents
+def call(
+    provider: Provider,
+    model: str,
+    messages: list[Message],
+    *,
+    api_key: str | None,
+    env: dict[str, str],
+    max_tokens: int = 1024,
+    temperature: float = 0.0,
+    timeout: float = 90.0,
+    post: PostFn = default_post,
+) -> Reply:
+    """Dispatch one completion to ``provider`` and normalize the response.
+    Raises :class:`ProviderHTTPError` on a non-200 status.
+    """
+    if _is_thinking(model) and max_tokens < _THINKING_FLOOR:
+        # Give reasoning models room so hidden reasoning doesn't eat the whole
+        # budget and return empty content.
+        max_tokens = _THINKING_FLOOR
+    if provider.adapter == "gemini":
+        return _call_gemini(
+            provider,
+            model,
+            messages,
+            api_key=api_key,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            timeout=timeout,
+            post=post,
+        )
+    # openai + cloudflare share the chat/completions shape.
+    return _call_openai(
+        provider,
+        model,
+        messages,
+        api_key=api_key,
+        env=env,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        timeout=timeout,
+        post=post,
+    )
+def _call_openai(
+    provider: Provider,
+    model: str,
+    messages: list[Message],
+    *,
+    api_key: str | None,
+    env: dict[str, str],
+    max_tokens: int,
+    temperature: float,
+    timeout: float,
+    post: PostFn,
+) -> Reply:
+    base_url = provider.base_url
+    if provider.adapter == "cloudflare":
+        account_id = env.get("CLOUDFLARE_ACCOUNT_ID", "")
+        base_url = base_url.replace("{account_id}", account_id)
+    url = f"{base_url}/chat/completions"
+    headers = {"Content-Type": "application/json"}
+    if api_key:  # keyless providers (e.g. OVH anonymous) send no auth header
+        headers["Authorization"] = f"Bearer {api_key}"
+    body = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "stream": False,
+    }
+    result = post(url, headers, body, timeout)
+    if result.status != 200:
+        raise ProviderHTTPError(
+            result.status, _err_message(result), retryable=_retryable(result.status)
+        )
+    choices = result.body.get("choices") or []
+    if not choices:
+        raise ProviderHTTPError(502, "no choices in response", retryable=True)
+    message = choices[0].get("message") or {}
+    text = _strip_think(message.get("content") or "")
+    usage = result.body.get("usage") or {}
+    return Reply(
+        text=text,
+        provider_id=provider.id,
+        model=model,
+        raw=result.body,
+        prompt_tokens=usage.get("prompt_tokens"),
+        completion_tokens=usage.get("completion_tokens"),
+    )
+def _call_gemini(
+    provider: Provider,
+    model: str,
+    messages: list[Message],
+    *,
+    api_key: str | None,
+    max_tokens: int,
+    temperature: float,
+    timeout: float,
+    post: PostFn,
+) -> Reply:
+    system_instruction, contents = _to_gemini_contents(messages)
+    url = f"{provider.base_url}/models/{model}:generateContent"
+    headers = {
+        "Content-Type": "application/json",
+        "x-goog-api-key": api_key,
+    }
+    body: dict = {
+        "contents": contents,
+        "generationConfig": {
+            "maxOutputTokens": max_tokens,
+            "temperature": temperature,
+        },
+    }
+    if system_instruction:
+        body["systemInstruction"] = system_instruction
+    result = post(url, headers, body, timeout)
+    if result.status != 200:
+        raise ProviderHTTPError(
+            result.status, _err_message(result), retryable=_retryable(result.status)
+        )
+    candidates = result.body.get("candidates") or []
+    if not candidates:
+        raise ProviderHTTPError(502, "no candidates in response", retryable=True)
+    parts = (candidates[0].get("content") or {}).get("parts") or []
+    text = _strip_think("".join(p.get("text", "") for p in parts))
+    usage = result.body.get("usageMetadata") or {}
+    return Reply(
+        text=text,
+        provider_id=provider.id,
+        model=model,
+        raw=result.body,
+        prompt_tokens=usage.get("promptTokenCount"),
+        completion_tokens=usage.get("candidatesTokenCount"),
+    )

llmbuffet/config.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""Configuration loading: provider catalog + user overrides.
+Resolution order for the provider catalog:
+1. The packaged ``providers.toml`` (the built-in catalog).
+2. A user catalog at ``$LLMBUFFET_CONFIG`` or
+   ``~/.config/llmbuffet/providers.toml`` if present. Providers with the same
+   ``id`` override the built-ins; new ids are appended.
+Only providers whose API key (and any extra env vars) are present in the
+environment are returned by :func:`configured_providers`.
+"""
+from __future__ import annotations
+import os
+import tomllib
+from pathlib import Path
+from .models import Model, Provider
+_PACKAGED_CATALOG = Path(__file__).with_name("providers.toml")
+def _user_catalog_path() -> Path | None:
+    override = os.environ.get("LLMBUFFET_CONFIG")
+    if override:
+        return Path(override).expanduser()
+    default = Path.home() / ".config" / "llmbuffet" / "providers.toml"
+    return default if default.exists() else None
+def _parse_catalog(data: dict) -> list[Provider]:
+    providers: list[Provider] = []
+    for row in data.get("provider", []):
+        models = tuple(
+            Model(name=m["name"], rpd=int(m.get("rpd", 0))) for m in row.get("models", [])
+        )
+        providers.append(
+            Provider(
+                id=row["id"],
+                label=row.get("label", row["id"]),
+                adapter=row.get("adapter", "openai"),
+                base_url=row["base_url"].rstrip("/"),
+                key_env=row.get("key_env"),
+                auth=row.get("auth", "bearer"),
+                key_optional=bool(row.get("key_optional", False)),
+                models=models,
+                extra_env=tuple(row.get("extra_env", [])),
+            )
+        )
+    return providers
+def load_catalog(path: Path | None = None) -> list[Provider]:
+    """Load the full provider catalog (built-ins + user overrides)."""
+    base_path = path or _PACKAGED_CATALOG
+    with base_path.open("rb") as fh:
+        providers = _parse_catalog(tomllib.load(fh))
+    if path is None:
+        user_path = _user_catalog_path()
+        if user_path is not None:
+            with user_path.open("rb") as fh:
+                user_providers = _parse_catalog(tomllib.load(fh))
+            by_id = {p.id: p for p in providers}
+            for up in user_providers:
+                by_id[up.id] = up
+            providers = list(by_id.values())
+    return providers
+def configured_providers(
+    catalog: list[Provider] | None = None,
+    env: dict[str, str] | None = None,
+) -> list[Provider]:
+    """Return only providers that have a usable API key in the environment."""
+    catalog = catalog if catalog is not None else load_catalog()
+    env = env if env is not None else dict(os.environ)
+    return [p for p in catalog if p.is_configured(env)]

llmbuffet/errors.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""Exception hierarchy for llmbuffet."""
+from __future__ import annotations
+class BuffetError(Exception):
+    """Base class for all llmbuffet errors; every exception in this module derives from it."""
+class NoProvidersConfigured(BuffetError):
+    """Raised when no provider has a usable API key in the environment.
+
+    The CLI's ``ask`` command reports this and exits with status 3.
+    """
+class AllProvidersExhausted(BuffetError):
+    """Raised when every candidate provider failed or is over budget.
+    The ``attempts`` attribute holds a list of ``(target, reason)`` tuples
+    describing what was tried and why each one was skipped or failed.
+    """
+    def __init__(self, attempts: list[tuple[str, str]]):
+        self.attempts = attempts
+        detail = "; ".join(f"{name}: {reason}" for name, reason in attempts) or "no candidates"
+        super().__init__(f"all providers exhausted ({detail})")
+class ProviderHTTPError(BuffetError):
+    """A provider returned a non-success HTTP status.
+    ``status`` is the HTTP status code; ``retryable`` indicates whether the
+    router should move on to another provider (True) or give up (False).
+
+    NOTE(review): a comment in client.py's ``_retryable`` says the router
+    advances to another provider regardless of this flag — confirm against
+    the router which description is accurate.
+    """
+    def __init__(self, status: int, message: str, *, retryable: bool):
+        self.status = status
+        self.retryable = retryable
+        # The exception text is "HTTP <status>: <message>".
+        super().__init__(f"HTTP {status}: {message}")