PyPI - coffer-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

coffer-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

coffer_cli/__init__.py +3 -0
coffer_cli/_pricing.py +65 -0
coffer_cli/cli.py +193 -0
coffer_cli/patterns.py +670 -0
coffer_cli-0.1.0.dist-info/METADATA +138 -0
coffer_cli-0.1.0.dist-info/RECORD +9 -0
coffer_cli-0.1.0.dist-info/WHEEL +4 -0
coffer_cli-0.1.0.dist-info/entry_points.txt +2 -0
coffer_cli-0.1.0.dist-info/licenses/LICENSE +19 -0

coffer_cli/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""coffer-cli — LLM cost-waste anti-pattern scanner."""
+# Package version string; the CLI's `version` command prints this value.
+__version__ = "0.1.0"

coffer_cli/_pricing.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""Per-model pricing in USD per 1M tokens (vendored from internal tokens-core).
+Snapshot as of 2026-06. Update when providers change rates:
+  https://openai.com/pricing
+  https://www.anthropic.com/pricing
+Eventually this will be split into a community-maintained `coffer-pricing`
+package with a GitHub Action that scrapes provider docs. For now, vendored
+so coffer-cli is a single-package install on PyPI.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+@dataclass(frozen=True)
+class ModelPricing:
+    """Immutable price sheet for one model, in USD per 1M tokens."""
+    provider: str
+    model: str
+    # Rate applied to non-cached input tokens.
+    input_per_million: float
+    # Rate applied to output tokens.
+    output_per_million: float
+    # Rate applied to cached input tokens; None means no separate cached rate is recorded.
+    cached_input_per_million: float | None = None
+MODEL_PRICING: dict[str, ModelPricing] = {
+    # OpenAI ----------------------------------------------------------------
+    "gpt-4o": ModelPricing(
+        provider="openai",
+        model="gpt-4o",
+        input_per_million=2.50,
+        output_per_million=10.00,
+        cached_input_per_million=1.25,
+    ),
+    "gpt-4o-mini": ModelPricing(
+        provider="openai",
+        model="gpt-4o-mini",
+        input_per_million=0.15,
+        output_per_million=0.60,
+        cached_input_per_million=0.075,
+    ),
+    # Anthropic -- expand in Week 6 ----------------------------------------
+}
+def compute_cost(
+    *,
+    model: str,
+    input_tokens: int,
+    output_tokens: int,
+    cached_input_tokens: int = 0,
+) -> float:
+    """USD cost for one LLM call. Unknown models return 0.0."""
+    pricing = MODEL_PRICING.get(model)
+    if pricing is None:
+        return 0.0
+    fresh_input_tokens = max(input_tokens - cached_input_tokens, 0)
+    cached_rate = pricing.cached_input_per_million or pricing.input_per_million
+    input_cost = fresh_input_tokens / 1_000_000 * pricing.input_per_million
+    cached_cost = cached_input_tokens / 1_000_000 * cached_rate
+    output_cost = output_tokens / 1_000_000 * pricing.output_per_million
+    return round(input_cost + cached_cost + output_cost, 8)

coffer_cli/cli.py ADDED Viewed

@@ -0,0 +1,193 @@
+"""CLI entry point — scan / prices / compare."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Annotated
+import typer
+from rich.console import Console
+from rich.markup import escape
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+from coffer_cli import __version__
+from coffer_cli._pricing import MODEL_PRICING, compute_cost
+from coffer_cli.patterns import Finding, find_patterns
+# Typer application object; the @app.command() functions below register on it.
+# With no arguments the CLI prints help; shell-completion options are disabled.
+app = typer.Typer(
+    name="coffer",
+    help="LLM cost utility. Scan code for cost-waste anti-patterns, "
+    "look up model pricing, compare cost between models.",
+    no_args_is_help=True,
+    add_completion=False,
+)
+# Shared Rich console used for all human-readable output in this module.
+console = Console()
+# Severity label -> (Rich colour name, emoji) used for the first table column.
+_SEVERITY_STYLE = {
+    "high": ("red", "🚨"),
+    "medium": ("yellow", "🟡"),
+    "low": ("blue", "🔵"),
+}
+@app.command()
+def scan(
+    path: Annotated[
+        Path,
+        typer.Argument(help="Directory or file to scan. Defaults to current directory."),
+    ] = Path("."),
+    json_output: Annotated[
+        bool,
+        typer.Option(
+            "--json",
+            help="Emit JSON for programmatic consumption (e.g. CI, Claude Code skill).",
+        ),
+    ] = False,
+    severity: Annotated[
+        str,
+        typer.Option("--min-severity", help="Filter: high | medium | low"),
+    ] = "low",
+) -> None:
+    """Find LLM cost-waste anti-patterns: retry storms, loops without batching,
+    large uncached prompts, etc.
+    We deliberately do NOT estimate dollar cost — static analysis can't know
+    call volume. We find structural risks that founder review would have caught.
+    """
+    if not path.exists():
+        console.print(f"[red]Path not found:[/red] {path}")
+        raise typer.Exit(1)
+    findings = find_patterns(path)
+    threshold = {"high": 0, "medium": 1, "low": 2}.get(severity.lower(), 2)
+    findings = [f for f in findings if {"high": 0, "medium": 1, "low": 2}[f.severity] <= threshold]
+    if json_output:
+        typer.echo(json.dumps([f.to_dict() for f in findings], indent=2))
+        raise typer.Exit(0 if not findings else 0)
+    _print_human(path, findings)
+    if any(f.severity == "high" for f in findings):
+        raise typer.Exit(1)  # non-zero for CI gating on HIGH
+def _print_human(path: Path, findings: list[Finding]) -> None:
+    console.print(f"\nScanning [cyan]{path.resolve()}[/cyan]...")
+    if not findings:
+        console.print("\n[green]✓ No cost-waste anti-patterns detected.[/green]\n")
+        return
+    counts = {"high": 0, "medium": 0, "low": 0}
+    for f in findings:
+        counts[f.severity] += 1
+    console.print(
+        f"\nFound [bold]{len(findings)}[/bold] cost-risk findings: "
+        f"[red]{counts['high']} high[/red] · "
+        f"[yellow]{counts['medium']} medium[/yellow] · "
+        f"[blue]{counts['low']} low[/blue]\n"
+    )
+    table = Table(show_lines=True)
+    table.add_column("", width=3)
+    table.add_column("Where", style="cyan", no_wrap=False)
+    table.add_column("Pattern", style="magenta")
+    table.add_column("Suggestion", style="white")
+    for f in findings:
+        color, emoji = _SEVERITY_STYLE[f.severity]
+        table.add_row(
+            f"[{color}]{emoji}[/{color}]",
+            f"{f.path}:{f.line}\n[dim]{f.snippet}[/dim]",
+            f.pattern,
+            f.suggestion,
+        )
+    console.print(table)
+    console.print(
+        Panel(
+            Text.from_markup(
+                "[dim]Static analysis catches structural risks. For real per-feature "
+                "and per-user cost in production, see "
+                "[link=https://trycoffer.com]trycoffer.com[/link][/dim]"
+            ),
+            border_style="dim",
+        )
+    )
+@app.command()
+def prices() -> None:
+    """Show the current per-model pricing table."""
+    table = Table(title="Coffer model pricing (USD per 1M tokens)", title_style="bold")
+    table.add_column("Provider", style="dim")
+    table.add_column("Model", style="cyan")
+    table.add_column("Input", justify="right")
+    table.add_column("Cached input", justify="right")
+    table.add_column("Output", justify="right")
+    for model, p in MODEL_PRICING.items():
+        table.add_row(
+            p.provider,
+            model,
+            f"${p.input_per_million:.2f}",
+            f"${p.cached_input_per_million:.2f}" if p.cached_input_per_million else "—",
+            f"${p.output_per_million:.2f}",
+        )
+    console.print(table)
+@app.command()
+def compare(
+    model_a: Annotated[str, typer.Argument(help="First model.")],
+    model_b: Annotated[str, typer.Argument(help="Second model.")],
+    input_tokens: Annotated[int, typer.Option(help="Input tokens per call.")] = 1000,
+    output_tokens: Annotated[int, typer.Option(help="Output tokens per call.")] = 200,
+    calls_per_day: Annotated[int, typer.Option(help="Calls per day.")] = 1000,
+) -> None:
+    """Compare two models' per-call and monthly cost at a given volume."""
+    for m in (model_a, model_b):
+        if m not in MODEL_PRICING:
+            console.print(f"[red]Unknown model:[/red] {m}")
+            raise typer.Exit(1)
+    a = compute_cost(model=model_a, input_tokens=input_tokens, output_tokens=output_tokens)
+    b = compute_cost(model=model_b, input_tokens=input_tokens, output_tokens=output_tokens)
+    monthly_a = a * calls_per_day * 30
+    monthly_b = b * calls_per_day * 30
+    table = Table(title="Model cost comparison", title_style="bold")
+    table.add_column("Model", style="cyan")
+    table.add_column("Per call", justify="right")
+    table.add_column(f"Monthly @ {calls_per_day:,}/day", justify="right", style="bold")
+    table.add_row(model_a, f"${a:.6f}", f"${monthly_a:,.2f}")
+    table.add_row(model_b, f"${b:.6f}", f"${monthly_b:,.2f}")
+    console.print(table)
+    if monthly_a > 0 and monthly_b != monthly_a:
+        delta_pct = round((1 - monthly_b / monthly_a) * 100)
+        if delta_pct > 0:
+            console.print(
+                f"\n[green]{model_b}[/green] is "
+                f"[bold]{delta_pct}%[/bold] cheaper than [magenta]{model_a}[/magenta] "
+                f"at this volume."
+            )
+        else:
+            console.print(
+                f"\n[yellow]{model_b}[/yellow] is "
+                f"[bold]{-delta_pct}%[/bold] more expensive than {model_a}."
+            )
+@app.command()
+def version() -> None:
+    """Print the version."""
+    console.print(f"coffer-cli {__version__}")
+# Run the Typer app when this module is executed as a script.
+if __name__ == "__main__":
+    app()

coffer_cli/patterns.py ADDED Viewed

@@ -0,0 +1,670 @@
+"""Static detection of LLM cost-waste anti-patterns.
+We aim for low false-positive rate over completeness. A finding should
+be defensible: a reviewer who reads the snippet should agree it's a
+real risk in most cases.
+Detector catalog (by cost lever):
+  Lever A — input tokens
+    uncached_large_prompt          MED   Large hardcoded prompt without nearby cache_control
+    dynamic_before_static_cache_break  HIGH  f-string interpolation in system message breaks auto-cache
+    unbounded_conversation_history MED   `messages.append(...)` without truncation
+  Lever B — output tokens
+    missing_max_tokens             MED   LLM call without `max_tokens` cap (catalogued only; no detector in this module yet)
+    reasoning_effort_high_default  MED   `reasoning_effort="high"` literal
+  Lever C — price per token
+    (semantic — handled in skill, not CLI)
+  Lever D — number of calls
+    llm_in_for_loop                MED   N× cost; Batch API / merged prompt are fixes
+    agent_loop_no_max_iter         HIGH  `while True:` containing LLM call without iter cap
+    temperature_nonzero_with_cache_hint  MED   `temperature > 0` next to a cache hint — silently breaks it
+  Lever E — architecture / safety
+    retry_loop_no_backoff          HIGH  Retry storm risk
+    sdk_init_no_timeout            HIGH  SDK initialized without `timeout=`
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, Literal
+# The three severity labels a Finding may carry.
+Severity = Literal["high", "medium", "low"]
+# Text fragments treated as evidence that a line makes an LLM call (or builds
+# an LLM client). Matching is purely textual.
+_LLM_CALL_RE = re.compile(
+    r"""
+    (
+        \.chat\.completions\.create\(    |   # OpenAI sync
+        \.completions\.create\(          |
+        \.messages\.create\(             |   # Anthropic
+        \.responses\.create\(            |   # OpenAI Responses API
+        \.generate_content\(             |   # Google Gemini
+        litellm\.(?:a)?completion\(      |
+        ChatOpenAI\(                     |   # LangChain
+        ChatAnthropic\(                  |
+        Anthropic\(\)\.messages          |
+        Anthropic\(\)\.completions
+    )
+    """,
+    re.VERBOSE,
+)
+# Loop heads treated as retry loops: any `for <name> in range(` line, or a
+# `while` line that mentions retry / retries / attempts (case-insensitive).
+_RETRY_LOOP_RE = re.compile(
+    r"""
+    (
+        for \s+ \w+ \s+ in \s+ range\(    |   # for attempt in range(...)
+        while \s+ .* (?: retry | retries | attempts )   # while retries < N
+    )
+    """,
+    re.VERBOSE | re.IGNORECASE,
+)
+# Evidence of a delay/backoff strategy: backoff or tenacity usage, `@retry(`,
+# a sleep call, or a `2 ** attempt` / `2 ** retry` expression.
+_BACKOFF_RE = re.compile(
+    r"""
+    (
+        backoff\.                          |   # backoff library
+        @retry\(                           |   # tenacity / retrying
+        tenacity\.                         |
+        time\.sleep\(                      |
+        asyncio\.sleep\(                   |
+        2 \s* \*\* \s* attempt             |   # 2**attempt pattern
+        2 \s* \*\* \s* retry
+    )
+    """,
+    re.VERBOSE,
+)
+# A `for`/`while` head whose line ends with a colon (optionally followed by
+# whitespace only).
+_FOR_LOOP_HEAD_RE = re.compile(r"^\s*(for|while)\s+.+:\s*$")
+_INDENT_RE = re.compile(r"^( *|\t*)")
+# Markers that count as "caching is in use near this prompt".
+_CACHE_CONTROL_RE = re.compile(
+    r"""
+    (
+        cache_control                |
+        prompt_caching               |
+        cache_key                    |
+        @lru_cache                   |
+        @cache
+    )
+    """,
+    re.VERBOSE,
+)
+# An assignment whose target name contains system / prompt / instruction /
+# template (case-insensitive). Group 1 is the variable name.
+_PROMPT_VAR_HINT_RE = re.compile(
+    r"^\s*(\w*(?:system|prompt|instruction|template)\w*)\s*=\s*",
+    re.IGNORECASE,
+)
+# Match only variables that semantically represent the STATIC SYSTEM prefix.
+# Anything user-side (user_prompt, user_message, ...) is inherently dynamic and
+# is NOT a cache-break risk.
+# The value must be a single-line f-string containing at least one `{...}`
+# placeholder made of word characters, dots, brackets or quotes.
+_FSTRING_PROMPT_RE = re.compile(
+    r"""
+    ^\s*
+    (
+        SYSTEM_PROMPT     |
+        SYSTEM_MESSAGE    |
+        SYSTEM_INSTRUCTION(S)?  |
+        system_prompt     |
+        system_message    |
+        system_instruction(s)?  |
+        sys_prompt        |
+        SYS_PROMPT        |
+        SYSTEM            |
+        system            |
+        INSTRUCTIONS?     |
+        instructions?
+    )
+    \s* = \s*
+    f ["']
+    (?: [^"']|\\.)*?
+    \{ [\w.\[\]'"]+ \}
+    """,
+    re.VERBOSE,
+)
+# Open-ended loop heads: `while True:`, `while 1:` or `while not <name>:`,
+# optionally followed by a trailing comment.
+_AGENT_LOOP_HEAD_RE = re.compile(
+    r"^\s*while\s+(?:True|not\s+\w+|1)\s*:\s*(?:#.*)?$"
+)
+# `<name>.append(` at the start of a line, where the name contains messages /
+# history / conversation / chat. Group 1 is the variable name.
+_HISTORY_APPEND_RE = re.compile(
+    r"^\s*(\w*(?:messages|history|conversation|chat)\w*)\.append\("
+)
+# Signs that history is being bounded somewhere: slicing with a numeric bound,
+# `.pop(0)`, or summarise/compact/truncate/trim/memory helper names.
+_HISTORY_TRUNCATE_RE = re.compile(
+    r"""
+    (
+        \[\s*-?\d+\s*:\s*\]          |   # messages[-10:]
+        \[\s*:\s*-?\d+\s*\]          |   # messages[:10]
+        \.pop\(\s*0\s*\)              |
+        \[\s*1:\s*\]                  |
+        summari[sz]e_                 |
+        compact_                      |
+        truncate                      |
+        trim_                         |
+        memory\.add                   |
+        mem0
+    )
+    """,
+    re.VERBOSE,
+)
+# The literal keyword argument `reasoning_effort="high"` (either quote style).
+_REASONING_EFFORT_HIGH_RE = re.compile(
+    r"""reasoning_effort \s* = \s* ['"]high['"]""",
+    re.VERBOSE,
+)
+# Construction of an OpenAI / Anthropic client, sync or async. Group 1 is the
+# class name; the match ends just after the opening parenthesis.
+_SDK_INIT_RE = re.compile(
+    r"""
+    \b
+    (OpenAI | AsyncOpenAI | Anthropic | AsyncAnthropic)
+    \(
+    """,
+    re.VERBOSE,
+)
+# A `timeout=` keyword argument.
+_TIMEOUT_KW_RE = re.compile(r"\btimeout\s*=")
+# `temperature=<number>` with a plain decimal literal. Group 1 is the number.
+_TEMPERATURE_RE = re.compile(r"\btemperature\s*=\s*([0-9]*\.?[0-9]+)")
+# Signs of a response cache near a call: cache decorators, cache get/set
+# calls, or well-known cache backends/libraries.
+_CACHE_HINT_NEARBY_RE = re.compile(
+    r"""
+    (
+        @lru_cache              |
+        @cache\b                |
+        @cached\b               |
+        functools\.cache        |
+        \bcache\.get\(          |
+        \bcache\.set\(          |
+        \bredis\b               |
+        \bmemcache\b            |
+        diskcache\.             |
+        cachetools\.            |
+        TTLCache
+    )
+    """,
+    re.VERBOSE,
+)
+# File suffixes scanned by default.
+_DEFAULT_INCLUDE_SUFFIXES = (".py", ".ts", ".tsx", ".js", ".jsx", ".mjs")
+# Directory names whose contents are never scanned.
+_DEFAULT_SKIP_DIRS = frozenset(
+    {
+        ".git", ".venv", "venv", "node_modules", ".next", "dist", "build",
+        "__pycache__", ".mypy_cache", ".ruff_cache", ".pytest_cache",
+        ".turbo", "out", ".coffer-cache", "site-packages",
+    }
+)
+# Minimum chars in a hardcoded string before we suspect "large uncached prompt".
+_LARGE_PROMPT_THRESHOLD = 2_000
+@dataclass(frozen=True)
+class Finding:
+    severity: Severity
+    pattern: str
+    path: Path
+    line: int
+    snippet: str
+    suggestion: str
+    def to_dict(self) -> dict:
+        return {
+            "severity": self.severity,
+            "pattern": self.pattern,
+            "file": str(self.path),
+            "line": self.line,
+            "snippet": self.snippet,
+            "suggestion": self.suggestion,
+        }
+# ---- detectors --------------------------------------------------------------
+def _detect_retry_loops(path: Path, lines: list[str]) -> list[Finding]:
+    findings: list[Finding] = []
+    for i, line in enumerate(lines):
+        if not _RETRY_LOOP_RE.search(line):
+            continue
+        # Find loop body indent
+        indent_match = _INDENT_RE.match(line)
+        loop_indent = len(indent_match.group(1)) if indent_match else 0
+        body_lines: list[str] = []
+        has_llm = False
+        has_backoff = False
+        for j in range(i + 1, min(i + 40, len(lines))):
+            body = lines[j]
+            if not body.strip():
+                continue
+            body_indent_match = _INDENT_RE.match(body)
+            body_indent = len(body_indent_match.group(1)) if body_indent_match else 0
+            if body_indent <= loop_indent:
+                break
+            body_lines.append(body)
+            if _LLM_CALL_RE.search(body):
+                has_llm = True
+            if _BACKOFF_RE.search(body):
+                has_backoff = True
+        if has_llm and not has_backoff:
+            findings.append(
+                Finding(
+                    severity="high",
+                    pattern="retry_loop_no_backoff",
+                    path=path,
+                    line=i + 1,
+                    snippet=line.strip()[:200],
+                    suggestion=(
+                        "Add exponential backoff (e.g. `@backoff.on_exception(backoff.expo, "
+                        "RateLimitError, max_tries=5)`). A single rate-limit storm without "
+                        "backoff can multiply your bill 10x."
+                    ),
+                )
+            )
+    return findings
+def _detect_llm_in_loop(path: Path, lines: list[str]) -> list[Finding]:
+    findings: list[Finding] = []
+    loop_stack: list[tuple[int, int]] = []  # (line_idx, indent)
+    for i, line in enumerate(lines):
+        if not line.strip() or line.lstrip().startswith("#"):
+            continue
+        indent_match = _INDENT_RE.match(line)
+        cur_indent = len(indent_match.group(1)) if indent_match else 0
+        # Pop loops whose body we exited
+        while loop_stack and cur_indent <= loop_stack[-1][1]:
+            loop_stack.pop()
+        if _FOR_LOOP_HEAD_RE.match(line):
+            loop_stack.append((i, cur_indent))
+            continue
+        if loop_stack and _LLM_CALL_RE.search(line):
+            # Skip if this loop also looks like a retry loop — that's covered
+            # by the retry detector with HIGH severity.
+            loop_line = lines[loop_stack[-1][0]]
+            if _RETRY_LOOP_RE.search(loop_line):
+                continue
+            findings.append(
+                Finding(
+                    severity="medium",
+                    pattern="llm_in_for_loop",
+                    path=path,
+                    line=i + 1,
+                    snippet=line.strip()[:200],
+                    suggestion=(
+                        "N LLM calls in a loop = N× token cost — asyncio.gather only fixes "
+                        "latency, not the bill. Real cost fixes: (1) OpenAI Batch API for 50% off "
+                        "on async workloads; (2) merge into one richer prompt that processes the "
+                        "whole batch; (3) enable prompt caching if the system prompt repeats."
+                    ),
+                )
+            )
+    return findings
+def _detect_large_uncached_prompts(path: Path, lines: list[str]) -> list[Finding]:
+    """Flag large triple-quoted prompt literals with no cache marker nearby.
+    A candidate is an assignment matched by _PROMPT_VAR_HINT_RE whose
+    right-hand side opens a triple-quoted string on the same line. The literal
+    is reported when it is at least _LARGE_PROMPT_THRESHOLD characters long
+    and _CACHE_CONTROL_RE matches nothing in a window of about 30 lines either
+    side of it.
+    """
+    findings: list[Finding] = []
+    # Manual index so a whole multi-line literal can be skipped in one step.
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        match = _PROMPT_VAR_HINT_RE.match(line)
+        if not match:
+            i += 1
+            continue
+        # Look for a triple-quoted string opening on this line, after the `=`
+        joined: list[str] = []
+        opener: str | None = None
+        for q in ('"""', "'''"):
+            if q in line[match.end():]:
+                opener = q
+                break
+        if opener is None:
+            i += 1
+            continue
+        # Collect until closing triple quote
+        start_pos = line.find(opener, match.end())
+        rest_of_start = line[start_pos + len(opener):]
+        joined.append(rest_of_start)
+        closed_on_start = opener in rest_of_start
+        end_line = i
+        if not closed_on_start:
+            # Search at most 199 further lines for the closing quotes.
+            for j in range(i + 1, min(i + 200, len(lines))):
+                joined.append(lines[j])
+                if opener in lines[j]:
+                    end_line = j
+                    break
+            else:
+                # Never closed within the search range: give up on this line.
+                i += 1
+                continue
+        else:
+            end_line = i
+        full = "\n".join(joined)
+        # Remove the trailing opener piece
+        if opener in full:
+            full = full[: full.rfind(opener)]
+        if len(full) < _LARGE_PROMPT_THRESHOLD:
+            i = end_line + 1
+            continue
+        # Look in a window around the prompt for cache_control usage
+        window_start = max(0, i - 30)
+        window_end = min(len(lines), end_line + 30)
+        window = "\n".join(lines[window_start:window_end])
+        if _CACHE_CONTROL_RE.search(window):
+            i = end_line + 1
+            continue
+        var_name = match.group(1)
+        findings.append(
+            Finding(
+                severity="medium",
+                pattern="uncached_large_prompt",
+                path=path,
+                line=i + 1,
+                # The snippet shows the literal's size, not its content.
+                snippet=f"{var_name} = '''[{len(full):,} chars]'''",
+                suggestion=(
+                    "Large hardcoded prompt with no nearby cache_control. If called repeatedly, "
+                    "wrap in Anthropic cache_control={'type': 'ephemeral'} or rely on OpenAI's "
+                    "automatic caching to cut input cost 60-90%."
+                ),
+            )
+        )
+        # Resume scanning after the literal.
+        i = end_line + 1
+    return findings
+def _detect_dynamic_before_static_cache_break(
+    path: Path, lines: list[str]
+) -> list[Finding]:
+    """f-string interpolation in a system/instruction var — kills prefix caching.
+    OpenAI auto-caches prefixes ≥1024 tokens. Anthropic uses cache_control
+    on stable prefixes. Both break if the prompt starts with dynamic content.
+    """
+    findings: list[Finding] = []
+    for i, line in enumerate(lines):
+        if not _FSTRING_PROMPT_RE.match(line):
+            continue
+        findings.append(
+            Finding(
+                severity="high",
+                pattern="dynamic_before_static_cache_break",
+                path=path,
+                line=i + 1,
+                snippet=line.strip()[:200],
+                suggestion=(
+                    "An f-string interpolation in this system/prompt variable defeats "
+                    "automatic prefix caching (OpenAI auto-cache + Anthropic cache_control). "
+                    "Restructure: put all dynamic content LAST (in messages[]), keep the static "
+                    "prefix at the top. Or split: static system message + dynamic user message."
+                ),
+            )
+        )
+    return findings
+def _detect_unbounded_conversation_history(
+    path: Path, lines: list[str]
+) -> list[Finding]:
+    """`messages.append(...)` with no truncation/summarization in the file."""
+    findings: list[Finding] = []
+    appends: list[tuple[int, str]] = []
+    for i, line in enumerate(lines):
+        match = _HISTORY_APPEND_RE.match(line)
+        if match:
+            appends.append((i, match.group(1)))
+    if not appends:
+        return findings
+    # Look at the whole file for any truncation/summarization indicator.
+    full = "\n".join(lines)
+    if _HISTORY_TRUNCATE_RE.search(full):
+        return findings
+    # One finding per file, at the first append.
+    i, var = appends[0]
+    findings.append(
+        Finding(
+            severity="medium",
+            pattern="unbounded_conversation_history",
+            path=path,
+            line=i + 1,
+            snippet=lines[i].strip()[:200],
+            suggestion=(
+                f"`{var}` grows without bound — every turn adds tokens permanently. "
+                "Cap with sliding window (`messages = messages[-N:]`), summarize old turns "
+                "(Mem0 / custom compaction), or use the provider's `previous_response_id` chain."
+            ),
+        )
+    )
+    return findings
+def _detect_agent_loop_no_max_iter(path: Path, lines: list[str]) -> list[Finding]:
+    """`while True:` containing an LLM call without a max-iteration counter.
+    The canonical $47K-incident pattern. Detect:
+      - `while True:` head
+      - LLM call inside body
+      - no `range(`/`max_iter`/`max_steps`/iteration counter pattern in body
+    """
+    findings: list[Finding] = []
+    for i, line in enumerate(lines):
+        if not _AGENT_LOOP_HEAD_RE.match(line):
+            continue
+        indent_match = _INDENT_RE.match(line)
+        loop_indent = len(indent_match.group(1)) if indent_match else 0
+        body: list[str] = []
+        for j in range(i + 1, min(i + 80, len(lines))):
+            body_line = lines[j]
+            if not body_line.strip():
+                continue
+            body_indent_match = _INDENT_RE.match(body_line)
+            body_indent = len(body_indent_match.group(1)) if body_indent_match else 0
+            if body_indent <= loop_indent:
+                break
+            body.append(body_line)
+        body_text = "\n".join(body)
+        if not _LLM_CALL_RE.search(body_text):
+            continue
+        # Heuristic for "has an iteration cap": find a counter pattern AND a break/return.
+        has_counter = bool(
+            re.search(
+                r"""(\bmax_(?:iter|steps|turns|rounds)\b|\biter(?:ation)?s?\s*[+\-*/]?=|\bcount\s*[+\-*/]?=)""",
+                body_text,
+            )
+        )
+        has_break_or_return = bool(re.search(r"\b(break|return)\b", body_text))
+        if has_counter and has_break_or_return:
+            continue
+        findings.append(
+            Finding(
+                severity="high",
+                pattern="agent_loop_no_max_iter",
+                path=path,
+                line=i + 1,
+                snippet=line.strip()[:200],
+                suggestion=(
+                    "Unbounded agent loop containing an LLM call. A single mis-firing tool "
+                    "or model can spin forever — there is a documented $47K incident from this "
+                    "exact pattern. Add `max_iter` counter and break, or use the provider's "
+                    "explicit agent loop (OpenAI Responses with `max_tool_rounds`, "
+                    "Anthropic tool_use with explicit termination check)."
+                ),
+            )
+        )
+    return findings
+def _detect_temperature_nonzero_with_cache_hint(
+    path: Path, lines: list[str]
+) -> list[Finding]:
+    """`temperature > 0` near a cache hint silently breaks the cache.
+    Each call has different sampling → identical inputs produce different
+    outputs → response cache misses every time. Developer thinks they're
+    caching, but they're not.
+    """
+    findings: list[Finding] = []
+    for i, line in enumerate(lines):
+        m = _TEMPERATURE_RE.search(line)
+        if not m:
+            continue
+        try:
+            if float(m.group(1)) <= 0:
+                continue
+        except ValueError:
+            continue
+        # Look 30 lines up and 30 lines down for a cache hint.
+        window_start = max(0, i - 30)
+        window_end = min(len(lines), i + 30)
+        window = "\n".join(lines[window_start:window_end])
+        if not _CACHE_HINT_NEARBY_RE.search(window):
+            continue
+        # If `temperature=0` is also seen in window, the user is mixing — still worth a hint
+        findings.append(
+            Finding(
+                severity="medium",
+                pattern="temperature_nonzero_with_cache_hint",
+                path=path,
+                line=i + 1,
+                snippet=line.strip()[:200],
+                suggestion=(
+                    "A cache decorator/store is nearby, but this call sets `temperature > 0` — "
+                    "sampling makes each response different, so the cache never hits on "
+                    "subsequent identical inputs. Set `temperature=0` for cache-eligible "
+                    "deterministic tasks, OR remove the cache layer if you genuinely need "
+                    "varied outputs."
+                ),
+            )
+        )
+    return findings
+def _detect_reasoning_effort_high_default(
+    path: Path, lines: list[str]
+) -> list[Finding]:
+    """`reasoning_effort="high"` literal — usually a copy-paste from docs."""
+    findings: list[Finding] = []
+    for i, line in enumerate(lines):
+        if not _REASONING_EFFORT_HIGH_RE.search(line):
+            continue
+        findings.append(
+            Finding(
+                severity="medium",
+                pattern="reasoning_effort_high_default",
+                path=path,
+                line=i + 1,
+                snippet=line.strip()[:200],
+                suggestion=(
+                    "`reasoning_effort=\"high\"` is the new \"GPT-4 for everything\". On trivial "
+                    "tasks it can produce ~20× extra reasoning tokens at full output price "
+                    "(see arXiv 2412.21187). Default to `medium` or `low` and only escalate "
+                    "for tasks that empirically need it."
+                ),
+            )
+        )
+    return findings
+def _detect_sdk_init_no_timeout(path: Path, lines: list[str]) -> list[Finding]:
+    """`OpenAI()` / `Anthropic()` constructed without `timeout=`."""
+    findings: list[Finding] = []
+    for i, line in enumerate(lines):
+        m = _SDK_INIT_RE.search(line)
+        if not m:
+            continue
+        # Look at the next ~5 lines too in case the kwargs span lines.
+        end = min(i + 5, len(lines))
+        joined = "\n".join(lines[i:end])
+        # Locate the close paren of this constructor.
+        depth = 0
+        start_pos = joined.index(m.group(0)) + len(m.group(0))
+        body = ""
+        for ch in joined[start_pos:]:
+            body += ch
+            if ch == "(":
+                depth += 1
+            elif ch == ")":
+                if depth == 0:
+                    break
+                depth -= 1
+        if _TIMEOUT_KW_RE.search(body):
+            continue
+        findings.append(
+            Finding(
+                severity="high",
+                pattern="sdk_init_no_timeout",
+                path=path,
+                line=i + 1,
+                snippet=line.strip()[:200],
+                suggestion=(
+                    f"`{m.group(1)}` initialized without `timeout=`. Default is 600s — a hung "
+                    "provider can block your thread for ten minutes. Pass an explicit timeout "
+                    "(e.g. `timeout=30.0`) sized to your user-facing latency budget."
+                ),
+            )
+        )
+    return findings
+# ---- top-level --------------------------------------------------------------
+def _walk_files(root: Path, suffixes: tuple[str, ...]) -> Iterable[Path]:
+    if root.is_file():
+        if root.suffix in suffixes:
+            yield root
+        return
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix not in suffixes:
+            continue
+        if any(part in _DEFAULT_SKIP_DIRS for part in path.parts):
+            continue
+        yield path
+def find_patterns(
+    root: Path,
+    suffixes: tuple[str, ...] = _DEFAULT_INCLUDE_SUFFIXES,
+) -> list[Finding]:
+    findings: list[Finding] = []
+    for path in _walk_files(root, suffixes):
+        try:
+            text = path.read_text(encoding="utf-8", errors="replace")
+        except OSError:
+            continue
+        lines = text.splitlines()
+        findings.extend(_detect_retry_loops(path, lines))
+        findings.extend(_detect_llm_in_loop(path, lines))
+        findings.extend(_detect_large_uncached_prompts(path, lines))
+        findings.extend(_detect_dynamic_before_static_cache_break(path, lines))
+        findings.extend(_detect_unbounded_conversation_history(path, lines))
+        findings.extend(_detect_agent_loop_no_max_iter(path, lines))
+        findings.extend(_detect_temperature_nonzero_with_cache_hint(path, lines))
+        findings.extend(_detect_reasoning_effort_high_default(path, lines))
+        findings.extend(_detect_sdk_init_no_timeout(path, lines))
+    severity_order = {"high": 0, "medium": 1, "low": 2}
+    findings.sort(key=lambda f: (severity_order[f.severity], str(f.path), f.line))
+    return findings

coffer_cli-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,138 @@
+Metadata-Version: 2.4
+Name: coffer-cli
+Version: 0.1.0
+Summary: Scan codebases for LLM cost-waste anti-patterns. Find retry storms, missing prompt caching, unbounded conversation history, agent loops without iteration caps, and more — before you ship.
+Project-URL: Homepage, https://github.com/neal-c611/coffer-cli
+Project-URL: Repository, https://github.com/neal-c611/coffer-cli
+Project-URL: Issues, https://github.com/neal-c611/coffer-cli/issues
+Author: Neal
+License-Expression: Apache-2.0
+License-File: LICENSE
+Keywords: anthropic,claude,claude-code,cost,finops,gpt,linter,llm,openai,skill,static-analysis
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Software Development :: Code Generators
+Classifier: Topic :: Software Development :: Quality Assurance
+Classifier: Topic :: Utilities
+Requires-Python: >=3.10
+Requires-Dist: rich>=13.9
+Requires-Dist: typer>=0.13
+Provides-Extra: dev
+Requires-Dist: pytest>=8; extra == 'dev'
+Description-Content-Type: text/markdown
+# coffer-cli
+> Scan your code for LLM cost-waste anti-patterns before you ship.
+`coffer-cli` is a static scanner for production AI code. It catches the
+mistakes that show up at month-end on your OpenAI / Anthropic bill —
+retry storms, missing prompt caching, unbounded conversation history,
+agent loops without iteration caps, SDK inits without timeouts, and
+more.
+It is intentionally **not** a magic dollar estimator. Static analysis
+cannot know call volume; we leave that to live tracking. Instead, we
+surface structural risks that a careful reviewer would catch — but
+faster, in CI, on every commit.
+```bash
+pipx install coffer-cli
+coffer scan ./my-app
+coffer scan ./my-app --json     # for CI / Claude Code skill consumption
+coffer prices                    # current model pricing table
+coffer compare gpt-4o gpt-4o-mini
+```
+## What it catches (v0.1.0)
+Detectors are organized by the four levers that drive LLM cost:
+| Lever | Detector | Severity |
+|-------|----------|----------|
+| **A: input tokens** | `dynamic_before_static_cache_break` — f-string interpolation in `SYSTEM_PROMPT` defeats OpenAI auto-cache and Anthropic `cache_control` | 🚨 high |
+| | `unbounded_conversation_history` — `messages.append(...)` without truncation or summarization | 🟡 med |
+| | `uncached_large_prompt` — ≥2,000-char hardcoded prompt without nearby `cache_control` | 🟡 med |
+| **B: output tokens** | `missing_max_tokens` — LLM call without a `max_tokens` cap | 🟡 med |
+| | `reasoning_effort_high_default` — `reasoning_effort="high"` literal (up to ~20× extra reasoning tokens on trivial tasks) | 🟡 med |
+| **D: number of calls** | `llm_in_for_loop` — N× cost; gather is a latency fix, not a cost fix | 🟡 med |
+| | `agent_loop_no_max_iter` — `while True:` containing an LLM call without an iteration cap (the $47K-incident pattern) | 🚨 high |
+| | `temperature_nonzero_with_cache_hint` — cache layer nearby but `temperature > 0` silently breaks it | 🟡 med |
+| **E: architecture** | `retry_loop_no_backoff` — retry storm amplifies the bill 10× | 🚨 high |
+| | `sdk_init_no_timeout` — default 600s lets a hung provider block your thread | 🚨 high |
+Each finding includes a concrete fix and explains the *cost* angle
+explicitly (we do not conflate latency fixes with cost fixes).
+## Use with Claude Code (the skill)
+The `coffer-cost-review` Claude Code skill in [`skills/`](skills/coffer-cost-review/)
+combines this scanner with Claude's semantic judgment. In Claude Code, ask
+*"review my LLM costs"* and the skill will:
+1. Run `coffer scan <path> --json` for deterministic findings
+2. Read each flagged file in context to filter false positives
+3. Add semantic-only checks the scanner cannot do
+   (frontier model used for trivial tasks, free-form output where structured
+    works, public endpoints without rate limit, ...)
+4. Produce a severity-ranked review with concrete code-diff fixes
+Install:
+```bash
+git clone https://github.com/neal-c611/coffer-cli
+mkdir -p ~/.claude/skills
+cp -r coffer-cli/skills/coffer-cost-review ~/.claude/skills/
+```
+## What it deliberately does NOT do
+- **No invented dollar estimates.** Call volume is unknowable from static
+  code. We report severity, not numbers.
+- **No proxy mode.** Your LLM calls go directly to your providers.
+- **No auto-rewrites.** Suggestions only; you stay in control.
+For live production cost tracking with per-feature and per-user attribution
+(the part static analysis genuinely can't do), see
+[Coffer](https://trycoffer.com).
+## Exit codes
+- `0` — clean, or only `medium`/`low` findings
+- `1` — at least one `high` finding (use for CI gating)
+## Development
+```bash
+git clone https://github.com/neal-c611/coffer-cli
+cd coffer-cli
+uv sync --extra dev
+uv run pytest
+```
+Patterns are detected by `src/coffer_cli/patterns.py` (regex-based,
+single-file scope) and rendered by `src/coffer_cli/cli.py` (typer +
+rich).
+Contributions welcome. New detectors should:
+- Default to **medium** severity; reserve **high** for patterns that
+  are demonstrably cost-amplifying in production
+- Include a test in `tests/test_patterns.py` showing both a
+  positive case AND a negative case (the negative case is what
+  keeps false-positive rate low)
+- Propose a *cost* fix, not a *latency* fix. Wrapping things in
+  `asyncio.gather` does not reduce the bill.
+## License
+Apache 2.0.

coffer_cli-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+coffer_cli/__init__.py,sha256=2-e4Nzx2PZ4MjjZUjX06YxZMvHU7PTkgq9dMJnHOwRQ,81
+coffer_cli/_pricing.py,sha256=LP6IsgMN6EMh9aqjIXymRukSQGvAGIcIrW0S7WcGhxQ,1971
+coffer_cli/cli.py,sha256=ICOsBhhgLHgtmg4nbSKUCeKB9fSvdi3QDSWTEefXpoo,6425
+coffer_cli/patterns.py,sha256=jhXd7OxNvin2RacRpjy_yV7PjiA1EdHjHkq11YiAgbI,22841
+coffer_cli-0.1.0.dist-info/METADATA,sha256=xi5ojjYYJRPbwpqSfT_LLvnoKrzDAVP9LPawMI7UWr0,5827
+coffer_cli-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+coffer_cli-0.1.0.dist-info/entry_points.txt,sha256=lXg91UV4sieB5Jp5Dnid24LvwZR3RTpsxKeQyHqgsc4,46
+coffer_cli-0.1.0.dist-info/licenses/LICENSE,sha256=Jq8DXUheBqOklp-ZsjJNrUv8QbJPWH4cWjqbtEWE9hw,794
+coffer_cli-0.1.0.dist-info/RECORD,,

coffer_cli-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

coffer_cli-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+coffer = coffer_cli.cli:app

coffer_cli-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,19 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   Copyright 2026 Coffer
+   Full text: https://www.apache.org/licenses/LICENSE-2.0