coffer-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coffer_cli/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """coffer-cli — LLM cost-waste anti-pattern scanner."""
2
+
3
+ __version__ = "0.1.0"
coffer_cli/_pricing.py ADDED
@@ -0,0 +1,65 @@
1
+ """Per-model pricing in USD per 1M tokens (vendored from internal tokens-core).
2
+
3
+ Snapshot as of 2026-06. Update when providers change rates:
4
+ https://openai.com/pricing
5
+ https://www.anthropic.com/pricing
6
+
7
+ Eventually this will be split into a community-maintained `coffer-pricing`
8
+ package with a GitHub Action that scrapes provider docs. For now, vendored
9
+ so coffer-cli is a single-package install on PyPI.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass
15
+
16
+
17
@dataclass(frozen=True)
class ModelPricing:
    """Immutable price record for one model, in USD per 1M tokens."""

    provider: str
    model: str
    input_per_million: float
    output_per_million: float
    # None: compute_cost bills cached input tokens at `input_per_million`.
    cached_input_per_million: float | None = None


MODEL_PRICING: dict[str, ModelPricing] = {
    # OpenAI ----------------------------------------------------------------
    "gpt-4o": ModelPricing(
        provider="openai",
        model="gpt-4o",
        input_per_million=2.50,
        output_per_million=10.00,
        cached_input_per_million=1.25,
    ),
    "gpt-4o-mini": ModelPricing(
        provider="openai",
        model="gpt-4o-mini",
        input_per_million=0.15,
        output_per_million=0.60,
        cached_input_per_million=0.075,
    ),
    # Anthropic -- expand in Week 6 ----------------------------------------
}


def compute_cost(
    *,
    model: str,
    input_tokens: int,
    output_tokens: int,
    cached_input_tokens: int = 0,
) -> float:
    """Return the USD cost of one LLM call, rounded to 8 decimal places.

    Args:
        model: Key into ``MODEL_PRICING``.
        input_tokens: Total input tokens, including any cached ones.
        output_tokens: Tokens generated by the model.
        cached_input_tokens: Portion of ``input_tokens`` billed at the cached
            rate. The non-cached remainder is floored at zero.

    Returns:
        The summed input, cached-input and output cost. Unknown models
        return 0.0.
    """
    pricing = MODEL_PRICING.get(model)
    if pricing is None:
        return 0.0

    fresh_input_tokens = max(input_tokens - cached_input_tokens, 0)

    # Compare against None explicitly: a cached rate of 0.0 is a real price
    # (free cache reads) and must not fall back to the full input rate.
    cached_rate = pricing.cached_input_per_million
    if cached_rate is None:
        cached_rate = pricing.input_per_million

    input_cost = fresh_input_tokens / 1_000_000 * pricing.input_per_million
    cached_cost = cached_input_tokens / 1_000_000 * cached_rate
    output_cost = output_tokens / 1_000_000 * pricing.output_per_million

    return round(input_cost + cached_cost + output_cost, 8)
coffer_cli/cli.py ADDED
@@ -0,0 +1,193 @@
1
+ """CLI entry point — scan / prices / compare."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Annotated
8
+
9
+ import typer
10
+ from rich.console import Console
11
+ from rich.panel import Panel
12
+ from rich.table import Table
13
+ from rich.text import Text
14
+ from coffer_cli import __version__
15
+ from coffer_cli._pricing import MODEL_PRICING, compute_cost
16
+ from coffer_cli.patterns import Finding, find_patterns
17
+
18
+ app = typer.Typer(
19
+ name="coffer",
20
+ help="LLM cost utility. Scan code for cost-waste anti-patterns, "
21
+ "look up model pricing, compare cost between models.",
22
+ no_args_is_help=True,
23
+ add_completion=False,
24
+ )
25
+ console = Console()
26
+
27
+
28
+ _SEVERITY_STYLE = {
29
+ "high": ("red", "🚨"),
30
+ "medium": ("yellow", "🟡"),
31
+ "low": ("blue", "🔵"),
32
+ }
33
+
34
+
35
@app.command()
def scan(
    path: Annotated[
        Path,
        typer.Argument(help="Directory or file to scan. Defaults to current directory."),
    ] = Path("."),
    json_output: Annotated[
        bool,
        typer.Option(
            "--json",
            help="Emit JSON for programmatic consumption (e.g. CI, Claude Code skill).",
        ),
    ] = False,
    severity: Annotated[
        str,
        typer.Option("--min-severity", help="Filter: high | medium | low"),
    ] = "low",
) -> None:
    """Find LLM cost-waste anti-patterns: retry storms, loops without batching,
    large uncached prompts, etc.

    We deliberately do NOT estimate dollar cost — static analysis can't know
    call volume. We find structural risks that founder review would have caught.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    if not path.exists():
        console.print(f"[red]Path not found:[/red] {path}")
        raise typer.Exit(1)

    # Lower rank = more severe. An unrecognised --min-severity value falls
    # back to rank 2 ("low"), i.e. nothing is filtered out.
    rank = {"high": 0, "medium": 1, "low": 2}
    cutoff = rank.get(severity.lower(), 2)
    selected = [item for item in find_patterns(path) if rank[item.severity] <= cutoff]

    if json_output:
        payload = [item.to_dict() for item in selected]
        typer.echo(json.dumps(payload, indent=2))
        # JSON mode always exits 0, whether or not anything was found.
        raise typer.Exit(0)

    _print_human(path, selected)
    for item in selected:
        if item.severity == "high":
            raise typer.Exit(1)  # non-zero for CI gating on HIGH
75
+
76
+
77
def _print_human(path: Path, findings: list[Finding]) -> None:
    """Render findings as a Rich table on the shared console.

    Args:
        path: The scanned root; printed resolved in the header line.
        findings: Findings to display, already filtered by the caller.

    File paths and source snippets come from the scanned project, so they are
    escaped before being embedded in Rich markup. Otherwise a path such as
    ``app/[slug]/page.tsx`` or a snippet such as ``data[key]`` would have its
    bracketed part parsed as a style tag and dropped from the output.
    """
    from rich.markup import escape

    console.print(f"\nScanning [cyan]{escape(str(path.resolve()))}[/cyan]...")

    if not findings:
        console.print("\n[green]✓ No cost-waste anti-patterns detected.[/green]\n")
        return

    counts = {"high": 0, "medium": 0, "low": 0}
    for f in findings:
        counts[f.severity] += 1

    console.print(
        f"\nFound [bold]{len(findings)}[/bold] cost-risk findings: "
        f"[red]{counts['high']} high[/red] · "
        f"[yellow]{counts['medium']} medium[/yellow] · "
        f"[blue]{counts['low']} low[/blue]\n"
    )

    table = Table(show_lines=True)
    table.add_column("", width=3)
    table.add_column("Where", style="cyan", no_wrap=False)
    table.add_column("Pattern", style="magenta")
    table.add_column("Suggestion", style="white")

    for f in findings:
        color, emoji = _SEVERITY_STYLE[f.severity]
        location = escape(f"{f.path}:{f.line}")
        table.add_row(
            f"[{color}]{emoji}[/{color}]",
            f"{location}\n[dim]{escape(f.snippet)}[/dim]",
            f.pattern,
            f.suggestion,
        )
    console.print(table)

    console.print(
        Panel(
            Text.from_markup(
                "[dim]Static analysis catches structural risks. For real per-feature "
                "and per-user cost in production, see "
                "[link=https://trycoffer.com]trycoffer.com[/link][/dim]"
            ),
            border_style="dim",
        )
    )
121
+
122
+
123
@app.command()
def prices() -> None:
    """Show the current per-model pricing table."""

    def _usd(rate: float) -> str:
        # At least two decimals, at most four, so a sub-cent rate such as
        # 0.075 is shown as "$0.075" instead of a misleading "$0.07".
        digits = f"{rate:.4f}".rstrip("0")
        whole, _, frac = digits.partition(".")
        return f"${whole}.{frac.ljust(2, '0')}"

    table = Table(title="Coffer model pricing (USD per 1M tokens)", title_style="bold")
    table.add_column("Provider", style="dim")
    table.add_column("Model", style="cyan")
    table.add_column("Input", justify="right")
    table.add_column("Cached input", justify="right")
    table.add_column("Output", justify="right")

    for model, p in MODEL_PRICING.items():
        # Only a missing (None) cached rate renders as a dash; 0.0 is a price.
        cached = "—" if p.cached_input_per_million is None else _usd(p.cached_input_per_million)
        table.add_row(
            p.provider,
            model,
            _usd(p.input_per_million),
            cached,
            _usd(p.output_per_million),
        )
    console.print(table)
142
+
143
+
144
@app.command()
def compare(
    model_a: Annotated[str, typer.Argument(help="First model.")],
    model_b: Annotated[str, typer.Argument(help="Second model.")],
    input_tokens: Annotated[int, typer.Option(help="Input tokens per call.")] = 1000,
    output_tokens: Annotated[int, typer.Option(help="Output tokens per call.")] = 200,
    calls_per_day: Annotated[int, typer.Option(help="Calls per day.")] = 1000,
) -> None:
    """Compare two models' per-call and monthly cost at a given volume."""
    # Reject the first model name that is missing from the pricing table.
    unknown = next((name for name in (model_a, model_b) if name not in MODEL_PRICING), None)
    if unknown is not None:
        console.print(f"[red]Unknown model:[/red] {unknown}")
        raise typer.Exit(1)

    a, b = (
        compute_cost(model=name, input_tokens=input_tokens, output_tokens=output_tokens)
        for name in (model_a, model_b)
    )
    # A month is taken as 30 days.
    monthly_a = a * calls_per_day * 30
    monthly_b = b * calls_per_day * 30

    table = Table(title="Model cost comparison", title_style="bold")
    table.add_column("Model", style="cyan")
    table.add_column("Per call", justify="right")
    table.add_column(f"Monthly @ {calls_per_day:,}/day", justify="right", style="bold")
    for name, per_call, monthly in ((model_a, a, monthly_a), (model_b, b, monthly_b)):
        table.add_row(name, f"${per_call:.6f}", f"${monthly:,.2f}")
    console.print(table)

    # No verdict line when model_a costs nothing or both cost the same.
    if not monthly_a > 0 or monthly_b == monthly_a:
        return

    # Positive delta: model_b is cheaper, as a percentage of model_a's cost.
    delta_pct = round((1 - monthly_b / monthly_a) * 100)
    if delta_pct > 0:
        verdict = (
            f"\n[green]{model_b}[/green] is "
            f"[bold]{delta_pct}%[/bold] cheaper than [magenta]{model_a}[/magenta] "
            f"at this volume."
        )
    else:
        verdict = (
            f"\n[yellow]{model_b}[/yellow] is "
            f"[bold]{-delta_pct}%[/bold] more expensive than {model_a}."
        )
    console.print(verdict)
184
+
185
+
186
@app.command()
def version() -> None:
    """Print the version."""
    banner = f"coffer-cli {__version__}"
    console.print(banner)
190
+
191
+
192
+ if __name__ == "__main__":
193
+ app()
coffer_cli/patterns.py ADDED
@@ -0,0 +1,670 @@
1
+ """Static detection of LLM cost-waste anti-patterns.
2
+
3
+ We aim for low false-positive rate over completeness. A finding should
4
+ be defensible: a reviewer who reads the snippet should agree it's a
5
+ real risk in most cases.
6
+
7
+ Detector catalog (by cost lever):
8
+
9
+ Lever A — input tokens
10
+ uncached_large_prompt MED Large hardcoded prompt without nearby cache_control
11
+ dynamic_before_static_cache_break HIGH f-string interpolation in system message breaks auto-cache
12
+ unbounded_conversation_history MED `messages.append(...)` without truncation
13
+ Lever B — output tokens
14
+ missing_max_tokens MED LLM call without `max_tokens` cap
15
+ reasoning_effort_high_default MED `reasoning_effort="high"` literal
16
+ Lever C — price per token
17
+ (semantic — handled in skill, not CLI)
18
+ Lever D — number of calls
19
+ llm_in_for_loop MED N× cost; Batch API / merged prompt are fixes
20
+ agent_loop_no_max_iter HIGH `while True:` containing LLM call without iter cap
21
+ temperature_nonzero_with_cache_hint MED `temperature > 0` next to a cache hint — silently breaks it
22
+ Lever E — architecture / safety
23
+ retry_loop_no_backoff HIGH Retry storm risk
24
+ sdk_init_no_timeout HIGH SDK initialized without `timeout=`
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import re
30
+ from dataclasses import dataclass
31
+ from pathlib import Path
32
+ from typing import Iterable, Literal
33
+
34
+ Severity = Literal["high", "medium", "low"]
35
+
36
+ _LLM_CALL_RE = re.compile(
37
+ r"""
38
+ (
39
+ \.chat\.completions\.create\( | # OpenAI sync
40
+ \.completions\.create\( |
41
+ \.messages\.create\( | # Anthropic
42
+ \.responses\.create\( | # OpenAI Responses API
43
+ \.generate_content\( | # Google Gemini
44
+ litellm\.(?:a)?completion\( |
45
+ ChatOpenAI\( | # LangChain
46
+ ChatAnthropic\( |
47
+ Anthropic\(\)\.messages |
48
+ Anthropic\(\)\.completions
49
+ )
50
+ """,
51
+ re.VERBOSE,
52
+ )
53
+
54
+ _RETRY_LOOP_RE = re.compile(
55
+ r"""
56
+ (
57
+ for \s+ \w+ \s+ in \s+ range\( | # for attempt in range(...)
58
+ while \s+ .* (?: retry | retries | attempts ) # while retries < N
59
+ )
60
+ """,
61
+ re.VERBOSE | re.IGNORECASE,
62
+ )
63
+
64
+ _BACKOFF_RE = re.compile(
65
+ r"""
66
+ (
67
+ backoff\. | # backoff library
68
+ @retry\( | # tenacity / retrying
69
+ tenacity\. |
70
+ time\.sleep\( |
71
+ asyncio\.sleep\( |
72
+ 2 \s* \*\* \s* attempt | # 2**attempt pattern
73
+ 2 \s* \*\* \s* retry
74
+ )
75
+ """,
76
+ re.VERBOSE,
77
+ )
78
+
79
+ _FOR_LOOP_HEAD_RE = re.compile(r"^\s*(for|while)\s+.+:\s*$")
80
# Leading indentation of a line (spaces and/or tabs). A character class is
# required here: with an alternation such as `( *|\t*)` the ` *` branch always
# succeeds (possibly matching nothing), so tab-indented lines would measure as
# indent 0 and every loop-body scan would stop on the first body line.
_INDENT_RE = re.compile(r"^([ \t]*)")
81
+
82
+ _CACHE_CONTROL_RE = re.compile(
83
+ r"""
84
+ (
85
+ cache_control |
86
+ prompt_caching |
87
+ cache_key |
88
+ @lru_cache |
89
+ @cache
90
+ )
91
+ """,
92
+ re.VERBOSE,
93
+ )
94
+
95
+ _PROMPT_VAR_HINT_RE = re.compile(
96
+ r"^\s*(\w*(?:system|prompt|instruction|template)\w*)\s*=\s*",
97
+ re.IGNORECASE,
98
+ )
99
+
100
+ # Match only variables that semantically represent the STATIC SYSTEM prefix.
101
+ # Anything user-side (user_prompt, user_message, ...) is inherently dynamic and
102
+ # is NOT a cache-break risk.
103
+ _FSTRING_PROMPT_RE = re.compile(
104
+ r"""
105
+ ^\s*
106
+ (
107
+ SYSTEM_PROMPT |
108
+ SYSTEM_MESSAGE |
109
+ SYSTEM_INSTRUCTION(S)? |
110
+ system_prompt |
111
+ system_message |
112
+ system_instruction(s)? |
113
+ sys_prompt |
114
+ SYS_PROMPT |
115
+ SYSTEM |
116
+ system |
117
+ INSTRUCTIONS? |
118
+ instructions?
119
+ )
120
+ \s* = \s*
121
+ f ["']
122
+ (?: [^"']|\\.)*?
123
+ \{ [\w.\[\]'"]+ \}
124
+ """,
125
+ re.VERBOSE,
126
+ )
127
+
128
+ _AGENT_LOOP_HEAD_RE = re.compile(
129
+ r"^\s*while\s+(?:True|not\s+\w+|1)\s*:\s*(?:#.*)?$"
130
+ )
131
+
132
+ _HISTORY_APPEND_RE = re.compile(
133
+ r"^\s*(\w*(?:messages|history|conversation|chat)\w*)\.append\("
134
+ )
135
+
136
+ _HISTORY_TRUNCATE_RE = re.compile(
137
+ r"""
138
+ (
139
+ \[\s*-?\d+\s*:\s*\] | # messages[-10:]
140
+ \[\s*:\s*-?\d+\s*\] | # messages[:10]
141
+ \.pop\(\s*0\s*\) |
142
+ \[\s*1:\s*\] |
143
+ summari[sz]e_ |
144
+ compact_ |
145
+ truncate |
146
+ trim_ |
147
+ memory\.add |
148
+ mem0
149
+ )
150
+ """,
151
+ re.VERBOSE,
152
+ )
153
+
154
+ _REASONING_EFFORT_HIGH_RE = re.compile(
155
+ r"""reasoning_effort \s* = \s* ['"]high['"]""",
156
+ re.VERBOSE,
157
+ )
158
+
159
+ _SDK_INIT_RE = re.compile(
160
+ r"""
161
+ \b
162
+ (OpenAI | AsyncOpenAI | Anthropic | AsyncAnthropic)
163
+ \(
164
+ """,
165
+ re.VERBOSE,
166
+ )
167
+
168
+ _TIMEOUT_KW_RE = re.compile(r"\btimeout\s*=")
169
+
170
+ _TEMPERATURE_RE = re.compile(r"\btemperature\s*=\s*([0-9]*\.?[0-9]+)")
171
+
172
+ _CACHE_HINT_NEARBY_RE = re.compile(
173
+ r"""
174
+ (
175
+ @lru_cache |
176
+ @cache\b |
177
+ @cached\b |
178
+ functools\.cache |
179
+ \bcache\.get\( |
180
+ \bcache\.set\( |
181
+ \bredis\b |
182
+ \bmemcache\b |
183
+ diskcache\. |
184
+ cachetools\. |
185
+ TTLCache
186
+ )
187
+ """,
188
+ re.VERBOSE,
189
+ )
190
+
191
+ _DEFAULT_INCLUDE_SUFFIXES = (".py", ".ts", ".tsx", ".js", ".jsx", ".mjs")
192
+ _DEFAULT_SKIP_DIRS = frozenset(
193
+ {
194
+ ".git", ".venv", "venv", "node_modules", ".next", "dist", "build",
195
+ "__pycache__", ".mypy_cache", ".ruff_cache", ".pytest_cache",
196
+ ".turbo", "out", ".coffer-cache", "site-packages",
197
+ }
198
+ )
199
+
200
+ # Minimum chars in a hardcoded string before we suspect "large uncached prompt".
201
+ _LARGE_PROMPT_THRESHOLD = 2_000
202
+
203
+
204
@dataclass(frozen=True)
class Finding:
    """One detected anti-pattern occurrence (immutable)."""

    severity: Severity
    pattern: str  # detector identifier, e.g. "llm_in_for_loop"
    path: Path
    line: int  # detectors in this module pass a 1-based line number
    snippet: str
    suggestion: str

    def to_dict(self) -> dict:
        """Return a plain dict of the fields; `path` is emitted as `file` (str)."""
        payload = dict(
            severity=self.severity,
            pattern=self.pattern,
            file=str(self.path),
            line=self.line,
            snippet=self.snippet,
            suggestion=self.suggestion,
        )
        return payload
222
+
223
+
224
+ # ---- detectors --------------------------------------------------------------
225
+
226
+
227
def _detect_retry_loops(path: Path, lines: list[str]) -> list[Finding]:
    """Flag retry-style loops whose body calls an LLM without any backoff.

    A loop head is any line matching ``_RETRY_LOOP_RE``. Its body is the run
    of more-indented lines that follows, capped at the next 39 lines. Blank
    lines and ``#`` comment lines are ignored, so a commented-out
    ``# time.sleep(...)`` is not mistaken for backoff and a column-0 comment
    does not end the body early (same treatment as ``_detect_llm_in_loop``).

    Args:
        path: File being scanned; recorded on each finding.
        lines: The file's content split into lines.

    Returns:
        One HIGH finding per offending loop head, with a 1-based line number.
    """
    findings: list[Finding] = []
    for i, line in enumerate(lines):
        if line.lstrip().startswith("#") or not _RETRY_LOOP_RE.search(line):
            continue
        indent_match = _INDENT_RE.match(line)
        loop_indent = len(indent_match.group(1)) if indent_match else 0

        has_llm = False
        has_backoff = False

        for body in lines[i + 1 : i + 40]:
            stripped = body.strip()
            if not stripped or stripped.startswith("#"):
                continue
            body_indent_match = _INDENT_RE.match(body)
            body_indent = len(body_indent_match.group(1)) if body_indent_match else 0
            if body_indent <= loop_indent:
                break  # dedent: the loop body is over
            if _LLM_CALL_RE.search(body):
                has_llm = True
            if _BACKOFF_RE.search(body):
                has_backoff = True

        if has_llm and not has_backoff:
            findings.append(
                Finding(
                    severity="high",
                    pattern="retry_loop_no_backoff",
                    path=path,
                    line=i + 1,
                    snippet=line.strip()[:200],
                    suggestion=(
                        "Add exponential backoff (e.g. `@backoff.on_exception(backoff.expo, "
                        "RateLimitError, max_tries=5)`). A single rate-limit storm without "
                        "backoff can multiply your bill 10x."
                    ),
                )
            )
    return findings
270
+
271
+
272
def _detect_llm_in_loop(path: Path, lines: list[str]) -> list[Finding]:
    """Report LLM calls that sit inside a `for`/`while` body.

    Tracks the currently open loops by indentation. A call is skipped when the
    innermost enclosing loop head matches `_RETRY_LOOP_RE`, since the retry
    detector reports those.
    """
    results: list[Finding] = []
    # Each entry is (indent of the loop head, text of the loop head).
    open_loops: list[tuple[int, str]] = []

    for idx, raw in enumerate(lines):
        stripped = raw.strip()
        if stripped == "" or raw.lstrip().startswith("#"):
            continue

        ws = _INDENT_RE.match(raw)
        depth = len(ws.group(1)) if ws else 0

        # Any loop whose head is at this depth or deeper has been left.
        while open_loops and open_loops[-1][0] >= depth:
            open_loops.pop()

        if _FOR_LOOP_HEAD_RE.match(raw):
            open_loops.append((depth, raw))
            continue

        if not open_loops or not _LLM_CALL_RE.search(raw):
            continue
        if _RETRY_LOOP_RE.search(open_loops[-1][1]):
            continue

        results.append(
            Finding(
                severity="medium",
                pattern="llm_in_for_loop",
                path=path,
                line=idx + 1,
                snippet=stripped[:200],
                suggestion=(
                    "N LLM calls in a loop = N× token cost — asyncio.gather only fixes "
                    "latency, not the bill. Real cost fixes: (1) OpenAI Batch API for 50% off "
                    "on async workloads; (2) merge into one richer prompt that processes the "
                    "whole batch; (3) enable prompt caching if the system prompt repeats."
                ),
            )
        )
    return results
312
+
313
+
314
def _detect_large_uncached_prompts(path: Path, lines: list[str]) -> list[Finding]:
    """Report big triple-quoted prompt literals with no cache marker nearby.

    A candidate is an assignment matching `_PROMPT_VAR_HINT_RE` whose value
    opens a triple-quoted string on the same line. The literal must close
    within the following 199 lines and be at least `_LARGE_PROMPT_THRESHOLD`
    characters long. It is reported unless `_CACHE_CONTROL_RE` matches in the
    window from 30 lines before the assignment to about 30 lines after the
    closing line.
    """
    results: list[Finding] = []
    total = len(lines)
    cursor = 0
    while cursor < total:
        current = lines[cursor]
        hint = _PROMPT_VAR_HINT_RE.match(current)
        if hint is None:
            cursor += 1
            continue

        # Prefer `"""` over `'''` when both appear after the `=`.
        tail = current[hint.end():]
        quote = next((q for q in ('"""', "'''") if q in tail), None)
        if quote is None:
            cursor += 1
            continue

        opening_at = current.find(quote, hint.end())
        pieces = [current[opening_at + len(quote):]]
        last = cursor

        if quote not in pieces[0]:
            # Multi-line literal: find the first later line holding the quote.
            closing = next(
                (k for k in range(cursor + 1, min(cursor + 200, total)) if quote in lines[k]),
                None,
            )
            if closing is None:
                cursor += 1
                continue
            pieces.extend(lines[cursor + 1 : closing + 1])
            last = closing

        text = "\n".join(pieces)
        # Drop the closing quote and anything after it.
        cut = text.rfind(quote)
        if cut != -1:
            text = text[:cut]

        if len(text) >= _LARGE_PROMPT_THRESHOLD:
            nearby = "\n".join(lines[max(0, cursor - 30) : min(total, last + 30)])
            if not _CACHE_CONTROL_RE.search(nearby):
                var_name = hint.group(1)
                results.append(
                    Finding(
                        severity="medium",
                        pattern="uncached_large_prompt",
                        path=path,
                        line=cursor + 1,
                        snippet=f"{var_name} = '''[{len(text):,} chars]'''",
                        suggestion=(
                            "Large hardcoded prompt with no nearby cache_control. If called repeatedly, "
                            "wrap in Anthropic cache_control={'type': 'ephemeral'} or rely on OpenAI's "
                            "automatic caching to cut input cost 60-90%."
                        ),
                    )
                )
        cursor = last + 1
    return results
389
+
390
+
391
def _detect_dynamic_before_static_cache_break(
    path: Path, lines: list[str]
) -> list[Finding]:
    """Report system/instruction variables assigned an interpolating f-string.

    Every line matching `_FSTRING_PROMPT_RE` produces one HIGH finding; the
    rationale (prefix caching needs a stable prefix) is in the suggestion.
    """
    advice = (
        "An f-string interpolation in this system/prompt variable defeats "
        "automatic prefix caching (OpenAI auto-cache + Anthropic cache_control). "
        "Restructure: put all dynamic content LAST (in messages[]), keep the static "
        "prefix at the top. Or split: static system message + dynamic user message."
    )
    return [
        Finding(
            severity="high",
            pattern="dynamic_before_static_cache_break",
            path=path,
            line=number,
            snippet=text.strip()[:200],
            suggestion=advice,
        )
        for number, text in enumerate(lines, start=1)
        if _FSTRING_PROMPT_RE.match(text)
    ]
419
+
420
+
421
def _detect_unbounded_conversation_history(
    path: Path, lines: list[str]
) -> list[Finding]:
    """Report a history-style `.append(` in a file with no truncation marker.

    At most one finding per file, anchored at the first line matching
    `_HISTORY_APPEND_RE`. Nothing is reported if `_HISTORY_TRUNCATE_RE`
    matches anywhere in the file.
    """
    first_append: tuple[int, str] | None = None
    for idx, text in enumerate(lines):
        hit = _HISTORY_APPEND_RE.match(text)
        if hit is not None:
            first_append = (idx, hit.group(1))
            break

    if first_append is None:
        return []

    # Any truncation/summarization indicator in the file suppresses the finding.
    if _HISTORY_TRUNCATE_RE.search("\n".join(lines)):
        return []

    idx, var = first_append
    return [
        Finding(
            severity="medium",
            pattern="unbounded_conversation_history",
            path=path,
            line=idx + 1,
            snippet=lines[idx].strip()[:200],
            suggestion=(
                f"`{var}` grows without bound — every turn adds tokens permanently. "
                "Cap with sliding window (`messages = messages[-N:]`), summarize old turns "
                "(Mem0 / custom compaction), or use the provider's `previous_response_id` chain."
            ),
        )
    ]
457
+
458
+
459
def _detect_agent_loop_no_max_iter(path: Path, lines: list[str]) -> list[Finding]:
    """Report open-ended `while` loops that call an LLM without an iteration cap.

    For each line matching `_AGENT_LOOP_HEAD_RE`, the body is the run of
    more-indented, non-blank lines among the next 79 lines. The loop is
    reported when the body contains an LLM call and does NOT contain both a
    counter-like pattern (`max_iter`/`max_steps`/..., or an assignment to
    `iter*`/`count`) and a `break`/`return`.
    """
    counter_re = re.compile(
        r"""(\bmax_(?:iter|steps|turns|rounds)\b|\biter(?:ation)?s?\s*[+\-*/]?=|\bcount\s*[+\-*/]?=)"""
    )
    exit_re = re.compile(r"\b(break|return)\b")

    results: list[Finding] = []
    for idx, head in enumerate(lines):
        if not _AGENT_LOOP_HEAD_RE.match(head):
            continue
        head_ws = _INDENT_RE.match(head)
        head_indent = len(head_ws.group(1)) if head_ws else 0

        collected: list[str] = []
        for candidate in lines[idx + 1 : idx + 80]:
            if not candidate.strip():
                continue
            ws = _INDENT_RE.match(candidate)
            depth = len(ws.group(1)) if ws else 0
            if depth <= head_indent:
                break  # dedent: loop body ended
            collected.append(candidate)

        body_text = "\n".join(collected)
        if not _LLM_CALL_RE.search(body_text):
            continue

        # Heuristic cap: a counter pattern AND some way out of the loop.
        capped = (
            counter_re.search(body_text) is not None
            and exit_re.search(body_text) is not None
        )
        if capped:
            continue

        results.append(
            Finding(
                severity="high",
                pattern="agent_loop_no_max_iter",
                path=path,
                line=idx + 1,
                snippet=head.strip()[:200],
                suggestion=(
                    "Unbounded agent loop containing an LLM call. A single mis-firing tool "
                    "or model can spin forever — there is a documented $47K incident from this "
                    "exact pattern. Add `max_iter` counter and break, or use the provider's "
                    "explicit agent loop (OpenAI Responses with `max_tool_rounds`, "
                    "Anthropic tool_use with explicit termination check)."
                ),
            )
        )
    return results
516
+
517
+
518
def _detect_temperature_nonzero_with_cache_hint(
    path: Path, lines: list[str]
) -> list[Finding]:
    """Flag `temperature=<positive literal>` near a response-cache hint.

    With sampling enabled, identical inputs yield different outputs, so a
    response cache placed around the call gives little benefit even though
    the developer believes results are being cached.

    Args:
        path: File being scanned; recorded on each finding.
        lines: The file's content split into lines.

    Returns:
        One MEDIUM finding per line that sets a numeric temperature above zero
        while `_CACHE_HINT_NEARBY_RE` matches within 30 lines on either side.
    """
    findings: list[Finding] = []
    for i, line in enumerate(lines):
        m = _TEMPERATURE_RE.search(line)
        if not m:
            continue
        # The captured group is digits with an optional dot, so float() cannot fail.
        if float(m.group(1)) <= 0:
            continue
        # 30 lines up and 30 lines down, inclusive of both ends (the slice end
        # is exclusive, hence i + 31).
        window = "\n".join(lines[max(0, i - 30) : i + 31])
        if not _CACHE_HINT_NEARBY_RE.search(window):
            continue
        # A `temperature=0` elsewhere in the window does not suppress the hint.
        findings.append(
            Finding(
                severity="medium",
                pattern="temperature_nonzero_with_cache_hint",
                path=path,
                line=i + 1,
                snippet=line.strip()[:200],
                suggestion=(
                    "A cache decorator/store is nearby, but this call sets `temperature > 0` — "
                    "sampling makes each response different, so the cache never hits on "
                    "subsequent identical inputs. Set `temperature=0` for cache-eligible "
                    "deterministic tasks, OR remove the cache layer if you genuinely need "
                    "varied outputs."
                ),
            )
        )
    return findings
561
+
562
+
563
def _detect_reasoning_effort_high_default(
    path: Path, lines: list[str]
) -> list[Finding]:
    """Report every line containing a literal `reasoning_effort="high"`."""
    advice = (
        "`reasoning_effort=\"high\"` is the new \"GPT-4 for everything\". On trivial "
        "tasks it can produce ~20× extra reasoning tokens at full output price "
        "(see arXiv 2412.21187). Default to `medium` or `low` and only escalate "
        "for tasks that empirically need it."
    )
    return [
        Finding(
            severity="medium",
            pattern="reasoning_effort_high_default",
            path=path,
            line=number,
            snippet=text.strip()[:200],
            suggestion=advice,
        )
        for number, text in enumerate(lines, start=1)
        if _REASONING_EFFORT_HIGH_RE.search(text)
    ]
587
+
588
+
589
def _detect_sdk_init_no_timeout(path: Path, lines: list[str]) -> list[Finding]:
    """Flag `OpenAI()` / `Anthropic()` style constructors lacking `timeout=`.

    Only the first constructor match on a line is examined. Its argument text
    is taken from just after the opening parenthesis up to the matching close
    parenthesis, searching at most this line plus the next four.

    Args:
        path: File being scanned; recorded on each finding.
        lines: The file's content split into lines.

    Returns:
        One HIGH finding per constructor call whose arguments contain no
        `timeout=` keyword.
    """
    findings: list[Finding] = []
    for i, line in enumerate(lines):
        m = _SDK_INIT_RE.search(line)
        if not m:
            continue
        # Include the next few lines in case the kwargs span lines.
        joined = "\n".join(lines[i : i + 5])

        # `joined` starts with `line`, so the match offsets are valid in it.
        # Use them directly rather than searching for the matched text, which
        # could land on an earlier look-alike such as `AzureOpenAI(`.
        args_start = m.end()
        args_end = len(joined)  # fallback: constructor not closed in window
        depth = 0
        for pos in range(args_start, len(joined)):
            ch = joined[pos]
            if ch == "(":
                depth += 1
            elif ch == ")":
                if depth == 0:
                    args_end = pos
                    break
                depth -= 1

        if _TIMEOUT_KW_RE.search(joined[args_start:args_end]):
            continue
        findings.append(
            Finding(
                severity="high",
                pattern="sdk_init_no_timeout",
                path=path,
                line=i + 1,
                snippet=line.strip()[:200],
                suggestion=(
                    f"`{m.group(1)}` initialized without `timeout=`. Default is 600s — a hung "
                    "provider can block your thread for ten minutes. Pass an explicit timeout "
                    "(e.g. `timeout=30.0`) sized to your user-facing latency budget."
                ),
            )
        )
    return findings
629
+
630
+
631
+ # ---- top-level --------------------------------------------------------------
632
+
633
+
634
def _walk_files(root: Path, suffixes: tuple[str, ...]) -> Iterable[Path]:
    """Yield the files under `root` that should be scanned.

    Args:
        root: A single file or a directory to search recursively.
        suffixes: File extensions (with leading dot) to include.

    Yields:
        `root` itself when it is a file with a matching suffix; otherwise each
        matching file below `root` that is not inside a directory named in
        `_DEFAULT_SKIP_DIRS`.
    """
    if root.is_file():
        if root.suffix in suffixes:
            yield root
        return
    for path in root.rglob("*"):
        # Cheap suffix test first; is_file() touches the filesystem.
        if path.suffix not in suffixes or not path.is_file():
            continue
        # Judge only the parts below `root`. Testing the full path would drop
        # every file when the project itself lives under a directory named
        # e.g. "build" or "dist", making the scan silently report nothing.
        if any(part in _DEFAULT_SKIP_DIRS for part in path.relative_to(root).parts):
            continue
        yield path
645
+
646
+
647
def find_patterns(
    root: Path,
    suffixes: tuple[str, ...] = _DEFAULT_INCLUDE_SUFFIXES,
) -> list[Finding]:
    """Run every detector over each eligible file under `root`.

    Files that cannot be read (OSError) are skipped; undecodable bytes are
    replaced. The result is ordered by severity (high first), then path,
    then line.
    """
    detectors = (
        _detect_retry_loops,
        _detect_llm_in_loop,
        _detect_large_uncached_prompts,
        _detect_dynamic_before_static_cache_break,
        _detect_unbounded_conversation_history,
        _detect_agent_loop_no_max_iter,
        _detect_temperature_nonzero_with_cache_hint,
        _detect_reasoning_effort_high_default,
        _detect_sdk_init_no_timeout,
    )

    collected: list[Finding] = []
    for source_file in _walk_files(root, suffixes):
        try:
            content = source_file.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        source_lines = content.splitlines()
        for detect in detectors:
            collected.extend(detect(source_file, source_lines))

    rank = {"high": 0, "medium": 1, "low": 2}
    return sorted(
        collected,
        key=lambda item: (rank[item.severity], str(item.path), item.line),
    )
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.4
2
+ Name: coffer-cli
3
+ Version: 0.1.0
4
+ Summary: Scan codebases for LLM cost-waste anti-patterns. Find retry storms, missing prompt caching, unbounded conversation history, agent loops without iteration caps, and more — before you ship.
5
+ Project-URL: Homepage, https://github.com/neal-c611/coffer-cli
6
+ Project-URL: Repository, https://github.com/neal-c611/coffer-cli
7
+ Project-URL: Issues, https://github.com/neal-c611/coffer-cli/issues
8
+ Author: Neal
9
+ License-Expression: Apache-2.0
10
+ License-File: LICENSE
11
+ Keywords: anthropic,claude,claude-code,cost,finops,gpt,linter,llm,openai,skill,static-analysis
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Software Development :: Code Generators
23
+ Classifier: Topic :: Software Development :: Quality Assurance
24
+ Classifier: Topic :: Utilities
25
+ Requires-Python: >=3.10
26
+ Requires-Dist: rich>=13.9
27
+ Requires-Dist: typer>=0.13
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8; extra == 'dev'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # coffer-cli
33
+
34
+ > Scan your code for LLM cost-waste anti-patterns before you ship.
35
+
36
+ `coffer-cli` is a static scanner for production AI code. It catches the
37
+ mistakes that show up at month-end on your OpenAI / Anthropic bill —
38
+ retry storms, missing prompt caching, unbounded conversation history,
39
+ agent loops without iteration caps, SDK inits without timeouts, and
40
+ more.
41
+
42
+ It is intentionally **not** a magic dollar estimator. Static analysis
43
+ cannot know call volume; we leave that to live tracking. Instead, we
44
+ surface structural risks that a careful reviewer would catch — but
45
+ faster, in CI, on every commit.
46
+
47
+ ```bash
48
+ pipx install coffer-cli
49
+
50
+ coffer scan ./my-app
51
+ coffer scan ./my-app --json # for CI / Claude Code skill consumption
52
+ coffer prices # current model pricing table
53
+ coffer compare gpt-4o gpt-4o-mini
54
+ ```
55
+
56
+ ## What it catches (v0.1.0)
57
+
58
+ Detectors are organized by the four levers that drive LLM cost:
59
+
60
+ | Lever | Detector | Severity |
61
+ |-------|----------|----------|
62
+ | **A: input tokens** | `dynamic_before_static_cache_break` — f-string interpolation in `SYSTEM_PROMPT` defeats OpenAI auto-cache and Anthropic `cache_control` | 🚨 high |
63
+ | | `unbounded_conversation_history` — `messages.append(...)` without truncation or summarization | 🟡 med |
64
+ | | `uncached_large_prompt` — ≥2,000-char hardcoded prompt without nearby `cache_control` | 🟡 med |
65
+ | **B: output tokens** | `missing_max_tokens` — LLM call without a `max_tokens` cap | 🟡 med |
66
+ | | `reasoning_effort_high_default` — `reasoning_effort="high"` literal (up to ~20× extra reasoning tokens on trivial tasks) | 🟡 med |
67
+ | **D: number of calls** | `llm_in_for_loop` — an LLM call inside a `for` loop costs N× a single call; `asyncio.gather` is a latency fix, not a cost fix | 🟡 med |
68
+ | | `agent_loop_no_max_iter` — `while True:` containing an LLM call without an iteration cap (the $47K-incident pattern) | 🚨 high |
69
+ | | `temperature_nonzero_with_cache_hint` — cache layer nearby but `temperature > 0` silently breaks it | 🟡 med |
70
+ | **E: architecture** | `retry_loop_no_backoff` — retry storm amplifies the bill 10× | 🚨 high |
71
+ | | `sdk_init_no_timeout` — default 600s lets a hung provider block your thread | 🚨 high |
72
+
73
+ Each finding includes a concrete fix and explains the *cost* angle
74
+ explicitly (we do not conflate latency fixes with cost fixes).
75
+
76
+ ## Use with Claude Code (the skill)
77
+
78
+ The `coffer-cost-review` Claude Code skill in [`skills/`](skills/coffer-cost-review/)
79
+ combines this scanner with Claude's semantic judgment. In Claude Code, ask
80
+ *"review my LLM costs"* and the skill will:
81
+
82
+ 1. Run `coffer scan <path> --json` for deterministic findings
83
+ 2. Read each flagged file in context to filter false positives
84
+ 3. Add semantic-only checks the scanner cannot do
85
+ (frontier model used for trivial tasks, free-form output where structured
86
+ output would work, public endpoints without rate limiting, ...)
87
+ 4. Produce a severity-ranked review with concrete code-diff fixes
88
+
89
+ Install:
90
+
91
+ ```bash
92
+ git clone https://github.com/neal-c611/coffer-cli
93
+ mkdir -p ~/.claude/skills
94
+ cp -r coffer-cli/skills/coffer-cost-review ~/.claude/skills/
95
+ ```
96
+
97
+ ## What it deliberately does NOT do
98
+
99
+ - **No invented dollar estimates.** Call volume is unknowable from static
100
+ code. We report severity, not numbers.
101
+ - **No proxy mode.** Your LLM calls go directly to your providers.
102
+ - **No auto-rewrites.** Suggestions only; you stay in control.
103
+
104
+ For live production cost tracking with per-feature and per-user attribution
105
+ (the part static analysis genuinely can't do), see
106
+ [Coffer](https://trycoffer.com).
107
+
108
+ ## Exit codes
109
+
110
+ - `0` — clean, or only `medium`/`low` findings
111
+ - `1` — at least one `high` finding (use for CI gating)
112
+
113
+ ## Development
114
+
115
+ ```bash
116
+ git clone https://github.com/neal-c611/coffer-cli
117
+ cd coffer-cli
118
+ uv sync --extra dev
119
+ uv run pytest
120
+ ```
121
+
122
+ Patterns are detected by `src/coffer_cli/patterns.py` (regex-based,
123
+ single-file scope) and rendered by `src/coffer_cli/cli.py` (typer +
124
+ rich).
125
+
126
+ Contributions welcome. New detectors should:
127
+
128
+ - Default to **medium** severity; reserve **high** for patterns that
129
+ are demonstrably cost-amplifying in production
130
+ - Include a test in `tests/test_patterns.py` showing both a
131
+ positive case AND a negative case (the negative case is what
132
+ keeps false-positive rate low)
133
+ - Propose a *cost* fix, not a *latency* fix. Wrapping things in
134
+ `asyncio.gather` does not reduce the bill.
135
+
136
+ ## License
137
+
138
+ Apache 2.0.
@@ -0,0 +1,9 @@
1
+ coffer_cli/__init__.py,sha256=2-e4Nzx2PZ4MjjZUjX06YxZMvHU7PTkgq9dMJnHOwRQ,81
2
+ coffer_cli/_pricing.py,sha256=LP6IsgMN6EMh9aqjIXymRukSQGvAGIcIrW0S7WcGhxQ,1971
3
+ coffer_cli/cli.py,sha256=ICOsBhhgLHgtmg4nbSKUCeKB9fSvdi3QDSWTEefXpoo,6425
4
+ coffer_cli/patterns.py,sha256=jhXd7OxNvin2RacRpjy_yV7PjiA1EdHjHkq11YiAgbI,22841
5
+ coffer_cli-0.1.0.dist-info/METADATA,sha256=xi5ojjYYJRPbwpqSfT_LLvnoKrzDAVP9LPawMI7UWr0,5827
6
+ coffer_cli-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
7
+ coffer_cli-0.1.0.dist-info/entry_points.txt,sha256=lXg91UV4sieB5Jp5Dnid24LvwZR3RTpsxKeQyHqgsc4,46
8
+ coffer_cli-0.1.0.dist-info/licenses/LICENSE,sha256=Jq8DXUheBqOklp-ZsjJNrUv8QbJPWH4cWjqbtEWE9hw,794
9
+ coffer_cli-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ coffer = coffer_cli.cli:app
@@ -0,0 +1,19 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
17
+ Copyright 2026 Coffer
18
+
19
+ Full text: https://www.apache.org/licenses/LICENSE-2.0