overllm 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- overllm/__init__.py +11 -0
- overllm/__main__.py +4 -0
- overllm/analyze.py +118 -0
- overllm/cli.py +64 -0
- overllm/config.py +66 -0
- overllm/detector.py +435 -0
- overllm/models.py +29 -0
- overllm/report.py +157 -0
- overllm/rules.py +137 -0
- overllm-0.1.0.dist-info/METADATA +144 -0
- overllm-0.1.0.dist-info/RECORD +14 -0
- overllm-0.1.0.dist-info/WHEEL +4 -0
- overllm-0.1.0.dist-info/entry_points.txt +2 -0
- overllm-0.1.0.dist-info/licenses/LICENSE +21 -0
overllm/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""overllm - catch the LLM/AI calls you didn't need.
|
|
2
|
+
|
|
3
|
+
A fast, deterministic (no-LLM) linter that flags LLM API calls where plain,
|
|
4
|
+
cheaper, more reliable code would do the same job. Built on Python's own `ast`.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
|
|
9
|
+
from .models import Finding
|
|
10
|
+
|
|
11
|
+
__all__ = ["Finding", "__version__"]
|
overllm/__main__.py
ADDED
overllm/analyze.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Walk files, parse them, run rules, and apply suppression + config."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from .config import DEFAULT_EXCLUDES, Config
|
|
10
|
+
from .detector import find_llm_calls
|
|
11
|
+
from .models import Finding
|
|
12
|
+
from .rules import run_rules
|
|
13
|
+
|
|
14
|
+
# `# overllm: ignore` or `# overllm: ignore=rule-a,rule-b`
|
|
15
|
+
_IGNORE_RE = re.compile(r"#\s*overllm:\s*ignore(?:=([\w\-,\s]+))?")
|
|
16
|
+
_IGNORE_FILE_RE = re.compile(r"#\s*overllm:\s*ignore-file\b")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def iter_python_files(paths: list[str], excludes: tuple[str, ...]) -> list[Path]:
    """Collect the ``.py`` files to scan from a mix of file and directory paths.

    Args:
        paths: Files and/or directories. Directories are searched recursively;
            entries that are neither a file nor a directory are ignored.
        excludes: Extra patterns. A file is skipped when any non-empty pattern
            occurs as a substring of its path string.

    Returns:
        The matching files in discovery order (directory contents sorted),
        de-duplicated by resolved path.
    """
    # Built once rather than on every `excluded()` call.
    default_excludes = frozenset(DEFAULT_EXCLUDES)

    def excluded(p: Path) -> bool:
        # Default excludes match whole path components (e.g. a `build` directory),
        # whereas caller-supplied excludes are plain substring matches.
        if not default_excludes.isdisjoint(p.parts):
            return True
        s = str(p)
        return any(ex and ex in s for ex in excludes)

    out: list[Path] = []
    for raw in paths:
        p = Path(raw)
        if p.is_file():
            if p.suffix == ".py" and not excluded(p):
                out.append(p)
        elif p.is_dir():
            out.extend(f for f in sorted(p.rglob("*.py")) if not excluded(f))

    # De-duplicate by resolved path while keeping first-seen order, so a file
    # reachable through two arguments is analyzed only once.
    seen: set[str] = set()
    uniq: list[Path] = []
    for f in out:
        r = str(f.resolve())
        if r not in seen:
            seen.add(r)
            uniq.append(f)
    return uniq
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _line_directives(lines: list[str]) -> tuple[bool, dict[int, set[str] | None]]:
    """Scan source lines for overllm suppression comments.

    Returns a pair ``(ignore_whole_file, per_line)``. ``per_line`` maps a
    1-based line number to ``None`` (every rule is ignored on that line) or to
    the set of rule ids listed after ``ignore=``.
    """
    whole_file = False
    directives: dict[int, set[str] | None] = {}
    for lineno, text in enumerate(lines, start=1):
        if _IGNORE_FILE_RE.search(text):
            whole_file = True
        match = _IGNORE_RE.search(text)
        if match is None:
            continue
        listed = match.group(1)
        # A bare `ignore` (no `=rules` part) silences every rule on the line.
        directives[lineno] = (
            {name.strip() for name in listed.split(",") if name.strip()} if listed else None
        )
    return whole_file, directives
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _suppressed(finding: Finding, node_lines: range, per_line: dict[int, set[str] | None]) -> bool:
|
|
68
|
+
# a directive on any physical line of the call (or the line above it) suppresses it
|
|
69
|
+
candidate_lines = set(node_lines) | {finding.line, finding.line - 1}
|
|
70
|
+
for ln in candidate_lines:
|
|
71
|
+
if ln in per_line:
|
|
72
|
+
rules = per_line[ln]
|
|
73
|
+
if rules is None or finding.rule in rules:
|
|
74
|
+
return True
|
|
75
|
+
return False
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def analyze_file(path: Path, config: Config) -> list[Finding]:
    """Run every enabled rule over one Python file.

    Files that cannot be read, decoded as UTF-8, or parsed produce no findings,
    as does a file carrying an ``ignore-file`` directive. Findings are
    de-duplicated by ``Finding.key`` and filtered through the per-line
    suppression directives.
    """
    try:
        source = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return []
    try:
        tree = ast.parse(source, filename=str(path))
    except (SyntaxError, ValueError):
        # ValueError: some interpreter versions reject source containing NUL
        # bytes this way rather than with SyntaxError.
        return []  # not our job to report parse errors; stay quiet

    # `read_text` has already normalised line endings to "\n". Split on that
    # alone: `str.splitlines()` also breaks on form feeds and Unicode line
    # separators, which the parser does not treat as line ends, so every later
    # directive line number and snippet would be shifted relative to the AST.
    lines = source.split("\n")
    ignore_file, per_line = _line_directives(lines)
    if ignore_file:
        return []

    findings: list[Finding] = []
    seen: set[tuple] = set()
    display_path = str(path)
    for call in find_llm_calls(tree, lines):
        # Physical line span of the call, so a directive on any of its lines applies.
        node_start = getattr(call.node, "lineno", call.line)
        node_end = getattr(call.node, "end_lineno", node_start) or node_start
        node_lines = range(node_start, node_end + 1)
        for f in run_rules(call, display_path):
            if not config.enabled(f.rule) or f.key in seen:
                continue
            if _suppressed(f, node_lines, per_line):
                continue
            seen.add(f.key)
            findings.append(f)
    return findings
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def analyze_paths(paths: list[str], config: Config) -> list[Finding]:
    """Analyze every Python file found under ``paths``.

    The combined findings are returned ordered by path, then line, column and
    rule id.
    """
    collected = [
        finding
        for source_file in iter_python_files(paths, config.exclude)
        for finding in analyze_file(source_file, config)
    ]
    return sorted(collected, key=lambda item: (item.path, item.line, item.col, item.rule))
|
overllm/cli.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Command-line entry point for overllm."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from . import __version__
|
|
10
|
+
from .analyze import analyze_paths
|
|
11
|
+
from .config import load_config
|
|
12
|
+
from .report import render
|
|
13
|
+
from .rules import ALL_RULES
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the ``overllm`` command."""
    description = (
        "Catch the LLM/AI calls you didn't need. Flags LLM API calls "
        "where deterministic code is simpler, cheaper, and more reliable."
    )
    parser = argparse.ArgumentParser(prog="overllm", description=description)
    add = parser.add_argument

    # What to scan.
    add("paths", nargs="*", default=["."], help="files or directories to scan (default: .)")
    # How to report and which rules to apply.
    add("--format", choices=["human", "json", "sarif", "markdown"], default="human")
    add("--select", help="comma-separated rule ids to run (default: all)")
    add("--ignore", help="comma-separated rule ids to skip")
    add("--config", type=Path, help="path to a config file (pyproject.toml or .overllm.toml)")
    # Process-level switches.
    add("--exit-zero", action="store_true", help="always exit 0, even when findings exist")
    add("--no-color", action="store_true", help="disable ANSI colors")
    add("--version", action="version", version=f"overllm {__version__}")
    return parser
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _split(value: str | None) -> tuple[str, ...]:
|
|
34
|
+
if not value:
|
|
35
|
+
return ()
|
|
36
|
+
return tuple(v.strip() for v in value.split(",") if v.strip())
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the CLI and return the process exit code.

    Returns 1 when findings exist and ``--exit-zero`` was not given, else 0.
    """
    args = build_parser().parse_args(argv)
    targets = args.paths or ["."]

    config = load_config(explicit=args.config)

    # --select replaces the configured rule set. Unknown ids are dropped, and
    # if nothing valid remains every rule runs.
    requested = _split(args.select)
    if requested:
        known = tuple(rule for rule in requested if rule in ALL_RULES)
        config.select = known or ALL_RULES

    # --ignore adds to whatever the loaded configuration already ignores.
    skipped = _split(args.ignore)
    if skipped:
        config.ignore = tuple(set(config.ignore) | set(skipped))

    findings = analyze_paths(targets, config)

    colorize = sys.stdout.isatty() and not args.no_color
    text = render(findings, args.format, use_color=colorize)
    if text:
        print(text)

    if findings and not args.exit_zero:
        return 1
    return 0
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
if __name__ == "__main__":
|
|
64
|
+
raise SystemExit(main())
|
overllm/config.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Configuration loading from pyproject.toml [tool.overllm] or .overllm.toml."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
try: # tomllib is stdlib on 3.11+; config files are simply ignored below it
|
|
9
|
+
import tomllib
|
|
10
|
+
except ModuleNotFoundError: # pragma: no cover
|
|
11
|
+
tomllib = None
|
|
12
|
+
|
|
13
|
+
from .rules import ALL_RULES
|
|
14
|
+
|
|
15
|
+
# Names that are always excluded from scanning, in addition to any
# user-configured excludes.
DEFAULT_EXCLUDES = (
    ".git",
    ".hg",
    ".svn",
    "__pycache__",
    ".venv",
    "venv",
    "env",
    "node_modules",
    "build",
    "dist",
    ".mypy_cache",
    ".pytest_cache",
    ".tox",
    "site-packages",
    ".eggs",
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class Config:
    """Rule selection and path exclusion settings for one run."""

    # Rule ids that may run; defaults to every known rule.
    select: tuple[str, ...] = ALL_RULES
    # Rule ids that never run, even when selected.
    ignore: tuple[str, ...] = ()
    # Extra path patterns to leave out of the scan.
    exclude: tuple[str, ...] = ()

    def enabled(self, rule: str) -> bool:
        """Return True when ``rule`` is selected and not ignored."""
        if rule in self.ignore:
            return False
        return rule in self.select
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _read_table(path: Path) -> dict:
|
|
33
|
+
if tomllib is None:
|
|
34
|
+
return {}
|
|
35
|
+
try:
|
|
36
|
+
data = tomllib.loads(path.read_text(encoding="utf-8"))
|
|
37
|
+
except (OSError, tomllib.TOMLDecodeError):
|
|
38
|
+
return {}
|
|
39
|
+
if path.name == "pyproject.toml":
|
|
40
|
+
return data.get("tool", {}).get("overllm", {}) or {}
|
|
41
|
+
return data
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _as_str_tuple(value: object) -> tuple[str, ...]:
    """Coerce a config value to a tuple of strings; a bare string counts as one item."""
    if isinstance(value, str):
        return (value,)
    if isinstance(value, (list, tuple)):
        return tuple(item for item in value if isinstance(item, str))
    return ()


def load_config(start: Path | None = None, explicit: Path | None = None) -> Config:
    """Build the run configuration from a config file, falling back to defaults.

    Args:
        start: Directory to begin the upward search from (default: the current
            working directory). Ignored when ``explicit`` is given.
        explicit: A specific config file to read instead of searching.

    Returns:
        A ``Config`` built from the ``select``, ``ignore`` and ``exclude`` keys
        of the chosen table. Unknown rule ids are dropped from ``select``; if
        none remain, every rule is selected.
    """
    table: dict = {}
    if explicit is not None:
        table = _read_table(explicit)
    else:
        base = (start or Path.cwd()).resolve()
        for parent in [base, *base.parents]:
            found = False
            for name in (".overllm.toml", "pyproject.toml"):
                p = parent / name
                if not p.is_file():
                    continue
                t = _read_table(p)
                # A dedicated .overllm.toml is accepted even when empty; a
                # pyproject.toml only counts if it has overllm settings.
                if t or name == ".overllm.toml":
                    table = t
                    found = True
                    break
            # Stop at the first accepted file. Testing `table` here instead
            # would keep climbing past an empty .overllm.toml.
            if found:
                break

    # A bare string (e.g. `exclude = "build"`) must stay one pattern; passing
    # it to tuple() would split it into single characters.
    select = _as_str_tuple(table.get("select", ALL_RULES))
    ignore = _as_str_tuple(table.get("ignore", ()))
    exclude = _as_str_tuple(table.get("exclude", ()))
    # keep only known rule ids in select, so a typo does not silently disable everything
    select = tuple(r for r in select if r in ALL_RULES) or ALL_RULES
    return Config(select=select, ignore=ignore, exclude=exclude)
|
overllm/detector.py
ADDED
|
@@ -0,0 +1,435 @@
|
|
|
1
|
+
"""Detect LLM/AI API calls in a Python AST and extract their prompt.
|
|
2
|
+
|
|
3
|
+
Deterministic and conservative: a call is only treated as an LLM call when it
|
|
4
|
+
matches a known SDK signature (or a raw HTTP request to a known LLM host). The
|
|
5
|
+
known-SDK surface is the precision anchor - we would rather miss an exotic
|
|
6
|
+
wrapper than false-positive on an unrelated `.create()`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import ast
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
# --- Known call surfaces -----------------------------------------------------

# Trailing attribute names of method calls. Only the tail of the attribute
# chain is compared, so the receiver can be named anything:
# `client.chat.completions.create`, `self.oai.chat.completions.create`, etc.
LLM_METHOD_SUFFIXES = (
    ("chat", "completions", "create"),
    ("chat", "completions", "parse"),
    ("completions", "create"),
    ("responses", "create"),
    ("responses", "parse"),
    ("messages", "create"),
    ("messages", "stream"),
    ("chat", "create"),
    ("chat", "complete"),
    ("models", "generate_content"),
    ("generate_content",),
)

# Legacy module-level surfaces: openai.ChatCompletion.create, Completion.create.
LLM_LEGACY_SUFFIXES = (
    ("ChatCompletion", "create"),
    ("ChatCompletion", "acreate"),
    ("Completion", "create"),
    ("Completion", "acreate"),
)

# Calls recognised by their full dotted name.
LLM_DOTTED_FQNS = {
    "litellm.completion",
    "litellm.acompletion",
    "ollama.chat",
    "ollama.generate",
}

# Bare function names, mapped to the modules they must have been imported from.
LLM_FREE_FUNCS = {
    "completion": {"litellm"},
    "acompletion": {"litellm"},
}

# Constructors that produce an LLM/chat object (mostly LangChain + SDK clients).
LLM_CONSTRUCTORS = {
    "ChatOpenAI",
    "AzureChatOpenAI",
    "OpenAI",
    "AzureOpenAI",
    "AsyncOpenAI",
    "ChatAnthropic",
    "Anthropic",
    "AsyncAnthropic",
    "AnthropicVertex",
    "ChatGoogleGenerativeAI",
    "GenerativeModel",
    "ChatVertexAI",
    "ChatMistralAI",
    "MistralClient",
    "Mistral",
    "ChatCohere",
    "ChatGroq",
    "ChatOllama",
    "OllamaLLM",
    "ChatBedrock",
    "ChatLiteLLM",
    "LlamaCpp",
    "HuggingFaceHub",
    "LLMChain",
}

# Method names that count as an LLM call when invoked on a tracked LLM object.
LLM_OBJ_METHODS = {
    "invoke",
    "ainvoke",
    "stream",
    "astream",
    "batch",
    "abatch",
    "predict",
    "apredict",
    "generate",
    "agenerate",
    "complete",
    "acomplete",
    "run",
    "call",
    "generate_content",
}

# Substrings that mark a raw-HTTP request as going to an LLM service.
LLM_HOSTS = (
    "api.openai.com",
    "api.anthropic.com",
    "generativelanguage.googleapis.com",
    "api.mistral.ai",
    "api.cohere.ai",
    "api.cohere.com",
    "api.groq.com",
    "openrouter.ai",
    "api.together.xyz",
    "api.deepseek.com",
    "api.perplexity.ai",
    "api.x.ai",
    ":11434",  # ollama default port
)

# Keyword arguments typical of LLM chat/completion calls. Requiring one of them
# is a precision guard, so an unrelated `.messages.create()` (e.g. an ORM) is
# not mistaken for an LLM call.
LLM_KWARGS = {"model", "messages", "prompt", "input", "contents"}

LOOP_NODES = (ast.For, ast.AsyncFor, ast.comprehension)
FUNC_NODES = (ast.FunctionDef, ast.AsyncFunctionDef)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class LLMCall:
    """One detected LLM call site plus what could be read of its prompt."""

    # The call expression and its position in the source.
    node: ast.Call
    line: int
    col: int
    # One of: openai_like | anthropic | google | positional | ollama_chat |
    # ollama_generate | http
    api: str
    # Lowercased visible literal text of the user prompt.
    prompt_text: str = ""
    # True when the user prompt is a compile-time constant.
    prompt_static: bool = False
    # True when the user prompt could be located and read.
    prompt_resolved: bool = False
    in_loop: bool = False
    snippet: str = ""
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# --- AST helpers -------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
def _trailing_attrs(func: ast.expr) -> tuple[ast.expr, tuple[str, ...]]:
|
|
104
|
+
attrs: list[str] = []
|
|
105
|
+
node = func
|
|
106
|
+
while isinstance(node, ast.Attribute):
|
|
107
|
+
attrs.append(node.attr)
|
|
108
|
+
node = node.value
|
|
109
|
+
attrs.reverse()
|
|
110
|
+
return node, tuple(attrs)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _dotted(node: ast.expr) -> str | None:
|
|
114
|
+
parts: list[str] = []
|
|
115
|
+
while isinstance(node, ast.Attribute):
|
|
116
|
+
parts.append(node.attr)
|
|
117
|
+
node = node.value
|
|
118
|
+
if isinstance(node, ast.Name):
|
|
119
|
+
parts.append(node.id)
|
|
120
|
+
parts.reverse()
|
|
121
|
+
return ".".join(parts)
|
|
122
|
+
return None
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _ctor_name(call: ast.Call) -> str | None:
|
|
126
|
+
f = call.func
|
|
127
|
+
if isinstance(f, ast.Name):
|
|
128
|
+
return f.id
|
|
129
|
+
if isinstance(f, ast.Attribute):
|
|
130
|
+
return f.attr
|
|
131
|
+
return None
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _kw(call: ast.Call, name: str) -> ast.expr | None:
|
|
135
|
+
for k in call.keywords:
|
|
136
|
+
if k.arg == name:
|
|
137
|
+
return k.value
|
|
138
|
+
return None
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _has_llm_kwarg(call: ast.Call) -> bool:
    """Return True when the call passes at least one keyword listed in ``LLM_KWARGS``."""
    for keyword in call.keywords:
        if keyword.arg in LLM_KWARGS:
            return True
    return False
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _const_str(node: ast.expr | None) -> str | None:
|
|
146
|
+
if isinstance(node, ast.Constant) and isinstance(node.value, str):
|
|
147
|
+
return node.value
|
|
148
|
+
return None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _dict_get(d: ast.Dict, key: str) -> ast.expr | None:
    """Return the value node stored under the string-literal key ``key``.

    The first matching entry of the dict literal wins; None is returned when
    no key is a string literal equal to ``key``.
    """
    matches = (value for k, value in zip(d.keys, d.values) if _const_str(k) == key)
    return next(matches, None)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _literal_text_and_static(node: ast.expr) -> tuple[str, bool]:
|
|
159
|
+
"""Collect the visible literal text of an expression and whether it is fully static.
|
|
160
|
+
|
|
161
|
+
Handles string literals, f-strings, `+` concatenation, `.format()` and `%`
|
|
162
|
+
templates, and lists/tuples of the above. Any variable, call, or interpolated
|
|
163
|
+
value makes it non-static (but we still keep literal siblings for keyword
|
|
164
|
+
matching).
|
|
165
|
+
"""
|
|
166
|
+
texts: list[str] = []
|
|
167
|
+
static = True
|
|
168
|
+
|
|
169
|
+
def walk(n: ast.expr) -> None:
|
|
170
|
+
nonlocal static
|
|
171
|
+
if isinstance(n, ast.Constant):
|
|
172
|
+
if isinstance(n.value, str):
|
|
173
|
+
texts.append(n.value)
|
|
174
|
+
elif isinstance(n, ast.JoinedStr):
|
|
175
|
+
for v in n.values:
|
|
176
|
+
if isinstance(v, ast.Constant) and isinstance(v.value, str):
|
|
177
|
+
texts.append(v.value)
|
|
178
|
+
elif isinstance(v, ast.FormattedValue):
|
|
179
|
+
static = False
|
|
180
|
+
else:
|
|
181
|
+
walk(v)
|
|
182
|
+
elif isinstance(n, ast.BinOp):
|
|
183
|
+
if isinstance(n.op, ast.Add):
|
|
184
|
+
walk(n.left)
|
|
185
|
+
walk(n.right)
|
|
186
|
+
elif isinstance(n.op, ast.Mod):
|
|
187
|
+
walk(n.left) # the template string
|
|
188
|
+
static = False
|
|
189
|
+
else:
|
|
190
|
+
static = False
|
|
191
|
+
elif isinstance(n, (ast.List, ast.Tuple)):
|
|
192
|
+
for e in n.elts:
|
|
193
|
+
walk(e)
|
|
194
|
+
elif isinstance(n, ast.Call):
|
|
195
|
+
f = n.func
|
|
196
|
+
if (
|
|
197
|
+
isinstance(f, ast.Attribute)
|
|
198
|
+
and f.attr in ("format", "join", "strip")
|
|
199
|
+
and isinstance(f.value, ast.Constant)
|
|
200
|
+
and isinstance(f.value.value, str)
|
|
201
|
+
):
|
|
202
|
+
texts.append(f.value.value)
|
|
203
|
+
static = False
|
|
204
|
+
else:
|
|
205
|
+
static = False
|
|
206
|
+
|
|
207
|
+
walk(node)
|
|
208
|
+
return " ".join(texts).lower().strip(), static
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _messages_user_prompt(msgs: ast.expr) -> tuple[list[ast.expr], bool]:
    """Pull the ``content`` nodes of user-role entries out of a ``messages=[...]`` list.

    Returns ``(content_nodes, resolved)``. ``resolved`` is False, with no
    nodes, unless ``msgs`` is a list literal whose every element is a dict
    literal with a string-literal ``role``; in that case the caller stays
    silent rather than guessing.
    """
    if not isinstance(msgs, ast.List):
        return [], False

    user_contents: list[ast.expr] = []
    for entry in msgs.elts:
        role = _const_str(_dict_get(entry, "role")) if isinstance(entry, ast.Dict) else None
        if role is None:
            # Not a dict literal, or its role is missing / not a literal.
            return [], False
        if role != "user":
            continue
        body = _dict_get(entry, "content")
        if body is not None:
            user_contents.append(body)
    return user_contents, True
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# --- Prompt extraction per API shape ----------------------------------------
|
|
234
|
+
|
|
235
|
+
def _extract_prompt(call: ast.Call, api: str) -> tuple[str, bool, bool]:
    """Return ``(prompt_text_lower, is_static, resolved)`` for a detected LLM call.

    ``resolved`` is True only when at least one prompt node was located; when
    it is False the text is empty and ``is_static`` is False.
    """
    # APIs whose prompt is a single value: a named keyword, else the first
    # positional argument.
    single_prompt_kw = {
        "ollama_generate": "prompt",
        "google": "contents",
        "positional": "input",
    }

    if api in ("openai_like", "anthropic", "ollama_chat"):
        messages = _kw(call, "messages")
        if messages is not None:
            nodes, resolved = _messages_user_prompt(messages)
        else:
            single = _kw(call, "prompt") or _kw(call, "input")
            if single is None and call.args:
                single = call.args[0]
            resolved = single is not None
            nodes = [single] if resolved else []
    elif api in single_prompt_kw:
        single = _kw(call, single_prompt_kw[api]) or (call.args[0] if call.args else None)
        resolved = single is not None
        nodes = [single] if resolved else []
    else:  # http / generic - prompt not statically inspectable
        return "", False, False

    if not (resolved and nodes):
        return "", False, False

    collected: list[str] = []
    all_static = True
    for prompt_node in nodes:
        text, is_static = _literal_text_and_static(prompt_node)
        if text:
            collected.append(text)
        all_static = all_static and is_static
    return " ".join(collected).strip(), all_static, True
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# --- Import / variable context ----------------------------------------------
|
|
287
|
+
|
|
288
|
+
class _Context:
    """File-wide facts collected before individual calls are classified."""

    def __init__(self) -> None:
        # Bare names bound to an LLM free function by a `from ... import`.
        self.free_llm_names: set[str] = set()
        # Dotted names that were assigned the result of an LLM constructor.
        self.llm_vars: set[str] = set()

    def scan(self, tree: ast.AST) -> None:
        """Walk ``tree`` once, recording LLM imports and LLM-object assignments."""
        for node in ast.walk(tree):
            if isinstance(node, ast.ImportFrom):
                self._note_import(node)
            elif isinstance(node, (ast.Assign, ast.AnnAssign)):
                self._note_assignment(node)

    def _note_import(self, node: ast.ImportFrom) -> None:
        """Record names imported from a module listed for that function in LLM_FREE_FUNCS."""
        if not node.module:
            return
        top_level = node.module.split(".")[0]
        for alias in node.names:
            modules = LLM_FREE_FUNCS.get(alias.name)
            if modules and top_level in modules:
                # An `as` alias is the name actually used at call sites.
                self.free_llm_names.add(alias.asname or alias.name)

    def _note_assignment(self, node: ast.Assign | ast.AnnAssign) -> None:
        """Record assignment targets whose value is a known LLM constructor call."""
        value = node.value
        if not (isinstance(value, ast.Call) and _ctor_name(value) in LLM_CONSTRUCTORS):
            return
        targets = node.targets if isinstance(node, ast.Assign) else [node.target]
        for target in targets:
            dotted = _dotted(target)
            if dotted:
                self.llm_vars.add(dotted)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
# --- Classification ----------------------------------------------------------
|
|
312
|
+
|
|
313
|
+
_SUFFIX_API = {
|
|
314
|
+
("chat", "completions", "create"): "openai_like",
|
|
315
|
+
("chat", "completions", "parse"): "openai_like",
|
|
316
|
+
("completions", "create"): "openai_like",
|
|
317
|
+
("responses", "create"): "openai_like",
|
|
318
|
+
("responses", "parse"): "openai_like",
|
|
319
|
+
("messages", "create"): "anthropic",
|
|
320
|
+
("messages", "stream"): "anthropic",
|
|
321
|
+
("chat", "create"): "openai_like",
|
|
322
|
+
("chat", "complete"): "openai_like",
|
|
323
|
+
("models", "generate_content"): "google",
|
|
324
|
+
("generate_content",): "google",
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _classify(call: ast.Call, ctx: _Context) -> str | None:
    """Decide whether ``call`` is an LLM call and, if so, which API shape it has.

    The checks run in a fixed order and the first match wins. Returns the API
    shape string, or None when the call matches no known surface.
    """
    func = call.func
    _, attrs = _trailing_attrs(func)

    def ends_with(suffix: tuple[str, ...]) -> bool:
        return len(attrs) >= len(suffix) and attrs[-len(suffix):] == suffix

    # 1. Known method suffixes. Except for a bare `generate_content`, the call
    #    must also carry an LLM-shaped kwarg or a positional argument.
    for suffix in LLM_METHOD_SUFFIXES:
        if not ends_with(suffix):
            continue
        if suffix == ("generate_content",) or _has_llm_kwarg(call) or call.args:
            return _SUFFIX_API[suffix]

    # 2. Legacy module-level completion APIs.
    if any(ends_with(suffix) for suffix in LLM_LEGACY_SUFFIXES):
        return "openai_like"

    # 3. LangChain / SDK object methods on a tracked LLM variable.
    if attrs and attrs[-1] in LLM_OBJ_METHODS and isinstance(func, ast.Attribute):
        if _dotted(func.value) in ctx.llm_vars:
            return "positional"

    # 4. Bare free functions imported from a known LLM module.
    if isinstance(func, ast.Name) and func.id in ctx.free_llm_names:
        return "openai_like"

    # 5. Fully-qualified free/dotted calls.
    dotted = _dotted(func)
    if dotted in LLM_DOTTED_FQNS:
        ollama_shapes = {"ollama.generate": "ollama_generate", "ollama.chat": "ollama_chat"}
        return ollama_shapes.get(dotted, "openai_like")

    # 6. Raw HTTP to a known LLM host.
    if attrs and attrs[-1] == "post" and _has_llm_host_arg(call):
        return "http"

    return None
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def _has_llm_host_arg(call: ast.Call) -> bool:
    """Return True when a string-literal argument mentions a known LLM host.

    Only plain string literals among the positional and keyword arguments are
    inspected; the comparison is case-insensitive.
    """
    candidates = [*call.args, *(keyword.value for keyword in call.keywords)]
    literals = [text for text in map(_const_str, candidates) if text]
    haystack = " ".join(literals).lower()
    return any(host in haystack for host in LLM_HOSTS)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
# --- Loop ancestry -----------------------------------------------------------
|
|
383
|
+
|
|
384
|
+
def _build_parents(tree: ast.AST) -> dict[int, ast.AST]:
|
|
385
|
+
parents: dict[int, ast.AST] = {}
|
|
386
|
+
for node in ast.walk(tree):
|
|
387
|
+
for child in ast.iter_child_nodes(node):
|
|
388
|
+
parents[id(child)] = node
|
|
389
|
+
return parents
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _in_loop(node: ast.AST, parents: dict[int, ast.AST]) -> bool:
    """Tell whether ``node`` sits inside a ``for`` loop or comprehension.

    Ancestors are examined from the nearest outward. The search stops with
    False at the first enclosing function definition, so a loop around the
    function itself does not count.
    """
    loop_types = (
        ast.For,
        ast.AsyncFor,
        ast.ListComp,
        ast.SetComp,
        ast.DictComp,
        ast.GeneratorExp,
    )
    ancestor = parents.get(id(node))
    while ancestor is not None:
        if isinstance(ancestor, FUNC_NODES):
            return False
        if isinstance(ancestor, loop_types):
            return True
        ancestor = parents.get(id(ancestor))
    return False
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
# --- Entry point -------------------------------------------------------------
|
|
406
|
+
|
|
407
|
+
def find_llm_calls(tree: ast.AST, source_lines: list[str]) -> list[LLMCall]:
    """Find every recognised LLM call in ``tree``.

    ``source_lines`` are the file's lines; the stripped text of each call's
    first line becomes its snippet (empty if the line number is out of range).
    """
    ctx = _Context()
    ctx.scan(tree)
    parents = _build_parents(tree)
    total_lines = len(source_lines)

    found: list[LLMCall] = []
    for node in ast.walk(tree):
        api = _classify(node, ctx) if isinstance(node, ast.Call) else None
        if api is None:
            continue
        text, static, resolved = _extract_prompt(node, api)
        lineno = getattr(node, "lineno", 0)
        if 0 < lineno <= total_lines:
            snippet = source_lines[lineno - 1].strip()
        else:
            snippet = ""
        found.append(
            LLMCall(
                node=node,
                line=lineno,
                col=getattr(node, "col_offset", 0),
                api=api,
                prompt_text=text,
                prompt_static=static,
                prompt_resolved=resolved,
                in_loop=_in_loop(node, parents),
                snippet=snippet,
            )
        )
    return found
|
overllm/models.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Core data types shared across overllm."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
SEVERITIES = ("error", "warning", "info")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class Finding:
    """A single flagged LLM call, immutable once created.

    `suggestion` carries the proposed deterministic replacement and
    `severity` defaults to "warning".
    """

    # Where the call was found.
    path: str
    line: int
    col: int
    # Which rule fired and what it reports.
    rule: str
    message: str
    suggestion: str = ""
    severity: str = "warning"
    snippet: str = ""

    @property
    def key(self) -> tuple:
        """Identity tuple ``(path, line, col, rule)``."""
        return self.path, self.line, self.col, self.rule
|
overllm/report.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Output formatters: human, json, sarif, markdown (for the PR comment)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
|
|
8
|
+
from . import __version__
|
|
9
|
+
from .models import Finding
|
|
10
|
+
|
|
11
|
+
_COLORS = {
|
|
12
|
+
"reset": "\033[0m",
|
|
13
|
+
"dim": "\033[2m",
|
|
14
|
+
"bold": "\033[1m",
|
|
15
|
+
"yellow": "\033[33m",
|
|
16
|
+
"cyan": "\033[36m",
|
|
17
|
+
"green": "\033[32m",
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
_RULE_HELP = "https://github.com/theadamdanielsson/overllm#rules"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _c(name: str, use_color: bool) -> str:
    """Return the ANSI escape registered under ``name``, or "" when color is off."""
    if not use_color:
        return ""
    return _COLORS[name]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def format_human(findings: list[Finding], use_color: bool = True) -> str:
    """Render findings as terminal text.

    Each finding gets a ``path:line:col rule message`` line (column shown
    1-based), then its snippet and suggestion when present, then a blank line.
    A summary line closes the report. With no findings a single "nothing
    found" line is returned.
    """
    bold, dim, yellow, cyan, green, reset = (
        _c(name, use_color) for name in ("bold", "dim", "yellow", "cyan", "green", "reset")
    )
    if not findings:
        return f"{green}overllm: no needless LLM calls found.{reset}"

    out: list[str] = []
    for finding in findings:
        where = f"{finding.path}:{finding.line}:{finding.col + 1}"
        out.append(f"{bold}{where}{reset} {yellow}{finding.rule}{reset} {finding.message}")
        if finding.snippet:
            out.append(f" {dim}{finding.snippet}{reset}")
        if finding.suggestion:
            out.append(f" {cyan}-> {finding.suggestion}{reset}")
        out.append("")

    total = len(findings)
    file_count = len({finding.path for finding in findings})
    call_word = "call" if total == 1 else "calls"
    file_word = "file" if file_count == 1 else "files"
    out.append(f"{bold}{total} needless LLM {call_word} in {file_count} {file_word}.{reset}")
    return "\n".join(out)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def format_json(findings: list[Finding]) -> str:
    """Serialise findings as a JSON array of objects, indented by two spaces."""
    exported = ("path", "line", "col", "rule", "message", "suggestion", "severity", "snippet")
    payload = [{name: getattr(finding, name) for name in exported} for finding in findings]
    return json.dumps(payload, indent=2)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def format_sarif(findings: list[Finding]) -> str:
    """Serialize findings as a SARIF 2.1.0 document with a single run.

    The tool driver lists one rule entry per distinct rule id (sorted), each
    pointing at ``_RULE_HELP``. Every finding becomes a result reported at
    level "warning" whose message joins the finding's message and suggestion.
    The result has one physical location; the start line is clamped to at
    least 1 and the start column is ``col + 1``.
    """
    rule_entries = []
    for rule_id in sorted({item.rule for item in findings}):
        rule_entries.append(
            {
                "id": rule_id,
                "name": rule_id,
                "shortDescription": {"text": f"overllm {rule_id}"},
                "helpUri": _RULE_HELP,
            }
        )

    result_entries = []
    for item in findings:
        region = {"startLine": max(item.line, 1), "startColumn": item.col + 1}
        physical = {
            "artifactLocation": {"uri": item.path},
            "region": region,
        }
        result_entries.append(
            {
                "ruleId": item.rule,
                "level": "warning",
                "message": {"text": f"{item.message}. {item.suggestion}".strip()},
                "locations": [{"physicalLocation": physical}],
            }
        )

    driver = {
        "name": "overllm",
        "version": __version__,
        "informationUri": "https://github.com/theadamdanielsson/overllm",
        "rules": rule_entries,
    }
    document = {
        "version": "2.1.0",
        "$schema": "https://json.schemastore.org/sarif-2.1.0.json",
        "runs": [{"tool": {"driver": driver}, "results": result_entries}],
    }
    return json.dumps(document, indent=2)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
_MARKER = "<!-- overllm-report -->"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def format_markdown(findings: list[Finding]) -> str:
    """Build the Markdown body for a single pull-request comment.

    Returns an empty string when there are no findings. Otherwise the body
    starts with ``_MARKER``, a heading with the finding count and a short
    explanation, then lists the findings grouped by path (paths sorted,
    findings in their incoming order within a path), and ends with a footer.
    """
    if not findings:
        return ""

    total = len(findings)
    grouped: dict[str, list[Finding]] = {}
    for item in findings:
        grouped.setdefault(item.path, []).append(item)

    plural = "" if total == 1 else "s"
    body = [
        _MARKER,
        f"### \U0001f9f9 overllm found {total} LLM call{plural} that plain code could handle",
        "",
        "Each of these calls an AI model to do something a library or a few lines of code do "
        "faster, cheaper, and deterministically. Worth a look, not gospel - "
        "add `# overllm: ignore` on a line to silence a false positive.",
        "",
    ]

    for path in sorted(grouped):
        body.append(f"**`{path}`**")
        for item in grouped[path]:
            body.append(f"- `L{item.line}` **{item.rule}** - {item.message}.")
            if item.suggestion:
                body.append(f" -> {item.suggestion}")
        # Blank line between file sections.
        body.append("")

    body.append(
        "<sub>Flagged by [overllm](https://github.com/theadamdanielsson/overllm) - "
        "it only reports calls where deterministic code wins, and stays silent otherwise.</sub>"
    )
    return "\n".join(body)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def render(findings: list[Finding], fmt: str, use_color: bool = True) -> str:
    """Format *findings* with the formatter selected by *fmt*.

    "json", "sarif" and "markdown" select the matching ``format_*`` function.
    Any other value falls back to the human-readable format, which is the only
    one that receives *use_color*.
    """
    plain_formatters = (
        ("json", format_json),
        ("sarif", format_sarif),
        ("markdown", format_markdown),
    )
    for name, formatter in plain_formatters:
        if fmt == name:
            return formatter(findings)
    return format_human(findings, use_color=use_color)
|
overllm/rules.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""The rule set. Every rule keys off an observable code pattern, not taste, and
|
|
2
|
+
every finding names a concrete deterministic replacement.
|
|
3
|
+
|
|
4
|
+
v1 is deliberately about one thing: LLM calls you did not need. It does not do
|
|
5
|
+
generic complexity, dead code, or style - that ground is already covered by
|
|
6
|
+
other tools, and it is where an opinionated linter turns into noise.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
from .detector import LLMCall
|
|
14
|
+
from .models import Finding
|
|
15
|
+
|
|
16
|
+
# Rule ids
|
|
17
|
+
STATIC_PROMPT = "static-prompt"
|
|
18
|
+
LLM_EXTRACTION = "llm-extraction"
|
|
19
|
+
LLM_IN_LOOP = "llm-in-loop"
|
|
20
|
+
LLM_MECHANICAL = "llm-mechanical"
|
|
21
|
+
|
|
22
|
+
ALL_RULES = (STATIC_PROMPT, LLM_EXTRACTION, LLM_IN_LOOP, LLM_MECHANICAL)
|
|
23
|
+
|
|
24
|
+
# (compiled pattern, human message, suggestion)
|
|
25
|
+
_EXTRACTION_PATTERNS: list[tuple[re.Pattern, str, str]] = [
|
|
26
|
+
(re.compile(r"\bextract\b.*\b(e-?mail)\b"),
|
|
27
|
+
"asks the model to extract an email address",
|
|
28
|
+
"match it with a regex, or use the `email-validator` package"),
|
|
29
|
+
(re.compile(r"\bextract\b.*\b(url|link|hyperlink)\b"),
|
|
30
|
+
"asks the model to extract a URL",
|
|
31
|
+
"use a regex or `urllib.parse`"),
|
|
32
|
+
(re.compile(r"\b(extract|parse|get)\b.*\b(date|datetime|timestamp)\b"),
|
|
33
|
+
"asks the model to extract or parse a date",
|
|
34
|
+
"use `datetime.strptime` or `dateutil.parser`"),
|
|
35
|
+
(re.compile(r"\bextract\b.*\b(phone|number|integer|amount|price|digit)\b"),
|
|
36
|
+
"asks the model to extract a number",
|
|
37
|
+
"use a regex, then `int()` / `float()`"),
|
|
38
|
+
(re.compile(r"\b(return|reply|respond|output|answer)\b.*\bonly\b.*\bjson\b"),
|
|
39
|
+
"asks the model to return raw JSON",
|
|
40
|
+
"use the SDK's structured-output / JSON mode, or `json.loads` on typed fields"),
|
|
41
|
+
(re.compile(r"\bvalid\s+json\b|\bjson\s+format\b|\bas\s+json\b"),
|
|
42
|
+
"asks the model to format output as JSON",
|
|
43
|
+
"use `json.dumps`, or the SDK's structured-output mode"),
|
|
44
|
+
(re.compile(r"\bparse\b.*\b(the\s+)?(json|csv|xml|yaml|html)\b"),
|
|
45
|
+
"asks the model to parse a structured format",
|
|
46
|
+
"use `json` / `csv` / `xml` / a real parser"),
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
_MECHANICAL_PATTERNS: list[tuple[re.Pattern, str, str]] = [
|
|
50
|
+
(re.compile(r"\b(sort|order)\b.*\b(these|this|them|the (list|items|array|names|numbers))\b|\balphabetize\b"),
|
|
51
|
+
"asks the model to sort",
|
|
52
|
+
"use `sorted()`"),
|
|
53
|
+
(re.compile(r"\breverse\b.*\b(the\s+)?(string|list|order|text|these|it)\b"),
|
|
54
|
+
"asks the model to reverse a sequence",
|
|
55
|
+
"use `reversed()` or `[::-1]`"),
|
|
56
|
+
(re.compile(r"\bcount\b.*\b(the\s+)?(number of\s+)?(words|characters|letters|items|occurrences|lines|elements)\b"),
|
|
57
|
+
"asks the model to count",
|
|
58
|
+
"use `len()`, `str.count()`, or `collections.Counter`"),
|
|
59
|
+
(re.compile(r"\b(sum|add up|total|average|mean)\b.*\b(these|the)\b.*\b(numbers|values|amounts)\b|\bcalculate the (sum|total|average|mean)\b"),
|
|
60
|
+
"asks the model to do arithmetic over values",
|
|
61
|
+
"use `sum()` / `statistics.mean()`"),
|
|
62
|
+
(re.compile(r"\b(remove duplicates|deduplicate|de-duplicate|unique(\s+values)?)\b"),
|
|
63
|
+
"asks the model to deduplicate",
|
|
64
|
+
"use `set()` or `dict.fromkeys()`"),
|
|
65
|
+
(re.compile(r"\b(uppercase|lowercase|to upper|to lower|capitalize|title[- ]?case)\b"),
|
|
66
|
+
"asks the model to change letter case",
|
|
67
|
+
"use `str.upper()` / `.lower()` / `.title()`"),
|
|
68
|
+
(re.compile(r"\bbase64\b"),
|
|
69
|
+
"asks the model to base64 encode/decode",
|
|
70
|
+
"use the `base64` module"),
|
|
71
|
+
(re.compile(r"\b(what is|calculate|compute)\b[^.]*\b\d+\s*[-+*/x×]\s*\d+"),
|
|
72
|
+
"asks the model to compute arithmetic on literal numbers",
|
|
73
|
+
"just compute it in code"),
|
|
74
|
+
(re.compile(r"\bformat\b.*\b(the\s+)?date\b|\bconvert\b.*\bdate\b"),
|
|
75
|
+
"asks the model to format a date",
|
|
76
|
+
"use `datetime.strftime` / `strptime`"),
|
|
77
|
+
(re.compile(r"\b(pretty[- ]?print|minify|format)\b.*\bjson\b"),
|
|
78
|
+
"asks the model to reformat JSON",
|
|
79
|
+
"use `json.dumps(indent=...)`"),
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
_MIN_STATIC_LEN = 12 # ignore trivial/placeholder prompts for the static rule
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _finding(call: LLMCall, path: str, rule: str, message: str, suggestion: str) -> Finding:
    """Create a ``Finding`` for *call*.

    Line, column and snippet are copied from the call; the path, rule id,
    message and suggestion are taken from the arguments.
    """
    fields = dict(
        path=path,
        line=call.line,
        col=call.col,
        rule=rule,
        message=message,
        suggestion=suggestion,
        snippet=call.snippet,
    )
    return Finding(**fields)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def run_rules(call: LLMCall, path: str) -> list[Finding]:
    """Run every rule against one detected LLM call and collect the findings.

    Findings are appended in a fixed order: static-prompt, llm-extraction,
    llm-mechanical, llm-in-loop. Each pattern-based rule reports at most once
    per call, using the first pattern in its table that matches the prompt.
    """
    prompt = call.prompt_text
    findings: list[Finding] = []

    # static-prompt: the prompt was resolved and has no variable parts, so the
    # call's input never changes. Very short prompts and prompts without a run
    # of three lowercase letters are ignored.
    static_and_resolved = call.prompt_resolved and call.prompt_static
    if (
        static_and_resolved
        and len(prompt) >= _MIN_STATIC_LEN
        and re.search(r"[a-z]{3,}", prompt)
    ):
        findings.append(_finding(
            call, path, STATIC_PROMPT,
            "LLM call with a fully static prompt (no variables). The input is "
            "constant, so this pays latency, money, and nondeterminism for a fixed result",
            "precompute or cache the result; if you meant to include runtime data, interpolate it",
        ))

    # llm-extraction, then llm-mechanical: both scan the prompt against a table
    # of (pattern, message, suggestion) entries and stop at the first match.
    if prompt:
        pattern_rules = (
            (LLM_EXTRACTION, _EXTRACTION_PATTERNS),
            (LLM_MECHANICAL, _MECHANICAL_PATTERNS),
        )
        for rule_id, table in pattern_rules:
            hit = next((entry for entry in table if entry[0].search(prompt)), None)
            if hit is not None:
                _, msg, sug = hit
                findings.append(_finding(call, path, rule_id, "LLM call " + msg, sug))

    # llm-in-loop: the call is flagged as sitting inside a loop.
    if call.in_loop:
        findings.append(_finding(
            call, path, LLM_IN_LOOP,
            "LLM call inside a loop: one API round-trip per iteration (N calls, N latencies, N times the cost)",
            "batch the inputs into a single call, cache repeated results, or if the per-item work is deterministic use a function",
        ))

    return findings
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: overllm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Catch the LLM/AI calls you didn't need. A fast, deterministic linter that flags LLM API calls where plain code is simpler, cheaper, and more reliable.
|
|
5
|
+
Project-URL: Homepage, https://github.com/theadamdanielsson/overllm
|
|
6
|
+
Project-URL: Issues, https://github.com/theadamdanielsson/overllm/issues
|
|
7
|
+
Author-email: Adam Danielsson <the.adam.danielsson@gmail.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: ai,anthropic,code-quality,cost,linter,llm,openai,pre-commit,static-analysis
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# overllm
|
|
22
|
+
|
|
23
|
+
**Catch the LLM/AI calls you didn't need.**
|
|
24
|
+
|
|
25
|
+
overllm is a small, fast linter with one job: find the places in your code where you call an AI model to do something plain code does better. You called GPT to parse a date. You called a model to extract JSON that `json.loads` already handles. You are paying latency, money, and nondeterminism for a regex.
|
|
26
|
+
|
|
27
|
+
It reads your code with Python's own `ast` module. No model runs, no network, no API key. Same code in, same result out. Fast enough for a pre-commit hook.
|
|
28
|
+
|
|
29
|
+
Everyone else lints the code the AI wrote. overllm catches where you are paying an AI to do what a library already does.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install overllm
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Use it
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
overllm . # scan the current project
|
|
41
|
+
overllm src/ # scan a folder
|
|
42
|
+
overllm app.py # scan one file
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Example output:
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
app.py:42:5 llm-mechanical LLM call asks the model to sort
|
|
49
|
+
resp = client.chat.completions.create(model="gpt-4o", messages=[...])
|
|
50
|
+
-> use sorted()
|
|
51
|
+
|
|
52
|
+
app.py:88:1 llm-in-loop LLM call inside a loop: one API round-trip per iteration
|
|
53
|
+
completion(model="gpt-4o", messages=[{"role": "user", "content": f"tag {x}"}])
|
|
54
|
+
-> batch the inputs into a single call, cache repeated results, or use a function
|
|
55
|
+
|
|
56
|
+
2 needless LLM calls in 1 file.
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
overllm exits non-zero when it finds something, so it gates a commit or a CI check. Pass `--exit-zero` to report without failing.
|
|
60
|
+
|
|
61
|
+
## Rules
|
|
62
|
+
|
|
63
|
+
Every rule fires only on a concrete code pattern, and every finding names the deterministic replacement. It stays silent when it is not sure.
|
|
64
|
+
|
|
65
|
+
| Rule | Fires when | Suggests |
|
|
66
|
+
| --- | --- | --- |
|
|
67
|
+
| `static-prompt` | The user prompt is a compile-time constant (no variables). The input is fixed, so the call buys nothing. | precompute or cache the result |
|
|
68
|
+
| `llm-extraction` | The prompt asks the model to extract or parse an email, URL, date, number, or JSON. | a regex, `json`, `datetime`, or the SDK's structured-output mode |
|
|
69
|
+
| `llm-mechanical` | The prompt asks for a mechanical transform: sort, reverse, count, sum, deduplicate, change case, base64, arithmetic on literals. | the one-line stdlib equivalent |
|
|
70
|
+
| `llm-in-loop` | An LLM call sits inside a `for`/`async for`/comprehension. One API round-trip per iteration. | batch, cache, or move it out of the loop |
|
|
71
|
+
|
|
72
|
+
It detects calls to the OpenAI, Anthropic, Google, Mistral, Cohere, Groq, LangChain, LiteLLM, and Ollama SDKs, and raw HTTP requests to those hosts.
|
|
73
|
+
|
|
74
|
+
## Silence a false positive
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
resp = client.chat.completions.create(...) # overllm: ignore
|
|
78
|
+
resp = client.chat.completions.create(...) # overllm: ignore=llm-in-loop
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Put `# overllm: ignore-file` at the top of a file to skip the whole file.
|
|
82
|
+
|
|
83
|
+
## Configure
|
|
84
|
+
|
|
85
|
+
In `pyproject.toml` (Python 3.11+):
|
|
86
|
+
|
|
87
|
+
```toml
|
|
88
|
+
[tool.overllm]
|
|
89
|
+
ignore = ["llm-in-loop"]
|
|
90
|
+
exclude = ["examples/", "migrations/"]
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Or on the command line: `--select`, `--ignore`, `--exclude` via config, `--config PATH`.
|
|
94
|
+
|
|
95
|
+
## Pre-commit hook
|
|
96
|
+
|
|
97
|
+
In `.pre-commit-config.yaml`:
|
|
98
|
+
|
|
99
|
+
```yaml
|
|
100
|
+
repos:
|
|
101
|
+
- repo: https://github.com/theadamdanielsson/overllm
|
|
102
|
+
rev: v0.1.0
|
|
103
|
+
hooks:
|
|
104
|
+
- id: overllm
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## GitHub Action
|
|
108
|
+
|
|
109
|
+
overllm ships an Action that scans a pull request and leaves one grounded comment. It stays silent when there is nothing to say.
|
|
110
|
+
|
|
111
|
+
```yaml
|
|
112
|
+
name: overllm
|
|
113
|
+
on:
|
|
114
|
+
pull_request:
|
|
115
|
+
|
|
116
|
+
permissions:
|
|
117
|
+
contents: read
|
|
118
|
+
pull-requests: write
|
|
119
|
+
|
|
120
|
+
jobs:
|
|
121
|
+
check:
|
|
122
|
+
runs-on: ubuntu-latest
|
|
123
|
+
steps:
|
|
124
|
+
- uses: actions/checkout@v4
|
|
125
|
+
- uses: theadamdanielsson/overllm@v1
|
|
126
|
+
with:
|
|
127
|
+
paths: "."
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Other output formats
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
overllm --format json . # machine-readable
|
|
134
|
+
overllm --format sarif . # upload to GitHub code scanning
|
|
135
|
+
overllm --format markdown . # the PR-comment body
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Why not just use an AI code reviewer?
|
|
139
|
+
|
|
140
|
+
AI reviewers and AI-slop linters look at the code the model produced: comments, dead code, structure. None of them asks the question overllm asks, which is whether you needed the model at all. It is a different axis, and it is one that plain static analysis can answer with high precision and zero cost.
|
|
141
|
+
|
|
142
|
+
## License
|
|
143
|
+
|
|
144
|
+
MIT © Adam Danielsson
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
overllm/__init__.py,sha256=c0UnUFK21IKVMHsUtS87oWHLkKjPrAQO9E_6XDPYjuw,304
|
|
2
|
+
overllm/__main__.py,sha256=MHKZ_ae3fSLGTLUUMOx15fWdeOnJSHhq-zslRP5F5Lc,79
|
|
3
|
+
overllm/analyze.py,sha256=1fCT_RfaHKsTUdsJkeuUN2Cfb0WHNTQhBWLWE7dpEd0,3903
|
|
4
|
+
overllm/cli.py,sha256=mBsQ0LYl7_lWM_ozzDnFjvELagr9hpP0gH2phfwYQUs,2204
|
|
5
|
+
overllm/config.py,sha256=s-wbiyyYKOKICRSi-6UNRiyUxSMBZn233bCHSA7q9Ho,2116
|
|
6
|
+
overllm/detector.py,sha256=MoSxrpZWRR7mKwEkCI7-1MC4IbZQHKD_zJVQiMztt0c,14884
|
|
7
|
+
overllm/models.py,sha256=cURoXxAfEmbFz6GiZeGVhIz8x-6vPe-PM_pX_oxvLH4,642
|
|
8
|
+
overllm/report.py,sha256=Ki2RpmVEBI8EVQIhYGKgKZ6tLZky2J-QHhfSbYXyG9g,4947
|
|
9
|
+
overllm/rules.py,sha256=B0Wk5-gCl841FXcNPz3p8uvl20X4T8_8DSuYS4vPtU4,5908
|
|
10
|
+
overllm-0.1.0.dist-info/METADATA,sha256=6NBFTAvaSxGxW5OSFrz8wlpN7xYVoiiqVKv7QapPtBQ,5041
|
|
11
|
+
overllm-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
12
|
+
overllm-0.1.0.dist-info/entry_points.txt,sha256=lYfnMFlT1ZxjxtIgzMxpqOMDa5NcjDoNaToSmeb1l6g,45
|
|
13
|
+
overllm-0.1.0.dist-info/licenses/LICENSE,sha256=dJo02JmanV48uwAVydix7ep6cVMAmT1gY_yoWItnDXc,1072
|
|
14
|
+
overllm-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Adam Danielsson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|