specscore 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clitic/__init__.py +0 -0
- clitic/analyzer.py +158 -0
- clitic/cli.py +63 -0
- clitic/dimensions/__init__.py +0 -0
- clitic/dimensions/arg_design.py +100 -0
- clitic/dimensions/discoverability.py +107 -0
- clitic/dimensions/error_handling.py +94 -0
- clitic/dimensions/exit_codes.py +71 -0
- clitic/dimensions/output_format.py +112 -0
- clitic/models.py +46 -0
- clitic/report.py +185 -0
- clitic/scorer.py +36 -0
- sandbox/__init__.py +0 -0
- sandbox/cli.py +155 -0
- sandbox/data/sample_spec.yaml +163 -0
- sandbox/generator.py +107 -0
- sandbox/models.py +46 -0
- sandbox/prober.py +199 -0
- sandbox/report.py +153 -0
- sandbox/server.py +158 -0
- scorecard/__init__.py +0 -0
- scorecard/cli.py +71 -0
- scorecard/dimensions/__init__.py +31 -0
- scorecard/dimensions/agent_usability.py +97 -0
- scorecard/dimensions/ai_readiness.py +102 -0
- scorecard/dimensions/developer_experience.py +103 -0
- scorecard/dimensions/discoverability.py +113 -0
- scorecard/dimensions/foundational.py +63 -0
- scorecard/dimensions/security.py +99 -0
- scorecard/models.py +48 -0
- scorecard/parser.py +25 -0
- scorecard/report.py +110 -0
- scorecard/scorer.py +52 -0
- specscore-0.1.4.dist-info/METADATA +193 -0
- specscore-0.1.4.dist-info/RECORD +39 -0
- specscore-0.1.4.dist-info/WHEEL +4 -0
- specscore-0.1.4.dist-info/entry_points.txt +2 -0
- specscore-0.1.4.dist-info/licenses/LICENSE +193 -0
- specscore-0.1.4.dist-info/licenses/NOTICE +14 -0
clitic/__init__.py
ADDED
|
File without changes
|
clitic/analyzer.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
|
|
9
|
+
# Upper bound, in seconds, on how long a single probe subprocess may run.
# _run passes it as the ``timeout`` argument of ``subprocess.run``.
TIMEOUT = 5.0  # seconds per probe run
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class RunResult:
    """Outcome of one invocation of the tool under test, as recorded by ``_run``."""

    args: list[str]  # full command line that was executed
    exit_code: int  # process return code; _run stores -1 on timeout or launch failure
    stdout: str  # captured standard output ("" on timeout or launch failure)
    stderr: str  # captured standard error ("" on timeout or launch failure)
    elapsed: float  # wall-clock seconds, measured with time.monotonic()
    timed_out: bool = False  # True when subprocess.TimeoutExpired was raised
    error: str | None = None  # str() of the exception when the process could not be run
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ToolProbe:
    """Everything collected by ``probe`` for a single CLI tool.

    When the tool cannot be located, ``tool_path`` is None and every result
    field keeps its default (None or an empty list).
    """

    tool_name: str  # name exactly as given by the caller
    tool_path: str | None  # shutil.which() result; None when not found

    help_result: RunResult | None = None  # tool --help (replaced by -h if that exits 0 and --help did not)
    version_result: RunResult | None = None  # tool --version (replaced by -V if that exits 0 and --version did not)
    no_args_result: RunResult | None = None  # tool (no args)
    bad_args_result: RunResult | None = None  # tool --xxxxclitictest-unknown (unknown flag)
    json_results: list[RunResult] = field(default_factory=list)  # --json / --output json / -o json / --format json
    subcommand_names: list[str] = field(default_factory=list)  # names heuristically parsed from the help text
    subcommand_help_results: list[tuple[str, RunResult]] = field(default_factory=list)  # (subcommand, "<subcommand> --help" result)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _run(cmd: list[str]) -> RunResult:
    """Execute *cmd* once and describe the outcome; never raises.

    Args:
        cmd: Command line as an argument list (executed without a shell).

    Returns:
        A RunResult. On normal completion it carries the real exit code and
        the captured stdout/stderr. On timeout ``timed_out`` is True; on any
        other failure to run the process ``error`` holds the exception text.
        In both failure cases ``exit_code`` is -1 and the streams are "".
    """
    t0 = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Probed tools may emit bytes that are not valid in the locale
            # encoding; substitute them instead of failing the whole probe.
            errors="replace",
            # Give the child an empty stdin so tools that read from it see EOF
            # rather than blocking until the timeout or consuming our input.
            stdin=subprocess.DEVNULL,
            timeout=TIMEOUT,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.monotonic() - t0
        return RunResult(args=cmd, exit_code=-1, stdout="", stderr="", elapsed=elapsed, timed_out=True)
    except Exception as exc:
        # Deliberate catch-all boundary (covers FileNotFoundError, PermissionError,
        # ...): a probe failure is reported as data, not raised to the caller.
        elapsed = time.monotonic() - t0
        return RunResult(args=cmd, exit_code=-1, stdout="", stderr="", elapsed=elapsed, error=str(exc))

    elapsed = time.monotonic() - t0
    return RunResult(
        args=cmd,
        exit_code=proc.returncode,
        stdout=proc.stdout,
        stderr=proc.stderr,
        elapsed=elapsed,
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _extract_subcommands(help_text: str) -> list[str]:
|
|
66
|
+
"""Heuristically extract subcommand names from help text."""
|
|
67
|
+
lines = help_text.splitlines()
|
|
68
|
+
|
|
69
|
+
# Strategy 1: Named section headers (Commands:, Available Commands:, Subcommands:)
|
|
70
|
+
in_commands_section = False
|
|
71
|
+
section_subcommands: list[str] = []
|
|
72
|
+
skip_words = {"usage", "options", "arguments", "flags", "help", "version", "global", "topics"}
|
|
73
|
+
|
|
74
|
+
for line in lines:
|
|
75
|
+
stripped = line.strip()
|
|
76
|
+
if re.match(
|
|
77
|
+
r"^(commands?|subcommands?|available commands?|sub-commands?)\s*:?\s*$",
|
|
78
|
+
stripped,
|
|
79
|
+
re.IGNORECASE,
|
|
80
|
+
):
|
|
81
|
+
in_commands_section = True
|
|
82
|
+
continue
|
|
83
|
+
|
|
84
|
+
if in_commands_section:
|
|
85
|
+
if not stripped:
|
|
86
|
+
if section_subcommands:
|
|
87
|
+
break
|
|
88
|
+
continue
|
|
89
|
+
# New section header ends the commands section
|
|
90
|
+
if line and not line[0].isspace() and re.search(r":\s*$", stripped):
|
|
91
|
+
break
|
|
92
|
+
match = re.match(r"^\s{1,8}([a-z][\w-]{0,30})\b", line)
|
|
93
|
+
if match:
|
|
94
|
+
candidate = match.group(1)
|
|
95
|
+
if candidate.lower() not in skip_words:
|
|
96
|
+
section_subcommands.append(candidate)
|
|
97
|
+
|
|
98
|
+
if section_subcommands:
|
|
99
|
+
return section_subcommands[:8]
|
|
100
|
+
|
|
101
|
+
# Strategy 2: Indented "word Description" lines (git-style)
|
|
102
|
+
pattern_subcommands: list[str] = []
|
|
103
|
+
for line in lines:
|
|
104
|
+
match = re.match(r"^\s{2,8}([a-z][\w-]{1,20})\s{2,}[A-Za-z]", line)
|
|
105
|
+
if match:
|
|
106
|
+
candidate = match.group(1)
|
|
107
|
+
if candidate.lower() not in skip_words:
|
|
108
|
+
pattern_subcommands.append(candidate)
|
|
109
|
+
|
|
110
|
+
return pattern_subcommands[:8]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def probe(tool_name: str) -> ToolProbe:
    """Run every probe against *tool_name* and collect the results.

    Probes run in this order: help (``--help``, then ``-h``), version
    (``--version``, then ``-V``), no arguments, a deliberately unknown flag,
    four common JSON-output flag spellings, and finally ``--help`` on up to
    three subcommands discovered in the help text.

    Args:
        tool_name: Tool name or path, resolved with ``shutil.which``.

    Returns:
        A ToolProbe. If the tool cannot be resolved, ``tool_path`` is None and
        no probe is executed.
    """
    resolved = shutil.which(tool_name)
    outcome = ToolProbe(tool_name=tool_name, tool_path=resolved)
    if resolved is None:
        return outcome  # tool not found; all results stay at their defaults

    def _with_fallback(primary: str, fallback: str) -> RunResult:
        # Try the short spelling only when the long one ran but exited non-zero,
        # and adopt it only if it succeeds.
        first = _run([resolved, primary])
        if first.exit_code == 0 or first.error:
            return first
        second = _run([resolved, fallback])
        return second if second.exit_code == 0 else first

    outcome.help_result = _with_fallback("--help", "-h")
    outcome.version_result = _with_fallback("--version", "-V")
    outcome.no_args_result = _run([resolved])

    # A flag no real tool defines, to observe how unknown options are handled.
    outcome.bad_args_result = _run([resolved, "--xxxxclitictest-unknown"])

    # Common ways of requesting JSON output; runs that could not execute are dropped.
    json_spellings = (["--json"], ["--output", "json"], ["-o", "json"], ["--format", "json"])
    attempts = (_run([resolved, *spelling]) for spelling in json_spellings)
    outcome.json_results.extend(run for run in attempts if not run.error)

    # Subcommand discovery reads whatever the help probe printed on either stream.
    printed = (outcome.help_result.stdout or "") + (outcome.help_result.stderr or "")
    outcome.subcommand_names = _extract_subcommands(printed)

    for name in outcome.subcommand_names[:3]:
        outcome.subcommand_help_results.append((name, _run([resolved, name, "--help"])))

    return outcome
|
clitic/cli.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
from typing_extensions import Annotated
|
|
7
|
+
|
|
8
|
+
from .analyzer import probe
|
|
9
|
+
from .scorer import run
|
|
10
|
+
from . import report as reporter
|
|
11
|
+
|
|
12
|
+
# Typer application object that the `score` and `demo` commands below register on.
clitic_app = typer.Typer(
    name="clitic",
    help="Score a CLI tool for AI-agent readiness — the CLI Intelligence & Compliance Tester.",
    add_completion=False,
)

# Tool that the `demo` command scores.
_DEMO_TOOL = "git"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _score_tool(tool_name: str, as_json: bool) -> None:
    """Probe, score and report on *tool_name*.

    Args:
        tool_name: Tool name or path to analyse.
        as_json: When True, probe directly and print JSON; otherwise the probe
            is run through ``reporter.animate_probe`` and a report is printed.

    Raises:
        typer.Exit: code 1 when the tool cannot be found, code 2 when the
            overall score is below 60.
    """
    if as_json:
        probed = probe(tool_name)
    else:
        # NOTE(review): animate_probe presumably calls probe(tool_name) while
        # showing progress and returns its result — confirm in report.py.
        probed = reporter.animate_probe(tool_name, probe, tool_name)

    unavailable = probed is None or probed.tool_path is None
    if unavailable:
        typer.echo(f"Error: '{tool_name}' not found in PATH", err=True)
        raise typer.Exit(1)

    scored = run(probed)
    emit = reporter.print_json if as_json else reporter.print_report
    emit(scored)

    # A failing grade is signalled through the exit status as well.
    if scored.overall_score < 60:
        raise typer.Exit(2)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@clitic_app.command()
def score(
    tool: Annotated[str, typer.Argument(help="CLI tool name or path (e.g. 'git', 'gh', 'curl')")],
    json: Annotated[bool, typer.Option("--json", help="Output results as JSON")] = False,
) -> None:
    """Score a CLI tool for AI-agent readiness across 5 dimensions."""
    # NOTE(review): the docstring above is presumably shown by Typer as the
    # command's help text, so treat it as user-facing output.
    # Probing, scoring, reporting and exit status are all handled by _score_tool.
    _score_tool(tool, json)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@clitic_app.command()
def demo(
    json: Annotated[bool, typer.Option("--json", help="Output results as JSON")] = False,
) -> None:
    """Score 'git' as a demo — see how a real-world CLI tool fares for AI agents."""
    # Check up front so a missing demo tool gets an install hint instead of the
    # generic "not found in PATH" message printed by _score_tool.
    if shutil.which(_DEMO_TOOL) is None:
        typer.echo(f"Error: '{_DEMO_TOOL}' not found in PATH — install git to run the demo.", err=True)
        raise typer.Exit(1)
    _score_tool(_DEMO_TOOL, json)
|
|
File without changes
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from ..models import DimensionResult, Issue
|
|
6
|
+
from ..analyzer import ToolProbe
|
|
7
|
+
|
|
8
|
+
# Label of this scoring dimension; score() passes it as ``name`` to DimensionResult.
NAME = "Argument & Interface Design"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _strip_ansi(text: str) -> str:
|
|
12
|
+
return re.sub(r"\x1b\[[0-9;]*m", "", text)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def score(probe: ToolProbe) -> DimensionResult:
    """Rate the tool's flag conventions from its help output.

    Up to 100 points are awarded across five checks: presence of ``--long``
    flags (25), absence of camelCase flags (20), presence of the conventional
    verbose/help/quiet flags (25, pro rata), availability of both short and
    long forms (15) and any mention of configuration or environment (15).

    Args:
        probe: Probe results; only ``help_result`` is read.

    Returns:
        A DimensionResult with the score rounded to one decimal place and the
        issues found. Score is 0 with a single error when no help probe exists.
    """
    help_run = probe.help_result
    if help_run is None:
        not_found = Issue(
            severity="error",
            message="Tool not found — cannot assess argument design",
            location="tool",
        )
        return DimensionResult(name=NAME, score=0, issues=[not_found])

    findings: list[Issue] = []

    def report(severity: str, message: str, location: str = "flags") -> None:
        findings.append(Issue(severity=severity, message=message, location=location))

    raw_help = _strip_ansi((help_run.stdout or "") + (help_run.stderr or ""))
    lowered = raw_help.lower()
    long_flags = re.findall(r"--[\w-]+", raw_help)
    short_flags = re.findall(r"(?<!\-)(?<!\w)-[a-zA-Z]\b", raw_help)
    points = 0.0

    # 1. GNU-style long flags (--flag) present — 25 pts
    long_count = len(long_flags)
    if long_count == 0:
        report("warning", "No GNU-style long flags (--flag) detected — agents benefit from descriptive flag names")
    elif long_count == 1:
        points += 12
        report("warning", "Only one long flag detected — prefer GNU-style --flags for agent-readable invocations")
    else:
        points += 25

    # 2. Flags are kebab-case, not camelCase — 20 pts
    camel_case = [flag for flag in long_flags if re.search(r"--[a-z]+[A-Z]", flag)]
    if camel_case:
        report(
            "warning",
            f"camelCase flags detected ({', '.join(camel_case[:3])}) — prefer kebab-case (--my-flag) by convention",
        )
    else:
        points += 20

    # 3. Standard flags present (--verbose, --help, --quiet) — 25 pts
    conventional = {
        "--verbose / -v": ["--verbose", "-v ", "--debug"],
        "--help / -h": ["--help", "-h "],
        "--quiet / -q": ["--quiet", "-q ", "--silent"],
    }
    absent = [
        label
        for label, spellings in conventional.items()
        if not any(spelling in lowered for spelling in spellings)
    ]
    present = len(conventional) - len(absent)
    points += (present / len(conventional)) * 25
    if present < 2:
        report("info", f"Missing standard flags: {', '.join(absent)} — standard flags aid agent discoverability")

    # 4. Both short and long flag forms — 15 pts
    if long_flags:
        if short_flags:
            points += 15
        else:
            points += 8
            report(
                "info",
                "No short flag aliases (-x) detected — short aliases improve usability in agent-generated commands",
            )

    # 5. Config / environment variable documentation — 15 pts
    config_hints = ("config", "env", "environment", ".env", "configuration", "settings", "$ ")
    if not any(hint in lowered for hint in config_hints):
        report(
            "info",
            "No mention of config files or environment variables — agents benefit from knowing all configuration methods",
            location="--help output",
        )
    else:
        points += 15

    return DimensionResult(name=NAME, score=round(min(points, 100), 1), issues=findings)
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from ..models import DimensionResult, Issue
|
|
6
|
+
from ..analyzer import ToolProbe
|
|
7
|
+
|
|
8
|
+
# Label of this scoring dimension; score() passes it as ``name`` to DimensionResult.
NAME = "Help & Discoverability"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def score(probe: ToolProbe) -> DimensionResult:
    """Rate how well the tool can be explored through its own help.

    Up to 100 points across five checks: ``--help`` works (30), the help text
    is substantial (20), subcommands are listed (20), ``--version`` works (15)
    and subcommand ``--help`` works (15, pro rata; full credit when there was
    nothing to test).

    Args:
        probe: Probe results; reads ``help_result``, ``version_result``,
            ``subcommand_names`` and ``subcommand_help_results``.

    Returns:
        A DimensionResult with the score rounded to one decimal place and the
        issues found. Score is 0 with a single error when no help probe exists.
    """
    findings: list[Issue] = []

    def report(severity: str, message: str, location: str) -> None:
        findings.append(Issue(severity=severity, message=message, location=location))

    help_run = probe.help_result
    if help_run is None:
        report("error", "Tool not found in PATH", "tool")
        return DimensionResult(name=NAME, score=0, issues=findings)

    printed = (help_run.stdout or "") + (help_run.stderr or "")
    visible = re.sub(r"\x1b\[[0-9;]*m", "", printed).strip()
    visible_len = len(visible)
    points = 0.0

    # 1. --help works (exit 0) — 30 pts
    if help_run.timed_out:
        report("error", "--help timed out — tool hangs on --help, agents cannot introspect it", "--help")
    elif help_run.exit_code == 0:
        points += 30
    elif len(printed.strip()) > 50:
        points += 15  # help text exists but exit code is wrong
        report(
            "warning",
            f"--help exits {help_run.exit_code} instead of 0 — agents may interpret this as failure",
            "--help",
        )
    else:
        report("error", "--help produced no useful output or failed entirely", "--help")

    # 2. Help text is substantial — 20 pts
    if visible_len >= 100:
        points += 20
    elif visible_len >= 40:
        points += 10
        report(
            "warning",
            f"Help text is very short ({visible_len} chars) — agents need context to invoke the tool correctly",
            "--help output",
        )
    else:
        report("error", "Help text is minimal or absent", "--help output")

    # 3. Subcommands listed — 20 pts
    if probe.subcommand_names:
        points += 20
    elif visible_len > 200:
        points += 10  # likely a focused single-command tool
        report(
            "info",
            "No subcommands detected — if this is a multi-command tool, explicit subcommand listing helps agents discover capabilities",
            "--help output",
        )
    else:
        report("info", "No subcommands detected in help output", "--help output")

    # 4. --version works — 15 pts
    version_run = probe.version_result
    if version_run and version_run.exit_code == 0 and (version_run.stdout or version_run.stderr).strip():
        points += 15
    elif version_run and not version_run.error:
        report(
            "warning",
            "--version flag missing or non-functional — agents cannot verify tool version for compatibility checks",
            "--version",
        )

    # 5. Subcommand --help works — 15 pts
    sub_runs = probe.subcommand_help_results
    if not sub_runs:
        points += 15  # no subcommands to test — not penalized
    else:
        tested = len(sub_runs)
        responsive = sum(1 for _, sub_run in sub_runs if sub_run.exit_code == 0)
        share = responsive / tested
        points += 15 * share
        if share < 1.0:
            report(
                "warning",
                f"{tested - responsive}/{tested} subcommands don't respond to --help",
                "<subcommand> --help",
            )

    return DimensionResult(name=NAME, score=round(min(points, 100), 1), issues=findings)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from ..models import DimensionResult, Issue
|
|
6
|
+
from ..analyzer import ToolProbe
|
|
7
|
+
|
|
8
|
+
# Label of this scoring dimension; score() passes it as ``name`` to DimensionResult.
NAME = "Error Handling"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _strip_ansi(text: str) -> str:
|
|
12
|
+
return re.sub(r"\x1b\[[0-9;]*m", "", text)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _looks_like_traceback(text: str) -> bool:
|
|
16
|
+
patterns = [
|
|
17
|
+
r"Traceback \(most recent call last\)",
|
|
18
|
+
r'^\s+File ".*", line \d+',
|
|
19
|
+
r"^\s+at \w+[\.\w]+ \(", # JS/Java
|
|
20
|
+
r"\w+Error:",
|
|
21
|
+
r"Exception in thread",
|
|
22
|
+
]
|
|
23
|
+
return any(re.search(p, text, re.MULTILINE) for p in patterns)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _error_is_informative(text: str) -> bool:
    """Report whether an error message gives the caller something to act on.

    After removing ANSI codes, lower-casing and trimming, the message must be
    at least 20 characters long and contain one of a fixed set of hint
    substrings (for example "usage", "--help", "unknown", "did you mean").
    """
    normalized = _strip_ansi(text).lower().strip()
    if len(normalized) < 20:
        return False

    hints = (
        "usage", "try", "see", "run", "--help", "expected", "unknown",
        "unrecognized", "invalid", "error:", "did you mean",
    )
    for hint in hints:
        if hint in normalized:
            return True
    return False
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def score(probe: ToolProbe) -> DimensionResult:
    """Rate how the tool reacts to an unknown flag.

    Up to 100 points across three checks on the bad-args probe: the error is
    written to stderr (30), the message is informative (40) and no stack
    trace is exposed (30).

    Args:
        probe: Probe results; only ``bad_args_result`` is read.

    Returns:
        A DimensionResult with the score rounded to one decimal place and the
        issues found. When the probe is missing or could not run, a neutral
        score of 50 with a single info issue is returned.
    """
    bad_run = probe.bad_args_result
    if bad_run is None or bad_run.error:
        unavailable = Issue(
            severity="info",
            message="Could not probe error handling — tool unavailable",
            location="error probe",
        )
        return DimensionResult(name=NAME, score=50, issues=[unavailable])

    findings: list[Issue] = []

    def report(severity: str, message: str, location: str) -> None:
        findings.append(Issue(severity=severity, message=message, location=location))

    out_text = _strip_ansi(bad_run.stdout or "")
    err_text = _strip_ansi(bad_run.stderr or "")
    everything = out_text + err_text
    points = 0.0

    # 1. Error output goes to stderr — 30 pts
    if bad_run.exit_code != 0:
        if err_text.strip():
            points += 30
        elif out_text.strip():
            points += 15
            report(
                "warning",
                "Error message printed to stdout instead of stderr — agents parsing stdout may misinterpret it as valid output",
                "stderr",
            )
        else:
            report("warning", "Bad args produced no error output at all", "stderr")

    # 2. Error message is informative — 40 pts
    if _error_is_informative(everything):
        points += 40
    elif everything.strip():
        points += 20
        report(
            "warning",
            "Error message for unknown flag is minimal — agents need clear guidance on what went wrong",
            "error message",
        )
    else:
        report(
            "error",
            "No error message for invalid flag — agents cannot diagnose the failure",
            "error message",
        )

    # 3. No stack trace exposed — 30 pts
    if _looks_like_traceback(everything):
        report(
            "error",
            "Stack trace exposed on invalid input — leaks internals and confuses agents parsing errors",
            "stderr",
        )
    else:
        points += 30

    return DimensionResult(name=NAME, score=round(min(points, 100), 1), issues=findings)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from ..models import DimensionResult, Issue
|
|
4
|
+
from ..analyzer import ToolProbe
|
|
5
|
+
|
|
6
|
+
# Label of this scoring dimension; score() passes it as ``name`` to DimensionResult.
NAME = "Exit Code Semantics"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def score(probe: ToolProbe) -> DimensionResult:
    """Rate whether the tool's exit codes carry reliable meaning.

    Up to 100 points across four checks: ``--help`` exits 0 (25),
    ``--version`` exits 0 (20, or 10 when the version probe did not
    complete), an unknown flag exits non-zero (35) and running with no
    arguments behaves deliberately (20).

    Args:
        probe: Probe results; reads ``help_result``, ``version_result``,
            ``bad_args_result`` and ``no_args_result``.

    Returns:
        A DimensionResult with the score rounded to one decimal place and the
        issues found.
    """
    findings: list[Issue] = []
    points = 0.0

    def completed(run) -> bool:
        # A probe counts only if it exists, did not time out and actually ran.
        return bool(run) and not run.timed_out and not run.error

    def report(severity: str, message: str, location: str) -> None:
        findings.append(Issue(severity=severity, message=message, location=location))

    # 1. --help exits 0 — 25 pts
    help_run = probe.help_result
    if completed(help_run):
        if help_run.exit_code != 0:
            report(
                "warning",
                f"--help exits {help_run.exit_code} instead of 0 — agents interpret non-zero as failure",
                "--help exit code",
            )
        else:
            points += 25

    # 2. --version exits 0 — 20 pts
    version_run = probe.version_result
    if not completed(version_run):
        points += 10  # --version absent — partial credit, not a fatal flaw
    elif version_run.exit_code == 0:
        points += 20
    else:
        report("warning", f"--version exits {version_run.exit_code} instead of 0", "--version exit code")

    # 3. Unknown flag exits non-zero — 35 pts
    bad_run = probe.bad_args_result
    if completed(bad_run):
        if bad_run.exit_code == 0:
            report(
                "error",
                "Unknown flag '--xxxxclitictest-unknown' returned exit 0 — agents cannot detect invalid invocations",
                "bad args exit code",
            )
        else:
            points += 35

    # 4. No-args behavior is intentional — 20 pts
    bare_run = probe.no_args_result
    if completed(bare_run):
        printed = ((bare_run.stdout or "") + (bare_run.stderr or "")).strip()
        if printed:
            points += 20  # produces output (help or usage error) — intentional
        elif bare_run.exit_code == 0:
            points += 10
            report(
                "info",
                "Running with no args produces no output and exits 0 — consider showing usage or help",
                "no-args exit code",
            )
        else:
            report(
                "warning",
                "No-args invocation is silent with non-zero exit — agents cannot tell what went wrong",
                "no-args exit code",
            )

    return DimensionResult(name=NAME, score=round(min(points, 100), 1), issues=findings)
|