pytest-skill-engineering 0.6.4__tar.gz → 0.6.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/PKG-INFO +5 -4
  2. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/README.md +3 -3
  3. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/pyproject.toml +3 -2
  4. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/judge.py +1 -1
  5. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/runner.py +1 -1
  6. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/evals.py +1 -1
  7. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/plugin.py +1 -1
  8. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/clarification.py +1 -1
  9. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/rate_limiter.py +2 -2
  10. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/factories.py +1 -1
  11. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/llm_assert.py +6 -4
  12. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/llm_assert_image.py +7 -4
  13. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/llm_score.py +9 -4
  14. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/plugin.py +3 -2
  15. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/plugin_options.py +3 -3
  16. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/plugin_report.py +3 -2
  17. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/prompts/ai_summary.md +3 -3
  18. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/prompts/coding_agent_analysis.md +2 -2
  19. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/insights.py +2 -2
  20. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/.gitignore +0 -0
  21. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/LICENSE +0 -0
  22. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/__init__.py +0 -0
  23. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/cli.py +0 -0
  24. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/__init__.py +0 -0
  25. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/config.py +0 -0
  26. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/eval.py +0 -0
  27. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/evals.py +0 -0
  28. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/events.py +0 -0
  29. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/fixtures.py +0 -0
  30. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/personas.py +0 -0
  31. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/result.py +0 -0
  32. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/__init__.py +0 -0
  33. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/errors.py +0 -0
  34. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/prompt.py +0 -0
  35. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/result.py +0 -0
  36. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/scoring.py +0 -0
  37. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/serialization.py +0 -0
  38. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill.py +0 -0
  39. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_benchmark.py +0 -0
  40. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_eval_results.py +0 -0
  41. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_evals.py +0 -0
  42. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_grading.py +0 -0
  43. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_refiner.py +0 -0
  44. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/__init__.py +0 -0
  45. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/cost.py +0 -0
  46. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/servers.py +0 -0
  47. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/skill_tools.py +0 -0
  48. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/__init__.py +0 -0
  49. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/iteration.py +0 -0
  50. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/skill_benchmark.py +0 -0
  51. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/skill_eval.py +0 -0
  52. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/skill_refine.py +0 -0
  53. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/hooks.py +0 -0
  54. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/plugin_recording.py +0 -0
  55. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/prompts/__init__.py +0 -0
  56. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/__init__.py +0 -0
  57. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/collector.py +0 -0
  58. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/__init__.py +0 -0
  59. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/agent_leaderboard.py +0 -0
  60. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/agent_selector.py +0 -0
  61. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/overlay.py +0 -0
  62. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/report.py +0 -0
  63. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/test_comparison.py +0 -0
  64. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/test_grid.py +0 -0
  65. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/types.py +0 -0
  66. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/generator.py +0 -0
  67. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/markdown.py +0 -0
  68. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/templates/partials/report.css +0 -0
  69. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/templates/partials/scripts.js +0 -0
  70. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/__init__.py +0 -0
  71. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/banking.py +0 -0
  72. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/banking_mcp.py +0 -0
  73. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/todo.py +0 -0
  74. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/todo_mcp.py +0 -0
  75. {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pytest-skill-engineering
3
- Version: 0.6.4
3
+ Version: 0.6.6
4
4
  Summary: The testing framework for skill engineering. Test tool descriptions, prompt templates, agent skills, and custom agents with real LLMs. AI analyzes results and tells you what to fix.
5
5
  Project-URL: Homepage, https://github.com/sbroenne/pytest-skill-engineering
6
6
  Project-URL: Repository, https://github.com/sbroenne/pytest-skill-engineering
@@ -28,6 +28,7 @@ Requires-Dist: nh3>=0.3.3
28
28
  Requires-Dist: pydantic>=2.0
29
29
  Requires-Dist: pytest>=9.0
30
30
  Requires-Dist: python-frontmatter>=1.1.0
31
+ Requires-Dist: trio>=0.33.0
31
32
  Provides-Extra: dev
32
33
  Requires-Dist: pre-commit>=4.5; extra == 'dev'
33
34
  Requires-Dist: pyright>=1.1.408; extra == 'dev'
@@ -62,7 +63,7 @@ Test MCP servers, CLI tools, Agent Skills, and custom agents using the **real Gi
62
63
  Your MCP server passes all unit tests. Then a user tries it in GitHub Copilot and:
63
64
 
64
65
  - Copilot picks the wrong tool
65
- - Passes garbage parameters
66
+ - Passes garbage parameters
66
67
  - Can't recover from errors
67
68
  - Ignores your skill's instructions
68
69
 
@@ -97,7 +98,7 @@ async def test_balance_query(copilot_eval):
97
98
  max_turns=10,
98
99
  )
99
100
  result = await copilot_eval(agent, "What's my checking balance?")
100
-
101
+
101
102
  assert result.success
102
103
  assert result.tool_was_called("get_balance")
103
104
  ```
@@ -148,7 +149,7 @@ The AI-powered report needs a model to generate insights. Configure it in `pypro
148
149
 
149
150
  ```toml
150
151
  [tool.pytest.ini_options]
151
- addopts = "--aitest-summary-model=copilot/gpt-5-mini"
152
+ addopts = "--aitest-summary-model=copilot/gpt-5.4-mini"
152
153
  ```
153
154
 
154
155
  You can also use Azure OpenAI or other providers if you prefer — see [Configuration](https://sbroenne.github.io/pytest-skill-engineering/reference/configuration/).
@@ -14,7 +14,7 @@ Test MCP servers, CLI tools, Agent Skills, and custom agents using the **real Gi
14
14
  Your MCP server passes all unit tests. Then a user tries it in GitHub Copilot and:
15
15
 
16
16
  - Copilot picks the wrong tool
17
- - Passes garbage parameters
17
+ - Passes garbage parameters
18
18
  - Can't recover from errors
19
19
  - Ignores your skill's instructions
20
20
 
@@ -49,7 +49,7 @@ async def test_balance_query(copilot_eval):
49
49
  max_turns=10,
50
50
  )
51
51
  result = await copilot_eval(agent, "What's my checking balance?")
52
-
52
+
53
53
  assert result.success
54
54
  assert result.tool_was_called("get_balance")
55
55
  ```
@@ -100,7 +100,7 @@ The AI-powered report needs a model to generate insights. Configure it in `pypro
100
100
 
101
101
  ```toml
102
102
  [tool.pytest.ini_options]
103
- addopts = "--aitest-summary-model=copilot/gpt-5-mini"
103
+ addopts = "--aitest-summary-model=copilot/gpt-5.4-mini"
104
104
  ```
105
105
 
106
106
  You can also use Azure OpenAI or other providers if you prefer — see [Configuration](https://sbroenne.github.io/pytest-skill-engineering/reference/configuration/).
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pytest-skill-engineering"
7
- version = "0.6.4"
7
+ version = "0.6.6"
8
8
  description = "The testing framework for skill engineering. Test tool descriptions, prompt templates, agent skills, and custom agents with real LLMs. AI analyzes results and tells you what to fix."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -35,6 +35,7 @@ dependencies = [
35
35
  "python-frontmatter>=1.1.0",
36
36
  "nh3>=0.3.3",
37
37
  "github-copilot-sdk>=0.2.2",
38
+ "trio>=0.33.0",
38
39
  ]
39
40
 
40
41
  [project.optional-dependencies]
@@ -119,7 +120,7 @@ filterwarnings = [
119
120
  # This demonstrates the recommended setup - configure once in pyproject.toml.
120
121
  # LLM auth is handled by the GitHub Copilot SDK (gh auth login or GITHUB_TOKEN)
121
122
  addopts = """
122
- --aitest-summary-model=copilot/gpt-5.4
123
+ --aitest-summary-model=copilot/gpt-5.5
123
124
  --aitest-html=aitest-reports/report.html
124
125
  """
125
126
  markers = [
@@ -39,7 +39,7 @@ def _approve_all_permissions(*_args: Any, **_kwargs: Any) -> Any:
39
39
  """Approve all permission requests using the current SDK result type."""
40
40
  from copilot.session import PermissionRequestResult # noqa: PLC0415
41
41
 
42
- return PermissionRequestResult(kind="approved")
42
+ return PermissionRequestResult(kind="approve-once")
43
43
 
44
44
 
45
45
  def _get_data_field(event: Any, field: str, default: Any = None) -> Any:
@@ -115,7 +115,7 @@ def _approve_all_permissions(*_args: Any, **_kwargs: Any) -> Any:
115
115
  """Approve all permission requests using the current SDK result type."""
116
116
  from copilot.session import PermissionRequestResult
117
117
 
118
- return PermissionRequestResult(kind="approved")
118
+ return PermissionRequestResult(kind="approve-once")
119
119
 
120
120
 
121
121
  def _is_transient_error(error: str | None) -> bool:
@@ -38,7 +38,7 @@ Example usage::
38
38
  from pytest_skill_engineering import Eval, Provider
39
39
  agent = Eval.from_agent_file(
40
40
  ".github/agents/reviewer.agent.md",
41
- provider=Provider(model="azure/gpt-5-mini"),
41
+ provider=Provider(model="azure/gpt-5.4-mini"),
42
42
  )
43
43
 
44
44
  # Use with CopilotEval
@@ -23,7 +23,7 @@ Example::
23
23
  from pytest_skill_engineering import Eval, Provider
24
24
  agent = Eval.from_plugin(
25
25
  "my-plugin/",
26
- provider=Provider(model="azure/gpt-5-mini"),
26
+ provider=Provider(model="azure/gpt-5.4-mini"),
27
27
  )
28
28
  """
29
29
 
@@ -38,7 +38,7 @@ async def check_clarification(
38
38
 
39
39
  Args:
40
40
  response_text: The agent's final response text to classify.
41
- judge_model: Model string (e.g. "gpt-5-mini", "claude-sonnet-4").
41
+ judge_model: Model string (e.g. "gpt-5.4-mini", "claude-sonnet-4").
42
42
  timeout_seconds: Timeout for the judge LLM call.
43
43
 
44
44
  Returns:
@@ -5,7 +5,7 @@ tokens per minute (tpm). Rate limiters are shared across all engine instances
5
5
  using the same model, so concurrent tests respect deployment limits.
6
6
 
7
7
  Usage:
8
- limiter = get_rate_limiter("azure/gpt-5-mini", rpm=10, tpm=10000)
8
+ limiter = get_rate_limiter("azure/gpt-5.4-mini", rpm=10, tpm=10000)
9
9
  await limiter.acquire() # Waits if rate limit would be exceeded
10
10
  # ... make API call ...
11
11
  limiter.record_tokens(1500) # Track token usage for tpm enforcement
@@ -38,7 +38,7 @@ def get_rate_limiter(
38
38
  restrictive limits (minimum of old and new values).
39
39
 
40
40
  Args:
41
- model: Model identifier string (e.g. "azure/gpt-5-mini").
41
+ model: Model identifier string (e.g. "azure/gpt-5.4-mini").
42
42
  rpm: Requests per minute limit.
43
43
  tpm: Tokens per minute limit.
44
44
 
@@ -18,7 +18,7 @@ def skill_factory() -> Callable[[Path | str], Skill]:
18
18
  def test_with_skill(skill_factory, eval_run):
19
19
  skill = skill_factory("path/to/my-skill")
20
20
  agent = Eval(
21
- provider=Provider(model="azure/gpt-5-mini"),
21
+ provider=Provider(model="azure/gpt-5.4-mini"),
22
22
  skill=skill,
23
23
  )
24
24
  result = await eval_run(agent, "Do something with the skill")
@@ -12,7 +12,7 @@ from dataclasses import dataclass
12
12
 
13
13
  import pytest
14
14
 
15
- _LLM_MODEL_DEFAULT = "copilot/gpt-5-mini"
15
+ _LLM_MODEL_DEFAULT = "copilot/gpt-5.4-mini"
16
16
 
17
17
 
18
18
  @dataclass(slots=True)
@@ -111,17 +111,19 @@ def llm_assert(request: pytest.FixtureRequest) -> LLMAssert:
111
111
  The judge model is resolved in this order:
112
112
  1. ``--llm-model`` if explicitly set
113
113
  2. ``--aitest-summary-model`` (same model for analysis and assertions)
114
- 3. ``copilot/gpt-5-mini`` as final fallback
114
+ 3. ``copilot/gpt-5.4-mini`` as final fallback
115
115
 
116
116
  Example::
117
117
 
118
118
  def test_response(llm_assert):
119
119
  assert llm_assert("Your balance is $1,500", "mentions a dollar amount")
120
120
  """
121
- model_str: str = request.config.getoption("--llm-model")
121
+ model_str = request.config.getoption("--llm-model")
122
+ if not isinstance(model_str, str):
123
+ model_str = _LLM_MODEL_DEFAULT
122
124
  if model_str == _LLM_MODEL_DEFAULT:
123
125
  # Not explicitly set — fall back to summary model if available
124
126
  summary_model = request.config.getoption("--aitest-summary-model", default=None)
125
- if summary_model:
127
+ if isinstance(summary_model, str) and summary_model:
126
128
  model_str = summary_model
127
129
  return LLMAssert(model=model_str)
@@ -74,7 +74,7 @@ def llm_assert_image(request: pytest.FixtureRequest) -> LLMAssertImage:
74
74
  1. ``--llm-vision-model`` if explicitly set
75
75
  2. ``--llm-model`` (same model for text and image assertions)
76
76
  3. ``--aitest-summary-model``
77
- 4. ``copilot/gpt-5-mini`` as final fallback
77
+ 4. ``copilot/gpt-5.4-mini`` as final fallback
78
78
 
79
79
  NOTE: This fixture currently raises NotImplementedError when called,
80
80
  as the Copilot SDK does not yet support image inputs in a documented way.
@@ -86,19 +86,22 @@ def llm_assert_image(request: pytest.FixtureRequest) -> LLMAssertImage:
86
86
  screenshots = result.tool_images_for("screenshot")
87
87
  assert llm_assert_image(screenshots[-1], "shows a bar chart")
88
88
  """
89
- _LLM_MODEL_DEFAULT = "copilot/gpt-5-mini" # noqa: N806
89
+ _LLM_MODEL_DEFAULT = "copilot/gpt-5.4-mini" # noqa: N806
90
90
 
91
91
  # Try vision-specific model first
92
- vision_model_str: str | None = request.config.getoption("--llm-vision-model", default=None)
92
+ vision_model_option = request.config.getoption("--llm-vision-model", default=None)
93
+ vision_model_str = vision_model_option if isinstance(vision_model_option, str) else None
93
94
 
94
95
  if vision_model_str:
95
96
  model_str = vision_model_str
96
97
  else:
97
98
  # Fall back to llm-model → summary model → default
98
99
  model_str = request.config.getoption("--llm-model")
100
+ if not isinstance(model_str, str):
101
+ model_str = _LLM_MODEL_DEFAULT
99
102
  if model_str == _LLM_MODEL_DEFAULT:
100
103
  summary_model = request.config.getoption("--aitest-summary-model", default=None)
101
- if summary_model:
104
+ if isinstance(summary_model, str) and summary_model:
102
105
  model_str = summary_model
103
106
 
104
107
  return LLMAssertImage(model=model_str)
@@ -20,9 +20,12 @@ import pytest
20
20
  from pytest_skill_engineering.core.scoring import (
21
21
  ScoreResult,
22
22
  ScoringDimension,
23
+ assert_score,
23
24
  )
24
25
 
25
- _LLM_MODEL_DEFAULT = "copilot/gpt-5-mini"
26
+ _LLM_MODEL_DEFAULT = "copilot/gpt-5.4-mini"
27
+
28
+ __all__ = ["LLMScore", "ScoreResult", "ScoringDimension", "assert_score", "llm_score"]
26
29
 
27
30
 
28
31
  # ---------------------------------------------------------------------------
@@ -240,7 +243,7 @@ def llm_score(request: pytest.FixtureRequest) -> LLMScore:
240
243
 
241
244
  1. ``--llm-model`` if explicitly set
242
245
  2. ``--aitest-summary-model`` (shared analysis model)
243
- 3. ``copilot/gpt-5-mini`` as final fallback
246
+ 3. ``copilot/gpt-5.4-mini`` as final fallback
244
247
 
245
248
  Example::
246
249
 
@@ -254,9 +257,11 @@ def llm_score(request: pytest.FixtureRequest) -> LLMScore:
254
257
  result = llm_score(my_text, rubric)
255
258
  assert_score(result, min_total=7)
256
259
  """
257
- model_str: str = request.config.getoption("--llm-model")
260
+ model_str = request.config.getoption("--llm-model")
261
+ if not isinstance(model_str, str):
262
+ model_str = _LLM_MODEL_DEFAULT
258
263
  if model_str == _LLM_MODEL_DEFAULT:
259
264
  summary_model = request.config.getoption("--aitest-summary-model", default=None)
260
- if summary_model:
265
+ if isinstance(summary_model, str) and summary_model:
261
266
  model_str = summary_model
262
267
  return LLMScore(model=model_str)
@@ -181,7 +181,8 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
181
181
  receives the parameter even though it does not declare the fixture
182
182
  explicitly.
183
183
  """
184
- count: int = metafunc.config.getoption("--aitest-iterations", default=1)
184
+ count_option = metafunc.config.getoption("--aitest-iterations", default=1)
185
+ count = count_option if isinstance(count_option, int) else 1
185
186
  if count <= 1:
186
187
  return
187
188
  metafunc.fixturenames.append("_aitest_iteration")
@@ -360,7 +361,7 @@ def _add_junit_properties(
360
361
  <testcase name="test_balance">
361
362
  <properties>
362
363
  <property name="aitest.agent.name" value="banking-agent"/>
363
- <property name="aitest.model" value="gpt-5-mini"/>
364
+ <property name="aitest.model" value="gpt-5.4-mini"/>
364
365
  <property name="aitest.skill" value="financial-advisor"/>
365
366
  <property name="aitest.tools.called" value="get_balance,transfer"/>
366
367
  </properties>
@@ -24,7 +24,7 @@ def add_aitest_options(group: OptionGroup) -> None:
24
24
  default=None,
25
25
  help=(
26
26
  "Model for AI analysis. Required when generating reports. "
27
- "Use the most capable model you can afford (e.g., gpt-5.1-chat, claude-opus-4)."
27
+ "Use the most capable model you can afford (e.g., gpt-5.5, claude-opus-4)."
28
28
  ),
29
29
  )
30
30
 
@@ -107,10 +107,10 @@ def add_aitest_options(group: OptionGroup) -> None:
107
107
  # LLM judge model for llm_assert fixture
108
108
  group.addoption(
109
109
  "--llm-model",
110
- default="copilot/gpt-5-mini",
110
+ default="copilot/gpt-5.4-mini",
111
111
  help=(
112
112
  "Model for llm_assert semantic assertions. "
113
- "Defaults to --aitest-summary-model if set, otherwise copilot/gpt-5-mini."
113
+ "Defaults to --aitest-summary-model if set, otherwise copilot/gpt-5.4-mini."
114
114
  ),
115
115
  )
116
116
 
@@ -107,7 +107,8 @@ def generate_structured_insights(
107
107
  from pytest_skill_engineering.reporting.insights import generate_insights
108
108
 
109
109
  # Require dedicated summary model - no fallback
110
- model = config.getoption("--aitest-summary-model")
110
+ model_option = config.getoption("--aitest-summary-model")
111
+ model = model_option if isinstance(model_option, str) else None
111
112
  if not model:
112
113
  if required:
113
114
  raise pytest.UsageError(
@@ -196,7 +197,7 @@ def generate_structured_insights(
196
197
  model=model,
197
198
  min_pass_rate=config.getoption("--aitest-min-pass-rate"),
198
199
  analysis_prompt=analysis_prompt,
199
- compact=config.getoption("--aitest-summary-compact"),
200
+ compact=config.getoption("--aitest-summary-compact") is True,
200
201
  )
201
202
 
202
203
  # Use asyncio.run() instead of deprecated get_event_loop().run_until_complete()
@@ -5,7 +5,7 @@ You are analyzing test results for **pytest-skill-engineering**, a skill enginee
5
5
  ## Key Concepts
6
6
 
7
7
  An **Eval** is a complete test configuration — the harness that exercises the skill stack:
8
- - **Model**: The LLM (e.g., `gpt-5-mini`, `gpt-4.1`)
8
+ - **Model**: The LLM (e.g., `gpt-5.4-mini`, `gpt-4.1`)
9
9
  - **MCP/CLI Servers**: The tools being tested (tool descriptions + schemas)
10
10
  - **MCP Prompt Templates**: Slash-command prompts bundled with MCP servers (e.g., `/mcp.servername.promptname`)
11
11
  - **Skill**: Optional domain knowledge injected into context
@@ -309,7 +309,7 @@ Use these sections as needed (skip sections with no content):
309
309
  - **Effective**: Eval followed instructions correctly
310
310
  - **Mixed**: Some tests passed, others showed confusion
311
311
  - **Ineffective**: Instructions ignored or misunderstood
312
- - **Model-specific effectiveness**: Instructions that fail with one model may succeed with another. If a variant was tested with multiple models (e.g., `gpt-5-mini + detailed` failed but `gpt-4.1 + detailed` passed), label it **mixed** — NOT ineffective. Only label instructions **ineffective** if they failed across ALL models tested. Always qualify: "ineffective with gpt-5-mini" rather than just "ineffective".
312
+ - **Model-specific effectiveness**: Instructions that fail with one model may succeed with another. If a variant was tested with multiple models (e.g., `gpt-5.4-mini + detailed` failed but `gpt-4.1 + detailed` passed), label it **mixed** — NOT ineffective. Only label instructions **ineffective** if they failed across ALL models tested. Always qualify: "ineffective with gpt-5.4-mini" rather than just "ineffective".
313
313
  - Note token bloat: "150 tokens of examples could be removed"
314
314
 
315
315
  ### Skill Feedback
@@ -372,7 +372,7 @@ Use these sections as needed (skip sections with no content):
372
372
  - **Gauge color values**: green=#4ade80, amber=#facc15, red=#f87171, blue=#60a5fa
373
373
  12. **Use pre-computed numbers** — The input includes a "Pre-computed Eval Statistics" section with exact values for pass rates, costs, tokens, winner designation, and aggregate stats (total tests, failures, agents, avg turns). Use these numbers verbatim. Never estimate or approximate.
374
374
  13. **Cost comparisons must use actual data** — When comparing costs between agents, use the **actual per-test cost** from the pre-computed statistics (total cost ÷ number of tests). Never cite model list pricing or theoretical cost differences. A cheaper model may use more tokens, making the realized cost difference much smaller than the per-token price difference. For example, if model A costs $0.0018/test and model B costs $0.0025/test, say "~28% cheaper" — NOT "85% cheaper" or "6× cheaper" based on list pricing.
375
- 14. **Instruction labels must be model-specific** — Never label custom agent instructions as globally "ineffective" or globally "effective" when tested with multiple models and produced different outcomes. If `gpt-5-mini + detailed` failed but `gpt-4.1 + detailed` passed, the instructions are "mixed" — effective with gpt-4.1, ineffective with gpt-5-mini. The same applies to the Optimizations section: do not say "restrict [instructions] usage" if they work correctly with some models.
375
+ 14. **Instruction labels must be model-specific** — Never label custom agent instructions as globally "ineffective" or globally "effective" when tested with multiple models and produced different outcomes. If `gpt-5.4-mini + detailed` failed but `gpt-4.1 + detailed` passed, the instructions are "mixed" — effective with gpt-4.1, ineffective with gpt-5.4-mini. The same applies to the Optimizations section: do not say "restrict [instructions] usage" if they work correctly with some models.
376
376
  15. **Bullet lists need a blank line before them** — In markdown, a list must be preceded by a blank line to render correctly. NEVER put a bullet list directly after a `**bold label:**` on the next line — the markdown parser will collapse them into a single paragraph. Use `####` headings instead of bold labels when you need a label followed by a list.
377
377
  16. **Iteration awareness** — When iteration data is present ("Iter Pass Rate" in Pre-computed Eval Statistics), factor consistency into your recommendation. An agent with 100% pass rate at 5/5 iterations is more reliable than one with 100% pass rate at 3/5 iterations. Flag tests with <100% iteration pass rate as **flaky** in your analysis. When no iteration data is present, skip all iteration-related analysis.
378
378
  17. **Score awareness** — When LLM score data is present (`LLM Score: X/Y (Z%)`), mention the weighted score in the Winner Card summary and note any dimensions below 70% in the analysis. When no score data exists, skip all score-related commentary.
@@ -288,7 +288,7 @@ Use these sections as needed (skip sections with no content):
288
288
  - **Effective**: Eval followed instructions and completed tasks correctly
289
289
  - **Mixed**: Some tasks succeeded, others showed the agent ignoring or misunderstanding instructions
290
290
  - **Ineffective**: Instructions were ignored or produced worse behavior
291
- - **Model-specific effectiveness**: An instruction that fails with one model may succeed with another. If an instruction variant was tested with multiple models (e.g., `gpt-5-mini + verbose` failed but `gpt-4.1 + verbose` passed), label it **mixed** — NOT ineffective. Only label an instruction **ineffective** if it failed across ALL models it was tested with. Always qualify: "ineffective with gpt-5-mini" rather than just "ineffective".
291
+ - **Model-specific effectiveness**: An instruction that fails with one model may succeed with another. If an instruction variant was tested with multiple models (e.g., `gpt-5.4-mini + verbose` failed but `gpt-4.1 + verbose` passed), label it **mixed** — NOT ineffective. Only label an instruction **ineffective** if it failed across ALL models it was tested with. Always qualify: "ineffective with gpt-5.4-mini" rather than just "ineffective".
292
292
  - Always show the problematic instruction text and a concrete replacement
293
293
 
294
294
  ### Tool Usage
@@ -357,5 +357,5 @@ Use these sections as needed (skip sections with no content):
357
357
  - **No inline color styles** — use only the CSS class names (green, blue, amber, red) on metric-card and metric-value
358
358
  12. **Use pre-computed numbers** — The input includes a "Pre-computed Eval Statistics" section with exact values for pass rates, costs, tokens, winner designation, and aggregate stats (total tests, failures, agents, avg turns). Use these numbers verbatim. Never estimate or approximate.
359
359
  13. **Cost comparisons must use actual data** — When comparing costs between agents, use the **actual per-test cost** from the pre-computed statistics (total cost ÷ number of tests). Never cite model list pricing or theoretical cost differences. A cheaper model may use more tokens, making the realized cost difference much smaller than the per-token price difference.
360
- 14. **Instruction labels must be model-specific** — Never label instructions as globally "ineffective" or globally "effective" when tested with multiple models producing different outcomes. If `gpt-5-mini + verbose` failed but `gpt-4.1 + verbose` passed, the instructions are "mixed" — effective with gpt-4.1, ineffective with gpt-5-mini.
360
+ 14. **Instruction labels must be model-specific** — Never label instructions as globally "ineffective" or globally "effective" when tested with multiple models producing different outcomes. If `gpt-5.4-mini + verbose` failed but `gpt-4.1 + verbose` passed, the instructions are "mixed" — effective with gpt-4.1, ineffective with gpt-5.4-mini.
361
361
  15. **Bullet lists need a blank line before them** — In markdown, a list must be preceded by a blank line to render correctly. NEVER put a bullet list directly after a `**bold label:**` on the next line — the markdown parser will collapse them into a single paragraph. Use `####` headings instead of bold labels when you need a label followed by a list.
@@ -479,7 +479,7 @@ async def generate_insights(
479
479
  custom_agent_info: list[CustomAgentInfo] | None = None,
480
480
  prompt_names: list[str] | None = None,
481
481
  instruction_file_info: list[InstructionFileInfo] | None = None,
482
- model: str = "copilot/gpt-5-mini",
482
+ model: str = "copilot/gpt-5.4-mini",
483
483
  cache_dir: Path | None = None,
484
484
  min_pass_rate: int | None = None,
485
485
  analysis_prompt: str | None = None,
@@ -496,7 +496,7 @@ async def generate_insights(
496
496
  custom_agent_info: Custom agent metadata (optional)
497
497
  prompt_names: Names of prompt files tested (optional)
498
498
  instruction_file_info: Custom instruction file metadata (optional)
499
- model: Model identifier (e.g., "copilot/gpt-5-mini", "azure/gpt-5-mini")
499
+ model: Model identifier (e.g., "copilot/gpt-5.4-mini", "azure/gpt-5.4-mini")
500
500
  cache_dir: Directory for caching results (optional)
501
501
  min_pass_rate: Minimum pass rate threshold for disqualifying agents
502
502
  analysis_prompt: Custom analysis prompt text. If None, uses the built-in