ai-engineering-init 1.7.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118):
  1. package/.claude/hooks/skill-forced-eval.js +46 -62
  2. package/.claude/settings.json +10 -1
  3. package/.claude/skills/api-development/SKILL.md +179 -130
  4. package/.claude/skills/architecture-design/SKILL.md +102 -212
  5. package/.claude/skills/backend-annotations/SKILL.md +166 -220
  6. package/.claude/skills/bug-detective/SKILL.md +225 -186
  7. package/.claude/skills/code-patterns/SKILL.md +127 -244
  8. package/.claude/skills/collaborating-with-codex/SKILL.md +96 -113
  9. package/.claude/skills/crud-development/SKILL.md +226 -307
  10. package/.claude/skills/data-permission/SKILL.md +131 -202
  11. package/.claude/skills/database-ops/SKILL.md +158 -355
  12. package/.claude/skills/error-handler/SKILL.md +224 -285
  13. package/.claude/skills/file-oss-management/SKILL.md +174 -169
  14. package/.claude/skills/git-workflow/SKILL.md +123 -341
  15. package/.claude/skills/json-serialization/SKILL.md +121 -137
  16. package/.claude/skills/performance-doctor/SKILL.md +83 -89
  17. package/.claude/skills/redis-cache/SKILL.md +134 -185
  18. package/.claude/skills/scheduled-jobs/SKILL.md +187 -224
  19. package/.claude/skills/security-guard/SKILL.md +168 -276
  20. package/.claude/skills/sms-mail/SKILL.md +266 -228
  21. package/.claude/skills/social-login/SKILL.md +257 -195
  22. package/.claude/skills/tenant-management/SKILL.md +172 -188
  23. package/.claude/skills/utils-toolkit/SKILL.md +214 -222
  24. package/.claude/skills/websocket-sse/SKILL.md +251 -172
  25. package/.claude/skills/workflow-engine/SKILL.md +178 -250
  26. package/.codex/skills/api-development/SKILL.md +179 -130
  27. package/.codex/skills/architecture-design/SKILL.md +102 -212
  28. package/.codex/skills/backend-annotations/SKILL.md +166 -220
  29. package/.codex/skills/bug-detective/SKILL.md +225 -186
  30. package/.codex/skills/code-patterns/SKILL.md +127 -244
  31. package/.codex/skills/collaborating-with-codex/SKILL.md +96 -113
  32. package/.codex/skills/crud-development/SKILL.md +226 -307
  33. package/.codex/skills/data-permission/SKILL.md +131 -202
  34. package/.codex/skills/database-ops/SKILL.md +158 -355
  35. package/.codex/skills/error-handler/SKILL.md +224 -285
  36. package/.codex/skills/file-oss-management/SKILL.md +174 -169
  37. package/.codex/skills/git-workflow/SKILL.md +123 -341
  38. package/.codex/skills/json-serialization/SKILL.md +121 -137
  39. package/.codex/skills/performance-doctor/SKILL.md +83 -89
  40. package/.codex/skills/redis-cache/SKILL.md +134 -185
  41. package/.codex/skills/scheduled-jobs/SKILL.md +187 -224
  42. package/.codex/skills/security-guard/SKILL.md +168 -276
  43. package/.codex/skills/sms-mail/SKILL.md +266 -228
  44. package/.codex/skills/social-login/SKILL.md +257 -195
  45. package/.codex/skills/tenant-management/SKILL.md +172 -188
  46. package/.codex/skills/utils-toolkit/SKILL.md +214 -222
  47. package/.codex/skills/websocket-sse/SKILL.md +251 -172
  48. package/.codex/skills/workflow-engine/SKILL.md +178 -250
  49. package/.cursor/hooks/cursor-skill-eval.js +66 -6
  50. package/.cursor/skills/api-development/SKILL.md +179 -130
  51. package/.cursor/skills/architecture-design/SKILL.md +102 -212
  52. package/.cursor/skills/backend-annotations/SKILL.md +166 -220
  53. package/.cursor/skills/bug-detective/SKILL.md +225 -186
  54. package/.cursor/skills/code-patterns/SKILL.md +127 -244
  55. package/.cursor/skills/collaborating-with-codex/SKILL.md +96 -113
  56. package/.cursor/skills/crud-development/SKILL.md +226 -307
  57. package/.cursor/skills/data-permission/SKILL.md +131 -202
  58. package/.cursor/skills/database-ops/SKILL.md +158 -355
  59. package/.cursor/skills/error-handler/SKILL.md +224 -285
  60. package/.cursor/skills/file-oss-management/SKILL.md +174 -169
  61. package/.cursor/skills/git-workflow/SKILL.md +123 -341
  62. package/.cursor/skills/json-serialization/SKILL.md +121 -137
  63. package/.cursor/skills/performance-doctor/SKILL.md +83 -89
  64. package/.cursor/skills/redis-cache/SKILL.md +134 -185
  65. package/.cursor/skills/scheduled-jobs/SKILL.md +187 -224
  66. package/.cursor/skills/security-guard/SKILL.md +168 -276
  67. package/.cursor/skills/sms-mail/SKILL.md +266 -228
  68. package/.cursor/skills/social-login/SKILL.md +257 -195
  69. package/.cursor/skills/tenant-management/SKILL.md +172 -188
  70. package/.cursor/skills/utils-toolkit/SKILL.md +214 -222
  71. package/.cursor/skills/websocket-sse/SKILL.md +251 -172
  72. package/.cursor/skills/workflow-engine/SKILL.md +178 -250
  73. package/AGENTS.md +49 -540
  74. package/CLAUDE.md +73 -119
  75. package/README.md +37 -6
  76. package/bin/index.js +5 -1
  77. package/package.json +1 -1
  78. package/src/skills/api-development/SKILL.md +179 -130
  79. package/src/skills/architecture-design/SKILL.md +102 -212
  80. package/src/skills/backend-annotations/SKILL.md +166 -220
  81. package/src/skills/bug-detective/SKILL.md +225 -186
  82. package/src/skills/code-patterns/SKILL.md +127 -244
  83. package/src/skills/collaborating-with-codex/SKILL.md +96 -113
  84. package/src/skills/crud-development/SKILL.md +226 -307
  85. package/src/skills/data-permission/SKILL.md +131 -202
  86. package/src/skills/database-ops/SKILL.md +158 -355
  87. package/src/skills/error-handler/SKILL.md +224 -285
  88. package/src/skills/file-oss-management/SKILL.md +174 -169
  89. package/src/skills/git-workflow/SKILL.md +123 -341
  90. package/src/skills/json-serialization/SKILL.md +121 -137
  91. package/src/skills/performance-doctor/SKILL.md +83 -89
  92. package/src/skills/redis-cache/SKILL.md +134 -185
  93. package/src/skills/scheduled-jobs/SKILL.md +187 -224
  94. package/src/skills/security-guard/SKILL.md +168 -276
  95. package/src/skills/sms-mail/SKILL.md +266 -228
  96. package/src/skills/social-login/SKILL.md +257 -195
  97. package/src/skills/tenant-management/SKILL.md +172 -188
  98. package/src/skills/utils-toolkit/SKILL.md +214 -222
  99. package/src/skills/websocket-sse/SKILL.md +251 -172
  100. package/src/skills/workflow-engine/SKILL.md +178 -250
  101. package/.claude/skills/skill-creator/LICENSE.txt +0 -202
  102. package/.claude/skills/skill-creator/SKILL.md +0 -479
  103. package/.claude/skills/skill-creator/agents/analyzer.md +0 -274
  104. package/.claude/skills/skill-creator/agents/comparator.md +0 -202
  105. package/.claude/skills/skill-creator/agents/grader.md +0 -223
  106. package/.claude/skills/skill-creator/assets/eval_review.html +0 -146
  107. package/.claude/skills/skill-creator/eval-viewer/generate_review.py +0 -471
  108. package/.claude/skills/skill-creator/eval-viewer/viewer.html +0 -1325
  109. package/.claude/skills/skill-creator/references/schemas.md +0 -430
  110. package/.claude/skills/skill-creator/scripts/__init__.py +0 -0
  111. package/.claude/skills/skill-creator/scripts/aggregate_benchmark.py +0 -401
  112. package/.claude/skills/skill-creator/scripts/generate_report.py +0 -326
  113. package/.claude/skills/skill-creator/scripts/improve_description.py +0 -248
  114. package/.claude/skills/skill-creator/scripts/package_skill.py +0 -136
  115. package/.claude/skills/skill-creator/scripts/quick_validate.py +0 -103
  116. package/.claude/skills/skill-creator/scripts/run_eval.py +0 -310
  117. package/.claude/skills/skill-creator/scripts/run_loop.py +0 -332
  118. package/.claude/skills/skill-creator/scripts/utils.py +0 -47
@@ -1,401 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Aggregate individual run results into benchmark summary statistics.
4
-
5
- Reads grading.json files from run directories and produces:
6
- - run_summary with mean, stddev, min, max for each metric
7
- - delta between with_skill and without_skill configurations
8
-
9
- Usage:
10
- python aggregate_benchmark.py <benchmark_dir>
11
-
12
- Example:
13
- python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
14
-
15
- The script supports two directory layouts:
16
-
17
- Workspace layout (from skill-creator iterations):
18
- <benchmark_dir>/
19
- └── eval-N/
20
- ├── with_skill/
21
- │ ├── run-1/grading.json
22
- │ └── run-2/grading.json
23
- └── without_skill/
24
- ├── run-1/grading.json
25
- └── run-2/grading.json
26
-
27
- Legacy layout (with runs/ subdirectory):
28
- <benchmark_dir>/
29
- └── runs/
30
- └── eval-N/
31
- ├── with_skill/
32
- │ └── run-1/grading.json
33
- └── without_skill/
34
- └── run-1/grading.json
35
- """
36
-
37
- import argparse
38
- import json
39
- import math
40
- import sys
41
- from datetime import datetime, timezone
42
- from pathlib import Path
43
-
44
-
45
def calculate_stats(values: list[float]) -> dict:
    """Summarise numeric samples as mean, stddev, min and max.

    An empty input yields all-zero statistics. The standard deviation uses
    the sample (n - 1) denominator and is 0.0 when only one value is given.
    Every figure is rounded to four decimal places.
    """
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    count = len(values)
    average = sum(values) / count

    # A single sample has no spread; avoid dividing by (n - 1) == 0.
    spread = 0.0
    if count > 1:
        sum_of_squares = sum((sample - average) ** 2 for sample in values)
        spread = math.sqrt(sum_of_squares / (count - 1))

    summary = {}
    summary["mean"] = round(average, 4)
    summary["stddev"] = round(spread, 4)
    summary["min"] = round(min(values), 4)
    summary["max"] = round(max(values), 4)
    return summary
65
-
66
-
67
def _read_json_file(path: Path):
    """Parse *path* as UTF-8 JSON; raises OSError or ValueError on failure."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _trailing_index(dir_name: str, fallback: int) -> int:
    """Return the integer after the first "-" in *dir_name*, else *fallback*."""
    try:
        return int(dir_name.split("-")[1])
    except (IndexError, ValueError):
        return fallback


def _resolve_eval_id(eval_dir: Path, eval_idx: int):
    """Pick the eval ID from eval_metadata.json, the directory name, or the index."""
    metadata_path = eval_dir / "eval_metadata.json"
    if not metadata_path.exists():
        return _trailing_index(eval_dir.name, eval_idx)
    try:
        metadata = _read_json_file(metadata_path)
    except (ValueError, OSError):
        # Unreadable metadata: fall back to the position in the listing.
        return eval_idx
    if isinstance(metadata, dict):
        return metadata.get("eval_id", eval_idx)
    return eval_idx


def _load_single_run(run_dir: Path, eval_id, run_number: int):
    """Build the result record for one run directory, or None if unusable."""
    grading_file = run_dir / "grading.json"
    if not grading_file.exists():
        print(f"Warning: grading.json not found in {run_dir}")
        return None

    try:
        grading = _read_json_file(grading_file)
    except json.JSONDecodeError as e:
        print(f"Warning: Invalid JSON in {grading_file}: {e}")
        return None
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError from a non-UTF-8 file.
        print(f"Warning: Could not read {grading_file}: {e}")
        return None
    if not isinstance(grading, dict):
        print(f"Warning: Invalid JSON in {grading_file}: expected an object at top level")
        return None

    summary = grading.get("summary", {})
    result = {
        "eval_id": eval_id,
        "run_number": run_number,
        "pass_rate": summary.get("pass_rate", 0.0),
        "passed": summary.get("passed", 0),
        "failed": summary.get("failed", 0),
        "total": summary.get("total", 0),
    }

    # Timing: grading.json wins for the duration; the sibling timing.json
    # fills in a missing duration and is the source of real token counts.
    timing = grading.get("timing", {})
    result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
    timing_file = run_dir / "timing.json"
    if timing_file.exists():
        try:
            timing_data = _read_json_file(timing_file)
        except (OSError, ValueError) as e:
            print(f"Warning: Could not read {timing_file}: {e}")
        else:
            if isinstance(timing_data, dict):
                if result["time_seconds"] == 0.0:
                    result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
                # Read tokens even when grading.json already supplied the
                # duration, so a real count is not replaced by output_chars.
                result["tokens"] = timing_data.get("total_tokens", 0)

    metrics = grading.get("execution_metrics", {})
    result["tool_calls"] = metrics.get("total_tool_calls", 0)
    if not result.get("tokens"):
        # No token count available: fall back to the output character count.
        result["tokens"] = metrics.get("output_chars", 0)
    result["errors"] = metrics.get("errors_encountered", 0)

    # The viewer requires text, passed and evidence on every expectation;
    # warn about malformed entries but pass them through unchanged.
    raw_expectations = grading.get("expectations", [])
    for exp in raw_expectations:
        if not isinstance(exp, dict) or any(
            field not in exp for field in ("text", "passed", "evidence")
        ):
            print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
    result["expectations"] = raw_expectations

    notes_summary = grading.get("user_notes_summary", {})
    notes = []
    for category in ("uncertainties", "needs_review", "workarounds"):
        notes.extend(notes_summary.get(category, []))
    result["notes"] = notes

    return result


def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Eval directories (``eval-*``) are looked up under ``<benchmark_dir>/runs``
    when that directory exists, otherwise directly under ``benchmark_dir``.
    Within each eval directory, every sub-directory that contains ``run-*``
    entries is treated as a configuration.

    Args:
        benchmark_dir: Root directory of the benchmark.

    Returns:
        Dict keyed by configuration directory name (e.g. "with_skill" /
        "without_skill"), each value a list of per-run result dicts. Empty
        when no eval directories are found. Runs whose grading.json is
        missing or unreadable are reported on stdout and skipped.
    """
    runs_dir = benchmark_dir / "runs"
    if runs_dir.is_dir():
        search_dir = runs_dir
    elif any(benchmark_dir.glob("eval-*")):
        search_dir = benchmark_dir
    else:
        print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
        return {}

    results: dict[str, list] = {}

    for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
        if not eval_dir.is_dir():
            # A stray file matching eval-* cannot hold configurations.
            continue
        eval_id = _resolve_eval_id(eval_dir, eval_idx)

        # Discover config directories dynamically rather than hardcoding names.
        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue
            run_dirs = sorted(config_dir.glob("run-*"))
            if not run_dirs:
                # Not a configuration (inputs, outputs, etc.).
                continue
            config_runs = results.setdefault(config_dir.name, [])

            for position, run_dir in enumerate(run_dirs, start=1):
                # A non-numeric suffix (e.g. "run-final") falls back to the
                # 1-based position instead of aborting the aggregation.
                run_number = _trailing_index(run_dir.name, position)
                result = _load_single_run(run_dir, eval_id, run_number)
                if result is not None:
                    config_runs.append(result)

    return results
174
-
175
-
176
def aggregate_results(results: dict) -> dict:
    """
    Reduce per-run results to summary statistics per configuration.

    The returned dict has one entry per configuration with "pass_rate",
    "time_seconds" and "tokens" statistics, plus a "delta" entry holding the
    signed, pre-formatted difference of the means between the first and the
    second configuration (a missing configuration counts as zero).
    """
    config_names = list(results.keys())
    summary_by_config = {}

    for name in config_names:
        config_runs = results.get(name, [])

        if config_runs:
            pass_rate_samples = [run["pass_rate"] for run in config_runs]
            duration_samples = [run["time_seconds"] for run in config_runs]
            token_samples = [run.get("tokens", 0) for run in config_runs]
            summary_by_config[name] = {
                "pass_rate": calculate_stats(pass_rate_samples),
                "time_seconds": calculate_stats(duration_samples),
                "tokens": calculate_stats(token_samples)
            }
        else:
            # No runs recorded for this configuration: emit zeroed statistics.
            summary_by_config[name] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            }

    # The first configuration is the primary, the second (if any) the baseline.
    primary = summary_by_config.get(config_names[0], {}) if config_names else {}
    baseline = summary_by_config.get(config_names[1], {}) if len(config_names) >= 2 else {}

    def mean_gap(metric):
        return primary.get(metric, {}).get("mean", 0) - baseline.get(metric, {}).get("mean", 0)

    pass_rate_gap = mean_gap("pass_rate")
    duration_gap = mean_gap("time_seconds")
    token_gap = mean_gap("tokens")

    summary_by_config["delta"] = {
        "pass_rate": f"{pass_rate_gap:+.2f}",
        "time_seconds": f"{duration_gap:+.1f}",
        "tokens": f"{token_gap:+.0f}"
    }

    return summary_by_config
225
-
226
-
227
def _eval_id_sort_key(eval_id):
    """Order numeric eval IDs first (by value), then all others by their text."""
    if isinstance(eval_id, (int, float)):
        return (0, eval_id, "")
    return (1, 0, str(eval_id))


def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate complete benchmark.json content from run results.

    Args:
        benchmark_dir: Directory scanned by load_run_results.
        skill_name: Skill name for the metadata; a placeholder is used if empty.
        skill_path: Skill path for the metadata; a placeholder is used if empty.

    Returns:
        Dict with "metadata", "runs" (one flat entry per loaded run),
        "run_summary" (from aggregate_results) and an empty "notes" list that
        is meant to be filled in later by the analyzer.
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Build the flat runs array and, in the same pass, count how many runs
    # were loaded for each (configuration, eval) pair.
    runs = []
    runs_per_pair = {}
    for config, config_results in results.items():
        for result in config_results:
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0)
                },
                "expectations": result["expectations"],
                "notes": result["notes"]
            })
            pair = (config, result["eval_id"])
            runs_per_pair[pair] = runs_per_pair.get(pair, 0) + 1

    # Eval IDs may mix ints (directory-name fallback) and strings (from
    # eval_metadata.json); a plain sorted() would raise TypeError on that.
    eval_ids = sorted(
        {result["eval_id"] for config_results in results.values() for result in config_results},
        key=_eval_id_sort_key,
    )

    # Report the observed run count instead of a fixed number; keep the
    # historical default of 3 only when nothing was loaded.
    runs_per_configuration = max(runs_per_pair.values()) if runs_per_pair else 3

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": runs_per_configuration
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }

    return benchmark
279
-
280
-
281
def generate_markdown(benchmark: dict) -> str:
    """Render benchmark data as the human-readable benchmark.md text.

    The output is a title, a metadata header, a three-row summary table
    (pass rate, time, tokens) comparing the first two configurations found
    in the run summary, and a bulleted notes section when notes exist.
    """
    metadata = benchmark["metadata"]
    run_summary = benchmark["run_summary"]

    # Every run_summary key except "delta" names a configuration; use
    # placeholder names for any of the two table columns that is missing.
    config_names = [key for key in run_summary if key != "delta"]
    config_a = "config_a"
    config_b = "config_b"
    if len(config_names) >= 1:
        config_a = config_names[0]
    if len(config_names) >= 2:
        config_b = config_names[1]
    label_a = config_a.replace("_", " ").title()
    label_b = config_b.replace("_", " ").title()

    def as_percent(stats):
        return f"{stats.get('mean', 0)*100:.0f}% ± {stats.get('stddev', 0)*100:.0f}%"

    def as_seconds(stats):
        return f"{stats.get('mean', 0):.1f}s ± {stats.get('stddev', 0):.1f}s"

    def as_count(stats):
        return f"{stats.get('mean', 0):.0f} ± {stats.get('stddev', 0):.0f}"

    evals_text = ', '.join(map(str, metadata['evals_run']))
    lines = []
    lines.append(f"# Skill Benchmark: {metadata['skill_name']}")
    lines.append("")
    lines.append(f"**Model**: {metadata['executor_model']}")
    lines.append(f"**Date**: {metadata['timestamp']}")
    lines.append(f"**Evals**: {evals_text} ({metadata['runs_per_configuration']} runs each per configuration)")
    lines.append("")
    lines.append("## Summary")
    lines.append("")
    lines.append(f"| Metric | {label_a} | {label_b} | Delta |")
    lines.append("|--------|------------|---------------|-------|")

    summary_a = run_summary.get(config_a, {})
    summary_b = run_summary.get(config_b, {})
    delta = run_summary.get("delta", {})

    pass_a = as_percent(summary_a.get("pass_rate", {}))
    pass_b = as_percent(summary_b.get("pass_rate", {}))
    lines.append(f"| Pass Rate | {pass_a} | {pass_b} | {delta.get('pass_rate', '—')} |")

    time_a = as_seconds(summary_a.get("time_seconds", {}))
    time_b = as_seconds(summary_b.get("time_seconds", {}))
    lines.append(f"| Time | {time_a} | {time_b} | {delta.get('time_seconds', '—')}s |")

    tokens_a = as_count(summary_a.get("tokens", {}))
    tokens_b = as_count(summary_b.get("tokens", {}))
    lines.append(f"| Tokens | {tokens_a} | {tokens_b} | {delta.get('tokens', '—')} |")

    # The notes section is emitted only when at least one note is present.
    if benchmark.get("notes"):
        lines += ["", "## Notes", ""]
        lines += [f"- {note}" for note in benchmark["notes"]]

    return "\n".join(lines)
336
-
337
-
338
def main():
    """Command-line entry point: aggregate a benchmark directory.

    Parses the arguments, builds the benchmark data, writes benchmark.json
    and a sibling markdown file with the same stem, then prints the pass
    rate of each configuration. Exits with status 1 when the benchmark
    directory does not exist or is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )

    args = parser.parse_args()

    # is_dir() also rejects a regular file passed by mistake, which would
    # otherwise fail later with a less helpful error.
    if not args.benchmark_dir.is_dir():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Generate benchmark
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # Determine output paths
    output_json = args.output or (args.benchmark_dir / "benchmark.json")
    output_md = output_json.with_suffix(".md")
    # A custom --output location may not exist yet.
    output_json.parent.mkdir(parents=True, exist_ok=True)

    # Write benchmark.json
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    # Write benchmark.md. The markdown contains non-ASCII characters ("±",
    # "—"), so the encoding is pinned rather than left to the platform default.
    markdown = generate_markdown(benchmark)
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    run_summary = benchmark["run_summary"]
    configs = [k for k in run_summary if k != "delta"]
    delta = run_summary.get("delta", {})

    print("\nSummary:")
    for config in configs:
        pr = run_summary[config]["pass_rate"]["mean"]
        label = config.replace("_", " ").title()
        print(f" {label}: {pr*100:.1f}% pass rate")
    print(f" Delta: {delta.get('pass_rate', '—')}")


if __name__ == "__main__":
    main()