xtrm-tools 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (447)
  1. package/CHANGELOG.md +496 -0
  2. package/README.md +762 -0
  3. package/cli/dist/index.cjs +55245 -0
  4. package/cli/dist/index.cjs.map +1 -0
  5. package/cli/dist/index.d.cts +2 -0
  6. package/cli/package.json +43 -0
  7. package/config/.env.example +40 -0
  8. package/config/hooks.json +36 -0
  9. package/config/mcp_servers.json +46 -0
  10. package/config/mcp_servers_optional.json +53 -0
  11. package/config/settings.json +70 -0
  12. package/hooks/README.md +156 -0
  13. package/hooks/__pycache__/agent_context.cpython-314.pyc +0 -0
  14. package/hooks/agent_context.py +105 -0
  15. package/hooks/gitnexus/gitnexus-hook.cjs +133 -0
  16. package/hooks/serena-workflow-reminder.py +74 -0
  17. package/hooks/skill-discovery.py +90 -0
  18. package/hooks/skill-suggestion.py +112 -0
  19. package/hooks/test_agent_context.py +112 -0
  20. package/hooks/type-safety-enforcement.py +107 -0
  21. package/package.json +48 -0
  22. package/project-skills/main-guard/.claude/hooks/main-guard.cjs +188 -0
  23. package/project-skills/main-guard/.claude/settings.json +16 -0
  24. package/project-skills/main-guard/.claude/skills/using-main-guard/SKILL.md +135 -0
  25. package/project-skills/main-guard/README.md +163 -0
  26. package/project-skills/py-quality-gate/.claude/hooks/quality-check.py +311 -0
  27. package/project-skills/py-quality-gate/.claude/settings.json +16 -0
  28. package/project-skills/py-quality-gate/.claude/skills/using-py-quality-gate/SKILL.md +112 -0
  29. package/project-skills/py-quality-gate/README.md +147 -0
  30. package/project-skills/service-skills-set/.claude/git-hooks/__pycache__/doc_reminder.cpython-314.pyc +0 -0
  31. package/project-skills/service-skills-set/.claude/git-hooks/__pycache__/skill_staleness.cpython-314.pyc +0 -0
  32. package/project-skills/service-skills-set/.claude/git-hooks/doc_reminder.py +67 -0
  33. package/project-skills/service-skills-set/.claude/git-hooks/skill_staleness.py +194 -0
  34. package/project-skills/service-skills-set/.claude/service-registry.json +4 -0
  35. package/project-skills/service-skills-set/.claude/settings.json +37 -0
  36. package/project-skills/service-skills-set/.claude/skills/creating-service-skills/SKILL.md +433 -0
  37. package/project-skills/service-skills-set/.claude/skills/creating-service-skills/references/script_quality_standards.md +412 -0
  38. package/project-skills/service-skills-set/.claude/skills/creating-service-skills/references/service_skill_system_guide.md +264 -0
  39. package/project-skills/service-skills-set/.claude/skills/creating-service-skills/scripts/bootstrap.py +308 -0
  40. package/project-skills/service-skills-set/.claude/skills/creating-service-skills/scripts/deep_dive.py +304 -0
  41. package/project-skills/service-skills-set/.claude/skills/creating-service-skills/scripts/scaffolder.py +482 -0
  42. package/project-skills/service-skills-set/.claude/skills/scoping-service-skills/SKILL.md +231 -0
  43. package/project-skills/service-skills-set/.claude/skills/scoping-service-skills/scripts/scope.py +74 -0
  44. package/project-skills/service-skills-set/.claude/skills/updating-service-skills/SKILL.md +136 -0
  45. package/project-skills/service-skills-set/.claude/skills/updating-service-skills/scripts/__pycache__/drift_detector.cpython-314.pyc +0 -0
  46. package/project-skills/service-skills-set/.claude/skills/updating-service-skills/scripts/drift_detector.py +222 -0
  47. package/project-skills/service-skills-set/.claude/skills/using-service-skills/SKILL.md +108 -0
  48. package/project-skills/service-skills-set/.claude/skills/using-service-skills/scripts/__pycache__/cataloger.cpython-314.pyc +0 -0
  49. package/project-skills/service-skills-set/.claude/skills/using-service-skills/scripts/__pycache__/skill_activator.cpython-314.pyc +0 -0
  50. package/project-skills/service-skills-set/.claude/skills/using-service-skills/scripts/cataloger.py +74 -0
  51. package/project-skills/service-skills-set/.claude/skills/using-service-skills/scripts/skill_activator.py +152 -0
  52. package/project-skills/service-skills-set/README.md +93 -0
  53. package/project-skills/service-skills-set/__pycache__/install-service-skills.cpython-314.pyc +0 -0
  54. package/project-skills/service-skills-set/install-service-skills.py +163 -0
  55. package/project-skills/service-skills-set/service-skills-readme.md +236 -0
  56. package/project-skills/tdd-guard/.claude/settings.json +38 -0
  57. package/project-skills/tdd-guard/.claude/skills/using-tdd-guard/SKILL.md +74 -0
  58. package/project-skills/tdd-guard/CLAUDE.md +98 -0
  59. package/project-skills/tdd-guard/CONTRIBUTING.md +38 -0
  60. package/project-skills/tdd-guard/DEVELOPMENT.md +127 -0
  61. package/project-skills/tdd-guard/LICENSE +21 -0
  62. package/project-skills/tdd-guard/README.md +396 -0
  63. package/project-skills/tdd-guard/docs/adr/001-claude-session-subdirectory.md +52 -0
  64. package/project-skills/tdd-guard/docs/adr/002-secure-claude-binary-path.md +56 -0
  65. package/project-skills/tdd-guard/docs/adr/003-remove-configurable-data-directory.md +56 -0
  66. package/project-skills/tdd-guard/docs/adr/004-monorepo-architecture.md +64 -0
  67. package/project-skills/tdd-guard/docs/adr/005-claude-project-dir-support.md +55 -0
  68. package/project-skills/tdd-guard/docs/adr/006-phpunit-separate-repository.md +93 -0
  69. package/project-skills/tdd-guard/docs/adr/007-golangci-lint-path-support.md +83 -0
  70. package/project-skills/tdd-guard/docs/adr/008-storybook-reporter-design.md +182 -0
  71. package/project-skills/tdd-guard/docs/assets/tdd-guard-demo-screenshot.gif +0 -0
  72. package/project-skills/tdd-guard/docs/config-migration.md +143 -0
  73. package/project-skills/tdd-guard/docs/configuration.md +137 -0
  74. package/project-skills/tdd-guard/docs/custom-instructions.md +43 -0
  75. package/project-skills/tdd-guard/docs/enforcement.md +46 -0
  76. package/project-skills/tdd-guard/docs/ignore-patterns.md +81 -0
  77. package/project-skills/tdd-guard/docs/linting.md +109 -0
  78. package/project-skills/tdd-guard/docs/quick-commands.md +52 -0
  79. package/project-skills/tdd-guard/docs/session-management.md +75 -0
  80. package/project-skills/tdd-guard/docs/storybook-vitest-addon.md +120 -0
  81. package/project-skills/tdd-guard/docs/validation-model.md +63 -0
  82. package/project-skills/tdd-guard/eslint.config.mjs +140 -0
  83. package/project-skills/tdd-guard/package-lock.json +16937 -0
  84. package/project-skills/tdd-guard/package.json +102 -0
  85. package/project-skills/tdd-guard/reporters/go/README.md +67 -0
  86. package/project-skills/tdd-guard/reporters/go/cmd/tdd-guard-go/main.go +127 -0
  87. package/project-skills/tdd-guard/reporters/go/cmd/tdd-guard-go/main_test.go +280 -0
  88. package/project-skills/tdd-guard/reporters/go/go.mod +3 -0
  89. package/project-skills/tdd-guard/reporters/go/go.sum +0 -0
  90. package/project-skills/tdd-guard/reporters/go/internal/formatter/formatter.go +126 -0
  91. package/project-skills/tdd-guard/reporters/go/internal/formatter/formatter_test.go +264 -0
  92. package/project-skills/tdd-guard/reporters/go/internal/io/tee_reader.go +26 -0
  93. package/project-skills/tdd-guard/reporters/go/internal/io/tee_reader_test.go +37 -0
  94. package/project-skills/tdd-guard/reporters/go/internal/parser/mixed_reader.go +94 -0
  95. package/project-skills/tdd-guard/reporters/go/internal/parser/mixed_reader_test.go +198 -0
  96. package/project-skills/tdd-guard/reporters/go/internal/parser/parser.go +245 -0
  97. package/project-skills/tdd-guard/reporters/go/internal/parser/parser_test.go +547 -0
  98. package/project-skills/tdd-guard/reporters/go/internal/storage/storage.go +35 -0
  99. package/project-skills/tdd-guard/reporters/go/internal/storage/storage_test.go +113 -0
  100. package/project-skills/tdd-guard/reporters/go/internal/transformer/transformer.go +103 -0
  101. package/project-skills/tdd-guard/reporters/go/internal/transformer/transformer_test.go +303 -0
  102. package/project-skills/tdd-guard/reporters/jest/README.md +102 -0
  103. package/project-skills/tdd-guard/reporters/jest/package.json +38 -0
  104. package/project-skills/tdd-guard/reporters/phpunit/.php-cs-fixer.php +28 -0
  105. package/project-skills/tdd-guard/reporters/phpunit/README.md +97 -0
  106. package/project-skills/tdd-guard/reporters/phpunit/SYNC_README.md +29 -0
  107. package/project-skills/tdd-guard/reporters/phpunit/composer.json +55 -0
  108. package/project-skills/tdd-guard/reporters/phpunit/phpunit.xml.dist +19 -0
  109. package/project-skills/tdd-guard/reporters/phpunit/psalm.xml +44 -0
  110. package/project-skills/tdd-guard/reporters/phpunit/src/Event/ErroredTestSubscriber.php +28 -0
  111. package/project-skills/tdd-guard/reporters/phpunit/src/Event/FailedTestSubscriber.php +28 -0
  112. package/project-skills/tdd-guard/reporters/phpunit/src/Event/IncompleteTestSubscriber.php +28 -0
  113. package/project-skills/tdd-guard/reporters/phpunit/src/Event/PassedTestSubscriber.php +27 -0
  114. package/project-skills/tdd-guard/reporters/phpunit/src/Event/SkippedTestSubscriber.php +28 -0
  115. package/project-skills/tdd-guard/reporters/phpunit/src/Event/TestRunnerFinishedSubscriber.php +24 -0
  116. package/project-skills/tdd-guard/reporters/phpunit/src/PathValidator.php +88 -0
  117. package/project-skills/tdd-guard/reporters/phpunit/src/Storage.php +26 -0
  118. package/project-skills/tdd-guard/reporters/phpunit/src/TddGuardExtension.php +33 -0
  119. package/project-skills/tdd-guard/reporters/phpunit/src/TddGuardListener.php +158 -0
  120. package/project-skills/tdd-guard/reporters/phpunit/src/TddGuardSubscriber.php +35 -0
  121. package/project-skills/tdd-guard/reporters/phpunit/src/TestResultCollector.php +105 -0
  122. package/project-skills/tdd-guard/reporters/phpunit/tests/PathValidatorTest.php +74 -0
  123. package/project-skills/tdd-guard/reporters/phpunit/tests/TddGuardExtensionFailedTest.php +241 -0
  124. package/project-skills/tdd-guard/reporters/phpunit/tests/TddGuardExtensionTest.php +84 -0
  125. package/project-skills/tdd-guard/reporters/phpunit/tests/TddGuardStorageLocationTest.php +71 -0
  126. package/project-skills/tdd-guard/reporters/pytest/README.md +77 -0
  127. package/project-skills/tdd-guard/reporters/pytest/pyproject.toml +43 -0
  128. package/project-skills/tdd-guard/reporters/pytest/pytest.ini.example +7 -0
  129. package/project-skills/tdd-guard/reporters/pytest/tdd_guard_pytest/__init__.py +1 -0
  130. package/project-skills/tdd-guard/reporters/pytest/tdd_guard_pytest/pytest_reporter.py +134 -0
  131. package/project-skills/tdd-guard/reporters/pytest/tests/__init__.py +1 -0
  132. package/project-skills/tdd-guard/reporters/pytest/tests/conftest.py +3 -0
  133. package/project-skills/tdd-guard/reporters/pytest/tests/helpers.py +293 -0
  134. package/project-skills/tdd-guard/reporters/pytest/tests/test_config_option.py +38 -0
  135. package/project-skills/tdd-guard/reporters/pytest/tests/test_path_validation.py +59 -0
  136. package/project-skills/tdd-guard/reporters/pytest/tests/test_plugin_config.py +32 -0
  137. package/project-skills/tdd-guard/reporters/pytest/tests/test_project_root.py +296 -0
  138. package/project-skills/tdd-guard/reporters/pytest/tests/test_pytest_reporter.py +137 -0
  139. package/project-skills/tdd-guard/reporters/rspec/Gemfile +3 -0
  140. package/project-skills/tdd-guard/reporters/rust/Cargo.lock +458 -0
  141. package/project-skills/tdd-guard/reporters/rust/Cargo.toml +33 -0
  142. package/project-skills/tdd-guard/reporters/rust/Makefile.example +95 -0
  143. package/project-skills/tdd-guard/reporters/rust/README.md +88 -0
  144. package/project-skills/tdd-guard/reporters/rust/src/error_parser.rs +309 -0
  145. package/project-skills/tdd-guard/reporters/rust/src/main.rs +464 -0
  146. package/project-skills/tdd-guard/reporters/rust/src/parser.rs +225 -0
  147. package/project-skills/tdd-guard/reporters/rust/src/transformer.rs +409 -0
  148. package/project-skills/tdd-guard/reporters/storybook/README.md +108 -0
  149. package/project-skills/tdd-guard/reporters/storybook/package-lock.json +9482 -0
  150. package/project-skills/tdd-guard/reporters/storybook/package.json +43 -0
  151. package/project-skills/tdd-guard/reporters/storybook/src/StorybookReporter.test-data.ts +22 -0
  152. package/project-skills/tdd-guard/reporters/storybook/src/StorybookReporter.test.ts +190 -0
  153. package/project-skills/tdd-guard/reporters/storybook/src/StorybookReporter.ts +88 -0
  154. package/project-skills/tdd-guard/reporters/storybook/src/index.ts +12 -0
  155. package/project-skills/tdd-guard/reporters/storybook/src/types.ts +37 -0
  156. package/project-skills/tdd-guard/reporters/storybook/tsconfig.json +11 -0
  157. package/project-skills/tdd-guard/reporters/test/artifacts/go/failing/go.mod +3 -0
  158. package/project-skills/tdd-guard/reporters/test/artifacts/go/failing/single_failing_test.go +13 -0
  159. package/project-skills/tdd-guard/reporters/test/artifacts/go/import/go.mod +3 -0
  160. package/project-skills/tdd-guard/reporters/test/artifacts/go/import/single_import_error_test.go +17 -0
  161. package/project-skills/tdd-guard/reporters/test/artifacts/go/passing/go.mod +3 -0
  162. package/project-skills/tdd-guard/reporters/test/artifacts/go/passing/single_passing_test.go +13 -0
  163. package/project-skills/tdd-guard/reporters/test/artifacts/jest/single-failing.test.js +5 -0
  164. package/project-skills/tdd-guard/reporters/test/artifacts/jest/single-import-error.test.js +8 -0
  165. package/project-skills/tdd-guard/reporters/test/artifacts/jest/single-passing.test.js +5 -0
  166. package/project-skills/tdd-guard/reporters/test/artifacts/phpunit/SingleFailingTest.php +11 -0
  167. package/project-skills/tdd-guard/reporters/test/artifacts/phpunit/SingleImportErrorTest.php +14 -0
  168. package/project-skills/tdd-guard/reporters/test/artifacts/phpunit/SinglePassingTest.php +11 -0
  169. package/project-skills/tdd-guard/reporters/test/artifacts/pytest/test_single_failing.py +3 -0
  170. package/project-skills/tdd-guard/reporters/test/artifacts/pytest/test_single_import_error.py +6 -0
  171. package/project-skills/tdd-guard/reporters/test/artifacts/pytest/test_single_passing.py +3 -0
  172. package/project-skills/tdd-guard/reporters/test/artifacts/rust/failing/Cargo.lock +7 -0
  173. package/project-skills/tdd-guard/reporters/test/artifacts/rust/failing/Cargo.toml +4 -0
  174. package/project-skills/tdd-guard/reporters/test/artifacts/rust/failing/src/lib.rs +14 -0
  175. package/project-skills/tdd-guard/reporters/test/artifacts/rust/import/Cargo.lock +7 -0
  176. package/project-skills/tdd-guard/reporters/test/artifacts/rust/import/Cargo.toml +4 -0
  177. package/project-skills/tdd-guard/reporters/test/artifacts/rust/import/src/lib.rs +13 -0
  178. package/project-skills/tdd-guard/reporters/test/artifacts/rust/passing/Cargo.lock +7 -0
  179. package/project-skills/tdd-guard/reporters/test/artifacts/rust/passing/Cargo.toml +4 -0
  180. package/project-skills/tdd-guard/reporters/test/artifacts/rust/passing/src/lib.rs +14 -0
  181. package/project-skills/tdd-guard/reporters/test/artifacts/storybook/Calculator.js +4 -0
  182. package/project-skills/tdd-guard/reporters/test/artifacts/storybook/single-failing.stories.js +15 -0
  183. package/project-skills/tdd-guard/reporters/test/artifacts/storybook/single-import-error.stories.js +14 -0
  184. package/project-skills/tdd-guard/reporters/test/artifacts/storybook/single-passing.stories.js +15 -0
  185. package/project-skills/tdd-guard/reporters/test/artifacts/vitest/single-failing.test.js +7 -0
  186. package/project-skills/tdd-guard/reporters/test/artifacts/vitest/single-import-error.test.js +9 -0
  187. package/project-skills/tdd-guard/reporters/test/artifacts/vitest/single-passing.test.js +7 -0
  188. package/project-skills/tdd-guard/reporters/test/factories/go.ts +59 -0
  189. package/project-skills/tdd-guard/reporters/test/factories/helpers.ts +48 -0
  190. package/project-skills/tdd-guard/reporters/test/factories/index.ts +7 -0
  191. package/project-skills/tdd-guard/reporters/test/factories/jest.ts +51 -0
  192. package/project-skills/tdd-guard/reporters/test/factories/phpunit.ts +63 -0
  193. package/project-skills/tdd-guard/reporters/test/factories/pytest.ts +41 -0
  194. package/project-skills/tdd-guard/reporters/test/factories/rust.ts +158 -0
  195. package/project-skills/tdd-guard/reporters/test/factories/storybook.ts +198 -0
  196. package/project-skills/tdd-guard/reporters/test/factories/vitest.ts +51 -0
  197. package/project-skills/tdd-guard/reporters/test/reporters.integration.test.ts +735 -0
  198. package/project-skills/tdd-guard/reporters/test/types.ts +28 -0
  199. package/project-skills/tdd-guard/reporters/vitest/README.md +64 -0
  200. package/project-skills/tdd-guard/reporters/vitest/package.json +35 -0
  201. package/project-skills/tdd-guard/src/cli/buildContext.test.ts +200 -0
  202. package/project-skills/tdd-guard/src/cli/buildContext.ts +48 -0
  203. package/project-skills/tdd-guard/src/cli/tdd-guard.test.ts +159 -0
  204. package/project-skills/tdd-guard/src/cli/tdd-guard.ts +48 -0
  205. package/project-skills/tdd-guard/src/config/Config.test.ts +538 -0
  206. package/project-skills/tdd-guard/src/config/Config.ts +172 -0
  207. package/project-skills/tdd-guard/src/contracts/schemas/guardSchemas.test.ts +58 -0
  208. package/project-skills/tdd-guard/src/contracts/schemas/guardSchemas.ts +8 -0
  209. package/project-skills/tdd-guard/src/contracts/schemas/lintSchemas.test.ts +347 -0
  210. package/project-skills/tdd-guard/src/contracts/schemas/lintSchemas.ts +61 -0
  211. package/project-skills/tdd-guard/src/contracts/schemas/pytestSchemas.test.ts +24 -0
  212. package/project-skills/tdd-guard/src/contracts/schemas/pytestSchemas.ts +7 -0
  213. package/project-skills/tdd-guard/src/contracts/schemas/reporterSchemas.test.ts +377 -0
  214. package/project-skills/tdd-guard/src/contracts/schemas/reporterSchemas.ts +75 -0
  215. package/project-skills/tdd-guard/src/contracts/schemas/toolSchemas.test.ts +563 -0
  216. package/project-skills/tdd-guard/src/contracts/schemas/toolSchemas.ts +140 -0
  217. package/project-skills/tdd-guard/src/contracts/types/ClientType.ts +1 -0
  218. package/project-skills/tdd-guard/src/contracts/types/ConfigOptions.ts +12 -0
  219. package/project-skills/tdd-guard/src/contracts/types/Context.ts +16 -0
  220. package/project-skills/tdd-guard/src/contracts/types/ModelClient.ts +3 -0
  221. package/project-skills/tdd-guard/src/contracts/types/ValidationResult.ts +6 -0
  222. package/project-skills/tdd-guard/src/guard/GuardManager.test.ts +336 -0
  223. package/project-skills/tdd-guard/src/guard/GuardManager.ts +83 -0
  224. package/project-skills/tdd-guard/src/hooks/HookEvents.test.ts +107 -0
  225. package/project-skills/tdd-guard/src/hooks/HookEvents.ts +39 -0
  226. package/project-skills/tdd-guard/src/hooks/fileTypeDetection.ts +16 -0
  227. package/project-skills/tdd-guard/src/hooks/postToolLint.test.ts +327 -0
  228. package/project-skills/tdd-guard/src/hooks/postToolLint.ts +165 -0
  229. package/project-skills/tdd-guard/src/hooks/processHookData.test.ts +465 -0
  230. package/project-skills/tdd-guard/src/hooks/processHookData.ts +203 -0
  231. package/project-skills/tdd-guard/src/hooks/sessionHandler.test.ts +136 -0
  232. package/project-skills/tdd-guard/src/hooks/sessionHandler.ts +31 -0
  233. package/project-skills/tdd-guard/src/hooks/userPromptHandler.test.ts +131 -0
  234. package/project-skills/tdd-guard/src/hooks/userPromptHandler.ts +55 -0
  235. package/project-skills/tdd-guard/src/index.ts +19 -0
  236. package/project-skills/tdd-guard/src/linters/Linter.ts +5 -0
  237. package/project-skills/tdd-guard/src/linters/eslint/ESLint.test.ts +183 -0
  238. package/project-skills/tdd-guard/src/linters/eslint/ESLint.ts +82 -0
  239. package/project-skills/tdd-guard/src/linters/golangci/GolangciLint.test.ts +170 -0
  240. package/project-skills/tdd-guard/src/linters/golangci/GolangciLint.ts +148 -0
  241. package/project-skills/tdd-guard/src/processors/index.ts +1 -0
  242. package/project-skills/tdd-guard/src/processors/lintProcessor.ts +77 -0
  243. package/project-skills/tdd-guard/src/processors/testResults/TestResultsProcessor.test.ts +303 -0
  244. package/project-skills/tdd-guard/src/processors/testResults/TestResultsProcessor.ts +255 -0
  245. package/project-skills/tdd-guard/src/providers/LinterProvider.test.ts +43 -0
  246. package/project-skills/tdd-guard/src/providers/LinterProvider.ts +20 -0
  247. package/project-skills/tdd-guard/src/providers/ModelClientProvider.test.ts +68 -0
  248. package/project-skills/tdd-guard/src/providers/ModelClientProvider.ts +22 -0
  249. package/project-skills/tdd-guard/src/storage/FileStorage.test.ts +76 -0
  250. package/project-skills/tdd-guard/src/storage/FileStorage.ts +108 -0
  251. package/project-skills/tdd-guard/src/storage/MemoryStorage.ts +57 -0
  252. package/project-skills/tdd-guard/src/storage/Storage.test.ts +227 -0
  253. package/project-skills/tdd-guard/src/storage/Storage.ts +17 -0
  254. package/project-skills/tdd-guard/src/validation/context/context.test.ts +364 -0
  255. package/project-skills/tdd-guard/src/validation/context/context.ts +155 -0
  256. package/project-skills/tdd-guard/src/validation/models/AnthropicApi.test.ts +171 -0
  257. package/project-skills/tdd-guard/src/validation/models/AnthropicApi.ts +49 -0
  258. package/project-skills/tdd-guard/src/validation/models/ClaudeAgentSdk.test.ts +167 -0
  259. package/project-skills/tdd-guard/src/validation/models/ClaudeAgentSdk.ts +54 -0
  260. package/project-skills/tdd-guard/src/validation/models/ClaudeCli.test.ts +239 -0
  261. package/project-skills/tdd-guard/src/validation/models/ClaudeCli.ts +57 -0
  262. package/project-skills/tdd-guard/src/validation/prompts/file-types.ts +52 -0
  263. package/project-skills/tdd-guard/src/validation/prompts/operations/edit.ts +58 -0
  264. package/project-skills/tdd-guard/src/validation/prompts/operations/multi-edit.ts +54 -0
  265. package/project-skills/tdd-guard/src/validation/prompts/operations/write.ts +54 -0
  266. package/project-skills/tdd-guard/src/validation/prompts/response.ts +40 -0
  267. package/project-skills/tdd-guard/src/validation/prompts/rules.ts +51 -0
  268. package/project-skills/tdd-guard/src/validation/prompts/system-prompt.ts +10 -0
  269. package/project-skills/tdd-guard/src/validation/prompts/tools/lint-results.ts +15 -0
  270. package/project-skills/tdd-guard/src/validation/prompts/tools/test-output.ts +14 -0
  271. package/project-skills/tdd-guard/src/validation/prompts/tools/todos.ts +9 -0
  272. package/project-skills/tdd-guard/src/validation/validator.test.ts +268 -0
  273. package/project-skills/tdd-guard/src/validation/validator.ts +159 -0
  274. package/project-skills/tdd-guard/test/artifacts/go/.golangci.yml +6 -0
  275. package/project-skills/tdd-guard/test/artifacts/go/with-issues/file-with-issues.go +12 -0
  276. package/project-skills/tdd-guard/test/artifacts/go/with-issues/go.mod +3 -0
  277. package/project-skills/tdd-guard/test/artifacts/go/without-issues/file-without-issues.go +7 -0
  278. package/project-skills/tdd-guard/test/artifacts/go/without-issues/go.mod +3 -0
  279. package/project-skills/tdd-guard/test/artifacts/javascript/eslint.config.js +20 -0
  280. package/project-skills/tdd-guard/test/artifacts/javascript/file-with-issues.js +12 -0
  281. package/project-skills/tdd-guard/test/artifacts/javascript/file-without-issues.js +10 -0
  282. package/project-skills/tdd-guard/test/hooks/fileTypeDetection.test.ts +26 -0
  283. package/project-skills/tdd-guard/test/hooks/processHookData.fileType.test.ts +46 -0
  284. package/project-skills/tdd-guard/test/hooks/processHookData.python.test.ts +68 -0
  285. package/project-skills/tdd-guard/test/integration/test-context.test.ts +66 -0
  286. package/project-skills/tdd-guard/test/integration/validator.core.test.ts +96 -0
  287. package/project-skills/tdd-guard/test/integration/validator.scenarios.test.ts +497 -0
  288. package/project-skills/tdd-guard/test/utils/assertions.ts +29 -0
  289. package/project-skills/tdd-guard/test/utils/factories/contextFactory.ts +30 -0
  290. package/project-skills/tdd-guard/test/utils/factories/editFactory.ts +82 -0
  291. package/project-skills/tdd-guard/test/utils/factories/helpers.test.ts +46 -0
  292. package/project-skills/tdd-guard/test/utils/factories/helpers.ts +46 -0
  293. package/project-skills/tdd-guard/test/utils/factories/lintFactory.ts +352 -0
  294. package/project-skills/tdd-guard/test/utils/factories/modelClientProviderFactory.ts +21 -0
  295. package/project-skills/tdd-guard/test/utils/factories/multiEditFactory.ts +79 -0
  296. package/project-skills/tdd-guard/test/utils/factories/operations.ts +57 -0
  297. package/project-skills/tdd-guard/test/utils/factories/reporterFactory.ts +55 -0
  298. package/project-skills/tdd-guard/test/utils/factories/scenarios/index.ts +22 -0
  299. package/project-skills/tdd-guard/test/utils/factories/scenarios/languages/python.ts +745 -0
  300. package/project-skills/tdd-guard/test/utils/factories/scenarios/languages/typescript.ts +767 -0
  301. package/project-skills/tdd-guard/test/utils/factories/scenarios/types.ts +77 -0
  302. package/project-skills/tdd-guard/test/utils/factories/scenarios/utils.ts +15 -0
  303. package/project-skills/tdd-guard/test/utils/factories/sessionStartFactory.ts +36 -0
  304. package/project-skills/tdd-guard/test/utils/factories/testDefaults.ts +90 -0
  305. package/project-skills/tdd-guard/test/utils/factories/testResultsFactory.ts +234 -0
  306. package/project-skills/tdd-guard/test/utils/factories/todoFactory.ts +99 -0
  307. package/project-skills/tdd-guard/test/utils/factories/userPromptSubmitFactory.ts +39 -0
  308. package/project-skills/tdd-guard/test/utils/factories/writeFactory.ts +70 -0
  309. package/project-skills/tdd-guard/test/utils/index.ts +131 -0
  310. package/project-skills/tdd-guard/tsconfig.build.json +16 -0
  311. package/project-skills/tdd-guard/tsconfig.eslint.json +17 -0
  312. package/project-skills/tdd-guard/tsconfig.json +32 -0
  313. package/project-skills/tdd-guard/tsconfig.node.json +10 -0
  314. package/project-skills/tdd-guard/vitest.config.ts +85 -0
  315. package/project-skills/ts-quality-gate/.claude/hooks/hook-config.json +66 -0
  316. package/project-skills/ts-quality-gate/.claude/hooks/quality-check.cjs +1251 -0
  317. package/project-skills/ts-quality-gate/.claude/settings.json +16 -0
  318. package/project-skills/ts-quality-gate/.claude/skills/using-ts-quality-gate/SKILL.md +81 -0
  319. package/project-skills/ts-quality-gate/README.md +115 -0
  320. package/skills/README.txt +31 -0
  321. package/skills/clean-code/SKILL.md +201 -0
  322. package/skills/delegating/SKILL.md +196 -0
  323. package/skills/delegating/config.yaml +210 -0
  324. package/skills/delegating/references/orchestration-protocols.md +41 -0
  325. package/skills/docker-expert/SKILL.md +409 -0
  326. package/skills/documenting/CHANGELOG.md +23 -0
  327. package/skills/documenting/README.md +148 -0
  328. package/skills/documenting/SKILL.md +113 -0
  329. package/skills/documenting/examples/example_pattern.md +70 -0
  330. package/skills/documenting/examples/example_reference.md +70 -0
  331. package/skills/documenting/examples/example_ssot_analytics.md +64 -0
  332. package/skills/documenting/examples/example_workflow.md +141 -0
  333. package/skills/documenting/references/changelog-format.md +97 -0
  334. package/skills/documenting/references/metadata-schema.md +136 -0
  335. package/skills/documenting/references/taxonomy.md +81 -0
  336. package/skills/documenting/references/versioning-rules.md +78 -0
  337. package/skills/documenting/scripts/__pycache__/drift_detector.cpython-314.pyc +0 -0
  338. package/skills/documenting/scripts/__pycache__/orchestrator.cpython-314.pyc +0 -0
  339. package/skills/documenting/scripts/__pycache__/validate_metadata.cpython-314.pyc +0 -0
  340. package/skills/documenting/scripts/bump_version.sh +60 -0
  341. package/skills/documenting/scripts/changelog/__init__.py +0 -0
  342. package/skills/documenting/scripts/changelog/__pycache__/__init__.cpython-314.pyc +0 -0
  343. package/skills/documenting/scripts/changelog/__pycache__/add_entry.cpython-314.pyc +0 -0
  344. package/skills/documenting/scripts/changelog/__pycache__/bump_release.cpython-314.pyc +0 -0
  345. package/skills/documenting/scripts/changelog/__pycache__/validate_changelog.cpython-314.pyc +0 -0
  346. package/skills/documenting/scripts/changelog/add_entry.py +216 -0
  347. package/skills/documenting/scripts/changelog/bump_release.py +117 -0
  348. package/skills/documenting/scripts/changelog/init_changelog.py +54 -0
  349. package/skills/documenting/scripts/changelog/validate_changelog.py +128 -0
  350. package/skills/documenting/scripts/drift_detector.py +266 -0
  351. package/skills/documenting/scripts/generate_template.py +311 -0
  352. package/skills/documenting/scripts/list_by_category.sh +84 -0
  353. package/skills/documenting/scripts/orchestrator.py +255 -0
  354. package/skills/documenting/scripts/validate_metadata.py +242 -0
  355. package/skills/documenting/templates/CHANGELOG.md.template +13 -0
  356. package/skills/documenting/tests/__pycache__/test_changelog.cpython-314-pytest-9.0.2.pyc +0 -0
  357. package/skills/documenting/tests/__pycache__/test_drift_detector.cpython-314-pytest-9.0.2.pyc +0 -0
  358. package/skills/documenting/tests/__pycache__/test_orchestrator.cpython-314-pytest-9.0.2.pyc +0 -0
  359. package/skills/documenting/tests/__pycache__/test_validate_metadata.cpython-314-pytest-9.0.2.pyc +0 -0
  360. package/skills/documenting/tests/integration_test.sh +70 -0
  361. package/skills/documenting/tests/test_changelog.py +201 -0
  362. package/skills/documenting/tests/test_drift_detector.py +80 -0
  363. package/skills/documenting/tests/test_orchestrator.py +52 -0
  364. package/skills/documenting/tests/test_validate_metadata.py +64 -0
  365. package/skills/find-skills/SKILL.md +133 -0
  366. package/skills/gitnexus-debugging/SKILL.md +85 -0
  367. package/skills/gitnexus-exploring/SKILL.md +75 -0
  368. package/skills/gitnexus-impact-analysis/SKILL.md +94 -0
  369. package/skills/gitnexus-refactoring/SKILL.md +113 -0
  370. package/skills/hook-development/SKILL.md +797 -0
  371. package/skills/hook-development/examples/load-context.sh +55 -0
  372. package/skills/hook-development/examples/quality-check.js +1168 -0
  373. package/skills/hook-development/examples/validate-bash.sh +43 -0
  374. package/skills/hook-development/examples/validate-write.sh +38 -0
  375. package/skills/hook-development/references/advanced.md +527 -0
  376. package/skills/hook-development/references/migration.md +369 -0
  377. package/skills/hook-development/references/patterns.md +412 -0
  378. package/skills/hook-development/scripts/README.md +164 -0
  379. package/skills/hook-development/scripts/hook-linter.sh +153 -0
  380. package/skills/hook-development/scripts/test-hook.sh +252 -0
  381. package/skills/hook-development/scripts/validate-hook-schema.sh +159 -0
  382. package/skills/obsidian-cli/SKILL.md +106 -0
  383. package/skills/orchestrating-agents/SKILL.md +135 -0
  384. package/skills/orchestrating-agents/config.yaml +45 -0
  385. package/skills/orchestrating-agents/references/agent-context-integration.md +37 -0
  386. package/skills/orchestrating-agents/references/examples.md +45 -0
  387. package/skills/orchestrating-agents/references/handover-protocol.md +31 -0
  388. package/skills/orchestrating-agents/references/workflows.md +42 -0
  389. package/skills/orchestrating-agents/scripts/detect_neighbors.py +23 -0
  390. package/skills/prompt-improving/README.md +162 -0
  391. package/skills/prompt-improving/SKILL.md +74 -0
  392. package/skills/prompt-improving/references/analysis_commands.md +24 -0
  393. package/skills/prompt-improving/references/chain_of_thought.md +24 -0
  394. package/skills/prompt-improving/references/mcp_definitions.md +20 -0
  395. package/skills/prompt-improving/references/multishot.md +23 -0
  396. package/skills/prompt-improving/references/xml_core.md +60 -0
  397. package/skills/python-testing/SKILL.md +815 -0
  398. package/skills/senior-backend/SKILL.md +209 -0
  399. package/skills/senior-backend/references/api_design_patterns.md +103 -0
  400. package/skills/senior-backend/references/backend_security_practices.md +103 -0
  401. package/skills/senior-backend/references/database_optimization_guide.md +103 -0
  402. package/skills/senior-backend/scripts/api_load_tester.py +114 -0
  403. package/skills/senior-backend/scripts/api_scaffolder.py +114 -0
  404. package/skills/senior-backend/scripts/database_migration_tool.py +114 -0
  405. package/skills/senior-data-scientist/SKILL.md +226 -0
  406. package/skills/senior-data-scientist/references/experiment_design_frameworks.md +80 -0
  407. package/skills/senior-data-scientist/references/feature_engineering_patterns.md +80 -0
  408. package/skills/senior-data-scientist/references/statistical_methods_advanced.md +80 -0
  409. package/skills/senior-data-scientist/scripts/experiment_designer.py +100 -0
  410. package/skills/senior-data-scientist/scripts/feature_engineering_pipeline.py +100 -0
  411. package/skills/senior-data-scientist/scripts/model_evaluation_suite.py +100 -0
  412. package/skills/senior-devops/SKILL.md +209 -0
  413. package/skills/senior-devops/references/cicd_pipeline_guide.md +103 -0
  414. package/skills/senior-devops/references/deployment_strategies.md +103 -0
  415. package/skills/senior-devops/references/infrastructure_as_code.md +103 -0
  416. package/skills/senior-devops/scripts/deployment_manager.py +114 -0
  417. package/skills/senior-devops/scripts/pipeline_generator.py +114 -0
  418. package/skills/senior-devops/scripts/terraform_scaffolder.py +114 -0
  419. package/skills/senior-security/SKILL.md +209 -0
  420. package/skills/senior-security/references/cryptography_implementation.md +103 -0
  421. package/skills/senior-security/references/penetration_testing_guide.md +103 -0
  422. package/skills/senior-security/references/security_architecture_patterns.md +103 -0
  423. package/skills/senior-security/scripts/pentest_automator.py +114 -0
  424. package/skills/senior-security/scripts/security_auditor.py +114 -0
  425. package/skills/senior-security/scripts/threat_modeler.py +114 -0
  426. package/skills/skill-creator/LICENSE.txt +202 -0
  427. package/skills/skill-creator/SKILL.md +479 -0
  428. package/skills/skill-creator/agents/analyzer.md +274 -0
  429. package/skills/skill-creator/agents/comparator.md +202 -0
  430. package/skills/skill-creator/agents/grader.md +223 -0
  431. package/skills/skill-creator/assets/eval_review.html +146 -0
  432. package/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  433. package/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  434. package/skills/skill-creator/references/schemas.md +430 -0
  435. package/skills/skill-creator/scripts/__init__.py +0 -0
  436. package/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  437. package/skills/skill-creator/scripts/generate_report.py +326 -0
  438. package/skills/skill-creator/scripts/improve_description.py +248 -0
  439. package/skills/skill-creator/scripts/package_skill.py +136 -0
  440. package/skills/skill-creator/scripts/quick_validate.py +103 -0
  441. package/skills/skill-creator/scripts/run_eval.py +310 -0
  442. package/skills/skill-creator/scripts/run_loop.py +332 -0
  443. package/skills/skill-creator/scripts/utils.py +47 -0
  444. package/skills/using-TDD/SKILL.md +410 -0
  445. package/skills/using-serena-lsp/README.md +8 -0
  446. package/skills/using-serena-lsp/REFERENCE.md +194 -0
  447. package/skills/using-serena-lsp/SKILL.md +82 -0
@@ -0,0 +1,326 @@
1
+ #!/usr/bin/env python3
2
+ """Generate an HTML report from run_loop.py output.
3
+
4
+ Takes the JSON output from run_loop.py and generates a visual HTML report
5
+ showing each description attempt with check/x for each test case.
6
+ Distinguishes between train and test queries.
7
+ """
8
+
9
+ import argparse
10
+ import html
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
def _escaped(value, default: str = "N/A") -> str:
    """HTML-escape *value* for display, substituting *default* when it is None."""
    return html.escape(str(default if value is None else value))


def _aggregate_runs(results: list[dict]) -> tuple[int, int]:
    """Return ``(correct_runs, total_runs)`` summed over all query results."""
    correct = 0
    total = 0
    for r in results:
        runs = r.get("runs", 0)
        triggers = r.get("triggers", 0)
        total += runs
        # A run counts as correct when it triggered for a should-trigger
        # query, or stayed silent for a should-NOT-trigger query.
        correct += triggers if r.get("should_trigger", True) else runs - triggers
    return correct, total


def _score_class(correct: int, total: int) -> str:
    """Map a correct/total ratio to the CSS class used for the score badge."""
    if total > 0:
        ratio = correct / total
        if ratio >= 0.8:
            return "score-good"
        if ratio >= 0.5:
            return "score-ok"
    return "score-bad"


def _result_cells(queries: list[dict], by_query: dict, cell_classes: str) -> list[str]:
    """Render one ``<td>`` per query column, in column order.

    A query with no recorded result for this iteration renders as a failure
    with a 0/0 rate.
    """
    cells = []
    for qinfo in queries:
        r = by_query.get(qinfo["query"], {})
        did_pass = r.get("pass", False)
        icon = "✓" if did_pass else "✗"
        css_class = "pass" if did_pass else "fail"
        cells.append(
            f' <td class="{cell_classes} {css_class}">{icon}'
            f'<span class="rate">{r.get("triggers", 0)}/{r.get("runs", 0)}</span></td>\n'
        )
    return cells


def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str:
    """Generate an HTML report from run_loop.py output data.

    Args:
        data: Loop output. Keys read here: ``history`` (list of per-iteration
            dicts), ``original_description``, ``best_description``,
            ``best_score``, ``best_test_score``, ``iterations_run``,
            ``train_size`` and ``test_size``. All are optional.
        auto_refresh: If True, adds a meta refresh tag so the page reloads
            every 5 seconds.
        skill_name: Optional skill name prepended to the page title/heading.

    Returns:
        The complete HTML document as a string. An empty or missing history
        yields a report with an empty table rather than an error.
    """
    history = data.get("history") or []
    title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else ""

    # The query columns are taken from the first iteration: train queries
    # first, then held-out test queries, each tagged with should_trigger.
    train_queries: list[dict] = []
    test_queries: list[dict] = []
    if history:
        first = history[0]
        for r in first.get("train_results", first.get("results", [])) or []:
            train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
        # test_results may be absent or None when no holdout set is used.
        for r in first.get("test_results") or []:
            test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})

    refresh_tag = ' <meta http-equiv="refresh" content="5">\n' if auto_refresh else ""

    html_parts = ["""<!DOCTYPE html>
<html>
<head>
 <meta charset="utf-8">
""" + refresh_tag + """ <title>""" + title_prefix + """Skill Description Optimization</title>
 <link rel="preconnect" href="https://fonts.googleapis.com">
 <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
 <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
 <style>
 body {
 font-family: 'Lora', Georgia, serif;
 max-width: 100%;
 margin: 0 auto;
 padding: 20px;
 background: #faf9f5;
 color: #141413;
 }
 h1 { font-family: 'Poppins', sans-serif; color: #141413; }
 .explainer {
 background: white;
 padding: 15px;
 border-radius: 6px;
 margin-bottom: 20px;
 border: 1px solid #e8e6dc;
 color: #b0aea5;
 font-size: 0.875rem;
 line-height: 1.6;
 }
 .summary {
 background: white;
 padding: 15px;
 border-radius: 6px;
 margin-bottom: 20px;
 border: 1px solid #e8e6dc;
 }
 .summary p { margin: 5px 0; }
 .best { color: #788c5d; font-weight: bold; }
 .table-container {
 overflow-x: auto;
 width: 100%;
 }
 table {
 border-collapse: collapse;
 background: white;
 border: 1px solid #e8e6dc;
 border-radius: 6px;
 font-size: 12px;
 min-width: 100%;
 }
 th, td {
 padding: 8px;
 text-align: left;
 border: 1px solid #e8e6dc;
 white-space: normal;
 word-wrap: break-word;
 }
 th {
 font-family: 'Poppins', sans-serif;
 background: #141413;
 color: #faf9f5;
 font-weight: 500;
 }
 th.test-col {
 background: #6a9bcc;
 }
 th.query-col { min-width: 200px; }
 td.description {
 font-family: monospace;
 font-size: 11px;
 word-wrap: break-word;
 max-width: 400px;
 }
 td.result {
 text-align: center;
 font-size: 16px;
 min-width: 40px;
 }
 td.test-result {
 background: #f0f6fc;
 }
 .pass { color: #788c5d; }
 .fail { color: #c44; }
 .rate {
 font-size: 9px;
 color: #b0aea5;
 display: block;
 }
 tr:hover { background: #faf9f5; }
 .score {
 display: inline-block;
 padding: 2px 6px;
 border-radius: 4px;
 font-weight: bold;
 font-size: 11px;
 }
 .score-good { background: #eef2e8; color: #788c5d; }
 .score-ok { background: #fef3c7; color: #d97706; }
 .score-bad { background: #fceaea; color: #c44; }
 .train-label { color: #b0aea5; font-size: 10px; }
 .test-label { color: #6a9bcc; font-size: 10px; font-weight: bold; }
 .best-row { background: #f5f8f2; }
 th.positive-col { border-bottom: 3px solid #788c5d; }
 th.negative-col { border-bottom: 3px solid #c44; }
 th.test-col.positive-col { border-bottom: 3px solid #788c5d; }
 th.test-col.negative-col { border-bottom: 3px solid #c44; }
 .legend { font-family: 'Poppins', sans-serif; display: flex; gap: 20px; margin-bottom: 10px; font-size: 13px; align-items: center; }
 .legend-item { display: flex; align-items: center; gap: 6px; }
 .legend-swatch { width: 16px; height: 16px; border-radius: 3px; display: inline-block; }
 .swatch-positive { background: #141413; border-bottom: 3px solid #788c5d; }
 .swatch-negative { background: #141413; border-bottom: 3px solid #c44; }
 .swatch-test { background: #6a9bcc; }
 .swatch-train { background: #141413; }
 </style>
</head>
<body>
 <h1>""" + title_prefix + """Skill Description Optimization</h1>
 <div class="explainer">
 <strong>Optimizing your skill's description.</strong> This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
 </div>
"""]

    # Summary section. A test score of 0 is still a test score, so compare
    # against None rather than relying on truthiness.
    score_label = "(test)" if data.get("best_test_score") is not None else "(train)"
    html_parts.append(f"""
 <div class="summary">
 <p><strong>Original:</strong> {_escaped(data.get('original_description'))}</p>
 <p class="best"><strong>Best:</strong> {_escaped(data.get('best_description'))}</p>
 <p><strong>Best Score:</strong> {_escaped(data.get('best_score'))} {score_label}</p>
 <p><strong>Iterations:</strong> {_escaped(data.get('iterations_run', 0), 0)} | <strong>Train:</strong> {_escaped(data.get('train_size'), '?')} | <strong>Test:</strong> {_escaped(data.get('test_size'), '?')}</p>
 </div>
""")

    # Legend
    html_parts.append("""
 <div class="legend">
 <span style="font-weight:600">Query columns:</span>
 <span class="legend-item"><span class="legend-swatch swatch-positive"></span> Should trigger</span>
 <span class="legend-item"><span class="legend-swatch swatch-negative"></span> Should NOT trigger</span>
 <span class="legend-item"><span class="legend-swatch swatch-train"></span> Train</span>
 <span class="legend-item"><span class="legend-swatch swatch-test"></span> Test</span>
 </div>
""")

    # Table header
    html_parts.append("""
 <div class="table-container">
 <table>
 <thead>
 <tr>
 <th>Iter</th>
 <th>Train</th>
 <th>Test</th>
 <th class="query-col">Description</th>
""")

    # Column headers for train queries
    for qinfo in train_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f' <th class="{polarity}">{html.escape(qinfo["query"])}</th>\n')

    # Column headers for test queries (different color)
    for qinfo in test_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f' <th class="test-col {polarity}">{html.escape(qinfo["query"])}</th>\n')

    html_parts.append(""" </tr>
 </thead>
 <tbody>
""")

    # Pick the iteration to highlight: best test score when a test set
    # exists, otherwise best train score. max() would raise on an empty
    # history, so there is simply nothing to highlight in that case.
    if not history:
        best_iter = None
    elif test_queries:
        best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration")
    else:
        best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0)) or 0).get("iteration")

    # One table row per iteration
    for h in history:
        iteration = h.get("iteration", "?")
        description = h.get("description", "")
        train_results = h.get("train_results", h.get("results", [])) or []
        test_results = h.get("test_results") or []  # None when there is no holdout

        # Lookups so each cell lines up with its column's query
        train_by_query = {r["query"]: r for r in train_results}
        test_by_query = {r["query"]: r for r in test_results}

        # Aggregate correct/total runs across all retries
        train_correct, train_runs = _aggregate_runs(train_results)
        test_correct, test_runs = _aggregate_runs(test_results)
        train_class = _score_class(train_correct, train_runs)
        test_class = _score_class(test_correct, test_runs)

        row_class = "best-row" if iteration == best_iter else ""

        html_parts.append(f""" <tr class="{row_class}">
 <td>{_escaped(iteration)}</td>
 <td><span class="score {train_class}">{train_correct}/{train_runs}</span></td>
 <td><span class="score {test_class}">{test_correct}/{test_runs}</span></td>
 <td class="description">{_escaped(description, "")}</td>
""")

        html_parts.extend(_result_cells(train_queries, train_by_query, "result"))
        # Test cells get an extra class for the tinted background
        html_parts.extend(_result_cells(test_queries, test_by_query, "result test-result"))

        html_parts.append(" </tr>\n")

    html_parts.append(""" </tbody>
 </table>
 </div>
""")

    html_parts.append("""
</body>
</html>
""")

    return "".join(html_parts)
302
+
303
+
304
def main():
    """CLI entry point: read run_loop.py JSON and emit the HTML report.

    The input is a file path, or ``-`` to read JSON from stdin. The report is
    written to ``--output`` when given, otherwise printed to stdout.
    """
    parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output")
    parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)")
    parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)")
    parser.add_argument("--skill-name", default="", help="Skill name to include in the report title")
    args = parser.parse_args()

    if args.input == "-":
        data = json.load(sys.stdin)
    else:
        # JSON interchange is UTF-8; do not depend on the platform locale.
        data = json.loads(Path(args.input).read_text(encoding="utf-8"))

    html_output = generate_html(data, skill_name=args.skill_name)

    if args.output:
        # The page declares charset=utf-8 and contains non-ASCII glyphs
        # (check marks, em dash), so the file must be written as UTF-8.
        Path(args.output).write_text(html_output, encoding="utf-8")
        print(f"Report written to {args.output}", file=sys.stderr)
    else:
        print(html_output)


if __name__ == "__main__":
    main()
@@ -0,0 +1,248 @@
1
+ #!/usr/bin/env python3
2
+ """Improve a skill description based on eval results.
3
+
4
+ Takes eval results (from run_eval.py) and generates an improved description
5
+ using Claude with extended thinking.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import re
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ import anthropic
15
+
16
+ from scripts.utils import parse_skill_md
17
+
18
+
19
+ def improve_description(
20
+ client: anthropic.Anthropic,
21
+ skill_name: str,
22
+ skill_content: str,
23
+ current_description: str,
24
+ eval_results: dict,
25
+ history: list[dict],
26
+ model: str,
27
+ test_results: dict | None = None,
28
+ log_dir: Path | None = None,
29
+ iteration: int | None = None,
30
+ ) -> str:
31
+ """Call Claude to improve the description based on eval results."""
32
+ failed_triggers = [
33
+ r for r in eval_results["results"]
34
+ if r["should_trigger"] and not r["pass"]
35
+ ]
36
+ false_triggers = [
37
+ r for r in eval_results["results"]
38
+ if not r["should_trigger"] and not r["pass"]
39
+ ]
40
+
41
+ # Build scores summary
42
+ train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}"
43
+ if test_results:
44
+ test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
45
+ scores_summary = f"Train: {train_score}, Test: {test_score}"
46
+ else:
47
+ scores_summary = f"Train: {train_score}"
48
+
49
+ prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples.
50
+
51
+ The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones.
52
+
53
+ Here's the current description:
54
+ <current_description>
55
+ "{current_description}"
56
+ </current_description>
57
+
58
+ Current scores ({scores_summary}):
59
+ <scores_summary>
60
+ """
61
+ if failed_triggers:
62
+ prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n"
63
+ for r in failed_triggers:
64
+ prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
65
+ prompt += "\n"
66
+
67
+ if false_triggers:
68
+ prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n"
69
+ for r in false_triggers:
70
+ prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
71
+ prompt += "\n"
72
+
73
+ if history:
74
+ prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n"
75
+ for h in history:
76
+ train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}"
77
+ test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None
78
+ score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "")
79
+ prompt += f'<attempt {score_str}>\n'
80
+ prompt += f'Description: "{h["description"]}"\n'
81
+ if "results" in h:
82
+ prompt += "Train results:\n"
83
+ for r in h["results"]:
84
+ status = "PASS" if r["pass"] else "FAIL"
85
+ prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n'
86
+ if h.get("note"):
87
+ prompt += f'Note: {h["note"]}\n'
88
+ prompt += "</attempt>\n\n"
89
+
90
+ prompt += f"""</scores_summary>
91
+
92
+ Skill content (for context on what the skill does):
93
+ <skill_content>
94
+ {skill_content}
95
+ </skill_content>
96
+
97
+ Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold:
98
+
99
+ 1. Avoid overfitting
100
+ 2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description.
101
+
102
+ Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy.
103
+
104
+ Here are some tips that we've found to work well in writing these descriptions:
105
+ - The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does"
106
+ - The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works.
107
+ - The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable.
108
+ - If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings.
109
+
110
+ I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end.
111
+
112
+ Please respond with only the new description text in <new_description> tags, nothing else."""
113
+
114
+ response = client.messages.create(
115
+ model=model,
116
+ max_tokens=16000,
117
+ thinking={
118
+ "type": "enabled",
119
+ "budget_tokens": 10000,
120
+ },
121
+ messages=[{"role": "user", "content": prompt}],
122
+ )
123
+
124
+ # Extract thinking and text from response
125
+ thinking_text = ""
126
+ text = ""
127
+ for block in response.content:
128
+ if block.type == "thinking":
129
+ thinking_text = block.thinking
130
+ elif block.type == "text":
131
+ text = block.text
132
+
133
+ # Parse out the <new_description> tags
134
+ match = re.search(r"<new_description>(.*?)</new_description>", text, re.DOTALL)
135
+ description = match.group(1).strip().strip('"') if match else text.strip().strip('"')
136
+
137
+ # Log the transcript
138
+ transcript: dict = {
139
+ "iteration": iteration,
140
+ "prompt": prompt,
141
+ "thinking": thinking_text,
142
+ "response": text,
143
+ "parsed_description": description,
144
+ "char_count": len(description),
145
+ "over_limit": len(description) > 1024,
146
+ }
147
+
148
+ # If over 1024 chars, ask the model to shorten it
149
+ if len(description) > 1024:
150
+ shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in <new_description> tags."
151
+ shorten_response = client.messages.create(
152
+ model=model,
153
+ max_tokens=16000,
154
+ thinking={
155
+ "type": "enabled",
156
+ "budget_tokens": 10000,
157
+ },
158
+ messages=[
159
+ {"role": "user", "content": prompt},
160
+ {"role": "assistant", "content": text},
161
+ {"role": "user", "content": shorten_prompt},
162
+ ],
163
+ )
164
+
165
+ shorten_thinking = ""
166
+ shorten_text = ""
167
+ for block in shorten_response.content:
168
+ if block.type == "thinking":
169
+ shorten_thinking = block.thinking
170
+ elif block.type == "text":
171
+ shorten_text = block.text
172
+
173
+ match = re.search(r"<new_description>(.*?)</new_description>", shorten_text, re.DOTALL)
174
+ shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"')
175
+
176
+ transcript["rewrite_prompt"] = shorten_prompt
177
+ transcript["rewrite_thinking"] = shorten_thinking
178
+ transcript["rewrite_response"] = shorten_text
179
+ transcript["rewrite_description"] = shortened
180
+ transcript["rewrite_char_count"] = len(shortened)
181
+ description = shortened
182
+
183
+ transcript["final_description"] = description
184
+
185
+ if log_dir:
186
+ log_dir.mkdir(parents=True, exist_ok=True)
187
+ log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json"
188
+ log_file.write_text(json.dumps(transcript, indent=2))
189
+
190
+ return description
191
+
192
+
193
def main():
    """CLI entry point: produce an improved description from eval results.

    Reads the eval results JSON (and optional history JSON), asks the model
    for a new description, and prints a JSON object to stdout containing the
    new description and the history extended with the attempt just evaluated.
    Exits with status 1 when the skill directory has no SKILL.md.
    """
    parser = argparse.ArgumentParser(description="Improve a skill description based on eval results")
    parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr")
    args = parser.parse_args()

    skill_path = Path(args.skill_path)
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    # JSON interchange is UTF-8; do not depend on the platform locale.
    eval_results = json.loads(Path(args.eval_results).read_text(encoding="utf-8"))
    history = []
    if args.history:
        history = json.loads(Path(args.history).read_text(encoding="utf-8"))

    name, _, content = parse_skill_md(skill_path)
    # The description under evaluation comes from the eval results file.
    current_description = eval_results["description"]

    if args.verbose:
        print(f"Current: {current_description}", file=sys.stderr)
        print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr)

    client = anthropic.Anthropic()
    new_description = improve_description(
        client=client,
        skill_name=name,
        skill_content=content,
        current_description=current_description,
        eval_results=eval_results,
        history=history,
        model=args.model,
    )

    if args.verbose:
        print(f"Improved: {new_description}", file=sys.stderr)

    # Output as JSON with both the new description and updated history
    output = {
        "description": new_description,
        "history": history + [{
            "description": current_description,
            "passed": eval_results["summary"]["passed"],
            "failed": eval_results["summary"]["failed"],
            "total": eval_results["summary"]["total"],
            "results": eval_results["results"],
        }],
    }
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()