@skill-graph/cli 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +247 -0
- package/LICENSE +200 -0
- package/NOTICE +62 -0
- package/README.md +398 -0
- package/SKILL_GRAPH.md +443 -0
- package/bin/skill-graph.js +374 -0
- package/docs/ADOPTION.md +117 -0
- package/docs/CONFORMANCE.md +66 -0
- package/docs/PRIMER.md +384 -0
- package/docs/QUICKSTART-30MIN.md +333 -0
- package/docs/ROUTING-METRICS.md +120 -0
- package/docs/SKILL-MD-FORMAT-COMPATIBILITY.md +127 -0
- package/docs/SKILL_AUDIT_CHECKLIST.md +199 -0
- package/docs/SKILL_AUDIT_LOOP.md +195 -0
- package/docs/SKILL_METADATA_PROTOCOL.md +609 -0
- package/docs/_archived/marketplace-publication-priority-2026-05-18.md +239 -0
- package/docs/adr/0001-predicate-set.md +69 -0
- package/docs/adr/0002-json-ld-context.md +82 -0
- package/docs/adr/0003-ontoclean-rigidity-tags.md +65 -0
- package/docs/adr/0004-persistent-identifiers.md +74 -0
- package/docs/adr/0005-freshness-consolidation.md +70 -0
- package/docs/adr/0006-revise-predicate-rename.md +105 -0
- package/docs/adr/0007-audit-loop-cadence.md +99 -0
- package/docs/adr/0008-skill-surface-split-and-curation-policy.md +93 -0
- package/docs/category-consumers.md +168 -0
- package/docs/concept-map.md +194 -0
- package/docs/diagrams/drift-states.mmd +21 -0
- package/docs/diagrams/manifest-pipeline.mmd +25 -0
- package/docs/diagrams/routing-harness.mmd +41 -0
- package/docs/diagrams/starter-graph.mmd +53 -0
- package/docs/field-decision-guide.md +315 -0
- package/docs/field-rationale.md +211 -0
- package/docs/field-reference.generated.md +624 -0
- package/docs/field-reference.md +1426 -0
- package/docs/glossary.md +190 -0
- package/docs/head-noun-glossary.md +63 -0
- package/docs/images/audit-phases.png +0 -0
- package/docs/images/drift-states.png +0 -0
- package/docs/images/graded-mode.png +0 -0
- package/docs/images/manifest-pipeline.png +0 -0
- package/docs/images/routing-harness.png +0 -0
- package/docs/images/skill-anatomy.png +0 -0
- package/docs/images/starter-graph.png +0 -0
- package/docs/images/system-model.png +0 -0
- package/docs/integrations/github-actions.md +155 -0
- package/docs/manifest-field-mapping.md +443 -0
- package/docs/marketplace-publication-queue.generated.md +240 -0
- package/docs/marketplace-release-agent-prompt.md +82 -0
- package/docs/marketplace-skill-candidate-list.md +272 -0
- package/docs/marketplace-syndication.md +222 -0
- package/docs/migration-sample-review.md +155 -0
- package/docs/migrations/v4-to-v5.md +168 -0
- package/docs/migrations/v5-to-v6.md +221 -0
- package/docs/name-exceptions.yaml +37 -0
- package/docs/plans/marketplace-p1-public-migration-plan.md +41 -0
- package/docs/plans/multi-root-workspace.md +148 -0
- package/docs/plans/scripts-roadmap.md +107 -0
- package/docs/plans/v4-schema-bump.md +160 -0
- package/docs/plans/wave-2-extraction.md +122 -0
- package/docs/positioning-vs-marketplaces.md +175 -0
- package/docs/proposals/skill-audit-loop-positioning.md +160 -0
- package/docs/quality-doctrine.md +138 -0
- package/docs/recommended-skills.md +150 -0
- package/docs/research/skill-comprehension-eval-research.md +1830 -0
- package/docs/research/skill-retrieval-evidence.md +66 -0
- package/docs/skill-metadata-protocol.md +471 -0
- package/docs/skills-sh-maintainer-cleanup-request.md +80 -0
- package/examples/audits/a11y/findings.md +52 -0
- package/examples/audits/a11y/scorecard.md +21 -0
- package/examples/audits/a11y/verdict.md +44 -0
- package/examples/audits/debugging/findings.md +59 -0
- package/examples/audits/debugging/scorecard.md +22 -0
- package/examples/audits/debugging/verdict.md +33 -0
- package/examples/audits/documentation/findings.md +59 -0
- package/examples/audits/documentation/scorecard.md +22 -0
- package/examples/audits/documentation/verdict.md +33 -0
- package/examples/evals/a11y.json +140 -0
- package/examples/evals/api-design.json +52 -0
- package/examples/evals/code-review.json +52 -0
- package/examples/evals/data-modeling.json +52 -0
- package/examples/evals/database-migration.json +52 -0
- package/examples/evals/debugging.json +118 -0
- package/examples/evals/dependency-architecture.json +52 -0
- package/examples/evals/design-system-architecture.json +52 -0
- package/examples/evals/error-tracking.json +52 -0
- package/examples/evals/event-contract-design.json +52 -0
- package/examples/evals/form-ux-architecture.json +52 -0
- package/examples/evals/framework-fit-analysis.json +52 -0
- package/examples/evals/graph-audit.json +139 -0
- package/examples/evals/information-architecture.json +52 -0
- package/examples/evals/interaction-feedback.json +52 -0
- package/examples/evals/interaction-patterns.json +52 -0
- package/examples/evals/layout-composition.json +52 -0
- package/examples/evals/lint-overlay.json +117 -0
- package/examples/evals/microcopy.json +52 -0
- package/examples/evals/observability-modeling.json +52 -0
- package/examples/evals/pattern-recognition.json +96 -0
- package/examples/evals/performance-engineering.json +52 -0
- package/examples/evals/refactor.json +128 -0
- package/examples/evals/semiotics.json +52 -0
- package/examples/evals/skill-infrastructure.json +96 -0
- package/examples/evals/skill-router.json +140 -0
- package/examples/evals/skill-router.routing.json +113 -0
- package/examples/evals/system-interface-contracts.json +52 -0
- package/examples/evals/task-analysis.json +52 -0
- package/examples/evals/testing-strategy.json +118 -0
- package/examples/evals/type-safety.json +249 -0
- package/examples/evals/visual-design-foundations.json +52 -0
- package/examples/evals/webhook-integration.json +52 -0
- package/examples/exports/a11y.skill-md.md +80 -0
- package/examples/exports/debugging.skill-md.md +80 -0
- package/examples/exports/refactor.skill-md.md +78 -0
- package/examples/exports/testing-strategy.skill-md.md +81 -0
- package/examples/projects/markdown-static-site/README.md +115 -0
- package/examples/projects/markdown-static-site/skills/content-source-router/SKILL.md +131 -0
- package/examples/projects/markdown-static-site/skills/image-optimization-pipeline-config/SKILL.md +132 -0
- package/examples/projects/markdown-static-site/skills/link-rot-detection/SKILL.md +103 -0
- package/examples/projects/markdown-static-site/skills/markdown-post-frontmatter-validation/SKILL.md +133 -0
- package/examples/projects/markdown-static-site/skills/migrate-posts-to-v2-frontmatter/SKILL.md +140 -0
- package/examples/projects/saas-stripe-postgres/README.md +208 -0
- package/examples/projects/saas-stripe-postgres/db/migrations/0004_canonicalize_orders.sql +37 -0
- package/examples/projects/saas-stripe-postgres/db/schema.sql +112 -0
- package/examples/projects/saas-stripe-postgres/skills/migrate-orders-to-canonical-schema/SKILL.md +149 -0
- package/examples/projects/saas-stripe-postgres/skills/nextjs-server-action-validation/SKILL.md +154 -0
- package/examples/projects/saas-stripe-postgres/skills/payment-provider-router/SKILL.md +153 -0
- package/examples/projects/saas-stripe-postgres/skills/postgres-rls-pattern/SKILL.md +163 -0
- package/examples/projects/saas-stripe-postgres/skills/stripe-webhook-signature-verification/SKILL.md +137 -0
- package/examples/protocol/skill-metadata-template.md +301 -0
- package/examples/protocol/skills.manifest.sample.json +13245 -0
- package/examples/skill-metadata-template.md +317 -0
- package/examples/skills.manifest.sample.json +13519 -0
- package/examples/tests/v3-1-skos-fixture/SKILL.md +93 -0
- package/marketplace/README.md +17 -0
- package/marketplace/skills/a11y/SKILL.md +66 -0
- package/marketplace/skills/acid-fundamentals/SKILL.md +106 -0
- package/marketplace/skills/agent-engineering/SKILL.md +386 -0
- package/marketplace/skills/agent-eval-design/SKILL.md +55 -0
- package/marketplace/skills/ai-native-development/SKILL.md +294 -0
- package/marketplace/skills/api-design/SKILL.md +60 -0
- package/marketplace/skills/architecture-decision-records/SKILL.md +55 -0
- package/marketplace/skills/background-jobs/SKILL.md +265 -0
- package/marketplace/skills/bounded-context-mapping/SKILL.md +55 -0
- package/marketplace/skills/cap-theorem-tradeoffs/SKILL.md +127 -0
- package/marketplace/skills/client-server-boundary/SKILL.md +187 -0
- package/marketplace/skills/code-review/SKILL.md +120 -0
- package/marketplace/skills/color-system-design/SKILL.md +43 -0
- package/marketplace/skills/component-architecture/SKILL.md +126 -0
- package/marketplace/skills/compression/SKILL.md +112 -0
- package/marketplace/skills/conceptual-modeling/SKILL.md +181 -0
- package/marketplace/skills/connection-pooling/SKILL.md +105 -0
- package/marketplace/skills/constraint-awareness/SKILL.md +287 -0
- package/marketplace/skills/content-monitor/SKILL.md +209 -0
- package/marketplace/skills/context-engineering/SKILL.md +320 -0
- package/marketplace/skills/context-graph/SKILL.md +174 -0
- package/marketplace/skills/context-management/SKILL.md +174 -0
- package/marketplace/skills/context-window/SKILL.md +239 -0
- package/marketplace/skills/contract-testing/SKILL.md +120 -0
- package/marketplace/skills/cron-scheduling/SKILL.md +223 -0
- package/marketplace/skills/dark-mode-implementation/SKILL.md +47 -0
- package/marketplace/skills/data-modeling/SKILL.md +59 -0
- package/marketplace/skills/data-modeling-fundamentals/SKILL.md +117 -0
- package/marketplace/skills/database-migration/SKILL.md +429 -0
- package/marketplace/skills/debugging/SKILL.md +67 -0
- package/marketplace/skills/dependency-architecture/SKILL.md +58 -0
- package/marketplace/skills/design-module-composition/SKILL.md +43 -0
- package/marketplace/skills/design-system-architecture/SKILL.md +61 -0
- package/marketplace/skills/design-thinking/SKILL.md +44 -0
- package/marketplace/skills/diagnosis/SKILL.md +296 -0
- package/marketplace/skills/diff-analysis/SKILL.md +188 -0
- package/marketplace/skills/e2e-test-design/SKILL.md +113 -0
- package/marketplace/skills/entity-relationship-modeling/SKILL.md +218 -0
- package/marketplace/skills/epistemic-grounding/SKILL.md +112 -0
- package/marketplace/skills/error-boundary/SKILL.md +235 -0
- package/marketplace/skills/error-tracking/SKILL.md +261 -0
- package/marketplace/skills/eval-driven-development/SKILL.md +147 -0
- package/marketplace/skills/evaluation/SKILL.md +113 -0
- package/marketplace/skills/event-contract-design/SKILL.md +60 -0
- package/marketplace/skills/event-storming/SKILL.md +56 -0
- package/marketplace/skills/form-ux-architecture/SKILL.md +60 -0
- package/marketplace/skills/framework-fit-analysis/SKILL.md +59 -0
- package/marketplace/skills/frontend-architecture/SKILL.md +43 -0
- package/marketplace/skills/generative-ui/SKILL.md +118 -0
- package/marketplace/skills/graph-audit/SKILL.md +81 -0
- package/marketplace/skills/guardrails/SKILL.md +118 -0
- package/marketplace/skills/hooks-patterns/SKILL.md +185 -0
- package/marketplace/skills/http-semantics/SKILL.md +136 -0
- package/marketplace/skills/ideation/SKILL.md +41 -0
- package/marketplace/skills/indexing-strategy/SKILL.md +108 -0
- package/marketplace/skills/information-architecture/SKILL.md +59 -0
- package/marketplace/skills/integration-test-design/SKILL.md +111 -0
- package/marketplace/skills/intent-recognition/SKILL.md +136 -0
- package/marketplace/skills/interaction-feedback/SKILL.md +59 -0
- package/marketplace/skills/interaction-patterns/SKILL.md +59 -0
- package/marketplace/skills/journey-mapping/SKILL.md +41 -0
- package/marketplace/skills/keywords/SKILL.md +213 -0
- package/marketplace/skills/knowledge-modeling/SKILL.md +232 -0
- package/marketplace/skills/layout-composition/SKILL.md +59 -0
- package/marketplace/skills/linguistics/SKILL.md +429 -0
- package/marketplace/skills/lint-overlay/SKILL.md +76 -0
- package/marketplace/skills/mental-models/SKILL.md +126 -0
- package/marketplace/skills/merge-queue/SKILL.md +94 -0
- package/marketplace/skills/methodology/SKILL.md +317 -0
- package/marketplace/skills/microcopy/SKILL.md +232 -0
- package/marketplace/skills/middleware-patterns/SKILL.md +363 -0
- package/marketplace/skills/mobile-responsive-ux/SKILL.md +287 -0
- package/marketplace/skills/mutation-testing/SKILL.md +112 -0
- package/marketplace/skills/naming-conventions/SKILL.md +112 -0
- package/marketplace/skills/observability-modeling/SKILL.md +59 -0
- package/marketplace/skills/ontology-modeling/SKILL.md +67 -0
- package/marketplace/skills/owasp-security/SKILL.md +153 -0
- package/marketplace/skills/pattern-recognition/SKILL.md +472 -0
- package/marketplace/skills/performance-budgets/SKILL.md +185 -0
- package/marketplace/skills/performance-engineering/SKILL.md +58 -0
- package/marketplace/skills/performance-testing/SKILL.md +125 -0
- package/marketplace/skills/printify/SKILL.md +42 -0
- package/marketplace/skills/prioritization/SKILL.md +118 -0
- package/marketplace/skills/problem-framing/SKILL.md +41 -0
- package/marketplace/skills/problem-locating-solving/SKILL.md +203 -0
- package/marketplace/skills/project-knowledge-extraction/SKILL.md +54 -0
- package/marketplace/skills/prompt-craft/SKILL.md +134 -0
- package/marketplace/skills/prompt-injection-defense/SKILL.md +132 -0
- package/marketplace/skills/property-based-testing/SKILL.md +100 -0
- package/marketplace/skills/prototyping/SKILL.md +43 -0
- package/marketplace/skills/query-optimization/SKILL.md +144 -0
- package/marketplace/skills/real-time-updates/SKILL.md +324 -0
- package/marketplace/skills/ref-patterns/SKILL.md +284 -0
- package/marketplace/skills/refactor/SKILL.md +65 -0
- package/marketplace/skills/rendering-models/SKILL.md +142 -0
- package/marketplace/skills/replication-patterns/SKILL.md +110 -0
- package/marketplace/skills/research-synthesis/SKILL.md +41 -0
- package/marketplace/skills/route-handler-design/SKILL.md +347 -0
- package/marketplace/skills/schema-evolution/SKILL.md +140 -0
- package/marketplace/skills/security-fundamentals/SKILL.md +139 -0
- package/marketplace/skills/semantic-center/SKILL.md +194 -0
- package/marketplace/skills/semantic-relations/SKILL.md +250 -0
- package/marketplace/skills/semantics/SKILL.md +366 -0
- package/marketplace/skills/semiotics/SKILL.md +230 -0
- package/marketplace/skills/seo-strategy/SKILL.md +260 -0
- package/marketplace/skills/server-actions-design/SKILL.md +243 -0
- package/marketplace/skills/server-components-design/SKILL.md +190 -0
- package/marketplace/skills/sharding-strategy/SKILL.md +123 -0
- package/marketplace/skills/shopify/SKILL.md +42 -0
- package/marketplace/skills/skill-infrastructure/SKILL.md +320 -0
- package/marketplace/skills/skill-router/SKILL.md +71 -0
- package/marketplace/skills/skill-scaffold/SKILL.md +105 -0
- package/marketplace/skills/snapshot-testing/SKILL.md +120 -0
- package/marketplace/skills/spec-driven-development/SKILL.md +148 -0
- package/marketplace/skills/state-machine-modeling/SKILL.md +56 -0
- package/marketplace/skills/state-management/SKILL.md +134 -0
- package/marketplace/skills/streaming-architecture/SKILL.md +194 -0
- package/marketplace/skills/summarization/SKILL.md +156 -0
- package/marketplace/skills/suspense-patterns/SKILL.md +265 -0
- package/marketplace/skills/system-interface-contracts/SKILL.md +59 -0
- package/marketplace/skills/task-analysis/SKILL.md +201 -0
- package/marketplace/skills/taxonomy-design/SKILL.md +66 -0
- package/marketplace/skills/test-coverage-strategy/SKILL.md +108 -0
- package/marketplace/skills/test-doubles-design/SKILL.md +98 -0
- package/marketplace/skills/test-driven-development/SKILL.md +96 -0
- package/marketplace/skills/testing-strategy/SKILL.md +67 -0
- package/marketplace/skills/theme-system-design/SKILL.md +43 -0
- package/marketplace/skills/tool-call-flow/SKILL.md +229 -0
- package/marketplace/skills/tool-call-strategy/SKILL.md +292 -0
- package/marketplace/skills/transaction-isolation/SKILL.md +98 -0
- package/marketplace/skills/type-safety/SKILL.md +177 -0
- package/marketplace/skills/typography-system/SKILL.md +43 -0
- package/marketplace/skills/usability-testing/SKILL.md +43 -0
- package/marketplace/skills/user-research/SKILL.md +43 -0
- package/marketplace/skills/vercel-composition-patterns/SKILL.md +157 -0
- package/marketplace/skills/version-control/SKILL.md +233 -0
- package/marketplace/skills/visual-design-foundations/SKILL.md +59 -0
- package/marketplace/skills/visual-hierarchy/SKILL.md +43 -0
- package/marketplace/skills/webhook-integration/SKILL.md +331 -0
- package/marketplace/skills/writing-humanizer/SKILL.md +380 -0
- package/package.json +67 -0
- package/schemas/manifest.schema.json +811 -0
- package/schemas/manifest.v2.schema.json +164 -0
- package/schemas/manifest.v3.schema.json +758 -0
- package/schemas/manifest.v4.schema.json +755 -0
- package/schemas/manifest.v5.schema.json +755 -0
- package/schemas/manifest.v6.schema.json +811 -0
- package/schemas/skill.context.jsonld +279 -0
- package/schemas/skill.schema.json +919 -0
- package/schemas/skill.v2.schema.json +201 -0
- package/schemas/skill.v3.schema.json +827 -0
- package/schemas/skill.v4.schema.json +822 -0
- package/schemas/skill.v5.schema.json +830 -0
- package/schemas/skill.v6.schema.json +946 -0
- package/schemas/vocabulary/keywords.json +180 -0
- package/schemas/vocabulary/workspace_tags.json +23 -0
- package/scripts/__tests__/migrate-skill-v2-to-v3.test.js +161 -0
- package/scripts/__tests__/migrate-skill-v3-to-v4.test.js +158 -0
- package/scripts/__tests__/test-export-parser-drift.js +149 -0
- package/scripts/__tests__/test-marketplace-export.js +114 -0
- package/scripts/__tests__/test-router-paths.js +82 -0
- package/scripts/__tests__/test-stability-promotion.js +244 -0
- package/scripts/__tests__/test-v3-1-alias-contract.js +109 -0
- package/scripts/__tests__/test-v3-1-skos-runtime.js +116 -0
- package/scripts/backfill-schema-version.js +198 -0
- package/scripts/build-field-reference.js +160 -0
- package/scripts/build-retrieval-baseline.js +511 -0
- package/scripts/check-markdown-links.js +211 -0
- package/scripts/check-protocol-consistency.js +979 -0
- package/scripts/export-marketplace-skills.js +610 -0
- package/scripts/export-skill.js +374 -0
- package/scripts/generate-manifest.js +787 -0
- package/scripts/lib/alias-contract.js +83 -0
- package/scripts/lib/audit-prompt-builder.js +771 -0
- package/scripts/lib/mock-grader.js +134 -0
- package/scripts/lib/parse-frontmatter.js +429 -0
- package/scripts/lib/roots.js +119 -0
- package/scripts/lint/check-archetype-sections.js +185 -0
- package/scripts/lint/check-category-enum.js +83 -0
- package/scripts/lint/check-routing-eval.js +146 -0
- package/scripts/lint/check-routing-quality.js +211 -0
- package/scripts/lint/check-stability-promotion.js +220 -0
- package/scripts/lint/format-code-frame.js +206 -0
- package/scripts/marketplace-install.js +125 -0
- package/scripts/migrate-category-to-enum.js +169 -0
- package/scripts/migrate-skill-v2-to-v3.js +424 -0
- package/scripts/migrate-skill-v3-to-v4.js +200 -0
- package/scripts/migrate-skill-v5-to-v6.js +304 -0
- package/scripts/restructure-by-category.js +85 -0
- package/scripts/seed-publication-classification.js +282 -0
- package/scripts/skill-audit.js +893 -0
- package/scripts/skill-graph-drift.js +483 -0
- package/scripts/skill-graph-route.js +766 -0
- package/scripts/skill-graph-routing-eval.js +393 -0
- package/scripts/skill-lint.js +1317 -0
- package/scripts/skill-overlap.js +213 -0
- package/scripts/verify-skill-md-export.js +201 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "refactor",
|
|
3
|
+
"subject": "Behavior-preserving code reorganization: contract identification, next-change justification, smallest useful cut, behavior re-verification, and explicit stop condition",
|
|
4
|
+
"adjacent_concepts": ["debugging", "testing-strategy", "documentation"],
|
|
5
|
+
"evals": [
|
|
6
|
+
{
|
|
7
|
+
"id": 1,
|
|
8
|
+
"prompt": "A contributor proposes a \"code cleanup\" refactor but cannot name a specific pending change that will become easier because of it. According to the refactor skill's Workflow table, what is step 2 and what should happen when its question is answered \"no\"?",
|
|
9
|
+
"dimension": "definition",
|
|
10
|
+
"substance": "domain",
|
|
11
|
+
"calibration": "semantic",
|
|
12
|
+
"truth_mode": "code_verification",
|
|
13
|
+
"skill_type": "workflow",
|
|
14
|
+
"criticality": "critical",
|
|
15
|
+
"truth_sources": ["skills/refactor/SKILL.md:82-89"]
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": 2,
|
|
19
|
+
"prompt": "The refactor skill's Philosophy says the honest test is not \"this feels better\" but \"the next concrete change will be materially easier.\" Why does the skill reject subjective taste as a justification, and what failure mode does the next-change test prevent?",
|
|
20
|
+
"dimension": "mental_model",
|
|
21
|
+
"substance": "domain",
|
|
22
|
+
"calibration": "semantic",
|
|
23
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
24
|
+
"skill_type": "concept",
|
|
25
|
+
"criticality": "high",
|
|
26
|
+
"truth_sources": ["skills/refactor/SKILL.md:74-76"]
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"id": 3,
|
|
30
|
+
"prompt": "An engineer is chasing a null-pointer exception that surfaces intermittently in production. They wonder if they should engage the refactor skill to clean up the suspected code paths. Should they? Cite the negative-routing rule and name the skill that applies instead.",
|
|
31
|
+
"dimension": "boundary",
|
|
32
|
+
"substance": "contradiction-check",
|
|
33
|
+
"calibration": "semantic",
|
|
34
|
+
"truth_mode": "code_verification",
|
|
35
|
+
"skill_type": "concept",
|
|
36
|
+
"criticality": "high",
|
|
37
|
+
"truth_sources": ["skills/refactor/SKILL.md:107-113"]
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"id": 4,
|
|
41
|
+
"prompt": "A refactor commit touches three abstraction layers at once and the tests still pass. According to the refactor skill's \"When to back out\" section, is this acceptable? If not, what is the correct remediation and why?",
|
|
42
|
+
"dimension": "application",
|
|
43
|
+
"substance": "domain",
|
|
44
|
+
"calibration": "process",
|
|
45
|
+
"truth_mode": "process_correctness",
|
|
46
|
+
"skill_type": "workflow",
|
|
47
|
+
"criticality": "high",
|
|
48
|
+
"truth_sources": ["skills/refactor/SKILL.md:90-94"]
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"id": 5,
|
|
52
|
+
"prompt": "The refactor skill declares `depends_on: testing-strategy` in its relations. Why does refactor declare a hard dependency on testing-strategy rather than a softer `adjacent` or `verify_with` relation?",
|
|
53
|
+
"dimension": "purpose",
|
|
54
|
+
"substance": "domain",
|
|
55
|
+
"calibration": "semantic",
|
|
56
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
57
|
+
"skill_type": "concept",
|
|
58
|
+
"criticality": "high",
|
|
59
|
+
"truth_sources": ["skills/refactor/SKILL.md:45-56", "skills/refactor/SKILL.md:82-89"]
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"id": 6,
|
|
63
|
+
"prompt": "A team claims their refactor is safe because they added new tests to cover the refactored code. According to the refactor skill's step 1 and its Verification checklist, is the refactor's safety properly established? What is wrong with using newly-added tests as the behavior-preservation contract?",
|
|
64
|
+
"dimension": "application",
|
|
65
|
+
"substance": "contradiction-check",
|
|
66
|
+
"calibration": "semantic",
|
|
67
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
68
|
+
"skill_type": "concept",
|
|
69
|
+
"criticality": "critical",
|
|
70
|
+
"truth_sources": ["skills/refactor/SKILL.md:82-89", "skills/refactor/SKILL.md:100-105"]
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"id": 7,
|
|
74
|
+
"prompt": "A contributor asks the refactor skill to help write a changelog entry that explains what the refactor changed and why. Should the skill accept? Cite the negative-routing rule that decides this.",
|
|
75
|
+
"dimension": "boundary",
|
|
76
|
+
"substance": "contradiction-check",
|
|
77
|
+
"calibration": "semantic",
|
|
78
|
+
"truth_mode": "code_verification",
|
|
79
|
+
"skill_type": "concept",
|
|
80
|
+
"criticality": "normal",
|
|
81
|
+
"truth_sources": ["skills/refactor/SKILL.md:107-113"]
|
|
82
|
+
},
|
|
83
|
+
{
|
|
84
|
+
"id": 8,
|
|
85
|
+
"prompt": "After a refactor commit is pushed, a previously-green test from the contract suite is now red. According to the refactor skill's `## When to back out` section, what is the correct response? Why does the skill prescribe reverting immediately rather than forward-fixing the test?",
|
|
86
|
+
"dimension": "application",
|
|
87
|
+
"substance": "domain",
|
|
88
|
+
"calibration": "process",
|
|
89
|
+
"truth_mode": "process_correctness",
|
|
90
|
+
"skill_type": "workflow",
|
|
91
|
+
"criticality": "high",
|
|
92
|
+
"truth_sources": ["skills/refactor/SKILL.md:90-94"]
|
|
93
|
+
},
|
|
94
|
+
{
|
|
95
|
+
"id": 9,
|
|
96
|
+
"prompt": "An engineer has identified a refactor goal and plans three structural changes: rename a module, extract a helper, and collapse two duplicated branches. According to Workflow step 3 (Smallest useful cut), should these ship as one commit or three? What is the rule's reasoning, and what failure mode does splitting prevent?",
|
|
97
|
+
"dimension": "application",
|
|
98
|
+
"substance": "domain",
|
|
99
|
+
"calibration": "process",
|
|
100
|
+
"truth_mode": "process_correctness",
|
|
101
|
+
"skill_type": "workflow",
|
|
102
|
+
"criticality": "normal",
|
|
103
|
+
"truth_sources": ["skills/refactor/SKILL.md:82-89"]
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"id": 10,
|
|
107
|
+
"prompt": "A product manager asks the refactor skill to add retry logic with exponential backoff to a flaky network call. The change adds new behavior (retrying on failure) that the system did not previously have. Should the refactor skill accept this task, and if not, what is the boundary rule that decides this? Distinguish the \"add retry logic\" request from a refactor whose contract stays the same.",
|
|
108
|
+
"dimension": "boundary",
|
|
109
|
+
"substance": "contradiction-check",
|
|
110
|
+
"calibration": "semantic",
|
|
111
|
+
"truth_mode": "code_verification",
|
|
112
|
+
"skill_type": "concept",
|
|
113
|
+
"criticality": "high",
|
|
114
|
+
"truth_sources": ["skills/refactor/SKILL.md:45-55", "skills/refactor/SKILL.md:107-113"]
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
"id": 11,
|
|
118
|
+
"prompt": "An engineer is about to run the refactor skill on their change and is looking at two artifacts in the SKILL.md: the `## Verification` checklist and the `## Evals` pointer to `examples/evals/refactor.json`. The skill explicitly warns: \"Do not conflate them — the checklist is for the engineer, the eval is for the grader.\" Explain the distinction: what does the engineer consult the Verification checklist for, what does the eval artifact do instead, and why does the skill refuse to merge the two?",
|
|
119
|
+
"dimension": "mental_model",
|
|
120
|
+
"substance": "domain",
|
|
121
|
+
"calibration": "semantic",
|
|
122
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
123
|
+
"skill_type": "concept",
|
|
124
|
+
"criticality": "normal",
|
|
125
|
+
"truth_sources": ["skills/refactor/SKILL.md:96-105"]
|
|
126
|
+
}
|
|
127
|
+
]
|
|
128
|
+
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "semiotics",
|
|
3
|
+
"subject": "Interface sign-system analysis for icons, colors, badges, shapes, visual metaphors, affordances, signifier/signified mapping, and sign drift",
|
|
4
|
+
"adjacent_concepts": ["microcopy", "visual-design-foundations", "a11y", "semantic-relations"],
|
|
5
|
+
"grounding_note": "Truth sources cite the whole SKILL.md file to keep the initial eval surface stable while routing boundaries are tightened.",
|
|
6
|
+
"evals": [
|
|
7
|
+
{
|
|
8
|
+
"id": 1,
|
|
9
|
+
"prompt": "A dashboard uses green for both revenue increases and cost increases, causing users to read both as good. Which skill owns the sign-system failure?",
|
|
10
|
+
"dimension": "application",
|
|
11
|
+
"substance": "domain",
|
|
12
|
+
"calibration": "process",
|
|
13
|
+
"truth_mode": "process_correctness",
|
|
14
|
+
"skill_type": "concept",
|
|
15
|
+
"criticality": "high",
|
|
16
|
+
"truth_sources": ["skills/semiotics/SKILL.md"]
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"id": 2,
|
|
20
|
+
"prompt": "Users keep clicking a label because it looks like an affordance. What should semiotics inspect before recommending a fix?",
|
|
21
|
+
"dimension": "application",
|
|
22
|
+
"substance": "domain",
|
|
23
|
+
"calibration": "semantic",
|
|
24
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
25
|
+
"skill_type": "concept",
|
|
26
|
+
"criticality": "high",
|
|
27
|
+
"truth_sources": ["skills/semiotics/SKILL.md"]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": 3,
|
|
31
|
+
"prompt": "The task is to rewrite tooltip wording and validation copy. Should semiotics accept the work?",
|
|
32
|
+
"dimension": "boundary",
|
|
33
|
+
"substance": "contradiction-check",
|
|
34
|
+
"calibration": "semantic",
|
|
35
|
+
"truth_mode": "code_verification",
|
|
36
|
+
"skill_type": "concept",
|
|
37
|
+
"criticality": "normal",
|
|
38
|
+
"truth_sources": ["skills/semiotics/SKILL.md"]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"id": 4,
|
|
42
|
+
"prompt": "The task is WCAG contrast math, focus order, and screen-reader behavior for the same visual surface. Which boundary should semiotics respect?",
|
|
43
|
+
"dimension": "boundary",
|
|
44
|
+
"substance": "contradiction-check",
|
|
45
|
+
"calibration": "semantic",
|
|
46
|
+
"truth_mode": "code_verification",
|
|
47
|
+
"skill_type": "concept",
|
|
48
|
+
"criticality": "normal",
|
|
49
|
+
"truth_sources": ["skills/semiotics/SKILL.md"]
|
|
50
|
+
}
|
|
51
|
+
]
|
|
52
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "skill-infrastructure",
|
|
3
|
+
"subject": "Deterministic health tooling for SKILL.md libraries: inventory, protocol consistency, conflict detection, routing health, and drift detection",
|
|
4
|
+
"adjacent_concepts": ["skill-scaffold", "graph-audit", "testing-strategy", "lint-overlay"],
|
|
5
|
+
"grounding_note": "This portable skill is grounded in Skill Graph's health-tooling categories and uses whole-file truth sources to avoid brittle line drift while the skill itself continues to evolve.",
|
|
6
|
+
"evals": [
|
|
7
|
+
{
|
|
8
|
+
"id": 1,
|
|
9
|
+
"prompt": "A team has 200 SKILL.md files and no automated checks. Which health-tooling categories should they add first?",
|
|
10
|
+
"dimension": "definition",
|
|
11
|
+
"substance": "domain",
|
|
12
|
+
"calibration": "semantic",
|
|
13
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
14
|
+
"skill_type": "concept",
|
|
15
|
+
"criticality": "high",
|
|
16
|
+
"truth_sources": ["skills/skill-infrastructure/SKILL.md"]
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"id": 2,
|
|
20
|
+
"prompt": "The manifest generator works on your machine but the sample manifest in the repo is stale. Which class of skill-infrastructure check owns this?",
|
|
21
|
+
"dimension": "application",
|
|
22
|
+
"substance": "domain",
|
|
23
|
+
"calibration": "process",
|
|
24
|
+
"truth_mode": "process_correctness",
|
|
25
|
+
"skill_type": "workflow",
|
|
26
|
+
"criticality": "high",
|
|
27
|
+
"truth_sources": ["skills/skill-infrastructure/SKILL.md", "scripts/generate-manifest.js"]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": 3,
|
|
31
|
+
"prompt": "Two skills give opposite instructions for the same target: one says always use a helper and the other says never use it. How should this be handled?",
|
|
32
|
+
"dimension": "application",
|
|
33
|
+
"substance": "contradiction-check",
|
|
34
|
+
"calibration": "semantic",
|
|
35
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
36
|
+
"skill_type": "concept",
|
|
37
|
+
"criticality": "high",
|
|
38
|
+
"truth_sources": ["skills/skill-infrastructure/SKILL.md"]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"id": 4,
|
|
42
|
+
"prompt": "A routing-miss report says users frequently ask for 'tool poisoning' but no skill routes. What should skill-infrastructure recommend?",
|
|
43
|
+
"dimension": "application",
|
|
44
|
+
"substance": "domain",
|
|
45
|
+
"calibration": "process",
|
|
46
|
+
"truth_mode": "process_correctness",
|
|
47
|
+
"skill_type": "workflow",
|
|
48
|
+
"criticality": "normal",
|
|
49
|
+
"truth_sources": ["skills/skill-infrastructure/SKILL.md"]
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"id": 5,
|
|
53
|
+
"prompt": "A skill declares local truth sources but no truth_source_hashes. What state should the drift sentinel report, and what is the fix?",
|
|
54
|
+
"dimension": "mental_model",
|
|
55
|
+
"substance": "domain",
|
|
56
|
+
"calibration": "semantic",
|
|
57
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
58
|
+
"skill_type": "concept",
|
|
59
|
+
"criticality": "high",
|
|
60
|
+
"truth_sources": ["skills/skill-infrastructure/SKILL.md", "scripts/skill-graph-drift.js"]
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"id": 6,
|
|
64
|
+
"prompt": "An existing skill has five evals, all with positive expectations. Is that enough for a production skill library?",
|
|
65
|
+
"dimension": "purpose",
|
|
66
|
+
"substance": "domain",
|
|
67
|
+
"calibration": "semantic",
|
|
68
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
69
|
+
"skill_type": "concept",
|
|
70
|
+
"criticality": "normal",
|
|
71
|
+
"truth_sources": ["skills/skill-infrastructure/SKILL.md"]
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"id": 7,
|
|
75
|
+
"prompt": "A batch skill change leaves the working tree dirty, but someone wants to regenerate the manifest and commit it anyway. What does this skill say?",
|
|
76
|
+
"dimension": "boundary",
|
|
77
|
+
"substance": "contradiction-check",
|
|
78
|
+
"calibration": "semantic",
|
|
79
|
+
"truth_mode": "process_correctness",
|
|
80
|
+
"skill_type": "workflow",
|
|
81
|
+
"criticality": "high",
|
|
82
|
+
"truth_sources": ["skills/skill-infrastructure/SKILL.md"]
|
|
83
|
+
},
|
|
84
|
+
{
|
|
85
|
+
"id": 8,
|
|
86
|
+
"prompt": "A user asks: 'Scaffold a new SKILL.md for our deploy procedure.' Should skill-infrastructure own that task?",
|
|
87
|
+
"dimension": "boundary",
|
|
88
|
+
"substance": "contradiction-check",
|
|
89
|
+
"calibration": "semantic",
|
|
90
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
91
|
+
"skill_type": "concept",
|
|
92
|
+
"criticality": "normal",
|
|
93
|
+
"truth_sources": ["skills/skill-infrastructure/SKILL.md"]
|
|
94
|
+
}
|
|
95
|
+
]
|
|
96
|
+
}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "skill-router",
|
|
3
|
+
"subject": "Skill-dispatch mechanics for multi-skill agent runtimes: trigger-label matching, file-path matching, keyword matching, scope/type tiebreakers, and coverage-gap detection",
|
|
4
|
+
"adjacent_concepts": ["documentation", "graph-audit"],
|
|
5
|
+
"grounding_note": "Truth_sources cite skills/skill-router/SKILL.md line ranges. Drift is caught by scripts/skill-lint.js's `checkEvalTruthSourceRanges` check (D2) — any edit that moves a cited range out of file bounds fails lint before commit. Section-anchor citations (`## Routing Rules`) were considered but rejected: the Skill Graph schema does not define a stable anchor format across Markdown renderers, and line ranges plus the lint check give the grader an exact substring to ground against without renderer ambiguity.",
|
|
6
|
+
"evals": [
|
|
7
|
+
{
|
|
8
|
+
"id": 1,
|
|
9
|
+
"prompt": "An inbound agent request arrives with the trigger label `a11y-skill` AND a keyword overlap with three other skills. According to the skill-router's Routing Rules priority order, which surface decides the winner and why does the chain stop there?",
|
|
10
|
+
"dimension": "definition",
|
|
11
|
+
"substance": "domain",
|
|
12
|
+
"calibration": "semantic",
|
|
13
|
+
"truth_mode": "code_verification",
|
|
14
|
+
"skill_type": "concept",
|
|
15
|
+
"criticality": "high",
|
|
16
|
+
"truth_sources": ["skills/skill-router/SKILL.md:82-90"]
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"id": 2,
|
|
20
|
+
"prompt": "The skill-router explicitly refuses to fall back to a default skill when no match is found. Why is surfacing a coverage gap treated as better than silent fallback — what authoring signal does the explicit gap carry that a silent default would destroy?",
|
|
21
|
+
"dimension": "mental_model",
|
|
22
|
+
"substance": "domain",
|
|
23
|
+
"calibration": "semantic",
|
|
24
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
25
|
+
"skill_type": "concept",
|
|
26
|
+
"criticality": "high",
|
|
27
|
+
"truth_sources": ["skills/skill-router/SKILL.md:102-104"]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": 3,
|
|
31
|
+
"prompt": "A query asks \"audit whether this SKILL.md declares the correct relations.\" Should the skill-router take the query or route it to another skill? Cite the negative-routing rule and name the target skill.",
|
|
32
|
+
"dimension": "boundary",
|
|
33
|
+
"substance": "contradiction-check",
|
|
34
|
+
"calibration": "semantic",
|
|
35
|
+
"truth_mode": "code_verification",
|
|
36
|
+
"skill_type": "concept",
|
|
37
|
+
"criticality": "high",
|
|
38
|
+
"truth_sources": ["skills/skill-router/SKILL.md:110-117"]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"id": 4,
|
|
42
|
+
"prompt": "Two skills tie on keyword score: one is `scope: codebase` (repo-specific), the other is `scope: portable`. According to the skill-router's Scope tiebreaker, which wins and why does repo-specificity outrank portability when the match is otherwise equal?",
|
|
43
|
+
"dimension": "application",
|
|
44
|
+
"substance": "domain",
|
|
45
|
+
"calibration": "process",
|
|
46
|
+
"truth_mode": "process_correctness",
|
|
47
|
+
"skill_type": "workflow",
|
|
48
|
+
"criticality": "high",
|
|
49
|
+
"truth_sources": ["skills/skill-router/SKILL.md:92-96"]
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"id": 5,
|
|
53
|
+
"prompt": "The skill-router notes that the v1 enum values `operational` and `generic` were renamed to `codebase` and `portable` in schema_version 2, and that the current schema rejects v1 names as hard errors. Why does the router specifically flag this rename rather than silently accepting both sets of names?",
|
|
54
|
+
"dimension": "purpose",
|
|
55
|
+
"substance": "domain",
|
|
56
|
+
"calibration": "semantic",
|
|
57
|
+
"truth_mode": "code_verification",
|
|
58
|
+
"skill_type": "concept",
|
|
59
|
+
"criticality": "normal",
|
|
60
|
+
"truth_sources": ["skills/skill-router/SKILL.md:92-96"]
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"id": 6,
|
|
64
|
+
"prompt": "An author proposes routing ambiguous queries to the nearest-match skill \"because surfacing a coverage gap blocks the user.\" According to the skill-router, is this acceptable? Cite the fallback-behavior rule and the contradiction with the router's stated purpose.",
|
|
65
|
+
"dimension": "application",
|
|
66
|
+
"substance": "contradiction-check",
|
|
67
|
+
"calibration": "semantic",
|
|
68
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
69
|
+
"skill_type": "concept",
|
|
70
|
+
"criticality": "critical",
|
|
71
|
+
"truth_sources": ["skills/skill-router/SKILL.md:102-104"]
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"id": 7,
|
|
75
|
+
"prompt": "A contributor is about to create a brand-new skill from scratch and asks the skill-router to guide the authoring. Should the router take the task? Cite the Do-NOT-Use row and name what the router directs the contributor to instead.",
|
|
76
|
+
"dimension": "boundary",
|
|
77
|
+
"substance": "contradiction-check",
|
|
78
|
+
"calibration": "semantic",
|
|
79
|
+
"truth_mode": "code_verification",
|
|
80
|
+
"skill_type": "concept",
|
|
81
|
+
"criticality": "high",
|
|
82
|
+
"truth_sources": ["skills/skill-router/SKILL.md:110-117"]
|
|
83
|
+
},
|
|
84
|
+
{
|
|
85
|
+
"id": 8,
|
|
86
|
+
"prompt": "Two candidate skills tie on keyword score AND tie on scope (both `portable`). One is `type: workflow` (a procedural debugging skill), the other is `type: capability` (a reference skill on the same topic). According to the Type tiebreaker, which wins and why does the router prefer the procedural skill when the query is otherwise ambiguous?",
|
|
87
|
+
"dimension": "application",
|
|
88
|
+
"substance": "domain",
|
|
89
|
+
"calibration": "process",
|
|
90
|
+
"truth_mode": "process_correctness",
|
|
91
|
+
"skill_type": "workflow",
|
|
92
|
+
"criticality": "high",
|
|
93
|
+
"truth_sources": ["skills/skill-router/SKILL.md:98-100"]
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"id": 9,
|
|
97
|
+
"prompt": "An agent edits `src/components/Modal.tsx`. The a11y skill declares `paths: [\"**/*.{html,tsx,jsx,vue,svelte}\", \"!**/*.test.{ts,tsx,js,jsx}\"]`. No other skill matches this path. No trigger label is declared by the caller. According to the skill-router's Routing Rules priority order (surface 2 — file path), which skill wins and why does path-match-with-negation-exclusion produce a unique winner here before keyword matching is consulted?",
|
|
98
|
+
"dimension": "application",
|
|
99
|
+
"substance": "domain",
|
|
100
|
+
"calibration": "process",
|
|
101
|
+
"truth_mode": "process_correctness",
|
|
102
|
+
"skill_type": "workflow",
|
|
103
|
+
"criticality": "high",
|
|
104
|
+
"truth_sources": ["skills/skill-router/SKILL.md:82-90"]
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
"id": 10,
|
|
108
|
+
"prompt": "An agent sends the query \"why is this modal keyboard-trapped?\" and declares the trigger label `a11y-skill`. No other skill in the library declares that trigger. According to the skill-router's Routing Rules (surface 1 — trigger label), describe the routing path: which surface fires, why the chain stops there, and what the router returns without consulting paths or keywords. This is the baseline positive case against which the tiebreaker and fallback evals are edge cases.",
|
|
109
|
+
"dimension": "definition",
|
|
110
|
+
"substance": "domain",
|
|
111
|
+
"calibration": "process",
|
|
112
|
+
"truth_mode": "process_correctness",
|
|
113
|
+
"skill_type": "workflow",
|
|
114
|
+
"criticality": "normal",
|
|
115
|
+
"truth_sources": ["skills/skill-router/SKILL.md:82-90"]
|
|
116
|
+
},
|
|
117
|
+
{
|
|
118
|
+
"id": 11,
|
|
119
|
+
"prompt": "A query arrives with trigger label `debugging-skill` (matches: debugging), touches `src/components/Modal.tsx` (path-matches: a11y), and uses the phrase \"decide what to test\" (keyword-matches: testing-strategy). THREE different skills win on THREE different surfaces. According to the Routing Rules priority ordering, which skill does the router actually return and why does the chain not consult surfaces 2 and 3 even though they have distinct winners? What failure mode would a naive multi-surface-vote implementation produce here?",
|
|
120
|
+
"dimension": "application",
|
|
121
|
+
"substance": "contradiction-check",
|
|
122
|
+
"calibration": "process",
|
|
123
|
+
"truth_mode": "process_correctness",
|
|
124
|
+
"skill_type": "workflow",
|
|
125
|
+
"criticality": "critical",
|
|
126
|
+
"truth_sources": ["skills/skill-router/SKILL.md:82-90", "skills/skill-router/SKILL.md:71-80"]
|
|
127
|
+
},
|
|
128
|
+
{
|
|
129
|
+
"id": 12,
|
|
130
|
+
"prompt": "An agent sends the query \"translate this error message into German.\" No skill declares a trigger for translation, no skill's paths match (it is a runtime operation, not a file edit), and no skill's keywords contain translation-related terms. According to the Fallback behavior section, describe the EXACT response the router returns to the caller — not \"it refuses,\" but the concrete shape of the coverage-gap signal, what it tells the caller to do, and what it explicitly does NOT do (e.g., does it pick nearest-match? fall back to a default? return null silently?). The caller needs to distinguish \"no skill matches\" from \"skill matched but produced no answer.\"",
|
|
131
|
+
"dimension": "application",
|
|
132
|
+
"substance": "domain",
|
|
133
|
+
"calibration": "process",
|
|
134
|
+
"truth_mode": "process_correctness",
|
|
135
|
+
"skill_type": "workflow",
|
|
136
|
+
"criticality": "high",
|
|
137
|
+
"truth_sources": ["skills/skill-router/SKILL.md:102-104", "skills/skill-router/SKILL.md:71-80"]
|
|
138
|
+
}
|
|
139
|
+
]
|
|
140
|
+
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "skill-router",
|
|
3
|
+
"kind": "routing_eval",
|
|
4
|
+
"subject": "Dispatch assertions: given (query, candidate-skills) tuples, verify the router selects the expected winner skill by applying the Routing Rules priority chain, Scope tiebreaker, Type tiebreaker, and Fallback behavior.",
|
|
5
|
+
"schema_note": "This is a v1 routing_eval artifact. Each case is a (query, candidates, expected_winner, reason) tuple. A consuming grader SHOULD implement the router's priority chain and assert that the winner matches expected_winner for each case. The format is deliberate: comprehension evals (examples/evals/skill-router.json) ask the model to explain what the router would do; routing evals (this file) ask a router implementation to actually do it. A skill declaring `routing_eval: present` ships one of these.",
|
|
6
|
+
"routing_cases": [
|
|
7
|
+
{
|
|
8
|
+
"id": "trigger-only-winner",
|
|
9
|
+
"description": "Surface 1 unique match — the trigger label resolves the dispatch without consulting surfaces 2 or 3.",
|
|
10
|
+
"query": {
|
|
11
|
+
"triggers": ["a11y-skill"],
|
|
12
|
+
"paths_touched": [],
|
|
13
|
+
"keywords": ["modal", "keyboard-trapped"]
|
|
14
|
+
},
|
|
15
|
+
"candidates": ["a11y", "debugging", "testing-strategy"],
|
|
16
|
+
"expected_winner": "a11y",
|
|
17
|
+
"expected_surface": "triggers",
|
|
18
|
+
"reason": "a11y is the only skill whose triggers array contains 'a11y-skill'. Priority 1 produces a unique winner — the chain stops."
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "path-only-winner",
|
|
22
|
+
"description": "Surface 2 unique match — no trigger is declared, path glob resolves the dispatch.",
|
|
23
|
+
"query": {
|
|
24
|
+
"triggers": [],
|
|
25
|
+
"paths_touched": ["src/components/Modal.tsx"],
|
|
26
|
+
"keywords": []
|
|
27
|
+
},
|
|
28
|
+
"candidates": ["a11y", "graph-audit", "refactor"],
|
|
29
|
+
"expected_winner": "a11y",
|
|
30
|
+
"expected_surface": "paths",
|
|
31
|
+
"reason": "a11y declares `paths: **/*.{html,tsx,jsx,vue,svelte}` which matches Modal.tsx. No other candidate's paths match. Priority 1 had no match, Priority 2 produces a unique winner."
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"id": "keyword-only-winner",
|
|
35
|
+
"description": "Surface 3 unique match — keyword overlap with exactly one skill's keywords array.",
|
|
36
|
+
"query": {
|
|
37
|
+
"triggers": [],
|
|
38
|
+
"paths_touched": [],
|
|
39
|
+
"keywords": ["bisection", "reproduce", "symptom"]
|
|
40
|
+
},
|
|
41
|
+
"candidates": ["a11y", "debugging", "documentation"],
|
|
42
|
+
"expected_winner": "debugging",
|
|
43
|
+
"expected_surface": "keywords",
|
|
44
|
+
"reason": "debugging's keywords include 'reproduce' and closely related concepts; a11y and documentation do not share these keywords. Priority 3 produces a unique winner after priorities 1 and 2 had no match."
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"id": "multi-surface-conflict-chain-stops-at-surface-1",
|
|
48
|
+
"description": "Three surfaces produce THREE different winners. The priority chain must stop at surface 1 and NOT consult surfaces 2 or 3. Mirrors comprehension eval #11.",
|
|
49
|
+
"query": {
|
|
50
|
+
"triggers": ["debugging-skill"],
|
|
51
|
+
"paths_touched": ["src/components/Modal.tsx"],
|
|
52
|
+
"keywords": ["decide what to test"]
|
|
53
|
+
},
|
|
54
|
+
"candidates": ["debugging", "a11y", "testing-strategy"],
|
|
55
|
+
"expected_winner": "debugging",
|
|
56
|
+
"expected_surface": "triggers",
|
|
57
|
+
"reason": "debugging-skill is a unique trigger match for debugging. Even though Modal.tsx would path-match a11y AND 'decide what to test' would keyword-match testing-strategy, the chain stops at Priority 1. A router that aggregated votes across surfaces here would be wrong."
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
"id": "scope-tiebreaker-codebase-over-portable",
|
|
61
|
+
"description": "Two skills tie on keyword score. Scope tiebreaker breaks the tie in favor of `codebase` > `portable`. Mirrors comprehension eval #4.",
|
|
62
|
+
"query": {
|
|
63
|
+
"triggers": [],
|
|
64
|
+
"paths_touched": [],
|
|
65
|
+
"keywords": ["audit", "metadata", "schema"]
|
|
66
|
+
},
|
|
67
|
+
"candidates": ["graph-audit", "documentation"],
|
|
68
|
+
"candidate_metadata": {
|
|
69
|
+
"graph-audit": { "scope": "codebase", "type": "capability" },
|
|
70
|
+
"documentation": { "scope": "portable", "type": "capability" }
|
|
71
|
+
},
|
|
72
|
+
"expected_winner": "graph-audit",
|
|
73
|
+
"expected_surface": "keywords",
|
|
74
|
+
"expected_tiebreaker": "scope",
|
|
75
|
+
"reason": "Both skills tie on keyword score, so the keyword surface alone cannot pick a winner. graph-audit is `scope: codebase`; documentation is `scope: portable`. Scope tiebreaker: `codebase` > `reference` > `portable`. graph-audit wins."
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
"id": "type-tiebreaker-workflow-over-capability",
|
|
79
|
+
"description": "Two skills tie on keyword AND scope. Type tiebreaker breaks the tie: `workflow` > `capability` > `router` > `overlay`. Mirrors comprehension eval #8.",
|
|
80
|
+
"query": {
|
|
81
|
+
"triggers": [],
|
|
82
|
+
"paths_touched": [],
|
|
83
|
+
"keywords": ["bug", "fix", "verify"]
|
|
84
|
+
},
|
|
85
|
+
"candidates": ["debugging", "a11y"],
|
|
86
|
+
"candidate_metadata": {
|
|
87
|
+
"debugging": { "scope": "portable", "type": "workflow" },
|
|
88
|
+
"a11y": { "scope": "portable", "type": "capability" }
|
|
89
|
+
},
|
|
90
|
+
"expected_winner": "debugging",
|
|
91
|
+
"expected_surface": "keywords",
|
|
92
|
+
"expected_tiebreaker": "type",
|
|
93
|
+
"reason": "Both skills are `scope: portable`. debugging is `type: workflow`; a11y is `type: capability`. Type tiebreaker: `workflow` > `capability` > `router` > `overlay`. debugging wins because procedural decision logic beats a pure reference when the query is otherwise ambiguous."
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"id": "no-match-coverage-gap",
|
|
97
|
+
"description": "Zero surfaces match. The router returns an explicit coverage-gap signal — NOT a nearest-match fallback and NOT a silent null. Mirrors comprehension eval #12.",
|
|
98
|
+
"query": {
|
|
99
|
+
"triggers": [],
|
|
100
|
+
"paths_touched": [],
|
|
101
|
+
"keywords": ["translate", "german", "error message"]
|
|
102
|
+
},
|
|
103
|
+
"candidates": ["a11y", "debugging", "documentation", "refactor", "testing-strategy", "graph-audit", "skill-router", "lint-overlay"],
|
|
104
|
+
"expected_winner": null,
|
|
105
|
+
"expected_surface": "none",
|
|
106
|
+
"expected_signal": {
|
|
107
|
+
"kind": "coverage_gap",
|
|
108
|
+
"message_contract": "MUST surface an explicit coverage-gap signal to the caller indicating no skill matched. MUST recommend authoring a new skill or broadening an existing skill's keywords. MUST NOT fall back to a nearest-match skill. MUST NOT return null silently without the coverage-gap annotation."
|
|
109
|
+
},
|
|
110
|
+
"reason": "No skill in the library has triggers, paths, or keywords that match a translation-into-German query. The Fallback behavior doctrine is non-negotiable: surface the gap, do not guess. A router that returned 'documentation' here because German is prose-ish would be wrong."
|
|
111
|
+
}
|
|
112
|
+
]
|
|
113
|
+
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "system-interface-contracts",
|
|
3
|
+
"subject": "System boundary contracts across modules, services, jobs, agents, APIs, events, dashboards, invariants, ownership, failure modes, and verification",
|
|
4
|
+
"adjacent_concepts": ["api-design", "event-contract-design", "data-modeling", "observability-modeling"],
|
|
5
|
+
"grounding_note": "Truth sources cite the whole SKILL.md file to keep the initial eval surface stable while routing boundaries are tightened.",
|
|
6
|
+
"evals": [
|
|
7
|
+
{
|
|
8
|
+
"id": 1,
|
|
9
|
+
"prompt": "An ingestion job, dashboard, and background worker need a shared contract for ownership, invariants, failure modes, and verification. Which skill owns the broad boundary?",
|
|
10
|
+
"dimension": "application",
|
|
11
|
+
"substance": "domain",
|
|
12
|
+
"calibration": "process",
|
|
13
|
+
"truth_mode": "process_correctness",
|
|
14
|
+
"skill_type": "concept",
|
|
15
|
+
"criticality": "high",
|
|
16
|
+
"truth_sources": ["skills/system-interface-contracts/SKILL.md"]
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"id": 2,
|
|
20
|
+
"prompt": "Two systems exchange data through several surfaces and the team needs to name shared invariants before choosing API or event details. What should system-interface-contracts produce?",
|
|
21
|
+
"dimension": "application",
|
|
22
|
+
"substance": "domain",
|
|
23
|
+
"calibration": "semantic",
|
|
24
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
25
|
+
"skill_type": "concept",
|
|
26
|
+
"criticality": "high",
|
|
27
|
+
"truth_sources": ["skills/system-interface-contracts/SKILL.md"]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": 3,
|
|
31
|
+
"prompt": "The task is only REST route naming, request and response schemas, pagination, and status codes. Should system-interface-contracts accept?",
|
|
32
|
+
"dimension": "boundary",
|
|
33
|
+
"substance": "contradiction-check",
|
|
34
|
+
"calibration": "semantic",
|
|
35
|
+
"truth_mode": "code_verification",
|
|
36
|
+
"skill_type": "concept",
|
|
37
|
+
"criticality": "normal",
|
|
38
|
+
"truth_sources": ["skills/system-interface-contracts/SKILL.md"]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"id": 4,
|
|
42
|
+
"prompt": "The task is asynchronous topic names, event envelopes, replay, and compatibility. Which boundary should system-interface-contracts respect?",
|
|
43
|
+
"dimension": "boundary",
|
|
44
|
+
"substance": "contradiction-check",
|
|
45
|
+
"calibration": "semantic",
|
|
46
|
+
"truth_mode": "code_verification",
|
|
47
|
+
"skill_type": "concept",
|
|
48
|
+
"criticality": "normal",
|
|
49
|
+
"truth_sources": ["skills/system-interface-contracts/SKILL.md"]
|
|
50
|
+
}
|
|
51
|
+
]
|
|
52
|
+
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "task-analysis",
|
|
3
|
+
"subject": "Goal-driven UX route and flow analysis for actor, scenario, top task, subtasks, friction, breakpoints, and first-viewport hierarchy contracts",
|
|
4
|
+
"adjacent_concepts": ["information-architecture", "layout-composition", "interaction-patterns", "a11y"],
|
|
5
|
+
"grounding_note": "Truth sources cite the whole SKILL.md file to keep the initial eval surface stable while routing boundaries are tightened.",
|
|
6
|
+
"evals": [
|
|
7
|
+
{
|
|
8
|
+
"id": 1,
|
|
9
|
+
"prompt": "An onboarding flow feels confusing and users abandon step 3. Which skill owns identifying the actor, top task, subtasks, friction, and breakpoint?",
|
|
10
|
+
"dimension": "application",
|
|
11
|
+
"substance": "domain",
|
|
12
|
+
"calibration": "process",
|
|
13
|
+
"truth_mode": "process_correctness",
|
|
14
|
+
"skill_type": "workflow",
|
|
15
|
+
"criticality": "high",
|
|
16
|
+
"truth_sources": ["skills/task-analysis/SKILL.md"]
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"id": 2,
|
|
20
|
+
"prompt": "A route needs a first-viewport contract that separates primary, secondary, and supporting content before layout begins. What should task-analysis produce?",
|
|
21
|
+
"dimension": "application",
|
|
22
|
+
"substance": "domain",
|
|
23
|
+
"calibration": "semantic",
|
|
24
|
+
"truth_mode": "conceptual_correctness_plus_repo_application",
|
|
25
|
+
"skill_type": "workflow",
|
|
26
|
+
"criticality": "high",
|
|
27
|
+
"truth_sources": ["skills/task-analysis/SKILL.md"]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": 3,
|
|
31
|
+
"prompt": "The task is to pick tabs, accordions, dropdowns, or a stepper after the user goal is already known. Should task-analysis continue?",
|
|
32
|
+
"dimension": "boundary",
|
|
33
|
+
"substance": "contradiction-check",
|
|
34
|
+
"calibration": "semantic",
|
|
35
|
+
"truth_mode": "code_verification",
|
|
36
|
+
"skill_type": "concept",
|
|
37
|
+
"criticality": "normal",
|
|
38
|
+
"truth_sources": ["skills/task-analysis/SKILL.md"]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"id": 4,
|
|
42
|
+
"prompt": "The task is a focused WCAG 2.2 audit for keyboard and screen-reader compliance. Which boundary should task-analysis respect?",
|
|
43
|
+
"dimension": "boundary",
|
|
44
|
+
"substance": "contradiction-check",
|
|
45
|
+
"calibration": "semantic",
|
|
46
|
+
"truth_mode": "code_verification",
|
|
47
|
+
"skill_type": "concept",
|
|
48
|
+
"criticality": "normal",
|
|
49
|
+
"truth_sources": ["skills/task-analysis/SKILL.md"]
|
|
50
|
+
}
|
|
51
|
+
]
|
|
52
|
+
}
|