@patricio0312rev/skillset 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/CHANGELOG.md +29 -0
  2. package/LICENSE +21 -0
  3. package/README.md +176 -0
  4. package/bin/cli.js +37 -0
  5. package/package.json +55 -0
  6. package/src/commands/init.js +301 -0
  7. package/src/index.js +168 -0
  8. package/src/lib/config.js +200 -0
  9. package/src/lib/generator.js +166 -0
  10. package/src/utils/display.js +95 -0
  11. package/src/utils/readme.js +196 -0
  12. package/src/utils/tool-specific.js +233 -0
  13. package/templates/ai-engineering/agent-orchestration-planner/ SKILL.md +266 -0
  14. package/templates/ai-engineering/cost-latency-optimizer/ SKILL.md +270 -0
  15. package/templates/ai-engineering/doc-to-vector-dataset-generator/ SKILL.md +239 -0
  16. package/templates/ai-engineering/evaluation-harness/ SKILL.md +219 -0
  17. package/templates/ai-engineering/guardrails-safety-filter-builder/ SKILL.md +226 -0
  18. package/templates/ai-engineering/llm-debugger/ SKILL.md +283 -0
  19. package/templates/ai-engineering/prompt-regression-tester/ SKILL.md +216 -0
  20. package/templates/ai-engineering/prompt-template-builder/ SKILL.md +393 -0
  21. package/templates/ai-engineering/rag-pipeline-builder/ SKILL.md +244 -0
  22. package/templates/ai-engineering/tool-function-schema-designer/ SKILL.md +219 -0
  23. package/templates/architecture/adr-writer/ SKILL.md +250 -0
  24. package/templates/architecture/api-versioning-deprecation-planner/ SKILL.md +331 -0
  25. package/templates/architecture/domain-model-boundaries-mapper/ SKILL.md +300 -0
  26. package/templates/architecture/migration-planner/ SKILL.md +376 -0
  27. package/templates/architecture/performance-budget-setter/ SKILL.md +318 -0
  28. package/templates/architecture/reliability-strategy-builder/ SKILL.md +286 -0
  29. package/templates/architecture/rfc-generator/ SKILL.md +362 -0
  30. package/templates/architecture/scalability-playbook/ SKILL.md +279 -0
  31. package/templates/architecture/system-design-generator/ SKILL.md +339 -0
  32. package/templates/architecture/tech-debt-prioritizer/ SKILL.md +329 -0
  33. package/templates/backend/api-contract-normalizer/ SKILL.md +487 -0
  34. package/templates/backend/api-endpoint-generator/ SKILL.md +415 -0
  35. package/templates/backend/auth-module-builder/ SKILL.md +99 -0
  36. package/templates/backend/background-jobs-designer/ SKILL.md +166 -0
  37. package/templates/backend/caching-strategist/ SKILL.md +190 -0
  38. package/templates/backend/error-handling-standardizer/ SKILL.md +174 -0
  39. package/templates/backend/rate-limiting-abuse-protection/ SKILL.md +147 -0
  40. package/templates/backend/rbac-permissions-builder/ SKILL.md +158 -0
  41. package/templates/backend/service-layer-extractor/ SKILL.md +269 -0
  42. package/templates/backend/webhook-receiver-hardener/ SKILL.md +211 -0
  43. package/templates/ci-cd/artifact-sbom-publisher/ SKILL.md +236 -0
  44. package/templates/ci-cd/caching-strategy-optimizer/ SKILL.md +195 -0
  45. package/templates/ci-cd/deployment-checklist-generator/ SKILL.md +381 -0
  46. package/templates/ci-cd/github-actions-pipeline-creator/ SKILL.md +348 -0
  47. package/templates/ci-cd/monorepo-ci-optimizer/ SKILL.md +298 -0
  48. package/templates/ci-cd/preview-environments-builder/ SKILL.md +187 -0
  49. package/templates/ci-cd/quality-gates-enforcer/ SKILL.md +342 -0
  50. package/templates/ci-cd/release-automation-builder/ SKILL.md +281 -0
  51. package/templates/ci-cd/rollback-workflow-builder/ SKILL.md +372 -0
  52. package/templates/ci-cd/secrets-env-manager/ SKILL.md +242 -0
  53. package/templates/db-management/backup-restore-runbook-generator/ SKILL.md +505 -0
  54. package/templates/db-management/data-integrity-auditor/ SKILL.md +505 -0
  55. package/templates/db-management/data-retention-archiving-planner/ SKILL.md +430 -0
  56. package/templates/db-management/data-seeding-fixtures-builder/ SKILL.md +375 -0
  57. package/templates/db-management/db-performance-watchlist/ SKILL.md +425 -0
  58. package/templates/db-management/etl-sync-job-builder/ SKILL.md +457 -0
  59. package/templates/db-management/multi-tenant-safety-checker/ SKILL.md +398 -0
  60. package/templates/db-management/prisma-migration-assistant/ SKILL.md +379 -0
  61. package/templates/db-management/schema-consistency-checker/ SKILL.md +440 -0
  62. package/templates/db-management/sql-query-optimizer/ SKILL.md +324 -0
  63. package/templates/foundation/changelog-writer/ SKILL.md +431 -0
  64. package/templates/foundation/code-formatter-installer/ SKILL.md +320 -0
  65. package/templates/foundation/codebase-summarizer/ SKILL.md +360 -0
  66. package/templates/foundation/dependency-doctor/ SKILL.md +163 -0
  67. package/templates/foundation/dev-environment-bootstrapper/ SKILL.md +259 -0
  68. package/templates/foundation/dev-onboarding-builder/ SKILL.md +556 -0
  69. package/templates/foundation/docs-starter-kit/ SKILL.md +574 -0
  70. package/templates/foundation/explaining-code/SKILL.md +13 -0
  71. package/templates/foundation/git-hygiene-enforcer/ SKILL.md +455 -0
  72. package/templates/foundation/project-scaffolder/ SKILL.md +65 -0
  73. package/templates/foundation/project-scaffolder/references/templates.md +126 -0
  74. package/templates/foundation/repo-structure-linter/ SKILL.md +0 -0
  75. package/templates/foundation/repo-structure-linter/references/conventions.md +98 -0
  76. package/templates/frontend/animation-micro-interaction-pack/ SKILL.md +41 -0
  77. package/templates/frontend/component-scaffold-generator/ SKILL.md +562 -0
  78. package/templates/frontend/design-to-component-translator/ SKILL.md +547 -0
  79. package/templates/frontend/form-wizard-builder/ SKILL.md +553 -0
  80. package/templates/frontend/frontend-refactor-planner/ SKILL.md +37 -0
  81. package/templates/frontend/i18n-frontend-implementer/ SKILL.md +44 -0
  82. package/templates/frontend/modal-drawer-system/ SKILL.md +377 -0
  83. package/templates/frontend/page-layout-builder/ SKILL.md +630 -0
  84. package/templates/frontend/state-ux-flow-builder/ SKILL.md +23 -0
  85. package/templates/frontend/table-builder/ SKILL.md +350 -0
  86. package/templates/performance/alerting-dashboard-builder/ SKILL.md +162 -0
  87. package/templates/performance/backend-latency-profiler-helper/ SKILL.md +108 -0
  88. package/templates/performance/caching-cdn-strategy-planner/ SKILL.md +150 -0
  89. package/templates/performance/capacity-planning-helper/ SKILL.md +242 -0
  90. package/templates/performance/core-web-vitals-tuner/ SKILL.md +126 -0
  91. package/templates/performance/incident-runbook-generator/ SKILL.md +162 -0
  92. package/templates/performance/load-test-scenario-builder/ SKILL.md +256 -0
  93. package/templates/performance/observability-setup/ SKILL.md +232 -0
  94. package/templates/performance/postmortem-writer/ SKILL.md +203 -0
  95. package/templates/performance/structured-logging-standardizer/ SKILL.md +122 -0
  96. package/templates/security/auth-security-reviewer/ SKILL.md +428 -0
  97. package/templates/security/dependency-vulnerability-triage/ SKILL.md +495 -0
  98. package/templates/security/input-validation-sanitization-auditor/ SKILL.md +76 -0
  99. package/templates/security/pii-redaction-logging-policy-builder/ SKILL.md +65 -0
  100. package/templates/security/rbac-policy-tester/ SKILL.md +80 -0
  101. package/templates/security/secrets-scanner/ SKILL.md +462 -0
  102. package/templates/security/secure-headers-csp-builder/ SKILL.md +404 -0
  103. package/templates/security/security-incident-playbook-generator/ SKILL.md +76 -0
  104. package/templates/security/security-pr-checklist-skill/ SKILL.md +62 -0
  105. package/templates/security/threat-model-generator/ SKILL.md +394 -0
  106. package/templates/testing/contract-testing-builder/ SKILL.md +492 -0
  107. package/templates/testing/coverage-strategist/ SKILL.md +436 -0
  108. package/templates/testing/e2e-test-builder/ SKILL.md +382 -0
  109. package/templates/testing/flaky-test-detective/ SKILL.md +416 -0
  110. package/templates/testing/integration-test-builder/ SKILL.md +525 -0
  111. package/templates/testing/mocking-assistant/ SKILL.md +383 -0
  112. package/templates/testing/snapshot-test-refactorer/ SKILL.md +375 -0
  113. package/templates/testing/test-data-factory-builder/ SKILL.md +449 -0
  114. package/templates/testing/test-reporting-triage-skill/ SKILL.md +469 -0
  115. package/templates/testing/unit-test-generator/ SKILL.md +548 -0
@@ -0,0 +1,219 @@
1
+ ---
2
+ name: evaluation-harness
3
+ description: Builds repeatable evaluation systems with golden datasets, scoring rubrics, pass/fail thresholds, and regression reports. Use for "LLM evaluation", "testing AI systems", "quality assurance", or "model benchmarking".
4
+ ---
5
+
6
+ # Evaluation Harness
7
+
8
+ Build systematic evaluation frameworks for LLM applications.
9
+
10
+ ## Golden Dataset Format
11
+
12
+ ```json
13
+ [
14
+ {
15
+ "id": "test_001",
16
+ "category": "code_generation",
17
+ "input": "Write a Python function to reverse a string",
18
+ "expected_output": "def reverse_string(s: str) -> str:\n return s[::-1]",
19
+ "rubric": {
20
+ "correctness": 1.0,
21
+ "style": 0.8,
22
+ "documentation": 0.5
23
+ },
24
+ "metadata": {
25
+ "difficulty": "easy",
26
+ "tags": ["python", "strings"]
27
+ }
28
+ }
29
+ ]
30
+ ```
31
+
32
+ ## Scoring Rubrics
33
+
34
+ ```python
35
+ from typing import Any, Dict, List
36
+
37
def score_exact_match(actual: str, expected: str) -> float:
    """Score an output by exact string equality.

    Leading and trailing whitespace is stripped from both strings before
    they are compared; the comparison itself is case-sensitive.

    Args:
        actual: Text produced by the model.
        expected: Reference text to compare against.

    Returns:
        1.0 when the stripped strings are equal, otherwise 0.0.
    """
    is_match = actual.strip() == expected.strip()
    if is_match:
        return 1.0
    return 0.0
40
+
41
def score_semantic_similarity(actual: str, expected: str) -> float:
    """Score an output by the similarity of its embedding to the reference.

    Both texts are passed through ``get_embedding`` and the two results are
    handed to ``cosine_similarity``; whatever that call returns is the score.

    NOTE(review): ``get_embedding`` and ``cosine_similarity`` are not defined
    in this snippet - presumably supplied by the embedding provider in use;
    confirm their return types before relying on the ``float`` annotation.

    Args:
        actual: Text produced by the model.
        expected: Reference text to compare against.

    Returns:
        The value returned by ``cosine_similarity`` for the two embeddings.
    """
    return cosine_similarity(get_embedding(actual), get_embedding(expected))
46
+
47
def score_contains_keywords(actual: str, keywords: List[str]) -> float:
    """Return the fraction of required keywords that appear in the output.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it occurs inside a longer word.

    Args:
        actual: Text produced by the model.
        keywords: Keywords that should be present in ``actual``.

    Returns:
        A value between 0.0 and 1.0. An empty ``keywords`` list yields 1.0:
        with nothing required, nothing is missing (and dividing by zero is
        avoided).
    """
    if not keywords:
        return 1.0

    # Lower-case the output once rather than once per keyword.
    haystack = actual.lower()
    found = sum(1 for kw in keywords if kw.lower() in haystack)
    return found / len(keywords)
51
+
52
def score_with_llm(actual: str, expected: str, rubric: Dict[str, float]) -> Dict[str, float]:
    """Ask an LLM judge to grade the output on every rubric criterion.

    The judge's reply is parsed as JSON. Only the criteria named in
    ``rubric`` are kept, and each numeric score is clamped to the 0-1 range
    requested in the prompt, so a sloppy reply cannot inject unrelated keys
    or out-of-range values into the result.

    NOTE(review): ``llm`` is not defined in this snippet - presumably a
    callable that takes a prompt string and returns the completion text;
    confirm against the client in use. The values in ``rubric`` are not sent
    to the judge; only its keys are.

    Args:
        actual: Text produced by the model under test.
        expected: Reference text from the golden dataset.
        rubric: Mapping whose keys are the criteria to grade.

    Returns:
        Mapping of criterion name to a score in [0.0, 1.0]. Criteria the
        judge omitted or answered non-numerically are left out.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON.
        ValueError: If the parsed JSON is not an object.
    """
    import json  # local import: the snippet's header does not import json

    prompt = f"""
    Grade this output on a scale of 0-1 for each criterion:

    Expected: {expected}
    Actual: {actual}

    Criteria: {', '.join(rubric.keys())}

    Return JSON with scores.
    """
    raw = llm(prompt)

    # Judges frequently wrap the JSON in a code fence or add a sentence
    # around it; keep only the outermost {...} span when one is present.
    if isinstance(raw, str):
        start, end = raw.find("{"), raw.rfind("}")
        if start != -1 and end > start:
            raw = raw[start:end + 1]

    parsed = json.loads(raw)
    if not isinstance(parsed, dict):
        raise ValueError("LLM judge must return a JSON object of scores")

    scores = {}
    for criterion in rubric:
        value = parsed.get(criterion)
        # bool is a subclass of int; do not treat true/false as a score.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            scores[criterion] = min(1.0, max(0.0, float(value)))
    return scores
65
+ ```
66
+
67
+ ## Test Runner
68
+
69
+ ```python
70
class EvaluationHarness:
    """Runs a model over a golden dataset and records per-test scores.

    Attributes are plain Python containers:
      * ``dataset`` - the list of test-case dicts loaded from disk.
      * ``results`` - one dict per evaluated test case, appended by
        ``run_evaluation``.
    """

    def __init__(self, dataset_path: str):
        """Load the golden dataset found at ``dataset_path``."""
        self.dataset = self.load_dataset(dataset_path)
        self.results = []

    def load_dataset(self, dataset_path: str) -> list:
        """Read the golden dataset from a JSON file.

        Args:
            dataset_path: Path to a JSON file holding a list of test cases.

        Returns:
            The parsed list of test-case dicts.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            ValueError: If the top-level JSON value is not a list.
        """
        import json  # local import keeps the class self-contained

        with open(dataset_path, encoding="utf-8") as dataset_file:
            dataset = json.load(dataset_file)
        if not isinstance(dataset, list):
            raise ValueError("Golden dataset must be a JSON array of test cases")
        return dataset

    def run_evaluation(self, model_fn):
        """Evaluate ``model_fn`` on every test case and return the report.

        Args:
            model_fn: Callable that receives a test case's ``"input"`` and
                returns the model output for it.

        Returns:
            Whatever ``generate_report`` returns.
        """
        for test_case in self.dataset:
            # Generate output
            actual = model_fn(test_case["input"])

            # Score
            scores = self.score_output(
                actual,
                test_case["expected_output"],
                test_case["rubric"],
            )

            # Record result
            self.results.append({
                "test_id": test_case["id"],
                "category": test_case["category"],
                "scores": scores,
                "passed": self.check_threshold(scores, test_case),
                "actual_output": actual,
            })

        return self.generate_report()

    def score_output(self, actual, expected, rubric):
        """Combine every scoring function into a single score dict.

        The LLM-judge scores are merged last, so a judge key with the same
        name as a built-in metric overrides it.
        """
        return {
            "exact_match": score_exact_match(actual, expected),
            "semantic_similarity": score_semantic_similarity(actual, expected),
            **score_with_llm(actual, expected, rubric),
        }

    def check_threshold(self, scores, test_case):
        """Return True unless a score is below the test case's minimum.

        Minimums are read from the optional ``"min_scores"`` mapping on the
        test case; a metric missing from ``scores`` counts as 0. A test case
        without ``"min_scores"`` always passes.
        """
        min_scores = test_case.get("min_scores", {})
        return not any(
            scores.get(metric, 0) < threshold
            for metric, threshold in min_scores.items()
        )

    def generate_report(self):
        """Return the recorded results as a list of per-test dicts.

        A list (rather than a summary object) is returned because the
        regression comparison consumes results as a sequence of dicts with
        ``"test_id"`` and ``"passed"`` keys. A copy is returned so callers
        cannot mutate the harness's own record by accident.
        """
        return list(self.results)
111
+ ```
112
+
113
+ ## Thresholds & Pass Criteria
114
+
115
+ ```python
116
+ # Define thresholds per category
117
# Minimum acceptable score per metric, keyed by test category.
THRESHOLDS = {
    "code_generation": {
        "correctness": 0.9,
        "style": 0.7,
    },
    "summarization": {
        "semantic_similarity": 0.8,
        "brevity": 0.7,
    },
    "classification": {
        "exact_match": 1.0,
    },
}


def check_test_passed(result: Dict) -> bool:
    """Decide whether a result meets every threshold for its category.

    Args:
        result: Dict with a ``"category"`` key and, when the category has
            thresholds, a ``"scores"`` mapping of metric name to score.

    Returns:
        False as soon as one metric scores below its threshold (a metric
        absent from the scores counts as 0); True otherwise, including for
        categories that have no entry in ``THRESHOLDS``.
    """
    required = THRESHOLDS.get(result["category"], {})
    below_minimum = (
        result["scores"].get(metric, 0) < minimum
        for metric, minimum in required.items()
    )
    return not any(below_minimum)
140
+ ```
141
+
142
+ ## Regression Report
143
+
144
+ ```python
145
def generate_regression_report(baseline_results, current_results):
    """Compare two evaluation runs and list what regressed or improved.

    Results are paired by ``"test_id"`` rather than by position, so the two
    runs may list their tests in a different order and may contain a
    different set of tests without any pair being silently dropped or
    mismatched.

    Args:
        baseline_results: Iterable of result dicts from the reference run.
            Each needs ``"test_id"``, ``"category"``, ``"scores"`` and
            ``"passed"``.
        current_results: Iterable of result dicts from the run under test,
            with the same keys.

    Returns:
        A dict with:
          * ``"regressions"`` - details for tests that passed in the
            baseline and fail now.
          * ``"improvements"`` - ids of tests that failed in the baseline
            and pass now.
          * ``"unchanged"`` - count of tests whose pass/fail status is the
            same in both runs.
          * ``"missing_tests"`` - ids present in the baseline but absent
            from the current run (not counted as regressions).
          * ``"new_tests"`` - ids present only in the current run.
          * ``"summary"`` - the counts of all of the above plus
            ``"total_tests"`` (number of baseline results).
    """
    baseline_results = list(baseline_results)
    current_results = list(current_results)

    current_by_id = {result["test_id"]: result for result in current_results}
    baseline_ids = {result["test_id"] for result in baseline_results}

    report = {
        "summary": {},
        "regressions": [],
        "improvements": [],
        "unchanged": 0,
        "missing_tests": [],
        "new_tests": [
            result["test_id"]
            for result in current_results
            if result["test_id"] not in baseline_ids
        ],
    }

    for baseline in baseline_results:
        test_id = baseline["test_id"]
        current = current_by_id.get(test_id)
        if current is None:
            # No counterpart to compare against; surface it instead of
            # guessing whether it would have passed.
            report["missing_tests"].append(test_id)
            continue

        baseline_passed = baseline["passed"]
        current_passed = current["passed"]

        if baseline_passed and not current_passed:
            report["regressions"].append({
                "test_id": test_id,
                "category": baseline["category"],
                "baseline_scores": baseline["scores"],
                "current_scores": current["scores"],
            })
        elif not baseline_passed and current_passed:
            report["improvements"].append(test_id)
        else:
            report["unchanged"] += 1

    report["summary"] = {
        "total_tests": len(baseline_results),
        "regressions": len(report["regressions"]),
        "improvements": len(report["improvements"]),
        "unchanged": report["unchanged"],
        "missing_tests": len(report["missing_tests"]),
        "new_tests": len(report["new_tests"]),
    }

    return report
179
+ ```
180
+
181
+ ## Continuous Evaluation
182
+
183
+ ```python
184
+ # Run evaluation on every commit
185
def ci_evaluation():
    """Evaluate the production model and fail the build on any regression.

    Runs the harness on ``golden_dataset.json``, compares the outcome with
    the baseline loaded from ``baseline_results.json`` and exits the process
    with status 1 when the report counts at least one regression.

    NOTE(review): ``production_model``, ``load_baseline`` and ``sys`` are not
    defined or imported in this snippet - confirm they are in scope where
    this function is used.
    """
    current_results = EvaluationHarness("golden_dataset.json").run_evaluation(
        production_model
    )

    # Compare against the stored reference run.
    report = generate_regression_report(
        load_baseline("baseline_results.json"),
        current_results,
    )

    has_regressions = report["summary"]["regressions"] > 0
    if not has_regressions:
        print("✅ All tests passed!")
        return

    # A non-zero exit status is what makes the CI job fail.
    print(f"❌ {report['summary']['regressions']} regressions detected!")
    sys.exit(1)
199
+ ```
200
+
201
+ ## Best Practices
202
+
203
+ 1. **Representative dataset**: Cover edge cases
204
+ 2. **Multiple metrics**: Don't rely on one score
205
+ 3. **Human validation**: Review LLM judge scores
206
+ 4. **Version datasets**: Track changes over time
207
+ 5. **Automate in CI**: Catch regressions early
208
+ 6. **Regular updates**: Add new test cases
209
+
210
+ ## Output Checklist
211
+
212
+ - [ ] Golden dataset created (50+ examples)
213
+ - [ ] Multiple scoring functions
214
+ - [ ] Pass/fail thresholds defined
215
+ - [ ] Test runner implemented
216
+ - [ ] Regression comparison
217
+ - [ ] Report generation
218
+ - [ ] CI integration
219
+ - [ ] Baseline established
@@ -0,0 +1,226 @@
1
+ ---
2
+ name: guardrails-safety-filter-builder
3
+ description: Implements content safety filters with PII redaction, policy constraints, prompt injection detection, and safe refusal templates. Use when adding "content moderation", "safety filters", "PII protection", or "guardrails".
4
+ ---
5
+
6
+ # Guardrails & Safety Filter Builder
7
+
8
+ Build comprehensive safety systems for LLM applications.
9
+
10
+ ## Safety Layers
11
+
12
+ 1. **Input filtering**: Block malicious prompts
13
+ 2. **Output filtering**: Redact sensitive data
14
+ 3. **Topic constraints**: Policy-based refusals
15
+ 4. **PII detection**: Mask personal information
16
+ 5. **Prompt injection**: Detect manipulation attempts
17
+
18
+ ## PII Detection & Redaction
19
+
20
+ ```python
21
+ import re
22
+ from presidio_analyzer import AnalyzerEngine
23
+ from presidio_anonymizer import AnonymizerEngine
24
+
25
# Shared engines, created once at import time and reused by every call.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()


def redact_pii(text: str) -> str:
    """Replace e-mail addresses, phone numbers, card numbers and SSNs.

    The text is analysed for four entity types and every detection is passed
    to the anonymizer, whose result text is returned.

    Args:
        text: English text that may contain personal data.

    Returns:
        The anonymized text produced by the anonymizer engine.
    """
    # Detect PII. Entity names must match Presidio's identifiers exactly:
    # the social-security recognizer is registered as "US_SSN", so a plain
    # "SSN" would match no recognizer and SSNs would pass through untouched.
    results = analyzer.analyze(
        text=text,
        language='en',
        entities=["EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "US_SSN"],
    )

    # Anonymize
    anonymized = anonymizer.anonymize(text=text, analyzer_results=results)
    return anonymized.text
39
+
40
+ # Example: "My email is john@example.com" → "My email is <EMAIL_ADDRESS>"
41
+ ```
42
+
43
+ ## Prompt Injection Detection
44
+
45
+ ```python
46
# Compiled once at import time. Each alternative tolerates any run of
# whitespace (including newlines) between words.
_INJECTION_PATTERN = re.compile(
    "|".join(
        f"(?:{pattern})"
        for pattern in (
            # "ignore previous instructions", "ignore all previous
            # instructions", "ignore the above instructions", ...
            r'ignore\s+(?:(?:all|any)\s+)?(?:(?:of\s+)?the\s+)?'
            r'(?:previous|above|prior)\s+instructions',
            r'disregard\s+(?:all|any)\s+(?:prior|previous)',
            r'you\s+are\s+now',
            r'new\s+instructions',
            r'system\s*:',
            # NOTE(review): very broad - also matches benign uses of the
            # word "override"; kept so nothing previously blocked gets
            # through.
            r'override',
        )
    ),
    re.IGNORECASE,
)


def detect_prompt_injection(user_input: str) -> bool:
    """Detect common prompt injection patterns.

    This is a keyword heuristic: it searches the input, case-insensitively,
    for a fixed set of phrases typical of instruction-override attempts.
    It cannot recognise paraphrased or encoded attacks.

    Args:
        user_input: Raw text received from the user.

    Returns:
        True if any known pattern occurs anywhere in the input, else False.
    """
    return _INJECTION_PATTERN.search(user_input) is not None
62
+
63
+ # Block if detected
64
+ if detect_prompt_injection(user_input):
65
+ return "I cannot process that request."
66
+ ```
67
+
68
+ ## Topic Constraints
69
+
70
+ ```python
71
+ # Define allowed/disallowed topics
72
POLICY = {
    "allowed_topics": [
        "product_features",
        "troubleshooting",
        "billing",
        "account_management",
    ],
    "disallowed_topics": [
        "medical_advice",
        "legal_advice",
        "financial_advice",
        "politics",
        "violence",
    ],
    "requires_disclaimer": [
        "security_practices",
        "data_privacy",
    ],
}


# Classify topic
def classify_topic(query: str) -> str:
    """Ask the LLM which policy topic a query belongs to.

    The prompt offers the allowed and disallowed topics from ``POLICY``.
    The reply is stripped of surrounding whitespace and lower-cased so that
    answers such as ``"Medical_Advice\\n"`` still compare equal to the
    lower-case topic names in ``POLICY``.

    NOTE(review): ``llm`` is not defined in this snippet - presumably a
    callable returning the completion text; confirm. The
    ``requires_disclaimer`` topics are not offered to the classifier.

    Args:
        query: The user's question.

    Returns:
        The normalised topic label returned by the LLM. Nothing guarantees
        it is one of the offered topics.
    """
    classification_prompt = f"""
    Classify this query into one of these topics:
    {', '.join(POLICY['allowed_topics'] + POLICY['disallowed_topics'])}

    Query: {query}

    Return only the topic name.
    """
    return llm(classification_prompt).strip().lower()
103
+
104
+ # Check policy
105
def check_policy(query: str) -> dict:
    """Classify a query and decide whether policy allows answering it.

    Args:
        query: The user's question.

    Returns:
        For a disallowed topic: ``{"allowed": False, "reason": ...,
        "refusal": ...}`` where ``refusal`` is the topic's refusal template,
        or the ``"out_of_scope"`` template when the topic has none.
        Otherwise ``{"allowed": True, "topic": ...}``.
    """
    # Normalise the label so stray whitespace or capitalisation in the
    # classifier's reply cannot slip a disallowed topic past the check.
    topic = classify_topic(query).strip().lower()

    if topic in POLICY["disallowed_topics"]:
        return {
            "allowed": False,
            "reason": f"Cannot provide {topic}",
            # Not every disallowed topic has a dedicated template; fall back
            # to the generic one instead of raising KeyError.
            "refusal": REFUSAL_TEMPLATES.get(
                topic, REFUSAL_TEMPLATES["out_of_scope"]
            ),
        }

    # NOTE(review): a label that is in neither list is treated as allowed
    # (fail-open) - confirm this is the intended policy.
    return {"allowed": True, "topic": topic}
116
+ ```
117
+
118
+ ## Refusal Templates
119
+
120
+ ```python
121
# User-facing refusal messages keyed by refusal reason. Written as joined
# single-line strings so no source indentation or surrounding newlines end
# up in the text shown to the user.
REFUSAL_TEMPLATES = {
    "medical_advice": (
        "I cannot provide medical advice. Please consult with a healthcare "
        "professional for medical concerns."
    ),
    "legal_advice": (
        "I cannot provide legal advice. For legal matters, please consult "
        "with a qualified attorney."
    ),
    "out_of_scope": (
        "I'm designed to help with product documentation and support. "
        "This question is outside my area of expertise."
    ),
}


def refuse_safely(reason: str) -> str:
    """Return the refusal message for a reason.

    Args:
        reason: Key into ``REFUSAL_TEMPLATES``, e.g. ``"medical_advice"``.

    Returns:
        The matching template, or the ``"out_of_scope"`` template when the
        reason has no dedicated message.
    """
    return REFUSAL_TEMPLATES.get(reason, REFUSAL_TEMPLATES["out_of_scope"])
138
+ ```
139
+
140
+ ## Output Validation
141
+
142
+ ```python
143
def validate_output(output: str) -> dict:
    """Check output before returning to user.

    Three checks run in order: PII detection (with redaction), a banned
    phrase scan, and a toxicity score. Only PII is removed from the text;
    banned phrases and toxicity are reported in ``issues`` but the text is
    otherwise left as it is.

    NOTE(review): ``toxicity_classifier`` is not defined in this snippet -
    presumably returns a numeric score where higher means more toxic;
    confirm.

    Args:
        output: The model response to validate.

    Returns:
        Dict with ``"safe"`` (True when no issue was found), ``"issues"``
        (list of human-readable findings) and ``"sanitized_output"`` (the
        response with detected PII anonymized).
    """
    issues = []

    # Check for PII
    pii_results = analyzer.analyze(text=output, language='en')
    if pii_results:
        issues.append("Contains PII")
        # Anonymize exactly the spans that were just detected. Re-analysing
        # with a narrower entity list would leave some flagged PII in the
        # text that is labelled "sanitized".
        output = anonymizer.anonymize(
            text=output, analyzer_results=pii_results
        ).text

    # Check for banned phrases
    lowered_output = output.lower()
    for phrase in ("password", "api key", "secret"):
        if phrase in lowered_output:
            issues.append(f"Contains banned phrase: {phrase}")

    # Check toxicity
    toxicity_score = toxicity_classifier(output)
    if toxicity_score > 0.7:
        issues.append("High toxicity detected")

    return {
        "safe": len(issues) == 0,
        "issues": issues,
        "sanitized_output": output,
    }
169
+ ```
170
+
171
+ ## Complete Guardrail Pipeline
172
+
173
+ ```python
174
def apply_guardrails(user_input: str) -> dict:
    """Run a request through every guardrail stage and build the reply.

    Stages, in order: prompt-injection screening, topic policy check,
    response generation, output validation. The first two can end the
    request early with ``"allowed": False``.

    Args:
        user_input: Raw text received from the user.

    Returns:
        Dict with ``"allowed"`` and ``"response"``. When output validation
        reports issues, ``"response"`` holds the sanitized text and a
        ``"warnings"`` list is added; the request is still marked allowed.
    """
    # Stage 1: screen the input for injection attempts.
    injection_found = detect_prompt_injection(user_input)
    if injection_found:
        return {"allowed": False, "response": "Invalid request detected."}

    # Stage 2: refuse topics the policy disallows.
    verdict = check_policy(user_input)
    if not verdict["allowed"]:
        return {"allowed": False, "response": verdict["refusal"]}

    # Stage 3: generate the answer.
    raw_response = llm(user_input)

    # Stage 4: validate the answer before it leaves the system.
    checked = validate_output(raw_response)
    if checked["safe"]:
        return {"allowed": True, "response": raw_response}

    return {
        "allowed": True,
        "response": checked["sanitized_output"],
        "warnings": checked["issues"],
    }
206
+ ```
207
+
208
+ ## Best Practices
209
+
210
+ - Layer multiple defenses
211
+ - Log all blocked requests
212
+ - Provide helpful refusals
213
+ - Redact rather than reject whenever possible
214
+ - Update detection patterns regularly
215
+ - Human review of edge cases
216
+
217
+ ## Output Checklist
218
+
219
+ - [ ] PII detection implemented
220
+ - [ ] Prompt injection detection
221
+ - [ ] Topic classification
222
+ - [ ] Policy constraints defined
223
+ - [ ] Refusal templates written
224
+ - [ ] Output validation
225
+ - [ ] Logging/monitoring
226
+ - [ ] Test cases for bypasses