@patricio0312rev/skillset 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/LICENSE +21 -0
- package/README.md +176 -0
- package/bin/cli.js +37 -0
- package/package.json +55 -0
- package/src/commands/init.js +301 -0
- package/src/index.js +168 -0
- package/src/lib/config.js +200 -0
- package/src/lib/generator.js +166 -0
- package/src/utils/display.js +95 -0
- package/src/utils/readme.js +196 -0
- package/src/utils/tool-specific.js +233 -0
- package/templates/ai-engineering/agent-orchestration-planner/ SKILL.md +266 -0
- package/templates/ai-engineering/cost-latency-optimizer/ SKILL.md +270 -0
- package/templates/ai-engineering/doc-to-vector-dataset-generator/ SKILL.md +239 -0
- package/templates/ai-engineering/evaluation-harness/ SKILL.md +219 -0
- package/templates/ai-engineering/guardrails-safety-filter-builder/ SKILL.md +226 -0
- package/templates/ai-engineering/llm-debugger/ SKILL.md +283 -0
- package/templates/ai-engineering/prompt-regression-tester/ SKILL.md +216 -0
- package/templates/ai-engineering/prompt-template-builder/ SKILL.md +393 -0
- package/templates/ai-engineering/rag-pipeline-builder/ SKILL.md +244 -0
- package/templates/ai-engineering/tool-function-schema-designer/ SKILL.md +219 -0
- package/templates/architecture/adr-writer/ SKILL.md +250 -0
- package/templates/architecture/api-versioning-deprecation-planner/ SKILL.md +331 -0
- package/templates/architecture/domain-model-boundaries-mapper/ SKILL.md +300 -0
- package/templates/architecture/migration-planner/ SKILL.md +376 -0
- package/templates/architecture/performance-budget-setter/ SKILL.md +318 -0
- package/templates/architecture/reliability-strategy-builder/ SKILL.md +286 -0
- package/templates/architecture/rfc-generator/ SKILL.md +362 -0
- package/templates/architecture/scalability-playbook/ SKILL.md +279 -0
- package/templates/architecture/system-design-generator/ SKILL.md +339 -0
- package/templates/architecture/tech-debt-prioritizer/ SKILL.md +329 -0
- package/templates/backend/api-contract-normalizer/ SKILL.md +487 -0
- package/templates/backend/api-endpoint-generator/ SKILL.md +415 -0
- package/templates/backend/auth-module-builder/ SKILL.md +99 -0
- package/templates/backend/background-jobs-designer/ SKILL.md +166 -0
- package/templates/backend/caching-strategist/ SKILL.md +190 -0
- package/templates/backend/error-handling-standardizer/ SKILL.md +174 -0
- package/templates/backend/rate-limiting-abuse-protection/ SKILL.md +147 -0
- package/templates/backend/rbac-permissions-builder/ SKILL.md +158 -0
- package/templates/backend/service-layer-extractor/ SKILL.md +269 -0
- package/templates/backend/webhook-receiver-hardener/ SKILL.md +211 -0
- package/templates/ci-cd/artifact-sbom-publisher/ SKILL.md +236 -0
- package/templates/ci-cd/caching-strategy-optimizer/ SKILL.md +195 -0
- package/templates/ci-cd/deployment-checklist-generator/ SKILL.md +381 -0
- package/templates/ci-cd/github-actions-pipeline-creator/ SKILL.md +348 -0
- package/templates/ci-cd/monorepo-ci-optimizer/ SKILL.md +298 -0
- package/templates/ci-cd/preview-environments-builder/ SKILL.md +187 -0
- package/templates/ci-cd/quality-gates-enforcer/ SKILL.md +342 -0
- package/templates/ci-cd/release-automation-builder/ SKILL.md +281 -0
- package/templates/ci-cd/rollback-workflow-builder/ SKILL.md +372 -0
- package/templates/ci-cd/secrets-env-manager/ SKILL.md +242 -0
- package/templates/db-management/backup-restore-runbook-generator/ SKILL.md +505 -0
- package/templates/db-management/data-integrity-auditor/ SKILL.md +505 -0
- package/templates/db-management/data-retention-archiving-planner/ SKILL.md +430 -0
- package/templates/db-management/data-seeding-fixtures-builder/ SKILL.md +375 -0
- package/templates/db-management/db-performance-watchlist/ SKILL.md +425 -0
- package/templates/db-management/etl-sync-job-builder/ SKILL.md +457 -0
- package/templates/db-management/multi-tenant-safety-checker/ SKILL.md +398 -0
- package/templates/db-management/prisma-migration-assistant/ SKILL.md +379 -0
- package/templates/db-management/schema-consistency-checker/ SKILL.md +440 -0
- package/templates/db-management/sql-query-optimizer/ SKILL.md +324 -0
- package/templates/foundation/changelog-writer/ SKILL.md +431 -0
- package/templates/foundation/code-formatter-installer/ SKILL.md +320 -0
- package/templates/foundation/codebase-summarizer/ SKILL.md +360 -0
- package/templates/foundation/dependency-doctor/ SKILL.md +163 -0
- package/templates/foundation/dev-environment-bootstrapper/ SKILL.md +259 -0
- package/templates/foundation/dev-onboarding-builder/ SKILL.md +556 -0
- package/templates/foundation/docs-starter-kit/ SKILL.md +574 -0
- package/templates/foundation/explaining-code/SKILL.md +13 -0
- package/templates/foundation/git-hygiene-enforcer/ SKILL.md +455 -0
- package/templates/foundation/project-scaffolder/ SKILL.md +65 -0
- package/templates/foundation/project-scaffolder/references/templates.md +126 -0
- package/templates/foundation/repo-structure-linter/ SKILL.md +0 -0
- package/templates/foundation/repo-structure-linter/references/conventions.md +98 -0
- package/templates/frontend/animation-micro-interaction-pack/ SKILL.md +41 -0
- package/templates/frontend/component-scaffold-generator/ SKILL.md +562 -0
- package/templates/frontend/design-to-component-translator/ SKILL.md +547 -0
- package/templates/frontend/form-wizard-builder/ SKILL.md +553 -0
- package/templates/frontend/frontend-refactor-planner/ SKILL.md +37 -0
- package/templates/frontend/i18n-frontend-implementer/ SKILL.md +44 -0
- package/templates/frontend/modal-drawer-system/ SKILL.md +377 -0
- package/templates/frontend/page-layout-builder/ SKILL.md +630 -0
- package/templates/frontend/state-ux-flow-builder/ SKILL.md +23 -0
- package/templates/frontend/table-builder/ SKILL.md +350 -0
- package/templates/performance/alerting-dashboard-builder/ SKILL.md +162 -0
- package/templates/performance/backend-latency-profiler-helper/ SKILL.md +108 -0
- package/templates/performance/caching-cdn-strategy-planner/ SKILL.md +150 -0
- package/templates/performance/capacity-planning-helper/ SKILL.md +242 -0
- package/templates/performance/core-web-vitals-tuner/ SKILL.md +126 -0
- package/templates/performance/incident-runbook-generator/ SKILL.md +162 -0
- package/templates/performance/load-test-scenario-builder/ SKILL.md +256 -0
- package/templates/performance/observability-setup/ SKILL.md +232 -0
- package/templates/performance/postmortem-writer/ SKILL.md +203 -0
- package/templates/performance/structured-logging-standardizer/ SKILL.md +122 -0
- package/templates/security/auth-security-reviewer/ SKILL.md +428 -0
- package/templates/security/dependency-vulnerability-triage/ SKILL.md +495 -0
- package/templates/security/input-validation-sanitization-auditor/ SKILL.md +76 -0
- package/templates/security/pii-redaction-logging-policy-builder/ SKILL.md +65 -0
- package/templates/security/rbac-policy-tester/ SKILL.md +80 -0
- package/templates/security/secrets-scanner/ SKILL.md +462 -0
- package/templates/security/secure-headers-csp-builder/ SKILL.md +404 -0
- package/templates/security/security-incident-playbook-generator/ SKILL.md +76 -0
- package/templates/security/security-pr-checklist-skill/ SKILL.md +62 -0
- package/templates/security/threat-model-generator/ SKILL.md +394 -0
- package/templates/testing/contract-testing-builder/ SKILL.md +492 -0
- package/templates/testing/coverage-strategist/ SKILL.md +436 -0
- package/templates/testing/e2e-test-builder/ SKILL.md +382 -0
- package/templates/testing/flaky-test-detective/ SKILL.md +416 -0
- package/templates/testing/integration-test-builder/ SKILL.md +525 -0
- package/templates/testing/mocking-assistant/ SKILL.md +383 -0
- package/templates/testing/snapshot-test-refactorer/ SKILL.md +375 -0
- package/templates/testing/test-data-factory-builder/ SKILL.md +449 -0
- package/templates/testing/test-reporting-triage-skill/ SKILL.md +469 -0
- package/templates/testing/unit-test-generator/ SKILL.md +548 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: evaluation-harness
|
|
3
|
+
description: Builds repeatable evaluation systems with golden datasets, scoring rubrics, pass/fail thresholds, and regression reports. Use for "LLM evaluation", "testing AI systems", "quality assurance", or "model benchmarking".
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Evaluation Harness
|
|
7
|
+
|
|
8
|
+
Build systematic evaluation frameworks for LLM applications.
|
|
9
|
+
|
|
10
|
+
## Golden Dataset Format
|
|
11
|
+
|
|
12
|
+
```json
|
|
13
|
+
[
|
|
14
|
+
{
|
|
15
|
+
"id": "test_001",
|
|
16
|
+
"category": "code_generation",
|
|
17
|
+
"input": "Write a Python function to reverse a string",
|
|
18
|
+
"expected_output": "def reverse_string(s: str) -> str:\n return s[::-1]",
|
|
19
|
+
"rubric": {
|
|
20
|
+
"correctness": 1.0,
|
|
21
|
+
"style": 0.8,
|
|
22
|
+
"documentation": 0.5
|
|
23
|
+
},
|
|
24
|
+
"metadata": {
|
|
25
|
+
"difficulty": "easy",
|
|
26
|
+
"tags": ["python", "strings"]
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Scoring Rubrics
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from typing import Dict, Any
|
|
36
|
+
|
|
37
|
+
def score_exact_match(actual: str, expected: str) -> float:
|
|
38
|
+
"""Binary score: 1.0 if exact match, 0.0 otherwise"""
|
|
39
|
+
return 1.0 if actual.strip() == expected.strip() else 0.0
|
|
40
|
+
|
|
41
|
+
def score_semantic_similarity(actual: str, expected: str) -> float:
|
|
42
|
+
"""Cosine similarity of embeddings"""
|
|
43
|
+
actual_emb = get_embedding(actual)
|
|
44
|
+
expected_emb = get_embedding(expected)
|
|
45
|
+
return cosine_similarity(actual_emb, expected_emb)
|
|
46
|
+
|
|
47
|
+
def score_contains_keywords(actual: str, keywords: List[str]) -> float:
|
|
48
|
+
"""Percentage of required keywords present"""
|
|
49
|
+
found = sum(1 for kw in keywords if kw.lower() in actual.lower())
|
|
50
|
+
return found / len(keywords)
|
|
51
|
+
|
|
52
|
+
def score_with_llm(actual: str, expected: str, rubric: Dict[str, float]) -> Dict[str, float]:
|
|
53
|
+
"""Use LLM as judge"""
|
|
54
|
+
prompt = f"""
|
|
55
|
+
Grade this output on a scale of 0-1 for each criterion:
|
|
56
|
+
|
|
57
|
+
Expected: {expected}
|
|
58
|
+
Actual: {actual}
|
|
59
|
+
|
|
60
|
+
Criteria: {', '.join(rubric.keys())}
|
|
61
|
+
|
|
62
|
+
Return JSON with scores.
|
|
63
|
+
"""
|
|
64
|
+
return json.loads(llm(prompt))
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Test Runner
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
class EvaluationHarness:
|
|
71
|
+
def __init__(self, dataset_path: str):
|
|
72
|
+
self.dataset = self.load_dataset(dataset_path)
|
|
73
|
+
self.results = []
|
|
74
|
+
|
|
75
|
+
def run_evaluation(self, model_fn):
|
|
76
|
+
for test_case in self.dataset:
|
|
77
|
+
# Generate output
|
|
78
|
+
actual = model_fn(test_case["input"])
|
|
79
|
+
|
|
80
|
+
# Score
|
|
81
|
+
scores = self.score_output(
|
|
82
|
+
actual,
|
|
83
|
+
test_case["expected_output"],
|
|
84
|
+
test_case["rubric"]
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
# Record result
|
|
88
|
+
self.results.append({
|
|
89
|
+
"test_id": test_case["id"],
|
|
90
|
+
"category": test_case["category"],
|
|
91
|
+
"scores": scores,
|
|
92
|
+
"passed": self.check_threshold(scores, test_case),
|
|
93
|
+
"actual_output": actual,
|
|
94
|
+
})
|
|
95
|
+
|
|
96
|
+
return self.generate_report()
|
|
97
|
+
|
|
98
|
+
def score_output(self, actual, expected, rubric):
|
|
99
|
+
return {
|
|
100
|
+
"exact_match": score_exact_match(actual, expected),
|
|
101
|
+
"semantic_similarity": score_semantic_similarity(actual, expected),
|
|
102
|
+
**score_with_llm(actual, expected, rubric)
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
def check_threshold(self, scores, test_case):
|
|
106
|
+
min_scores = test_case.get("min_scores", {})
|
|
107
|
+
for metric, threshold in min_scores.items():
|
|
108
|
+
if scores.get(metric, 0) < threshold:
|
|
109
|
+
return False
|
|
110
|
+
return True
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Thresholds & Pass Criteria
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
# Define thresholds per category
|
|
117
|
+
THRESHOLDS = {
|
|
118
|
+
"code_generation": {
|
|
119
|
+
"correctness": 0.9,
|
|
120
|
+
"style": 0.7,
|
|
121
|
+
},
|
|
122
|
+
"summarization": {
|
|
123
|
+
"semantic_similarity": 0.8,
|
|
124
|
+
"brevity": 0.7,
|
|
125
|
+
},
|
|
126
|
+
"classification": {
|
|
127
|
+
"exact_match": 1.0,
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
def check_test_passed(result: Dict) -> bool:
|
|
132
|
+
category = result["category"]
|
|
133
|
+
thresholds = THRESHOLDS.get(category, {})
|
|
134
|
+
|
|
135
|
+
for metric, threshold in thresholds.items():
|
|
136
|
+
if result["scores"].get(metric, 0) < threshold:
|
|
137
|
+
return False
|
|
138
|
+
|
|
139
|
+
return True
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Regression Report
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
def generate_regression_report(baseline_results, current_results):
|
|
146
|
+
report = {
|
|
147
|
+
"summary": {},
|
|
148
|
+
"regressions": [],
|
|
149
|
+
"improvements": [],
|
|
150
|
+
"unchanged": 0
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
for baseline, current in zip(baseline_results, current_results):
|
|
154
|
+
assert baseline["test_id"] == current["test_id"]
|
|
155
|
+
|
|
156
|
+
baseline_passed = baseline["passed"]
|
|
157
|
+
current_passed = current["passed"]
|
|
158
|
+
|
|
159
|
+
if baseline_passed and not current_passed:
|
|
160
|
+
report["regressions"].append({
|
|
161
|
+
"test_id": baseline["test_id"],
|
|
162
|
+
"category": baseline["category"],
|
|
163
|
+
"baseline_scores": baseline["scores"],
|
|
164
|
+
"current_scores": current["scores"],
|
|
165
|
+
})
|
|
166
|
+
elif not baseline_passed and current_passed:
|
|
167
|
+
report["improvements"].append(baseline["test_id"])
|
|
168
|
+
else:
|
|
169
|
+
report["unchanged"] += 1
|
|
170
|
+
|
|
171
|
+
report["summary"] = {
|
|
172
|
+
"total_tests": len(baseline_results),
|
|
173
|
+
"regressions": len(report["regressions"]),
|
|
174
|
+
"improvements": len(report["improvements"]),
|
|
175
|
+
"unchanged": report["unchanged"],
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
return report
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Continuous Evaluation
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
# Run evaluation on every commit
|
|
185
|
+
def ci_evaluation():
|
|
186
|
+
harness = EvaluationHarness("golden_dataset.json")
|
|
187
|
+
results = harness.run_evaluation(production_model)
|
|
188
|
+
|
|
189
|
+
# Check for regressions
|
|
190
|
+
baseline = load_baseline("baseline_results.json")
|
|
191
|
+
report = generate_regression_report(baseline, results)
|
|
192
|
+
|
|
193
|
+
# Fail CI if regressions
|
|
194
|
+
if report["summary"]["regressions"] > 0:
|
|
195
|
+
print(f"❌ {report['summary']['regressions']} regressions detected!")
|
|
196
|
+
sys.exit(1)
|
|
197
|
+
|
|
198
|
+
print("✅ All tests passed!")
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## Best Practices
|
|
202
|
+
|
|
203
|
+
1. **Representative dataset**: Cover edge cases
|
|
204
|
+
2. **Multiple metrics**: Don't rely on one score
|
|
205
|
+
3. **Human validation**: Review LLM judge scores
|
|
206
|
+
4. **Version datasets**: Track changes over time
|
|
207
|
+
5. **Automate in CI**: Catch regressions early
|
|
208
|
+
6. **Regular updates**: Add new test cases
|
|
209
|
+
|
|
210
|
+
## Output Checklist
|
|
211
|
+
|
|
212
|
+
- [ ] Golden dataset created (50+ examples)
|
|
213
|
+
- [ ] Multiple scoring functions
|
|
214
|
+
- [ ] Pass/fail thresholds defined
|
|
215
|
+
- [ ] Test runner implemented
|
|
216
|
+
- [ ] Regression comparison
|
|
217
|
+
- [ ] Report generation
|
|
218
|
+
- [ ] CI integration
|
|
219
|
+
- [ ] Baseline established
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: guardrails-safety-filter-builder
|
|
3
|
+
description: Implements content safety filters with PII redaction, policy constraints, prompt injection detection, and safe refusal templates. Use when adding "content moderation", "safety filters", "PII protection", or "guardrails".
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Guardrails & Safety Filter Builder
|
|
7
|
+
|
|
8
|
+
Build comprehensive safety systems for LLM applications.
|
|
9
|
+
|
|
10
|
+
## Safety Layers
|
|
11
|
+
|
|
12
|
+
1. **Input filtering**: Block malicious prompts
|
|
13
|
+
2. **Output filtering**: Redact sensitive data
|
|
14
|
+
3. **Topic constraints**: Policy-based refusals
|
|
15
|
+
4. **PII detection**: Mask personal information
|
|
16
|
+
5. **Prompt injection**: Detect manipulation attempts
|
|
17
|
+
|
|
18
|
+
## PII Detection & Redaction
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
import re
|
|
22
|
+
from presidio_analyzer import AnalyzerEngine
|
|
23
|
+
from presidio_anonymizer import AnonymizerEngine
|
|
24
|
+
|
|
25
|
+
analyzer = AnalyzerEngine()
|
|
26
|
+
anonymizer = AnonymizerEngine()
|
|
27
|
+
|
|
28
|
+
def redact_pii(text: str) -> str:
|
|
29
|
+
# Detect PII
|
|
30
|
+
results = analyzer.analyze(
|
|
31
|
+
text=text,
|
|
32
|
+
language='en',
|
|
33
|
+
entities=["EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "SSN"]
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# Anonymize
|
|
37
|
+
anonymized = anonymizer.anonymize(text, results)
|
|
38
|
+
return anonymized.text
|
|
39
|
+
|
|
40
|
+
# Example: "My email is john@example.com" → "My email is <EMAIL_ADDRESS>"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Prompt Injection Detection
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
def detect_prompt_injection(user_input: str) -> bool:
|
|
47
|
+
"""Detect common prompt injection patterns"""
|
|
48
|
+
patterns = [
|
|
49
|
+
r'ignore (previous|above) instructions',
|
|
50
|
+
r'disregard (all|any) (prior|previous)',
|
|
51
|
+
r'you are now',
|
|
52
|
+
r'new instructions',
|
|
53
|
+
r'system:',
|
|
54
|
+
r'override',
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
for pattern in patterns:
|
|
58
|
+
if re.search(pattern, user_input, re.IGNORECASE):
|
|
59
|
+
return True
|
|
60
|
+
|
|
61
|
+
return False
|
|
62
|
+
|
|
63
|
+
# Block if detected
|
|
64
|
+
if detect_prompt_injection(user_input):
|
|
65
|
+
return "I cannot process that request."
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Topic Constraints
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
# Define allowed/disallowed topics
|
|
72
|
+
POLICY = {
|
|
73
|
+
"allowed_topics": [
|
|
74
|
+
"product_features",
|
|
75
|
+
"troubleshooting",
|
|
76
|
+
"billing",
|
|
77
|
+
"account_management"
|
|
78
|
+
],
|
|
79
|
+
"disallowed_topics": [
|
|
80
|
+
"medical_advice",
|
|
81
|
+
"legal_advice",
|
|
82
|
+
"financial_advice",
|
|
83
|
+
"politics",
|
|
84
|
+
"violence"
|
|
85
|
+
],
|
|
86
|
+
"requires_disclaimer": [
|
|
87
|
+
"security_practices",
|
|
88
|
+
"data_privacy"
|
|
89
|
+
]
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
# Classify topic
|
|
93
|
+
def classify_topic(query: str) -> str:
|
|
94
|
+
classification_prompt = f"""
|
|
95
|
+
Classify this query into one of these topics:
|
|
96
|
+
{', '.join(POLICY['allowed_topics'] + POLICY['disallowed_topics'])}
|
|
97
|
+
|
|
98
|
+
Query: {query}
|
|
99
|
+
|
|
100
|
+
Return only the topic name.
|
|
101
|
+
"""
|
|
102
|
+
return llm(classification_prompt)
|
|
103
|
+
|
|
104
|
+
# Check policy
|
|
105
|
+
def check_policy(query: str) -> dict:
|
|
106
|
+
topic = classify_topic(query)
|
|
107
|
+
|
|
108
|
+
if topic in POLICY["disallowed_topics"]:
|
|
109
|
+
return {
|
|
110
|
+
"allowed": False,
|
|
111
|
+
"reason": f"Cannot provide {topic}",
|
|
112
|
+
"refusal": REFUSAL_TEMPLATES[topic]
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
return {"allowed": True, "topic": topic}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Refusal Templates
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
REFUSAL_TEMPLATES = {
|
|
122
|
+
"medical_advice": """
|
|
123
|
+
I cannot provide medical advice. Please consult with a healthcare
|
|
124
|
+
professional for medical concerns.
|
|
125
|
+
""",
|
|
126
|
+
"legal_advice": """
|
|
127
|
+
I cannot provide legal advice. For legal matters, please consult
|
|
128
|
+
with a qualified attorney.
|
|
129
|
+
""",
|
|
130
|
+
"out_of_scope": """
|
|
131
|
+
I'm designed to help with product documentation and support.
|
|
132
|
+
This question is outside my area of expertise.
|
|
133
|
+
""",
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
def refuse_safely(reason: str) -> str:
|
|
137
|
+
return REFUSAL_TEMPLATES.get(reason, REFUSAL_TEMPLATES["out_of_scope"])
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Output Validation
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
def validate_output(output: str) -> dict:
|
|
144
|
+
"""Check output before returning to user"""
|
|
145
|
+
issues = []
|
|
146
|
+
|
|
147
|
+
# Check for PII
|
|
148
|
+
pii_results = analyzer.analyze(output, language='en')
|
|
149
|
+
if pii_results:
|
|
150
|
+
issues.append("Contains PII")
|
|
151
|
+
output = redact_pii(output)
|
|
152
|
+
|
|
153
|
+
# Check for banned phrases
|
|
154
|
+
banned_phrases = ["password", "api key", "secret"]
|
|
155
|
+
for phrase in banned_phrases:
|
|
156
|
+
if phrase.lower() in output.lower():
|
|
157
|
+
issues.append(f"Contains banned phrase: {phrase}")
|
|
158
|
+
|
|
159
|
+
# Check toxicity
|
|
160
|
+
toxicity_score = toxicity_classifier(output)
|
|
161
|
+
if toxicity_score > 0.7:
|
|
162
|
+
issues.append("High toxicity detected")
|
|
163
|
+
|
|
164
|
+
return {
|
|
165
|
+
"safe": len(issues) == 0,
|
|
166
|
+
"issues": issues,
|
|
167
|
+
"sanitized_output": output
|
|
168
|
+
}
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Complete Guardrail Pipeline
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
def apply_guardrails(user_input: str) -> dict:
|
|
175
|
+
# 1. Input validation
|
|
176
|
+
if detect_prompt_injection(user_input):
|
|
177
|
+
return {
|
|
178
|
+
"allowed": False,
|
|
179
|
+
"response": "Invalid request detected."
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
# 2. Policy check
|
|
183
|
+
policy_check = check_policy(user_input)
|
|
184
|
+
if not policy_check["allowed"]:
|
|
185
|
+
return {
|
|
186
|
+
"allowed": False,
|
|
187
|
+
"response": policy_check["refusal"]
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
# 3. Generate response
|
|
191
|
+
response = llm(user_input)
|
|
192
|
+
|
|
193
|
+
# 4. Output validation
|
|
194
|
+
validation = validate_output(response)
|
|
195
|
+
if not validation["safe"]:
|
|
196
|
+
return {
|
|
197
|
+
"allowed": True,
|
|
198
|
+
"response": validation["sanitized_output"],
|
|
199
|
+
"warnings": validation["issues"]
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
return {
|
|
203
|
+
"allowed": True,
|
|
204
|
+
"response": response
|
|
205
|
+
}
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Best Practices
|
|
209
|
+
|
|
210
|
+
- Layer multiple defenses
|
|
211
|
+
- Log all blocked requests
|
|
212
|
+
- Provide helpful refusals
|
|
213
|
+
- Redact, don't reject when possible
|
|
214
|
+
- Regular pattern updates
|
|
215
|
+
- Human review of edge cases
|
|
216
|
+
|
|
217
|
+
## Output Checklist
|
|
218
|
+
|
|
219
|
+
- [ ] PII detection implemented
|
|
220
|
+
- [ ] Prompt injection detection
|
|
221
|
+
- [ ] Topic classification
|
|
222
|
+
- [ ] Policy constraints defined
|
|
223
|
+
- [ ] Refusal templates written
|
|
224
|
+
- [ ] Output validation
|
|
225
|
+
- [ ] Logging/monitoring
|
|
226
|
+
- [ ] Test cases for bypasses
|