@sylix/coworker 2.0.11 → 2.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/dist/commands/slash/config.d.ts.map +1 -1
  2. package/dist/commands/slash/config.js +22 -4
  3. package/dist/commands/slash/config.js.map +1 -1
  4. package/dist/core/CoWorkerAgent.d.ts.map +1 -1
  5. package/dist/core/CoWorkerAgent.js +6 -3
  6. package/dist/core/CoWorkerAgent.js.map +1 -1
  7. package/dist/skills/defaults/accessibility/screen-reader-testing.md +545 -0
  8. package/dist/skills/defaults/accessibility/wcag-audit-patterns.md +555 -0
  9. package/dist/skills/defaults/ai-ml/rag.md +276 -0
  10. package/dist/skills/defaults/backend-development/api-design-principles.md +528 -0
  11. package/dist/skills/defaults/backend-development/api-design.md +285 -0
  12. package/dist/skills/defaults/backend-development/architecture-patterns.md +494 -0
  13. package/dist/skills/defaults/backend-development/async-python.md +237 -0
  14. package/dist/skills/defaults/backend-development/auth-implementation-patterns.md +638 -0
  15. package/dist/skills/defaults/backend-development/bazel-build-optimization.md +387 -0
  16. package/dist/skills/defaults/backend-development/billing-automation/SKILL.md +566 -0
  17. package/dist/skills/defaults/backend-development/code-review-excellence.md +538 -0
  18. package/dist/skills/defaults/backend-development/cqrs-implementation.md +554 -0
  19. package/dist/skills/defaults/backend-development/database-design.md +305 -0
  20. package/dist/skills/defaults/backend-development/debugging-strategies.md +536 -0
  21. package/dist/skills/defaults/backend-development/e2e-testing-patterns.md +544 -0
  22. package/dist/skills/defaults/backend-development/error-handling-patterns.md +641 -0
  23. package/dist/skills/defaults/backend-development/fastapi-templates.md +559 -0
  24. package/dist/skills/defaults/backend-development/fastapi.md +309 -0
  25. package/dist/skills/defaults/backend-development/git-advanced-workflows.md +405 -0
  26. package/dist/skills/defaults/backend-development/microservices-patterns.md +595 -0
  27. package/dist/skills/defaults/backend-development/microservices.md +284 -0
  28. package/dist/skills/defaults/backend-development/monorepo-management.md +623 -0
  29. package/dist/skills/defaults/backend-development/nodejs-backend-patterns.md +1048 -0
  30. package/dist/skills/defaults/backend-development/nx-workspace-patterns.md +457 -0
  31. package/dist/skills/defaults/backend-development/paypal-integration/SKILL.md +478 -0
  32. package/dist/skills/defaults/backend-development/pci-compliance/SKILL.md +480 -0
  33. package/dist/skills/defaults/backend-development/python-anti-patterns.md +349 -0
  34. package/dist/skills/defaults/backend-development/python-background-jobs.md +364 -0
  35. package/dist/skills/defaults/backend-development/python-code-style.md +360 -0
  36. package/dist/skills/defaults/backend-development/python-configuration.md +368 -0
  37. package/dist/skills/defaults/backend-development/python-design-patterns.md +296 -0
  38. package/dist/skills/defaults/backend-development/python-error-handling.md +323 -0
  39. package/dist/skills/defaults/backend-development/python-packaging.md +887 -0
  40. package/dist/skills/defaults/backend-development/python-performance-optimization.md +874 -0
  41. package/dist/skills/defaults/backend-development/python-project-structure.md +252 -0
  42. package/dist/skills/defaults/backend-development/python-resilience.md +376 -0
  43. package/dist/skills/defaults/backend-development/python-resource-management.md +421 -0
  44. package/dist/skills/defaults/backend-development/python-type-safety.md +428 -0
  45. package/dist/skills/defaults/backend-development/sql-optimization-patterns.md +509 -0
  46. package/dist/skills/defaults/backend-development/stripe-integration/SKILL.md +522 -0
  47. package/dist/skills/defaults/backend-development/turborepo-caching.md +376 -0
  48. package/dist/skills/defaults/blockchain/defi-protocol-templates.md +430 -0
  49. package/dist/skills/defaults/blockchain/nft-standards.md +364 -0
  50. package/dist/skills/defaults/blockchain/solidity-security.md +514 -0
  51. package/dist/skills/defaults/blockchain/web3-testing.md +360 -0
  52. package/dist/skills/defaults/business/competitive-landscape/SKILL.md +527 -0
  53. package/dist/skills/defaults/business/market-sizing-analysis/SKILL.md +451 -0
  54. package/dist/skills/defaults/business/startup-financial-modeling/SKILL.md +494 -0
  55. package/dist/skills/defaults/business/startup-metrics-framework/SKILL.md +564 -0
  56. package/dist/skills/defaults/business/team-composition-analysis.md +437 -0
  57. package/dist/skills/defaults/compliance/employment-contract-templates/SKILL.md +527 -0
  58. package/dist/skills/defaults/compliance/gdpr-data-handling/SKILL.md +630 -0
  59. package/dist/skills/defaults/data-engineering/airflow-dag-patterns.md +436 -0
  60. package/dist/skills/defaults/data-engineering/airflow.md +519 -0
  61. package/dist/skills/defaults/data-engineering/data-quality.md +583 -0
  62. package/dist/skills/defaults/data-engineering/dbt-transformation-patterns.md +482 -0
  63. package/dist/skills/defaults/data-engineering/dbt.md +556 -0
  64. package/dist/skills/defaults/data-engineering/ml-pipeline-workflow/SKILL.md +247 -0
  65. package/dist/skills/defaults/data-engineering/spark-optimization.md +348 -0
  66. package/dist/skills/defaults/data-engineering/spark.md +411 -0
  67. package/dist/skills/defaults/database/postgresql.md +202 -0
  68. package/dist/skills/defaults/debugging/systematic-debugging.md +249 -0
  69. package/dist/skills/defaults/devops/architecture-decision-records.md +448 -0
  70. package/dist/skills/defaults/devops/changelog-automation.md +580 -0
  71. package/dist/skills/defaults/devops/cicd.md +314 -0
  72. package/dist/skills/defaults/devops/cloud.md +263 -0
  73. package/dist/skills/defaults/devops/code-review-excellence.md +299 -0
  74. package/dist/skills/defaults/devops/cost-optimization.md +295 -0
  75. package/dist/skills/defaults/devops/deployment-pipeline-design.md +356 -0
  76. package/dist/skills/defaults/devops/docker.md +281 -0
  77. package/dist/skills/defaults/devops/git-workflows.md +205 -0
  78. package/dist/skills/defaults/devops/github-actions.md +311 -0
  79. package/dist/skills/defaults/devops/gitlab-ci-patterns.md +266 -0
  80. package/dist/skills/defaults/devops/hybrid-cloud-networking.md +241 -0
  81. package/dist/skills/defaults/devops/istio-traffic-management.md +327 -0
  82. package/dist/skills/defaults/devops/kubernetes.md +339 -0
  83. package/dist/skills/defaults/devops/linkerd-patterns.md +311 -0
  84. package/dist/skills/defaults/devops/multi-cloud-architecture.md +181 -0
  85. package/dist/skills/defaults/devops/observability.md +243 -0
  86. package/dist/skills/defaults/devops/openapi-spec-generation.md +1024 -0
  87. package/dist/skills/defaults/devops/postmortem-writing.md +396 -0
  88. package/dist/skills/defaults/devops/prometheus-configuration.md +265 -0
  89. package/dist/skills/defaults/devops/secrets-management.md +341 -0
  90. package/dist/skills/defaults/devops/service-mesh-observability.md +385 -0
  91. package/dist/skills/defaults/devops/terraform-module-library.md +244 -0
  92. package/dist/skills/defaults/finance/backtesting-frameworks/SKILL.md +663 -0
  93. package/dist/skills/defaults/finance/risk-metrics-calculation/SKILL.md +557 -0
  94. package/dist/skills/defaults/frontend/accessibility-compliance.md +420 -0
  95. package/dist/skills/defaults/frontend/design-system-patterns.md +337 -0
  96. package/dist/skills/defaults/frontend/interaction-design.md +327 -0
  97. package/dist/skills/defaults/frontend/javascript.md +311 -0
  98. package/dist/skills/defaults/frontend/modern-javascript-patterns.md +927 -0
  99. package/dist/skills/defaults/frontend/react-native-design.md +440 -0
  100. package/dist/skills/defaults/frontend/react.md +345 -0
  101. package/dist/skills/defaults/frontend/responsive-design.md +472 -0
  102. package/dist/skills/defaults/frontend/tailwind-design-system.md +337 -0
  103. package/dist/skills/defaults/frontend/typescript-advanced-types.md +724 -0
  104. package/dist/skills/defaults/frontend/typescript.md +334 -0
  105. package/dist/skills/defaults/frontend/visual-design-foundations.md +326 -0
  106. package/dist/skills/defaults/frontend/web-component-design.md +279 -0
  107. package/dist/skills/defaults/game-development/godot-gdscript-patterns.md +188 -0
  108. package/dist/skills/defaults/game-development/unity-ecs-patterns.md +594 -0
  109. package/dist/skills/defaults/kubernetes/gitops-workflow.md +285 -0
  110. package/dist/skills/defaults/kubernetes/gitops.md +280 -0
  111. package/dist/skills/defaults/kubernetes/helm-chart-scaffolding.md +553 -0
  112. package/dist/skills/defaults/kubernetes/helm.md +343 -0
  113. package/dist/skills/defaults/kubernetes/k8s-manifest-generator.md +501 -0
  114. package/dist/skills/defaults/kubernetes/k8s-security-policies.md +342 -0
  115. package/dist/skills/defaults/kubernetes/manifests.md +330 -0
  116. package/dist/skills/defaults/kubernetes/security.md +337 -0
  117. package/dist/skills/defaults/llm-application/embedding-strategies.md +608 -0
  118. package/dist/skills/defaults/llm-application/hybrid-search-implementation.md +570 -0
  119. package/dist/skills/defaults/llm-application/hybrid-search.md +570 -0
  120. package/dist/skills/defaults/llm-application/langchain-architecture.md +666 -0
  121. package/dist/skills/defaults/llm-application/langchain.md +259 -0
  122. package/dist/skills/defaults/llm-application/llm-evaluation.md +695 -0
  123. package/dist/skills/defaults/llm-application/prompt-engineering-patterns.md +449 -0
  124. package/dist/skills/defaults/llm-application/prompt-engineering.md +219 -0
  125. package/dist/skills/defaults/llm-application/rag-implementation.md +434 -0
  126. package/dist/skills/defaults/llm-application/similarity-search-patterns.md +560 -0
  127. package/dist/skills/defaults/llm-application/similarity-search.md +560 -0
  128. package/dist/skills/defaults/llm-application/vector-index-tuning.md +523 -0
  129. package/dist/skills/defaults/mobile/mobile-android-design.md +440 -0
  130. package/dist/skills/defaults/mobile/mobile-ios-design.md +266 -0
  131. package/dist/skills/defaults/monitoring/distributed-tracing.md +436 -0
  132. package/dist/skills/defaults/monitoring/grafana-dashboards.md +370 -0
  133. package/dist/skills/defaults/monitoring/prometheus-configuration.md +379 -0
  134. package/dist/skills/defaults/monitoring/slo-implementation.md +323 -0
  135. package/dist/skills/defaults/refactoring/code-refactoring.md +349 -0
  136. package/dist/skills/defaults/security/anti-reversing-techniques/SKILL.md +559 -0
  137. package/dist/skills/defaults/security/auditor.md +168 -0
  138. package/dist/skills/defaults/security/binary-analysis-patterns/SKILL.md +438 -0
  139. package/dist/skills/defaults/security/memory-forensics/SKILL.md +483 -0
  140. package/dist/skills/defaults/security/mtls-configuration.md +349 -0
  141. package/dist/skills/defaults/security/protocol-reverse-engineering/SKILL.md +520 -0
  142. package/dist/skills/defaults/security/sast-configuration.md +182 -0
  143. package/dist/skills/defaults/security/security.md +313 -0
  144. package/dist/skills/defaults/security/stride-analysis.md +273 -0
  145. package/dist/skills/defaults/security/threat-mitigation-mapping.md +290 -0
  146. package/dist/skills/defaults/systems/bash-defensive-patterns/SKILL.md +539 -0
  147. package/dist/skills/defaults/systems/bats-testing-patterns/SKILL.md +631 -0
  148. package/dist/skills/defaults/systems/go-concurrency-patterns.md +657 -0
  149. package/dist/skills/defaults/systems/memory-safety-patterns.md +605 -0
  150. package/dist/skills/defaults/systems/rust-async-patterns.md +519 -0
  151. package/dist/skills/defaults/systems/shellcheck-configuration/SKILL.md +456 -0
  152. package/dist/skills/defaults/team-collaboration/multi-reviewer-patterns.md +126 -0
  153. package/dist/skills/defaults/team-collaboration/parallel-feature-development.md +151 -0
  154. package/dist/skills/defaults/testing/javascript-testing-patterns.md +1021 -0
  155. package/dist/skills/defaults/testing/python-testing-patterns.md +351 -0
  156. package/dist/skills/defaults/testing/testing.md +332 -0
  157. package/dist/skills/defaults/workflows/context-driven-development.md +384 -0
  158. package/dist/skills/defaults/workflows/track-management.md +592 -0
  159. package/dist/skills/defaults/workflows/workflow-patterns.md +622 -0
  160. package/dist/skills/index.d.ts +11 -0
  161. package/dist/skills/index.d.ts.map +1 -0
  162. package/dist/skills/index.js +129 -0
  163. package/dist/skills/index.js.map +1 -0
  164. package/dist/utils/character.js +4 -4
  165. package/dist/utils/character.js.map +1 -1
  166. package/dist/utils/inputbar.d.ts.map +1 -1
  167. package/dist/utils/inputbar.js +7 -0
  168. package/dist/utils/inputbar.js.map +1 -1
  169. package/package.json +1 -1
@@ -0,0 +1,695 @@
1
+ ---
2
+ name: llm-evaluation
3
+ description: Implement comprehensive evaluation strategies for LLM applications using automated metrics, human feedback, and benchmarking. Use when testing LLM performance, measuring AI application quality, or establishing evaluation frameworks.
4
+ ---
5
+
6
+ # LLM Evaluation
7
+
8
+ Master comprehensive evaluation strategies for LLM applications, from automated metrics to human evaluation and A/B testing.
9
+
10
+ ## When to Use This Skill
11
+
12
+ - Measuring LLM application performance systematically
13
+ - Comparing different models or prompts
14
+ - Detecting performance regressions before deployment
15
+ - Validating improvements from prompt changes
16
+ - Building confidence in production systems
17
+ - Establishing baselines and tracking progress over time
18
+ - Debugging unexpected model behavior
19
+
20
+ ## Core Evaluation Types
21
+
22
+ ### 1. Automated Metrics
23
+
24
+ Fast, repeatable, scalable evaluation using computed scores.
25
+
26
+ **Text Generation:**
27
+
28
+ - **BLEU**: N-gram overlap (translation)
29
+ - **ROUGE**: Recall-oriented (summarization)
30
+ - **METEOR**: Unigram matching with stemming and synonym support
31
+ - **BERTScore**: Embedding-based similarity
32
+ - **Perplexity**: Language model confidence
33
+
34
+ **Classification:**
35
+
36
+ - **Accuracy**: Percentage correct
37
+ - **Precision/Recall/F1**: Class-specific performance
38
+ - **Confusion Matrix**: Error patterns
39
+ - **AUC-ROC**: Ranking quality
40
+
41
+ **Retrieval (RAG):**
42
+
43
+ - **MRR**: Mean Reciprocal Rank
44
+ - **NDCG**: Normalized Discounted Cumulative Gain
45
+ - **Precision@K**: Relevant in top K
46
+ - **Recall@K**: Coverage in top K
47
+
48
+ ### 2. Human Evaluation
49
+
50
+ Manual assessment for quality aspects difficult to automate.
51
+
52
+ **Dimensions:**
53
+
54
+ - **Accuracy**: Factual correctness
55
+ - **Coherence**: Logical flow
56
+ - **Relevance**: Answers the question
57
+ - **Fluency**: Natural language quality
58
+ - **Safety**: No harmful content
59
+ - **Helpfulness**: Useful to the user
60
+
61
+ ### 3. LLM-as-Judge
62
+
63
+ Use stronger LLMs to evaluate weaker model outputs.
64
+
65
+ **Approaches:**
66
+
67
+ - **Pointwise**: Score individual responses
68
+ - **Pairwise**: Compare two responses
69
+ - **Reference-based**: Compare to gold standard
70
+ - **Reference-free**: Judge without ground truth
71
+
72
+ ## Quick Start
73
+
74
+ ```python
75
+ from dataclasses import dataclass
76
+ from typing import Callable
77
+ import numpy as np
78
+
79
@dataclass
class Metric:
    """A scoring function paired with the name its scores are reported under.

    Attributes:
        name: Key used for this metric in result dictionaries.
        fn: Callable that produces the score.
    """

    name: str
    fn: Callable

    @staticmethod
    def accuracy():
        """Return the metric named "accuracy", backed by ``calculate_accuracy``."""
        return Metric(name="accuracy", fn=calculate_accuracy)

    @staticmethod
    def bleu():
        """Return the metric named "bleu", backed by ``calculate_bleu``."""
        return Metric(name="bleu", fn=calculate_bleu)

    @staticmethod
    def bertscore():
        """Return the metric named "bertscore", backed by ``calculate_bertscore``."""
        return Metric(name="bertscore", fn=calculate_bertscore)

    @staticmethod
    def custom(name: str, fn: Callable):
        """Wrap an arbitrary callable ``fn`` as a metric called ``name``."""
        return Metric(name=name, fn=fn)
99
+
100
class EvaluationSuite:
    """Run a fixed list of metrics over a set of test cases."""

    def __init__(self, metrics: list["Metric"]):
        """Store the metrics; each must expose ``name`` and ``fn``."""
        self.metrics = metrics

    async def evaluate(self, model, test_cases: list[dict]) -> dict:
        """Score ``model`` on every test case with every metric.

        Args:
            model: Object with an awaitable ``predict(input)`` method.
            test_cases: Dicts with an "input" key and optional "expected"
                and "context" keys.

        Returns:
            ``{"metrics": {name: aggregate}, "raw_scores": {name: [score, ...]}}``.
            The aggregate is the mean for numeric scores, a per-key mean for
            dict-valued scores, and NaN when no test cases were supplied.
        """
        results = {m.name: [] for m in self.metrics}

        for test in test_cases:
            prediction = await model.predict(test["input"])

            for metric in self.metrics:
                score = metric.fn(
                    prediction=prediction,
                    reference=test.get("expected"),
                    context=test.get("context")
                )
                results[metric.name].append(score)

        return {
            "metrics": {k: self._aggregate(v) for k, v in results.items()},
            "raw_scores": results
        }

    @staticmethod
    def _aggregate(scores: list):
        """Average one metric's scores, handling empty and dict-valued lists."""
        if not scores:
            # np.mean([]) would emit a RuntimeWarning; return NaN explicitly.
            return float("nan")
        if isinstance(scores[0], dict):
            # Metrics that return several sub-scores as a dict are averaged
            # key by key instead of crashing np.mean.
            return {
                key: float(np.mean([s[key] for s in scores]))
                for key in scores[0]
            }
        return float(np.mean(scores))
122
+
123
+ # Usage
124
+ suite = EvaluationSuite([
125
+ Metric.accuracy(),
126
+ Metric.bleu(),
127
+ Metric.bertscore(),
128
+ Metric.custom("groundedness", check_groundedness)
129
+ ])
130
+
131
+ test_cases = [
132
+ {
133
+ "input": "What is the capital of France?",
134
+ "expected": "Paris",
135
+ "context": "France is a country in Europe. Paris is its capital."
136
+ },
137
+ ]
138
+
139
+ results = await suite.evaluate(model=your_model, test_cases=test_cases)
140
+ ```
141
+
142
+ ## Automated Metrics Implementation
143
+
144
+ ### BLEU Score
145
+
146
+ ```python
147
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
148
+
149
def calculate_bleu(reference: str, hypothesis: str = None, **kwargs) -> float:
    """Calculate the sentence-level BLEU score of a hypothesis against a reference.

    Args:
        reference: Gold text; split on whitespace into tokens.
        hypothesis: Candidate text; split on whitespace into tokens.
        **kwargs: ``prediction`` is accepted as an alias for ``hypothesis`` so
            the function can be called as ``fn(prediction=..., reference=...,
            context=...)``. Other keys are ignored.

    Returns:
        The value returned by ``sentence_bleu`` with ``method4`` smoothing.

    Raises:
        TypeError: If neither ``hypothesis`` nor ``prediction`` is supplied.
    """
    if hypothesis is None:
        hypothesis = kwargs.get("prediction")
    if hypothesis is None:
        raise TypeError(
            "calculate_bleu() requires 'hypothesis' (or 'prediction')"
        )

    smoothie = SmoothingFunction().method4

    return sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=smoothie
    )
158
+ ```
159
+
160
+ ### ROUGE Score
161
+
162
+ ```python
163
+ from rouge_score import rouge_scorer
164
+
165
def calculate_rouge(reference: str, hypothesis: str = None, **kwargs) -> dict:
    """Calculate ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Args:
        reference: Gold text.
        hypothesis: Candidate text.
        **kwargs: ``prediction`` is accepted as an alias for ``hypothesis`` so
            the function can be called as ``fn(prediction=..., reference=...,
            context=...)``. Other keys are ignored.

    Returns:
        Dict with keys ``rouge1``, ``rouge2`` and ``rougeL``, each holding the
        ``fmeasure`` of the corresponding score.

    Raises:
        TypeError: If neither ``hypothesis`` nor ``prediction`` is supplied.
    """
    if hypothesis is None:
        hypothesis = kwargs.get("prediction")
    if hypothesis is None:
        raise TypeError(
            "calculate_rouge() requires 'hypothesis' (or 'prediction')"
        )

    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'],
        use_stemmer=True
    )
    scores = scorer.score(reference, hypothesis)

    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure
    }
178
+ ```
179
+
180
+ ### BERTScore
181
+
182
+ ```python
183
+ from bert_score import score
184
+
185
def calculate_bertscore(
    references: list[str] = None,
    hypotheses: list[str] = None,
    **kwargs
) -> dict:
    """Calculate BERTScore precision, recall and F1, averaged over all pairs.

    Args:
        references: Gold texts (a single string is treated as a one-item list).
        hypotheses: Candidate texts (a single string is treated as a one-item
            list).
        **kwargs: ``reference`` and ``prediction`` are accepted as aliases for
            ``references`` and ``hypotheses`` so the function can be called as
            ``fn(prediction=..., reference=..., context=...)``. Other keys are
            ignored.

    Returns:
        Dict with keys ``precision``, ``recall`` and ``f1``.

    Raises:
        TypeError: If references or hypotheses are not supplied under either
            name.
    """
    if references is None:
        references = kwargs.get("reference")
    if hypotheses is None:
        hypotheses = kwargs.get("prediction")
    if references is None or hypotheses is None:
        raise TypeError(
            "calculate_bertscore() requires references and hypotheses "
            "(or 'reference' and 'prediction')"
        )

    # Accept one string per side as well as lists of strings.
    if isinstance(references, str):
        references = [references]
    if isinstance(hypotheses, str):
        hypotheses = [hypotheses]

    P, R, F1 = score(
        hypotheses,
        references,
        lang='en',
        model_type='microsoft/deberta-xlarge-mnli'
    )

    return {
        'precision': P.mean().item(),
        'recall': R.mean().item(),
        'f1': F1.mean().item()
    }
203
+ ```
204
+
205
+ ### Custom Metrics
206
+
207
+ ```python
208
def calculate_groundedness(response: str, context: str, **kwargs) -> float:
    """Score how strongly ``response`` is entailed by ``context``.

    Args:
        response: Generated text to check.
        context: Source text the response should be grounded in.
        **kwargs: Ignored.

    Returns:
        The classifier's score when its top label is entailment, otherwise 0.0.
    """
    # Build the classifier once and reuse it; constructing the pipeline on
    # every call reloads the model for each evaluated example.
    nli = getattr(calculate_groundedness, "_nli", None)
    if nli is None:
        from transformers import pipeline

        nli = pipeline(
            "text-classification",
            model="microsoft/deberta-large-mnli"
        )
        calculate_groundedness._nli = nli

    result = nli(f"{context} [SEP] {response}")[0]

    # Compare case-insensitively so a differently-cased label still matches.
    return result['score'] if result['label'].upper() == 'ENTAILMENT' else 0.0
221
+
222
def calculate_toxicity(text: str, **kwargs) -> float:
    """Return the largest score Detoxify's 'original' model assigns to ``text``.

    Args:
        text: Generated text to score.
        **kwargs: Ignored.
    """
    from detoxify import Detoxify

    detector = Detoxify('original')
    category_scores = detector.predict(text)
    # The worst category determines the overall toxicity score.
    return max(category_scores.values())
228
+
229
def calculate_factuality(claim: str, sources: list[str], **kwargs) -> float:
    """Score how well ``claim`` is supported by any of ``sources``.

    Args:
        claim: Statement to verify.
        sources: Texts to check the claim against.
        **kwargs: Ignored.

    Returns:
        The highest entailment score across sources, or 0.0 when no source
        yields an entailment label (including when ``sources`` is empty).
    """
    # Build the classifier once and reuse it; constructing the pipeline on
    # every call reloads the model for each claim.
    nli = getattr(calculate_factuality, "_nli", None)
    if nli is None:
        from transformers import pipeline

        nli = pipeline("text-classification", model="facebook/bart-large-mnli")
        calculate_factuality._nli = nli

    scores = []
    for source in sources:
        result = nli(f"{source}</s></s>{claim}")[0]
        # Compare case-insensitively so a differently-cased label still matches.
        if result['label'].lower() == 'entailment':
            scores.append(result['score'])

    return max(scores) if scores else 0.0
242
+ ```
243
+
244
+ ## LLM-as-Judge Patterns
245
+
246
+ ### Single Output Evaluation
247
+
248
+ ```python
249
+ from anthropic import Anthropic
250
+ from pydantic import BaseModel, Field
251
+ import json
252
+
253
class QualityRating(BaseModel):
    """Structured verdict for a single response: three 1-10 ratings plus reasoning."""

    # Each rating is constrained to the inclusive range 1..10.
    accuracy: int = Field(description="Factual correctness", ge=1, le=10)
    helpfulness: int = Field(description="Answers the question", ge=1, le=10)
    clarity: int = Field(description="Well-written and understandable", ge=1, le=10)
    reasoning: str = Field(description="Brief explanation")
258
+
259
async def llm_judge_quality(
    response: str,
    question: str,
    context: str = None
) -> QualityRating:
    """Use Claude to rate a response for accuracy, helpfulness and clarity.

    Args:
        response: The answer being judged.
        question: The question the answer responds to.
        context: Optional supporting context; included in the prompt when set.

    Returns:
        A ``QualityRating`` built from the JSON object in the model's reply.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON object.
    """
    import json

    from anthropic import AsyncAnthropic

    # Use the async client: the synchronous client would block the event loop
    # for the whole request inside this coroutine.
    client = AsyncAnthropic()

    system = """You are an expert evaluator of AI responses.
Rate responses on accuracy, helpfulness, and clarity (1-10 scale).
Provide brief reasoning for your ratings."""

    prompt = f"""Rate the following response:

Question: {question}
{f'Context: {context}' if context else ''}
Response: {response}

Provide ratings in JSON format:
{{
"accuracy": <1-10>,
"helpfulness": <1-10>,
"clarity": <1-10>,
"reasoning": "<brief explanation>"
}}"""

    message = await client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=500,
        system=system,
        messages=[{"role": "user", "content": prompt}]
    )

    # The reply may wrap the JSON in prose or a code fence; parse only the
    # outermost {...} span when one is present.
    raw = message.content[0].text
    start, end = raw.find("{"), raw.rfind("}")
    payload = raw[start:end + 1] if 0 <= start < end else raw

    return QualityRating(**json.loads(payload))
293
+ ```
294
+
295
+ ### Pairwise Comparison
296
+
297
+ ```python
298
+ from pydantic import BaseModel, Field
299
+ from typing import Literal
300
+
301
class ComparisonResult(BaseModel):
    """Outcome of a pairwise comparison between response A and response B."""

    # Which response was judged better, or "tie".
    winner: Literal["A", "B", "tie"]
    # Free-text justification for the verdict.
    reasoning: str
    # Confidence in the verdict, constrained to the inclusive range 1..10.
    confidence: int = Field(le=10, ge=1)
305
+
306
async def compare_responses(
    question: str,
    response_a: str,
    response_b: str
) -> ComparisonResult:
    """Ask an LLM judge which of two responses answers the question better.

    Args:
        question: The question both responses answer.
        response_a: Candidate shown to the judge as "Response A".
        response_b: Candidate shown to the judge as "Response B".

    Returns:
        A ``ComparisonResult`` built from the JSON object in the model's reply.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON object.
    """
    import json

    from anthropic import AsyncAnthropic

    # Use the async client: the synchronous client would block the event loop
    # for the whole request inside this coroutine.
    client = AsyncAnthropic()

    prompt = f"""Compare these two responses and determine which is better.

Question: {question}

Response A: {response_a}

Response B: {response_b}

Consider accuracy, helpfulness, and clarity.

Answer with JSON:
{{
"winner": "A" or "B" or "tie",
"reasoning": "<explanation>",
"confidence": <1-10>
}}"""

    message = await client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=500,
        messages=[{"role": "user", "content": prompt}]
    )

    # The reply may wrap the JSON in prose or a code fence; parse only the
    # outermost {...} span when one is present.
    raw = message.content[0].text
    start, end = raw.find("{"), raw.rfind("}")
    payload = raw[start:end + 1] if 0 <= start < end else raw

    return ComparisonResult(**json.loads(payload))
338
+ ```
339
+
340
+ ### Reference-Based Evaluation
341
+
342
+ ```python
343
class ReferenceEvaluation(BaseModel):
    """Scores for a response judged against a gold-standard reference answer."""

    # Each score is constrained to the inclusive range 0..1.
    semantic_similarity: float = Field(le=1, ge=0)
    factual_accuracy: float = Field(le=1, ge=0)
    completeness: float = Field(le=1, ge=0)
    # Specific problems the judge reported.
    issues: list[str]
348
+
349
async def evaluate_against_reference(
    response: str,
    reference: str,
    question: str
) -> ReferenceEvaluation:
    """Ask an LLM judge to score a response against a gold-standard reference.

    Args:
        response: The answer being evaluated.
        reference: The gold-standard answer.
        question: The question both answers respond to.

    Returns:
        A ``ReferenceEvaluation`` built from the JSON object in the model's
        reply.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON object.
    """
    import json

    from anthropic import AsyncAnthropic

    # Use the async client: the synchronous client would block the event loop
    # for the whole request inside this coroutine.
    client = AsyncAnthropic()

    prompt = f"""Compare the response to the reference answer.

Question: {question}
Reference Answer: {reference}
Response to Evaluate: {response}

Evaluate:
1. Semantic similarity (0-1): How similar is the meaning?
2. Factual accuracy (0-1): Are all facts correct?
3. Completeness (0-1): Does it cover all key points?
4. List any specific issues or errors.

Respond in JSON:
{{
"semantic_similarity": <0-1>,
"factual_accuracy": <0-1>,
"completeness": <0-1>,
"issues": ["issue1", "issue2"]
}}"""

    message = await client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=500,
        messages=[{"role": "user", "content": prompt}]
    )

    # The reply may wrap the JSON in prose or a code fence; parse only the
    # outermost {...} span when one is present.
    raw = message.content[0].text
    start, end = raw.find("{"), raw.rfind("}")
    payload = raw[start:end + 1] if 0 <= start < end else raw

    return ReferenceEvaluation(**json.loads(payload))
384
+ ```
385
+
386
+ ## Human Evaluation Frameworks
387
+
388
+ ### Annotation Guidelines
389
+
390
+ ```python
391
+ from dataclasses import dataclass, field
392
+ from typing import Optional
393
+
394
@dataclass
class AnnotationTask:
    """One item handed to a human annotator: a response, its question, optional context."""

    response: str
    question: str
    context: Optional[str] = None

    def get_annotation_form(self) -> dict:
        """Build the blank annotation form for this task.

        Returns:
            Dict holding the question, context and response, three "1-5"
            rating slots, four issue flags initialised to False, and an
            empty feedback string.
        """
        rating_prompts = (
            ("accuracy", "Is the response factually correct?"),
            ("relevance", "Does it answer the question?"),
            ("coherence", "Is it logically consistent?"),
        )
        issue_flags = (
            "factual_error",
            "hallucination",
            "off_topic",
            "unsafe_content",
        )

        form = {
            "question": self.question,
            "context": self.context,
            "response": self.response,
        }
        form["ratings"] = {
            key: {"scale": "1-5", "description": text}
            for key, text in rating_prompts
        }
        # Every issue flag starts unchecked.
        form["issues"] = dict.fromkeys(issue_flags, False)
        form["feedback"] = ""
        return form
428
+ ```
429
+
430
+ ### Inter-Rater Agreement
431
+
432
+ ```python
433
+ from sklearn.metrics import cohen_kappa_score
434
+
435
def calculate_agreement(
    rater1_scores: list[int],
    rater2_scores: list[int]
) -> dict:
    """Calculate Cohen's kappa between two raters and label its strength.

    Args:
        rater1_scores: Labels assigned by the first rater.
        rater2_scores: Labels assigned by the second rater, in the same order.

    Returns:
        Dict with ``kappa`` (the value from ``cohen_kappa_score``) and
        ``interpretation``: "Poor", "Slight", "Fair", "Moderate",
        "Substantial", "Almost Perfect", or "Undefined" when kappa is NaN.
    """
    import math

    kappa = cohen_kappa_score(rater1_scores, rater2_scores)

    if math.isnan(kappa):
        # NaN fails every `<` comparison below and would otherwise fall
        # through to "Almost Perfect".
        interpretation = "Undefined"
    elif kappa < 0:
        interpretation = "Poor"
    elif kappa < 0.2:
        interpretation = "Slight"
    elif kappa < 0.4:
        interpretation = "Fair"
    elif kappa < 0.6:
        interpretation = "Moderate"
    elif kappa < 0.8:
        interpretation = "Substantial"
    else:
        interpretation = "Almost Perfect"

    return {
        "kappa": kappa,
        "interpretation": interpretation
    }
459
+ ```
460
+
461
+ ## A/B Testing
462
+
463
+ ### Statistical Testing Framework
464
+
465
+ ```python
466
+ from scipy import stats
467
+ import numpy as np
468
+ from dataclasses import dataclass, field
469
+
470
@dataclass
class ABTest:
    """Collect scores for two variants and compare them statistically.

    Attributes:
        variant_a_name: Display name of the first (baseline) variant.
        variant_b_name: Display name of the second variant.
        variant_a_scores: Scores recorded for variant A.
        variant_b_scores: Scores recorded for variant B.
    """

    variant_a_name: str = "A"
    variant_b_name: str = "B"
    variant_a_scores: list[float] = field(default_factory=list)
    variant_b_scores: list[float] = field(default_factory=list)

    def add_result(self, variant: str, score: float):
        """Record ``score`` for ``variant``.

        Args:
            variant: Either a configured variant name or the literal "A"/"B".
            score: Evaluation score to record.

        Raises:
            ValueError: If ``variant`` matches neither variant, so that a
                mistyped label is not silently counted for variant B.
        """
        if variant == self.variant_a_name:
            self.variant_a_scores.append(score)
        elif variant == self.variant_b_name:
            self.variant_b_scores.append(score)
        elif variant == "A":
            self.variant_a_scores.append(score)
        elif variant == "B":
            self.variant_b_scores.append(score)
        else:
            raise ValueError(
                f"Unknown variant {variant!r}; expected "
                f"{self.variant_a_name!r} or {self.variant_b_name!r}"
            )

    def analyze(self, alpha: float = 0.05) -> dict:
        """Compare the two variants with a t-test and Cohen's d.

        Args:
            alpha: Significance level applied to the p-value.

        Returns:
            Dict with the two means, their difference (B minus A), the
            relative improvement over A (NaN when A's mean is 0), the p-value,
            a significance flag, Cohen's d, its verbal label, and the name of
            the variant with the higher mean (A on ties). ``winner`` is
            reported even when the difference is not significant; check
            ``statistically_significant`` before acting on it.

        Raises:
            ValueError: If either variant has fewer than two scores.
        """
        a_scores = np.asarray(self.variant_a_scores, dtype=float)
        b_scores = np.asarray(self.variant_b_scores, dtype=float)

        n_a, n_b = a_scores.size, b_scores.size
        if n_a < 2 or n_b < 2:
            raise ValueError(
                "analyze() needs at least two scores per variant "
                f"(got {n_a} and {n_b})"
            )

        mean_a = float(a_scores.mean())
        mean_b = float(b_scores.mean())
        difference = mean_b - mean_a

        # T-test
        t_stat, p_value = stats.ttest_ind(a_scores, b_scores)

        # Effect size (Cohen's d) using sample variances (ddof=1) pooled by
        # degrees of freedom, so unequal group sizes are weighted correctly.
        pooled_var = (
            (n_a - 1) * a_scores.var(ddof=1) + (n_b - 1) * b_scores.var(ddof=1)
        ) / (n_a + n_b - 2)
        pooled_std = float(np.sqrt(pooled_var))
        if pooled_std > 0:
            cohens_d = difference / pooled_std
        elif difference == 0:
            cohens_d = 0.0
        else:
            # No variance but different means: the effect is unbounded.
            cohens_d = float("inf") if difference > 0 else float("-inf")

        # A relative change is undefined against a zero baseline.
        relative_improvement = difference / mean_a if mean_a != 0 else float("nan")

        return {
            "variant_a_mean": mean_a,
            "variant_b_mean": mean_b,
            "difference": difference,
            "relative_improvement": relative_improvement,
            "p_value": float(p_value),
            "statistically_significant": bool(p_value < alpha),
            "cohens_d": cohens_d,
            "effect_size": self._interpret_cohens_d(cohens_d),
            "winner": self.variant_b_name if mean_b > mean_a else self.variant_a_name
        }

    @staticmethod
    def _interpret_cohens_d(d: float) -> str:
        """Map the magnitude of Cohen's d to negligible/small/medium/large."""
        abs_d = abs(d)
        if abs_d < 0.2:
            return "negligible"
        elif abs_d < 0.5:
            return "small"
        elif abs_d < 0.8:
            return "medium"
        else:
            return "large"
520
+ ```
521
+
522
+ ## Regression Testing
523
+
524
+ ### Regression Detection
525
+
526
+ ```python
527
+ from dataclasses import dataclass
528
+
529
@dataclass
class RegressionResult:
    """One metric that got worse relative to its baseline."""

    metric: str
    baseline: float
    current: float
    # Relative change, or the absolute change when the baseline is 0.
    change: float
    is_regression: bool


class RegressionDetector:
    """Flag metrics whose new value dropped below the baseline by more than a threshold."""

    def __init__(self, baseline_results: dict, threshold: float = 0.05):
        """
        Args:
            baseline_results: Mapping of metric name to baseline score.
            threshold: Largest tolerated decrease, as a fraction of the
                baseline's magnitude (0.05 means 5%).
        """
        self.baseline = baseline_results
        self.threshold = threshold

    def check_for_regression(self, new_results: dict) -> dict:
        """Compare ``new_results`` with the baseline, treating higher scores as better.

        Metrics absent from ``new_results`` (or mapped to None) are skipped.
        The change is measured relative to the magnitude of the baseline;
        when the baseline is exactly 0 the absolute change is used instead,
        because a relative change is undefined there.

        Returns:
            Dict with ``has_regression`` (bool), ``regressions`` (list of
            ``RegressionResult``) and a one-line ``summary``.
        """
        regressions = []

        for metric, baseline_score in self.baseline.items():
            new_score = new_results.get(metric)

            if new_score is None:
                continue

            delta = new_score - baseline_score
            if baseline_score == 0:
                # Avoid ZeroDivisionError: fall back to the absolute change.
                change = delta
            else:
                # Divide by the magnitude so a negative baseline does not
                # flip the sign of the change.
                change = delta / abs(baseline_score)

            # Flag if significant decrease
            if change < -self.threshold:
                regressions.append(RegressionResult(
                    metric=metric,
                    baseline=baseline_score,
                    current=new_score,
                    change=change,
                    is_regression=True
                ))

        return {
            "has_regression": len(regressions) > 0,
            "regressions": regressions,
            "summary": f"{len(regressions)} metric(s) regressed"
        }
572
+ ```
573
+
574
+ ## LangSmith Evaluation Integration
575
+
576
+ ```python
577
+ from langsmith import Client
578
+ from langsmith.evaluation import evaluate, LangChainStringEvaluator
579
+
580
+ # Initialize LangSmith client
581
+ client = Client()
582
+
583
+ # Create dataset
584
+ dataset = client.create_dataset("qa_test_cases")
585
+ client.create_examples(
586
+ inputs=[{"question": q} for q in questions],
587
+ outputs=[{"answer": a} for a in expected_answers],
588
+ dataset_id=dataset.id
589
+ )
590
+
591
+ # Define evaluators
592
+ evaluators = [
593
+ LangChainStringEvaluator("qa"), # QA correctness
594
+ LangChainStringEvaluator("context_qa"), # Context-grounded QA
595
+ LangChainStringEvaluator("cot_qa"), # Chain-of-thought QA
596
+ ]
597
+
598
+ # Run evaluation
599
+ async def target_function(inputs: dict) -> dict:
600
+ result = await your_chain.ainvoke(inputs)
601
+ return {"answer": result}
602
+
603
+ experiment_results = await evaluate(
604
+ target_function,
605
+ data=dataset.name,
606
+ evaluators=evaluators,
607
+ experiment_prefix="v1.0.0",
608
+ metadata={"model": "claude-sonnet-4-6", "version": "1.0.0"}
609
+ )
610
+
611
+ print(f"Mean score: {experiment_results.aggregate_metrics['qa']['mean']}")
612
+ ```
613
+
614
+ ## Benchmarking
615
+
616
+ ### Running Benchmarks
617
+
618
+ ```python
619
+ from dataclasses import dataclass
620
+ import numpy as np
621
+
622
@dataclass
class BenchmarkResult:
    """Summary statistics of one metric over a benchmark dataset."""

    metric: str
    mean: float
    std: float
    min: float
    max: float


class BenchmarkRunner:
    """Run a model over a benchmark dataset and summarise each metric."""

    def __init__(self, benchmark_dataset: list[dict]):
        """Store the dataset; each example needs "input" and "reference" keys
        and may carry an optional "context"."""
        self.dataset = benchmark_dataset

    async def run_benchmark(
        self,
        model,
        metrics: list["Metric"]
    ) -> dict[str, BenchmarkResult]:
        """Run model on benchmark and calculate metrics.

        Args:
            model: Object with an awaitable ``predict(input)`` method.
            metrics: Objects exposing ``name`` and ``fn``; ``fn`` is called
                with ``prediction``, ``reference`` and ``context`` keywords.

        Returns:
            Mapping of metric name to its ``BenchmarkResult``.

        Raises:
            ValueError: If the benchmark dataset is empty.
        """
        if not self.dataset:
            # Fail early with a clear message instead of an opaque
            # "min() arg is an empty sequence" during aggregation.
            raise ValueError("benchmark dataset is empty; nothing to evaluate")

        results = {metric.name: [] for metric in metrics}

        for example in self.dataset:
            # Generate prediction
            prediction = await model.predict(example["input"])

            # Calculate each metric
            for metric in metrics:
                score = metric.fn(
                    prediction=prediction,
                    reference=example["reference"],
                    context=example.get("context")
                )
                results[metric.name].append(score)

        # Aggregate results
        return {
            name: BenchmarkResult(
                metric=name,
                mean=float(np.mean(scores)),
                std=float(np.std(scores)),
                min=min(scores),
                max=max(scores)
            )
            for name, scores in results.items()
        }
666
+ ```
667
+
668
+ ## Resources
669
+
670
+ - [LangSmith Evaluation Guide](https://docs.smith.langchain.com/evaluation)
671
+ - [RAGAS Framework](https://docs.ragas.io/)
672
+ - [DeepEval Library](https://docs.deepeval.com/)
673
+ - [Arize Phoenix](https://docs.arize.com/phoenix/)
674
+ - [HELM Benchmark](https://crfm.stanford.edu/helm/)
675
+
676
+ ## Best Practices
677
+
678
+ 1. **Multiple Metrics**: Use diverse metrics for comprehensive view
679
+ 2. **Representative Data**: Test on real-world, diverse examples
680
+ 3. **Baselines**: Always compare against baseline performance
681
+ 4. **Statistical Rigor**: Use proper statistical tests for comparisons
682
+ 5. **Continuous Evaluation**: Integrate into CI/CD pipeline
683
+ 6. **Human Validation**: Combine automated metrics with human judgment
684
+ 7. **Error Analysis**: Investigate failures to understand weaknesses
685
+ 8. **Version Control**: Track evaluation results over time
686
+
687
+ ## Common Pitfalls
688
+
689
+ - **Single Metric Obsession**: Optimizing for one metric at the expense of others
690
+ - **Small Sample Size**: Drawing conclusions from too few examples
691
+ - **Data Contamination**: Testing on training data
692
+ - **Ignoring Variance**: Not accounting for statistical uncertainty
693
+ - **Metric Mismatch**: Using metrics not aligned with business goals
694
+ - **Position Bias**: Not randomizing response order in pairwise evals, so the judge's preference for a position skews results
695
+ - **Overfitting Prompts**: Optimizing for test set instead of real use