claude-self-reflect 7.1.10 → 7.1.11

@@ -0,0 +1,496 @@
+ #!/usr/bin/env python3
+ """
+ Batch Ground Truth Generator for Code Session Evaluations
+ 
+ Uses Anthropic's Batch API to create high-quality ground truth evaluations
+ at scale (50% cost savings vs streaming).
+ 
+ Process:
+ 1. Fetch narratives from the Qdrant v3_all_projects collection
+ 2. Create batch evaluation requests using GRADER_PROMPT.md
+ 3. Submit to the Batch API (~$0.015 per Opus eval instead of ~$0.30 streaming)
+ 4. Retrieve results once the batch ends (typically minutes; 24 hours at most)
+ 5. Push ground truth to a new Qdrant collection: ground_truth_evals
+ 
+ Why Batch API?
+ - 50% cost: batch tokens are billed at half the streaming price
+   (~$0.75 for 50 Opus evals vs ~$15 streaming; ~$0.05 with Haiku)
+ - Parallel processing: all 50 evals complete within the 24-hour window
+ - Configurable quality: defaults to Haiku 4.5; pass an Opus model for stricter grading
+ - Reproducible: same prompts, consistent results
+ """
+ 
+ import json
+ import os
+ import re
+ import time
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Dict, List, Optional
+ 
+ import anthropic
+ import requests
+ from dotenv import load_dotenv
+ 
+ # Load environment variables from .env
+ load_dotenv()
+ 
+ 
+ class BatchGroundTruthGenerator:
+     """
+     Generate ground truth evaluations using the Batch API.
+     Stores results in Qdrant for calibration and testing.
+     """
+ 
+     def __init__(self):
+         """Initialize with Anthropic client and Qdrant connection."""
+         # Validate API key
+         api_key = os.getenv("ANTHROPIC_API_KEY")
+         if not api_key:
+             raise ValueError(
+                 "ANTHROPIC_API_KEY environment variable required. "
+                 "Set it in your .env file or export it in your shell."
+             )
+ 
+         # Initialize Anthropic client
+         self.client = anthropic.Anthropic(api_key=api_key)
+         self.qdrant_url = "http://localhost:6333"
+         self.collection_name = "v3_all_projects"
+         self.ground_truth_collection = "ground_truth_evals"
+ 
+         # Load grader prompt template (kept for reference;
+         # _build_grader_input currently composes its own prompt inline)
+         prompt_path = Path(__file__).parent / "GRADER_PROMPT.md"
+         with open(prompt_path, 'r') as f:
+             self.grader_prompt_template = f.read()
+ 
+     def fetch_narratives_from_qdrant(
+         self,
+         limit: int = 100,
+         filters: Optional[Dict] = None
+     ) -> List[Dict]:
+         """
+         Fetch narratives from Qdrant for evaluation.
+ 
+         Args:
+             limit: Number of narratives to fetch
+             filters: Optional Qdrant filter (e.g., only successful builds;
+                 see the EXAMPLE_SUCCESS_FILTER sketch below)
+ 
+         Returns:
+             List of narrative payloads with metadata
+         """
+         url = f"{self.qdrant_url}/collections/{self.collection_name}/points/scroll"
+ 
+         payload = {
+             "limit": limit,
+             "with_payload": True,
+             "with_vector": False
+         }
+ 
+         if filters:
+             payload["filter"] = filters
+ 
+         response = requests.post(url, json=payload)
+         response.raise_for_status()
+ 
+         points = response.json()["result"]["points"]
+         print(f"✅ Fetched {len(points)} narratives from Qdrant")
+ 
+         return points
+ 
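+     # A sketch of a scroll filter for "only successful builds". The nested key
+     # path is an assumption about the v3_all_projects payload schema; adjust it
+     # to whatever field actually records build/completion status.
+     EXAMPLE_SUCCESS_FILTER: Dict = {
+         "must": [
+             {"key": "signature.completion_status", "match": {"value": "complete"}}
+         ]
+     }
+ 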
+     def create_batch_requests(
+         self,
+         narratives: List[Dict],
+         output_file: str = "batch_ground_truth_requests.jsonl",
+         model: str = "claude-haiku-4-5"
+     ) -> str:
+         """
+         Create a batch API request file from narratives.
+ 
+         Each request evaluates one conversation using GRADER_PROMPT.md.
+ 
+         Uses Haiku 4.5 by default for:
+         - Fast processing (typically minutes rather than the full 24-hour window)
+         - Lower cost (~$0.001 per eval vs ~$0.015 for Opus)
+         - Still high quality for grading tasks
+ 
+         Format per line:
+         {
+             "custom_id": "conversation_id",
+             "params": {
+                 "model": "claude-haiku-4-5",
+                 "max_tokens": 4096,
+                 "messages": [
+                     {"role": "user", "content": "<grader_prompt>"}
+                 ]
+             }
+         }
+         """
+         requests_data = []
+ 
+         for point in narratives:
+             payload = point["payload"]
+             conversation_id = payload.get("conversation_id")
+             if not conversation_id:
+                 continue  # custom_id is required, so skip points without one
+             narrative = payload.get("narrative", "")
+             search_index = payload.get("search_index", "")
+             context_cache = payload.get("context_cache", "")
+             signature = payload.get("signature", {})
+ 
+             # Build grader prompt
+             grader_input = self._build_grader_input(
+                 conversation_id,
+                 narrative,
+                 search_index,
+                 context_cache,
+                 signature
+             )
+ 
+             # Create batch request
+             batch_request = {
+                 "custom_id": conversation_id,
+                 "params": {
+                     "model": model,
+                     "max_tokens": 4096,
+                     "messages": [
+                         {
+                             "role": "user",
+                             "content": grader_input
+                         }
+                     ]
+                 }
+             }
+ 
+             requests_data.append(batch_request)
+ 
+         # Write to JSONL file
+         output_path = Path(__file__).parent / output_file
+         with open(output_path, 'w') as f:
+             for req in requests_data:
+                 f.write(json.dumps(req) + '\n')
+ 
+         print(f"✅ Created {len(requests_data)} batch requests in {output_file}")
+         return str(output_path)
+ 
+     def _build_grader_input(
+         self,
+         conversation_id: str,
+         narrative: str,
+         search_index: str,
+         context_cache: str,
+         signature: Dict
+     ) -> str:
+         """
+         Build the grader prompt for one conversation.
+ 
+         Extracts:
+         - User request (from search_index)
+         - Solution (from narrative)
+         - Build/test results (from the context_cache validation section)
+         - Completion status (from signature)
+         """
+         # Extract user request
+         request_lines = search_index.split("## User Request")
+         user_request = request_lines[1].split("##")[0].strip() if len(request_lines) > 1 else "Unknown"
+ 
+         # Extract validation signals from context_cache
+         validation_section = context_cache.split("## Validation")[1] if "## Validation" in context_cache else ""
+ 
+         # Count builds and tests
+         build_success_count = validation_section.count("Build: Success")
+         test_passed_count = validation_section.count("Tests: Passed")
+ 
+         # Build tier1_results XML. Failure counts, code quality, and security
+         # issues are not tracked in context_cache, so they are filled with
+         # neutral placeholders and a moderate confidence value.
+         tier1_xml = f"""<tier1_results>
+ <build_success>{str(build_success_count > 0).lower()}</build_success>
+ <test_results>
+     <passed>{test_passed_count}</passed>
+     <failed>0</failed>
+     <framework>unknown</framework>
+ </test_results>
+ <code_quality>0.0</code_quality>
+ <security_issues>0</security_issues>
+ <confidence>0.6</confidence>
+ </tier1_results>"""
+ 
+         # Extract solution from narrative
+         solution_section = narrative.split("## Technical Pattern")[1].split("##")[0] if "## Technical Pattern" in narrative else narrative[:500]
+ 
+         # Build rubric from completion status
+         completion = signature.get("completion_status", "unknown")
+         rubric = f"""
+ - Solution must address the user's request
+ - Code must be functional (builds and tests pass)
+ - Implementation should follow best practices
+ - Expected completion status: {completion}
+ """
+ 
+         # Fill in the grader prompt
+         grader_prompt = f"""You are evaluating a code generation session for ground truth labeling.
+ 
+ <request>{user_request}</request>
+ 
+ <solution>
+ {solution_section}
+ </solution>
+ 
+ {tier1_xml}
+ 
+ <rubric>
+ {rubric}
+ </rubric>
+ 
+ <narrative>
+ Full narrative for context:
+ {narrative[:1000]}...
+ </narrative>
+ 
+ Please evaluate this session and provide:
+ 1. Functional correctness score (0.0-1.0)
+ 2. Design quality score (0.0-1.0)
+ 3. Overall grade (0.0-1.0)
+ 4. Reasoning for your scores
+ 
+ Output in XML format as specified in the grader prompt.
+ """
+ 
+         return grader_prompt
+ 
+     def submit_batch(self, requests_file: str) -> str:
+         """
+         Submit a batch to the Anthropic API.
+ 
+         Returns:
+             batch_id for tracking
+         """
+         # Read the JSONL file and parse requests (named batch_requests to
+         # avoid shadowing the `requests` HTTP module)
+         batch_requests = []
+         with open(requests_file, 'r') as f:
+             for line in f:
+                 if line.strip():
+                     batch_requests.append(json.loads(line))
+ 
+         # Submit batch with parsed requests
+         batch = self.client.messages.batches.create(
+             requests=batch_requests
+         )
+ 
+         print(f"✅ Batch submitted: {batch.id}")
+         print(f"   Status: {batch.processing_status}")
+         print(f"   Request count: {batch.request_counts}")
+ 
+         return batch.id
+ 
+     def check_batch_status(self, batch_id: str) -> Dict:
+         """
+         Check batch processing status.
+ 
+         Returns:
+             Dict with the batch status and per-state request counts
+         """
+         batch = self.client.messages.batches.retrieve(batch_id)
+ 
+         print(f"📊 Batch {batch_id}:")
+         print(f"   Status: {batch.processing_status}")
+         print(f"   Succeeded: {batch.request_counts.succeeded}")
+         print(f"   Failed: {batch.request_counts.errored}")
+         print(f"   Still processing: {batch.request_counts.processing}")
+ 
+         return {
+             "status": batch.processing_status,
+             "succeeded": batch.request_counts.succeeded,
+             "failed": batch.request_counts.errored,
+             "processing": batch.request_counts.processing
+         }
+ 
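+     # Hedged addition, not in the original workflow: a small polling helper
+     # (this is what the `time` import above is used for).
+     def wait_until_ended(self, batch_id: str, poll_seconds: int = 60) -> Dict:
+         """
+         Block until the batch reports processing_status == "ended",
+         re-checking at a fixed interval.
+         """
+         while True:
+             status = self.check_batch_status(batch_id)
+             if status["status"] == "ended":
+                 return status
+             time.sleep(poll_seconds)
+ 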
+     def retrieve_batch_results(
+         self,
+         batch_id: str,
+         output_file: str = "batch_ground_truth_results.jsonl"
+     ) -> str:
+         """
+         Retrieve completed batch results.
+ 
+         Returns:
+             Path to the results file
+         """
+         # Stream results to file
+         output_path = Path(__file__).parent / output_file
+ 
+         with open(output_path, 'w') as f:
+             for result in self.client.messages.batches.results(batch_id):
+                 f.write(json.dumps(result.model_dump()) + '\n')
+ 
+         print(f"✅ Retrieved batch results to {output_file}")
+         return str(output_path)
+ 
+     def parse_batch_results(self, results_file: str) -> List[Dict]:
+         """
+         Parse batch results into ground truth evaluations.
+ 
+         Each result contains:
+         - custom_id (conversation_id)
+         - result (API response with the evaluation)
+         """
+         ground_truths = []
+ 
+         with open(results_file, 'r') as f:
+             for line in f:
+                 result = json.loads(line)
+ 
+                 # Extract evaluation from response
+                 custom_id = result.get("custom_id")
+                 response = result.get("result", {})
+ 
+                 if response.get("type") == "succeeded":
+                     message = response.get("message", {})
+                     content = message.get("content", [])
+ 
+                     # Extract evaluation XML
+                     eval_text = content[0].get("text", "") if content else ""
+ 
+                     # Store the raw evaluation text; see the _extract_score
+                     # sketch below for pulling numeric scores out of the XML
+                     ground_truth = {
+                         "conversation_id": custom_id,
+                         "evaluation": eval_text,
+                         "timestamp": datetime.now(timezone.utc).isoformat(),
+                         "model": "claude-haiku-4-5",
+                         "method": "batch_api"
+                     }
+ 
+                     ground_truths.append(ground_truth)
+ 
+         print(f"✅ Parsed {len(ground_truths)} ground truth evaluations")
+         return ground_truths
+ 
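+     # Hedged sketch: the original leaves XML parsing as a TODO. This shows one
+     # way to do it with a regex; the tag names (e.g. "overall_grade") are
+     # assumptions and depend on the output format GRADER_PROMPT.md asks for.
+     @staticmethod
+     def _extract_score(eval_text: str, tag: str) -> Optional[float]:
+         """Return the numeric content of <tag>...</tag>, or None if absent."""
+         match = re.search(rf"<{tag}>\s*([0-9.]+)\s*</{tag}>", eval_text)
+         return float(match.group(1)) if match else None
+ 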
+     def create_ground_truth_collection(self):
+         """
+         Create the Qdrant collection for ground truth evaluations.
+ 
+         Schema:
+         - conversation_id: str
+         - evaluation: str (XML from grader)
+         - scores: {functional, design, overall}
+         - timestamp: str
+         - model: str
+         """
+         url = f"{self.qdrant_url}/collections/{self.ground_truth_collection}"
+ 
+         # Create collection with the same vector size as narratives
+         payload = {
+             "vectors": {
+                 "size": 384,  # FastEmbed dimensions
+                 "distance": "Cosine"
+             }
+         }
+ 
+         response = requests.put(url, json=payload)
+         if response.status_code == 200:
+             print(f"✅ Created collection: {self.ground_truth_collection}")
+         else:
+             print(f"ℹ️ Collection already exists or error: {response.status_code}")
+ 
+     def push_to_qdrant(self, ground_truths: List[Dict]):
+         """
+         Push ground truth evaluations to Qdrant.
+ 
+         Note: we don't need real embeddings for ground truth; the records
+         are stored for reference and calibration only.
+         """
+         url = f"{self.qdrant_url}/collections/{self.ground_truth_collection}/points"
+ 
+         points = []
+         for gt in ground_truths:
+             point = {
+                 # Qdrant point IDs must be unsigned ints or UUIDs; this
+                 # assumes conversation_id is a UUID string
+                 "id": gt["conversation_id"],
+                 "vector": [0.0] * 384,  # Dummy vector, not used for search
+                 "payload": gt
+             }
+             points.append(point)
+ 
+         # Batch upsert
+         payload = {"points": points}
+         response = requests.put(url, json=payload)
+         response.raise_for_status()
+ 
+         print(f"✅ Pushed {len(points)} ground truths to Qdrant")
+ 
+ 
+ def main():
+     """
+     Example workflow:
+ 
+     Step 1: Fetch narratives and generate batch requests
+     Step 2: Submit to the Batch API
+     Step 3: Wait for processing (typically minutes with Haiku; 24 hours at most)
+     Step 4: Retrieve results and push to Qdrant (run with "retrieve")
+     """
+     generator = BatchGroundTruthGenerator()
+ 
+     # Step 1: Fetch narratives
+     print("Step 1: Fetching narratives from Qdrant...")
+     narratives = generator.fetch_narratives_from_qdrant(limit=50)
+ 
+     # Step 2: Create batch requests
+     print("\nStep 2: Creating batch requests...")
+     requests_file = generator.create_batch_requests(narratives)
+ 
+     # Step 3: Submit batch
+     print("\nStep 3: Submitting batch to Anthropic API...")
+     batch_id = generator.submit_batch(requests_file)
+ 
+     print("\n✅ Batch submitted successfully!")
+     print(f"   Batch ID: {batch_id}")
+     print("   Processing time: ~5-10 minutes (Haiku 4.5)")
+     print(f"   Cost: ~${50 * 0.001:.2f} for 50 evaluations")
+ 
+     # Save batch ID for retrieval
+     with open("batch_ground_truth_id.txt", "w") as f:
+         f.write(batch_id)
+ 
+     print("\n💾 Batch ID saved to batch_ground_truth_id.txt")
+     print("\n⏰ Run retrieve_results() once the batch has ended (usually ~10 minutes)")
+ 
+ 
+ def retrieve_results():
+     """
+     Run this once the batch has ended (typically minutes with Haiku;
+     24 hours is the upper bound).
+     """
+     # Load batch ID
+     with open("batch_ground_truth_id.txt", "r") as f:
+         batch_id = f.read().strip()
+ 
+     generator = BatchGroundTruthGenerator()
+ 
+     # Check status
+     print("Checking batch status...")
+     status = generator.check_batch_status(batch_id)
+ 
+     if status["status"] != "ended":
+         print(f"⏳ Batch still processing. Status: {status['status']}")
+         return
+ 
+     # Retrieve results
+     print("\nRetrieving results...")
+     results_file = generator.retrieve_batch_results(batch_id)
+ 
+     # Parse results
+     print("\nParsing evaluations...")
+     ground_truths = generator.parse_batch_results(results_file)
+ 
+     # Create collection
+     print("\nCreating Qdrant collection...")
+     generator.create_ground_truth_collection()
+ 
+     # Push to Qdrant
+     print("\nPushing to Qdrant...")
+     generator.push_to_qdrant(ground_truths)
+ 
+     print("\n✅ Ground truth generation complete!")
+     print(f"   {len(ground_truths)} evaluations stored in Qdrant")
+     print(f"   Collection: {generator.ground_truth_collection}")
+ 
+ 
+ if __name__ == "__main__":
+     import sys
+ 
+     if len(sys.argv) > 1 and sys.argv[1] == "retrieve":
+         retrieve_results()
+     else:
+         main()
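+ 
+ # Usage sketch (the script's filename is assumed here):
+ #   python batch_ground_truth_generator.py            # submit a new batch
+ #   python batch_ground_truth_generator.py retrieve   # fetch, parse, and store results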