claude-self-reflect 7.1.9 → 7.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,477 @@
+ #!/usr/bin/env python3
+ """
+ Batch import ALL Claude Code projects with V3+SKILL_V2 to Qdrant.
+
+ Uses batching for efficient API calls and tracks costs across all projects.
+ """
+
+ import os
+ import sys
+ import json
+ from pathlib import Path
+ from dotenv import load_dotenv
+ import time
+
+ load_dotenv()
+
+ # Add parent dirs to path so sibling packages (docs/, src/) resolve
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+ try:
+     import anthropic
+ except ImportError:
+     print("Error: anthropic SDK not found (pip install anthropic)")
+     sys.exit(1)
+
+ from docs.design.extract_events_v3 import extract_events_v3
+ from qdrant_client import QdrantClient
+ from qdrant_client.models import Distance, VectorParams, PointStruct
+
+ # Import metadata extraction functions via importlib (the hyphenated filename
+ # cannot be imported with a normal import statement)
+ # Path: docs/design/batch_import_all_projects.py -> ../../src/runtime/delta-metadata-update.py
+ import importlib.util
+ delta_metadata_path = Path(__file__).parent.parent.parent / "src" / "runtime" / "delta-metadata-update.py"
+ spec = importlib.util.spec_from_file_location("delta_metadata_update", delta_metadata_path)
+ if spec is None or spec.loader is None:
+     print(f"Error: cannot load {delta_metadata_path}")
+     sys.exit(1)
+ delta_metadata_update = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(delta_metadata_update)
+ extract_tool_usage_from_jsonl = delta_metadata_update.extract_tool_usage_from_jsonl
+ extract_concepts = delta_metadata_update.extract_concepts
+
+ # Try importing FastEmbed
+ try:
+     from fastembed import TextEmbedding
+     FASTEMBED_AVAILABLE = True
+ except ImportError:
+     FASTEMBED_AVAILABLE = False
+     print("⚠️ FastEmbed not available, will use Voyage AI")
+
+
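+ # The two embedding paths below differ in dimensionality: FastEmbed's
+ # all-MiniLM-L6-v2 produces 384-dim vectors, Voyage's voyage-3 produces
+ # 1024-dim. The Qdrant collection created in main() must match the active path.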
+ def get_embedding(text: str, embedding_model) -> list:
+     """Generate embedding for text."""
+     if FASTEMBED_AVAILABLE and embedding_model:
+         embeddings = list(embedding_model.embed([text]))
+         return embeddings[0].tolist()
+     else:
+         # Fallback to Voyage
+         import voyageai
+         vo = voyageai.Client(api_key=os.getenv('VOYAGE_KEY'))
+         result = vo.embed([text], model="voyage-3", input_type="document")
+         return result.embeddings[0]
+
+
+ def batch_generate_narratives(conversations_data: list, client: anthropic.Anthropic, skill_instructions: str):
+     """
+     Generate narratives for multiple conversations using batched requests.
+
+     Args:
+         conversations_data: List of dicts with 'conv_id', 'result' (V3 extraction)
+         client: Anthropic client
+         skill_instructions: SKILL_V2 instructions
+
+     Returns:
+         (narratives, total_cost): dict mapping conv_id to narrative, plus
+         total batch cost in USD
+     """
+
+     print(f"\n🔄 Batching {len(conversations_data)} narrative generation requests...")
+
+     # Create batch requests
+     batch_requests = []
+     for data in conversations_data:
+         result = data['result']
+
+         # Build metadata context section if available
+         metadata_context = ""
+         if 'metadata' in result:
+             metadata = result['metadata']
+             tool_usage = metadata.get('tool_usage', {})
+             concepts = metadata.get('concepts', [])
+
+             metadata_context = f"""
+ ### Metadata Context (USE THIS to enhance your narrative)
+
+ **Tools Used**: {json.dumps(tool_usage.get('tools_summary', {}))}
+ **Files Analyzed**: {tool_usage.get('files_read', [])[:10]}
+ **Files Modified**: {tool_usage.get('files_edited', [])[:10]}
+ **Concepts Detected**: {list(concepts)[:10]}
+ **Grep Searches**: {[s.get('pattern', '') for s in tool_usage.get('grep_searches', [])][:5]}
+ **Bash Commands**: {[cmd.get('command', '')[:100] for cmd in tool_usage.get('bash_commands', [])][:5]}
+
+ Use this metadata to understand:
+ - What tools were actually used (Read, Edit, Grep, Bash, etc.)
+ - Which files were involved in this conversation
+ - What technical concepts and domains this conversation touched
+ - What the developer was searching for and building
+ """
+
+         prompt = f"""You are analyzing a development conversation. Use the SKILL_V2 guidelines to generate a search-optimized narrative.
+
+ ## Extracted Events
+
+ ### Search Index
+ {result['search_index']}
+
+ ### Context Cache
+ {result['context_cache']}
+ {metadata_context}
+ ### Conversation Signature
+ ```json
+ {json.dumps(result['signature'], indent=2)}
+ ```
+
+ Now generate the narrative following SKILL_V2 format exactly, using ALL the context above including metadata."""
+
+         batch_requests.append({
+             "custom_id": data['conv_id'],
+             "params": {
+                 "model": "claude-sonnet-4-5-20250929",
+                 "max_tokens": 2048,
+                 "system": skill_instructions,
+                 "messages": [{"role": "user", "content": prompt}]
+             }
+         })
+
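+     # Note: custom_id must be unique within a batch (and, per the API docs,
+     # 1-64 URL-safe characters); the conversation UUID stems used above
+     # satisfy this.
+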
+     # Create batch via API
+     batch_response = client.messages.batches.create(requests=batch_requests)
+     batch_id = batch_response.id
+
+     print(f" ✅ Batch created: {batch_id}")
+     print(f" Status: {batch_response.processing_status}")
+
+     # Poll for completion
+     print("\n Waiting for batch to complete...")
+     max_wait = 1800  # 30 minutes max (batches can take longer than expected)
+     start_time = time.time()
+
+     while True:
+         if time.time() - start_time > max_wait:
+             print(f" ⚠️ Timeout after {max_wait}s")
+             break
+
+         batch_status = client.messages.batches.retrieve(batch_id)
+         status = batch_status.processing_status
+
+         print(f" Status: {status} ({batch_status.request_counts.processing} processing, "
+               f"{batch_status.request_counts.succeeded} succeeded, "
+               f"{batch_status.request_counts.errored} errored)")
+
+         if status == "ended":
+             print(" ✅ Batch completed!")
+             break
+
+         time.sleep(5)
+
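+     # If polling timed out before the batch ended, the results fetch below is
+     # expected to fail; the batch can still be retrieved later by its batch_id.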
+     # Retrieve results
+     print("\n Fetching results...")
+     results_response = client.messages.batches.results(batch_id)
+
+     narratives = {}
+     total_cost = 0.0
+     total_input = 0
+     total_output = 0
+
+     for result_item in results_response:
+         conv_id = result_item.custom_id
+
+         if result_item.result.type == "succeeded":
+             message = result_item.result.message
+
+             # Extract narrative
+             narrative = ""
+             for block in message.content:
+                 if hasattr(block, 'text'):
+                     narrative += block.text
+
+             narratives[conv_id] = narrative
+
+             # Track usage at list-price Sonnet rates ($3/$15 per million tokens);
+             # the Batch API is billed at a discounted rate, so this likely
+             # overestimates the actual spend.
+             input_tokens = message.usage.input_tokens
+             output_tokens = message.usage.output_tokens
+             cost = (input_tokens * 3 + output_tokens * 15) / 1_000_000
+
+             total_input += input_tokens
+             total_output += output_tokens
+             total_cost += cost
+         else:
+             print(f" ❌ Error for {conv_id}: {result_item.result.error}")
+
+     print("\n 📊 Batch Results:")
+     print(f" Succeeded: {len(narratives)}/{len(conversations_data)}")
+     print(f" Total tokens: {total_input} input, {total_output} output")
+     print(f" Total cost: ${total_cost:.4f}")
+     if narratives:
+         print(f" Avg cost/conversation: ${total_cost/len(narratives):.4f}")
+
+     return narratives, total_cost
+
+
+ def discover_projects():
+     """Find all Claude Code projects with JSONL files."""
+
+     # Support Docker environment (CLAUDE_PROJECTS_DIR=/logs) and local
+     projects_dir_env = os.environ.get("CLAUDE_PROJECTS_DIR")
+     if projects_dir_env:
+         projects_dir = Path(projects_dir_env)
+     elif Path("/logs").exists():
+         # Docker fallback
+         projects_dir = Path("/logs")
+     else:
+         # Local environment
+         projects_dir = Path.home() / ".claude/projects"
+
+     projects = {}
+
+     for project_dir in projects_dir.iterdir():
+         if not project_dir.is_dir():
+             continue
+         if project_dir.name.startswith('.'):
+             continue
+         if project_dir.name in ['test-streaming-project', 'test-voyage-import', 'claude-self-reflect-stress-test']:
+             continue  # Skip test projects
+
+         # Count JSONL files
+         jsonl_files = list(project_dir.glob("*.jsonl"))
+         if jsonl_files:
+             # Extract project name from directory
+             # Format: "-Users-rama-projects-procsolve-website" -> "procsolve-website"
+             parts = project_dir.name.split('-projects-')
+             project_name = parts[-1] if len(parts) > 1 else project_dir.name
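+             # Caveat: two directories sharing the same '-projects-' suffix
+             # would collide here (the last one scanned wins).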
+
+             projects[project_name] = {
+                 'dir': project_dir,
+                 'conversations': jsonl_files,
+                 'count': len(jsonl_files)
+             }
+
+     return projects
+
+
+ def main():
+     """Main multi-project batch import process."""
+
+     print(f"\n{'='*80}")
+     print("MULTI-PROJECT V3+SKILL_V2 BATCH IMPORT")
+     print(f"{'='*80}\n")
+
+     # Discover all projects
+     print("🔍 Discovering projects...")
+     projects = discover_projects()
+
+     if not projects:
+         print("❌ No projects with JSONL files found.")
+         return {}
+
+     total_conversations = sum(p['count'] for p in projects.values())
+
+     print(f"\n📊 Found {len(projects)} projects with {total_conversations} conversations:")
+     for name, info in sorted(projects.items(), key=lambda x: x[1]['count'], reverse=True):
+         print(f" • {name}: {info['count']} conversations")
+
+     # Budget check
+     estimated_cost = total_conversations * 0.017  # Conservative estimate with batching
+     print(f"\n💰 Estimated cost: ${estimated_cost:.2f} (budget: $5.00)")
+
+     if estimated_cost > 5.0:
+         print("⚠️ Estimated cost exceeds budget! Proceeding anyway (batching reduces cost)")
+
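+     # Sanity check on the estimate: at $0.017/conversation, 200 conversations
+     # ≈ $3.40, so the $5.00 budget covers roughly 290 conversations.
+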
+     # Load SKILL_V2
+     skill_v2_path = Path(__file__).parent / "conversation-analyzer" / "SKILL_V2.md"
+     if not skill_v2_path.exists():
+         print(f"❌ SKILL_V2.md not found: {skill_v2_path}")
+         sys.exit(1)
+
+     with open(skill_v2_path) as f:
+         skill_instructions = f.read()
+
+     # Initialize clients
+     print("\n🔧 Initializing clients...")
+
+     # Validate API key
+     api_key = os.getenv("ANTHROPIC_API_KEY")
+     if not api_key:
+         raise ValueError(
+             "ANTHROPIC_API_KEY environment variable required. "
+             "Set it in your .env file or export it in your shell."
+         )
+
+     anthropic_client = anthropic.Anthropic(api_key=api_key)
+     qdrant_client = QdrantClient(url=os.getenv("QDRANT_URL", "http://localhost:6333"))
+
+     # Initialize embedding model
+     embedding_model = None
+     vector_size = 384  # Default for FastEmbed
+
+     if FASTEMBED_AVAILABLE:
+         print(" Using FastEmbed (384 dimensions)")
+         embedding_model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+         vector_size = 384
+     else:
+         print(" Using Voyage AI (1024 dimensions)")
+         vector_size = 1024
+
+     # Create unified test collection
+     collection_name = "v3_all_projects"
+     print(f"\n🔧 Creating unified collection: {collection_name}")
+
+     try:
+         qdrant_client.delete_collection(collection_name)
+         print(" Deleted existing collection")
+     except Exception:
+         pass  # Collection didn't exist yet
+
+     qdrant_client.create_collection(
+         collection_name=collection_name,
+         vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
+     )
+     print(f" ✅ Created collection with {vector_size} dimensions")
+
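+     # Note: deleting and recreating the collection wipes any previous import;
+     # a re-run rebuilds the index from scratch rather than appending.
+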
+     # Process all projects
+     grand_total_cost = 0.0
+     project_results = {}
+     all_points = []
+
+     for project_name, project_info in sorted(projects.items()):
+         print(f"\n\n{'='*80}")
+         print(f"PROJECT: {project_name} ({project_info['count']} conversations)")
+         print(f"{'='*80}")
+
+         # Step 1: Metadata + V3 Extraction (fast, local)
+         print("\n🔄 Step 1: Metadata extraction + V3 Extraction for all conversations...")
+         conversations_data = []
+
+         for conv_path in project_info['conversations']:
+             conv_id = conv_path.stem
+
+             # Extract metadata FIRST
+             tool_usage = extract_tool_usage_from_jsonl(str(conv_path))
+
+             # Read messages for V3 extraction
+             messages = []
+             conversation_text = ""
+             with open(conv_path) as f:
+                 for line in f:
+                     if line.strip():
+                         try:
+                             msg = json.loads(line)
+                         except json.JSONDecodeError:
+                             continue  # Skip malformed JSONL lines
+                         messages.append(msg)
+
+                         # Extract text for concept detection
+                         if 'message' in msg and msg['message']:
+                             content = msg['message'].get('content', '')
+                             if isinstance(content, str):
+                                 conversation_text += content + "\n"
+                             elif isinstance(content, list):
+                                 for item in content:
+                                     if isinstance(item, dict) and item.get('text'):
+                                         conversation_text += item['text'] + "\n"
+
+             # Extract concepts
+             concepts = extract_concepts(conversation_text[:10000], tool_usage)
+
+             # Build metadata dict
+             metadata = {
+                 'tool_usage': tool_usage,
+                 'concepts': concepts
+             }
+
+             # V3 extraction WITH metadata
+             result = extract_events_v3(messages, metadata=metadata)
+
+             conversations_data.append({
+                 'conv_id': conv_id,
+                 'path': conv_path,
+                 'result': result,
+                 'project': project_name
+             })
+
+             print(f" ✅ {conv_id}: {result['stats']['total_tokens']} tokens, {len(concepts)} concepts, {len(tool_usage.get('tools_summary', {}))} tool types")
+
+         # Step 2: Batch generate narratives (API call)
+         narratives, batch_cost = batch_generate_narratives(
+             conversations_data,
+             anthropic_client,
+             skill_instructions
+         )
+
+         grand_total_cost += batch_cost
+
+         # Step 3: Generate embeddings and prepare points
+         print("\n🔄 Step 3: Generating embeddings...")
+
+         for data in conversations_data:
+             conv_id = data['conv_id']
+
+             if conv_id not in narratives:
+                 print(f" ⚠️ Skipping {conv_id} (no narrative)")
+                 continue
+
+             narrative = narratives[conv_id]
+             result = data['result']
+
+             # Generate embedding
+             embedding = get_embedding(narrative, embedding_model)
+
+             # Create point
+             point = PointStruct(
+                 id=conv_id,
+                 vector=embedding,
+                 payload={
+                     "conversation_id": conv_id,
+                     "project": project_name,
+                     "narrative": narrative,
+                     "search_index": result['search_index'],
+                     "context_cache": result['context_cache'],
+                     "signature": result['signature'],
+                     "timestamp": time.time(),
+                     "extraction_stats": result['stats']
+                 }
+             )
+
+             all_points.append(point)
+             print(f" ✅ {conv_id}: Embedded ({len(embedding)} dims)")
+
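+         # Qdrant point IDs must be unsigned integers or UUID strings; the
+         # UUID-named JSONL stems used as conv_id satisfy this requirement.
+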
+         project_results[project_name] = {
+             'processed': len(narratives),
+             'total': project_info['count'],
+             'cost': batch_cost
+         }
+
+         print(f"\n✅ Project {project_name} complete: {len(narratives)} conversations, ${batch_cost:.4f}")
+
+     # Step 4: Batch import all points to Qdrant
+     print(f"\n\n{'='*80}")
+     print("IMPORTING TO QDRANT")
+     print(f"{'='*80}")
+     print(f"\n🔄 Importing {len(all_points)} points in batches...")
+
+     batch_size = 100
+     for i in range(0, len(all_points), batch_size):
+         batch = all_points[i:i+batch_size]
+         qdrant_client.upsert(
+             collection_name=collection_name,
+             points=batch
+         )
+         print(f" ✅ Imported batch {i//batch_size + 1}: {len(batch)} points")
+
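+     # upsert is idempotent per point ID: re-importing the same conversations
+     # overwrites their existing points rather than duplicating them.
+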
+     # Final Summary
+     print(f"\n\n{'='*80}")
+     print("FINAL SUMMARY")
+     print(f"{'='*80}")
+
+     print("\n📊 Per-Project Results:")
+     for project_name, results in sorted(project_results.items()):
+         print(f" • {project_name}: {results['processed']}/{results['total']} conversations, ${results['cost']:.4f}")
+
+     total_processed = sum(r['processed'] for r in project_results.values())
+     print(f"\n💰 Total Cost: ${grand_total_cost:.4f}")
+     print(f"📈 Total Processed: {total_processed}/{total_conversations} conversations")
+     if total_processed:
+         print(f"💡 Average Cost: ${grand_total_cost/total_processed:.4f} per conversation")
+     print(f"\n🎯 Collection: {collection_name}")
+     print(f"📦 Total Points: {len(all_points)}")
+
+     print("\n✅ All projects imported successfully!")
+     print("\nNext steps:")
+     print("1. Test Skill with: 'Find conversations about Docker issues'")
+     print("2. Compare with current CSR: csr_reflect_on_past(...)")
+     print("3. Validate enhanced narratives vs raw excerpts")
+
+     return project_results
+
+
+ if __name__ == "__main__":
+     results = main()
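+
+ # Example invocation (assumes a running Qdrant at QDRANT_URL and keys in .env):
+ #   ANTHROPIC_API_KEY=sk-... python docs/design/batch_import_all_projects.py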