claude-self-reflect 2.5.18 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,305 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Force metadata recovery script for Claude Self-Reflect.
4
+ Fixes conversations that were marked as updated but don't actually have metadata.
5
+ This addresses the point ID mismatch bug in delta-metadata-update.py.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import json
11
+ import hashlib
12
+ import re
13
+ import asyncio
14
+ from datetime import datetime
15
+ from typing import List, Dict, Any, Set, Optional
16
+ import logging
17
+ from pathlib import Path
18
+
19
+ from qdrant_client import QdrantClient
20
+
21
# Runtime settings; each one can be overridden through the environment.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
LOGS_DIR = os.environ.get("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
# Only the literal string "true" (in any letter case) enables dry-run mode.
DRY_RUN = "true" == os.environ.get("DRY_RUN", "false").lower()
# Used as the `limit` of the Qdrant scroll requests issued by this script.
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "100"))

# Root logging at INFO with timestamped records; `logger` is shared module-wide.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# A single Qdrant client, created at import time (timeout=30).
client = QdrantClient(timeout=30, url=QDRANT_URL)
33
+
34
def normalize_path(path: str) -> str:
    """Return a normalized form of a file path string.

    Falsy input yields an empty string. Otherwise the Users-directory
    prefix is abbreviated to a tilde (in both slash styles, wherever it
    occurs), backslashes are turned into forward slashes, and runs of
    consecutive slashes are collapsed into one.
    """
    if not path:
        return ""

    # Applied in order; the generic backslash rewrite must come last so the
    # backslash-style Users prefix is still recognisable when it is handled.
    rewrites = (
        ("/Users/", "~/"),
        ("\\Users\\", "~\\"),
        ("\\", "/"),
    )
    for old, new in rewrites:
        path = path.replace(old, new)

    return re.sub(r'/+', '/', path)
42
+
43
def extract_concepts(text: str) -> Set[str]:
    """Return the set of concept labels whose keywords occur in *text*.

    Matching is case-insensitive and substring-based (the patterns carry no
    word boundaries), so e.g. "latest" triggers the 'testing' concept.
    """
    keyword_map = {
        'security': r'(security|vulnerability|CVE|injection|auth)',
        'docker': r'(docker|container|compose|kubernetes)',
        'testing': r'(test|pytest|unittest|coverage)',
        'api': r'(API|REST|GraphQL|endpoint)',
        'database': r'(database|SQL|query|migration|qdrant)',
        'debugging': r'(debug|error|exception|traceback)',
        'git': r'(git|commit|branch|merge|pull request)',
        'mcp': r'(MCP|claude-self-reflect|tool|agent)',
        'embeddings': r'(embedding|vector|semantic|similarity)',
    }

    haystack = text.lower()
    return {
        label
        for label, regex in keyword_map.items()
        if re.search(regex, haystack, re.IGNORECASE)
    }
65
+
66
def extract_metadata_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
    """Extract metadata from a JSONL conversation file.

    Only the first 200 lines of the file are inspected. String message
    contents feed a text sample (500 chars per message, 5000 chars in total)
    from which concepts are derived; ``tool_use`` items of assistant messages
    provide the tool names and the files touched by Read / Edit / Write.

    Records with an unexpected shape (non-object JSON, non-dict message,
    missing or non-dict tool input) are skipped individually, so one odd
    record no longer aborts processing of the rest of the file.

    Args:
        jsonl_path: Path of the conversation log, one JSON document per line.

    Returns:
        A dict with list values under ``files_analyzed`` (at most 20),
        ``files_edited`` (at most 10), ``tools_used`` (at most 20) and
        ``concepts`` (at most 15). Entries are unique; file and tool lists
        keep first-seen order and concepts are sorted, so the result is
        deterministic. If the file cannot be read the error is logged and
        whatever was collected so far is returned.
    """
    max_lines = 200          # only the head of the log is inspected
    max_sample_chars = 5000  # amount of text handed to extract_concepts

    # Dicts act as insertion-ordered sets so the size caps below keep the
    # earliest entries rather than an arbitrary subset.
    files_analyzed: Dict[str, None] = {}
    files_edited: Dict[str, None] = {}
    tools_used: Dict[str, None] = {}
    sample_parts: List[str] = []
    sample_chars = 0

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                if line_number > max_lines:
                    break
                if not line.strip():
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue

                msg = data.get('message') if isinstance(data, dict) else None
                if not isinstance(msg, dict):
                    continue

                content = msg.get('content')
                if isinstance(content, str):
                    # Stop collecting once the sample budget is filled; only
                    # the first max_sample_chars characters are ever used.
                    if content and sample_chars < max_sample_chars:
                        piece = content[:500] + "\n"
                        sample_parts.append(piece)
                        sample_chars += len(piece)
                    continue

                if not isinstance(content, list) or msg.get('role') != 'assistant':
                    continue

                for item in content:
                    if not isinstance(item, dict) or item.get('type') != 'tool_use':
                        continue
                    tool_name = item.get('name')
                    if not isinstance(tool_name, str) or not tool_name:
                        continue
                    tools_used[tool_name] = None

                    inputs = item.get('input')
                    file_path = inputs.get('file_path') if isinstance(inputs, dict) else None
                    if not isinstance(file_path, str):
                        continue
                    if tool_name == 'Read':
                        files_analyzed[normalize_path(file_path)] = None
                    elif tool_name in ('Edit', 'Write'):
                        files_edited[normalize_path(file_path)] = None

    except Exception as e:
        # Best effort: keep what was gathered before the failure.
        logger.error(f"Error reading {jsonl_path}: {e}")

    sample = "".join(sample_parts)[:max_sample_chars]
    concepts = sorted(extract_concepts(sample)) if sample else []

    # The text sample itself is deliberately not returned (not stored in Qdrant).
    return {
        "files_analyzed": list(files_analyzed)[:20],
        "files_edited": list(files_edited)[:10],
        "tools_used": list(tools_used)[:20],
        "concepts": concepts[:15],
    }
136
+
137
async def find_conversations_without_metadata(collection_name: str) -> List[str]:
    """Find all unique conversation IDs that don't have metadata.

    Every point of the collection is scrolled in pages of BATCH_SIZE. A point
    counts as lacking metadata when its payload has a falsy ``concepts``
    value or a falsy ``has_file_metadata`` flag; points without a
    ``conversation_id`` (or without any payload) are ignored.

    Note: the Qdrant calls below are synchronous, so this coroutine does not
    yield to the event loop while it runs.

    Args:
        collection_name: Name of the Qdrant collection to scan.

    Returns:
        The affected conversation IDs, de-duplicated and sorted so that
        repeated runs see them in the same order.
    """
    # Only these payload keys are read below, so only they are requested.
    needed_keys = ['concepts', 'has_file_metadata', 'conversation_id']

    conversations_without_metadata = set()
    total_checked = 0
    offset = None

    while True:
        points, offset = client.scroll(
            collection_name=collection_name,
            limit=BATCH_SIZE,
            offset=offset,
            with_payload=needed_keys,
            with_vectors=False
        )

        for point in points:
            payload = point.payload or {}  # a point may carry no payload at all
            if payload.get('concepts') and payload.get('has_file_metadata'):
                continue
            conv_id = payload.get('conversation_id')
            if conv_id:
                conversations_without_metadata.add(conv_id)

        total_checked += len(points)

        # An empty page or a missing next-page offset both mean "done".
        if not points or offset is None:
            break

    logger.info(f" Checked {total_checked} points, found {len(conversations_without_metadata)} conversations without metadata")
    return sorted(conversations_without_metadata, key=str)
171
+
172
async def update_conversation_points(collection_name: str, conversation_id: str, metadata: Dict[str, Any]) -> int:
    """Update all points for a conversation with metadata.

    The points are selected server-side with a filter on the
    ``conversation_id`` payload field instead of scanning the whole
    collection, and each page of matches is patched with one ``set_payload``
    call. Only the new keys are sent; ``set_payload`` leaves the other
    payload keys of a point untouched.

    Args:
        collection_name: Qdrant collection holding the conversation's points.
        conversation_id: Value of the ``conversation_id`` payload field to match.
        metadata: Payload keys to add, as produced by extract_metadata_from_jsonl.

    Returns:
        Number of points that matched. When DRY_RUN is set the points are
        counted but nothing is written.
    """
    # Local import: the file's top-level import only brings in QdrantClient.
    from qdrant_client import models

    conversation_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="conversation_id",
                match=models.MatchValue(value=conversation_id),
            )
        ]
    )
    payload_patch = {
        **metadata,
        'has_file_metadata': True,
        'metadata_updated_at': datetime.now().isoformat(),
    }

    updated_count = 0
    offset = None

    while True:
        points, offset = client.scroll(
            collection_name=collection_name,
            scroll_filter=conversation_filter,
            limit=BATCH_SIZE,
            offset=offset,
            with_payload=False,  # only the point ids are needed
            with_vectors=False
        )

        if points:
            if not DRY_RUN:
                client.set_payload(
                    collection_name=collection_name,
                    payload=payload_patch,
                    points=[point.id for point in points],
                    wait=False
                )
            updated_count += len(points)

        if not points or offset is None:
            break

    return updated_count
213
+
214
async def process_collection(collection_name: str, max_conversations: Optional[int] = 10):
    """Process a single collection to add missing metadata.

    For each conversation that lacks metadata, the matching
    ``<conversation_id>.jsonl`` file is searched for recursively under
    LOGS_DIR, metadata is extracted from it and written to the conversation's
    points.

    Args:
        collection_name: Qdrant collection to repair.
        max_conversations: Upper bound on the number of conversations handled
            in this call; ``None`` processes all of them. The default of 10
            matches the limit that used to be hard-coded here.

    Returns:
        Number of conversations whose points were updated.
    """
    logger.info(f"\nProcessing collection: {collection_name}")

    # Find conversations without metadata
    conversations_without_metadata = await find_conversations_without_metadata(collection_name)

    if not conversations_without_metadata:
        logger.info(" ✓ All conversations have metadata")
        return 0

    logger.info(f" Found {len(conversations_without_metadata)} conversations needing metadata")

    selected = conversations_without_metadata
    if max_conversations is not None:
        selected = selected[:max(0, max_conversations)]
        if len(selected) < len(conversations_without_metadata):
            # Make the cap visible instead of silently skipping the rest.
            logger.info(f" Processing only the first {len(selected)} conversations in this run")

    success_count = 0
    failed_count = 0

    for conv_id in selected:
        # First match wins; the glob is not materialised beyond that.
        jsonl_file = next(Path(LOGS_DIR).glob(f"**/{conv_id}.jsonl"), None)

        if jsonl_file is None:
            logger.warning(f" Cannot find JSONL for {conv_id}")
            failed_count += 1
            continue

        logger.info(f" Processing {conv_id}")

        # Extract metadata
        metadata = extract_metadata_from_jsonl(str(jsonl_file))

        if not metadata['concepts'] and not metadata['files_analyzed']:
            logger.warning(f" No metadata extracted from {conv_id}")
            failed_count += 1
            continue

        # Update points
        updated_points = await update_conversation_points(collection_name, conv_id, metadata)

        if updated_points > 0:
            logger.info(f" ✓ Updated {updated_points} points with {len(metadata['concepts'])} concepts")
            success_count += 1
        else:
            logger.warning(f" No points updated for {conv_id}")
            failed_count += 1

    logger.info(f" Collection complete: {success_count} fixed, {failed_count} failed")
    return success_count
264
+
265
async def main(max_collections: Optional[int] = 5):
    """Main recovery process.

    Runs process_collection on the Qdrant collections whose name starts with
    ``conv_`` and logs the total number of conversations fixed.

    Args:
        max_collections: Upper bound on the number of collections processed;
            ``None`` processes all of them. The default of 5 matches the
            sample size that used to be hard-coded here.
    """
    logger.info("=== Force Metadata Recovery ===")
    logger.info(f"Qdrant URL: {QDRANT_URL}")
    logger.info(f"Dry run: {DRY_RUN}")

    # Only conversation collections are candidates for recovery.
    conversation_collections = [
        collection.name
        for collection in client.get_collections().collections
        if collection.name.startswith('conv_')
    ]
    logger.info(f"Found {len(conversation_collections)} conversation collections")

    selected = conversation_collections
    if max_collections is not None:
        selected = selected[:max(0, max_collections)]
        if len(selected) < len(conversation_collections):
            # Make the cap visible instead of silently skipping the rest.
            logger.info(f"Processing only the first {len(selected)} collections in this run")

    total_fixed = 0
    for collection_name in selected:
        total_fixed += await process_collection(collection_name)

    logger.info("\n=== Recovery Complete ===")
    logger.info(f"Total conversations fixed: {total_fixed}")

    if DRY_RUN:
        logger.info("This was a DRY RUN - no actual updates were made")


if __name__ == "__main__":
    asyncio.run(main())
@@ -42,6 +42,20 @@ PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower()
42
42
  VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
43
43
  CURRENT_METADATA_VERSION = 2 # Version 2: Added tool output extraction
44
44
 
45
# Token limit configuration for Voyage AI
def _bounded_int_env(name: str, default: int, low: int, high: int, range_label: str) -> int:
    """Read an integer setting from the environment with a safe fallback.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or rejected.
        low: Smallest accepted value (inclusive).
        high: Largest accepted value (inclusive).
        range_label: Word describing the range in the warning ("safe", "normal").

    Returns:
        The parsed value, or *default* (after logging a warning) when the
        variable is not a valid integer or lies outside ``[low, high]``.
    """
    # The logger is looked up here rather than through the module-level
    # `logger` name: the `logger = ...` assignment visible in this file comes
    # after this configuration block, so using it here would raise NameError.
    log = logging.getLogger(__name__)

    raw = os.getenv(name, str(default))
    try:
        value = int(raw)
    except ValueError:
        log.warning(f"{name}={raw!r} is not an integer, using {default}")
        return default

    if value < low or value > high:
        log.warning(f"{name}={value} outside {range_label} range [{low}, {high}], using {default}")
        return default
    return value


# Safe limit (120k - 20k buffer)
MAX_TOKENS_PER_BATCH = _bounded_int_env("MAX_TOKENS_PER_BATCH", 100000, 1000, 120000, "safe")

# chars per token estimate
TOKEN_ESTIMATION_RATIO = _bounded_int_env("TOKEN_ESTIMATION_RATIO", 3, 2, 10, "normal")

USE_TOKEN_AWARE_BATCHING = os.getenv("USE_TOKEN_AWARE_BATCHING", "true").lower() == "true"
MAX_RECURSION_DEPTH = 10  # Maximum depth for recursive chunk splitting
58
+
45
59
  # Set up logging
46
60
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
47
61
  logger = logging.getLogger(__name__)
@@ -381,6 +395,38 @@ def log_retry_state(retry_state):
381
395
  def embed_with_backoff(**kwargs):
382
396
  return voyage_client.embed(**kwargs)
383
397
 
398
def estimate_tokens(text: str) -> int:
    """Return a heuristic token count for *text*.

    The character count is divided by TOKEN_ESTIMATION_RATIO, scaled by 1.3
    when the text contains more than ten '{', '[' or '```' markers in total,
    and finally padded by 10%. Each scaling step truncates to an integer.
    """
    estimate = len(text) // TOKEN_ESTIMATION_RATIO

    # Many braces, brackets or code fences suggest JSON/code, which the
    # heuristic treats as denser in tokens than plain prose.
    marker_total = sum(text.count(marker) for marker in ('{', '[', '```'))
    if marker_total > 10:
        estimate = int(estimate * 1.3)

    # Safety margin on top of the (possibly adjusted) estimate.
    return int(estimate * 1.1)
413
+
414
def extract_message_content(msg: Dict[str, Any]) -> str:
    """Extract text content from a message.

    Args:
        msg: Message dict; its ``content`` may be a string, a list of
            structured items, or missing / ``None``.

    Returns:
        Always a string. String content is returned unchanged. For list
        content, the ``text`` of every ``{"type": "text"}`` item and every
        bare string item are joined with single spaces; other items are
        ignored. Missing, ``None`` or otherwise falsy content yields "".
    """
    content = msg.get("content", "")

    if isinstance(content, str):
        return content

    if isinstance(content, list):
        # Handle structured content
        text_parts = []
        for item in content:
            if isinstance(item, dict) and item.get("type") == "text":
                text = item.get("text", "")
                # A missing or non-string text contributes an empty part
                # instead of breaking the join below.
                text_parts.append(text if isinstance(text, str) else "")
            elif isinstance(item, str):
                text_parts.append(item)
        return " ".join(text_parts)

    # Any other type: honour the declared return type. Falsy values (None,
    # 0, {}) stay falsy for callers that test the result with `if content:`.
    return str(content) if content else ""
429
+
384
430
  def generate_embeddings(texts: List[str]) -> List[List[float]]:
385
431
  """Generate embeddings for a list of texts."""
386
432
  if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
@@ -432,6 +478,125 @@ def chunk_conversation(messages: List[Dict[str, Any]], chunk_size: int = 10) ->
432
478
 
433
479
  return chunks
434
480
 
481
def _render_messages(messages: List[Dict[str, Any]]) -> str:
    """Render messages as newline-joined "ROLE: content" lines, skipping empty content."""
    lines = []
    for msg in messages:
        role = msg.get("role", "unknown")
        content = extract_message_content(msg)
        if content:
            lines.append(f"{role.upper()}: {content}")
    return "\n".join(lines)


def _truncate_chunk(chunk: Dict[str, Any], max_tokens: int, marker_for) -> tuple:
    """Return ``(copy_of_chunk, dropped_chars)`` with the text cut to fit *max_tokens*.

    ``marker_for(dropped_chars)`` supplies the suffix appended to the kept
    prefix. The prefix is shortened until the estimate of prefix + marker
    fits, because estimate_tokens applies margins on top of the plain
    chars-per-token ratio.
    """
    text = chunk["text"]
    keep = max(0, min(len(text), max_tokens * TOKEN_ESTIMATION_RATIO))
    while True:
        candidate = text[:keep] + marker_for(len(text) - keep)
        if keep == 0 or estimate_tokens(candidate) <= max_tokens:
            break
        keep = keep * 9 // 10  # shrink by 10% per round; reaches 0 eventually

    truncated = {
        **chunk,
        "text": candidate,
        "was_truncated": True,
        "original_size": len(text),
    }
    return truncated, len(text) - keep


def split_large_chunk(chunk: Dict[str, Any], max_tokens: int, depth: int = 0) -> List[Dict[str, Any]]:
    """Split a large chunk into smaller pieces that fit token limit.

    A chunk with several messages is halved by message (recursively, up to
    MAX_RECURSION_DEPTH levels) and the text of each half is rebuilt from its
    messages. A chunk that cannot be split further is truncated so that its
    estimated token count fits *max_tokens*.

    Args:
        chunk: Dict with at least "text", "messages", "chunk_index" and
            "start_role". It is not modified; truncated pieces are copies.
        max_tokens: Token budget each returned piece should fit, as measured
            by estimate_tokens.
        depth: Current recursion depth; callers normally leave it at 0.

    Returns:
        A non-empty list of chunk dicts. Pieces created by splitting get
        "_a" / "_b" appended to the chunk index. Truncated pieces carry
        ``was_truncated`` and ``original_size``.
    """
    # Check recursion depth to prevent runaway recursion
    if depth >= MAX_RECURSION_DEPTH:
        logger.error(f"Max recursion depth {MAX_RECURSION_DEPTH} reached while splitting chunk")
        # Force truncate as last resort
        truncated, _ = _truncate_chunk(
            chunk, max_tokens, lambda _dropped: "\n[TRUNCATED - MAX DEPTH REACHED]"
        )
        return [truncated]

    messages = chunk["messages"]

    # First, check if we can split by messages
    if len(messages) > 1:
        mid = len(messages) // 2
        halves = (
            (messages[:mid], "a", chunk["start_role"]),
            (messages[mid:], "b", messages[mid].get("role", "unknown")),
        )

        result = []
        for part_messages, suffix, start_role in halves:
            part_text = _render_messages(part_messages)
            if not part_text:
                continue
            part = {
                "text": part_text,
                "messages": part_messages,
                "chunk_index": f"{chunk['chunk_index']}_{suffix}",
                "start_role": start_role,
            }
            # Recursively split if still too large
            if estimate_tokens(part_text) > max_tokens:
                result.extend(split_large_chunk(part, max_tokens, depth + 1))
            else:
                result.append(part)

        if result:
            return result
        # No half produced any text: fall through to truncation so the
        # chunk is not silently dropped.

    # Cannot split by messages - truncate with warning if it does not fit
    text = chunk["text"]
    if estimate_tokens(text) <= max_tokens:
        return [chunk]

    truncated, truncated_size = _truncate_chunk(
        chunk, max_tokens, lambda dropped: f"\n[TRUNCATED {dropped} CHARS]"
    )
    logger.warning(f"Single message exceeds token limit, truncating {truncated_size} chars from {len(text)} total")
    return [truncated]
552
+
553
def create_token_aware_batches(chunks: List[Dict[str, Any]], max_tokens: int = MAX_TOKENS_PER_BATCH) -> List[List[Dict[str, Any]]]:
    """Create batches that respect token limits.

    With USE_TOKEN_AWARE_BATCHING disabled, chunks are simply grouped in
    runs of BATCH_SIZE. Otherwise chunks are packed in their original order
    into batches whose estimated token total stays within *max_tokens* and
    whose length stays within BATCH_SIZE (the cap the count-based batching
    also applies). A chunk that is too large on its own is first passed
    through split_large_chunk and its pieces are packed like any other
    chunk; a piece that still exceeds the limit is logged and ends up alone
    in its batch.

    Args:
        chunks: Chunk dicts, each with a "text" entry.
        max_tokens: Estimated-token budget per batch.

    Returns:
        List of batches, each a non-empty list of chunk dicts.
    """
    if not USE_TOKEN_AWARE_BATCHING:
        # Fall back to old batching method
        return [chunks[i:i + BATCH_SIZE] for i in range(0, len(chunks), BATCH_SIZE)]

    batches = []
    batch_tokens = []  # estimated token total of each finished batch
    current_batch = []
    current_tokens = 0

    for chunk in chunks:
        chunk_tokens = estimate_tokens(chunk["text"])

        # If single chunk exceeds limit, split it
        if chunk_tokens > max_tokens:
            logger.warning(f"Chunk with {chunk_tokens} estimated tokens exceeds limit of {max_tokens}, splitting...")
            pieces = [
                (piece, estimate_tokens(piece["text"]))
                for piece in split_large_chunk(chunk, max_tokens)
            ]
        else:
            pieces = [(chunk, chunk_tokens)]

        for piece, piece_tokens in pieces:
            if piece_tokens > max_tokens:
                logger.error(f"Split chunk still exceeds limit: {piece_tokens} tokens")

            # Close the running batch when this piece would overflow either
            # the token budget or the per-batch chunk count.
            overflow = (
                current_tokens + piece_tokens > max_tokens
                or len(current_batch) >= BATCH_SIZE
            )
            if current_batch and overflow:
                batches.append(current_batch)
                batch_tokens.append(current_tokens)
                current_batch = []
                current_tokens = 0

            current_batch.append(piece)
            current_tokens += piece_tokens

    if current_batch:
        batches.append(current_batch)
        batch_tokens.append(current_tokens)

    # Log batch statistics (totals were tracked while packing)
    if batches:
        batch_sizes = [len(batch) for batch in batches]
        logger.debug(f"Created {len(batches)} batches, chunk counts: min={min(batch_sizes)}, max={max(batch_sizes)}, "
                     f"estimated tokens: min={min(batch_tokens)}, max={max(batch_tokens)}, avg={sum(batch_tokens)//len(batches)}")

    return batches
599
+
435
600
  def import_project(project_path: Path, collection_name: str, state: dict) -> int:
436
601
  """Import all conversations from a project."""
437
602
  jsonl_files = list(project_path.glob("*.jsonl"))
@@ -524,11 +689,17 @@ def import_project(project_path: Path, collection_name: str, state: dict) -> int
524
689
  if not chunks:
525
690
  continue
526
691
 
527
- # Process in batches
528
- for batch_start in range(0, len(chunks), BATCH_SIZE):
529
- batch = chunks[batch_start:batch_start + BATCH_SIZE]
692
+ # Process in batches (token-aware if enabled)
693
+ token_aware_batches = create_token_aware_batches(chunks)
694
+
695
+ for batch_idx, batch in enumerate(token_aware_batches):
530
696
  texts = [chunk["text"] for chunk in batch]
531
697
 
698
+ # Log batch info for debugging
699
+ if USE_TOKEN_AWARE_BATCHING:
700
+ total_tokens = sum(estimate_tokens(text) for text in texts)
701
+ logger.debug(f"Batch {batch_idx + 1}/{len(token_aware_batches)}: {len(texts)} chunks, ~{total_tokens} estimated tokens")
702
+
532
703
  # Generate embeddings
533
704
  embeddings = generate_embeddings(texts)
534
705