claude-self-reflect 5.0.4 → 5.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,248 @@
+ """
+ Message processor classes for handling different message types in JSONL import.
+ Refactored from extract_metadata_single_pass to reduce complexity.
+ """
+
+ import re
+ import ast
+ import os
+ import logging
+ from abc import ABC, abstractmethod
+ from typing import Dict, Any, List, Set, Optional
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+
+ # Constants for metadata limits (can be overridden via environment variables)
+ MAX_CONCEPTS = int(os.getenv("MAX_CONCEPTS", "10"))
+ MAX_AST_ELEMENTS = int(os.getenv("MAX_AST_ELEMENTS", "30"))
+ MAX_CODE_BLOCKS = int(os.getenv("MAX_CODE_BLOCKS", "5"))
+ MAX_ELEMENTS_PER_BLOCK = int(os.getenv("MAX_ELEMENTS_PER_BLOCK", "10"))
+ MAX_FILES_ANALYZED = int(os.getenv("MAX_FILES_ANALYZED", "20"))
+ MAX_FILES_EDITED = int(os.getenv("MAX_FILES_EDITED", "20"))
+ MAX_TOOLS_USED = int(os.getenv("MAX_TOOLS_USED", "15"))
+ MAX_CONCEPT_MESSAGES = int(os.getenv("MAX_CONCEPT_MESSAGES", "50"))
+
+
+ class MessageProcessor(ABC):
+     """Abstract base class for message processing."""
+
+     @abstractmethod
+     def process(self, item: Any, metadata: Dict[str, Any]) -> Optional[str]:
+         """Process a message item and update metadata."""
+         pass
+
+
+ class TextMessageProcessor(MessageProcessor):
+     """Process text messages and extract code blocks."""
+
+     def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
+         """Process text content and extract code blocks with AST elements."""
+         if item.get('type') != 'text':
+             return None
+
+         text_content = item.get('text', '')
+
+         # Check for code blocks
+         if '```' in text_content:
+             metadata['has_code_blocks'] = True
+             self._extract_code_ast_elements(text_content, metadata)
+
+         return text_content
+
+     def _extract_code_ast_elements(self, text: str, metadata: Dict[str, Any]):
+         """Extract AST elements from code blocks in text."""
+         if 'ast_elements' not in metadata:
+             metadata['ast_elements'] = []
+
+         if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
+             return
+
+         # More permissive regex to handle various fence formats
+         code_blocks = re.findall(r'```[^`\n]*\n?(.*?)```', text, re.DOTALL)
+
+         for code_block in code_blocks[:MAX_CODE_BLOCKS]:
+             if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
+                 break
+
+             ast_elems = extract_ast_elements(code_block)
+             for elem in list(ast_elems)[:MAX_ELEMENTS_PER_BLOCK]:
+                 if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
+                     metadata['ast_elements'].append(elem)
+
+
+ class ThinkingMessageProcessor(MessageProcessor):
+     """Process thinking messages."""
+
+     def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
+         """Process thinking content."""
+         if item.get('type') != 'thinking':
+             return None
+
+         return item.get('thinking', '')
+
+
+ class ToolMessageProcessor(MessageProcessor):
+     """Process tool use messages and extract file references."""
+
+     def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
+         """Process tool use and extract file references."""
+         if item.get('type') != 'tool_use':
+             return None
+
+         tool_name = item.get('name', '')
+
+         # Track tool usage
+         if 'tools_used' not in metadata:
+             metadata['tools_used'] = []
+
+         if tool_name and tool_name not in metadata['tools_used']:
+             if len(metadata['tools_used']) < MAX_TOOLS_USED:
+                 metadata['tools_used'].append(tool_name)
+
+         # Extract file references
+         if 'input' in item:
+             self._extract_file_references(item['input'], tool_name, metadata)
+
+         # Return tool use as text
+         tool_input = str(item.get('input', ''))[:500]
+         return f"[Tool: {tool_name}] {tool_input}"
+
+     def _extract_file_references(self, input_data: Any, tool_name: str, metadata: Dict[str, Any]):
+         """Extract file references from tool input."""
+         if not isinstance(input_data, dict):
+             return
+
+         # Initialize metadata lists if not present
+         if 'files_edited' not in metadata:
+             metadata['files_edited'] = []
+         if 'files_analyzed' not in metadata:
+             metadata['files_analyzed'] = []
+
+         is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
+
+         # Check file_path field
+         if 'file_path' in input_data:
+             file_ref = input_data['file_path']
+             if is_edit:
+                 if file_ref not in metadata['files_edited'] and len(metadata['files_edited']) < MAX_FILES_EDITED:
+                     metadata['files_edited'].append(file_ref)
+             else:
+                 if file_ref not in metadata['files_analyzed'] and len(metadata['files_analyzed']) < MAX_FILES_ANALYZED:
+                     metadata['files_analyzed'].append(file_ref)
+
+         # Check path field (for non-edit tools)
+         if 'path' in input_data and not is_edit:
+             file_ref = input_data['path']
+             if file_ref not in metadata['files_analyzed'] and len(metadata['files_analyzed']) < MAX_FILES_ANALYZED:
+                 metadata['files_analyzed'].append(file_ref)
+
+
+ class ToolResultProcessor(MessageProcessor):
+     """Process tool result messages."""
+
+     def process(self, item: Any, metadata: Dict[str, Any]) -> Optional[str]:
+         """Process tool results."""
+         # Handle both dict items and top-level tool results
+         if isinstance(item, dict):
+             if item.get('type') == 'tool_result':
+                 result_content = str(item.get('content', ''))[:1000]
+                 return f"[Result] {result_content}"
+             elif item.get('type') == 'tool_use':
+                 # Already handled by ToolMessageProcessor
+                 return None
+
+         return None
+
+
+ class MessageProcessorFactory:
+     """Factory for creating appropriate message processors."""
+
+     def __init__(self):
+         self.processors = {
+             'text': TextMessageProcessor(),
+             'thinking': ThinkingMessageProcessor(),
+             'tool_use': ToolMessageProcessor(),
+             'tool_result': ToolResultProcessor()
+         }
+
+     def get_processor(self, message_type: str) -> Optional[MessageProcessor]:
+         """Get the appropriate processor for a message type."""
+         return self.processors.get(message_type)
+
+     def process_content(self, content: Any, metadata: Dict[str, Any]) -> str:
+         """Process content of various types and return text representation."""
+         text_parts = []
+
+         if isinstance(content, list):
+             for item in content:
+                 if isinstance(item, dict):
+                     item_type = item.get('type', '')
+                     processor = self.get_processor(item_type)
+                     if processor:
+                         text = processor.process(item, metadata)
+                         if text:
+                             text_parts.append(text)
+                 elif isinstance(item, str):
+                     text_parts.append(item)
+         elif isinstance(content, str):
+             text_parts.append(content)
+
+         return '\n'.join(text_parts)
+
+
+ def extract_ast_elements(code_text: str) -> Set[str]:
+     """Extract AST elements from Python code."""
+     elements = set()
+
+     try:
+         tree = ast.parse(code_text)
+         for node in ast.walk(tree):
+             if isinstance(node, ast.FunctionDef):
+                 elements.add(f"func:{node.name}")
+             elif isinstance(node, ast.ClassDef):
+                 elements.add(f"class:{node.name}")
+             elif isinstance(node, ast.Import):
+                 for alias in node.names:
+                     elements.add(f"import:{alias.name}")
+             elif isinstance(node, ast.ImportFrom):
+                 module = node.module or ''
+                 for alias in node.names:
+                     elements.add(f"from:{module}.{alias.name}")
+     except (SyntaxError, ValueError):
+         # Not Python code or invalid syntax
+         pass
+
+     return elements
+
+
+ def extract_concepts(text: str) -> List[str]:
+     """Extract key concepts from text using simple heuristics."""
+     concepts = []
+
+     # Common programming concepts
+     concept_patterns = [
+         (r'\b(async|await|promise|future)\b', 'async-programming'),
+         (r'\b(test|spec|jest|pytest|unittest)\b', 'testing'),
+         (r'\b(docker|container|kubernetes|k8s)\b', 'containerization'),
+         (r'\b(api|rest|graphql|endpoint)\b', 'api-development'),
+         (r'\b(react|vue|angular|svelte)\b', 'frontend-framework'),
+         (r'\b(database|sql|postgres|mysql|mongodb)\b', 'database'),
+         (r'\b(auth|authentication|oauth|jwt)\b', 'authentication'),
+         (r'\b(error|exception|bug|fix)\b', 'debugging'),
+         (r'\b(refactor|optimize|performance)\b', 'optimization'),
+         (r'\b(deploy|ci|cd|pipeline)\b', 'deployment')
+     ]
+
+     text_lower = text.lower()
+     seen_concepts = set()
+
+     for pattern, concept in concept_patterns:
+         if re.search(pattern, text_lower) and concept not in seen_concepts:
+             concepts.append(concept)
+             seen_concepts.add(concept)
+             if len(concepts) >= MAX_CONCEPTS:
+                 break
+
+     return concepts
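
A minimal usage sketch for the processors above, assuming the module is importable as message_processors (the name used by the extractor below); the sample message content is hypothetical:

    from message_processors import MessageProcessorFactory, extract_concepts

    factory = MessageProcessorFactory()
    metadata = {}  # processors lazily create the keys they need

    # A content list in the shape the processors expect (hypothetical sample)
    content = [
        {'type': 'text', 'text': 'Fixing the bug:\n```python\ndef handler():\n    pass\n```'},
        {'type': 'tool_use', 'name': 'Edit', 'input': {'file_path': 'app.py'}},
    ]

    text = factory.process_content(content, metadata)
    print(metadata['has_code_blocks'])  # True
    print(metadata['ast_elements'])     # ['func:handler']
    print(metadata['files_edited'])     # ['app.py']
    print(extract_concepts(text))       # ['debugging'], via the (error|exception|bug|fix) pattern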
@@ -0,0 +1,262 @@
+ """
+ Metadata extractor using message processors to reduce complexity.
+ Refactored from extract_metadata_single_pass function.
+ """
+
+ import json
+ import os
+ import logging
+ from pathlib import Path
+ from typing import Dict, Any, Tuple, Optional
+ from datetime import datetime
+
+ from message_processors import (
+     MessageProcessorFactory,
+     extract_concepts,
+     MAX_CONCEPT_MESSAGES,
+     MAX_FILES_ANALYZED,
+     MAX_FILES_EDITED,
+     MAX_TOOLS_USED,
+     MAX_AST_ELEMENTS
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class MetadataExtractor:
+     """Extract metadata from JSONL conversation files."""
+
+     def __init__(self):
+         self.processor_factory = MessageProcessorFactory()
+
+     def extract_metadata_from_file(self, file_path: str) -> Tuple[Dict[str, Any], str, int]:
+         """
+         Extract metadata from a JSONL file in a single pass.
+         Returns: (metadata, first_timestamp, message_count)
+         """
+         metadata = self._initialize_metadata()
+         first_timestamp = None
+         message_count = 0
+         all_text = []
+
+         try:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 for line in f:
+                     if not line.strip():
+                         continue
+
+                     result = self._process_line(line, metadata)
+                     if result:
+                         text_content, is_message = result
+
+                         # Update timestamp and counts
+                         if first_timestamp is None:
+                             first_timestamp = self._extract_timestamp(line)
+
+                         if is_message:
+                             message_count += 1
+
+                         if text_content:
+                             # Limit text accumulation to prevent memory issues
+                             if len(all_text) < MAX_CONCEPT_MESSAGES:
+                                 all_text.append(text_content[:1000])
+
+         except (IOError, OSError) as e:
+             logger.warning(f"Error reading file {file_path}: {e}")
+         except (json.JSONDecodeError, ValueError) as e:
+             logger.warning(f"Error parsing JSON in {file_path}: {e}")
+         except Exception as e:
+             logger.error(f"Unexpected error extracting metadata from {file_path}: {e}")
+
+         # Post-process collected data
+         self._post_process_metadata(metadata, all_text, file_path)
+
+         # Apply limits to arrays
+         self._apply_metadata_limits(metadata)
+
+         return metadata, first_timestamp or datetime.now().isoformat(), message_count
+
+     def _initialize_metadata(self) -> Dict[str, Any]:
+         """Initialize empty metadata structure."""
+         return {
+             "files_analyzed": [],
+             "files_edited": [],
+             "tools_used": [],
+             "concepts": [],
+             "ast_elements": [],
+             "has_code_blocks": False,
+             "total_messages": 0,
+             "project_path": None,
+             "pattern_analysis": {},
+             "avg_quality_score": 0.0
+         }
+
+     def _process_line(self, line: str, metadata: Dict[str, Any]) -> Optional[Tuple[str, bool]]:
+         """
+         Process a single line from the JSONL file.
+         Returns: (text_content, is_message) or None
+         """
+         try:
+             data = json.loads(line)
+
+             # Extract project path from cwd
+             if metadata["project_path"] is None and 'cwd' in data:
+                 metadata["project_path"] = data.get('cwd')
+
+             # Handle message entries
+             if 'message' in data and data['message']:
+                 return self._process_message_entry(data['message'], metadata)
+
+             # Handle top-level tool entries
+             entry_type = data.get('type')
+             if entry_type in ('tool_result', 'tool_use'):
+                 return self._process_tool_entry(data, metadata)
+
+         except json.JSONDecodeError:
+             # Expected for non-JSON lines, skip silently
+             pass
+         except (KeyError, TypeError, ValueError) as e:
+             # Log specific parsing errors for debugging
+             logger.debug(f"Error parsing line: {e}")
+
+         return None
+
+     def _process_message_entry(self, message: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[Tuple[str, bool]]:
+         """Process a message entry."""
+         role = message.get('role')
+         content = message.get('content')
+
+         if not role or not content:
+             return None
+
+         # Check if it's a countable message
+         is_user_or_assistant = role in ['user', 'assistant']
+
+         # Process content
+         text_content = self.processor_factory.process_content(content, metadata)
+
+         return text_content, is_user_or_assistant
+
+     def _process_tool_entry(self, data: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[Tuple[str, bool]]:
+         """Process a top-level tool entry."""
+         entry_type = data.get('type')
+         text_parts = []
+
+         if entry_type == 'tool_use':
+             tool_name = data.get('name', 'unknown')
+             tool_input = str(data.get('input', ''))[:500]
+             text_parts.append(f"[Tool: {tool_name}] {tool_input}")
+
+             # Track tool usage
+             if tool_name and tool_name not in metadata['tools_used'] and len(metadata['tools_used']) < MAX_TOOLS_USED:
+                 metadata['tools_used'].append(tool_name)
+
+         elif entry_type == 'tool_result':
+             result_content = self._extract_tool_result_content(data)
+             text_parts.append(f"[Result] {result_content[:1000]}")
+
+         content = "\n".join(text_parts)
+         # Tool entries should not count as messages (only user/assistant messages count)
+         return (content, False) if content else None
+
+     def _extract_tool_result_content(self, data: Dict[str, Any]) -> str:
+         """Extract content from tool result data."""
+         result_content = data.get('content')
+
+         if isinstance(result_content, list):
+             flat = []
+             for item in result_content:
+                 if isinstance(item, dict) and item.get('type') == 'text':
+                     flat.append(item.get('text', ''))
+                 elif isinstance(item, str):
+                     flat.append(item)
+             result_content = "\n".join(flat)
+
+         if not result_content:
+             result_content = data.get('result', '')
+
+         return str(result_content)
+
+     def _extract_timestamp(self, line: str) -> Optional[str]:
+         """Extract timestamp from a line if present."""
+         try:
+             data = json.loads(line)
+             return data.get('timestamp')
+         except (json.JSONDecodeError, TypeError) as e:
+             logger.debug(f"Failed to extract timestamp: {e}")
+             return None
+
+     def _post_process_metadata(self, metadata: Dict[str, Any], all_text: list, file_path: str):
+         """Post-process collected metadata."""
+         # Extract concepts from collected text
+         if all_text:
+             combined_text = ' '.join(all_text[:MAX_CONCEPT_MESSAGES])
+             metadata['concepts'] = extract_concepts(combined_text)
+
+         # Run AST-GREP pattern analysis if available
+         self._run_pattern_analysis(metadata)
+
+     def _run_pattern_analysis(self, metadata: Dict[str, Any]):
+         """Run AST-GREP pattern analysis on mentioned files."""
+         pattern_quality = {}
+         avg_quality_score = 0.0
+
+         try:
+             # Update patterns first
+             from update_patterns import check_and_update_patterns
+             check_and_update_patterns()
+
+             # Import analyzer
+             from ast_grep_final_analyzer import FinalASTGrepAnalyzer
+             analyzer = FinalASTGrepAnalyzer()
+
+             # Analyze files
+             files_to_analyze = list(set(
+                 metadata['files_edited'] + metadata['files_analyzed'][:10]
+             ))
+             quality_scores = []
+
+             for file_path in files_to_analyze:
+                 # Expand file path for proper checking
+                 expanded_path = os.path.expanduser(file_path) if file_path.startswith('~') else file_path
+                 if self._is_code_file(expanded_path) and os.path.exists(expanded_path):
+                     try:
+                         result = analyzer.analyze_file(expanded_path)
+                         metrics = result['quality_metrics']
+                         pattern_quality[file_path] = {
+                             'score': metrics['quality_score'],
+                             'good_patterns': metrics['good_patterns_found'],
+                             'bad_patterns': metrics['bad_patterns_found'],
+                             'issues': metrics['total_issues']
+                         }
+                         quality_scores.append(metrics['quality_score'])
+                     except (IOError, OSError) as e:
+                         logger.debug(f"Could not read file {file_path}: {e}")
+                     except (KeyError, ValueError) as e:
+                         logger.debug(f"Error parsing AST results for {file_path}: {e}")
+                     except Exception as e:
+                         logger.warning(f"Unexpected error analyzing {file_path}: {e}")
+
+             # Calculate average quality
+             if quality_scores:
+                 avg_quality_score = sum(quality_scores) / len(quality_scores)
+
+         except Exception as e:
+             logger.debug(f"AST analysis not available: {e}")
+
+         metadata['pattern_analysis'] = pattern_quality
+         metadata['avg_quality_score'] = round(avg_quality_score, 3)
+
+     def _is_code_file(self, file_path: str) -> bool:
+         """Check if file is a code file."""
+         if not file_path:
+             return False
+         extensions = ['.py', '.ts', '.js', '.tsx', '.jsx']
+         return any(file_path.endswith(ext) for ext in extensions)
+
+     def _apply_metadata_limits(self, metadata: Dict[str, Any]):
+         """Apply size limits to metadata arrays."""
+         metadata['files_analyzed'] = metadata['files_analyzed'][:MAX_FILES_ANALYZED]
+         metadata['files_edited'] = metadata['files_edited'][:MAX_FILES_EDITED]
+         metadata['tools_used'] = metadata['tools_used'][:MAX_TOOLS_USED]
+         metadata['ast_elements'] = metadata['ast_elements'][:MAX_AST_ELEMENTS]
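
A short end-to-end sketch of the extractor. It assumes this file is importable as metadata_extractor (the actual module name is not shown in this diff), and the JSONL sample is invented:

    import json
    from metadata_extractor import MetadataExtractor  # assumed module name

    # Two conversation lines in the expected JSONL shape (hypothetical)
    lines = [
        {"timestamp": "2025-01-01T00:00:00Z", "cwd": "/work/demo",
         "message": {"role": "user", "content": "How do I fix this pytest error?"}},
        {"message": {"role": "assistant", "content": [
            {"type": "text", "text": "Try:\n```python\nimport pytest\n```"}]}},
    ]
    with open("/tmp/session.jsonl", "w") as f:
        f.write("\n".join(json.dumps(l) for l in lines))

    extractor = MetadataExtractor()
    metadata, first_ts, count = extractor.extract_metadata_from_file("/tmp/session.jsonl")
    print(count)                     # 2 (one user and one assistant message)
    print(first_ts)                  # 2025-01-01T00:00:00Z
    print(metadata["project_path"])  # /work/demo
    print(metadata["concepts"])      # ['testing', 'debugging']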
@@ -0,0 +1,56 @@
+ #!/bin/bash
+ # Watcher loop for Docker container
+ # Runs the streaming-watcher.py with HOT/WARM/COLD prioritization
+
+ # Don't use set -e in retry loops - it can cause premature exits
+
+ echo "Starting Claude Self-Reflect Streaming Watcher v3.0.0"
+ echo "HOT/WARM/COLD prioritization enabled"
+ echo "=========================================="
+
+ # Ensure config directory exists
+ mkdir -p /config
+
+ # Set Python path to include scripts directory
+ export PYTHONPATH=/app/scripts:$PYTHONPATH
+
+ # Main loop - restart on failure with backoff
+ RETRY_COUNT=0
+ MAX_RETRIES=10
+ BACKOFF_SECONDS=5
+
+ while true; do
+     echo "[$(date)] Starting watcher (attempt $((RETRY_COUNT + 1))/$MAX_RETRIES)..."
+
+     # Run the streaming watcher
+     python /app/scripts/streaming-watcher.py
+     EXIT_CODE=$?
+
+     if [ $EXIT_CODE -eq 0 ]; then
+         echo "[$(date)] Watcher exited cleanly"
+         RETRY_COUNT=0
+         BACKOFF_SECONDS=5
+     else
+         echo "[$(date)] Watcher exited with code $EXIT_CODE"
+
+         RETRY_COUNT=$((RETRY_COUNT + 1))
+         if [ $RETRY_COUNT -ge $MAX_RETRIES ]; then
+             echo "[$(date)] Maximum retries reached. Exiting."
+             exit 1
+         fi
+
+         # Add jitter to prevent thundering herd (±10% of backoff)
+         JITTER=$(( (RANDOM % (BACKOFF_SECONDS / 5 + 1)) - (BACKOFF_SECONDS / 10) ))
+         SLEEP_TIME=$((BACKOFF_SECONDS + JITTER))
+         [ $SLEEP_TIME -lt 1 ] && SLEEP_TIME=1
+
+         echo "[$(date)] Restarting in $SLEEP_TIME seconds (base: $BACKOFF_SECONDS, jitter: $JITTER)..."
+         sleep $SLEEP_TIME
+
+         # Exponential backoff (max 300 seconds)
+         BACKOFF_SECONDS=$((BACKOFF_SECONDS * 2))
+         if [ $BACKOFF_SECONDS -gt 300 ]; then
+             BACKOFF_SECONDS=300
+         fi
+     fi
+ done
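
For reference, the restart schedule this loop produces: the base delay doubles from 5 s up to a 300 s cap, and each sleep gets roughly ±10% jitter. A small Python sketch of the same arithmetic (illustration only; the container runs the bash loop above):

    import random

    backoff = 5
    for attempt in range(1, 11):  # MAX_RETRIES=10
        # Mirrors $RANDOM % (B/5 + 1) - B/10: uniform in roughly [-10%, +10%] of backoff
        jitter = random.randint(0, backoff // 5) - backoff // 10
        sleep_time = max(1, backoff + jitter)
        print(f"attempt {attempt}: sleep ~{sleep_time}s (base {backoff}s)")
        backoff = min(backoff * 2, 300)  # exponential backoff, capped at 300 s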