claude-self-reflect 5.0.2 → 5.0.5

This diff shows the content changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,344 @@
+ """
+ Import strategies using Strategy pattern to reduce complexity of stream_import_file.
+ """
+
+ import json
+ import gc
+ import os
+ import logging
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Dict, Any, List, Optional, Generator
+ from datetime import datetime
+
+ from message_processors import MessageProcessorFactory
+
+ logger = logging.getLogger(__name__)
+
+
+ class ImportStrategy(ABC):
+     """Abstract base class for import strategies."""
+
+     @abstractmethod
+     def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
+         """Import a JSONL file using the specific strategy."""
+         pass
+
+
+ class ChunkBuffer:
+     """Manages buffering and processing of message chunks."""
+
+     def __init__(self, max_size: int = 50):
+         self.buffer: List[Dict[str, Any]] = []
+         self.max_size = max_size
+         self.current_index = 0
+         # Add memory limit for message content
+         self.max_content_length = int(os.getenv('MAX_MESSAGE_CONTENT_LENGTH', '5000'))
+
+     def add(self, message: Dict[str, Any]) -> bool:
+         """Add a message to the buffer. Returns True if buffer is full."""
+         # Truncate long content to prevent memory issues
+         if 'content' in message and len(message['content']) > self.max_content_length:
+             message = message.copy()
+             message['content'] = message['content'][:self.max_content_length] + '...[truncated]'
+         self.buffer.append(message)
+         return len(self.buffer) >= self.max_size
+
+     def get_and_clear(self) -> List[Dict[str, Any]]:
+         """Get buffer contents and clear it."""
+         contents = self.buffer.copy()
+         self.buffer.clear()
+         return contents
+
+     def has_content(self) -> bool:
+         """Check if buffer has any content."""
+         return len(self.buffer) > 0
+
+
+ class MessageStreamReader:
+     """Handles reading and parsing messages from JSONL files."""
+
+     def __init__(self):
+         self.processor_factory = MessageProcessorFactory()
+         self.current_message_index = 0
+
+     def read_messages(self, file_path: Path) -> Generator[Dict[str, Any], None, None]:
+         """Generator that yields processed messages from a JSONL file."""
+         self.current_message_index = 0
+
+         with open(file_path, 'r', encoding='utf-8') as f:
+             for line_num, line in enumerate(f, 1):
+                 line = line.strip()
+                 if not line:
+                     continue
+
+                 message = self._parse_line(line, line_num)
+                 if message:
+                     yield message
+
+     def _parse_line(self, line: str, line_num: int) -> Optional[Dict[str, Any]]:
+         """Parse a single line and extract message if present."""
+         try:
+             data = json.loads(line)
+
+             # Skip summary lines
+             if data.get('type') == 'summary':
+                 return None
+
+             # Handle message entries
+             if 'message' in data and data['message']:
+                 return self._process_message(data['message'])
+
+             # Handle top-level tool entries
+             entry_type = data.get('type')
+             if entry_type in ('tool_result', 'tool_use'):
+                 return self._process_tool_entry(data, entry_type)
+
+         except json.JSONDecodeError:
+             logger.debug(f"Skipping invalid JSON at line {line_num}")
+         except (KeyError, TypeError, ValueError) as e:
+             logger.debug(f"Error processing data at line {line_num}: {e}")
+         except Exception as e:
+             logger.warning(f"Unexpected error at line {line_num}: {e}")
+
+         return None
+
+     def _process_message(self, message: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+         """Process a message entry."""
+         role = message.get('role')
+         content = message.get('content')
+
+         if not role or not content:
+             return None
+
+         # Process content
+         text_content = self._extract_text_content(content)
+
+         if not text_content:
+             return None
+
+         # Track message index for user/assistant messages
+         if role in ['user', 'assistant']:
+             message_idx = self.current_message_index
+             self.current_message_index += 1
+         else:
+             message_idx = 0
+
+         return {
+             'role': role,
+             'content': text_content,
+             'message_index': message_idx
+         }
+
+     def _extract_text_content(self, content: Any) -> str:
+         """Extract text content from various content formats."""
+         if isinstance(content, str):
+             return content
+
+         if isinstance(content, list):
+             text_parts = []
+             for item in content:
+                 if isinstance(item, dict):
+                     text = self._process_content_item(item)
+                     if text:
+                         text_parts.append(text)
+                 elif isinstance(item, str):
+                     text_parts.append(item)
+             return '\n'.join(text_parts)
+
+         return ''
+
+     def _process_content_item(self, item: Dict[str, Any]) -> Optional[str]:
+         """Process a single content item."""
+         item_type = item.get('type', '')
+
+         if item_type == 'text':
+             return item.get('text', '')
+         elif item_type == 'thinking':
+             thinking_content = item.get('thinking', '')
+             return f"[Thinking] {thinking_content[:1000]}" if thinking_content else None
+         elif item_type == 'tool_use':
+             tool_name = item.get('name', 'unknown')
+             tool_input = str(item.get('input', ''))[:500]
+             return f"[Tool: {tool_name}] {tool_input}"
+         elif item_type == 'tool_result':
+             result_content = str(item.get('content', ''))[:1000]
+             return f"[Result] {result_content}"
+
+         return None
+
+     def _process_tool_entry(self, data: Dict[str, Any], entry_type: str) -> Optional[Dict[str, Any]]:
+         """Process a top-level tool entry."""
+         text_parts = []
+
+         if entry_type == 'tool_use':
+             tool_name = data.get('name', 'unknown')
+             tool_input = str(data.get('input', ''))[:500]
+             text_parts.append(f"[Tool: {tool_name}] {tool_input}")
+
+         elif entry_type == 'tool_result':
+             result_content = self._extract_tool_result(data)
+             text_parts.append(f"[Result] {result_content[:1000]}")
+
+         content = "\n".join(text_parts)
+         if not content:
+             return None
+
+         message_idx = self.current_message_index
+         self.current_message_index += 1
+
+         return {
+             'role': entry_type,
+             'content': content,
+             'message_index': message_idx
+         }
+
+     def _extract_tool_result(self, data: Dict[str, Any]) -> str:
+         """Extract result content from tool result data."""
+         result_content = data.get('content')
+
+         if isinstance(result_content, list):
+             flat = []
+             for item in result_content:
+                 if isinstance(item, dict) and item.get('type') == 'text':
+                     flat.append(item.get('text', ''))
+                 elif isinstance(item, str):
+                     flat.append(item)
+             result_content = "\n".join(flat)
+
+         if not result_content:
+             result_content = data.get('result', '')
+
+         return str(result_content)
+
+
+ class StreamImportStrategy(ImportStrategy):
+     """
+     Strategy for streaming import with chunked processing.
+     This is the main refactored implementation.
+     """
+
+     def __init__(self, client, process_chunk_fn, state_manager, max_chunk_size: int = 50,
+                  cleanup_tolerance: Optional[int] = None):
+         self.client = client
+         self.process_chunk_fn = process_chunk_fn
+         self.state_manager = state_manager
+         self.max_chunk_size = max_chunk_size
+         # Make cleanup tolerance configurable via environment variable
+         self.cleanup_tolerance = cleanup_tolerance or int(os.getenv('CLEANUP_TOLERANCE', '5'))
+         self.stream_reader = MessageStreamReader()
+
+     def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
+         """Import a JSONL file using streaming strategy."""
+         logger.info(f"Streaming import of {jsonl_file.name}")
+
+         conversation_id = jsonl_file.stem
+
+         # Extract metadata first (lightweight)
+         from metadata_extractor import MetadataExtractor
+         extractor = MetadataExtractor()
+         metadata, created_at, total_messages = extractor.extract_metadata_from_file(str(jsonl_file))
+
+         # Initialize chunk processing
+         chunk_buffer = ChunkBuffer(self.max_chunk_size)
+         chunk_index = 0
+         total_chunks = 0
+
+         try:
+             # Stream and process messages
+             for message in self.stream_reader.read_messages(jsonl_file):
+                 if chunk_buffer.add(message):
+                     # Buffer is full, process chunk
+                     chunks = self._process_buffer(
+                         chunk_buffer, chunk_index, conversation_id,
+                         created_at, metadata, collection_name, project_path, total_messages
+                     )
+                     total_chunks += chunks
+                     chunk_index += 1
+
+                     # Force garbage collection after each chunk
+                     gc.collect()
+
+                     # Log progress
+                     if chunk_index % 10 == 0:
+                         logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
+
+             # Process remaining messages
+             if chunk_buffer.has_content():
+                 chunks = self._process_buffer(
+                     chunk_buffer, chunk_index, conversation_id,
+                     created_at, metadata, collection_name, project_path, total_messages
+                 )
+                 total_chunks += chunks
+
+             # Clean up old points after successful import
+             if total_chunks > 0:
+                 self._cleanup_old_points(conversation_id, collection_name, total_chunks)
+
+             logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
+             return total_chunks
+
+         except (IOError, OSError) as e:
+             logger.error(f"Failed to read file {jsonl_file}: {e}")
+             self._mark_failed(jsonl_file, str(e))
+             return 0
+         except json.JSONDecodeError as e:
+             logger.error(f"Invalid JSON in {jsonl_file}: {e}")
+             self._mark_failed(jsonl_file, str(e))
+             return 0
+         except Exception as e:
+             logger.error(f"Unexpected error importing {jsonl_file}: {e}")
+             self._mark_failed(jsonl_file, str(e))
+             return 0
+
+     def _process_buffer(self, chunk_buffer: ChunkBuffer, chunk_index: int,
+                         conversation_id: str, created_at: str, metadata: Dict[str, Any],
+                         collection_name: str, project_path: Path, total_messages: int) -> int:
+         """Process a buffer of messages and return number of chunks created."""
+         messages = chunk_buffer.get_and_clear()
+         return self.process_chunk_fn(
+             messages, chunk_index, conversation_id,
+             created_at, metadata, collection_name, project_path, total_messages
+         )
+
+     def _cleanup_old_points(self, conversation_id: str, collection_name: str, total_chunks: int):
+         """Clean up old points after successful import."""
+         try:
+             from qdrant_client.models import Filter, FieldCondition, MatchValue
+
+             # Count old points using count API
+             old_count_filter = Filter(
+                 must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
+             )
+
+             # Use count API to get actual count
+             old_count = self.client.count(
+                 collection_name=collection_name,
+                 count_filter=old_count_filter,
+                 exact=True
+             ).count
+
+             if old_count > total_chunks + self.cleanup_tolerance:
+                 # Use filter parameter for delete
+                 self.client.delete(
+                     collection_name=collection_name,
+                     points_selector=Filter(
+                         must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
+                     ),
+                     wait=True
+                 )
+                 logger.info(f"Deleted {old_count - total_chunks} old points for conversation {conversation_id}")
+
+         except ImportError as e:
+             logger.debug(f"Qdrant client import error: {e}")
+         except Exception as e:
+             logger.warning(f"Could not clean up old points for {conversation_id}: {e}")
+
+     def _mark_failed(self, jsonl_file: Path, error: str):
+         """Mark a file as failed in state manager."""
+         try:
+             self.state_manager.mark_file_failed(str(jsonl_file), error)
+         except AttributeError as e:
+             logger.debug(f"State manager method not available: {e}")
+         except Exception as e:
+             logger.warning(f"Unexpected error marking file as failed: {e}")
@@ -0,0 +1,248 @@
+ """
+ Message processor classes for handling different message types in JSONL import.
+ Refactored from extract_metadata_single_pass to reduce complexity.
+ """
+
+ import re
+ import ast
+ import logging
+ from abc import ABC, abstractmethod
+ from typing import Dict, Any, List, Set, Optional
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+ # Constants for metadata limits (can be overridden via environment variables)
+ import os
+
+ MAX_CONCEPTS = int(os.getenv("MAX_CONCEPTS", "10"))
+ MAX_AST_ELEMENTS = int(os.getenv("MAX_AST_ELEMENTS", "30"))
+ MAX_CODE_BLOCKS = int(os.getenv("MAX_CODE_BLOCKS", "5"))
+ MAX_ELEMENTS_PER_BLOCK = int(os.getenv("MAX_ELEMENTS_PER_BLOCK", "10"))
+ MAX_FILES_ANALYZED = int(os.getenv("MAX_FILES_ANALYZED", "20"))
+ MAX_FILES_EDITED = int(os.getenv("MAX_FILES_EDITED", "20"))
+ MAX_TOOLS_USED = int(os.getenv("MAX_TOOLS_USED", "15"))
+ MAX_CONCEPT_MESSAGES = int(os.getenv("MAX_CONCEPT_MESSAGES", "50"))
+
+
+ class MessageProcessor(ABC):
+     """Abstract base class for message processing."""
+
+     @abstractmethod
+     def process(self, item: Any, metadata: Dict[str, Any]) -> Optional[str]:
+         """Process a message item and update metadata."""
+         pass
+
+
+ class TextMessageProcessor(MessageProcessor):
+     """Process text messages and extract code blocks."""
+
+     def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
+         """Process text content and extract code blocks with AST elements."""
+         if item.get('type') != 'text':
+             return None
+
+         text_content = item.get('text', '')
+
+         # Check for code blocks
+         if '```' in text_content:
+             metadata['has_code_blocks'] = True
+             self._extract_code_ast_elements(text_content, metadata)
+
+         return text_content
+
+     def _extract_code_ast_elements(self, text: str, metadata: Dict[str, Any]):
+         """Extract AST elements from code blocks in text."""
+         if 'ast_elements' not in metadata:
+             metadata['ast_elements'] = []
+
+         if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
+             return
+
+         # More permissive regex to handle various fence formats
+         code_blocks = re.findall(r'```[^`\n]*\n?(.*?)```', text, re.DOTALL)
+
+         for code_block in code_blocks[:MAX_CODE_BLOCKS]:
+             if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
+                 break
+
+             ast_elems = extract_ast_elements(code_block)
+             for elem in list(ast_elems)[:MAX_ELEMENTS_PER_BLOCK]:
+                 if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
+                     metadata['ast_elements'].append(elem)
+
+
+ class ThinkingMessageProcessor(MessageProcessor):
+     """Process thinking messages."""
+
+     def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
+         """Process thinking content."""
+         if item.get('type') != 'thinking':
+             return None
+
+         return item.get('thinking', '')
+
+
+ class ToolMessageProcessor(MessageProcessor):
+     """Process tool use messages and extract file references."""
+
+     def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
+         """Process tool use and extract file references."""
+         if item.get('type') != 'tool_use':
+             return None
+
+         tool_name = item.get('name', '')
+
+         # Track tool usage
+         if 'tools_used' not in metadata:
+             metadata['tools_used'] = []
+
+         if tool_name and tool_name not in metadata['tools_used']:
+             if len(metadata['tools_used']) < MAX_TOOLS_USED:
+                 metadata['tools_used'].append(tool_name)
+
+         # Extract file references
+         if 'input' in item:
+             self._extract_file_references(item['input'], tool_name, metadata)
+
+         # Return tool use as text
+         tool_input = str(item.get('input', ''))[:500]
+         return f"[Tool: {tool_name}] {tool_input}"
+
+     def _extract_file_references(self, input_data: Any, tool_name: str, metadata: Dict[str, Any]):
+         """Extract file references from tool input."""
+         if not isinstance(input_data, dict):
+             return
+
+         # Initialize metadata lists if not present
+         if 'files_edited' not in metadata:
+             metadata['files_edited'] = []
+         if 'files_analyzed' not in metadata:
+             metadata['files_analyzed'] = []
+
+         is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
+
+         # Check file_path field
+         if 'file_path' in input_data:
+             file_ref = input_data['file_path']
+             if is_edit:
+                 if file_ref not in metadata['files_edited'] and len(metadata['files_edited']) < MAX_FILES_EDITED:
+                     metadata['files_edited'].append(file_ref)
+             else:
+                 if file_ref not in metadata['files_analyzed'] and len(metadata['files_analyzed']) < MAX_FILES_ANALYZED:
+                     metadata['files_analyzed'].append(file_ref)
+
+         # Check path field (for non-edit tools)
+         if 'path' in input_data and not is_edit:
+             file_ref = input_data['path']
+             if file_ref not in metadata['files_analyzed'] and len(metadata['files_analyzed']) < MAX_FILES_ANALYZED:
+                 metadata['files_analyzed'].append(file_ref)
+
+
+ class ToolResultProcessor(MessageProcessor):
+     """Process tool result messages."""
+
+     def process(self, item: Any, metadata: Dict[str, Any]) -> Optional[str]:
+         """Process tool results."""
+         # Handle both dict items and top-level tool results
+         if isinstance(item, dict):
+             if item.get('type') == 'tool_result':
+                 result_content = str(item.get('content', ''))[:1000]
+                 return f"[Result] {result_content}"
+             elif item.get('type') == 'tool_use':
+                 # Already handled by ToolMessageProcessor
+                 return None
+
+         return None
+
+
+ class MessageProcessorFactory:
+     """Factory for creating appropriate message processors."""
+
+     def __init__(self):
+         self.processors = {
+             'text': TextMessageProcessor(),
+             'thinking': ThinkingMessageProcessor(),
+             'tool_use': ToolMessageProcessor(),
+             'tool_result': ToolResultProcessor()
+         }
+
+     def get_processor(self, message_type: str) -> Optional[MessageProcessor]:
+         """Get the appropriate processor for a message type."""
+         return self.processors.get(message_type)
+
+     def process_content(self, content: Any, metadata: Dict[str, Any]) -> str:
+         """Process content of various types and return text representation."""
+         text_parts = []
+
+         if isinstance(content, list):
+             for item in content:
+                 if isinstance(item, dict):
+                     item_type = item.get('type', '')
+                     processor = self.get_processor(item_type)
+                     if processor:
+                         text = processor.process(item, metadata)
+                         if text:
+                             text_parts.append(text)
+                 elif isinstance(item, str):
+                     text_parts.append(item)
+         elif isinstance(content, str):
+             text_parts.append(content)
+
+         return '\n'.join(text_parts)
+
+
+ def extract_ast_elements(code_text: str) -> Set[str]:
+     """Extract AST elements from Python code."""
+     elements = set()
+
+     try:
+         tree = ast.parse(code_text)
+         for node in ast.walk(tree):
+             if isinstance(node, ast.FunctionDef):
+                 elements.add(f"func:{node.name}")
+             elif isinstance(node, ast.ClassDef):
+                 elements.add(f"class:{node.name}")
+             elif isinstance(node, ast.Import):
+                 for alias in node.names:
+                     elements.add(f"import:{alias.name}")
+             elif isinstance(node, ast.ImportFrom):
+                 module = node.module or ''
+                 for alias in node.names:
+                     elements.add(f"from:{module}.{alias.name}")
+     except (SyntaxError, ValueError):
+         # Not Python code or invalid syntax
+         pass
+
+     return elements
+
+
+ def extract_concepts(text: str) -> List[str]:
+     """Extract key concepts from text using simple heuristics."""
+     concepts = []
+
+     # Common programming concepts
+     concept_patterns = [
+         (r'\b(async|await|promise|future)\b', 'async-programming'),
+         (r'\b(test|spec|jest|pytest|unittest)\b', 'testing'),
+         (r'\b(docker|container|kubernetes|k8s)\b', 'containerization'),
+         (r'\b(api|rest|graphql|endpoint)\b', 'api-development'),
+         (r'\b(react|vue|angular|svelte)\b', 'frontend-framework'),
+         (r'\b(database|sql|postgres|mysql|mongodb)\b', 'database'),
+         (r'\b(auth|authentication|oauth|jwt)\b', 'authentication'),
+         (r'\b(error|exception|bug|fix)\b', 'debugging'),
+         (r'\b(refactor|optimize|performance)\b', 'optimization'),
+         (r'\b(deploy|ci|cd|pipeline)\b', 'deployment')
+     ]
+
+     text_lower = text.lower()
+     seen_concepts = set()
+
+     for pattern, concept in concept_patterns:
+         if re.search(pattern, text_lower) and concept not in seen_concepts:
+             concepts.append(concept)
+             seen_concepts.add(concept)
+             if len(concepts) >= MAX_CONCEPTS:
+                 break
+
+     return concepts
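
For the processor module, a minimal sketch of how MessageProcessorFactory.process_content() and the helper functions compose; message_processors is the module name implied by the import in the first file, and the sample content below is invented for illustration.

from message_processors import MessageProcessorFactory, extract_concepts

factory = MessageProcessorFactory()
metadata = {}

# Content shaped like an assistant message: a text part containing a fenced
# Python snippet, plus a tool_use part that edits a file.
content = [
    {"type": "text",
     "text": "Here is the fix:\n```python\ndef handler():\n    return 42\n```"},
    {"type": "tool_use", "name": "Edit", "input": {"file_path": "app/handler.py"}},
]

text = factory.process_content(content, metadata)
# metadata is filled in as a side effect, capped by the MAX_* limits above:
# has_code_blocks=True, ast_elements=['func:handler'], tools_used=['Edit'],
# files_edited=['app/handler.py'], plus an empty files_analyzed list.
print(metadata)
print(extract_concepts(text))  # ['debugging'] -- "fix" matches the debugging pattern
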