claude-self-reflect 5.0.2 → 5.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/csr-validator.md +43 -0
- package/.claude/agents/open-source-maintainer.md +77 -0
- package/docker-compose.yaml +3 -1
- package/installer/setup-wizard-docker.js +64 -9
- package/package.json +6 -1
- package/scripts/ast_grep_final_analyzer.py +16 -6
- package/scripts/csr-status +120 -17
- package/scripts/debug-august-parsing.py +5 -1
- package/scripts/debug-project-resolver.py +3 -3
- package/scripts/doctor.py +342 -0
- package/scripts/embedding_service.py +241 -0
- package/scripts/import-conversations-unified.py +292 -821
- package/scripts/import_strategies.py +344 -0
- package/scripts/message_processors.py +248 -0
- package/scripts/metadata_extractor.py +262 -0
- package/scripts/session_quality_tracker.py +10 -0
- package/scripts/unified_state_manager.py +7 -4
- package/mcp-server/src/test_quality.py +0 -153
package/scripts/import_strategies.py
@@ -0,0 +1,344 @@

"""
Import strategies using Strategy pattern to reduce complexity of stream_import_file.
"""

import json
import gc
import os
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Any, List, Optional, Generator
from datetime import datetime

from message_processors import MessageProcessorFactory

logger = logging.getLogger(__name__)


class ImportStrategy(ABC):
    """Abstract base class for import strategies."""

    @abstractmethod
    def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
        """Import a JSONL file using the specific strategy."""
        pass


class ChunkBuffer:
    """Manages buffering and processing of message chunks."""

    def __init__(self, max_size: int = 50):
        self.buffer: List[Dict[str, Any]] = []
        self.max_size = max_size
        self.current_index = 0
        # Add memory limit for message content
        self.max_content_length = int(os.getenv('MAX_MESSAGE_CONTENT_LENGTH', '5000'))

    def add(self, message: Dict[str, Any]) -> bool:
        """Add a message to the buffer. Returns True if buffer is full."""
        # Truncate long content to prevent memory issues
        if 'content' in message and len(message['content']) > self.max_content_length:
            message = message.copy()
            message['content'] = message['content'][:self.max_content_length] + '...[truncated]'
        self.buffer.append(message)
        return len(self.buffer) >= self.max_size

    def get_and_clear(self) -> List[Dict[str, Any]]:
        """Get buffer contents and clear it."""
        contents = self.buffer.copy()
        self.buffer.clear()
        return contents

    def has_content(self) -> bool:
        """Check if buffer has any content."""
        return len(self.buffer) > 0


class MessageStreamReader:
    """Handles reading and parsing messages from JSONL files."""

    def __init__(self):
        self.processor_factory = MessageProcessorFactory()
        self.current_message_index = 0

    def read_messages(self, file_path: Path) -> Generator[Dict[str, Any], None, None]:
        """Generator that yields processed messages from a JSONL file."""
        self.current_message_index = 0

        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                message = self._parse_line(line, line_num)
                if message:
                    yield message

    def _parse_line(self, line: str, line_num: int) -> Optional[Dict[str, Any]]:
        """Parse a single line and extract message if present."""
        try:
            data = json.loads(line)

            # Skip summary lines
            if data.get('type') == 'summary':
                return None

            # Handle message entries
            if 'message' in data and data['message']:
                return self._process_message(data['message'])

            # Handle top-level tool entries
            entry_type = data.get('type')
            if entry_type in ('tool_result', 'tool_use'):
                return self._process_tool_entry(data, entry_type)

        except json.JSONDecodeError:
            logger.debug(f"Skipping invalid JSON at line {line_num}")
        except (KeyError, TypeError, ValueError) as e:
            logger.debug(f"Error processing data at line {line_num}: {e}")
        except Exception as e:
            logger.warning(f"Unexpected error at line {line_num}: {e}")

        return None

    def _process_message(self, message: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a message entry."""
        role = message.get('role')
        content = message.get('content')

        if not role or not content:
            return None

        # Process content
        text_content = self._extract_text_content(content)

        if not text_content:
            return None

        # Track message index for user/assistant messages
        if role in ['user', 'assistant']:
            message_idx = self.current_message_index
            self.current_message_index += 1
        else:
            message_idx = 0

        return {
            'role': role,
            'content': text_content,
            'message_index': message_idx
        }

    def _extract_text_content(self, content: Any) -> str:
        """Extract text content from various content formats."""
        if isinstance(content, str):
            return content

        if isinstance(content, list):
            text_parts = []
            for item in content:
                if isinstance(item, dict):
                    text = self._process_content_item(item)
                    if text:
                        text_parts.append(text)
                elif isinstance(item, str):
                    text_parts.append(item)
            return '\n'.join(text_parts)

        return ''

    def _process_content_item(self, item: Dict[str, Any]) -> Optional[str]:
        """Process a single content item."""
        item_type = item.get('type', '')

        if item_type == 'text':
            return item.get('text', '')
        elif item_type == 'thinking':
            thinking_content = item.get('thinking', '')
            return f"[Thinking] {thinking_content[:1000]}" if thinking_content else None
        elif item_type == 'tool_use':
            tool_name = item.get('name', 'unknown')
            tool_input = str(item.get('input', ''))[:500]
            return f"[Tool: {tool_name}] {tool_input}"
        elif item_type == 'tool_result':
            result_content = str(item.get('content', ''))[:1000]
            return f"[Result] {result_content}"

        return None

    def _process_tool_entry(self, data: Dict[str, Any], entry_type: str) -> Optional[Dict[str, Any]]:
        """Process a top-level tool entry."""
        text_parts = []

        if entry_type == 'tool_use':
            tool_name = data.get('name', 'unknown')
            tool_input = str(data.get('input', ''))[:500]
            text_parts.append(f"[Tool: {tool_name}] {tool_input}")

        elif entry_type == 'tool_result':
            result_content = self._extract_tool_result(data)
            text_parts.append(f"[Result] {result_content[:1000]}")

        content = "\n".join(text_parts)
        if not content:
            return None

        message_idx = self.current_message_index
        self.current_message_index += 1

        return {
            'role': entry_type,
            'content': content,
            'message_index': message_idx
        }

    def _extract_tool_result(self, data: Dict[str, Any]) -> str:
        """Extract result content from tool result data."""
        result_content = data.get('content')

        if isinstance(result_content, list):
            flat = []
            for item in result_content:
                if isinstance(item, dict) and item.get('type') == 'text':
                    flat.append(item.get('text', ''))
                elif isinstance(item, str):
                    flat.append(item)
            result_content = "\n".join(flat)

        if not result_content:
            result_content = data.get('result', '')

        return str(result_content)


class StreamImportStrategy(ImportStrategy):
    """
    Strategy for streaming import with chunked processing.
    This is the main refactored implementation.
    """

    def __init__(self, client, process_chunk_fn, state_manager, max_chunk_size: int = 50,
                 cleanup_tolerance: int = None):
        self.client = client
        self.process_chunk_fn = process_chunk_fn
        self.state_manager = state_manager
        self.max_chunk_size = max_chunk_size
        # Make cleanup tolerance configurable via environment variable
        self.cleanup_tolerance = cleanup_tolerance or int(os.getenv('CLEANUP_TOLERANCE', '5'))
        self.stream_reader = MessageStreamReader()

    def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
        """Import a JSONL file using streaming strategy."""
        logger.info(f"Streaming import of {jsonl_file.name}")

        conversation_id = jsonl_file.stem

        # Extract metadata first (lightweight)
        from metadata_extractor import MetadataExtractor
        extractor = MetadataExtractor()
        metadata, created_at, total_messages = extractor.extract_metadata_from_file(str(jsonl_file))

        # Initialize chunk processing
        chunk_buffer = ChunkBuffer(self.max_chunk_size)
        chunk_index = 0
        total_chunks = 0

        try:
            # Stream and process messages
            for message in self.stream_reader.read_messages(jsonl_file):
                if chunk_buffer.add(message):
                    # Buffer is full, process chunk
                    chunks = self._process_buffer(
                        chunk_buffer, chunk_index, conversation_id,
                        created_at, metadata, collection_name, project_path, total_messages
                    )
                    total_chunks += chunks
                    chunk_index += 1

                    # Force garbage collection after each chunk
                    gc.collect()

                    # Log progress
                    if chunk_index % 10 == 0:
                        logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")

            # Process remaining messages
            if chunk_buffer.has_content():
                chunks = self._process_buffer(
                    chunk_buffer, chunk_index, conversation_id,
                    created_at, metadata, collection_name, project_path, total_messages
                )
                total_chunks += chunks

            # Clean up old points after successful import
            if total_chunks > 0:
                self._cleanup_old_points(conversation_id, collection_name, total_chunks)

            logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
            return total_chunks

        except (IOError, OSError) as e:
            logger.error(f"Failed to read file {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0
        except Exception as e:
            logger.error(f"Unexpected error importing {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0

    def _process_buffer(self, chunk_buffer: ChunkBuffer, chunk_index: int,
                        conversation_id: str, created_at: str, metadata: Dict[str, Any],
                        collection_name: str, project_path: Path, total_messages: int) -> int:
        """Process a buffer of messages and return number of chunks created."""
        messages = chunk_buffer.get_and_clear()
        return self.process_chunk_fn(
            messages, chunk_index, conversation_id,
            created_at, metadata, collection_name, project_path, total_messages
        )

    def _cleanup_old_points(self, conversation_id: str, collection_name: str, total_chunks: int):
        """Clean up old points after successful import."""
        try:
            from qdrant_client.models import Filter, FieldCondition, MatchValue

            # Count old points using count API
            old_count_filter = Filter(
                must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
            )

            # Use count API to get actual count
            old_count = self.client.count(
                collection_name=collection_name,
                count_filter=old_count_filter,
                exact=True
            ).count

            if old_count > total_chunks + self.cleanup_tolerance:
                # Use filter parameter for delete
                self.client.delete(
                    collection_name=collection_name,
                    points_selector=Filter(
                        must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
                    ),
                    wait=True
                )
                logger.info(f"Deleted {old_count - total_chunks} old points for conversation {conversation_id}")

        except ImportError as e:
            logger.debug(f"Qdrant client import error: {e}")
        except Exception as e:
            logger.warning(f"Could not clean up old points for {conversation_id}: {e}")

    def _mark_failed(self, jsonl_file: Path, error: str):
        """Mark a file as failed in state manager."""
        try:
            self.state_manager.mark_file_failed(str(jsonl_file), error)
        except AttributeError as e:
            logger.debug(f"State manager method not available: {e}")
        except Exception as e:
            logger.warning(f"Unexpected error marking file as failed: {e}")
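
For orientation, a minimal sketch of how this strategy could be driven. It is not part of the package: the Qdrant URL, the embed_and_upsert callback, and the stub state manager are illustrative assumptions standing in for what the refactored import-conversations-unified.py presumably supplies.

# Hypothetical wiring, for illustration only. The real caller provides its own
# chunk callback (embedding + Qdrant upsert) and state manager.
from pathlib import Path
from qdrant_client import QdrantClient
from import_strategies import StreamImportStrategy

def embed_and_upsert(messages, chunk_index, conversation_id,
                     created_at, metadata, collection_name, project_path, total_messages):
    # Placeholder: a real callback would embed `messages` and upsert points
    # into the collection, returning the number of chunks written.
    return 1

class NoopStateManager:
    def mark_file_failed(self, path, error):
        pass  # a real state manager would persist the failure

strategy = StreamImportStrategy(
    client=QdrantClient(url="http://localhost:6333"),  # assumed local Qdrant instance
    process_chunk_fn=embed_and_upsert,
    state_manager=NoopStateManager(),
    max_chunk_size=50,
)
chunks = strategy.import_file(Path("session.jsonl"), "conv_example", Path("/path/to/project"))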
package/scripts/message_processors.py
@@ -0,0 +1,248 @@

"""
Message processor classes for handling different message types in JSONL import.
Refactored from extract_metadata_single_pass to reduce complexity.
"""

import re
import ast
import logging
from abc import ABC, abstractmethod
from typing import Dict, Any, List, Set, Optional
from pathlib import Path

logger = logging.getLogger(__name__)

# Constants for metadata limits (can be overridden via environment variables)
import os

MAX_CONCEPTS = int(os.getenv("MAX_CONCEPTS", "10"))
MAX_AST_ELEMENTS = int(os.getenv("MAX_AST_ELEMENTS", "30"))
MAX_CODE_BLOCKS = int(os.getenv("MAX_CODE_BLOCKS", "5"))
MAX_ELEMENTS_PER_BLOCK = int(os.getenv("MAX_ELEMENTS_PER_BLOCK", "10"))
MAX_FILES_ANALYZED = int(os.getenv("MAX_FILES_ANALYZED", "20"))
MAX_FILES_EDITED = int(os.getenv("MAX_FILES_EDITED", "20"))
MAX_TOOLS_USED = int(os.getenv("MAX_TOOLS_USED", "15"))
MAX_CONCEPT_MESSAGES = int(os.getenv("MAX_CONCEPT_MESSAGES", "50"))


class MessageProcessor(ABC):
    """Abstract base class for message processing."""

    @abstractmethod
    def process(self, item: Any, metadata: Dict[str, Any]) -> Optional[str]:
        """Process a message item and update metadata."""
        pass


class TextMessageProcessor(MessageProcessor):
    """Process text messages and extract code blocks."""

    def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
        """Process text content and extract code blocks with AST elements."""
        if item.get('type') != 'text':
            return None

        text_content = item.get('text', '')

        # Check for code blocks
        if '```' in text_content:
            metadata['has_code_blocks'] = True
            self._extract_code_ast_elements(text_content, metadata)

        return text_content

    def _extract_code_ast_elements(self, text: str, metadata: Dict[str, Any]):
        """Extract AST elements from code blocks in text."""
        if 'ast_elements' not in metadata:
            metadata['ast_elements'] = []

        if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
            return

        # More permissive regex to handle various fence formats
        code_blocks = re.findall(r'```[^`\n]*\n?(.*?)```', text, re.DOTALL)

        for code_block in code_blocks[:MAX_CODE_BLOCKS]:
            if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
                break

            ast_elems = extract_ast_elements(code_block)
            for elem in list(ast_elems)[:MAX_ELEMENTS_PER_BLOCK]:
                if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
                    metadata['ast_elements'].append(elem)


class ThinkingMessageProcessor(MessageProcessor):
    """Process thinking messages."""

    def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
        """Process thinking content."""
        if item.get('type') != 'thinking':
            return None

        return item.get('thinking', '')


class ToolMessageProcessor(MessageProcessor):
    """Process tool use messages and extract file references."""

    def process(self, item: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[str]:
        """Process tool use and extract file references."""
        if item.get('type') != 'tool_use':
            return None

        tool_name = item.get('name', '')

        # Track tool usage
        if 'tools_used' not in metadata:
            metadata['tools_used'] = []

        if tool_name and tool_name not in metadata['tools_used']:
            if len(metadata['tools_used']) < MAX_TOOLS_USED:
                metadata['tools_used'].append(tool_name)

        # Extract file references
        if 'input' in item:
            self._extract_file_references(item['input'], tool_name, metadata)

        # Return tool use as text
        tool_input = str(item.get('input', ''))[:500]
        return f"[Tool: {tool_name}] {tool_input}"

    def _extract_file_references(self, input_data: Any, tool_name: str, metadata: Dict[str, Any]):
        """Extract file references from tool input."""
        if not isinstance(input_data, dict):
            return

        # Initialize metadata lists if not present
        if 'files_edited' not in metadata:
            metadata['files_edited'] = []
        if 'files_analyzed' not in metadata:
            metadata['files_analyzed'] = []

        is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']

        # Check file_path field
        if 'file_path' in input_data:
            file_ref = input_data['file_path']
            if is_edit:
                if file_ref not in metadata['files_edited'] and len(metadata['files_edited']) < MAX_FILES_EDITED:
                    metadata['files_edited'].append(file_ref)
            else:
                if file_ref not in metadata['files_analyzed'] and len(metadata['files_analyzed']) < MAX_FILES_ANALYZED:
                    metadata['files_analyzed'].append(file_ref)

        # Check path field (for non-edit tools)
        if 'path' in input_data and not is_edit:
            file_ref = input_data['path']
            if file_ref not in metadata['files_analyzed'] and len(metadata['files_analyzed']) < MAX_FILES_ANALYZED:
                metadata['files_analyzed'].append(file_ref)


class ToolResultProcessor(MessageProcessor):
    """Process tool result messages."""

    def process(self, item: Any, metadata: Dict[str, Any]) -> Optional[str]:
        """Process tool results."""
        # Handle both dict items and top-level tool results
        if isinstance(item, dict):
            if item.get('type') == 'tool_result':
                result_content = str(item.get('content', ''))[:1000]
                return f"[Result] {result_content}"
            elif item.get('type') == 'tool_use':
                # Already handled by ToolMessageProcessor
                return None

        return None


class MessageProcessorFactory:
    """Factory for creating appropriate message processors."""

    def __init__(self):
        self.processors = {
            'text': TextMessageProcessor(),
            'thinking': ThinkingMessageProcessor(),
            'tool_use': ToolMessageProcessor(),
            'tool_result': ToolResultProcessor()
        }

    def get_processor(self, message_type: str) -> Optional[MessageProcessor]:
        """Get the appropriate processor for a message type."""
        return self.processors.get(message_type)

    def process_content(self, content: Any, metadata: Dict[str, Any]) -> str:
        """Process content of various types and return text representation."""
        text_parts = []

        if isinstance(content, list):
            for item in content:
                if isinstance(item, dict):
                    item_type = item.get('type', '')
                    processor = self.get_processor(item_type)
                    if processor:
                        text = processor.process(item, metadata)
                        if text:
                            text_parts.append(text)
                elif isinstance(item, str):
                    text_parts.append(item)
        elif isinstance(content, str):
            text_parts.append(content)

        return '\n'.join(text_parts)


def extract_ast_elements(code_text: str) -> Set[str]:
    """Extract AST elements from Python code."""
    elements = set()

    try:
        tree = ast.parse(code_text)
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef):
                elements.add(f"func:{node.name}")
            elif isinstance(node, ast.ClassDef):
                elements.add(f"class:{node.name}")
            elif isinstance(node, ast.Import):
                for alias in node.names:
                    elements.add(f"import:{alias.name}")
            elif isinstance(node, ast.ImportFrom):
                module = node.module or ''
                for alias in node.names:
                    elements.add(f"from:{module}.{alias.name}")
    except (SyntaxError, ValueError):
        # Not Python code or invalid syntax
        pass

    return elements


def extract_concepts(text: str) -> List[str]:
    """Extract key concepts from text using simple heuristics."""
    concepts = []

    # Common programming concepts
    concept_patterns = [
        (r'\b(async|await|promise|future)\b', 'async-programming'),
        (r'\b(test|spec|jest|pytest|unittest)\b', 'testing'),
        (r'\b(docker|container|kubernetes|k8s)\b', 'containerization'),
        (r'\b(api|rest|graphql|endpoint)\b', 'api-development'),
        (r'\b(react|vue|angular|svelte)\b', 'frontend-framework'),
        (r'\b(database|sql|postgres|mysql|mongodb)\b', 'database'),
        (r'\b(auth|authentication|oauth|jwt)\b', 'authentication'),
        (r'\b(error|exception|bug|fix)\b', 'debugging'),
        (r'\b(refactor|optimize|performance)\b', 'optimization'),
        (r'\b(deploy|ci|cd|pipeline)\b', 'deployment')
    ]

    text_lower = text.lower()
    seen_concepts = set()

    for pattern, concept in concept_patterns:
        if re.search(pattern, text_lower) and concept not in seen_concepts:
            concepts.append(concept)
            seen_concepts.add(concept)
            if len(concepts) >= MAX_CONCEPTS:
                break

    return concepts
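
A minimal sketch of how the factory and helper functions fit together; the sample content list and the expected outputs shown in comments are illustrative assumptions, not taken from the package.

# Hypothetical usage of MessageProcessorFactory on a Claude Code-style content list.
from message_processors import MessageProcessorFactory, extract_concepts

factory = MessageProcessorFactory()
metadata = {}

# Illustrative assistant-message content: one text block with fenced code, one Edit tool call.
content = [
    {"type": "text", "text": "Here is the fix:\n```python\ndef retry():\n    pass\n```"},
    {"type": "tool_use", "name": "Edit", "input": {"file_path": "app.py"}},
]

text = factory.process_content(content, metadata)
print(metadata["tools_used"])    # ['Edit']
print(metadata["files_edited"])  # ['app.py']
print(extract_concepts(text))    # e.g. ['debugging'], triggered by the word "fix"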