claude-self-reflect 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
+ """Intelligent chunking for conversations."""
+
+ import logging
+ from typing import List, Optional
+ from pathlib import Path
+
+ from ..core import Message, ConversationChunk
+
+ logger = logging.getLogger(__name__)
+
+
+ class Chunker:
+     """
+     Create optimized chunks from conversation messages.
+
+     Implements intelligent chunking with overlap for context preservation.
+     """
+
+     def __init__(self, chunk_size: int = 3000, chunk_overlap: int = 200):
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+
+     def create_chunks(
+         self,
+         messages: List[Message],
+         file_path: str
+     ) -> List[ConversationChunk]:
+         """
+         Create chunks from messages.
+
+         Args:
+             messages: List of messages to chunk
+             file_path: Source file path for metadata
+
+         Returns:
+             List of conversation chunks
+         """
+         if not messages:
+             return []
+
+         # Generate conversation ID from file path
+         conversation_id = Path(file_path).stem
+
+         chunks = []
+         current_chunk_text = []
+         current_chunk_size = 0
+         current_message_indices = []
+
+         for msg in messages:
+             # Format message for chunk
+             formatted = self._format_message(msg)
+             msg_size = len(formatted)
+
+             # Check if adding this message would exceed chunk size
+             if current_chunk_size + msg_size > self.chunk_size and current_chunk_text:
+                 # Create chunk with current messages
+                 chunk = self._create_chunk(
+                     current_chunk_text,
+                     current_message_indices,
+                     len(chunks),
+                     conversation_id,
+                     file_path
+                 )
+                 chunks.append(chunk)
+
+                 # Start new chunk with overlap
+                 overlap_text, overlap_indices = self._get_overlap(
+                     current_chunk_text,
+                     current_message_indices
+                 )
+                 current_chunk_text = overlap_text
+                 current_message_indices = overlap_indices
+                 current_chunk_size = sum(len(t) for t in current_chunk_text)
+
+             # Add message to current chunk
+             current_chunk_text.append(formatted)
+             # Fix: Check for None instead of truthiness to include index 0
+             if msg.message_index is not None:
+                 current_message_indices.append(msg.message_index)
+             current_chunk_size += msg_size
+
+         # Create final chunk
+         if current_chunk_text:
+             chunk = self._create_chunk(
+                 current_chunk_text,
+                 current_message_indices,
+                 len(chunks),
+                 conversation_id,
+                 file_path
+             )
+             chunks.append(chunk)
+
+         # Update total chunks count
+         for chunk in chunks:
+             chunk.total_chunks = len(chunks)
+
+         logger.debug(f"Created {len(chunks)} chunks from {len(messages)} messages")
+         return chunks
+
+     def _format_message(self, message: Message) -> str:
+         """Format a message for inclusion in chunk."""
+         # Include role for context
+         role_prefix = f"[{message.role.upper()}]: "
+         return role_prefix + message.content
+
+     def _get_overlap(
+         self,
+         chunk_text: List[str],
+         message_indices: List[int]
+     ) -> tuple[List[str], List[int]]:
+         """Get overlap text and indices for next chunk."""
+         if not chunk_text:
+             return [], []
+
+         # Calculate how many messages to include in overlap
+         overlap_size = 0
+         overlap_messages = []
+         overlap_indices = []
+
+         # Work backwards to get overlap
+         for i in range(len(chunk_text) - 1, -1, -1):
+             msg_size = len(chunk_text[i])
+             if overlap_size + msg_size <= self.chunk_overlap:
+                 overlap_messages.insert(0, chunk_text[i])
+                 if i < len(message_indices):
+                     overlap_indices.insert(0, message_indices[i])
+                 overlap_size += msg_size
+             else:
+                 break
+
+         return overlap_messages, overlap_indices
+
+     def _create_chunk(
+         self,
+         text_parts: List[str],
+         message_indices: List[int],
+         chunk_index: int,
+         conversation_id: str,
+         file_path: str
+     ) -> ConversationChunk:
+         """Create a conversation chunk."""
+         chunk_text = "\n".join(text_parts)
+
+         chunk = ConversationChunk(
+             text=chunk_text,
+             message_indices=message_indices,
+             chunk_index=chunk_index,
+             total_chunks=0,  # Will be updated later
+             conversation_id=conversation_id
+         )
+
+         # Add file metadata
+         chunk.add_metadata("file_path", file_path)
+         chunk.add_metadata("chunk_method", "overlap")
+         chunk.add_metadata("chunk_size_chars", len(chunk_text))
+
+         return chunk
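
The overlap rule above is worth seeing in isolation: when a chunk fills up, _get_overlap walks backwards from the end of the finished chunk and carries whole formatted messages into the next one until the chunk_overlap budget (200 characters by default) is spent. A minimal standalone sketch of that rule, with illustrative names and data:

def tail_overlap(parts, budget=200):
    # Carry as many whole trailing messages as fit within the overlap budget.
    carried, used = [], 0
    for part in reversed(parts):
        if used + len(part) > budget:
            break
        carried.insert(0, part)
        used += len(part)
    return carried

parts = ["[USER]: short question", "[ASSISTANT]: " + "x" * 180, "[USER]: ok"]
print(tail_overlap(parts))  # ['[USER]: ok']; the 193-char reply does not fit

Note that overlap is message-granular: a trailing message larger than the budget contributes nothing, so chunk boundaries always align with message boundaries.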
@@ -0,0 +1,109 @@
+ """Extract concepts and keywords from text."""
+
+ import re
+ import logging
+ from typing import Dict, Any, Set, List
+ from collections import Counter
+
+ logger = logging.getLogger(__name__)
+
+
+ class ConceptExtractor:
+     """Extract key concepts and keywords from conversation text."""
+
+     def __init__(self):
+         # Technical concepts to look for
+         self.tech_patterns = {
+             'languages': re.compile(r'\b(python|javascript|typescript|java|rust|go|c\+\+|ruby|php|swift|kotlin)\b', re.IGNORECASE),
+             'frameworks': re.compile(r'\b(react|vue|angular|django|flask|fastapi|express|spring|rails|laravel)\b', re.IGNORECASE),
+             'databases': re.compile(r'\b(mongodb|postgres|mysql|redis|elasticsearch|dynamodb|sqlite|cassandra)\b', re.IGNORECASE),
+             'cloud': re.compile(r'\b(aws|azure|gcp|docker|kubernetes|serverless|lambda|ec2|s3)\b', re.IGNORECASE),
+             'tools': re.compile(r'\b(git|npm|yarn|webpack|babel|eslint|pytest|jest|vscode|vim)\b', re.IGNORECASE),
+             'concepts': re.compile(r'\b(api|rest|graphql|microservices|ci\/cd|devops|agile|tdd|security|authentication)\b', re.IGNORECASE)
+         }
+
+         # Common stop words to exclude
+         self.stop_words = {
+             'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+             'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
+             'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
+             'should', 'could', 'may', 'might', 'must', 'can', 'shall', 'need'
+         }
+
+     def extract(self, text: str) -> Dict[str, Any]:
+         """
+         Extract concepts from text.
+
+         Returns:
+             Dictionary with concepts list
+         """
+         concepts = set()
+
+         # Extract technical concepts
+         for category, pattern in self.tech_patterns.items():
+             matches = pattern.findall(text)
+             for match in matches:
+                 concepts.add(match.lower())
+
+         # Extract capitalized terms (likely important)
+         capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
+         for term in capitalized[:20]:  # Limit to prevent noise
+             if term.lower() not in self.stop_words and len(term) > 3:
+                 concepts.add(term.lower())
+
+         # Extract terms in backticks (code references)
+         code_terms = re.findall(r'`([^`]+)`', text)
+         for term in code_terms[:20]:
+             # Clean and add if it's a reasonable concept
+             clean_term = term.strip().lower()
+             if len(clean_term) > 2 and len(clean_term) < 50:
+                 # Skip if it looks like code
+                 if not any(char in clean_term for char in ['{', '}', '(', ')', ';', '=']):
+                     concepts.add(clean_term)
+
+         # Extract file extensions mentioned
+         extensions = re.findall(r'\b\w+\.(py|js|ts|jsx|tsx|java|go|rs|cpp|c|h|md|json|yaml|yml|xml|html|css|sql)\b', text)
+         for ext in extensions[:10]:
+             concepts.add(f"file:{ext.split('.')[-1]}")
+
+         # Extract error types
+         errors = re.findall(r'\b(\w+Error|Exception)\b', text)
+         for error in errors[:10]:
+             concepts.add(f"error:{error.lower()}")
+
+         # Limit total concepts to prevent bloat
+         concept_list = list(concepts)[:50]
+
+         return {
+             "concepts": concept_list,
+             "concept_count": len(concept_list)
+         }
+
+     def extract_topics(self, text: str) -> List[str]:
+         """
+         Extract higher-level topics from text.
+
+         This is a more sophisticated extraction for topic modeling.
+         """
+         topics = []
+
+         # Check for common development topics
+         topic_indicators = {
+             'debugging': ['debug', 'error', 'bug', 'fix', 'issue', 'problem'],
+             'testing': ['test', 'unit test', 'integration', 'pytest', 'jest', 'coverage'],
+             'deployment': ['deploy', 'production', 'release', 'ci/cd', 'pipeline'],
+             'optimization': ['optimize', 'performance', 'speed', 'efficiency', 'cache'],
+             'security': ['security', 'authentication', 'authorization', 'encryption', 'vulnerability'],
+             'database': ['database', 'sql', 'query', 'schema', 'migration'],
+             'api': ['api', 'endpoint', 'rest', 'graphql', 'webhook'],
+             'frontend': ['ui', 'ux', 'component', 'react', 'vue', 'css', 'style'],
+             'backend': ['server', 'backend', 'api', 'database', 'microservice'],
+             'architecture': ['architecture', 'design pattern', 'structure', 'refactor']
+         }
+
+         text_lower = text.lower()
+         for topic, indicators in topic_indicators.items():
+             if any(indicator in text_lower for indicator in indicators):
+                 topics.append(topic)
+
+         return topics[:5]  # Return top 5 topics
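
A usage sketch for the extractor (the hunks in this diff do not name their files, so the import path below is a guess; adjust to the real module path):

# from claude_self_reflect.concepts import ConceptExtractor  # path assumed

extractor = ConceptExtractor()
result = extractor.extract(
    "Fixed a KeyError in `chunker.py` while testing the FastAPI "
    "endpoint against Postgres with pytest."
)
print(sorted(result["concepts"]))
# expect entries such as 'chunker.py', 'error:keyerror', 'fastapi',
# 'file:py', 'postgres', and 'pytest'

Each channel tags its output with a prefix ('file:', 'error:'), so downstream consumers can tell where a concept came from without extra metadata.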
@@ -0,0 +1,181 @@
+ """Parser for JSONL conversation files."""
+
+ import json
+ import logging
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional
+ from datetime import datetime
+
+ from ..core import Message
+ from ..core.exceptions import ParseError
+
+ logger = logging.getLogger(__name__)
+
+
+ class ConversationParser:
+     """
+     Parse JSONL conversation files into Message objects.
+
+     Handles various conversation formats from Claude.
+     """
+
+     def parse_file(self, file_path: Path) -> List[Message]:
+         """
+         Parse a JSONL file into messages.
+
+         Args:
+             file_path: Path to JSONL file
+
+         Returns:
+             List of Message objects
+
+         Raises:
+             ParseError: If parsing fails
+         """
+         messages = []
+
+         try:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 for line_num, line in enumerate(f, 1):
+                     line = line.strip()
+                     if not line:
+                         continue
+
+                     try:
+                         data = json.loads(line)
+                         message = self._parse_message(data, line_num)
+                         if message:
+                             messages.append(message)
+                     except json.JSONDecodeError as e:
+                         logger.warning(f"Skipping invalid JSON at line {line_num}: {e}")
+                         # Don't fail entire file for one bad line
+                         continue
+
+             if not messages:
+                 raise ParseError(
+                     str(file_path),
+                     reason="No valid messages found in file"
+                 )
+
+             # Add message indices
+             for i, msg in enumerate(messages):
+                 msg.message_index = i
+
+             logger.debug(f"Parsed {len(messages)} messages from {file_path}")
+             return messages
+
+         except FileNotFoundError:
+             raise ParseError(str(file_path), reason="File not found")
+         except Exception as e:
+             if isinstance(e, ParseError):
+                 raise
+             raise ParseError(str(file_path), reason=str(e))
+
+     def _parse_message(self, data: Dict[str, Any], line_num: int) -> Optional[Message]:
+         """
+         Parse a single message from JSON data.
+
+         Handles multiple conversation formats.
+         """
+         # Format 1: Direct message format
+         if "role" in data and "content" in data:
+             return Message(
+                 role=data["role"],
+                 content=self._extract_content(data["content"]),
+                 timestamp=self._parse_timestamp(data.get("timestamp")),
+                 metadata=self._extract_metadata(data)
+             )
+
+         # Format 2: Nested messages array
+         if "messages" in data and isinstance(data["messages"], list):
+             # Return first message or aggregate
+             messages = []
+             for msg_data in data["messages"]:
+                 if isinstance(msg_data, dict) and "role" in msg_data:
+                     msg = Message(
+                         role=msg_data["role"],
+                         content=self._extract_content(msg_data.get("content", "")),
+                         timestamp=self._parse_timestamp(msg_data.get("timestamp")),
+                         metadata=self._extract_metadata(msg_data)
+                     )
+                     messages.append(msg)
+
+             # For now, return first message
+             # In future, might want to handle differently
+             return messages[0] if messages else None
+
+         # Format 3: Event-based format
+         if "event" in data and data["event"] == "message":
+             return Message(
+                 role=data.get("role", "unknown"),
+                 content=self._extract_content(data.get("text", "")),
+                 timestamp=self._parse_timestamp(data.get("timestamp")),
+                 metadata=self._extract_metadata(data)
+             )
+
+         # Unknown format
+         logger.debug(f"Unknown message format at line {line_num}")
+         return None
+
+     def _extract_content(self, content: Any) -> str:
+         """Extract text content from various formats."""
+         if isinstance(content, str):
+             return content
+
+         if isinstance(content, list):
+             # Handle content array format
+             text_parts = []
+             for item in content:
+                 if isinstance(item, dict):
+                     if "text" in item:
+                         text_parts.append(item["text"])
+                     elif "content" in item:
+                         text_parts.append(str(item["content"]))
+                 else:
+                     text_parts.append(str(item))
+             return "\n".join(text_parts)
+
+         if isinstance(content, dict):
+             if "text" in content:
+                 return content["text"]
+             elif "content" in content:
+                 return str(content["content"])
+
+         return str(content) if content else ""
+
+     def _parse_timestamp(self, timestamp: Any) -> Optional[datetime]:
+         """Parse timestamp from various formats."""
+         if not timestamp:
+             return None
+
+         if isinstance(timestamp, datetime):
+             return timestamp
+
+         if isinstance(timestamp, (int, float)):
+             # Unix timestamp
+             try:
+                 return datetime.fromtimestamp(timestamp)
+             except Exception:
+                 return None
+
+         if isinstance(timestamp, str):
+             # ISO format or other string formats
+             try:
+                 return datetime.fromisoformat(timestamp)
+             except Exception:
+                 # Try other formats if needed
+                 return None
+
+         return None
+
+     def _extract_metadata(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """Extract additional metadata from message data."""
+         # Skip known fields
+         skip_fields = {"role", "content", "text", "timestamp", "message_index"}
+
+         metadata = {}
+         for key, value in data.items():
+             if key not in skip_fields:
+                 metadata[key] = value
+
+         return metadata
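
A round-trip sketch showing the parser's tolerance for mixed formats and malformed lines (import path again assumed, since the diff omits filenames):

# from claude_self_reflect.parser import ConversationParser  # path assumed
from pathlib import Path

sample = Path("conversation.jsonl")
sample.write_text(
    '{"role": "user", "content": "Why does the build fail?", "timestamp": 1700000000}\n'
    'not valid json\n'
    '{"event": "message", "role": "assistant", "text": "Check the lockfile."}\n'
)

messages = ConversationParser().parse_file(sample)
# The malformed middle line is logged and skipped; the first line parses as
# Format 1, the third as Format 3, and they get message_index 0 and 1.

ParseError is raised only when no line yields a message, so a single corrupt record never sinks a whole transcript.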
@@ -0,0 +1,165 @@
+ """Extract tool usage and file references from conversations."""
+
+ import re
+ import logging
+ from typing import Dict, Any, Set, List
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+
+ class ToolUsageExtractor:
+     """Extract files analyzed, edited, and tools used from conversations."""
+
+     def __init__(self):
+         # Patterns for file operations
+         self.file_patterns = {
+             'analyzed': [
+                 re.compile(r'(?:reading|analyzing|examining|looking at|checking)\s+(?:file\s+)?([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                 re.compile(r'(?:Read|read)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                 re.compile(r'(?:in|from)\s+file\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
+             ],
+             'edited': [
+                 re.compile(r'(?:editing|modifying|updating|changing|writing to)\s+(?:file\s+)?([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                 re.compile(r'(?:Edit|Write)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                 re.compile(r'(?:changes to|modified|updated)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
+             ],
+             'created': [
+                 re.compile(r'(?:creating|created|new file)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                 re.compile(r'(?:Write|Create)\s+new\s+file\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
+             ]
+         }
+
+         # Tool patterns
+         self.tool_patterns = [
+             re.compile(r'(?:using|running|executing)\s+(\w+)\s+(?:tool|command)', re.IGNORECASE),
+             re.compile(r'(?:Tool:|Command:)\s*(\w+)', re.IGNORECASE),
+             re.compile(r'```(?:bash|shell|sh)\n([a-z]+)', re.IGNORECASE),
+             re.compile(r'\$\s+([a-z]+)\s+', re.IGNORECASE),  # Command line
+             re.compile(r'(?:npm|yarn|pip|cargo|go)\s+([\w\-]+)', re.IGNORECASE)
+         ]
+
+         # MCP tool pattern
+         self.mcp_pattern = re.compile(r'mcp__([a-zA-Z0-9\-_]+)__([a-zA-Z0-9\-_]+)')
+
+     def extract(self, text: str) -> Dict[str, Any]:
+         """
+         Extract tool usage information from text.
+
+         Returns:
+             Dictionary with files_analyzed, files_edited, tools_used
+         """
+         files_analyzed = set()
+         files_edited = set()
+         files_created = set()
+         tools_used = set()
+
+         # Extract file operations
+         for pattern in self.file_patterns['analyzed']:
+             matches = pattern.findall(text)
+             for match in matches:
+                 file_path = self._normalize_file_path(match)
+                 if file_path:
+                     files_analyzed.add(file_path)
+
+         for pattern in self.file_patterns['edited']:
+             matches = pattern.findall(text)
+             for match in matches:
+                 file_path = self._normalize_file_path(match)
+                 if file_path:
+                     files_edited.add(file_path)
+
+         for pattern in self.file_patterns['created']:
+             matches = pattern.findall(text)
+             for match in matches:
+                 file_path = self._normalize_file_path(match)
+                 if file_path:
+                     files_created.add(file_path)
+
+         # Extract tools
+         for pattern in self.tool_patterns:
+             matches = pattern.findall(text)
+             for match in matches:
+                 tool = match.lower().strip()
+                 if self._is_valid_tool(tool):
+                     tools_used.add(tool)
+
+         # Extract MCP tools specifically
+         mcp_matches = self.mcp_pattern.findall(text)
+         for server, tool in mcp_matches:
+             tools_used.add(f"mcp:{server}:{tool}")
+
+         # Look for common CLI tools
+         common_tools = [
+             'git', 'npm', 'yarn', 'pip', 'python', 'node', 'docker',
+             'kubectl', 'aws', 'gcloud', 'az', 'terraform', 'ansible',
+             'make', 'gradle', 'maven', 'cargo', 'go', 'rustc'
+         ]
+         for tool in common_tools:
+             if re.search(rf'\b{tool}\b', text, re.IGNORECASE):
+                 tools_used.add(tool)
+
+         # Combine all files for backward compatibility
+         all_files = files_analyzed | files_edited | files_created
+
+         return {
+             "files_analyzed": list(files_analyzed)[:50],
+             "files_edited": list(files_edited)[:50],
+             "files_created": list(files_created)[:50],
+             "files": list(all_files)[:50],  # Legacy field
+             "tools_used": list(tools_used)[:30],
+             "file_count": len(all_files),
+             "tool_count": len(tools_used)
+         }
+
+     def _normalize_file_path(self, path: str) -> str:
+         """Normalize and validate file path."""
+         # Remove quotes and whitespace
+         path = path.strip('\'"` \n')
+
+         # Skip if too short or too long
+         if len(path) < 3 or len(path) > 200:
+             return ""
+
+         # Must have an extension
+         if '.' not in path:
+             return ""
+
+         # Extract just the filename if it's a full path
+         if '/' in path:
+             # Get the last component
+             path = path.split('/')[-1]
+
+         # Validate extension
+         valid_extensions = {
+             'py', 'js', 'ts', 'jsx', 'tsx', 'java', 'go', 'rs', 'cpp', 'c', 'h',
+             'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'html', 'css', 'scss',
+             'sql', 'sh', 'bash', 'dockerfile', 'makefile', 'toml', 'ini', 'cfg'
+         }
+
+         ext = path.split('.')[-1].lower()
+         if ext not in valid_extensions:
+             return ""
+
+         return path
+
+     def _is_valid_tool(self, tool: str) -> bool:
+         """Check if a string is a valid tool name."""
+         # Skip common words
+         skip_words = {
+             'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+             'this', 'that', 'with', 'from', 'as', 'is', 'was', 'are', 'were'
+         }
+
+         if tool in skip_words:
+             return False
+
+         # Must be alphanumeric with possible hyphens/underscores
+         if not re.match(r'^[a-z0-9\-_]+$', tool):
+             return False
+
+         # Reasonable length
+         if len(tool) < 2 or len(tool) > 30:
+             return False
+
+         return True
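
And a sketch for the tool extractor (import path assumed, as above):

# from claude_self_reflect.tools import ToolUsageExtractor  # path assumed

extractor = ToolUsageExtractor()
info = extractor.extract(
    "Reading config.yaml, then editing server.py. Tool: pytest. "
    "Also called mcp__qdrant__search to look up prior results."
)
print(info["files_analyzed"])  # ['config.yaml']
print(info["files_edited"])    # ['server.py']
print(info["tools_used"])      # entries like 'pytest' and 'mcp:qdrant:search'

The per-field caps in the return value ([:50] for files, [:30] for tools) bound payload size when a long transcript mentions many artifacts.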
@@ -0,0 +1,5 @@
+ """State management for import tracking."""
+
+ from .state_manager import StateManager
+
+ __all__ = ["StateManager"]