claude-self-reflect 3.0.0 → 3.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/claude-self-reflect-test.md +110 -66
- package/README.md +1 -1
- package/installer/setup-wizard.js +4 -2
- package/mcp-server/pyproject.toml +1 -0
- package/mcp-server/src/server.py +84 -0
- package/package.json +2 -1
- package/scripts/import-conversations-unified.py +225 -44
- package/scripts/importer/__init__.py +25 -0
- package/scripts/importer/__main__.py +14 -0
- package/scripts/importer/core/__init__.py +25 -0
- package/scripts/importer/core/config.py +120 -0
- package/scripts/importer/core/exceptions.py +52 -0
- package/scripts/importer/core/models.py +184 -0
- package/scripts/importer/embeddings/__init__.py +22 -0
- package/scripts/importer/embeddings/base.py +141 -0
- package/scripts/importer/embeddings/fastembed_provider.py +164 -0
- package/scripts/importer/embeddings/validator.py +136 -0
- package/scripts/importer/embeddings/voyage_provider.py +251 -0
- package/scripts/importer/main.py +393 -0
- package/scripts/importer/processors/__init__.py +15 -0
- package/scripts/importer/processors/ast_extractor.py +197 -0
- package/scripts/importer/processors/chunker.py +157 -0
- package/scripts/importer/processors/concept_extractor.py +109 -0
- package/scripts/importer/processors/conversation_parser.py +181 -0
- package/scripts/importer/processors/tool_extractor.py +165 -0
- package/scripts/importer/state/__init__.py +5 -0
- package/scripts/importer/state/state_manager.py +190 -0
- package/scripts/importer/storage/__init__.py +5 -0
- package/scripts/importer/storage/qdrant_storage.py +250 -0
- package/scripts/importer/utils/__init__.py +9 -0
- package/scripts/importer/utils/logger.py +87 -0
- package/scripts/importer/utils/project_normalizer.py +120 -0

package/scripts/importer/processors/ast_extractor.py
@@ -0,0 +1,197 @@
+"""Extract AST elements from code blocks."""
+
+import ast
+import re
+import logging
+from typing import Dict, Any, Set, List
+
+logger = logging.getLogger(__name__)
+
+
+class ASTExtractor:
+    """
+    Extract Abstract Syntax Tree elements from code.
+
+    Implements the critical fixes identified in code review:
+    1. More permissive code fence regex
+    2. Python regex fallback for partial code
+    3. Bounded extraction with MAX_AST_ELEMENTS
+    """
+
+    def __init__(self, max_elements: int = 100):
+        self.max_elements = max_elements
+
+        # FIX: More permissive code fence regex to handle various formats
+        # Matches: ```python, ```py, ```javascript, ```ts, etc.
+        self.code_fence_pattern = re.compile(
+            r'```[^\n]*\n?(.*?)```',
+            re.DOTALL
+        )
+
+        # Python patterns for fallback extraction
+        self.python_patterns = {
+            'function': re.compile(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE),
+            'async_function': re.compile(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE),
+            'class': re.compile(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', re.MULTILINE),
+            'method': re.compile(r'^\s+def\s+([A-Za-z_]\w*)\s*\(self', re.MULTILINE),
+            'static_method': re.compile(r'@staticmethod.*?\n\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE | re.DOTALL),
+            'class_method': re.compile(r'@classmethod.*?\n\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE | re.DOTALL)
+        }
+
+        # JavaScript/TypeScript patterns
+        self.js_patterns = {
+            'function': re.compile(r'function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'arrow': re.compile(r'(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>'),
+            'async_function': re.compile(r'async\s+function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'class': re.compile(r'class\s+([A-Za-z_$][\w$]*)\s*(?:extends\s+[A-Za-z_$][\w$]*)?\s*\{'),
+            'method': re.compile(r'([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{'),
+            'export_function': re.compile(r'export\s+(?:default\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'export_const': re.compile(r'export\s+const\s+([A-Za-z_$][\w$]*)\s*=')
+        }
+
+    def extract(self, text: str) -> Dict[str, Any]:
+        """
+        Extract AST elements from text.
+
+        Returns:
+            Dictionary with ast_elements and has_code_blocks keys
+        """
+        elements = set()
+        has_code = False
+
+        # Extract code blocks using permissive regex
+        code_blocks = self.code_fence_pattern.findall(text)
+
+        for code_block in code_blocks[:10]:  # Limit processing
+            has_code = True
+
+            # Try to detect language from content
+            if self._looks_like_python(code_block):
+                python_elements = self._extract_python_ast(code_block)
+                elements.update(python_elements)
+            elif self._looks_like_javascript(code_block):
+                js_elements = self._extract_javascript_patterns(code_block)
+                elements.update(js_elements)
+            else:
+                # Try both as fallback
+                elements.update(self._extract_python_ast(code_block))
+                elements.update(self._extract_javascript_patterns(code_block))
+
+            # FIX: Enforce max elements limit
+            if len(elements) >= self.max_elements:
+                logger.debug(f"Reached max AST elements limit: {self.max_elements}")
+                break
+
+        # Also check for inline code patterns outside of fences
+        if not has_code:
+            # Look for function/class definitions in plain text
+            elements.update(self._extract_inline_patterns(text))
+
+        return {
+            "ast_elements": list(elements)[:self.max_elements],
+            "has_code_blocks": has_code
+        }
+
+    def _extract_python_ast(self, code: str) -> Set[str]:
+        """Extract Python AST elements with fallback to regex."""
+        elements = set()
+
+        try:
+            # Try proper AST parsing first
+            tree = ast.parse(code)
+
+            for node in ast.walk(tree):
+                if len(elements) >= self.max_elements:
+                    break
+
+                if isinstance(node, ast.FunctionDef):
+                    elements.add(f"func:{node.name}")
+                elif isinstance(node, ast.AsyncFunctionDef):
+                    elements.add(f"func:{node.name}")
+                elif isinstance(node, ast.ClassDef):
+                    elements.add(f"class:{node.name}")
+                    # Extract methods
+                    for item in node.body:
+                        if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                            elements.add(f"method:{node.name}.{item.name}")
+                            if len(elements) >= self.max_elements:
+                                break
+
+        except (SyntaxError, ValueError) as e:
+            # FIX: Python regex fallback for partial code fragments
+            logger.debug(f"AST parsing failed, using regex fallback: {e}")
+
+            for pattern_type, pattern in self.python_patterns.items():
+                for match in pattern.finditer(code):
+                    if len(elements) >= self.max_elements:
+                        break
+
+                    name = match.group(1)
+                    if 'method' in pattern_type:
+                        elements.add(f"method:{name}")
+                    elif 'class' in pattern_type:
+                        elements.add(f"class:{name}")
+                    else:
+                        elements.add(f"func:{name}")
+
+        return elements
+
+    def _extract_javascript_patterns(self, code: str) -> Set[str]:
+        """Extract JavaScript/TypeScript patterns."""
+        elements = set()
+
+        for pattern_type, pattern in self.js_patterns.items():
+            for match in pattern.finditer(code):
+                if len(elements) >= self.max_elements:
+                    break
+
+                name = match.group(1)
+                if 'class' in pattern_type:
+                    elements.add(f"class:{name}")
+                elif 'method' in pattern_type and name not in ['constructor', 'if', 'for', 'while']:
+                    elements.add(f"method:{name}")
+                else:
+                    elements.add(f"func:{name}")
+
+        return elements
+
+    def _extract_inline_patterns(self, text: str) -> Set[str]:
+        """Extract patterns from inline code mentions."""
+        elements = set()
+
+        # Look for backtick-wrapped function/class names
+        inline_pattern = re.compile(r'`([A-Za-z_][\w]*(?:\.[A-Za-z_][\w]*)*)`')
+
+        for match in inline_pattern.finditer(text):
+            if len(elements) >= self.max_elements:
+                break
+
+            name = match.group(1)
+            # Heuristic: if contains dot, likely a method
+            if '.' in name:
+                elements.add(f"method:{name}")
+            # Heuristic: PascalCase likely a class
+            elif name[0].isupper():
+                elements.add(f"class:{name}")
+            # Otherwise assume function
+            else:
+                elements.add(f"func:{name}")
+
+        return elements
+
+    def _looks_like_python(self, code: str) -> bool:
+        """Heuristic to detect Python code."""
+        python_indicators = [
+            'def ', 'import ', 'from ', 'class ', 'self.', 'self,',
+            '__init__', '__name__', 'if __name__', 'print(', 'async def'
+        ]
+        return any(indicator in code for indicator in python_indicators)
+
+    def _looks_like_javascript(self, code: str) -> bool:
+        """Heuristic to detect JavaScript/TypeScript."""
+        js_indicators = [
+            'function ', 'const ', 'let ', 'var ', '=>', 'export ',
+            'import ', 'class ', 'constructor(', 'this.', 'async function',
+            'interface ', 'type ', 'namespace ', 'enum '
+        ]
+        return any(indicator in code for indicator in js_indicators)
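
A minimal usage sketch of the new ASTExtractor (not part of the published diff). It assumes `package/scripts` is on `sys.path` so that `importer` resolves as a package; the transcript snippet is hypothetical:

```python
# Sketch only: drive ASTExtractor on a hypothetical transcript snippet.
from importer.processors.ast_extractor import ASTExtractor

fence = "`" * 3  # build the code fence so it doesn't nest inside this example
snippet = (
    "Here is the fix:\n"
    f"{fence}python\n"
    "class Cache:\n"
    "    def get(self, key):\n"
    "        return self.store.get(key)\n"
    f"{fence}\n"
)

extractor = ASTExtractor(max_elements=100)
result = extractor.extract(snippet)
# ast_elements comes from a set, so order varies; expect the values
# {'class:Cache', 'method:Cache.get', 'func:get'} with has_code_blocks == True
print(result)
```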

package/scripts/importer/processors/chunker.py
@@ -0,0 +1,157 @@
+"""Intelligent chunking for conversations."""
+
+import logging
+from typing import List, Optional
+from pathlib import Path
+
+from ..core import Message, ConversationChunk
+
+logger = logging.getLogger(__name__)
+
+
+class Chunker:
+    """
+    Create optimized chunks from conversation messages.
+
+    Implements intelligent chunking with overlap for context preservation.
+    """
+
+    def __init__(self, chunk_size: int = 3000, chunk_overlap: int = 200):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+
+    def create_chunks(
+        self,
+        messages: List[Message],
+        file_path: str
+    ) -> List[ConversationChunk]:
+        """
+        Create chunks from messages.
+
+        Args:
+            messages: List of messages to chunk
+            file_path: Source file path for metadata
+
+        Returns:
+            List of conversation chunks
+        """
+        if not messages:
+            return []
+
+        # Generate conversation ID from file path
+        conversation_id = Path(file_path).stem
+
+        chunks = []
+        current_chunk_text = []
+        current_chunk_size = 0
+        current_message_indices = []
+
+        for msg in messages:
+            # Format message for chunk
+            formatted = self._format_message(msg)
+            msg_size = len(formatted)
+
+            # Check if adding this message would exceed chunk size
+            if current_chunk_size + msg_size > self.chunk_size and current_chunk_text:
+                # Create chunk with current messages
+                chunk = self._create_chunk(
+                    current_chunk_text,
+                    current_message_indices,
+                    len(chunks),
+                    conversation_id,
+                    file_path
+                )
+                chunks.append(chunk)
+
+                # Start new chunk with overlap
+                overlap_text, overlap_indices = self._get_overlap(
+                    current_chunk_text,
+                    current_message_indices
+                )
+                current_chunk_text = overlap_text
+                current_message_indices = overlap_indices
+                current_chunk_size = sum(len(t) for t in current_chunk_text)
+
+            # Add message to current chunk
+            current_chunk_text.append(formatted)
+            # Fix: Check for None instead of truthiness to include index 0
+            if msg.message_index is not None:
+                current_message_indices.append(msg.message_index)
+            current_chunk_size += msg_size
+
+        # Create final chunk
+        if current_chunk_text:
+            chunk = self._create_chunk(
+                current_chunk_text,
+                current_message_indices,
+                len(chunks),
+                conversation_id,
+                file_path
+            )
+            chunks.append(chunk)
+
+        # Update total chunks count
+        for chunk in chunks:
+            chunk.total_chunks = len(chunks)
+
+        logger.debug(f"Created {len(chunks)} chunks from {len(messages)} messages")
+        return chunks
+
+    def _format_message(self, message: Message) -> str:
+        """Format a message for inclusion in chunk."""
+        # Include role for context
+        role_prefix = f"[{message.role.upper()}]: "
+        return role_prefix + message.content
+
+    def _get_overlap(
+        self,
+        chunk_text: List[str],
+        message_indices: List[int]
+    ) -> tuple[List[str], List[int]]:
+        """Get overlap text and indices for next chunk."""
+        if not chunk_text:
+            return [], []
+
+        # Calculate how many messages to include in overlap
+        overlap_size = 0
+        overlap_messages = []
+        overlap_indices = []
+
+        # Work backwards to get overlap
+        for i in range(len(chunk_text) - 1, -1, -1):
+            msg_size = len(chunk_text[i])
+            if overlap_size + msg_size <= self.chunk_overlap:
+                overlap_messages.insert(0, chunk_text[i])
+                if i < len(message_indices):
+                    overlap_indices.insert(0, message_indices[i])
+                overlap_size += msg_size
+            else:
+                break
+
+        return overlap_messages, overlap_indices
+
+    def _create_chunk(
+        self,
+        text_parts: List[str],
+        message_indices: List[int],
+        chunk_index: int,
+        conversation_id: str,
+        file_path: str
+    ) -> ConversationChunk:
+        """Create a conversation chunk."""
+        chunk_text = "\n".join(text_parts)
+
+        chunk = ConversationChunk(
+            text=chunk_text,
+            message_indices=message_indices,
+            chunk_index=chunk_index,
+            total_chunks=0,  # Will be updated later
+            conversation_id=conversation_id
+        )
+
+        # Add file metadata
+        chunk.add_metadata("file_path", file_path)
+        chunk.add_metadata("chunk_method", "overlap")
+        chunk.add_metadata("chunk_size_chars", len(chunk_text))
+
+        return chunk
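
A sketch of how the chunker is driven. The `Message` constructor is an assumption here: its real definition lives in `importer/core/models.py` (added in this release but not shown in this hunk), and this sketch infers only the fields the chunker reads, with `message_index` presumed to default to None:

```python
# Sketch only: Message fields (role, content, message_index) are inferred
# from how Chunker uses them; the real constructor is in importer.core.models.
from importer.core import Message
from importer.processors.chunker import Chunker

messages = [
    Message(role="user", content="Why does the import hang on large JSONL files?"),
    Message(role="assistant", content="Most likely the embedding batch size..."),
]

chunker = Chunker(chunk_size=3000, chunk_overlap=200)
chunks = chunker.create_chunks(messages, file_path="/tmp/session-abc123.jsonl")

for c in chunks:
    # chunk_index/total_chunks are set by create_chunks; text is the joined
    # "[ROLE]: content" lines produced by _format_message
    print(c.chunk_index, c.total_chunks, len(c.text))
```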

package/scripts/importer/processors/concept_extractor.py
@@ -0,0 +1,109 @@
+"""Extract concepts and keywords from text."""
+
+import re
+import logging
+from typing import Dict, Any, Set, List
+from collections import Counter
+
+logger = logging.getLogger(__name__)
+
+
+class ConceptExtractor:
+    """Extract key concepts and keywords from conversation text."""
+
+    def __init__(self):
+        # Technical concepts to look for
+        self.tech_patterns = {
+            'languages': re.compile(r'\b(python|javascript|typescript|java|rust|go|c\+\+|ruby|php|swift|kotlin)\b', re.IGNORECASE),
+            'frameworks': re.compile(r'\b(react|vue|angular|django|flask|fastapi|express|spring|rails|laravel)\b', re.IGNORECASE),
+            'databases': re.compile(r'\b(mongodb|postgres|mysql|redis|elasticsearch|dynamodb|sqlite|cassandra)\b', re.IGNORECASE),
+            'cloud': re.compile(r'\b(aws|azure|gcp|docker|kubernetes|serverless|lambda|ec2|s3)\b', re.IGNORECASE),
+            'tools': re.compile(r'\b(git|npm|yarn|webpack|babel|eslint|pytest|jest|vscode|vim)\b', re.IGNORECASE),
+            'concepts': re.compile(r'\b(api|rest|graphql|microservices|ci\/cd|devops|agile|tdd|security|authentication)\b', re.IGNORECASE)
+        }
+
+        # Common stop words to exclude
+        self.stop_words = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
+            'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
+            'should', 'could', 'may', 'might', 'must', 'can', 'shall', 'need'
+        }
+
+    def extract(self, text: str) -> Dict[str, Any]:
+        """
+        Extract concepts from text.
+
+        Returns:
+            Dictionary with concepts list
+        """
+        concepts = set()
+
+        # Extract technical concepts
+        for category, pattern in self.tech_patterns.items():
+            matches = pattern.findall(text)
+            for match in matches:
+                concepts.add(match.lower())
+
+        # Extract capitalized terms (likely important)
+        capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
+        for term in capitalized[:20]:  # Limit to prevent noise
+            if term.lower() not in self.stop_words and len(term) > 3:
+                concepts.add(term.lower())
+
+        # Extract terms in backticks (code references)
+        code_terms = re.findall(r'`([^`]+)`', text)
+        for term in code_terms[:20]:
+            # Clean and add if it's a reasonable concept
+            clean_term = term.strip().lower()
+            if len(clean_term) > 2 and len(clean_term) < 50:
+                # Skip if it looks like code
+                if not any(char in clean_term for char in ['{', '}', '(', ')', ';', '=']):
+                    concepts.add(clean_term)
+
+        # Extract file extensions mentioned
+        extensions = re.findall(r'\b\w+\.(py|js|ts|jsx|tsx|java|go|rs|cpp|c|h|md|json|yaml|yml|xml|html|css|sql)\b', text)
+        for ext in extensions[:10]:
+            concepts.add(f"file:{ext.split('.')[-1]}")
+
+        # Extract error types
+        errors = re.findall(r'\b(\w+Error|Exception)\b', text)
+        for error in errors[:10]:
+            concepts.add(f"error:{error.lower()}")
+
+        # Limit total concepts to prevent bloat
+        concept_list = list(concepts)[:50]
+
+        return {
+            "concepts": concept_list,
+            "concept_count": len(concept_list)
+        }
+
+    def extract_topics(self, text: str) -> List[str]:
+        """
+        Extract higher-level topics from text.
+
+        This is a more sophisticated extraction for topic modeling.
+        """
+        topics = []
+
+        # Check for common development topics
+        topic_indicators = {
+            'debugging': ['debug', 'error', 'bug', 'fix', 'issue', 'problem'],
+            'testing': ['test', 'unit test', 'integration', 'pytest', 'jest', 'coverage'],
+            'deployment': ['deploy', 'production', 'release', 'ci/cd', 'pipeline'],
+            'optimization': ['optimize', 'performance', 'speed', 'efficiency', 'cache'],
+            'security': ['security', 'authentication', 'authorization', 'encryption', 'vulnerability'],
+            'database': ['database', 'sql', 'query', 'schema', 'migration'],
+            'api': ['api', 'endpoint', 'rest', 'graphql', 'webhook'],
+            'frontend': ['ui', 'ux', 'component', 'react', 'vue', 'css', 'style'],
+            'backend': ['server', 'backend', 'api', 'database', 'microservice'],
+            'architecture': ['architecture', 'design pattern', 'structure', 'refactor']
+        }
+
+        text_lower = text.lower()
+        for topic, indicators in topic_indicators.items():
+            if any(indicator in text_lower for indicator in indicators):
+                topics.append(topic)
+
+        return topics[:5]  # Return top 5 topics
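
A hypothetical run of the concept extractor, showing the output shape (same sys.path assumption as above):

```python
# Sketch only: exercise ConceptExtractor on sample text.
from importer.processors.concept_extractor import ConceptExtractor

extractor = ConceptExtractor()
text = "We hit a KeyError in `parse_config` while deploying the FastAPI service to AWS."

result = extractor.extract(text)
# concepts is built from a set, so order varies; it should include
# 'fastapi', 'aws', 'parse_config', and 'error:keyerror'
print(result["concepts"], result["concept_count"])

# Topic indicators are plain substring matches, so 'api' also fires
# via "FastAPI" and 'error' via "KeyError":
print(extractor.extract_topics(text))
# expected: ['debugging', 'deployment', 'api', 'backend']
```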

package/scripts/importer/processors/conversation_parser.py
@@ -0,0 +1,181 @@
+"""Parser for JSONL conversation files."""
+
+import json
+import logging
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+from ..core import Message
+from ..core.exceptions import ParseError
+
+logger = logging.getLogger(__name__)
+
+
+class ConversationParser:
+    """
+    Parse JSONL conversation files into Message objects.
+
+    Handles various conversation formats from Claude.
+    """
+
+    def parse_file(self, file_path: Path) -> List[Message]:
+        """
+        Parse a JSONL file into messages.
+
+        Args:
+            file_path: Path to JSONL file
+
+        Returns:
+            List of Message objects
+
+        Raises:
+            ParseError: If parsing fails
+        """
+        messages = []
+
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    try:
+                        data = json.loads(line)
+                        message = self._parse_message(data, line_num)
+                        if message:
+                            messages.append(message)
+                    except json.JSONDecodeError as e:
+                        logger.warning(f"Skipping invalid JSON at line {line_num}: {e}")
+                        # Don't fail entire file for one bad line
+                        continue
+
+            if not messages:
+                raise ParseError(
+                    str(file_path),
+                    reason="No valid messages found in file"
+                )
+
+            # Add message indices
+            for i, msg in enumerate(messages):
+                msg.message_index = i
+
+            logger.debug(f"Parsed {len(messages)} messages from {file_path}")
+            return messages
+
+        except FileNotFoundError:
+            raise ParseError(str(file_path), reason="File not found")
+        except Exception as e:
+            if isinstance(e, ParseError):
+                raise
+            raise ParseError(str(file_path), reason=str(e))
+
+    def _parse_message(self, data: Dict[str, Any], line_num: int) -> Optional[Message]:
+        """
+        Parse a single message from JSON data.
+
+        Handles multiple conversation formats.
+        """
+        # Format 1: Direct message format
+        if "role" in data and "content" in data:
+            return Message(
+                role=data["role"],
+                content=self._extract_content(data["content"]),
+                timestamp=self._parse_timestamp(data.get("timestamp")),
+                metadata=self._extract_metadata(data)
+            )
+
+        # Format 2: Nested messages array
+        if "messages" in data and isinstance(data["messages"], list):
+            # Return first message or aggregate
+            messages = []
+            for msg_data in data["messages"]:
+                if isinstance(msg_data, dict) and "role" in msg_data:
+                    msg = Message(
+                        role=msg_data["role"],
+                        content=self._extract_content(msg_data.get("content", "")),
+                        timestamp=self._parse_timestamp(msg_data.get("timestamp")),
+                        metadata=self._extract_metadata(msg_data)
+                    )
+                    messages.append(msg)
+
+            # For now, return first message
+            # In future, might want to handle differently
+            return messages[0] if messages else None
+
+        # Format 3: Event-based format
+        if "event" in data and data["event"] == "message":
+            return Message(
+                role=data.get("role", "unknown"),
+                content=self._extract_content(data.get("text", "")),
+                timestamp=self._parse_timestamp(data.get("timestamp")),
+                metadata=self._extract_metadata(data)
+            )
+
+        # Unknown format
+        logger.debug(f"Unknown message format at line {line_num}")
+        return None
+
+    def _extract_content(self, content: Any) -> str:
+        """Extract text content from various formats."""
+        if isinstance(content, str):
+            return content
+
+        if isinstance(content, list):
+            # Handle content array format
+            text_parts = []
+            for item in content:
+                if isinstance(item, dict):
+                    if "text" in item:
+                        text_parts.append(item["text"])
+                    elif "content" in item:
+                        text_parts.append(str(item["content"]))
+                else:
+                    text_parts.append(str(item))
+            return "\n".join(text_parts)
+
+        if isinstance(content, dict):
+            if "text" in content:
+                return content["text"]
+            elif "content" in content:
+                return str(content["content"])
+
+        return str(content) if content else ""
+
+    def _parse_timestamp(self, timestamp: Any) -> Optional[datetime]:
+        """Parse timestamp from various formats."""
+        if not timestamp:
+            return None
+
+        if isinstance(timestamp, datetime):
+            return timestamp
+
+        if isinstance(timestamp, (int, float)):
+            # Unix timestamp
+            try:
+                return datetime.fromtimestamp(timestamp)
+            except Exception:
+                return None
+
+        if isinstance(timestamp, str):
+            # ISO format or other string formats
+            try:
+                return datetime.fromisoformat(timestamp)
+            except Exception:
+                # Try other formats if needed
+                return None
+
+        return None
+
+    def _extract_metadata(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract additional metadata from message data."""
+        # Skip known fields
+        skip_fields = {"role", "content", "text", "timestamp", "message_index"}
+
+        metadata = {}
+        for key, value in data.items():
+            if key not in skip_fields:
+                metadata[key] = value
+
+        return metadata
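
Finally, a sketch of the parser that feeds the pipeline; the session path is hypothetical and the same sys.path assumption applies:

```python
# Sketch only: parse one Claude session file; the path is hypothetical.
from pathlib import Path

from importer.core.exceptions import ParseError
from importer.processors.conversation_parser import ConversationParser

parser = ConversationParser()
try:
    messages = parser.parse_file(Path("/tmp/session-abc123.jsonl"))
    for msg in messages:
        # message_index is assigned by parse_file after a successful parse
        print(msg.message_index, msg.role, msg.content[:60])
except ParseError as exc:
    # Raised for missing files, unreadable files, or zero valid messages
    print(f"import skipped: {exc}")
```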