claude-self-reflect 3.0.0 → 3.0.1
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- package/mcp-server/pyproject.toml +1 -0
- package/package.json +2 -1
- package/scripts/importer/__init__.py +25 -0
- package/scripts/importer/__main__.py +14 -0
- package/scripts/importer/core/__init__.py +25 -0
- package/scripts/importer/core/config.py +120 -0
- package/scripts/importer/core/exceptions.py +52 -0
- package/scripts/importer/core/models.py +184 -0
- package/scripts/importer/embeddings/__init__.py +22 -0
- package/scripts/importer/embeddings/base.py +141 -0
- package/scripts/importer/embeddings/fastembed_provider.py +164 -0
- package/scripts/importer/embeddings/validator.py +136 -0
- package/scripts/importer/embeddings/voyage_provider.py +251 -0
- package/scripts/importer/main.py +393 -0
- package/scripts/importer/processors/__init__.py +15 -0
- package/scripts/importer/processors/ast_extractor.py +197 -0
- package/scripts/importer/processors/chunker.py +157 -0
- package/scripts/importer/processors/concept_extractor.py +109 -0
- package/scripts/importer/processors/conversation_parser.py +181 -0
- package/scripts/importer/processors/tool_extractor.py +165 -0
- package/scripts/importer/state/__init__.py +5 -0
- package/scripts/importer/state/state_manager.py +190 -0
- package/scripts/importer/storage/__init__.py +5 -0
- package/scripts/importer/storage/qdrant_storage.py +250 -0
- package/scripts/importer/utils/__init__.py +9 -0
- package/scripts/importer/utils/logger.py +87 -0
- package/scripts/importer/utils/project_normalizer.py +120 -0
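
Most of the diff is the new `scripts/importer` package. Taken together, the module names sketch an import pipeline: parse JSONL conversation logs into messages, chunk them with overlap, enrich the chunks with concept and tool-usage metadata, then embed and store them in Qdrant. Only the four processor modules are reproduced below, so the wiring in the following sketch is an assumption from the manifest, not code from the package (the real orchestration lives in `scripts/importer/main.py`):

```python
# Hypothetical composition, inferred from the file list above. The class
# names come from the modules reproduced below; the import paths, the glob
# location, and the embed/store step are assumptions.
from pathlib import Path

# from scripts.importer.processors.conversation_parser import ConversationParser
# from scripts.importer.processors.chunker import Chunker
#
# parser, chunker = ConversationParser(), Chunker()
# for jsonl in Path("~/.claude/projects").expanduser().rglob("*.jsonl"):
#     messages = parser.parse_file(jsonl)                   # JSONL -> List[Message]
#     chunks = chunker.create_chunks(messages, str(jsonl))  # overlap chunking
#     # ... embed each chunk (embeddings/) and upsert into Qdrant (storage/)
```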

package/scripts/importer/processors/chunker.py (new file, +157 lines):

```python
"""Intelligent chunking for conversations."""

import logging
from typing import List, Optional
from pathlib import Path

from ..core import Message, ConversationChunk

logger = logging.getLogger(__name__)


class Chunker:
    """
    Create optimized chunks from conversation messages.

    Implements intelligent chunking with overlap for context preservation.
    """

    def __init__(self, chunk_size: int = 3000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def create_chunks(
        self,
        messages: List[Message],
        file_path: str
    ) -> List[ConversationChunk]:
        """
        Create chunks from messages.

        Args:
            messages: List of messages to chunk
            file_path: Source file path for metadata

        Returns:
            List of conversation chunks
        """
        if not messages:
            return []

        # Generate conversation ID from file path
        conversation_id = Path(file_path).stem

        chunks = []
        current_chunk_text = []
        current_chunk_size = 0
        current_message_indices = []

        for msg in messages:
            # Format message for chunk
            formatted = self._format_message(msg)
            msg_size = len(formatted)

            # Check if adding this message would exceed chunk size
            if current_chunk_size + msg_size > self.chunk_size and current_chunk_text:
                # Create chunk with current messages
                chunk = self._create_chunk(
                    current_chunk_text,
                    current_message_indices,
                    len(chunks),
                    conversation_id,
                    file_path
                )
                chunks.append(chunk)

                # Start new chunk with overlap
                overlap_text, overlap_indices = self._get_overlap(
                    current_chunk_text,
                    current_message_indices
                )
                current_chunk_text = overlap_text
                current_message_indices = overlap_indices
                current_chunk_size = sum(len(t) for t in current_chunk_text)

            # Add message to current chunk
            current_chunk_text.append(formatted)
            # Fix: Check for None instead of truthiness to include index 0
            if msg.message_index is not None:
                current_message_indices.append(msg.message_index)
            current_chunk_size += msg_size

        # Create final chunk
        if current_chunk_text:
            chunk = self._create_chunk(
                current_chunk_text,
                current_message_indices,
                len(chunks),
                conversation_id,
                file_path
            )
            chunks.append(chunk)

        # Update total chunks count
        for chunk in chunks:
            chunk.total_chunks = len(chunks)

        logger.debug(f"Created {len(chunks)} chunks from {len(messages)} messages")
        return chunks

    def _format_message(self, message: Message) -> str:
        """Format a message for inclusion in chunk."""
        # Include role for context
        role_prefix = f"[{message.role.upper()}]: "
        return role_prefix + message.content

    def _get_overlap(
        self,
        chunk_text: List[str],
        message_indices: List[int]
    ) -> tuple[List[str], List[int]]:
        """Get overlap text and indices for next chunk."""
        if not chunk_text:
            return [], []

        # Calculate how many messages to include in overlap
        overlap_size = 0
        overlap_messages = []
        overlap_indices = []

        # Work backwards to get overlap
        for i in range(len(chunk_text) - 1, -1, -1):
            msg_size = len(chunk_text[i])
            if overlap_size + msg_size <= self.chunk_overlap:
                overlap_messages.insert(0, chunk_text[i])
                if i < len(message_indices):
                    overlap_indices.insert(0, message_indices[i])
                overlap_size += msg_size
            else:
                break

        return overlap_messages, overlap_indices

    def _create_chunk(
        self,
        text_parts: List[str],
        message_indices: List[int],
        chunk_index: int,
        conversation_id: str,
        file_path: str
    ) -> ConversationChunk:
        """Create a conversation chunk."""
        chunk_text = "\n".join(text_parts)

        chunk = ConversationChunk(
            text=chunk_text,
            message_indices=message_indices,
            chunk_index=chunk_index,
            total_chunks=0,  # Will be updated later
            conversation_id=conversation_id
        )

        # Add file metadata
        chunk.add_metadata("file_path", file_path)
        chunk.add_metadata("chunk_method", "overlap")
        chunk.add_metadata("chunk_size_chars", len(chunk_text))

        return chunk
```
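
The overlap rule is the heart of the chunker: when a chunk fills up, its tail is carried into the next chunk so context survives the split. A standalone sketch of that rule follows; the helper name is hypothetical, and it mirrors `_get_overlap` above, which walks backwards and stops at the first message that would exceed the overlap budget:

```python
# Mirrors the backward walk in Chunker._get_overlap: carry whole trailing
# messages into the next chunk while they still fit inside the overlap budget.
def tail_overlap(parts: list[str], budget: int) -> list[str]:
    carried, used = [], 0
    for part in reversed(parts):
        if used + len(part) > budget:
            break  # the first part that does not fit ends the overlap
        carried.insert(0, part)
        used += len(part)
    return carried

parts = ["[USER]: " + "x" * 100, "[ASSISTANT]: ok, done."]
print(tail_overlap(parts, budget=40))  # ['[ASSISTANT]: ok, done.']
```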

package/scripts/importer/processors/concept_extractor.py (new file, +109 lines):

```python
"""Extract concepts and keywords from text."""

import re
import logging
from typing import Dict, Any, Set, List
from collections import Counter

logger = logging.getLogger(__name__)


class ConceptExtractor:
    """Extract key concepts and keywords from conversation text."""

    def __init__(self):
        # Technical concepts to look for
        self.tech_patterns = {
            'languages': re.compile(r'\b(python|javascript|typescript|java|rust|go|c\+\+|ruby|php|swift|kotlin)\b', re.IGNORECASE),
            'frameworks': re.compile(r'\b(react|vue|angular|django|flask|fastapi|express|spring|rails|laravel)\b', re.IGNORECASE),
            'databases': re.compile(r'\b(mongodb|postgres|mysql|redis|elasticsearch|dynamodb|sqlite|cassandra)\b', re.IGNORECASE),
            'cloud': re.compile(r'\b(aws|azure|gcp|docker|kubernetes|serverless|lambda|ec2|s3)\b', re.IGNORECASE),
            'tools': re.compile(r'\b(git|npm|yarn|webpack|babel|eslint|pytest|jest|vscode|vim)\b', re.IGNORECASE),
            'concepts': re.compile(r'\b(api|rest|graphql|microservices|ci\/cd|devops|agile|tdd|security|authentication)\b', re.IGNORECASE)
        }

        # Common stop words to exclude
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
            'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
            'should', 'could', 'may', 'might', 'must', 'can', 'shall', 'need'
        }

    def extract(self, text: str) -> Dict[str, Any]:
        """
        Extract concepts from text.

        Returns:
            Dictionary with concepts list
        """
        concepts = set()

        # Extract technical concepts
        for category, pattern in self.tech_patterns.items():
            matches = pattern.findall(text)
            for match in matches:
                concepts.add(match.lower())

        # Extract capitalized terms (likely important)
        capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
        for term in capitalized[:20]:  # Limit to prevent noise
            if term.lower() not in self.stop_words and len(term) > 3:
                concepts.add(term.lower())

        # Extract terms in backticks (code references)
        code_terms = re.findall(r'`([^`]+)`', text)
        for term in code_terms[:20]:
            # Clean and add if it's a reasonable concept
            clean_term = term.strip().lower()
            if len(clean_term) > 2 and len(clean_term) < 50:
                # Skip if it looks like code
                if not any(char in clean_term for char in ['{', '}', '(', ')', ';', '=']):
                    concepts.add(clean_term)

        # Extract file extensions mentioned
        extensions = re.findall(r'\b\w+\.(py|js|ts|jsx|tsx|java|go|rs|cpp|c|h|md|json|yaml|yml|xml|html|css|sql)\b', text)
        for ext in extensions[:10]:
            concepts.add(f"file:{ext.split('.')[-1]}")

        # Extract error types
        errors = re.findall(r'\b(\w+Error|Exception)\b', text)
        for error in errors[:10]:
            concepts.add(f"error:{error.lower()}")

        # Limit total concepts to prevent bloat
        concept_list = list(concepts)[:50]

        return {
            "concepts": concept_list,
            "concept_count": len(concept_list)
        }

    def extract_topics(self, text: str) -> List[str]:
        """
        Extract higher-level topics from text.

        This is a more sophisticated extraction for topic modeling.
        """
        topics = []

        # Check for common development topics
        topic_indicators = {
            'debugging': ['debug', 'error', 'bug', 'fix', 'issue', 'problem'],
            'testing': ['test', 'unit test', 'integration', 'pytest', 'jest', 'coverage'],
            'deployment': ['deploy', 'production', 'release', 'ci/cd', 'pipeline'],
            'optimization': ['optimize', 'performance', 'speed', 'efficiency', 'cache'],
            'security': ['security', 'authentication', 'authorization', 'encryption', 'vulnerability'],
            'database': ['database', 'sql', 'query', 'schema', 'migration'],
            'api': ['api', 'endpoint', 'rest', 'graphql', 'webhook'],
            'frontend': ['ui', 'ux', 'component', 'react', 'vue', 'css', 'style'],
            'backend': ['server', 'backend', 'api', 'database', 'microservice'],
            'architecture': ['architecture', 'design pattern', 'structure', 'refactor']
        }

        text_lower = text.lower()
        for topic, indicators in topic_indicators.items():
            if any(indicator in text_lower for indicator in indicators):
                topics.append(topic)

        return topics[:5]  # Return top 5 topics
```
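
Because `ConceptExtractor` has no package-internal dependencies, it can be exercised directly against the class as reproduced above. The sample string and the expectations in the comments are illustrative:

```python
extractor = ConceptExtractor()
sample = "Fixed a TypeError in `auth.py` while wiring the FastAPI endpoint to Redis."

result = extractor.extract(sample)
# result["concepts"] should include hits from several extraction passes:
# 'fastapi' and 'redis' (pattern tables), 'auth.py' (backticked code term),
# 'file:py' (extension scan), and 'error:typeerror' (error-type scan).

print(extractor.extract_topics(sample))
# -> ['debugging', 'api', 'backend']  ('fix' plus the 'api' substring in
#    'fastapi' trip those indicator lists; substring matching is deliberately loose)
```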

package/scripts/importer/processors/conversation_parser.py (new file, +181 lines):

```python
"""Parser for JSONL conversation files."""

import json
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime

from ..core import Message
from ..core.exceptions import ParseError

logger = logging.getLogger(__name__)


class ConversationParser:
    """
    Parse JSONL conversation files into Message objects.

    Handles various conversation formats from Claude.
    """

    def parse_file(self, file_path: Path) -> List[Message]:
        """
        Parse a JSONL file into messages.

        Args:
            file_path: Path to JSONL file

        Returns:
            List of Message objects

        Raises:
            ParseError: If parsing fails
        """
        messages = []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        data = json.loads(line)
                        message = self._parse_message(data, line_num)
                        if message:
                            messages.append(message)
                    except json.JSONDecodeError as e:
                        logger.warning(f"Skipping invalid JSON at line {line_num}: {e}")
                        # Don't fail entire file for one bad line
                        continue

            if not messages:
                raise ParseError(
                    str(file_path),
                    reason="No valid messages found in file"
                )

            # Add message indices
            for i, msg in enumerate(messages):
                msg.message_index = i

            logger.debug(f"Parsed {len(messages)} messages from {file_path}")
            return messages

        except FileNotFoundError:
            raise ParseError(str(file_path), reason="File not found")
        except Exception as e:
            if isinstance(e, ParseError):
                raise
            raise ParseError(str(file_path), reason=str(e))

    def _parse_message(self, data: Dict[str, Any], line_num: int) -> Optional[Message]:
        """
        Parse a single message from JSON data.

        Handles multiple conversation formats.
        """
        # Format 1: Direct message format
        if "role" in data and "content" in data:
            return Message(
                role=data["role"],
                content=self._extract_content(data["content"]),
                timestamp=self._parse_timestamp(data.get("timestamp")),
                metadata=self._extract_metadata(data)
            )

        # Format 2: Nested messages array
        if "messages" in data and isinstance(data["messages"], list):
            # Return first message or aggregate
            messages = []
            for msg_data in data["messages"]:
                if isinstance(msg_data, dict) and "role" in msg_data:
                    msg = Message(
                        role=msg_data["role"],
                        content=self._extract_content(msg_data.get("content", "")),
                        timestamp=self._parse_timestamp(msg_data.get("timestamp")),
                        metadata=self._extract_metadata(msg_data)
                    )
                    messages.append(msg)

            # For now, return first message
            # In future, might want to handle differently
            return messages[0] if messages else None

        # Format 3: Event-based format
        if "event" in data and data["event"] == "message":
            return Message(
                role=data.get("role", "unknown"),
                content=self._extract_content(data.get("text", "")),
                timestamp=self._parse_timestamp(data.get("timestamp")),
                metadata=self._extract_metadata(data)
            )

        # Unknown format
        logger.debug(f"Unknown message format at line {line_num}")
        return None

    def _extract_content(self, content: Any) -> str:
        """Extract text content from various formats."""
        if isinstance(content, str):
            return content

        if isinstance(content, list):
            # Handle content array format
            text_parts = []
            for item in content:
                if isinstance(item, dict):
                    if "text" in item:
                        text_parts.append(item["text"])
                    elif "content" in item:
                        text_parts.append(str(item["content"]))
                else:
                    text_parts.append(str(item))
            return "\n".join(text_parts)

        if isinstance(content, dict):
            if "text" in content:
                return content["text"]
            elif "content" in content:
                return str(content["content"])

        return str(content) if content else ""

    def _parse_timestamp(self, timestamp: Any) -> Optional[datetime]:
        """Parse timestamp from various formats."""
        if not timestamp:
            return None

        if isinstance(timestamp, datetime):
            return timestamp

        if isinstance(timestamp, (int, float)):
            # Unix timestamp
            try:
                return datetime.fromtimestamp(timestamp)
            except Exception:
                return None

        if isinstance(timestamp, str):
            # ISO format or other string formats
            try:
                return datetime.fromisoformat(timestamp)
            except Exception:
                # Try other formats if needed
                return None

        return None

    def _extract_metadata(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract additional metadata from message data."""
        # Skip known fields
        skip_fields = {"role", "content", "text", "timestamp", "message_index"}

        metadata = {}
        for key, value in data.items():
            if key not in skip_fields:
                metadata[key] = value

        return metadata
```
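
A minimal sketch of driving the parser: it writes a throwaway JSONL file covering two of the recognized formats plus one bad line. The commented import path is an assumption taken from the manifest, since the diff does not show how the package is meant to be imported:

```python
import json
import tempfile
from pathlib import Path

rows = [
    json.dumps({"role": "user", "content": "hello"}),                     # Format 1
    json.dumps({"event": "message", "role": "assistant", "text": "hi"}),  # Format 3
    "{not valid json",                                                    # logged and skipped
]
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write("\n".join(rows) + "\n")

# from scripts.importer.processors.conversation_parser import ConversationParser
# messages = ConversationParser().parse_file(Path(f.name))
# -> two Message objects with message_index 0 and 1; the bad line only warns
```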

package/scripts/importer/processors/tool_extractor.py (new file, +165 lines):

```python
"""Extract tool usage and file references from conversations."""

import re
import logging
from typing import Dict, Any, Set, List
from pathlib import Path

logger = logging.getLogger(__name__)


class ToolUsageExtractor:
    """Extract files analyzed, edited, and tools used from conversations."""

    def __init__(self):
        # Patterns for file operations
        self.file_patterns = {
            'analyzed': [
                re.compile(r'(?:reading|analyzing|examining|looking at|checking)\s+(?:file\s+)?([/\w\-\.]+\.\w+)', re.IGNORECASE),
                re.compile(r'(?:Read|read)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
                re.compile(r'(?:in|from)\s+file\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
            ],
            'edited': [
                re.compile(r'(?:editing|modifying|updating|changing|writing to)\s+(?:file\s+)?([/\w\-\.]+\.\w+)', re.IGNORECASE),
                re.compile(r'(?:Edit|Write)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
                re.compile(r'(?:changes to|modified|updated)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
            ],
            'created': [
                re.compile(r'(?:creating|created|new file)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
                re.compile(r'(?:Write|Create)\s+new\s+file\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
            ]
        }

        # Tool patterns
        self.tool_patterns = [
            re.compile(r'(?:using|running|executing)\s+(\w+)\s+(?:tool|command)', re.IGNORECASE),
            re.compile(r'(?:Tool:|Command:)\s*(\w+)', re.IGNORECASE),
            re.compile(r'```(?:bash|shell|sh)\n([a-z]+)', re.IGNORECASE),
            re.compile(r'\$\s+([a-z]+)\s+', re.IGNORECASE),  # Command line
            re.compile(r'(?:npm|yarn|pip|cargo|go|cargo)\s+([\w\-]+)', re.IGNORECASE)
        ]

        # MCP tool pattern
        self.mcp_pattern = re.compile(r'mcp__([a-zA-Z0-9\-_]+)__([a-zA-Z0-9\-_]+)')

    def extract(self, text: str) -> Dict[str, Any]:
        """
        Extract tool usage information from text.

        Returns:
            Dictionary with files_analyzed, files_edited, tools_used
        """
        files_analyzed = set()
        files_edited = set()
        files_created = set()
        tools_used = set()

        # Extract file operations
        for pattern in self.file_patterns['analyzed']:
            matches = pattern.findall(text)
            for match in matches:
                file_path = self._normalize_file_path(match)
                if file_path:
                    files_analyzed.add(file_path)

        for pattern in self.file_patterns['edited']:
            matches = pattern.findall(text)
            for match in matches:
                file_path = self._normalize_file_path(match)
                if file_path:
                    files_edited.add(file_path)

        for pattern in self.file_patterns['created']:
            matches = pattern.findall(text)
            for match in matches:
                file_path = self._normalize_file_path(match)
                if file_path:
                    files_created.add(file_path)

        # Extract tools
        for pattern in self.tool_patterns:
            matches = pattern.findall(text)
            for match in matches:
                tool = match.lower().strip()
                if self._is_valid_tool(tool):
                    tools_used.add(tool)

        # Extract MCP tools specifically
        mcp_matches = self.mcp_pattern.findall(text)
        for server, tool in mcp_matches:
            tools_used.add(f"mcp:{server}:{tool}")

        # Look for common CLI tools
        common_tools = [
            'git', 'npm', 'yarn', 'pip', 'python', 'node', 'docker',
            'kubectl', 'aws', 'gcloud', 'az', 'terraform', 'ansible',
            'make', 'gradle', 'maven', 'cargo', 'go', 'rustc'
        ]
        for tool in common_tools:
            if re.search(rf'\b{tool}\b', text, re.IGNORECASE):
                tools_used.add(tool)

        # Combine all files for backward compatibility
        all_files = files_analyzed | files_edited | files_created

        return {
            "files_analyzed": list(files_analyzed)[:50],
            "files_edited": list(files_edited)[:50],
            "files_created": list(files_created)[:50],
            "files": list(all_files)[:50],  # Legacy field
            "tools_used": list(tools_used)[:30],
            "file_count": len(all_files),
            "tool_count": len(tools_used)
        }

    def _normalize_file_path(self, path: str) -> str:
        """Normalize and validate file path."""
        # Remove quotes and whitespace
        path = path.strip('\'"` \n')

        # Skip if too short or too long
        if len(path) < 3 or len(path) > 200:
            return ""

        # Must have an extension
        if '.' not in path:
            return ""

        # Extract just the filename if it's a full path
        if '/' in path:
            # Get the last component
            path = path.split('/')[-1]

        # Validate extension
        valid_extensions = {
            'py', 'js', 'ts', 'jsx', 'tsx', 'java', 'go', 'rs', 'cpp', 'c', 'h',
            'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'html', 'css', 'scss',
            'sql', 'sh', 'bash', 'dockerfile', 'makefile', 'toml', 'ini', 'cfg'
        }

        ext = path.split('.')[-1].lower()
        if ext not in valid_extensions:
            return ""

        return path

    def _is_valid_tool(self, tool: str) -> bool:
        """Check if a string is a valid tool name."""
        # Skip common words
        skip_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'this', 'that', 'with', 'from', 'as', 'is', 'was', 'are', 'were'
        }

        if tool in skip_words:
            return False

        # Must be alphanumeric with possible hyphens/underscores
        if not re.match(r'^[a-z0-9\-_]+$', tool):
            return False

        # Reasonable length
        if len(tool) < 2 or len(tool) > 30:
            return False

        return True
```
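
`ToolUsageExtractor` is likewise self-contained, so it can be smoke-tested directly against the class above. The sample text (including the MCP tool name) and the expectations in the comments are illustrative:

```python
extractor = ToolUsageExtractor()
report = extractor.extract(
    "Reading config.py, then updated main.py and ran `git commit`. "
    "Also invoked mcp__reflect__search for prior context."
)
# Expected buckets, given the patterns above:
#   files_analyzed -> ['config.py']   ("Reading ..." matches an 'analyzed' pattern)
#   files_edited   -> ['main.py']     ("updated ..." matches an 'edited' pattern)
#   tools_used     -> includes 'git' (common-tools scan) and 'mcp:reflect:search'
print(report["file_count"], report["tool_count"])  # 2 2
```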