claude-self-reflect 3.0.0 → 3.0.2

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
Files changed (32)
  1. package/.claude/agents/claude-self-reflect-test.md +110 -66
  2. package/README.md +1 -1
  3. package/installer/setup-wizard.js +4 -2
  4. package/mcp-server/pyproject.toml +1 -0
  5. package/mcp-server/src/server.py +84 -0
  6. package/package.json +2 -1
  7. package/scripts/import-conversations-unified.py +225 -44
  8. package/scripts/importer/__init__.py +25 -0
  9. package/scripts/importer/__main__.py +14 -0
  10. package/scripts/importer/core/__init__.py +25 -0
  11. package/scripts/importer/core/config.py +120 -0
  12. package/scripts/importer/core/exceptions.py +52 -0
  13. package/scripts/importer/core/models.py +184 -0
  14. package/scripts/importer/embeddings/__init__.py +22 -0
  15. package/scripts/importer/embeddings/base.py +141 -0
  16. package/scripts/importer/embeddings/fastembed_provider.py +164 -0
  17. package/scripts/importer/embeddings/validator.py +136 -0
  18. package/scripts/importer/embeddings/voyage_provider.py +251 -0
  19. package/scripts/importer/main.py +393 -0
  20. package/scripts/importer/processors/__init__.py +15 -0
  21. package/scripts/importer/processors/ast_extractor.py +197 -0
  22. package/scripts/importer/processors/chunker.py +157 -0
  23. package/scripts/importer/processors/concept_extractor.py +109 -0
  24. package/scripts/importer/processors/conversation_parser.py +181 -0
  25. package/scripts/importer/processors/tool_extractor.py +165 -0
  26. package/scripts/importer/state/__init__.py +5 -0
  27. package/scripts/importer/state/state_manager.py +190 -0
  28. package/scripts/importer/storage/__init__.py +5 -0
  29. package/scripts/importer/storage/qdrant_storage.py +250 -0
  30. package/scripts/importer/utils/__init__.py +9 -0
  31. package/scripts/importer/utils/logger.py +87 -0
  32. package/scripts/importer/utils/project_normalizer.py +120 -0
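The new files replace the monolithic import script with a modular importer package. A minimal sketch of how the pieces shown in the hunks below might compose (import paths assume package/scripts is on sys.path, and the file name is illustrative; the real entry point is importer/main.py, whose API is not reproduced here):

    from pathlib import Path

    from importer.processors.conversation_parser import ConversationParser
    from importer.processors.chunker import Chunker
    from importer.processors.ast_extractor import ASTExtractor

    # Parse a Claude session log, chunk it, and attach AST metadata.
    messages = ConversationParser().parse_file(Path("session.jsonl"))
    chunks = Chunker(chunk_size=3000, chunk_overlap=200).create_chunks(
        messages, file_path="session.jsonl"
    )
    extractor = ASTExtractor(max_elements=100)
    for chunk in chunks:
        chunk.add_metadata("ast", extractor.extract(chunk.text))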
package/scripts/importer/processors/ast_extractor.py
@@ -0,0 +1,197 @@
+"""Extract AST elements from code blocks."""
+
+import ast
+import re
+import logging
+from typing import Dict, Any, Set, List
+
+logger = logging.getLogger(__name__)
+
+
+class ASTExtractor:
+    """
+    Extract Abstract Syntax Tree elements from code.
+
+    Implements the critical fixes identified in code review:
+    1. More permissive code fence regex
+    2. Python regex fallback for partial code
+    3. Bounded extraction with MAX_AST_ELEMENTS
+    """
+
+    def __init__(self, max_elements: int = 100):
+        self.max_elements = max_elements
+
+        # FIX: More permissive code fence regex to handle various formats
+        # Matches: ```python, ```py, ```javascript, ```ts, bare ```, etc.
+        self.code_fence_pattern = re.compile(
+            r'```[^\n]*\n?(.*?)```',
+            re.DOTALL
+        )
+
+        # Python patterns for fallback extraction
+        self.python_patterns = {
+            'function': re.compile(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE),
+            'async_function': re.compile(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE),
+            'class': re.compile(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', re.MULTILINE),
+            'method': re.compile(r'^\s+def\s+([A-Za-z_]\w*)\s*\(self', re.MULTILINE),
+            'static_method': re.compile(r'@staticmethod.*?\n\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE | re.DOTALL),
+            'class_method': re.compile(r'@classmethod.*?\n\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE | re.DOTALL)
+        }
+
+        # JavaScript/TypeScript patterns
+        self.js_patterns = {
+            'function': re.compile(r'function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'arrow': re.compile(r'(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>'),
+            'async_function': re.compile(r'async\s+function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'class': re.compile(r'class\s+([A-Za-z_$][\w$]*)\s*(?:extends\s+[A-Za-z_$][\w$]*)?\s*\{'),
+            'method': re.compile(r'([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{'),
+            'export_function': re.compile(r'export\s+(?:default\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'export_const': re.compile(r'export\s+const\s+([A-Za-z_$][\w$]*)\s*=')
+        }
+
+    def extract(self, text: str) -> Dict[str, Any]:
+        """
+        Extract AST elements from text.
+
+        Returns:
+            Dictionary with ast_elements and has_code_blocks keys
+        """
+        elements = set()
+        has_code = False
+
+        # Extract code blocks using permissive regex
+        code_blocks = self.code_fence_pattern.findall(text)
+
+        for code_block in code_blocks[:10]:  # Limit processing
+            has_code = True
+
+            # Try to detect language from content
+            if self._looks_like_python(code_block):
+                python_elements = self._extract_python_ast(code_block)
+                elements.update(python_elements)
+            elif self._looks_like_javascript(code_block):
+                js_elements = self._extract_javascript_patterns(code_block)
+                elements.update(js_elements)
+            else:
+                # Try both as fallback
+                elements.update(self._extract_python_ast(code_block))
+                elements.update(self._extract_javascript_patterns(code_block))
+
+            # FIX: Enforce max elements limit
+            if len(elements) >= self.max_elements:
+                logger.debug(f"Reached max AST elements limit: {self.max_elements}")
+                break
+
+        # Also check for inline code patterns outside of fences
+        if not has_code:
+            # Look for function/class definitions in plain text
+            elements.update(self._extract_inline_patterns(text))
+
+        return {
+            "ast_elements": list(elements)[:self.max_elements],
+            "has_code_blocks": has_code
+        }
+
+    def _extract_python_ast(self, code: str) -> Set[str]:
+        """Extract Python AST elements with fallback to regex."""
+        elements = set()
+
+        try:
+            # Try proper AST parsing first
+            tree = ast.parse(code)
+
+            for node in ast.walk(tree):
+                if len(elements) >= self.max_elements:
+                    break
+
+                if isinstance(node, ast.FunctionDef):
+                    elements.add(f"func:{node.name}")
+                elif isinstance(node, ast.AsyncFunctionDef):
+                    elements.add(f"func:{node.name}")
+                elif isinstance(node, ast.ClassDef):
+                    elements.add(f"class:{node.name}")
+                    # Extract methods
+                    for item in node.body:
+                        if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                            elements.add(f"method:{node.name}.{item.name}")
+                            if len(elements) >= self.max_elements:
+                                break
+
+        except (SyntaxError, ValueError) as e:
+            # FIX: Python regex fallback for partial code fragments
+            logger.debug(f"AST parsing failed, using regex fallback: {e}")
+
+            for pattern_type, pattern in self.python_patterns.items():
+                for match in pattern.finditer(code):
+                    if len(elements) >= self.max_elements:
+                        break
+
+                    name = match.group(1)
+                    if 'method' in pattern_type:
+                        elements.add(f"method:{name}")
+                    elif 'class' in pattern_type:
+                        elements.add(f"class:{name}")
+                    else:
+                        elements.add(f"func:{name}")
+
+        return elements
+
+    def _extract_javascript_patterns(self, code: str) -> Set[str]:
+        """Extract JavaScript/TypeScript patterns."""
+        elements = set()
+
+        for pattern_type, pattern in self.js_patterns.items():
+            for match in pattern.finditer(code):
+                if len(elements) >= self.max_elements:
+                    break
+
+                name = match.group(1)
+                if 'class' in pattern_type:
+                    elements.add(f"class:{name}")
+                elif 'method' in pattern_type:
+                    # Skip keywords the loose method regex also matches
+                    if name not in ('constructor', 'if', 'for', 'while'):
+                        elements.add(f"method:{name}")
+                else:
+                    elements.add(f"func:{name}")
+
+        return elements
+
+    def _extract_inline_patterns(self, text: str) -> Set[str]:
+        """Extract patterns from inline code mentions."""
+        elements = set()
+
+        # Look for backtick-wrapped function/class names
+        inline_pattern = re.compile(r'`([A-Za-z_][\w]*(?:\.[A-Za-z_][\w]*)*)`')
+
+        for match in inline_pattern.finditer(text):
+            if len(elements) >= self.max_elements:
+                break
+
+            name = match.group(1)
+            # Heuristic: if it contains a dot, it is likely a method
+            if '.' in name:
+                elements.add(f"method:{name}")
+            # Heuristic: PascalCase is likely a class
+            elif name[0].isupper():
+                elements.add(f"class:{name}")
+            # Otherwise assume a function
+            else:
+                elements.add(f"func:{name}")
+
+        return elements
+
+    def _looks_like_python(self, code: str) -> bool:
+        """Heuristic to detect Python code."""
+        python_indicators = [
+            'def ', 'import ', 'from ', 'class ', 'self.', 'self,',
+            '__init__', '__name__', 'if __name__', 'print(', 'async def'
+        ]
+        return any(indicator in code for indicator in python_indicators)
+
+    def _looks_like_javascript(self, code: str) -> bool:
+        """Heuristic to detect JavaScript/TypeScript."""
+        js_indicators = [
+            'function ', 'const ', 'let ', 'var ', '=>', 'export ',
+            'import ', 'class ', 'constructor(', 'this.', 'async function',
+            'interface ', 'type ', 'namespace ', 'enum '
+        ]
+        return any(indicator in code for indicator in js_indicators)
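A short sketch of the fallback path in action: complete code goes through ast.parse, while a truncated fragment (which ast.parse rejects) is still mined by the regex patterns:

    extractor = ASTExtractor(max_elements=100)

    # An unterminated class body makes ast.parse raise SyntaxError,
    # so the Python regex fallback recovers the names instead.
    snippet = "Here is the fix:\n```python\ndef retry(task):\n    pass\nclass Worker:\n```"
    result = extractor.extract(snippet)
    # Expected: "func:retry" and "class:Worker" in result["ast_elements"],
    # with result["has_code_blocks"] == True.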
package/scripts/importer/processors/chunker.py
@@ -0,0 +1,157 @@
+"""Intelligent chunking for conversations."""
+
+import logging
+from typing import List, Optional
+from pathlib import Path
+
+from ..core import Message, ConversationChunk
+
+logger = logging.getLogger(__name__)
+
+
+class Chunker:
+    """
+    Create optimized chunks from conversation messages.
+
+    Implements intelligent chunking with overlap for context preservation.
+    """
+
+    def __init__(self, chunk_size: int = 3000, chunk_overlap: int = 200):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+
+    def create_chunks(
+        self,
+        messages: List[Message],
+        file_path: str
+    ) -> List[ConversationChunk]:
+        """
+        Create chunks from messages.
+
+        Args:
+            messages: List of messages to chunk
+            file_path: Source file path for metadata
+
+        Returns:
+            List of conversation chunks
+        """
+        if not messages:
+            return []
+
+        # Generate conversation ID from file path
+        conversation_id = Path(file_path).stem
+
+        chunks = []
+        current_chunk_text = []
+        current_chunk_size = 0
+        current_message_indices = []
+
+        for msg in messages:
+            # Format message for chunk
+            formatted = self._format_message(msg)
+            msg_size = len(formatted)
+
+            # Check if adding this message would exceed chunk size
+            if current_chunk_size + msg_size > self.chunk_size and current_chunk_text:
+                # Create chunk with current messages
+                chunk = self._create_chunk(
+                    current_chunk_text,
+                    current_message_indices,
+                    len(chunks),
+                    conversation_id,
+                    file_path
+                )
+                chunks.append(chunk)
+
+                # Start new chunk with overlap
+                overlap_text, overlap_indices = self._get_overlap(
+                    current_chunk_text,
+                    current_message_indices
+                )
+                current_chunk_text = overlap_text
+                current_message_indices = overlap_indices
+                current_chunk_size = sum(len(t) for t in current_chunk_text)
+
+            # Add message to current chunk
+            current_chunk_text.append(formatted)
+            # Fix: Check for None instead of truthiness to include index 0
+            if msg.message_index is not None:
+                current_message_indices.append(msg.message_index)
+            current_chunk_size += msg_size
+
+        # Create final chunk
+        if current_chunk_text:
+            chunk = self._create_chunk(
+                current_chunk_text,
+                current_message_indices,
+                len(chunks),
+                conversation_id,
+                file_path
+            )
+            chunks.append(chunk)
+
+        # Update total chunks count
+        for chunk in chunks:
+            chunk.total_chunks = len(chunks)
+
+        logger.debug(f"Created {len(chunks)} chunks from {len(messages)} messages")
+        return chunks
+
+    def _format_message(self, message: Message) -> str:
+        """Format a message for inclusion in a chunk."""
+        # Include role for context
+        role_prefix = f"[{message.role.upper()}]: "
+        return role_prefix + message.content
+
+    def _get_overlap(
+        self,
+        chunk_text: List[str],
+        message_indices: List[int]
+    ) -> tuple[List[str], List[int]]:
+        """Get overlap text and indices for the next chunk."""
+        if not chunk_text:
+            return [], []
+
+        # Calculate how many messages to include in overlap
+        overlap_size = 0
+        overlap_messages = []
+        overlap_indices = []
+
+        # Work backwards to get overlap
+        for i in range(len(chunk_text) - 1, -1, -1):
+            msg_size = len(chunk_text[i])
+            if overlap_size + msg_size <= self.chunk_overlap:
+                overlap_messages.insert(0, chunk_text[i])
+                if i < len(message_indices):
+                    overlap_indices.insert(0, message_indices[i])
+                overlap_size += msg_size
+            else:
+                break
+
+        return overlap_messages, overlap_indices
+
+    def _create_chunk(
+        self,
+        text_parts: List[str],
+        message_indices: List[int],
+        chunk_index: int,
+        conversation_id: str,
+        file_path: str
+    ) -> ConversationChunk:
+        """Create a conversation chunk."""
+        chunk_text = "\n".join(text_parts)
+
+        chunk = ConversationChunk(
+            text=chunk_text,
+            message_indices=message_indices,
+            chunk_index=chunk_index,
+            total_chunks=0,  # Will be updated later
+            conversation_id=conversation_id
+        )
+
+        # Add file metadata
+        chunk.add_metadata("file_path", file_path)
+        chunk.add_metadata("chunk_method", "overlap")
+        chunk.add_metadata("chunk_size_chars", len(chunk_text))
+
+        return chunk
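A sketch of the overlap behavior, assuming the Message model from importer.core (field names inferred from its use here; constructing one with only role and content assumes the other fields default):

    from importer.core import Message
    from importer.processors.chunker import Chunker

    # Ten short messages; after role-prefix formatting each is ~66 chars,
    # so about three fit in a 200-char chunk.
    msgs = [Message(role="user", content=f"step {i}: " + "x" * 50) for i in range(10)]
    for i, m in enumerate(msgs):
        m.message_index = i

    chunks = Chunker(chunk_size=200, chunk_overlap=80).create_chunks(msgs, "demo.jsonl")
    # Each chunk after the first re-opens with the previous chunk's trailing
    # message (whole messages only, up to 80 chars of overlap), and once the
    # loop ends every chunk reports the same total_chunks.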
package/scripts/importer/processors/concept_extractor.py
@@ -0,0 +1,109 @@
+"""Extract concepts and keywords from text."""
+
+import re
+import logging
+from typing import Dict, Any, Set, List
+from collections import Counter
+
+logger = logging.getLogger(__name__)
+
+
+class ConceptExtractor:
+    """Extract key concepts and keywords from conversation text."""
+
+    def __init__(self):
+        # Technical concepts to look for
+        self.tech_patterns = {
+            'languages': re.compile(r'\b(python|javascript|typescript|java|rust|go|c\+\+|ruby|php|swift|kotlin)\b', re.IGNORECASE),
+            'frameworks': re.compile(r'\b(react|vue|angular|django|flask|fastapi|express|spring|rails|laravel)\b', re.IGNORECASE),
+            'databases': re.compile(r'\b(mongodb|postgres|mysql|redis|elasticsearch|dynamodb|sqlite|cassandra)\b', re.IGNORECASE),
+            'cloud': re.compile(r'\b(aws|azure|gcp|docker|kubernetes|serverless|lambda|ec2|s3)\b', re.IGNORECASE),
+            'tools': re.compile(r'\b(git|npm|yarn|webpack|babel|eslint|pytest|jest|vscode|vim)\b', re.IGNORECASE),
+            'concepts': re.compile(r'\b(api|rest|graphql|microservices|ci\/cd|devops|agile|tdd|security|authentication)\b', re.IGNORECASE)
+        }
+
+        # Common stop words to exclude
+        self.stop_words = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
+            'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
+            'should', 'could', 'may', 'might', 'must', 'can', 'shall', 'need'
+        }
+
+    def extract(self, text: str) -> Dict[str, Any]:
+        """
+        Extract concepts from text.
+
+        Returns:
+            Dictionary with concepts list
+        """
+        concepts = set()
+
+        # Extract technical concepts
+        for category, pattern in self.tech_patterns.items():
+            matches = pattern.findall(text)
+            for match in matches:
+                concepts.add(match.lower())
+
+        # Extract capitalized terms (likely important)
+        capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
+        for term in capitalized[:20]:  # Limit to prevent noise
+            if term.lower() not in self.stop_words and len(term) > 3:
+                concepts.add(term.lower())
+
+        # Extract terms in backticks (code references)
+        code_terms = re.findall(r'`([^`]+)`', text)
+        for term in code_terms[:20]:
+            # Clean and add if it's a reasonable concept
+            clean_term = term.strip().lower()
+            if 2 < len(clean_term) < 50:
+                # Skip if it looks like code
+                if not any(char in clean_term for char in ['{', '}', '(', ')', ';', '=']):
+                    concepts.add(clean_term)
+
+        # Extract file extensions mentioned (findall returns only the
+        # captured extension group, e.g. "py" for "utils.py")
+        extensions = re.findall(r'\b\w+\.(py|js|ts|jsx|tsx|java|go|rs|cpp|c|h|md|json|yaml|yml|xml|html|css|sql)\b', text)
+        for ext in extensions[:10]:
+            concepts.add(f"file:{ext}")
+
+        # Extract error types
+        errors = re.findall(r'\b(\w+Error|Exception)\b', text)
+        for error in errors[:10]:
+            concepts.add(f"error:{error.lower()}")
+
+        # Limit total concepts to prevent bloat
+        concept_list = list(concepts)[:50]
+
+        return {
+            "concepts": concept_list,
+            "concept_count": len(concept_list)
+        }
+
+    def extract_topics(self, text: str) -> List[str]:
+        """
+        Extract higher-level topics from text.
+
+        This is a more sophisticated extraction for topic modeling.
+        """
+        topics = []
+
+        # Check for common development topics
+        topic_indicators = {
+            'debugging': ['debug', 'error', 'bug', 'fix', 'issue', 'problem'],
+            'testing': ['test', 'unit test', 'integration', 'pytest', 'jest', 'coverage'],
+            'deployment': ['deploy', 'production', 'release', 'ci/cd', 'pipeline'],
+            'optimization': ['optimize', 'performance', 'speed', 'efficiency', 'cache'],
+            'security': ['security', 'authentication', 'authorization', 'encryption', 'vulnerability'],
+            'database': ['database', 'sql', 'query', 'schema', 'migration'],
+            'api': ['api', 'endpoint', 'rest', 'graphql', 'webhook'],
+            'frontend': ['ui', 'ux', 'component', 'react', 'vue', 'css', 'style'],
+            'backend': ['server', 'backend', 'api', 'database', 'microservice'],
+            'architecture': ['architecture', 'design pattern', 'structure', 'refactor']
+        }
+
+        text_lower = text.lower()
+        for topic, indicators in topic_indicators.items():
+            if any(indicator in text_lower for indicator in indicators):
+                topics.append(topic)
+
+        return topics[:5]  # Return top 5 topics
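For a feel of the output, a hedged example (exact contents depend on the regexes above):

    extractor = ConceptExtractor()
    report = extractor.extract(
        "Hit a TimeoutError calling the FastAPI endpoint; the fix was to "
        "cache the Postgres query in redis. See `app/db.py`."
    )
    # report["concepts"] should include "fastapi", "postgres", "redis",
    # "error:timeouterror", and "file:py"; backticked terms such as
    # "app/db.py" come through the code-reference branch.

    topics = extractor.extract_topics("fix the bug, then cache the query")
    # Substring matching flags "debugging", "optimization", and "database".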
package/scripts/importer/processors/conversation_parser.py
@@ -0,0 +1,181 @@
+"""Parser for JSONL conversation files."""
+
+import json
+import logging
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+from ..core import Message
+from ..core.exceptions import ParseError
+
+logger = logging.getLogger(__name__)
+
+
+class ConversationParser:
+    """
+    Parse JSONL conversation files into Message objects.
+
+    Handles various conversation formats from Claude.
+    """
+
+    def parse_file(self, file_path: Path) -> List[Message]:
+        """
+        Parse a JSONL file into messages.
+
+        Args:
+            file_path: Path to JSONL file
+
+        Returns:
+            List of Message objects
+
+        Raises:
+            ParseError: If parsing fails
+        """
+        messages = []
+
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    try:
+                        data = json.loads(line)
+                        message = self._parse_message(data, line_num)
+                        if message:
+                            messages.append(message)
+                    except json.JSONDecodeError as e:
+                        logger.warning(f"Skipping invalid JSON at line {line_num}: {e}")
+                        # Don't fail the entire file for one bad line
+                        continue
+
+            if not messages:
+                raise ParseError(
+                    str(file_path),
+                    reason="No valid messages found in file"
+                )
+
+            # Add message indices
+            for i, msg in enumerate(messages):
+                msg.message_index = i
+
+            logger.debug(f"Parsed {len(messages)} messages from {file_path}")
+            return messages
+
+        except FileNotFoundError:
+            raise ParseError(str(file_path), reason="File not found")
+        except Exception as e:
+            if isinstance(e, ParseError):
+                raise
+            raise ParseError(str(file_path), reason=str(e))
+
+    def _parse_message(self, data: Dict[str, Any], line_num: int) -> Optional[Message]:
+        """
+        Parse a single message from JSON data.
+
+        Handles multiple conversation formats.
+        """
+        # Format 1: Direct message format
+        if "role" in data and "content" in data:
+            return Message(
+                role=data["role"],
+                content=self._extract_content(data["content"]),
+                timestamp=self._parse_timestamp(data.get("timestamp")),
+                metadata=self._extract_metadata(data)
+            )
+
+        # Format 2: Nested messages array
+        if "messages" in data and isinstance(data["messages"], list):
+            # Return first message or aggregate
+            messages = []
+            for msg_data in data["messages"]:
+                if isinstance(msg_data, dict) and "role" in msg_data:
+                    msg = Message(
+                        role=msg_data["role"],
+                        content=self._extract_content(msg_data.get("content", "")),
+                        timestamp=self._parse_timestamp(msg_data.get("timestamp")),
+                        metadata=self._extract_metadata(msg_data)
+                    )
+                    messages.append(msg)
+
+            # For now, return only the first message
+            # In the future, this may need to handle the full array
+            return messages[0] if messages else None
+
+        # Format 3: Event-based format
+        if "event" in data and data["event"] == "message":
+            return Message(
+                role=data.get("role", "unknown"),
+                content=self._extract_content(data.get("text", "")),
+                timestamp=self._parse_timestamp(data.get("timestamp")),
+                metadata=self._extract_metadata(data)
+            )
+
+        # Unknown format
+        logger.debug(f"Unknown message format at line {line_num}")
+        return None
+
+    def _extract_content(self, content: Any) -> str:
+        """Extract text content from various formats."""
+        if isinstance(content, str):
+            return content
+
+        if isinstance(content, list):
+            # Handle content array format
+            text_parts = []
+            for item in content:
+                if isinstance(item, dict):
+                    if "text" in item:
+                        text_parts.append(item["text"])
+                    elif "content" in item:
+                        text_parts.append(str(item["content"]))
+                else:
+                    text_parts.append(str(item))
+            return "\n".join(text_parts)
+
+        if isinstance(content, dict):
+            if "text" in content:
+                return content["text"]
+            elif "content" in content:
+                return str(content["content"])
+
+        return str(content) if content else ""
+
+    def _parse_timestamp(self, timestamp: Any) -> Optional[datetime]:
+        """Parse timestamp from various formats."""
+        if not timestamp:
+            return None
+
+        if isinstance(timestamp, datetime):
+            return timestamp
+
+        if isinstance(timestamp, (int, float)):
+            # Unix timestamp
+            try:
+                return datetime.fromtimestamp(timestamp)
+            except Exception:
+                return None
+
+        if isinstance(timestamp, str):
+            # ISO format; normalize a trailing "Z", which
+            # fromisoformat() rejects before Python 3.11
+            try:
+                return datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
+            except Exception:
+                # Try other formats if needed
+                return None
+
+        return None
+
+    def _extract_metadata(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract additional metadata from message data."""
+        # Skip known fields
+        skip_fields = {"role", "content", "text", "timestamp", "message_index"}
+
+        metadata = {}
+        for key, value in data.items():
+            if key not in skip_fields:
+                metadata[key] = value
+
+        return metadata
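A sketch of the tolerance this buys when reading a mixed-format log (file name illustrative):

    from pathlib import Path
    from importer.processors.conversation_parser import ConversationParser

    # A single JSONL file may mix all three supported line shapes:
    #   {"role": "user", "content": "hi"}                        direct
    #   {"messages": [{"role": "assistant", "content": "ok"}]}   nested
    #   {"event": "message", "role": "user", "text": "thanks"}   event-based
    # Malformed JSON lines are logged and skipped rather than aborting,
    # and ParseError is raised only if no line yields a message.
    msgs = ConversationParser().parse_file(Path("mixed-format.jsonl"))
    for m in msgs:
        print(m.message_index, m.role, m.content[:40])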