claude-self-reflect 2.4.15 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 [project]
 name = "claude-self-reflect-mcp"
-version = "2.4.11"
+version = "2.5.1"
 description = "MCP server for Claude self-reflection with memory decay"
 # readme = "README.md"
 requires-python = ">=3.10"
@@ -887,6 +887,223 @@ async def get_more_results(
     return response
 
 
+@mcp.tool()
+async def search_by_file(
+    ctx: Context,
+    file_path: str = Field(description="The file path to search for in conversations"),
+    limit: int = Field(default=10, description="Maximum number of results to return"),
+    project: Optional[str] = Field(default=None, description="Search specific project only. Use 'all' to search across all projects.")
+) -> str:
+    """Search for conversations that analyzed a specific file."""
+    global qdrant_client
+
+    # Normalize file path
+    normalized_path = file_path.replace("\\", "/").replace("/Users/", "~/")
+
+    # Determine which collections to search
+    # If no project specified, search all collections
+    collections = await get_all_collections() if not project else []
+
+    if project and project != 'all':
+        # Filter collections for specific project
+        project_hash = hashlib.md5(project.encode()).hexdigest()[:8]
+        collection_prefix = f"conv_{project_hash}_"
+        collections = [c for c in await get_all_collections() if c.startswith(collection_prefix)]
+    elif project == 'all':
+        collections = await get_all_collections()
+
+    if not collections:
+        return "<search_by_file>\n<error>No collections found to search</error>\n</search_by_file>"
+
+    # Prepare results
+    all_results = []
+
+    for collection_name in collections:
+        try:
+            # Use scroll to get all points and filter manually
+            # Qdrant's array filtering can be tricky, so we'll filter in code
+            scroll_result = await qdrant_client.scroll(
+                collection_name=collection_name,
+                limit=1000,  # Get a batch
+                with_payload=True
+            )
+
+            # Filter results that contain the file
+            for point in scroll_result[0]:
+                payload = point.payload
+                files_analyzed = payload.get('files_analyzed', [])
+                files_edited = payload.get('files_edited', [])
+
+                if normalized_path in files_analyzed or normalized_path in files_edited:
+                    all_results.append({
+                        'score': 1.0,  # File match is always 1.0
+                        'payload': payload,
+                        'collection': collection_name
+                    })
+
+        except Exception as e:
+            continue
+
+    # Sort by timestamp (newest first)
+    all_results.sort(key=lambda x: x['payload'].get('timestamp', ''), reverse=True)
+
+    # Format results
+    if not all_results:
+        return f"""<search_by_file>
+<query>{file_path}</query>
+<normalized_path>{normalized_path}</normalized_path>
+<message>No conversations found that analyzed this file</message>
+</search_by_file>"""
+
+    results_text = []
+    for i, result in enumerate(all_results[:limit]):
+        payload = result['payload']
+        timestamp = payload.get('timestamp', 'Unknown')
+        conversation_id = payload.get('conversation_id', 'Unknown')
+        project = payload.get('project', 'Unknown')
+        text_preview = payload.get('text', '')[:200] + '...' if len(payload.get('text', '')) > 200 else payload.get('text', '')
+
+        # Check if file was edited or just read
+        action = "edited" if normalized_path in payload.get('files_edited', []) else "analyzed"
+
+        # Get related tools used
+        tool_summary = payload.get('tool_summary', {})
+        tools_used = ', '.join(f"{tool}({count})" for tool, count in tool_summary.items())
+
+        results_text.append(f"""<result rank="{i+1}">
+<conversation_id>{conversation_id}</conversation_id>
+<project>{project}</project>
+<timestamp>{timestamp}</timestamp>
+<action>{action}</action>
+<tools_used>{tools_used}</tools_used>
+<preview>{text_preview}</preview>
+</result>""")
+
+    return f"""<search_by_file>
+<query>{file_path}</query>
+<normalized_path>{normalized_path}</normalized_path>
+<count>{len(all_results)}</count>
+<results>
+{''.join(results_text)}
+</results>
+</search_by_file>"""
+
+
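
Note: search_by_file deliberately scrolls each collection and filters in Python (see the comment about array filtering above). For reference, a minimal sketch of pushing the same match down to Qdrant, assuming the same async qdrant-client instance and the files_analyzed / files_edited payload keys written by the importer added later in this diff; Qdrant keyword conditions match when any element of an array payload field equals the value. This is an illustration, not part of the release:

    # Sketch only: server-side equivalent of the in-code filter above.
    from qdrant_client import models

    async def scroll_matching_file(client, collection_name: str, normalized_path: str):
        points, _next_page = await client.scroll(
            collection_name=collection_name,
            scroll_filter=models.Filter(should=[
                models.FieldCondition(key="files_analyzed", match=models.MatchValue(value=normalized_path)),
                models.FieldCondition(key="files_edited", match=models.MatchValue(value=normalized_path)),
            ]),
            limit=1000,
            with_payload=True,
        )
        return points
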
+@mcp.tool()
+async def search_by_concept(
+    ctx: Context,
+    concept: str = Field(description="The concept to search for (e.g., 'security', 'docker', 'testing')"),
+    include_files: bool = Field(default=True, description="Include file information in results"),
+    limit: int = Field(default=10, description="Maximum number of results to return"),
+    project: Optional[str] = Field(default=None, description="Search specific project only. Use 'all' to search across all projects.")
+) -> str:
+    """Search for conversations about a specific development concept."""
+    global qdrant_client
+
+    # Generate embedding for the concept
+    embedding = await generate_embedding(concept)
+
+    # Determine which collections to search
+    # If no project specified, search all collections
+    collections = await get_all_collections() if not project else []
+
+    if project and project != 'all':
+        # Filter collections for specific project
+        project_hash = hashlib.md5(project.encode()).hexdigest()[:8]
+        collection_prefix = f"conv_{project_hash}_"
+        collections = [c for c in await get_all_collections() if c.startswith(collection_prefix)]
+    elif project == 'all':
+        collections = await get_all_collections()
+
+    if not collections:
+        return "<search_by_concept>\n<error>No collections found to search</error>\n</search_by_concept>"
+
+    # Search all collections
+    all_results = []
+
+    for collection_name in collections:
+        try:
+            # Hybrid search: semantic + concept filter
+            results = await qdrant_client.search(
+                collection_name=collection_name,
+                query_vector=embedding,
+                query_filter=models.Filter(
+                    should=[
+                        models.FieldCondition(
+                            key="concepts",
+                            match=models.MatchAny(any=[concept.lower()])
+                        )
+                    ]
+                ),
+                limit=limit * 2,  # Get more results for better filtering
+                with_payload=True
+            )
+
+            for point in results:
+                payload = point.payload
+                # Boost score if concept is in the concepts list
+                score_boost = 0.2 if concept.lower() in payload.get('concepts', []) else 0.0
+                all_results.append({
+                    'score': float(point.score) + score_boost,
+                    'payload': payload,
+                    'collection': collection_name
+                })
+
+        except Exception as e:
+            continue
+
+    # Sort by score and limit
+    all_results.sort(key=lambda x: x['score'], reverse=True)
+    all_results = all_results[:limit]
+
+    # Format results
+    if not all_results:
+        return f"""<search_by_concept>
+<concept>{concept}</concept>
+<message>No conversations found about this concept</message>
+</search_by_concept>"""
+
+    results_text = []
+    for i, result in enumerate(all_results):
+        payload = result['payload']
+        score = result['score']
+        timestamp = payload.get('timestamp', 'Unknown')
+        conversation_id = payload.get('conversation_id', 'Unknown')
+        project = payload.get('project', 'Unknown')
+        concepts = payload.get('concepts', [])
+
+        # Get text preview
+        text_preview = payload.get('text', '')[:200] + '...' if len(payload.get('text', '')) > 200 else payload.get('text', '')
+
+        # File information
+        files_info = ""
+        if include_files:
+            files_analyzed = payload.get('files_analyzed', [])[:5]
+            if files_analyzed:
+                files_info = f"\n<files_analyzed>{', '.join(files_analyzed)}</files_analyzed>"
+
+        # Related concepts
+        related_concepts = [c for c in concepts if c != concept.lower()][:5]
+
+        results_text.append(f"""<result rank="{i+1}">
+<score>{score:.3f}</score>
+<conversation_id>{conversation_id}</conversation_id>
+<project>{project}</project>
+<timestamp>{timestamp}</timestamp>
+<concepts>{', '.join(concepts)}</concepts>
+<related_concepts>{', '.join(related_concepts)}</related_concepts>{files_info}
+<preview>{text_preview}</preview>
+</result>""")
+
+    return f"""<search_by_concept>
+<concept>{concept}</concept>
+<count>{len(all_results)}</count>
+<results>
+{''.join(results_text)}
+</results>
+</search_by_concept>"""
+
+
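
Both new tools are exposed over the Model Context Protocol, so clients reach them through a standard tools/call request. A minimal sketch of what that request body might look like (tool and argument names come from the Field() definitions above; the id and transport wiring are assumed):

    # Illustrative only: JSON-RPC body an MCP client would send for the new tool.
    call_search_by_concept = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {
            "name": "search_by_concept",
            "arguments": {"concept": "docker", "include_files": True, "limit": 5, "project": "all"},
        },
    }
    # The server responds with the <search_by_concept> XML block built in the return statement above.
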
 # Debug output
 print(f"[DEBUG] FastMCP server created with name: {mcp.name}")
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "claude-self-reflect",
-  "version": "2.4.15",
+  "version": "2.5.1",
   "description": "Give Claude perfect memory of all your conversations - Installation wizard for Python MCP server",
   "keywords": [
     "claude",
@@ -0,0 +1,672 @@
+#!/usr/bin/env python3
+"""
+Enhanced import script that extracts tool usage metadata from conversations.
+Supports both local and Voyage AI embeddings with tool tracking.
+"""
+
+import os
+import sys
+import json
+import glob
+import hashlib
+import gc
+import re
+import time
+from datetime import datetime, timedelta
+from typing import List, Dict, Any, Set, Tuple
+import logging
+from pathlib import Path
+
+from qdrant_client import QdrantClient
+from qdrant_client.models import (
+    VectorParams, Distance, PointStruct,
+    Filter, FieldCondition, MatchValue
+)
+
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_random_exponential,
+)
+
+# Configuration
+QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
+STATE_FILE = os.getenv("STATE_FILE", "./config/imported-files-enhanced.json")
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))
+PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
+VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
+DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Import timing stats
+timing_stats = {
+    "extract": [],
+    "chunk": [],
+    "embed": [],
+    "store": [],
+    "total": []
+}
+
+def normalize_path(path: str) -> str:
+    """Normalize file paths for consistency across platforms."""
+    if not path:
+        return ""
+
+    # Remove common prefixes
+    path = path.replace("/Users/", "~/")
+    path = path.replace("\\Users\\", "~\\")
+
+    # Convert to forward slashes
+    path = path.replace("\\", "/")
+
+    # Remove duplicate slashes
+    path = re.sub(r'/+', '/', path)
+
+    return path
+
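
A quick illustration of the normalization above (input value is hypothetical): the helper rewrites the macOS home prefix, converts backslashes, and collapses duplicate slashes.

    # Hypothetical input path, shown only to illustrate the transformations above.
    normalize_path("/Users/alice//project\\src\\main.py")
    # -> "~/alice/project/src/main.py"
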
+def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
+    """Extract high-level concepts from conversation and tool usage."""
+    concepts = set()
+
+    # Common development concepts with patterns
+    concept_patterns = {
+        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
+        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
+        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
+        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
+        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
+        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb)',
+        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
+        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
+        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
+        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
+        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
+        'architecture': r'(architecture|design|pattern|structure|component|module)',
+        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
+        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
+        'search': r'(search|query|find|filter|match|relevance)'
+    }
+
+    # Check text content
+    combined_text = text.lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, combined_text, re.IGNORECASE):
+            concepts.add(concept)
+
+    # Check tool usage patterns
+    tool_text = json.dumps(tool_usage).lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, tool_text, re.IGNORECASE):
+            concepts.add(concept)
+
+    # Add concepts based on specific tool usage
+    if tool_usage.get('grep_searches'):
+        concepts.add('search')
+    if tool_usage.get('files_edited') or tool_usage.get('files_created'):
+        concepts.add('development')
+    if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
+        concepts.add('testing')
+    if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
+        concepts.add('docker')
+
+    return concepts
+
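
A hypothetical call showing what the keyword matching above yields (inputs invented for illustration):

    # Text mentions docker/pytest/coverage; tool usage ran a docker command.
    extract_concepts(
        "Fixed the docker compose file and added pytest coverage",
        {"bash_commands": [{"command": "docker"}]},
    )
    # -> a set that includes at least {'docker', 'testing'}
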
+def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
+    """Extract all tool usage from a conversation."""
+    tool_usage = {
+        "files_read": [],
+        "files_edited": [],
+        "files_created": [],
+        "grep_searches": [],
+        "bash_commands": [],
+        "glob_patterns": [],
+        "task_calls": [],
+        "mcp_calls": [],
+        "tools_summary": {},
+        "concepts": set(),
+        "timing": {},
+        "errors": [],
+        "tool_results": {}
+    }
+
+    start_time = time.time()
+
+    with open(jsonl_path, 'r', encoding='utf-8') as f:
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                data = json.loads(line)
+
+                # Skip API error messages
+                if data.get('isApiErrorMessage'):
+                    continue
+
+                # Process message content
+                if 'message' in data and 'content' in data['message']:
+                    content = data['message']['content']
+
+                    # Handle content array (where tool_use lives)
+                    if isinstance(content, list):
+                        for item in content:
+                            if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                extract_single_tool_use(item, tool_usage)
+
+            except json.JSONDecodeError as e:
+                logger.debug(f"Skipping invalid JSON at line {line_num}: {e}")
+            except Exception as e:
+                logger.error(f"Error processing line {line_num}: {e}")
+                tool_usage["errors"].append({"line": line_num, "error": str(e)})
+
+    # Calculate timing
+    tool_usage["timing"]["extract_ms"] = int((time.time() - start_time) * 1000)
+
+    # Convert sets to lists for JSON serialization
+    tool_usage["concepts"] = list(tool_usage["concepts"])
+
+    return tool_usage
+
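
The extractor only inspects records whose message content array carries tool_use blocks. A single JSONL line of that shape might look like the following (pretty-printed here, field values invented; only the keys the code reads are shown):

    {"timestamp": "2025-01-01T00:00:00Z",
     "message": {"role": "assistant", "content": [
        {"type": "tool_use", "id": "toolu_01", "name": "Read",
         "input": {"file_path": "/Users/alice/project/app.py"}}]}}
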
+def extract_single_tool_use(tool_data: Dict[str, Any], usage_dict: Dict[str, Any]) -> None:
+    """Parse individual tool usage with enhanced metadata extraction."""
+    tool_name = tool_data.get('name')
+    inputs = tool_data.get('input', {})
+    tool_id = tool_data.get('id')
+
+    # Track tool frequency
+    usage_dict['tools_summary'][tool_name] = usage_dict['tools_summary'].get(tool_name, 0) + 1
+
+    # Extract based on tool type
+    if tool_name == 'Read':
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_read'].append({
+                'path': normalize_path(path),
+                'offset': inputs.get('offset', 0),
+                'limit': inputs.get('limit', -1),
+                'tool_id': tool_id
+            })
+
+    elif tool_name == 'Grep':
+        pattern = inputs.get('pattern')
+        if pattern:
+            usage_dict['grep_searches'].append({
+                'pattern': pattern[:100],  # Limit pattern length
+                'path': normalize_path(inputs.get('path', '.')),
+                'glob': inputs.get('glob'),
+                'output_mode': inputs.get('output_mode', 'files_with_matches'),
+                'case_insensitive': inputs.get('-i', False)
+            })
+            # Add search concept
+            usage_dict['concepts'].add('search')
+
+    elif tool_name == 'Edit' or tool_name == 'MultiEdit':
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_edited'].append({
+                'path': normalize_path(path),
+                'operation': tool_name.lower()
+            })
+
+    elif tool_name == 'Write':
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_created'].append(normalize_path(path))
+
+    elif tool_name == 'Bash':
+        cmd = inputs.get('command', '')
+        if cmd:
+            # Extract command name
+            cmd_parts = cmd.split()
+            cmd_name = cmd_parts[0] if cmd_parts else 'unknown'
+
+            usage_dict['bash_commands'].append({
+                'command': cmd_name,
+                'description': inputs.get('description', '')[:100]
+            })
+
+            # Add concepts based on commands
+            if 'docker' in cmd.lower():
+                usage_dict['concepts'].add('docker')
+            if 'git' in cmd.lower():
+                usage_dict['concepts'].add('git')
+            if 'test' in cmd.lower() or 'pytest' in cmd.lower():
+                usage_dict['concepts'].add('testing')
+
+    elif tool_name == 'Glob':
+        pattern = inputs.get('pattern')
+        if pattern:
+            usage_dict['glob_patterns'].append({
+                'pattern': pattern,
+                'path': normalize_path(inputs.get('path', '.'))
+            })
+
+    elif tool_name == 'Task':
+        usage_dict['task_calls'].append({
+            'description': inputs.get('description', '')[:100],
+            'subagent_type': inputs.get('subagent_type')
+        })
+
+    # Handle MCP tools
+    elif tool_name and tool_name.startswith('mcp__'):
+        usage_dict['mcp_calls'].append({
+            'tool': tool_name,
+            'params': list(inputs.keys()) if inputs else []
+        })
+        usage_dict['concepts'].add('mcp')
+
+def create_enhanced_chunk(messages: List[Dict], chunk_index: int, tool_usage: Dict[str, Any],
+                          conversation_metadata: Dict[str, Any]) -> Dict[str, Any]:
+    """Create chunk with tool usage metadata."""
+    # Extract text from messages
+    chunk_text = "\n\n".join([
+        f"{msg['role'].upper()}: {msg['content']}"
+        for msg in messages
+    ])
+
+    # Extract concepts from chunk text and tool usage
+    concepts = extract_concepts(chunk_text, tool_usage)
+
+    # Deduplicate and clean file paths
+    all_file_items = tool_usage.get('files_read', []) + tool_usage.get('files_edited', [])
+    files_analyzed = list(set([
+        item['path'] if isinstance(item, dict) else item
+        for item in all_file_items
+        if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+    ]))[:20]  # Limit to 20 files
+
+    files_edited = list(set([
+        item['path'] if isinstance(item, dict) else item
+        for item in tool_usage.get('files_edited', [])
+        if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+    ]))[:10]  # Limit to 10 files
+
+    # Build enhanced chunk
+    chunk = {
+        "text": chunk_text,
+        "conversation_id": conversation_metadata['id'],
+        "chunk_index": chunk_index,
+        "timestamp": conversation_metadata['timestamp'],
+        "project": conversation_metadata['project'],
+        "start_role": messages[0]['role'] if messages else 'unknown',
+
+        # Tool usage metadata
+        "files_analyzed": files_analyzed,
+        "files_edited": files_edited,
+        "search_patterns": [s['pattern'] for s in tool_usage.get('grep_searches', [])][:10],
+        "concepts": list(concepts)[:15],
+        "tool_summary": dict(list(tool_usage.get('tools_summary', {}).items())[:10]),
+        "analysis_only": len(tool_usage.get('files_edited', [])) == 0 and len(tool_usage.get('files_created', [])) == 0,
+
+        # Additional context
+        "commands_used": list(set([c['command'] for c in tool_usage.get('bash_commands', [])]))[:10],
+        "has_security_check": 'security' in concepts,
+        "has_performance_check": 'performance' in concepts,
+        "mcp_tools_used": list(set([m['tool'].split('__')[1] if '__' in m['tool'] else m['tool']
+                                    for m in tool_usage.get('mcp_calls', [])]))[:5]
+    }
+
+    return chunk
+
+# Import state management functions (same as original)
+def load_state():
+    """Load the import state from file."""
+    if os.path.exists(STATE_FILE):
+        try:
+            with open(STATE_FILE, 'r') as f:
+                state = json.load(f)
+                if "imported_files" not in state:
+                    state["imported_files"] = {}
+                return state
+        except Exception as e:
+            logger.warning(f"Failed to load state file: {e}")
+    return {"imported_files": {}}
+
+def save_state(state):
+    """Save the import state to file."""
+    try:
+        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
+        temp_file = STATE_FILE + ".tmp"
+        with open(temp_file, 'w') as f:
+            json.dump(state, f, indent=2)
+        os.replace(temp_file, STATE_FILE)
+        logger.debug(f"Saved state with {len(state['imported_files'])} files")
+    except Exception as e:
+        logger.error(f"Failed to save state file: {e}")
+
+def should_import_file(file_path, state):
+    """Check if a file should be imported based on modification time."""
+    str_path = str(file_path)
+    file_mtime = os.path.getmtime(file_path)
+
+    if str_path in state["imported_files"]:
+        last_imported = state["imported_files"][str_path].get("last_imported", 0)
+        last_modified = state["imported_files"][str_path].get("last_modified", 0)
+
+        if file_mtime <= last_modified and last_imported > 0:
+            logger.info(f"Skipping unchanged file: {file_path.name}")
+            return False
+
+    return True
+
+def update_file_state(file_path, state, chunks_imported, tool_stats=None):
+    """Update the state for an imported file with tool usage stats."""
+    str_path = str(file_path)
+    state["imported_files"][str_path] = {
+        "last_modified": os.path.getmtime(file_path),
+        "last_imported": datetime.now().timestamp(),
+        "chunks_imported": chunks_imported,
+        "tool_stats": tool_stats or {}
+    }
+
+# Initialize embedding provider
+embedding_provider = None
+embedding_dimension = None
+collection_suffix = None
+
+if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+    logger.info("Using local FastEmbed embeddings")
+    from fastembed import TextEmbedding
+    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    embedding_dimension = 384
+    collection_suffix = "_local"
+else:
+    logger.info("Using Voyage AI embeddings")
+    import voyageai
+    vo = voyageai.Client(api_key=VOYAGE_API_KEY)
+    embedding_provider = vo
+    embedding_dimension = 1024
+    collection_suffix = "_voyage"
+
+# Initialize Qdrant client
+client = QdrantClient(url=QDRANT_URL)
+
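
Worth noting from the two blocks above: the embedding backend is chosen purely from environment variables, and that choice also fixes the per-project collection name used later in import_project. A sketch with a hypothetical project directory name:

    # Illustration only, for a hypothetical project directory "my-app".
    import hashlib
    suffix = "_local"    # PREFER_LOCAL_EMBEDDINGS=true or no VOYAGE_KEY -> 384-dim FastEmbed
    # suffix = "_voyage" # VOYAGE_KEY set and PREFER_LOCAL_EMBEDDINGS=false -> 1024-dim Voyage
    collection_name = f"conv_{hashlib.md5('my-app'.encode()).hexdigest()[:8]}{suffix}"
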
+def chunk_conversation(messages: List[Dict], chunk_size: int = 10) -> List[Dict]:
+    """Split conversation into chunks of messages."""
+    chunks = []
+    for i in range(0, len(messages), chunk_size):
+        chunk_messages = messages[i:i + chunk_size]
+        chunks.append({
+            "messages": chunk_messages,
+            "chunk_index": i // chunk_size
+        })
+    return chunks
+
+@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=20))
+def generate_embeddings(texts: List[str]) -> List[List[float]]:
+    """Generate embeddings for texts with retry logic."""
+    if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+        embeddings = list(embedding_provider.embed(texts))
+        return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
+    else:
+        result = embedding_provider.embed(texts, model="voyage-3", input_type="document")
+        return result.embeddings
+
+def import_project(project_path: Path, state: Dict) -> int:
+    """Import conversations from a single project with tool usage extraction."""
+    total_chunks = 0
+    jsonl_files = list(project_path.glob("*.jsonl"))
+
+    if not jsonl_files:
+        return 0
+
+    # Create or verify collection
+    collection_name = f"conv_{hashlib.md5(project_path.name.encode()).hexdigest()[:8]}{collection_suffix}"
+
+    try:
+        collections = [c.name for c in client.get_collections().collections]
+        if collection_name not in collections:
+            client.create_collection(
+                collection_name=collection_name,
+                vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
+            )
+            logger.info(f"Created collection: {collection_name}")
+    except Exception as e:
+        logger.error(f"Failed to create/verify collection {collection_name}: {e}")
+        return 0
+
+    for jsonl_file in jsonl_files:
+        if not should_import_file(jsonl_file, state):
+            continue
+
+        logger.info(f"Processing file: {jsonl_file.name}")
+
+        try:
+            file_start_time = time.time()
+
+            # Extract tool usage
+            extract_start = time.time()
+            tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
+            extract_time = time.time() - extract_start
+            timing_stats["extract"].append(extract_time)
+
+            # Read and process messages (original logic)
+            messages = []
+            created_at = None
+
+            with open(jsonl_file, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    try:
+                        data = json.loads(line)
+
+                        if created_at is None and 'timestamp' in data:
+                            created_at = data.get('timestamp')
+
+                        if data.get('type') == 'summary':
+                            continue
+
+                        if 'message' in data and data['message']:
+                            msg = data['message']
+                            if msg.get('role') and msg.get('content'):
+                                content = msg['content']
+                                if isinstance(content, list):
+                                    text_parts = []
+                                    for item in content:
+                                        if isinstance(item, dict) and item.get('type') == 'text':
+                                            text_parts.append(item.get('text', ''))
+                                        elif isinstance(item, str):
+                                            text_parts.append(item)
+                                    content = '\n'.join(text_parts)
+
+                                if content:
+                                    messages.append({
+                                        'role': msg['role'],
+                                        'content': content
+                                    })
+                    except Exception as e:
+                        logger.error(f"Error processing line {line_num}: {e}")
+
+            if not messages:
+                continue
+
+            # Prepare metadata
+            if created_at is None:
+                created_at = datetime.now().isoformat()
+            conversation_id = jsonl_file.stem
+
+            conversation_metadata = {
+                'id': conversation_id,
+                'timestamp': created_at,
+                'project': project_path.name
+            }
+
+            # Chunk the conversation
+            chunk_start = time.time()
+            chunks_data = chunk_conversation(messages)
+            enhanced_chunks = []
+
+            for chunk_data in chunks_data:
+                enhanced_chunk = create_enhanced_chunk(
+                    chunk_data["messages"],
+                    chunk_data["chunk_index"],
+                    tool_usage,
+                    conversation_metadata
+                )
+                enhanced_chunks.append(enhanced_chunk)
+
+            chunk_time = time.time() - chunk_start
+            timing_stats["chunk"].append(chunk_time)
+
+            if not enhanced_chunks:
+                continue
+
+            # Process in batches
+            for batch_start in range(0, len(enhanced_chunks), BATCH_SIZE):
+                batch = enhanced_chunks[batch_start:batch_start + BATCH_SIZE]
+                texts = [chunk["text"] for chunk in batch]
+
+                # Generate embeddings
+                embed_start = time.time()
+                embeddings = generate_embeddings(texts)
+                embed_time = time.time() - embed_start
+                timing_stats["embed"].append(embed_time)
+
+                # Create points
+                points = []
+                for chunk, embedding in zip(batch, embeddings):
+                    point_id = hashlib.md5(
+                        f"{conversation_id}_{chunk['chunk_index']}".encode()
+                    ).hexdigest()[:16]
+
+                    points.append(PointStruct(
+                        id=int(point_id, 16) % (2**63),
+                        vector=embedding,
+                        payload=chunk
+                    ))
+
+                # Upload to Qdrant (unless dry run)
+                if not DRY_RUN:
+                    store_start = time.time()
+                    client.upsert(
+                        collection_name=collection_name,
+                        points=points
+                    )
+                    store_time = time.time() - store_start
+                    timing_stats["store"].append(store_time)
+                else:
+                    logger.info(f"[DRY RUN] Would upload {len(points)} points to {collection_name}")
+
+                total_chunks += len(points)
+
+            file_chunks = len(enhanced_chunks)
+            total_time = time.time() - file_start_time
+            timing_stats["total"].append(total_time)
+
+            logger.info(f"Imported {file_chunks} chunks from {jsonl_file.name} "
+                        f"(extract: {extract_time:.2f}s, chunk: {chunk_time:.2f}s, total: {total_time:.2f}s)")
+
+            # Update state with tool stats
+            tool_stats = {
+                "tools_used": list(tool_usage['tools_summary'].keys()),
+                "files_analyzed": len(enhanced_chunks[0].get('files_analyzed', [])) if enhanced_chunks else 0,
+                "concepts": list(tool_usage.get('concepts', []))[:10]
+            }
+            update_file_state(jsonl_file, state, file_chunks, tool_stats)
+
+            # Save state after each file
+            if not DRY_RUN:
+                save_state(state)
+
+            gc.collect()
+
+        except Exception as e:
+            logger.error(f"Failed to import {jsonl_file}: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
+
+    return total_chunks
+
+def main():
+    """Main import function with enhanced features."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Import conversations with tool usage extraction')
+    parser.add_argument('--days', type=int, help='Import only files from last N days')
+    parser.add_argument('--limit', type=int, help='Limit number of files to import')
+    parser.add_argument('--dry-run', action='store_true', help='Run without actually importing')
+    parser.add_argument('--project', type=str, help='Import only specific project')
+
+    args = parser.parse_args()
+
+    if args.dry_run:
+        global DRY_RUN
+        DRY_RUN = True
+        logger.info("Running in DRY RUN mode - no data will be imported")
+
+    logs_path = Path(LOGS_DIR)
+
+    # Handle local development vs Docker paths
+    if not logs_path.exists():
+        # Try local development path
+        home_logs = Path.home() / '.claude' / 'projects'
+        if home_logs.exists():
+            logs_path = home_logs
+            logger.info(f"Using local logs directory: {logs_path}")
+        else:
+            logger.error(f"Logs directory not found: {LOGS_DIR}")
+            return
+
+    # Load existing state
+    state = load_state()
+    logger.info(f"Loaded state with {len(state['imported_files'])} previously imported files")
+
+    # Find project directories
+    if args.project:
+        project_dirs = [d for d in logs_path.iterdir() if d.is_dir() and args.project in d.name]
+    else:
+        project_dirs = [d for d in logs_path.iterdir() if d.is_dir()]
+
+    if not project_dirs:
+        logger.warning("No project directories found")
+        return
+
+    # Filter by date if specified
+    if args.days:
+        cutoff_date = datetime.now() - timedelta(days=args.days)
+        filtered_dirs = []
+        for project_dir in project_dirs:
+            jsonl_files = list(project_dir.glob("*.jsonl"))
+            recent_files = [f for f in jsonl_files if datetime.fromtimestamp(f.stat().st_mtime) > cutoff_date]
+            if recent_files:
+                filtered_dirs.append(project_dir)
+        project_dirs = filtered_dirs
+        logger.info(f"Filtered to {len(project_dirs)} projects with files from last {args.days} days")
+
+    # Apply limit if specified
+    if args.limit:
+        project_dirs = project_dirs[:args.limit]
+
+    logger.info(f"Found {len(project_dirs)} projects to import")
+
+    # Import each project
+    total_imported = 0
+    for project_dir in project_dirs:
+        logger.info(f"Importing project: {project_dir.name}")
+        chunks = import_project(project_dir, state)
+        total_imported += chunks
+
+    # Print timing statistics
+    logger.info("\n=== Import Performance Summary ===")
+    logger.info(f"Total chunks imported: {total_imported}")
+
+    if timing_stats["total"]:
+        logger.info(f"\nTiming averages:")
+        logger.info(f"  Extract: {sum(timing_stats['extract'])/len(timing_stats['extract']):.2f}s")
+        logger.info(f"  Chunk: {sum(timing_stats['chunk'])/len(timing_stats['chunk']):.2f}s")
+        if timing_stats['embed']:
+            logger.info(f"  Embed: {sum(timing_stats['embed'])/len(timing_stats['embed']):.2f}s")
+        if timing_stats['store']:
+            logger.info(f"  Store: {sum(timing_stats['store'])/len(timing_stats['store']):.2f}s")
+        logger.info(f"  Total: {sum(timing_stats['total'])/len(timing_stats['total']):.2f}s per file")
+
+if __name__ == "__main__":
+    main()
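
For completeness, the importer is driven entirely by the environment variables and CLI flags defined above. The diff does not show the script's path, so the filename below is a placeholder:

    # Placeholder filename; adjust to wherever the new script lands in the package.
    # QDRANT_URL=http://localhost:6333 PREFER_LOCAL_EMBEDDINGS=true \
    #     python import-conversations-enhanced.py --days 7 --project my-app --dry-run
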