claude-self-reflect 5.0.7 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/open-source-maintainer.md +1 -1
- package/.claude/agents/reflection-specialist.md +2 -2
- package/Dockerfile.async-importer +6 -4
- package/Dockerfile.importer +6 -6
- package/Dockerfile.safe-watcher +8 -8
- package/Dockerfile.streaming-importer +8 -1
- package/Dockerfile.watcher +8 -16
- package/docker-compose.yaml +12 -6
- package/installer/.claude/agents/README.md +138 -0
- package/package.json +5 -26
- package/src/__init__.py +0 -0
- package/src/cli/__init__.py +0 -0
- package/src/runtime/__init__.py +0 -0
- package/src/runtime/import-latest.py +124 -0
- package/{scripts → src/runtime}/precompact-hook.sh +1 -1
- package/src/runtime/streaming-importer.py +995 -0
- package/{scripts → src/runtime}/watcher-loop.sh +1 -1
- package/.claude/agents/claude-self-reflect-test.md +0 -1274
- package/.claude/agents/reflect-tester.md +0 -300
- package/scripts/add-timestamp-indexes.py +0 -134
- package/scripts/ast_grep_final_analyzer.py +0 -338
- package/scripts/ast_grep_unified_registry.py +0 -710
- package/scripts/check-collections.py +0 -29
- package/scripts/debug-august-parsing.py +0 -80
- package/scripts/debug-import-single.py +0 -91
- package/scripts/debug-project-resolver.py +0 -82
- package/scripts/debug-temporal-tools.py +0 -135
- package/scripts/import-conversations-enhanced.py +0 -672
- package/scripts/migrate-to-unified-state.py +0 -426
- package/scripts/session_quality_tracker.py +0 -671
- package/scripts/update_patterns.py +0 -334
- /package/{scripts → src}/importer/__init__.py +0 -0
- /package/{scripts → src}/importer/__main__.py +0 -0
- /package/{scripts → src}/importer/core/__init__.py +0 -0
- /package/{scripts → src}/importer/core/config.py +0 -0
- /package/{scripts → src}/importer/core/exceptions.py +0 -0
- /package/{scripts → src}/importer/core/models.py +0 -0
- /package/{scripts → src}/importer/embeddings/__init__.py +0 -0
- /package/{scripts → src}/importer/embeddings/base.py +0 -0
- /package/{scripts → src}/importer/embeddings/fastembed_provider.py +0 -0
- /package/{scripts → src}/importer/embeddings/validator.py +0 -0
- /package/{scripts → src}/importer/embeddings/voyage_provider.py +0 -0
- /package/{scripts → src}/importer/main.py +0 -0
- /package/{scripts → src}/importer/processors/__init__.py +0 -0
- /package/{scripts → src}/importer/processors/ast_extractor.py +0 -0
- /package/{scripts → src}/importer/processors/chunker.py +0 -0
- /package/{scripts → src}/importer/processors/concept_extractor.py +0 -0
- /package/{scripts → src}/importer/processors/conversation_parser.py +0 -0
- /package/{scripts → src}/importer/processors/tool_extractor.py +0 -0
- /package/{scripts → src}/importer/state/__init__.py +0 -0
- /package/{scripts → src}/importer/state/state_manager.py +0 -0
- /package/{scripts → src}/importer/storage/__init__.py +0 -0
- /package/{scripts → src}/importer/storage/qdrant_storage.py +0 -0
- /package/{scripts → src}/importer/utils/__init__.py +0 -0
- /package/{scripts → src}/importer/utils/logger.py +0 -0
- /package/{scripts → src}/importer/utils/project_normalizer.py +0 -0
- /package/{scripts → src/runtime}/delta-metadata-update-safe.py +0 -0
- /package/{scripts → src/runtime}/delta-metadata-update.py +0 -0
- /package/{scripts → src/runtime}/doctor.py +0 -0
- /package/{scripts → src/runtime}/embedding_service.py +0 -0
- /package/{scripts → src/runtime}/force-metadata-recovery.py +0 -0
- /package/{scripts → src/runtime}/import-conversations-unified.py +0 -0
- /package/{scripts → src/runtime}/import_strategies.py +0 -0
- /package/{scripts → src/runtime}/message_processors.py +0 -0
- /package/{scripts → src/runtime}/metadata_extractor.py +0 -0
- /package/{scripts → src/runtime}/streaming-watcher.py +0 -0
- /package/{scripts → src/runtime}/unified_state_manager.py +0 -0
- /package/{scripts → src/runtime}/utils.py +0 -0
package/scripts/import-conversations-enhanced.py

```diff
@@ -1,672 +0,0 @@
-#!/usr/bin/env python3
-"""
-Enhanced import script that extracts tool usage metadata from conversations.
-Supports both local and Voyage AI embeddings with tool tracking.
-"""
-
-import os
-import sys
-import json
-import glob
-import hashlib
-import gc
-import re
-import time
-from datetime import datetime, timedelta
-from typing import List, Dict, Any, Set, Tuple
-import logging
-from pathlib import Path
-
-from qdrant_client import QdrantClient
-from qdrant_client.models import (
-    VectorParams, Distance, PointStruct,
-    Filter, FieldCondition, MatchValue
-)
-
-from tenacity import (
-    retry,
-    stop_after_attempt,
-    wait_random_exponential,
-)
-
-# Configuration
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
-STATE_FILE = os.getenv("STATE_FILE", "./config/imported-files-enhanced.json")
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))
-PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
-VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
-DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# Import timing stats
-timing_stats = {
-    "extract": [],
-    "chunk": [],
-    "embed": [],
-    "store": [],
-    "total": []
-}
-
-def normalize_path(path: str) -> str:
-    """Normalize file paths for consistency across platforms."""
-    if not path:
-        return ""
-
-    # Remove common prefixes
-    path = path.replace("/Users/", "~/")
-    path = path.replace("\\Users\\", "~\\")
-
-    # Convert to forward slashes
-    path = path.replace("\\", "/")
-
-    # Remove duplicate slashes
-    path = re.sub(r'/+', '/', path)
-
-    return path
-
-def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
-    """Extract high-level concepts from conversation and tool usage."""
-    concepts = set()
-
-    # Common development concepts with patterns
-    concept_patterns = {
-        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
-        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
-        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
-        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
-        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
-        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb)',
-        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
-        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
-        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
-        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
-        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
-        'architecture': r'(architecture|design|pattern|structure|component|module)',
-        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
-        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
-        'search': r'(search|query|find|filter|match|relevance)'
-    }
-
-    # Check text content
-    combined_text = text.lower()
-    for concept, pattern in concept_patterns.items():
-        if re.search(pattern, combined_text, re.IGNORECASE):
-            concepts.add(concept)
-
-    # Check tool usage patterns
-    tool_text = json.dumps(tool_usage).lower()
-    for concept, pattern in concept_patterns.items():
-        if re.search(pattern, tool_text, re.IGNORECASE):
-            concepts.add(concept)
-
-    # Add concepts based on specific tool usage
-    if tool_usage.get('grep_searches'):
-        concepts.add('search')
-    if tool_usage.get('files_edited') or tool_usage.get('files_created'):
-        concepts.add('development')
-    if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
-        concepts.add('testing')
-    if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
-        concepts.add('docker')
-
-    return concepts
-
-def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
-    """Extract all tool usage from a conversation."""
-    tool_usage = {
-        "files_read": [],
-        "files_edited": [],
-        "files_created": [],
-        "grep_searches": [],
-        "bash_commands": [],
-        "glob_patterns": [],
-        "task_calls": [],
-        "mcp_calls": [],
-        "tools_summary": {},
-        "concepts": set(),
-        "timing": {},
-        "errors": [],
-        "tool_results": {}
-    }
-
-    start_time = time.time()
-
-    with open(jsonl_path, 'r', encoding='utf-8') as f:
-        for line_num, line in enumerate(f, 1):
-            line = line.strip()
-            if not line:
-                continue
-
-            try:
-                data = json.loads(line)
-
-                # Skip API error messages
-                if data.get('isApiErrorMessage'):
-                    continue
-
-                # Process message content
-                if 'message' in data and 'content' in data['message']:
-                    content = data['message']['content']
-
-                    # Handle content array (where tool_use lives)
-                    if isinstance(content, list):
-                        for item in content:
-                            if isinstance(item, dict) and item.get('type') == 'tool_use':
-                                extract_single_tool_use(item, tool_usage)
-
-            except json.JSONDecodeError as e:
-                logger.debug(f"Skipping invalid JSON at line {line_num}: {e}")
-            except Exception as e:
-                logger.error(f"Error processing line {line_num}: {e}")
-                tool_usage["errors"].append({"line": line_num, "error": str(e)})
-
-    # Calculate timing
-    tool_usage["timing"]["extract_ms"] = int((time.time() - start_time) * 1000)
-
-    # Convert sets to lists for JSON serialization
-    tool_usage["concepts"] = list(tool_usage["concepts"])
-
-    return tool_usage
-
-def extract_single_tool_use(tool_data: Dict[str, Any], usage_dict: Dict[str, Any]) -> None:
-    """Parse individual tool usage with enhanced metadata extraction."""
-    tool_name = tool_data.get('name')
-    inputs = tool_data.get('input', {})
-    tool_id = tool_data.get('id')
-
-    # Track tool frequency
-    usage_dict['tools_summary'][tool_name] = usage_dict['tools_summary'].get(tool_name, 0) + 1
-
-    # Extract based on tool type
-    if tool_name == 'Read':
-        path = inputs.get('file_path')
-        if path:
-            usage_dict['files_read'].append({
-                'path': normalize_path(path),
-                'offset': inputs.get('offset', 0),
-                'limit': inputs.get('limit', -1),
-                'tool_id': tool_id
-            })
-
-    elif tool_name == 'Grep':
-        pattern = inputs.get('pattern')
-        if pattern:
-            usage_dict['grep_searches'].append({
-                'pattern': pattern[:100],  # Limit pattern length
-                'path': normalize_path(inputs.get('path', '.')),
-                'glob': inputs.get('glob'),
-                'output_mode': inputs.get('output_mode', 'files_with_matches'),
-                'case_insensitive': inputs.get('-i', False)
-            })
-            # Add search concept
-            usage_dict['concepts'].add('search')
-
-    elif tool_name == 'Edit' or tool_name == 'MultiEdit':
-        path = inputs.get('file_path')
-        if path:
-            usage_dict['files_edited'].append({
-                'path': normalize_path(path),
-                'operation': tool_name.lower()
-            })
-
-    elif tool_name == 'Write':
-        path = inputs.get('file_path')
-        if path:
-            usage_dict['files_created'].append(normalize_path(path))
-
-    elif tool_name == 'Bash':
-        cmd = inputs.get('command', '')
-        if cmd:
-            # Extract command name
-            cmd_parts = cmd.split()
-            cmd_name = cmd_parts[0] if cmd_parts else 'unknown'
-
-            usage_dict['bash_commands'].append({
-                'command': cmd_name,
-                'description': inputs.get('description', '')[:100]
-            })
-
-            # Add concepts based on commands
-            if 'docker' in cmd.lower():
-                usage_dict['concepts'].add('docker')
-            if 'git' in cmd.lower():
-                usage_dict['concepts'].add('git')
-            if 'test' in cmd.lower() or 'pytest' in cmd.lower():
-                usage_dict['concepts'].add('testing')
-
-    elif tool_name == 'Glob':
-        pattern = inputs.get('pattern')
-        if pattern:
-            usage_dict['glob_patterns'].append({
-                'pattern': pattern,
-                'path': normalize_path(inputs.get('path', '.'))
-            })
-
-    elif tool_name == 'Task':
-        usage_dict['task_calls'].append({
-            'description': inputs.get('description', '')[:100],
-            'subagent_type': inputs.get('subagent_type')
-        })
-
-    # Handle MCP tools
-    elif tool_name and tool_name.startswith('mcp__'):
-        usage_dict['mcp_calls'].append({
-            'tool': tool_name,
-            'params': list(inputs.keys()) if inputs else []
-        })
-        usage_dict['concepts'].add('mcp')
-
-def create_enhanced_chunk(messages: List[Dict], chunk_index: int, tool_usage: Dict[str, Any],
-                          conversation_metadata: Dict[str, Any]) -> Dict[str, Any]:
-    """Create chunk with tool usage metadata."""
-    # Extract text from messages
-    chunk_text = "\n\n".join([
-        f"{msg['role'].upper()}: {msg['content']}"
-        for msg in messages
-    ])
-
-    # Extract concepts from chunk text and tool usage
-    concepts = extract_concepts(chunk_text, tool_usage)
-
-    # Deduplicate and clean file paths
-    all_file_items = tool_usage.get('files_read', []) + tool_usage.get('files_edited', [])
-    files_analyzed = list(set([
-        item['path'] if isinstance(item, dict) else item
-        for item in all_file_items
-        if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
-    ]))[:20]  # Limit to 20 files
-
-    files_edited = list(set([
-        item['path'] if isinstance(item, dict) else item
-        for item in tool_usage.get('files_edited', [])
-        if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
-    ]))[:10]  # Limit to 10 files
-
-    # Build enhanced chunk
-    chunk = {
-        "text": chunk_text,
-        "conversation_id": conversation_metadata['id'],
-        "chunk_index": chunk_index,
-        "timestamp": conversation_metadata['timestamp'],
-        "project": conversation_metadata['project'],
-        "start_role": messages[0]['role'] if messages else 'unknown',
-
-        # Tool usage metadata
-        "files_analyzed": files_analyzed,
-        "files_edited": files_edited,
-        "search_patterns": [s['pattern'] for s in tool_usage.get('grep_searches', [])][:10],
-        "concepts": list(concepts)[:15],
-        "tool_summary": dict(list(tool_usage.get('tools_summary', {}).items())[:10]),
-        "analysis_only": len(tool_usage.get('files_edited', [])) == 0 and len(tool_usage.get('files_created', [])) == 0,
-
-        # Additional context
-        "commands_used": list(set([c['command'] for c in tool_usage.get('bash_commands', [])]))[:10],
-        "has_security_check": 'security' in concepts,
-        "has_performance_check": 'performance' in concepts,
-        "mcp_tools_used": list(set([m['tool'].split('__')[1] if '__' in m['tool'] else m['tool']
-                                    for m in tool_usage.get('mcp_calls', [])]))[:5]
-    }
-
-    return chunk
-
-# Import state management functions (same as original)
-def load_state():
-    """Load the import state from file."""
-    if os.path.exists(STATE_FILE):
-        try:
-            with open(STATE_FILE, 'r') as f:
-                state = json.load(f)
-                if "imported_files" not in state:
-                    state["imported_files"] = {}
-                return state
-        except Exception as e:
-            logger.warning(f"Failed to load state file: {e}")
-    return {"imported_files": {}}
-
-def save_state(state):
-    """Save the import state to file."""
-    try:
-        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-        temp_file = STATE_FILE + ".tmp"
-        with open(temp_file, 'w') as f:
-            json.dump(state, f, indent=2)
-        os.replace(temp_file, STATE_FILE)
-        logger.debug(f"Saved state with {len(state['imported_files'])} files")
-    except Exception as e:
-        logger.error(f"Failed to save state file: {e}")
-
-def should_import_file(file_path, state):
-    """Check if a file should be imported based on modification time."""
-    str_path = str(file_path)
-    file_mtime = os.path.getmtime(file_path)
-
-    if str_path in state["imported_files"]:
-        last_imported = state["imported_files"][str_path].get("last_imported", 0)
-        last_modified = state["imported_files"][str_path].get("last_modified", 0)
-
-        if file_mtime <= last_modified and last_imported > 0:
-            logger.info(f"Skipping unchanged file: {file_path.name}")
-            return False
-
-    return True
-
-def update_file_state(file_path, state, chunks_imported, tool_stats=None):
-    """Update the state for an imported file with tool usage stats."""
-    str_path = str(file_path)
-    state["imported_files"][str_path] = {
-        "last_modified": os.path.getmtime(file_path),
-        "last_imported": datetime.now().timestamp(),
-        "chunks_imported": chunks_imported,
-        "tool_stats": tool_stats or {}
-    }
-
-# Initialize embedding provider
-embedding_provider = None
-embedding_dimension = None
-collection_suffix = None
-
-if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-    logger.info("Using local FastEmbed embeddings")
-    from fastembed import TextEmbedding
-    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    embedding_dimension = 384
-    collection_suffix = "_local"
-else:
-    logger.info("Using Voyage AI embeddings")
-    import voyageai
-    vo = voyageai.Client(api_key=VOYAGE_API_KEY)
-    embedding_provider = vo
-    embedding_dimension = 1024
-    collection_suffix = "_voyage"
-
-# Initialize Qdrant client
-client = QdrantClient(url=QDRANT_URL)
-
-def chunk_conversation(messages: List[Dict], chunk_size: int = 10) -> List[Dict]:
-    """Split conversation into chunks of messages."""
-    chunks = []
-    for i in range(0, len(messages), chunk_size):
-        chunk_messages = messages[i:i + chunk_size]
-        chunks.append({
-            "messages": chunk_messages,
-            "chunk_index": i // chunk_size
-        })
-    return chunks
-
-@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=20))
-def generate_embeddings(texts: List[str]) -> List[List[float]]:
-    """Generate embeddings for texts with retry logic."""
-    if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-        embeddings = list(embedding_provider.embed(texts))
-        return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
-    else:
-        result = embedding_provider.embed(texts, model="voyage-3", input_type="document")
-        return result.embeddings
-
-def import_project(project_path: Path, state: Dict) -> int:
-    """Import conversations from a single project with tool usage extraction."""
-    total_chunks = 0
-    jsonl_files = list(project_path.glob("*.jsonl"))
-
-    if not jsonl_files:
-        return 0
-
-    # Create or verify collection
-    collection_name = f"conv_{hashlib.md5(project_path.name.encode()).hexdigest()[:8]}{collection_suffix}"
-
-    try:
-        collections = [c.name for c in client.get_collections().collections]
-        if collection_name not in collections:
-            client.create_collection(
-                collection_name=collection_name,
-                vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
-            )
-            logger.info(f"Created collection: {collection_name}")
-    except Exception as e:
-        logger.error(f"Failed to create/verify collection {collection_name}: {e}")
-        return 0
-
-    for jsonl_file in jsonl_files:
-        if not should_import_file(jsonl_file, state):
-            continue
-
-        logger.info(f"Processing file: {jsonl_file.name}")
-
-        try:
-            file_start_time = time.time()
-
-            # Extract tool usage
-            extract_start = time.time()
-            tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
-            extract_time = time.time() - extract_start
-            timing_stats["extract"].append(extract_time)
-
-            # Read and process messages (original logic)
-            messages = []
-            created_at = None
-
-            with open(jsonl_file, 'r', encoding='utf-8') as f:
-                for line_num, line in enumerate(f, 1):
-                    line = line.strip()
-                    if not line:
-                        continue
-
-                    try:
-                        data = json.loads(line)
-
-                        if created_at is None and 'timestamp' in data:
-                            created_at = data.get('timestamp')
-
-                        if data.get('type') == 'summary':
-                            continue
-
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if msg.get('role') and msg.get('content'):
-                                content = msg['content']
-                                if isinstance(content, list):
-                                    text_parts = []
-                                    for item in content:
-                                        if isinstance(item, dict) and item.get('type') == 'text':
-                                            text_parts.append(item.get('text', ''))
-                                        elif isinstance(item, str):
-                                            text_parts.append(item)
-                                    content = '\n'.join(text_parts)
-
-                                if content:
-                                    messages.append({
-                                        'role': msg['role'],
-                                        'content': content
-                                    })
-                    except Exception as e:
-                        logger.error(f"Error processing line {line_num}: {e}")
-
-            if not messages:
-                continue
-
-            # Prepare metadata
-            if created_at is None:
-                created_at = datetime.now().isoformat()
-            conversation_id = jsonl_file.stem
-
-            conversation_metadata = {
-                'id': conversation_id,
-                'timestamp': created_at,
-                'project': project_path.name
-            }
-
-            # Chunk the conversation
-            chunk_start = time.time()
-            chunks_data = chunk_conversation(messages)
-            enhanced_chunks = []
-
-            for chunk_data in chunks_data:
-                enhanced_chunk = create_enhanced_chunk(
-                    chunk_data["messages"],
-                    chunk_data["chunk_index"],
-                    tool_usage,
-                    conversation_metadata
-                )
-                enhanced_chunks.append(enhanced_chunk)
-
-            chunk_time = time.time() - chunk_start
-            timing_stats["chunk"].append(chunk_time)
-
-            if not enhanced_chunks:
-                continue
-
-            # Process in batches
-            for batch_start in range(0, len(enhanced_chunks), BATCH_SIZE):
-                batch = enhanced_chunks[batch_start:batch_start + BATCH_SIZE]
-                texts = [chunk["text"] for chunk in batch]
-
-                # Generate embeddings
-                embed_start = time.time()
-                embeddings = generate_embeddings(texts)
-                embed_time = time.time() - embed_start
-                timing_stats["embed"].append(embed_time)
-
-                # Create points
-                points = []
-                for chunk, embedding in zip(batch, embeddings):
-                    point_id = hashlib.md5(
-                        f"{conversation_id}_{chunk['chunk_index']}".encode()
-                    ).hexdigest()[:16]
-
-                    points.append(PointStruct(
-                        id=int(point_id, 16) % (2**63),
-                        vector=embedding,
-                        payload=chunk
-                    ))
-
-                # Upload to Qdrant (unless dry run)
-                if not DRY_RUN:
-                    store_start = time.time()
-                    client.upsert(
-                        collection_name=collection_name,
-                        points=points
-                    )
-                    store_time = time.time() - store_start
-                    timing_stats["store"].append(store_time)
-                else:
-                    logger.info(f"[DRY RUN] Would upload {len(points)} points to {collection_name}")
-
-                total_chunks += len(points)
-
-            file_chunks = len(enhanced_chunks)
-            total_time = time.time() - file_start_time
-            timing_stats["total"].append(total_time)
-
-            logger.info(f"Imported {file_chunks} chunks from {jsonl_file.name} "
-                        f"(extract: {extract_time:.2f}s, chunk: {chunk_time:.2f}s, total: {total_time:.2f}s)")
-
-            # Update state with tool stats
-            tool_stats = {
-                "tools_used": list(tool_usage['tools_summary'].keys()),
-                "files_analyzed": len(enhanced_chunks[0].get('files_analyzed', [])) if enhanced_chunks else 0,
-                "concepts": list(tool_usage.get('concepts', []))[:10]
-            }
-            update_file_state(jsonl_file, state, file_chunks, tool_stats)
-
-            # Save state after each file
-            if not DRY_RUN:
-                save_state(state)
-
-            gc.collect()
-
-        except Exception as e:
-            logger.error(f"Failed to import {jsonl_file}: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-
-    return total_chunks
-
-def main():
-    """Main import function with enhanced features."""
-    import argparse
-
-    parser = argparse.ArgumentParser(description='Import conversations with tool usage extraction')
-    parser.add_argument('--days', type=int, help='Import only files from last N days')
-    parser.add_argument('--limit', type=int, help='Limit number of files to import')
-    parser.add_argument('--dry-run', action='store_true', help='Run without actually importing')
-    parser.add_argument('--project', type=str, help='Import only specific project')
-
-    args = parser.parse_args()
-
-    if args.dry_run:
-        global DRY_RUN
-        DRY_RUN = True
-        logger.info("Running in DRY RUN mode - no data will be imported")
-
-    logs_path = Path(LOGS_DIR)
-
-    # Handle local development vs Docker paths
-    if not logs_path.exists():
-        # Try local development path
-        home_logs = Path.home() / '.claude' / 'projects'
-        if home_logs.exists():
-            logs_path = home_logs
-            logger.info(f"Using local logs directory: {logs_path}")
-        else:
-            logger.error(f"Logs directory not found: {LOGS_DIR}")
-            return
-
-    # Load existing state
-    state = load_state()
-    logger.info(f"Loaded state with {len(state['imported_files'])} previously imported files")
-
-    # Find project directories
-    if args.project:
-        project_dirs = [d for d in logs_path.iterdir() if d.is_dir() and args.project in d.name]
-    else:
-        project_dirs = [d for d in logs_path.iterdir() if d.is_dir()]
-
-    if not project_dirs:
-        logger.warning("No project directories found")
-        return
-
-    # Filter by date if specified
-    if args.days:
-        cutoff_date = datetime.now() - timedelta(days=args.days)
-        filtered_dirs = []
-        for project_dir in project_dirs:
-            jsonl_files = list(project_dir.glob("*.jsonl"))
-            recent_files = [f for f in jsonl_files if datetime.fromtimestamp(f.stat().st_mtime) > cutoff_date]
-            if recent_files:
-                filtered_dirs.append(project_dir)
-        project_dirs = filtered_dirs
-        logger.info(f"Filtered to {len(project_dirs)} projects with files from last {args.days} days")
-
-    # Apply limit if specified
-    if args.limit:
-        project_dirs = project_dirs[:args.limit]
-
-    logger.info(f"Found {len(project_dirs)} projects to import")
-
-    # Import each project
-    total_imported = 0
-    for project_dir in project_dirs:
-        logger.info(f"Importing project: {project_dir.name}")
-        chunks = import_project(project_dir, state)
-        total_imported += chunks
-
-    # Print timing statistics
-    logger.info("\n=== Import Performance Summary ===")
-    logger.info(f"Total chunks imported: {total_imported}")
-
-    if timing_stats["total"]:
-        logger.info(f"\nTiming averages:")
-        logger.info(f"  Extract: {sum(timing_stats['extract'])/len(timing_stats['extract']):.2f}s")
-        logger.info(f"  Chunk: {sum(timing_stats['chunk'])/len(timing_stats['chunk']):.2f}s")
-        if timing_stats['embed']:
-            logger.info(f"  Embed: {sum(timing_stats['embed'])/len(timing_stats['embed']):.2f}s")
-        if timing_stats['store']:
-            logger.info(f"  Store: {sum(timing_stats['store'])/len(timing_stats['store']):.2f}s")
-        logger.info(f"  Total: {sum(timing_stats['total'])/len(timing_stats['total']):.2f}s per file")
-
-if __name__ == "__main__":
-    main()
```