claude-self-reflect 2.7.4 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile.safe-watcher +6 -3
- package/README.md +11 -4
- package/docker-compose.yaml +18 -10
- package/installer/setup-wizard-docker.js +24 -1
- package/mcp-server/src/server.py +122 -25
- package/mcp-server/src/status.py +30 -1
- package/package.json +1 -1
- package/scripts/import-conversations-unified.backup.py +0 -374
- package/scripts/import-latest.py +0 -124
- package/scripts/import-old-format.py +0 -171
package/Dockerfile.safe-watcher
CHANGED

@@ -30,8 +30,11 @@ RUN mkdir -p /root/.cache/fastembed && \
 # Set working directory
 WORKDIR /app

-# Copy scripts
-COPY scripts/ /scripts/
+# Copy application scripts
+COPY scripts/ /app/scripts/
+
+# Make watcher-loop.sh executable
+RUN chmod +x /app/scripts/watcher-loop.sh

 # Create config directory
 RUN mkdir -p /config
@@ -41,4 +44,4 @@ ENV PYTHONUNBUFFERED=1
 ENV MALLOC_ARENA_MAX=2

 # Run the watcher loop
-CMD ["/scripts/watcher-loop.sh"]
+CMD ["/app/scripts/watcher-loop.sh"]

package/README.md
CHANGED

@@ -149,10 +149,17 @@ Here's how your conversations get imported and prioritized:



-**The system intelligently
-
-
-
+**The system intelligently processes your conversations:**
+- Runs every 60 seconds checking for new conversations
+- Processes newest conversations first (delta import pattern)
+- Maintains low memory usage (<50MB) through streaming
+- Handles up to 5 files per cycle to prevent blocking
+
+**HOT/WARM/COLD Intelligent Prioritization:**
+- **🔥 HOT** (< 5 minutes): Switches to 2-second intervals for near real-time import
+- **🌡️ WARM** (< 24 hours): Normal priority with starvation prevention (urgent after 30 min wait)
+- **❄️ COLD** (> 24 hours): Batch processed, max 5 per cycle to prevent blocking new content
+- Files are categorized by age and processed with priority queuing to ensure newest content gets imported quickly while preventing older files from being starved

 ## Using It

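The tiering described in the README bullets above can be pictured with a short sketch. This is an illustration of the age-based classification only, not the package's watcher code; the function names and the use of file modification time as the age signal are assumptions, while the 5-minute/24-hour thresholds and the 5-file COLD cap come from the README text.

# Illustrative sketch of HOT/WARM/COLD tiering (not the package's implementation).
import time
from pathlib import Path

HOT_WINDOW_S = 5 * 60          # README: HOT when modified < 5 minutes ago
WARM_WINDOW_S = 24 * 60 * 60   # README: WARM when modified < 24 hours ago

def classify_by_age(path: Path) -> str:
    """Return 'hot', 'warm', or 'cold' from the file's modification age."""
    age = time.time() - path.stat().st_mtime
    if age < HOT_WINDOW_S:
        return "hot"
    if age < WARM_WINDOW_S:
        return "warm"
    return "cold"

def order_for_import(files, max_cold=5):
    """Newest first within each tier; COLD capped per cycle (README: max 5)."""
    buckets = {"hot": [], "warm": [], "cold": []}
    for f in files:
        buckets[classify_by_age(f)].append(f)
    for tier in buckets.values():
        tier.sort(key=lambda f: f.stat().st_mtime, reverse=True)
    return buckets["hot"] + buckets["warm"] + buckets["cold"][:max_cold]
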
package/docker-compose.yaml
CHANGED

@@ -177,21 +177,29 @@ services:
 - ./scripts:/scripts:ro
 environment:
   - QDRANT_URL=http://qdrant:6333
-  - STATE_FILE=/config/watcher
+  - STATE_FILE=/config/csr-watcher.json
+  - LOGS_DIR=/logs  # Fixed: Point to mounted volume
   - VOYAGE_KEY=${VOYAGE_KEY:-}
   - PREFER_LOCAL_EMBEDDINGS=${PREFER_LOCAL_EMBEDDINGS:-true}
-
-
-
-
-
+  - ENABLE_MEMORY_DECAY=${ENABLE_MEMORY_DECAY:-false}
+  - DECAY_WEIGHT=${DECAY_WEIGHT:-0.3}
+  - DECAY_SCALE_DAYS=${DECAY_SCALE_DAYS:-90}
+  - CHECK_INTERVAL_S=${CHECK_INTERVAL_S:-60}
+  - HOT_CHECK_INTERVAL_S=${HOT_CHECK_INTERVAL_S:-2}
+  - HOT_WINDOW_MINUTES=${HOT_WINDOW_MINUTES:-5}
+  - WARM_WINDOW_HOURS=${WARM_WINDOW_HOURS:-24}
+  - MAX_COLD_FILES=${MAX_COLD_FILES:-5}
+  - MAX_WARM_WAIT_MINUTES=${MAX_WARM_WAIT_MINUTES:-30}
+  - MAX_MESSAGES_PER_CHUNK=${MAX_MESSAGES_PER_CHUNK:-10}
   - MAX_CHUNK_SIZE=${MAX_CHUNK_SIZE:-50}  # Messages per chunk for streaming
+  - MEMORY_LIMIT_MB=${MEMORY_LIMIT_MB:-1000}
+  - MEMORY_WARNING_MB=${MEMORY_WARNING_MB:-500}
   - PYTHONUNBUFFERED=1
   - MALLOC_ARENA_MAX=2
-restart:
-profiles: ["safe-watch"]  # Requires explicit profile to run
-mem_limit:
-memswap_limit:
+restart: unless-stopped
+profiles: ["safe-watch", "watch"]  # Requires explicit profile to run
+mem_limit: 1g  # Increased to 1GB to match MEMORY_LIMIT_MB
+memswap_limit: 1g
 cpus: 1.0  # Single CPU core limit

 # MCP server for Claude integration

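For orientation, a minimal sketch of how a watcher loop could consume the tuning knobs introduced above, using the same variable names and defaults as the compose file. It is illustrative only; the actual watcher script shipped under package/scripts/ may read these differently.

# Sketch of reading the watcher tuning variables from the environment.
# Names and defaults mirror docker-compose.yaml above; everything else is assumed.
import os

def _int_env(name: str, default: int) -> int:
    return int(os.getenv(name, str(default)))

CHECK_INTERVAL_S = _int_env("CHECK_INTERVAL_S", 60)          # normal poll cadence
HOT_CHECK_INTERVAL_S = _int_env("HOT_CHECK_INTERVAL_S", 2)   # cadence while HOT files exist
HOT_WINDOW_MINUTES = _int_env("HOT_WINDOW_MINUTES", 5)
WARM_WINDOW_HOURS = _int_env("WARM_WINDOW_HOURS", 24)
MAX_COLD_FILES = _int_env("MAX_COLD_FILES", 5)
MAX_WARM_WAIT_MINUTES = _int_env("MAX_WARM_WAIT_MINUTES", 30)
MEMORY_LIMIT_MB = _int_env("MEMORY_LIMIT_MB", 1000)

def next_sleep(has_hot_files: bool) -> int:
    """Poll every 2 s while recent (HOT) files are pending, otherwise every 60 s."""
    return HOT_CHECK_INTERVAL_S if has_hot_files else CHECK_INTERVAL_S
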
package/installer/setup-wizard-docker.js
CHANGED

@@ -454,6 +454,26 @@ async function enrichMetadata() {
   }
 }

+async function startWatcher() {
+  console.log('\n🚀 Starting the streaming watcher...');
+  console.log('   • HOT files (<5 min): 2-second processing');
+  console.log('   • WARM files (<24 hrs): Normal priority');
+  console.log('   • COLD files (>24 hrs): Batch processing');
+
+  try {
+    safeExec('docker', ['compose', '--profile', 'watch', 'up', '-d', 'safe-watcher'], {
+      cwd: projectRoot,
+      stdio: 'inherit'
+    });
+    console.log('✅ Watcher started successfully!');
+    return true;
+  } catch (error) {
+    console.log('⚠️ Could not start watcher automatically');
+    console.log('   You can start it manually with: docker compose --profile watch up -d');
+    return false;
+  }
+}
+
 async function showFinalInstructions() {
   console.log('\n✅ Setup complete!');

@@ -461,7 +481,7 @@ async function showFinalInstructions() {
   console.log('   • Qdrant Dashboard: http://localhost:6333/dashboard/');
   console.log('   • Status: All services running');
   console.log('   • Search: Semantic search with memory decay enabled');
-  console.log('   •
+  console.log('   • Watcher: HOT/WARM/COLD prioritization active');

   console.log('\nQuick Reference Commands:');
   console.log('   • Check status: docker compose ps');
@@ -568,6 +588,9 @@ async function main() {
   // Enrich metadata (new in v2.5.19)
   await enrichMetadata();

+  // Start the watcher
+  await startWatcher();
+
   // Show final instructions
   await showFinalInstructions();

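The startWatcher() step added above shells out to Docker Compose through the installer's safeExec helper. When that fails, the same command can be run by hand or from a script along these lines. This Python sketch is only an approximation of the JS shown above and assumes Docker Compose v2 is on PATH and that it is run from the project root.

# Rough, illustrative equivalent of the installer's startWatcher() step.
import subprocess

def start_watcher(project_root: str) -> bool:
    cmd = ["docker", "compose", "--profile", "watch", "up", "-d", "safe-watcher"]
    try:
        subprocess.run(cmd, cwd=project_root, check=True)
        print("Watcher started successfully")
        return True
    except (subprocess.CalledProcessError, FileNotFoundError) as exc:
        print(f"Could not start watcher automatically: {exc}")
        print("Start it manually with: docker compose --profile watch up -d")
        return False
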
package/mcp-server/src/server.py
CHANGED

@@ -9,6 +9,7 @@ import json
 import numpy as np
 import hashlib
 import time
+import logging

 from fastmcp import FastMCP, Context
 from .utils import normalize_project_name
@@ -124,18 +125,48 @@ indexing_status = {
     "is_checking": False
 }

-
+# Cache for indexing status (5-second TTL)
+_indexing_cache = {"result": None, "timestamp": 0}
+
+# Setup logger
+logger = logging.getLogger(__name__)
+
+def normalize_path(path_str: str) -> str:
+    """Normalize path for consistent comparison across platforms.
+
+    Args:
+        path_str: Path string to normalize
+
+    Returns:
+        Normalized path string with consistent separators
+    """
+    if not path_str:
+        return path_str
+    p = Path(path_str).expanduser().resolve()
+    return str(p).replace('\\', '/')  # Consistent separators for all platforms
+
+async def update_indexing_status(cache_ttl: int = 5):
     """Update indexing status by checking JSONL files vs Qdrant collections.
-    This is a lightweight check that compares file counts, not full content.
-
+    This is a lightweight check that compares file counts, not full content.
+
+    Args:
+        cache_ttl: Cache time-to-live in seconds (default: 5)
+    """
+    global indexing_status, _indexing_cache
+
+    # Check cache first (5-second TTL to prevent performance issues)
+    current_time = time.time()
+    if _indexing_cache["result"] and current_time - _indexing_cache["timestamp"] < cache_ttl:
+        # Use cached result
+        indexing_status = _indexing_cache["result"].copy()
+        return

     # Don't run concurrent checks
     if indexing_status["is_checking"]:
         return

-    #
-    current_time
-    if current_time - indexing_status["last_check"] < 300:  # 5 minutes
+    # Check immediately on first call, then every 60 seconds to avoid overhead
+    if indexing_status["last_check"] > 0 and current_time - indexing_status["last_check"] < 60:  # 1 minute
         return

     indexing_status["is_checking"] = True
@@ -151,46 +182,107 @@ async def update_indexing_status():
         jsonl_files = list(projects_dir.glob("**/*.jsonl"))
         total_files = len(jsonl_files)

-        # Check imported-files.json to see what's been imported
-        # The
-
+        # Check imported-files.json AND watcher state files to see what's been imported
+        # The system uses multiple state files that need to be merged
+        all_imported_files = set()  # Use set to avoid duplicates
+        file_metadata = {}
+
+        # 1. Check imported-files.json (batch importer)
         possible_paths = [
             Path.home() / ".claude-self-reflect" / "config" / "imported-files.json",
             Path(__file__).parent.parent.parent / "config" / "imported-files.json",
             Path("/config/imported-files.json")  # Docker path if running in container
         ]

-        imported_files_path = None
         for path in possible_paths:
             if path.exists():
-
-
+                try:
+                    with open(path, 'r') as f:
+                        imported_data = json.load(f)
+                        imported_files_dict = imported_data.get("imported_files", {})
+                        file_metadata.update(imported_data.get("file_metadata", {}))
+                        # Normalize paths before adding to set
+                        normalized_files = {normalize_path(k) for k in imported_files_dict.keys()}
+                        all_imported_files.update(normalized_files)
+                except (json.JSONDecodeError, IOError) as e:
+                    logger.debug(f"Failed to read state file {path}: {e}")
+                    pass  # Continue if file is corrupted

-
-
-
-
-
-
-
-
-
-
+        # 2. Check csr-watcher.json (streaming watcher - local mode)
+        watcher_paths = [
+            Path.home() / ".claude-self-reflect" / "config" / "csr-watcher.json",
+            Path("/config/csr-watcher.json")  # Docker path
+        ]
+
+        for path in watcher_paths:
+            if path.exists():
+                try:
+                    with open(path, 'r') as f:
+                        watcher_data = json.load(f)
+                        watcher_files = watcher_data.get("imported_files", {})
+                        # Normalize paths before adding to set
+                        normalized_files = {normalize_path(k) for k in watcher_files.keys()}
+                        all_imported_files.update(normalized_files)
+                        # Add to metadata with normalized paths
+                        for file_path, info in watcher_files.items():
+                            normalized = normalize_path(file_path)
+                            if normalized not in file_metadata:
+                                file_metadata[normalized] = {
+                                    "position": 1,
+                                    "chunks": info.get("chunks", 0)
+                                }
+                except (json.JSONDecodeError, IOError) as e:
+                    logger.debug(f"Failed to read watcher state file {path}: {e}")
+                    pass  # Continue if file is corrupted
+
+        # 3. Check csr-watcher-cloud.json (streaming watcher - cloud mode)
+        cloud_watcher_path = Path.home() / ".claude-self-reflect" / "config" / "csr-watcher-cloud.json"
+        if cloud_watcher_path.exists():
+            try:
+                with open(cloud_watcher_path, 'r') as f:
+                    cloud_data = json.load(f)
+                    cloud_files = cloud_data.get("imported_files", {})
+                    # Normalize paths before adding to set
+                    normalized_files = {normalize_path(k) for k in cloud_files.keys()}
+                    all_imported_files.update(normalized_files)
+                    # Add to metadata with normalized paths
+                    for file_path, info in cloud_files.items():
+                        normalized = normalize_path(file_path)
+                        if normalized not in file_metadata:
+                            file_metadata[normalized] = {
+                                "position": 1,
+                                "chunks": info.get("chunks", 0)
+                            }
+            except (json.JSONDecodeError, IOError) as e:
+                logger.debug(f"Failed to read cloud watcher state file {cloud_watcher_path}: {e}")
+                pass  # Continue if file is corrupted
+
+        # Convert set to list for compatibility
+        imported_files_list = list(all_imported_files)

         # Count files that have been imported
         for file_path in jsonl_files:
+            # Normalize the current file path for consistent comparison
+            normalized_file = normalize_path(str(file_path))
+
             # Try multiple path formats to match Docker's state file
             file_str = str(file_path).replace(str(Path.home()), "/logs").replace("\\", "/")
             # Also try without .claude/projects prefix (Docker mounts directly)
             file_str_alt = file_str.replace("/.claude/projects", "")

+            # Normalize alternative paths as well
+            normalized_alt = normalize_path(file_str)
+            normalized_alt2 = normalize_path(file_str_alt)
+
             # Check if file is in imported_files list (fully imported)
-            if
+            if normalized_file in imported_files_list or normalized_alt in imported_files_list or normalized_alt2 in imported_files_list:
                 indexed_files += 1
             # Or if it has metadata with position > 0 (partially imported)
-            elif
+            elif normalized_file in file_metadata and file_metadata[normalized_file].get("position", 0) > 0:
+                indexed_files += 1
+            elif normalized_alt in file_metadata and file_metadata[normalized_alt].get("position", 0) > 0:
                 indexed_files += 1
-            elif
+            elif normalized_alt2 in file_metadata and file_metadata[normalized_alt2].get("position", 0) > 0:
                 indexed_files += 1

             # Update status
@@ -203,9 +295,14 @@ async def update_indexing_status():
             indexing_status["percentage"] = (indexed_files / total_files) * 100
         else:
             indexing_status["percentage"] = 100.0
+
+        # Update cache
+        _indexing_cache["result"] = indexing_status.copy()
+        _indexing_cache["timestamp"] = current_time

     except Exception as e:
         print(f"[WARNING] Failed to update indexing status: {e}")
+        logger.error(f"Failed to update indexing status: {e}", exc_info=True)
     finally:
         indexing_status["is_checking"] = False

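The core of the change above is that indexing status is now derived from several state files (imported-files.json plus the local and cloud csr-watcher files), with every recorded path run through normalize_path() before comparison so that platform-specific separators and symlinks do not produce duplicates. A condensed sketch of that merge pattern follows; the standalone helper and its name are assumptions rather than something the package exports, while the paths mirror the diff above.

# Condensed sketch of the state-file merge performed in update_indexing_status().
import json
from pathlib import Path

def normalize_path(path_str: str) -> str:
    if not path_str:
        return path_str
    return str(Path(path_str).expanduser().resolve()).replace("\\", "/")

def merge_imported_files(state_files):
    """Union of normalized paths recorded under "imported_files" in each state file."""
    merged = set()
    for state_file in state_files:
        if not state_file.exists():
            continue
        try:
            data = json.loads(state_file.read_text())
        except (json.JSONDecodeError, OSError):
            continue  # skip corrupted state files, as the server does
        merged.update(normalize_path(k) for k in data.get("imported_files", {}))
    return merged

# Example, mirroring the paths checked in server.py:
state_paths = [
    Path.home() / ".claude-self-reflect" / "config" / "imported-files.json",
    Path.home() / ".claude-self-reflect" / "config" / "csr-watcher.json",
    Path.home() / ".claude-self-reflect" / "config" / "csr-watcher-cloud.json",
]
# imported = merge_imported_files(state_paths)
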
package/mcp-server/src/status.py
CHANGED

@@ -5,6 +5,7 @@ Designed for <20ms execution time to support status bars and shell scripts.
 """

 import json
+import time
 from pathlib import Path
 from collections import defaultdict

@@ -53,11 +54,36 @@ def normalize_file_path(file_path: str) -> str:
     return file_path


+def get_watcher_status() -> dict:
+    """Get streaming watcher status if available."""
+    watcher_state_file = Path.home() / "config" / "csr-watcher.json"
+
+    if not watcher_state_file.exists():
+        return {"running": False, "status": "not configured"}
+
+    try:
+        with open(watcher_state_file) as f:
+            state = json.load(f)
+
+        # Check if watcher is active (modified recently)
+        file_age = time.time() - watcher_state_file.stat().st_mtime
+        is_active = file_age < 120  # Active if updated in last 2 minutes
+
+        return {
+            "running": is_active,
+            "files_processed": len(state.get("imported_files", {})),
+            "last_update_seconds": int(file_age),
+            "status": "🟢 active" if is_active else "🔴 inactive"
+        }
+    except:
+        return {"running": False, "status": "error reading state"}
+
+
 def get_status() -> dict:
     """Get indexing status with overall stats and per-project breakdown.

     Returns:
-        dict: JSON structure with overall and per-project indexing status
+        dict: JSON structure with overall and per-project indexing status, plus watcher status
     """
     projects_dir = Path.home() / ".claude" / "projects"
     project_stats = defaultdict(lambda: {"indexed": 0, "total": 0})
@@ -154,6 +180,9 @@ def get_status() -> dict:
             "total": stats["total"]
         }

+    # Add watcher status
+    result["watcher"] = get_watcher_status()
+
     return result


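With get_watcher_status() wired into get_status(), the status payload gains a watcher key. An illustrative value is shown below; the field names come from the diff above, while the concrete numbers are invented for the example.

# Illustrative return value of get_watcher_status() when the state file is fresh.
example_watcher_status = {
    "running": True,
    "files_processed": 42,        # len(state["imported_files"])
    "last_update_seconds": 37,    # seconds since csr-watcher.json was last modified
    "status": "🟢 active",        # becomes "🔴 inactive" once the file is >120 s old
}
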
package/package.json
CHANGED

package/scripts/import-conversations-unified.backup.py
DELETED

@@ -1,374 +0,0 @@
-#!/usr/bin/env python3
-"""
-Streaming importer with true line-by-line processing to prevent OOM.
-Processes JSONL files without loading entire file into memory.
-"""
-
-import json
-import os
-import sys
-import hashlib
-import gc
-from pathlib import Path
-from datetime import datetime
-from typing import List, Dict, Any, Optional
-import logging
-
-# Add the project root to the Python path
-project_root = Path(__file__).parent.parent
-sys.path.insert(0, str(project_root))
-
-from qdrant_client import QdrantClient
-from qdrant_client.models import PointStruct, Distance, VectorParams
-
-# Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-# Environment variables
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
-PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
-VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
-MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))  # Messages per chunk
-
-# Initialize Qdrant client
-client = QdrantClient(url=QDRANT_URL)
-
-# Initialize embedding provider
-embedding_provider = None
-embedding_dimension = None
-
-if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-    logger.info("Using local embeddings (fastembed)")
-    from fastembed import TextEmbedding
-    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    embedding_dimension = 384
-    collection_suffix = "local"
-else:
-    logger.info("Using Voyage AI embeddings")
-    import voyageai
-    embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
-    embedding_dimension = 1024
-    collection_suffix = "voyage"
-
-def normalize_project_name(project_name: str) -> str:
-    """Normalize project name for consistency."""
-    return project_name.replace("-Users-ramakrishnanannaswamy-projects-", "").replace("-", "_").lower()
-
-def get_collection_name(project_path: Path) -> str:
-    """Generate collection name from project path."""
-    normalized = normalize_project_name(project_path.name)
-    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
-    return f"conv_{name_hash}_{collection_suffix}"
-
-def ensure_collection(collection_name: str):
-    """Ensure collection exists with correct configuration."""
-    collections = client.get_collections().collections
-    if not any(c.name == collection_name for c in collections):
-        logger.info(f"Creating collection: {collection_name}")
-        client.create_collection(
-            collection_name=collection_name,
-            vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
-        )
-
-def generate_embeddings(texts: List[str]) -> List[List[float]]:
-    """Generate embeddings for texts."""
-    if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-        embeddings = list(embedding_provider.passage_embed(texts))
-        return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
-    else:
-        response = embedding_provider.embed(texts, model="voyage-3")
-        return response.embeddings
-
-def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
-                             conversation_id: str, created_at: str,
-                             metadata: Dict[str, Any], collection_name: str,
-                             project_path: Path) -> int:
-    """Process and immediately upload a single chunk."""
-    if not messages:
-        return 0
-
-    # Extract text content
-    texts = []
-    for msg in messages:
-        role = msg.get("role", "unknown")
-        content = msg.get("content", "")
-        if content:
-            texts.append(f"{role.upper()}: {content}")
-
-    if not texts:
-        return 0
-
-    chunk_text = "\n".join(texts)
-
-    try:
-        # Generate embedding
-        embeddings = generate_embeddings([chunk_text])
-
-        # Create point ID
-        point_id = hashlib.md5(
-            f"{conversation_id}_{chunk_index}".encode()
-        ).hexdigest()[:16]
-
-        # Create payload
-        payload = {
-            "text": chunk_text,
-            "conversation_id": conversation_id,
-            "chunk_index": chunk_index,
-            "timestamp": created_at,
-            "project": normalize_project_name(project_path.name),
-            "start_role": messages[0].get("role", "unknown") if messages else "unknown",
-            "message_count": len(messages)
-        }
-
-        # Add metadata
-        if metadata:
-            payload.update(metadata)
-
-        # Create point
-        point = PointStruct(
-            id=int(point_id, 16) % (2**63),
-            vector=embeddings[0],
-            payload=payload
-        )
-
-        # Upload immediately
-        client.upsert(
-            collection_name=collection_name,
-            points=[point],
-            wait=True
-        )
-
-        return 1
-
-    except Exception as e:
-        logger.error(f"Error processing chunk {chunk_index}: {e}")
-        return 0
-
-def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
-    """Extract metadata in a single pass, return metadata and first timestamp."""
-    metadata = {
-        "files_analyzed": [],
-        "files_edited": [],
-        "tools_used": [],
-        "concepts": []
-    }
-
-    first_timestamp = None
-
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                if not line.strip():
-                    continue
-
-                try:
-                    data = json.loads(line)
-
-                    # Get timestamp from first valid entry
-                    if first_timestamp is None and 'timestamp' in data:
-                        first_timestamp = data.get('timestamp')
-
-                    # Extract tool usage from messages
-                    if 'message' in data and data['message']:
-                        msg = data['message']
-                        if msg.get('content'):
-                            content = msg['content']
-                            if isinstance(content, list):
-                                for item in content:
-                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
-                                        tool_name = item.get('name', '')
-                                        if tool_name and tool_name not in metadata['tools_used']:
-                                            metadata['tools_used'].append(tool_name)
-
-                                        # Extract file references
-                                        if 'input' in item:
-                                            input_data = item['input']
-                                            if isinstance(input_data, dict):
-                                                if 'file_path' in input_data:
-                                                    file_ref = input_data['file_path']
-                                                    if file_ref not in metadata['files_analyzed']:
-                                                        metadata['files_analyzed'].append(file_ref)
-                                                if 'path' in input_data:
-                                                    file_ref = input_data['path']
-                                                    if file_ref not in metadata['files_analyzed']:
-                                                        metadata['files_analyzed'].append(file_ref)
-
-                except json.JSONDecodeError:
-                    continue
-                except Exception:
-                    continue
-
-    except Exception as e:
-        logger.warning(f"Error extracting metadata: {e}")
-
-    return metadata, first_timestamp or datetime.now().isoformat()
-
-def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
-    """Stream import a single JSONL file without loading it into memory."""
-    logger.info(f"Streaming import of {jsonl_file.name}")
-
-    # Extract metadata in first pass (lightweight)
-    metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
-
-    # Stream messages and process in chunks
-    chunk_buffer = []
-    chunk_index = 0
-    total_chunks = 0
-    conversation_id = jsonl_file.stem
-
-    try:
-        with open(jsonl_file, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f, 1):
-                line = line.strip()
-                if not line:
-                    continue
-
-                try:
-                    data = json.loads(line)
-
-                    # Skip non-message lines
-                    if data.get('type') == 'summary':
-                        continue
-
-                    # Extract message if present
-                    if 'message' in data and data['message']:
-                        msg = data['message']
-                        if msg.get('role') and msg.get('content'):
-                            # Extract content
-                            content = msg['content']
-                            if isinstance(content, list):
-                                text_parts = []
-                                for item in content:
-                                    if isinstance(item, dict) and item.get('type') == 'text':
-                                        text_parts.append(item.get('text', ''))
-                                    elif isinstance(item, str):
-                                        text_parts.append(item)
-                                content = '\n'.join(text_parts)
-
-                            if content:
-                                chunk_buffer.append({
-                                    'role': msg['role'],
-                                    'content': content
-                                })
-
-                            # Process chunk when buffer reaches MAX_CHUNK_SIZE
-                            if len(chunk_buffer) >= MAX_CHUNK_SIZE:
-                                chunks = process_and_upload_chunk(
-                                    chunk_buffer, chunk_index, conversation_id,
-                                    created_at, metadata, collection_name, project_path
-                                )
-                                total_chunks += chunks
-                                chunk_buffer = []
-                                chunk_index += 1
-
-                                # Force garbage collection after each chunk
-                                gc.collect()
-
-                                # Log progress
-                                if chunk_index % 10 == 0:
-                                    logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
-
-                except json.JSONDecodeError:
-                    logger.debug(f"Skipping invalid JSON at line {line_num}")
-                except Exception as e:
-                    logger.debug(f"Error processing line {line_num}: {e}")
-
-        # Process remaining messages
-        if chunk_buffer:
-            chunks = process_and_upload_chunk(
-                chunk_buffer, chunk_index, conversation_id,
-                created_at, metadata, collection_name, project_path
-            )
-            total_chunks += chunks
-
-        logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
-        return total_chunks
-
-    except Exception as e:
-        logger.error(f"Failed to import {jsonl_file}: {e}")
-        return 0
-
-def load_state() -> dict:
-    """Load import state."""
-    if os.path.exists(STATE_FILE):
-        try:
-            with open(STATE_FILE, 'r') as f:
-                return json.load(f)
-        except:
-            pass
-    return {"imported_files": {}}
-
-def save_state(state: dict):
-    """Save import state."""
-    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-    with open(STATE_FILE, 'w') as f:
-        json.dump(state, f, indent=2)
-
-def should_import_file(file_path: Path, state: dict) -> bool:
-    """Check if file should be imported."""
-    file_str = str(file_path)
-    if file_str in state.get("imported_files", {}):
-        file_info = state["imported_files"][file_str]
-        last_modified = file_path.stat().st_mtime
-        if file_info.get("last_modified") == last_modified:
-            logger.info(f"Skipping unchanged file: {file_path.name}")
-            return False
-    return True
-
-def update_file_state(file_path: Path, state: dict, chunks: int):
-    """Update state for imported file."""
-    file_str = str(file_path)
-    state["imported_files"][file_str] = {
-        "imported_at": datetime.now().isoformat(),
-        "last_modified": file_path.stat().st_mtime,
-        "chunks": chunks
-    }
-
-def main():
-    """Main import function."""
-    # Load state
-    state = load_state()
-    logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")
-
-    # Find all projects
-    logs_dir = Path(os.getenv("LOGS_DIR", "/logs"))
-    project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
-    logger.info(f"Found {len(project_dirs)} projects to import")
-
-    total_imported = 0
-
-    for project_dir in project_dirs:
-        # Get collection name
-        collection_name = get_collection_name(project_dir)
-        logger.info(f"Importing project: {project_dir.name} -> {collection_name}")
-
-        # Ensure collection exists
-        ensure_collection(collection_name)
-
-        # Find JSONL files
-        jsonl_files = sorted(project_dir.glob("*.jsonl"))
-
-        # Limit files per cycle if specified
-        max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
-        jsonl_files = jsonl_files[:max_files]
-
-        for jsonl_file in jsonl_files:
-            if should_import_file(jsonl_file, state):
-                chunks = stream_import_file(jsonl_file, collection_name, project_dir)
-                if chunks > 0:
-                    update_file_state(jsonl_file, state, chunks)
-                    save_state(state)
-                    total_imported += 1
-
-            # Force GC after each file
-            gc.collect()
-
-    logger.info(f"Import complete: processed {total_imported} files")
-
-if __name__ == "__main__":
-    main()

package/scripts/import-latest.py
DELETED

@@ -1,124 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quick import script for current project's latest conversations.
-Designed for PreCompact hook integration - targets <10 second imports.
-"""
-
-import os
-import sys
-import json
-import subprocess
-from datetime import datetime, timedelta
-from pathlib import Path
-import logging
-
-# Configuration
-LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
-STATE_FILE = os.getenv("STATE_FILE", os.path.expanduser("~/.claude-self-reflect-state.json"))
-HOURS_BACK = int(os.getenv("IMPORT_HOURS_BACK", "2"))  # Only import last 2 hours by default
-
-# Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-def load_state():
-    """Load import state from file."""
-    if os.path.exists(STATE_FILE):
-        try:
-            with open(STATE_FILE, 'r') as f:
-                return json.load(f)
-        except:
-            return {}
-    return {}
-
-def save_state(state):
-    """Save import state to file."""
-    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-    with open(STATE_FILE, 'w') as f:
-        json.dump(state, f, indent=2)
-
-def get_project_from_cwd():
-    """Detect project from current working directory."""
-    cwd = os.getcwd()
-    # Convert path to project name format used in logs
-    # Claude logs use format: -Users-username-path-to-project
-    project_name = cwd.replace('/', '-')
-    # Keep the leading dash as that's how Claude stores it
-    if not project_name.startswith('-'):
-        project_name = '-' + project_name
-    return project_name
-
-def get_recent_files(project_path: Path, hours_back: int):
-    """Get JSONL files modified in the last N hours."""
-    cutoff_time = datetime.now() - timedelta(hours=hours_back)
-    recent_files = []
-
-    for jsonl_file in project_path.glob("*.jsonl"):
-        mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
-        if mtime > cutoff_time:
-            recent_files.append(jsonl_file)
-
-    return sorted(recent_files, key=lambda f: f.stat().st_mtime, reverse=True)
-
-def main():
-    """Main quick import function."""
-    start_time = datetime.now()
-
-    # Detect current project
-    project_name = get_project_from_cwd()
-    project_path = Path(LOGS_DIR) / project_name
-
-    if not project_path.exists():
-        logger.warning(f"Project logs not found: {project_path}")
-        logger.info("Make sure you're in a project directory with Claude conversations.")
-        return
-
-    logger.info(f"Quick importing latest conversations for: {project_name}")
-
-    # Get recent files
-    recent_files = get_recent_files(project_path, HOURS_BACK)
-    logger.info(f"Found {len(recent_files)} files modified in last {HOURS_BACK} hours")
-
-    if not recent_files:
-        logger.info("No recent conversations to import")
-        return
-
-    # For now, just call the unified importer with the specific project
-    # This is a temporary solution until we implement incremental imports
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    unified_script = os.path.join(script_dir, "import-conversations-unified.py")
-
-    # Set environment to only process this project
-    env = os.environ.copy()
-    env['LOGS_DIR'] = str(project_path.parent)
-    env['IMPORT_PROJECT'] = project_name
-
-    try:
-        # Run the unified importer for just this project
-        result = subprocess.run(
-            [sys.executable, unified_script],
-            env=env,
-            capture_output=True,
-            text=True,
-            timeout=60  # 60 second timeout
-        )
-
-        if result.returncode == 0:
-            logger.info("Quick import completed successfully")
-        else:
-            logger.error(f"Import failed: {result.stderr}")
-
-    except subprocess.TimeoutExpired:
-        logger.warning("Import timed out after 60 seconds")
-    except Exception as e:
-        logger.error(f"Error during import: {e}")
-
-    # Report timing
-    elapsed = (datetime.now() - start_time).total_seconds()
-    logger.info(f"Quick import completed in {elapsed:.1f} seconds")
-
-if __name__ == "__main__":
-    main()

package/scripts/import-old-format.py
DELETED

@@ -1,171 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import old format JSONL files from Claude conversations.
-These files have a different structure with type/summary fields instead of messages.
-"""
-
-import json
-import sys
-from pathlib import Path
-import hashlib
-import uuid
-from datetime import datetime
-from qdrant_client import QdrantClient
-from qdrant_client.models import Distance, VectorParams, PointStruct
-from fastembed import TextEmbedding
-import logging
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def import_old_format_project(project_dir: Path, project_path: str = None):
-    """Import old format JSONL files from a project directory."""
-
-    # Initialize
-    client = QdrantClient(url='http://localhost:6333')
-    model = TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2', max_length=512)
-
-    # Determine project path from directory name if not provided
-    if not project_path:
-        # Convert -Users-username-projects-projectname back to path
-        dir_name = project_dir.name
-        project_path = '/' + dir_name.strip('-').replace('-', '/')
-
-    # Create collection name
-    project_hash = hashlib.md5(project_path.encode()).hexdigest()[:8]
-    collection_name = f'conv_{project_hash}_local'
-
-    logger.info(f'Project: {project_path}')
-    logger.info(f'Collection: {collection_name}')
-
-    # Create collection if needed
-    try:
-        client.get_collection(collection_name)
-        logger.info('Collection exists')
-    except:
-        client.create_collection(
-            collection_name=collection_name,
-            vectors_config=VectorParams(size=384, distance=Distance.COSINE)
-        )
-        logger.info('Created collection')
-
-    # Process all JSONL files
-    jsonl_files = list(project_dir.glob('*.jsonl'))
-    logger.info(f'Found {len(jsonl_files)} files to import')
-
-    total_points = 0
-    for file_path in jsonl_files:
-        logger.info(f'Processing {file_path.name}...')
-        points_batch = []
-
-        with open(file_path, 'r', encoding='utf-8') as f:
-            conversation_text = []
-            file_timestamp = file_path.stat().st_mtime
-
-            for line_num, line in enumerate(f, 1):
-                try:
-                    data = json.loads(line)
-                    msg_type = data.get('type', '')
-
-                    # Extract text content based on type
-                    content = None
-                    if msg_type == 'summary' and data.get('summary'):
-                        content = f"[Conversation Summary] {data['summary']}"
-                    elif msg_type == 'user' and data.get('summary'):
-                        content = f"User: {data['summary']}"
-                    elif msg_type == 'assistant' and data.get('summary'):
-                        content = f"Assistant: {data['summary']}"
-                    elif msg_type in ['user', 'assistant']:
-                        # Try to get content from other fields
-                        if 'content' in data:
-                            content = f"{msg_type.title()}: {data['content']}"
-                        elif 'text' in data:
-                            content = f"{msg_type.title()}: {data['text']}"
-
-                    if content:
-                        conversation_text.append(content)
-
-                    # Create chunks every 5 messages or at end
-                    if len(conversation_text) >= 5:
-                        chunk_text = '\n\n'.join(conversation_text)
-                        if chunk_text.strip():
-                            # Generate embedding
-                            embedding = list(model.embed([chunk_text[:2000]]))[0]  # Limit to 2000 chars
-
-                            point = PointStruct(
-                                id=str(uuid.uuid4()),
-                                vector=embedding.tolist(),
-                                payload={
-                                    'content': chunk_text[:1000],  # Store first 1000 chars
-                                    'full_content': chunk_text[:4000],  # Store more for context
-                                    'project_path': project_path,
-                                    'file_path': str(file_path),
-                                    'file_name': file_path.name,
-                                    'conversation_id': file_path.stem,
-                                    'chunk_index': len(points_batch),
-                                    'timestamp': file_timestamp,
-                                    'type': 'conversation_chunk'
-                                }
-                            )
-                            points_batch.append(point)
-                            conversation_text = []
-
-                except json.JSONDecodeError:
-                    logger.warning(f'Invalid JSON at line {line_num} in {file_path.name}')
-                except Exception as e:
-                    logger.warning(f'Error processing line {line_num}: {e}')
-
-            # Handle remaining text
-            if conversation_text:
-                chunk_text = '\n\n'.join(conversation_text)
-                if chunk_text.strip():
-                    embedding = list(model.embed([chunk_text[:2000]]))[0]
-
-                    point = PointStruct(
-                        id=str(uuid.uuid4()),
-                        vector=embedding.tolist(),
-                        payload={
-                            'content': chunk_text[:1000],
-                            'full_content': chunk_text[:4000],
-                            'project_path': project_path,
-                            'file_path': str(file_path),
-                            'file_name': file_path.name,
-                            'conversation_id': file_path.stem,
-                            'chunk_index': len(points_batch),
-                            'timestamp': file_timestamp,
-                            'type': 'conversation_chunk'
-                        }
-                    )
-                    points_batch.append(point)
-
-        # Upload batch
-        if points_batch:
-            client.upsert(collection_name=collection_name, points=points_batch)
-            logger.info(f'  Uploaded {len(points_batch)} chunks from {file_path.name}')
-            total_points += len(points_batch)
-
-    # Verify
-    info = client.get_collection(collection_name)
-    logger.info(f'\nImport complete!')
-    logger.info(f'Collection {collection_name} now has {info.points_count} points')
-    logger.info(f'Added {total_points} new points in this import')
-
-    return collection_name, total_points
-
-def main():
-    if len(sys.argv) < 2:
-        print("Usage: python import-old-format.py <project-directory> [project-path]")
-        print("Example: python import-old-format.py ~/.claude/projects/-Users-me-projects-myapp /Users/me/projects/myapp")
-        sys.exit(1)
-
-    project_dir = Path(sys.argv[1]).expanduser()
-    project_path = sys.argv[2] if len(sys.argv) > 2 else None
-
-    if not project_dir.exists():
-        print(f"Error: Directory {project_dir} does not exist")
-        sys.exit(1)
-
-    import_old_format_project(project_dir, project_path)
-
-if __name__ == "__main__":
-    main()