claude-self-reflect 2.7.4 → 2.8.0

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -30,8 +30,11 @@ RUN mkdir -p /root/.cache/fastembed && \
 # Set working directory
 WORKDIR /app
 
-# Copy scripts
-COPY scripts/ /scripts/
+# Copy application scripts
+COPY scripts/ /app/scripts/
+
+# Make watcher-loop.sh executable
+RUN chmod +x /app/scripts/watcher-loop.sh
 
 # Create config directory
 RUN mkdir -p /config
@@ -41,4 +44,4 @@ ENV PYTHONUNBUFFERED=1
 ENV MALLOC_ARENA_MAX=2
 
 # Run the watcher loop
-CMD ["/scripts/watcher-loop.sh"]
+CMD ["/app/scripts/watcher-loop.sh"]
package/README.md CHANGED
@@ -149,10 +149,17 @@ Here's how your conversations get imported and prioritized:
 
 ![Import Architecture](docs/diagrams/import-architecture.png)
 
-**The system intelligently prioritizes your conversations:**
-- **HOT** (< 5 minutes): Switches to 2-second intervals for near real-time import
-- **šŸŒ”ļø WARM** (< 24 hours): Normal priority, processed every 60 seconds
-- **ā„ļø COLD** (> 24 hours): Batch processed, max 5 per cycle to prevent blocking
+**The system intelligently processes your conversations:**
+- Runs every 60 seconds checking for new conversations
+- Processes newest conversations first (delta import pattern)
+- Maintains low memory usage (<50MB) through streaming
+- Handles up to 5 files per cycle to prevent blocking
+
+**HOT/WARM/COLD Intelligent Prioritization:**
+- **šŸ”„ HOT** (< 5 minutes): Switches to 2-second intervals for near real-time import
+- **šŸŒ”ļø WARM** (< 24 hours): Normal priority with starvation prevention (urgent after 30 min wait)
+- **ā„ļø COLD** (> 24 hours): Batch processed, max 5 per cycle to prevent blocking new content
+- Files are categorized by age and processed with priority queuing to ensure newest content gets imported quickly while preventing older files from being starved
 
 ## Using It
 
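The prioritization described in the README hunk above comes down to bucketing conversation files by modification age and ordering the import queue accordingly. A minimal sketch of that idea (illustrative only, not code shipped in the package; `categorize_by_age`, `order_for_import`, and the constants are made-up names whose values simply mirror the README):

```python
from datetime import datetime, timedelta
from pathlib import Path

HOT_WINDOW = timedelta(minutes=5)   # README: HOT is < 5 minutes old
WARM_WINDOW = timedelta(hours=24)   # README: WARM is < 24 hours old

def categorize_by_age(jsonl_file: Path) -> str:
    """Bucket a conversation file as HOT, WARM, or COLD by modification age."""
    age = datetime.now() - datetime.fromtimestamp(jsonl_file.stat().st_mtime)
    if age < HOT_WINDOW:
        return "HOT"    # picked up on the fast 2-second path
    if age < WARM_WINDOW:
        return "WARM"   # normal 60-second cycle, escalated if it waits too long
    return "COLD"       # batched, a few files per cycle

def order_for_import(files: list) -> list:
    """HOT before WARM before COLD; newest first within each bucket."""
    rank = {"HOT": 0, "WARM": 1, "COLD": 2}
    return sorted(files, key=lambda f: (rank[categorize_by_age(f)], -f.stat().st_mtime))
```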
@@ -177,21 +177,29 @@ services:
       - ./scripts:/scripts:ro
     environment:
       - QDRANT_URL=http://qdrant:6333
-      - STATE_FILE=/config/watcher-state.json
+      - STATE_FILE=/config/csr-watcher.json
+      - LOGS_DIR=/logs # Fixed: Point to mounted volume
       - VOYAGE_KEY=${VOYAGE_KEY:-}
      - PREFER_LOCAL_EMBEDDINGS=${PREFER_LOCAL_EMBEDDINGS:-true}
-      - HOT_WINDOW_MINUTES=${HOT_WINDOW_MINUTES:-15}
-      - MAX_COLD_FILES_PER_CYCLE=${MAX_COLD_FILES_PER_CYCLE:-3}
-      - MAX_MEMORY_MB=${MAX_MEMORY_MB:-300}
-      - WATCH_INTERVAL_SECONDS=${WATCH_INTERVAL_SECONDS:-30}
-      - MAX_FILES_PER_CYCLE=${MAX_FILES_PER_CYCLE:-10}
+      - ENABLE_MEMORY_DECAY=${ENABLE_MEMORY_DECAY:-false}
+      - DECAY_WEIGHT=${DECAY_WEIGHT:-0.3}
+      - DECAY_SCALE_DAYS=${DECAY_SCALE_DAYS:-90}
+      - CHECK_INTERVAL_S=${CHECK_INTERVAL_S:-60}
+      - HOT_CHECK_INTERVAL_S=${HOT_CHECK_INTERVAL_S:-2}
+      - HOT_WINDOW_MINUTES=${HOT_WINDOW_MINUTES:-5}
+      - WARM_WINDOW_HOURS=${WARM_WINDOW_HOURS:-24}
+      - MAX_COLD_FILES=${MAX_COLD_FILES:-5}
+      - MAX_WARM_WAIT_MINUTES=${MAX_WARM_WAIT_MINUTES:-30}
+      - MAX_MESSAGES_PER_CHUNK=${MAX_MESSAGES_PER_CHUNK:-10}
       - MAX_CHUNK_SIZE=${MAX_CHUNK_SIZE:-50} # Messages per chunk for streaming
+      - MEMORY_LIMIT_MB=${MEMORY_LIMIT_MB:-1000}
+      - MEMORY_WARNING_MB=${MEMORY_WARNING_MB:-500}
       - PYTHONUNBUFFERED=1
       - MALLOC_ARENA_MAX=2
-    restart: "no" # Manual start only - prevent system overload
-    profiles: ["safe-watch"] # Requires explicit profile to run
-    mem_limit: 600m # Increased from 400m to handle large files safely
-    memswap_limit: 600m
+    restart: unless-stopped
+    profiles: ["safe-watch", "watch"] # Requires explicit profile to run
+    mem_limit: 1g # Increased to 1GB to match MEMORY_LIMIT_MB
+    memswap_limit: 1g
     cpus: 1.0 # Single CPU core limit
 
   # MCP server for Claude integration
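The new environment variables above are the watcher's tuning knobs. How the container consumes them is not shown in this diff, but a plausible reading pattern (defaults copied from the compose file; the parsing code itself is an assumption for illustration) looks like:

```python
import os

CHECK_INTERVAL_S = int(os.getenv("CHECK_INTERVAL_S", "60"))            # normal scan cycle
HOT_CHECK_INTERVAL_S = int(os.getenv("HOT_CHECK_INTERVAL_S", "2"))     # fast path for HOT files
HOT_WINDOW_MINUTES = int(os.getenv("HOT_WINDOW_MINUTES", "5"))
WARM_WINDOW_HOURS = int(os.getenv("WARM_WINDOW_HOURS", "24"))
MAX_COLD_FILES = int(os.getenv("MAX_COLD_FILES", "5"))                 # COLD batch size per cycle
MAX_WARM_WAIT_MINUTES = int(os.getenv("MAX_WARM_WAIT_MINUTES", "30"))  # starvation guard for WARM
MAX_MESSAGES_PER_CHUNK = int(os.getenv("MAX_MESSAGES_PER_CHUNK", "10"))
MEMORY_LIMIT_MB = int(os.getenv("MEMORY_LIMIT_MB", "1000"))            # matches mem_limit: 1g
MEMORY_WARNING_MB = int(os.getenv("MEMORY_WARNING_MB", "500"))
```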
@@ -454,6 +454,26 @@ async function enrichMetadata() {
   }
 }
 
+async function startWatcher() {
+  console.log('\nšŸ”„ Starting the streaming watcher...');
+  console.log(' • HOT files (<5 min): 2-second processing');
+  console.log(' • WARM files (<24 hrs): Normal priority');
+  console.log(' • COLD files (>24 hrs): Batch processing');
+
+  try {
+    safeExec('docker', ['compose', '--profile', 'watch', 'up', '-d', 'safe-watcher'], {
+      cwd: projectRoot,
+      stdio: 'inherit'
+    });
+    console.log('āœ… Watcher started successfully!');
+    return true;
+  } catch (error) {
+    console.log('āš ļø Could not start watcher automatically');
+    console.log(' You can start it manually with: docker compose --profile watch up -d');
+    return false;
+  }
+}
+
 async function showFinalInstructions() {
   console.log('\nāœ… Setup complete!');
 
@@ -461,7 +481,7 @@ async function showFinalInstructions() {
   console.log(' • 🌐 Qdrant Dashboard: http://localhost:6333/dashboard/');
   console.log(' • šŸ“Š Status: All services running');
   console.log(' • šŸ” Search: Semantic search with memory decay enabled');
-  console.log(' • šŸš€ Import: Watcher checking every 60 seconds');
+  console.log(' • šŸš€ Watcher: HOT/WARM/COLD prioritization active');
 
   console.log('\nšŸ“‹ Quick Reference Commands:');
   console.log(' • Check status: docker compose ps');
@@ -568,6 +588,9 @@ async function main() {
   // Enrich metadata (new in v2.5.19)
   await enrichMetadata();
 
+  // Start the watcher
+  await startWatcher();
+
   // Show final instructions
   await showFinalInstructions();
 
@@ -9,6 +9,7 @@ import json
 import numpy as np
 import hashlib
 import time
+import logging
 
 from fastmcp import FastMCP, Context
 from .utils import normalize_project_name
@@ -124,18 +125,48 @@ indexing_status = {
     "is_checking": False
 }
 
-async def update_indexing_status():
+# Cache for indexing status (5-second TTL)
+_indexing_cache = {"result": None, "timestamp": 0}
+
+# Setup logger
+logger = logging.getLogger(__name__)
+
+def normalize_path(path_str: str) -> str:
+    """Normalize path for consistent comparison across platforms.
+
+    Args:
+        path_str: Path string to normalize
+
+    Returns:
+        Normalized path string with consistent separators
+    """
+    if not path_str:
+        return path_str
+    p = Path(path_str).expanduser().resolve()
+    return str(p).replace('\\', '/') # Consistent separators for all platforms
+
+async def update_indexing_status(cache_ttl: int = 5):
     """Update indexing status by checking JSONL files vs Qdrant collections.
-    This is a lightweight check that compares file counts, not full content."""
-    global indexing_status
+    This is a lightweight check that compares file counts, not full content.
+
+    Args:
+        cache_ttl: Cache time-to-live in seconds (default: 5)
+    """
+    global indexing_status, _indexing_cache
+
+    # Check cache first (5-second TTL to prevent performance issues)
+    current_time = time.time()
+    if _indexing_cache["result"] and current_time - _indexing_cache["timestamp"] < cache_ttl:
+        # Use cached result
+        indexing_status = _indexing_cache["result"].copy()
+        return
 
     # Don't run concurrent checks
     if indexing_status["is_checking"]:
         return
 
-    # Only check every 5 minutes to avoid overhead
-    current_time = time.time()
-    if current_time - indexing_status["last_check"] < 300: # 5 minutes
+    # Check immediately on first call, then every 60 seconds to avoid overhead
+    if indexing_status["last_check"] > 0 and current_time - indexing_status["last_check"] < 60: # 1 minute
         return
 
     indexing_status["is_checking"] = True
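The change above puts a module-level cache with a short TTL in front of the status check. The pattern in isolation (a standalone sketch, not the package's code; `slow_status_check` is a stand-in for the real JSONL-vs-Qdrant comparison):

```python
import time

_cache = {"result": None, "timestamp": 0.0}

def slow_status_check() -> dict:
    time.sleep(0.5)                     # stand-in for the expensive check
    return {"percentage": 100.0}

def cached_status(cache_ttl: float = 5.0) -> dict:
    """Serve the cached result while it is fresher than cache_ttl seconds."""
    now = time.time()
    if _cache["result"] is not None and now - _cache["timestamp"] < cache_ttl:
        return _cache["result"].copy()  # cache hit: no expensive work
    result = slow_status_check()
    _cache["result"] = result.copy()
    _cache["timestamp"] = now
    return result

cached_status()   # ~0.5 s: performs the real check
cached_status()   # effectively instant: served from the 5-second cache
```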
@@ -151,46 +182,107 @@ async def update_indexing_status():
         jsonl_files = list(projects_dir.glob("**/*.jsonl"))
         total_files = len(jsonl_files)
 
-        # Check imported-files.json to see what's been imported
-        # The streaming importer uses imported-files.json with nested structure
-        # Try multiple possible locations for the config file
+        # Check imported-files.json AND watcher state files to see what's been imported
+        # The system uses multiple state files that need to be merged
+        all_imported_files = set() # Use set to avoid duplicates
+        file_metadata = {}
+
+        # 1. Check imported-files.json (batch importer)
         possible_paths = [
             Path.home() / ".claude-self-reflect" / "config" / "imported-files.json",
             Path(__file__).parent.parent.parent / "config" / "imported-files.json",
             Path("/config/imported-files.json") # Docker path if running in container
         ]
 
-        imported_files_path = None
         for path in possible_paths:
             if path.exists():
-                imported_files_path = path
-                break
+                try:
+                    with open(path, 'r') as f:
+                        imported_data = json.load(f)
+                    imported_files_dict = imported_data.get("imported_files", {})
+                    file_metadata.update(imported_data.get("file_metadata", {}))
+                    # Normalize paths before adding to set
+                    normalized_files = {normalize_path(k) for k in imported_files_dict.keys()}
+                    all_imported_files.update(normalized_files)
+                except (json.JSONDecodeError, IOError) as e:
+                    logger.debug(f"Failed to read state file {path}: {e}")
+                    pass # Continue if file is corrupted
 
-        if imported_files_path and imported_files_path.exists():
-            with open(imported_files_path, 'r') as f:
-                imported_data = json.load(f)
-                # The actual structure has imported_files and file_metadata at the top level
-                # NOT nested under stream_position as previously assumed
-                imported_files_dict = imported_data.get("imported_files", {})
-                file_metadata = imported_data.get("file_metadata", {})
-
-                # Convert dict keys to list for compatibility with existing logic
-                imported_files_list = list(imported_files_dict.keys())
+        # 2. Check csr-watcher.json (streaming watcher - local mode)
+        watcher_paths = [
+            Path.home() / ".claude-self-reflect" / "config" / "csr-watcher.json",
+            Path("/config/csr-watcher.json") # Docker path
+        ]
+
+        for path in watcher_paths:
+            if path.exists():
+                try:
+                    with open(path, 'r') as f:
+                        watcher_data = json.load(f)
+                    watcher_files = watcher_data.get("imported_files", {})
+                    # Normalize paths before adding to set
+                    normalized_files = {normalize_path(k) for k in watcher_files.keys()}
+                    all_imported_files.update(normalized_files)
+                    # Add to metadata with normalized paths
+                    for file_path, info in watcher_files.items():
+                        normalized = normalize_path(file_path)
+                        if normalized not in file_metadata:
+                            file_metadata[normalized] = {
+                                "position": 1,
+                                "chunks": info.get("chunks", 0)
+                            }
+                except (json.JSONDecodeError, IOError) as e:
+                    logger.debug(f"Failed to read watcher state file {path}: {e}")
+                    pass # Continue if file is corrupted
+
+        # 3. Check csr-watcher-cloud.json (streaming watcher - cloud mode)
+        cloud_watcher_path = Path.home() / ".claude-self-reflect" / "config" / "csr-watcher-cloud.json"
+        if cloud_watcher_path.exists():
+            try:
+                with open(cloud_watcher_path, 'r') as f:
+                    cloud_data = json.load(f)
+                cloud_files = cloud_data.get("imported_files", {})
+                # Normalize paths before adding to set
+                normalized_files = {normalize_path(k) for k in cloud_files.keys()}
+                all_imported_files.update(normalized_files)
+                # Add to metadata with normalized paths
+                for file_path, info in cloud_files.items():
+                    normalized = normalize_path(file_path)
+                    if normalized not in file_metadata:
+                        file_metadata[normalized] = {
+                            "position": 1,
+                            "chunks": info.get("chunks", 0)
+                        }
+            except (json.JSONDecodeError, IOError) as e:
+                logger.debug(f"Failed to read cloud watcher state file {cloud_watcher_path}: {e}")
+                pass # Continue if file is corrupted
+
+        # Convert set to list for compatibility
+        imported_files_list = list(all_imported_files)
 
         # Count files that have been imported
         for file_path in jsonl_files:
+            # Normalize the current file path for consistent comparison
+            normalized_file = normalize_path(str(file_path))
+
             # Try multiple path formats to match Docker's state file
             file_str = str(file_path).replace(str(Path.home()), "/logs").replace("\\", "/")
             # Also try without .claude/projects prefix (Docker mounts directly)
             file_str_alt = file_str.replace("/.claude/projects", "")
 
+            # Normalize alternative paths as well
+            normalized_alt = normalize_path(file_str)
+            normalized_alt2 = normalize_path(file_str_alt)
+
             # Check if file is in imported_files list (fully imported)
-            if file_str in imported_files_list or file_str_alt in imported_files_list:
+            if normalized_file in imported_files_list or normalized_alt in imported_files_list or normalized_alt2 in imported_files_list:
                 indexed_files += 1
             # Or if it has metadata with position > 0 (partially imported)
-            elif file_str in file_metadata and file_metadata[file_str].get("position", 0) > 0:
+            elif normalized_file in file_metadata and file_metadata[normalized_file].get("position", 0) > 0:
+                indexed_files += 1
+            elif normalized_alt in file_metadata and file_metadata[normalized_alt].get("position", 0) > 0:
                 indexed_files += 1
-            elif file_str_alt in file_metadata and file_metadata[file_str_alt].get("position", 0) > 0:
+            elif normalized_alt2 in file_metadata and file_metadata[normalized_alt2].get("position", 0) > 0:
                 indexed_files += 1
 
         # Update status
@@ -203,9 +295,14 @@ async def update_indexing_status():
                 indexing_status["percentage"] = (indexed_files / total_files) * 100
             else:
                 indexing_status["percentage"] = 100.0
+
+        # Update cache
+        _indexing_cache["result"] = indexing_status.copy()
+        _indexing_cache["timestamp"] = current_time
 
     except Exception as e:
         print(f"[WARNING] Failed to update indexing status: {e}")
+        logger.error(f"Failed to update indexing status: {e}", exc_info=True)
    finally:
         indexing_status["is_checking"] = False
 
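The merge above only works because every key is passed through `normalize_path` first. A small demonstration of why (a self-contained copy of the helper; the sample path is made up):

```python
from pathlib import Path

def normalize_path(path_str: str) -> str:
    """Same idea as the helper added above: expand ~, resolve, unify separators."""
    if not path_str:
        return path_str
    p = Path(path_str).expanduser().resolve()
    return str(p).replace('\\', '/')

# Two spellings of the same (hypothetical) file collapse to one key, so a
# membership check against the merged set succeeds no matter which state
# file (batch importer, local watcher, cloud watcher) recorded it.
spellings = [
    "~/.claude/projects/demo/session.jsonl",
    str(Path.home() / ".claude" / "projects" / "demo" / "session.jsonl"),
]
print({normalize_path(s) for s in spellings})  # a single normalized entry
```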
@@ -5,6 +5,7 @@ Designed for <20ms execution time to support status bars and shell scripts.
 """
 
 import json
+import time
 from pathlib import Path
 from collections import defaultdict
 
@@ -53,11 +54,36 @@ def normalize_file_path(file_path: str) -> str:
     return file_path
 
 
+def get_watcher_status() -> dict:
+    """Get streaming watcher status if available."""
+    watcher_state_file = Path.home() / "config" / "csr-watcher.json"
+
+    if not watcher_state_file.exists():
+        return {"running": False, "status": "not configured"}
+
+    try:
+        with open(watcher_state_file) as f:
+            state = json.load(f)
+
+        # Check if watcher is active (modified recently)
+        file_age = time.time() - watcher_state_file.stat().st_mtime
+        is_active = file_age < 120 # Active if updated in last 2 minutes
+
+        return {
+            "running": is_active,
+            "files_processed": len(state.get("imported_files", {})),
+            "last_update_seconds": int(file_age),
+            "status": "🟢 active" if is_active else "šŸ”“ inactive"
+        }
+    except:
+        return {"running": False, "status": "error reading state"}
+
+
 def get_status() -> dict:
     """Get indexing status with overall stats and per-project breakdown.
 
     Returns:
-        dict: JSON structure with overall and per-project indexing status
+        dict: JSON structure with overall and per-project indexing status, plus watcher status
     """
     projects_dir = Path.home() / ".claude" / "projects"
     project_stats = defaultdict(lambda: {"indexed": 0, "total": 0})
@@ -154,6 +180,9 @@ def get_status() -> dict:
             "total": stats["total"]
         }
 
+    # Add watcher status
+    result["watcher"] = get_watcher_status()
+
    return result
 
 
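With the hunk above, `get_status()` now carries a `watcher` key shaped like the dictionary returned by `get_watcher_status()`. A hedged sketch of how a status-bar script might consume it (the payload below is invented for illustration):

```python
def render_watcher_line(result: dict) -> str:
    """Summarize the 'watcher' key that get_status() now includes."""
    w = result.get("watcher", {"running": False, "status": "not configured"})
    return f"{w['status']} ({w.get('files_processed', 0)} files, {w.get('last_update_seconds', '?')}s ago)"

# Example payload shaped like get_watcher_status(); the numbers are made up.
print(render_watcher_line({
    "watcher": {"running": True, "files_processed": 42,
                "last_update_seconds": 7, "status": "🟢 active"}
}))
```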
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "claude-self-reflect",
-  "version": "2.7.4",
+  "version": "2.8.0",
   "description": "Give Claude perfect memory of all your conversations - Installation wizard for Python MCP server",
   "keywords": [
     "claude",
@@ -1,374 +0,0 @@
-#!/usr/bin/env python3
-"""
-Streaming importer with true line-by-line processing to prevent OOM.
-Processes JSONL files without loading entire file into memory.
-"""
-
-import json
-import os
-import sys
-import hashlib
-import gc
-from pathlib import Path
-from datetime import datetime
-from typing import List, Dict, Any, Optional
-import logging
-
-# Add the project root to the Python path
-project_root = Path(__file__).parent.parent
-sys.path.insert(0, str(project_root))
-
-from qdrant_client import QdrantClient
-from qdrant_client.models import PointStruct, Distance, VectorParams
-
-# Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-# Environment variables
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
-PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
-VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
-MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50")) # Messages per chunk
-
-# Initialize Qdrant client
-client = QdrantClient(url=QDRANT_URL)
-
-# Initialize embedding provider
-embedding_provider = None
-embedding_dimension = None
-
-if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-    logger.info("Using local embeddings (fastembed)")
-    from fastembed import TextEmbedding
-    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    embedding_dimension = 384
-    collection_suffix = "local"
-else:
-    logger.info("Using Voyage AI embeddings")
-    import voyageai
-    embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
-    embedding_dimension = 1024
-    collection_suffix = "voyage"
-
-def normalize_project_name(project_name: str) -> str:
-    """Normalize project name for consistency."""
-    return project_name.replace("-Users-ramakrishnanannaswamy-projects-", "").replace("-", "_").lower()
-
-def get_collection_name(project_path: Path) -> str:
-    """Generate collection name from project path."""
-    normalized = normalize_project_name(project_path.name)
-    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
-    return f"conv_{name_hash}_{collection_suffix}"
-
-def ensure_collection(collection_name: str):
-    """Ensure collection exists with correct configuration."""
-    collections = client.get_collections().collections
-    if not any(c.name == collection_name for c in collections):
-        logger.info(f"Creating collection: {collection_name}")
-        client.create_collection(
-            collection_name=collection_name,
-            vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
-        )
-
-def generate_embeddings(texts: List[str]) -> List[List[float]]:
-    """Generate embeddings for texts."""
-    if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-        embeddings = list(embedding_provider.passage_embed(texts))
-        return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
-    else:
-        response = embedding_provider.embed(texts, model="voyage-3")
-        return response.embeddings
-
-def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
-                             conversation_id: str, created_at: str,
-                             metadata: Dict[str, Any], collection_name: str,
-                             project_path: Path) -> int:
-    """Process and immediately upload a single chunk."""
-    if not messages:
-        return 0
-
-    # Extract text content
-    texts = []
-    for msg in messages:
-        role = msg.get("role", "unknown")
-        content = msg.get("content", "")
-        if content:
-            texts.append(f"{role.upper()}: {content}")
-
-    if not texts:
-        return 0
-
-    chunk_text = "\n".join(texts)
-
-    try:
-        # Generate embedding
-        embeddings = generate_embeddings([chunk_text])
-
-        # Create point ID
-        point_id = hashlib.md5(
-            f"{conversation_id}_{chunk_index}".encode()
-        ).hexdigest()[:16]
-
-        # Create payload
-        payload = {
-            "text": chunk_text,
-            "conversation_id": conversation_id,
-            "chunk_index": chunk_index,
-            "timestamp": created_at,
-            "project": normalize_project_name(project_path.name),
-            "start_role": messages[0].get("role", "unknown") if messages else "unknown",
-            "message_count": len(messages)
-        }
-
-        # Add metadata
-        if metadata:
-            payload.update(metadata)
-
-        # Create point
-        point = PointStruct(
-            id=int(point_id, 16) % (2**63),
-            vector=embeddings[0],
-            payload=payload
-        )
-
-        # Upload immediately
-        client.upsert(
-            collection_name=collection_name,
-            points=[point],
-            wait=True
-        )
-
-        return 1
-
-    except Exception as e:
-        logger.error(f"Error processing chunk {chunk_index}: {e}")
-        return 0
-
-def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
-    """Extract metadata in a single pass, return metadata and first timestamp."""
-    metadata = {
-        "files_analyzed": [],
-        "files_edited": [],
-        "tools_used": [],
-        "concepts": []
-    }
-
-    first_timestamp = None
-
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                if not line.strip():
-                    continue
-
-                try:
-                    data = json.loads(line)
-
-                    # Get timestamp from first valid entry
-                    if first_timestamp is None and 'timestamp' in data:
-                        first_timestamp = data.get('timestamp')
-
-                    # Extract tool usage from messages
-                    if 'message' in data and data['message']:
-                        msg = data['message']
-                        if msg.get('content'):
-                            content = msg['content']
-                            if isinstance(content, list):
-                                for item in content:
-                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
-                                        tool_name = item.get('name', '')
-                                        if tool_name and tool_name not in metadata['tools_used']:
-                                            metadata['tools_used'].append(tool_name)
-
-                                        # Extract file references
-                                        if 'input' in item:
-                                            input_data = item['input']
-                                            if isinstance(input_data, dict):
-                                                if 'file_path' in input_data:
-                                                    file_ref = input_data['file_path']
-                                                    if file_ref not in metadata['files_analyzed']:
-                                                        metadata['files_analyzed'].append(file_ref)
-                                                if 'path' in input_data:
-                                                    file_ref = input_data['path']
-                                                    if file_ref not in metadata['files_analyzed']:
-                                                        metadata['files_analyzed'].append(file_ref)
-
-                except json.JSONDecodeError:
-                    continue
-                except Exception:
-                    continue
-
-    except Exception as e:
-        logger.warning(f"Error extracting metadata: {e}")
-
-    return metadata, first_timestamp or datetime.now().isoformat()
-
-def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
-    """Stream import a single JSONL file without loading it into memory."""
-    logger.info(f"Streaming import of {jsonl_file.name}")
-
-    # Extract metadata in first pass (lightweight)
-    metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
-
-    # Stream messages and process in chunks
-    chunk_buffer = []
-    chunk_index = 0
-    total_chunks = 0
-    conversation_id = jsonl_file.stem
-
-    try:
-        with open(jsonl_file, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f, 1):
-                line = line.strip()
-                if not line:
-                    continue
-
-                try:
-                    data = json.loads(line)
-
-                    # Skip non-message lines
-                    if data.get('type') == 'summary':
-                        continue
-
-                    # Extract message if present
-                    if 'message' in data and data['message']:
-                        msg = data['message']
-                        if msg.get('role') and msg.get('content'):
-                            # Extract content
-                            content = msg['content']
-                            if isinstance(content, list):
-                                text_parts = []
-                                for item in content:
-                                    if isinstance(item, dict) and item.get('type') == 'text':
-                                        text_parts.append(item.get('text', ''))
-                                    elif isinstance(item, str):
-                                        text_parts.append(item)
-                                content = '\n'.join(text_parts)
-
-                            if content:
-                                chunk_buffer.append({
-                                    'role': msg['role'],
-                                    'content': content
-                                })
-
-                    # Process chunk when buffer reaches MAX_CHUNK_SIZE
-                    if len(chunk_buffer) >= MAX_CHUNK_SIZE:
-                        chunks = process_and_upload_chunk(
-                            chunk_buffer, chunk_index, conversation_id,
-                            created_at, metadata, collection_name, project_path
-                        )
-                        total_chunks += chunks
-                        chunk_buffer = []
-                        chunk_index += 1
-
-                        # Force garbage collection after each chunk
-                        gc.collect()
-
-                        # Log progress
-                        if chunk_index % 10 == 0:
-                            logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
-
-                except json.JSONDecodeError:
-                    logger.debug(f"Skipping invalid JSON at line {line_num}")
-                except Exception as e:
-                    logger.debug(f"Error processing line {line_num}: {e}")
-
-        # Process remaining messages
-        if chunk_buffer:
-            chunks = process_and_upload_chunk(
-                chunk_buffer, chunk_index, conversation_id,
-                created_at, metadata, collection_name, project_path
-            )
-            total_chunks += chunks
-
-        logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
-        return total_chunks
-
-    except Exception as e:
-        logger.error(f"Failed to import {jsonl_file}: {e}")
-        return 0
-
-def load_state() -> dict:
-    """Load import state."""
-    if os.path.exists(STATE_FILE):
-        try:
-            with open(STATE_FILE, 'r') as f:
-                return json.load(f)
-        except:
-            pass
-    return {"imported_files": {}}
-
-def save_state(state: dict):
-    """Save import state."""
-    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-    with open(STATE_FILE, 'w') as f:
-        json.dump(state, f, indent=2)
-
-def should_import_file(file_path: Path, state: dict) -> bool:
-    """Check if file should be imported."""
-    file_str = str(file_path)
-    if file_str in state.get("imported_files", {}):
-        file_info = state["imported_files"][file_str]
-        last_modified = file_path.stat().st_mtime
-        if file_info.get("last_modified") == last_modified:
-            logger.info(f"Skipping unchanged file: {file_path.name}")
-            return False
-    return True
-
-def update_file_state(file_path: Path, state: dict, chunks: int):
-    """Update state for imported file."""
-    file_str = str(file_path)
-    state["imported_files"][file_str] = {
-        "imported_at": datetime.now().isoformat(),
-        "last_modified": file_path.stat().st_mtime,
-        "chunks": chunks
-    }
-
-def main():
-    """Main import function."""
-    # Load state
-    state = load_state()
-    logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")
-
-    # Find all projects
-    logs_dir = Path(os.getenv("LOGS_DIR", "/logs"))
-    project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
-    logger.info(f"Found {len(project_dirs)} projects to import")
-
-    total_imported = 0
-
-    for project_dir in project_dirs:
-        # Get collection name
-        collection_name = get_collection_name(project_dir)
-        logger.info(f"Importing project: {project_dir.name} -> {collection_name}")
-
-        # Ensure collection exists
-        ensure_collection(collection_name)
-
-        # Find JSONL files
-        jsonl_files = sorted(project_dir.glob("*.jsonl"))
-
-        # Limit files per cycle if specified
-        max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
-        jsonl_files = jsonl_files[:max_files]
-
-        for jsonl_file in jsonl_files:
-            if should_import_file(jsonl_file, state):
-                chunks = stream_import_file(jsonl_file, collection_name, project_dir)
-                if chunks > 0:
-                    update_file_state(jsonl_file, state, chunks)
-                    save_state(state)
-                    total_imported += 1
-
-            # Force GC after each file
-            gc.collect()
-
-    logger.info(f"Import complete: processed {total_imported} files")
-
-if __name__ == "__main__":
-    main()
@@ -1,124 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quick import script for current project's latest conversations.
-Designed for PreCompact hook integration - targets <10 second imports.
-"""
-
-import os
-import sys
-import json
-import subprocess
-from datetime import datetime, timedelta
-from pathlib import Path
-import logging
-
-# Configuration
-LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
-STATE_FILE = os.getenv("STATE_FILE", os.path.expanduser("~/.claude-self-reflect-state.json"))
-HOURS_BACK = int(os.getenv("IMPORT_HOURS_BACK", "2")) # Only import last 2 hours by default
-
-# Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-def load_state():
-    """Load import state from file."""
-    if os.path.exists(STATE_FILE):
-        try:
-            with open(STATE_FILE, 'r') as f:
-                return json.load(f)
-        except:
-            return {}
-    return {}
-
-def save_state(state):
-    """Save import state to file."""
-    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-    with open(STATE_FILE, 'w') as f:
-        json.dump(state, f, indent=2)
-
-def get_project_from_cwd():
-    """Detect project from current working directory."""
-    cwd = os.getcwd()
-    # Convert path to project name format used in logs
-    # Claude logs use format: -Users-username-path-to-project
-    project_name = cwd.replace('/', '-')
-    # Keep the leading dash as that's how Claude stores it
-    if not project_name.startswith('-'):
-        project_name = '-' + project_name
-    return project_name
-
-def get_recent_files(project_path: Path, hours_back: int):
-    """Get JSONL files modified in the last N hours."""
-    cutoff_time = datetime.now() - timedelta(hours=hours_back)
-    recent_files = []
-
-    for jsonl_file in project_path.glob("*.jsonl"):
-        mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
-        if mtime > cutoff_time:
-            recent_files.append(jsonl_file)
-
-    return sorted(recent_files, key=lambda f: f.stat().st_mtime, reverse=True)
-
-def main():
-    """Main quick import function."""
-    start_time = datetime.now()
-
-    # Detect current project
-    project_name = get_project_from_cwd()
-    project_path = Path(LOGS_DIR) / project_name
-
-    if not project_path.exists():
-        logger.warning(f"Project logs not found: {project_path}")
-        logger.info("Make sure you're in a project directory with Claude conversations.")
-        return
-
-    logger.info(f"Quick importing latest conversations for: {project_name}")
-
-    # Get recent files
-    recent_files = get_recent_files(project_path, HOURS_BACK)
-    logger.info(f"Found {len(recent_files)} files modified in last {HOURS_BACK} hours")
-
-    if not recent_files:
-        logger.info("No recent conversations to import")
-        return
-
-    # For now, just call the unified importer with the specific project
-    # This is a temporary solution until we implement incremental imports
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    unified_script = os.path.join(script_dir, "import-conversations-unified.py")
-
-    # Set environment to only process this project
-    env = os.environ.copy()
-    env['LOGS_DIR'] = str(project_path.parent)
-    env['IMPORT_PROJECT'] = project_name
-
-    try:
-        # Run the unified importer for just this project
-        result = subprocess.run(
-            [sys.executable, unified_script],
-            env=env,
-            capture_output=True,
-            text=True,
-            timeout=60 # 60 second timeout
-        )
-
-        if result.returncode == 0:
-            logger.info("Quick import completed successfully")
-        else:
-            logger.error(f"Import failed: {result.stderr}")
-
-    except subprocess.TimeoutExpired:
-        logger.warning("Import timed out after 60 seconds")
-    except Exception as e:
-        logger.error(f"Error during import: {e}")
-
-    # Report timing
-    elapsed = (datetime.now() - start_time).total_seconds()
-    logger.info(f"Quick import completed in {elapsed:.1f} seconds")
-
-if __name__ == "__main__":
-    main()
@@ -1,171 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import old format JSONL files from Claude conversations.
-These files have a different structure with type/summary fields instead of messages.
-"""
-
-import json
-import sys
-from pathlib import Path
-import hashlib
-import uuid
-from datetime import datetime
-from qdrant_client import QdrantClient
-from qdrant_client.models import Distance, VectorParams, PointStruct
-from fastembed import TextEmbedding
-import logging
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def import_old_format_project(project_dir: Path, project_path: str = None):
-    """Import old format JSONL files from a project directory."""
-
-    # Initialize
-    client = QdrantClient(url='http://localhost:6333')
-    model = TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2', max_length=512)
-
-    # Determine project path from directory name if not provided
-    if not project_path:
-        # Convert -Users-username-projects-projectname back to path
-        dir_name = project_dir.name
-        project_path = '/' + dir_name.strip('-').replace('-', '/')
-
-    # Create collection name
-    project_hash = hashlib.md5(project_path.encode()).hexdigest()[:8]
-    collection_name = f'conv_{project_hash}_local'
-
-    logger.info(f'Project: {project_path}')
-    logger.info(f'Collection: {collection_name}')
-
-    # Create collection if needed
-    try:
-        client.get_collection(collection_name)
-        logger.info('Collection exists')
-    except:
-        client.create_collection(
-            collection_name=collection_name,
-            vectors_config=VectorParams(size=384, distance=Distance.COSINE)
-        )
-        logger.info('Created collection')
-
-    # Process all JSONL files
-    jsonl_files = list(project_dir.glob('*.jsonl'))
-    logger.info(f'Found {len(jsonl_files)} files to import')
-
-    total_points = 0
-    for file_path in jsonl_files:
-        logger.info(f'Processing {file_path.name}...')
-        points_batch = []
-
-        with open(file_path, 'r', encoding='utf-8') as f:
-            conversation_text = []
-            file_timestamp = file_path.stat().st_mtime
-
-            for line_num, line in enumerate(f, 1):
-                try:
-                    data = json.loads(line)
-                    msg_type = data.get('type', '')
-
-                    # Extract text content based on type
-                    content = None
-                    if msg_type == 'summary' and data.get('summary'):
-                        content = f"[Conversation Summary] {data['summary']}"
-                    elif msg_type == 'user' and data.get('summary'):
-                        content = f"User: {data['summary']}"
-                    elif msg_type == 'assistant' and data.get('summary'):
-                        content = f"Assistant: {data['summary']}"
-                    elif msg_type in ['user', 'assistant']:
-                        # Try to get content from other fields
-                        if 'content' in data:
-                            content = f"{msg_type.title()}: {data['content']}"
-                        elif 'text' in data:
-                            content = f"{msg_type.title()}: {data['text']}"
-
-                    if content:
-                        conversation_text.append(content)
-
-                    # Create chunks every 5 messages or at end
-                    if len(conversation_text) >= 5:
-                        chunk_text = '\n\n'.join(conversation_text)
-                        if chunk_text.strip():
-                            # Generate embedding
-                            embedding = list(model.embed([chunk_text[:2000]]))[0] # Limit to 2000 chars
-
-                            point = PointStruct(
-                                id=str(uuid.uuid4()),
-                                vector=embedding.tolist(),
-                                payload={
-                                    'content': chunk_text[:1000], # Store first 1000 chars
-                                    'full_content': chunk_text[:4000], # Store more for context
-                                    'project_path': project_path,
-                                    'file_path': str(file_path),
-                                    'file_name': file_path.name,
-                                    'conversation_id': file_path.stem,
-                                    'chunk_index': len(points_batch),
-                                    'timestamp': file_timestamp,
-                                    'type': 'conversation_chunk'
-                                }
-                            )
-                            points_batch.append(point)
-                            conversation_text = []
-
-                except json.JSONDecodeError:
-                    logger.warning(f'Invalid JSON at line {line_num} in {file_path.name}')
-                except Exception as e:
-                    logger.warning(f'Error processing line {line_num}: {e}')
-
-            # Handle remaining text
-            if conversation_text:
-                chunk_text = '\n\n'.join(conversation_text)
-                if chunk_text.strip():
-                    embedding = list(model.embed([chunk_text[:2000]]))[0]
-
-                    point = PointStruct(
-                        id=str(uuid.uuid4()),
-                        vector=embedding.tolist(),
-                        payload={
-                            'content': chunk_text[:1000],
-                            'full_content': chunk_text[:4000],
-                            'project_path': project_path,
-                            'file_path': str(file_path),
-                            'file_name': file_path.name,
-                            'conversation_id': file_path.stem,
-                            'chunk_index': len(points_batch),
-                            'timestamp': file_timestamp,
-                            'type': 'conversation_chunk'
-                        }
-                    )
-                    points_batch.append(point)
-
-        # Upload batch
-        if points_batch:
-            client.upsert(collection_name=collection_name, points=points_batch)
-            logger.info(f' Uploaded {len(points_batch)} chunks from {file_path.name}')
-            total_points += len(points_batch)
-
-    # Verify
-    info = client.get_collection(collection_name)
-    logger.info(f'\nImport complete!')
-    logger.info(f'Collection {collection_name} now has {info.points_count} points')
-    logger.info(f'Added {total_points} new points in this import')
-
-    return collection_name, total_points
-
-def main():
-    if len(sys.argv) < 2:
-        print("Usage: python import-old-format.py <project-directory> [project-path]")
-        print("Example: python import-old-format.py ~/.claude/projects/-Users-me-projects-myapp /Users/me/projects/myapp")
-        sys.exit(1)
-
-    project_dir = Path(sys.argv[1]).expanduser()
-    project_path = sys.argv[2] if len(sys.argv) > 2 else None
-
-    if not project_dir.exists():
-        print(f"Error: Directory {project_dir} does not exist")
-        sys.exit(1)
-
-    import_old_format_project(project_dir, project_path)
-
-if __name__ == "__main__":
-    main()