claude-self-reflect 5.0.2 → 5.0.4

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
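For orientation: 5.0.4 replaces the monolithic streaming importer with a ConversationImporter class that delegates to modular components (MetadataExtractor, an embedding-service factory, StreamImportStrategy, UnifiedStateManager). Below is a minimal sketch of the new flow, mirroring the refactored main() shown in the diff; the module path of ConversationImporter is not part of this diff, so the import line is a hypothetical placeholder.

    from pathlib import Path
    from streaming_importer import ConversationImporter  # hypothetical module name; the diff does not show the filename

    importer = ConversationImporter()                   # wires up Qdrant client, embedding service, state manager
    project = Path.home() / ".claude" / "projects" / "my-project"
    stats = importer.import_project(project, limit=5)   # returns {"imported": n, "skipped": n, "failed": n}
    print(stats)

    # CLI equivalent, per the argparse flags in the new main() (script name not shown in this diff):
    #   python <importer-script>.py --project ~/.claude/projects/my-project --limit 5 --verbose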
@@ -1,59 +1,45 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- Streaming importer with true line-by-line processing to prevent OOM.
4
- Processes JSONL files without loading entire file into memory.
3
+ Refactored import script with reduced complexity using modular components.
4
+ All functions have cyclomatic complexity < 10.
5
5
  """
6
6
 
7
- import json
8
7
  import os
9
8
  import sys
10
- import hashlib
11
9
  import gc
12
- import ast
13
- import re
14
- import fcntl
15
- import time
16
10
  import argparse
17
- from pathlib import Path
18
- from datetime import datetime, timezone
19
- from typing import List, Dict, Any, Optional, Set
20
11
  import logging
12
+ import hashlib
13
+ import uuid
14
+ from pathlib import Path
15
+ from datetime import datetime
16
+ from typing import List, Dict, Any, Optional
21
17
 
22
- # Load .env file if it exists
23
- try:
24
- from dotenv import load_dotenv
25
- # Load from project root
26
- env_path = Path(__file__).parent.parent / '.env'
27
- if env_path.exists():
28
- load_dotenv(env_path)
29
- print(f"Loaded .env from {env_path}")
30
- except ImportError:
31
- pass # dotenv not available, use system environment
32
-
33
- # Add the scripts directory to the Python path for utils import
18
+ # Add the scripts directory to the Python path
34
19
  scripts_dir = Path(__file__).parent
35
20
  sys.path.insert(0, str(scripts_dir))
36
21
 
37
- # Import UnifiedStateManager
22
+ # Import refactored components
23
+ from metadata_extractor import MetadataExtractor
24
+ from embedding_service import create_embedding_service
25
+ from import_strategies import StreamImportStrategy
38
26
  from unified_state_manager import UnifiedStateManager
39
27
 
28
+ # Import Qdrant client
40
29
  from qdrant_client import QdrantClient
41
30
  from qdrant_client.models import PointStruct, Distance, VectorParams
42
31
 
43
- # Import normalize_project_name from shared module
44
- # Add parent directory to path to import shared module
32
+ # Import shared modules
45
33
  sys.path.insert(0, str(Path(__file__).parent.parent))
46
34
  try:
47
35
  from shared.normalization import normalize_project_name
48
- except ImportError as e:
49
- logging.error(f"Failed to import normalize_project_name from shared module: {e}")
50
- # Fall back to local utils if shared module not found
36
+ except ImportError:
51
37
  try:
52
- from utils import normalize_project_name
53
- logging.warning("Using legacy utils.normalize_project_name - consider updating")
38
+ from importer.utils.project_normalizer import normalize_project_name
39
+ logging.debug("Using importer.utils.project_normalizer.normalize_project_name")
54
40
  except ImportError:
55
- logging.error("Could not import normalize_project_name from any source")
56
- sys.exit(1)
41
+ from utils import normalize_project_name
42
+ logging.warning("Using legacy utils.normalize_project_name")
57
43
 
58
44
  # Set up logging
59
45
  logging.basicConfig(
@@ -62,826 +48,311 @@ logging.basicConfig(
62
48
  )
63
49
  logger = logging.getLogger(__name__)
64
50
 
65
- # Environment variables
51
+ # Constants
66
52
  QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
53
+ MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))
54
+
55
+
56
+ class ConversationImporter:
57
+ """Main class for importing conversations with reduced complexity."""
58
+
59
+ def __init__(self):
60
+ """Initialize the importer with all required services."""
61
+ self.client = self._init_qdrant_client()
62
+ self.embedding_service = create_embedding_service()
63
+ self.state_manager = self._init_state_manager()
64
+ self.metadata_extractor = MetadataExtractor()
65
+ self.import_strategy = None
66
+
67
+ def _init_qdrant_client(self) -> QdrantClient:
68
+ """Initialize Qdrant client with optional authentication."""
69
+ api_key = os.getenv("QDRANT_API_KEY")
70
+ if api_key:
71
+ return QdrantClient(url=QDRANT_URL, api_key=api_key, timeout=30)
72
+ return QdrantClient(url=QDRANT_URL, timeout=30)
73
+
74
+ def _init_state_manager(self) -> UnifiedStateManager:
75
+ """Initialize state manager."""
76
+ env_state = os.getenv("STATE_FILE")
77
+ if env_state:
78
+ state_file_path = Path(env_state).expanduser().resolve()
79
+ return UnifiedStateManager(state_file_path)
80
+ return UnifiedStateManager()
81
+
82
+ def get_collection_name(self, project_path: Path) -> str:
83
+ """Get collection name for a project."""
84
+ project_name = normalize_project_name(str(project_path))
85
+ suffix = self.embedding_service.get_collection_suffix()
86
+ return f"csr_{project_name}_{suffix}"
87
+
88
+ def ensure_collection(self, collection_name: str):
89
+ """Ensure collection exists with correct configuration."""
90
+ collections = self.client.get_collections().collections
91
+ exists = any(c.name == collection_name for c in collections)
92
+
93
+ if not exists:
94
+ dimension = self.embedding_service.get_dimension()
95
+ self.client.create_collection(
96
+ collection_name=collection_name,
97
+ vectors_config=VectorParams(size=dimension, distance=Distance.COSINE)
98
+ )
99
+ logger.info(f"Created collection: {collection_name} with {dimension} dimensions")
100
+
101
+ def process_and_upload_chunk(
102
+ self,
103
+ messages: List[Dict[str, Any]],
104
+ chunk_index: int,
105
+ conversation_id: str,
106
+ created_at: str,
107
+ metadata: Dict[str, Any],
108
+ collection_name: str,
109
+ project_path: Path,
110
+ total_messages: int
111
+ ) -> int:
112
+ """Process and upload a chunk of messages."""
113
+ if not messages:
114
+ return 0
67
115
 
68
- # Constants for metadata limits
69
- MAX_CONCEPTS = 10
70
- MAX_AST_ELEMENTS = 30
71
- MAX_CODE_BLOCKS = 5
72
- MAX_ELEMENTS_PER_BLOCK = 10
73
- MAX_FILES_ANALYZED = 20
74
- MAX_FILES_EDITED = 20
75
- MAX_TOOLS_USED = 15
76
- MAX_CONCEPT_MESSAGES = 50
77
-
78
- # Initialize UnifiedStateManager
79
- # Support legacy STATE_FILE environment variable
80
- env_state = os.getenv("STATE_FILE")
81
- if env_state:
82
- from pathlib import Path
83
- state_file_path = Path(env_state).expanduser().resolve()
84
- state_manager = UnifiedStateManager(state_file_path)
85
- else:
86
- state_manager = UnifiedStateManager() # Uses default location
87
- PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
88
- VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
89
- MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50")) # Messages per chunk
90
-
91
- # Initialize Qdrant client with timeout
92
- client = QdrantClient(
93
- url=QDRANT_URL,
94
- timeout=30 # 30 second timeout for network operations
95
- )
116
+ # Combine all message content into a single text for the chunk
117
+ combined_text = "\n".join([msg['content'] for msg in messages])
96
118
 
97
- # Initialize embedding provider
98
- embedding_provider = None
99
- embedding_dimension = None
100
-
101
- if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
102
- logger.info("Using local embeddings (fastembed)")
103
- from fastembed import TextEmbedding
104
- # Using the same model as official Qdrant MCP server
105
- embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
106
- embedding_dimension = 384
107
- collection_suffix = "local"
108
- logger.info("Using fastembed model: sentence-transformers/all-MiniLM-L6-v2")
109
- else:
110
- logger.info("Using Voyage AI embeddings")
111
- import voyageai
112
- embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
113
- embedding_dimension = 1024
114
- collection_suffix = "voyage"
115
-
116
- def get_collection_name(project_path: Path) -> str:
117
- """Generate collection name from project path."""
118
- normalized = normalize_project_name(str(project_path))
119
- name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
120
- return f"conv_{name_hash}_{collection_suffix}"
121
-
122
- def ensure_collection(collection_name: str):
123
- """Ensure collection exists with correct configuration."""
124
- collections = client.get_collections().collections
125
- if not any(c.name == collection_name for c in collections):
126
- logger.info(f"Creating collection: {collection_name}")
127
- client.create_collection(
128
- collection_name=collection_name,
129
- vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
119
+ # Generate a single embedding for the entire chunk
120
+ embeddings = self.embedding_service.generate_embeddings([combined_text])
121
+ if not embeddings:
122
+ return 0
123
+
124
+ # Create points for upload
125
+ points = self._create_points(
126
+ messages, embeddings, chunk_index,
127
+ conversation_id, created_at, metadata,
128
+ project_path, total_messages
130
129
  )
131
130
 
132
- def generate_embeddings(texts: List[str]) -> List[List[float]]:
133
- """Generate embeddings for texts."""
134
- # Use the global embedding_provider which gets updated by command-line args
135
- if PREFER_LOCAL_EMBEDDINGS:
136
- # FastEmbed uses 'embed' method, not 'passage_embed'
137
- # Try 'embed' first, fall back to 'passage_embed' for compatibility
138
- if hasattr(embedding_provider, 'embed'):
139
- embeddings = list(embedding_provider.embed(texts))
140
- elif hasattr(embedding_provider, 'passage_embed'):
141
- # Fallback for older versions (shouldn't exist but kept for safety)
142
- embeddings = list(embedding_provider.passage_embed(texts))
143
- else:
144
- raise AttributeError("FastEmbed provider has neither 'embed' nor 'passage_embed' method")
145
- return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
146
- else:
147
- response = embedding_provider.embed(texts, model="voyage-3")
148
- return response.embeddings
149
-
150
- def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
151
- conversation_id: str, created_at: str,
152
- metadata: Dict[str, Any], collection_name: str,
153
- project_path: Path, total_messages: int) -> int:
154
- """Process and immediately upload a single chunk."""
155
- if not messages:
156
- return 0
157
-
158
- # Extract text content and message indices
159
- texts = []
160
- message_indices = []
161
- for msg in messages:
162
- role = msg.get("role", "unknown")
163
- content = msg.get("content", "")
164
- if content:
165
- texts.append(f"{role.upper()}: {content}")
166
- # Fix: Check for None instead of truthiness to include 0 values
167
- idx = msg.get("message_index")
168
- if idx is not None:
169
- message_indices.append(idx)
170
-
171
- if not texts:
172
- return 0
173
-
174
- chunk_text = "\n".join(texts)
175
-
176
- try:
177
- # Generate embedding
178
- embeddings = generate_embeddings([chunk_text])
179
-
180
- # Sanity check embeddings
181
- if not embeddings or not embeddings[0]:
182
- logger.error(f"Empty embedding generated for chunk {chunk_index}")
183
- return 0
184
-
185
- embedding = embeddings[0]
186
-
187
- # Check for degenerate embeddings (all values identical)
188
- if len(set(embedding)) == 1:
189
- logger.error(f"Degenerate embedding detected (all values identical): {embedding[0]}")
190
- return 0
191
-
192
- # Check variance is above threshold
193
- import statistics
194
- variance = statistics.variance(embedding)
195
- if variance < 1e-4: # Less strict threshold for valid embeddings
196
- logger.warning(f"Low variance embedding detected: {variance}")
197
-
198
- # Validate dimension
199
- if len(embedding) != embedding_dimension:
200
- logger.error(f"Embedding dimension mismatch: expected {embedding_dimension}, got {len(embedding)}")
201
- return 0
202
-
203
- # Create point ID
204
- point_id = hashlib.md5(
205
- f"{conversation_id}_{chunk_index}".encode()
206
- ).hexdigest()[:16]
207
-
208
- # Create payload
209
- payload = {
210
- "text": chunk_text,
211
- "conversation_id": conversation_id,
212
- "chunk_index": chunk_index,
213
- "timestamp": created_at,
214
- "project": normalize_project_name(str(project_path)),
215
- "start_role": messages[0].get("role", "unknown") if messages else "unknown",
216
- "message_count": len(messages),
217
- "total_messages": total_messages,
218
- "message_index": message_indices[0] if message_indices else None,
219
- "message_indices": message_indices # Store all indices in this chunk
220
- }
221
-
222
- # Add metadata
223
- if metadata:
224
- payload.update(metadata)
225
-
226
- # Create point
131
+ # Upload to Qdrant
132
+ self._upload_points(collection_name, points)
133
+
134
+ return 1 # Return number of chunks processed
135
+
136
+ def _create_points(
137
+ self,
138
+ messages: List[Dict[str, Any]],
139
+ embeddings: List[List[float]],
140
+ chunk_index: int,
141
+ conversation_id: str,
142
+ created_at: str,
143
+ metadata: Dict[str, Any],
144
+ project_path: Path,
145
+ total_messages: int
146
+ ) -> List[PointStruct]:
147
+ """Create Qdrant points from messages and embeddings."""
148
+ points = []
149
+ # Generate a proper UUID for the chunk ID
150
+ # Use a deterministic UUID based on conversation_id and chunk_index for consistency
151
+ chunk_string = f"{conversation_id}_chunk_{chunk_index}"
152
+ chunk_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk_string))
153
+
154
+ # Build conversation snippet
155
+ snippet_parts = []
156
+ for msg in messages[:5]: # First 5 messages for snippet
157
+ role = msg['role']
158
+ content = msg['content'][:200] # Truncate for snippet
159
+ snippet_parts.append(f"{role}: {content}")
160
+ conversation_snippet = "\n".join(snippet_parts)
161
+
162
+ # Create point with proper vector format
163
+ # Always use the first embedding for a chunk (combining messages into one embedding)
227
164
  point = PointStruct(
228
- id=int(point_id, 16) % (2**63),
229
- vector=embedding, # Use validated embedding variable
230
- payload=payload
165
+ id=chunk_uuid,
166
+ vector=embeddings[0],
167
+ payload={
168
+ "conversation_id": conversation_id,
169
+ "chunk_index": chunk_index,
170
+ "created_at": created_at,
171
+ "project": str(project_path),
172
+ "messages": messages,
173
+ "metadata": metadata,
174
+ "conversation_snippet": conversation_snippet,
175
+ "total_messages": total_messages,
176
+ "embedding_model": self.embedding_service.get_provider_name()
177
+ }
231
178
  )
232
-
233
- # Upload with wait to ensure persistence (with retries)
234
- result = _with_retries(lambda: client.upsert(
235
- collection_name=collection_name,
236
- points=[point],
237
- wait=True # Ensure operation completed before continuing
238
- ))
239
-
240
- # Verify the operation completed successfully (handle enum or string representations)
241
- status = getattr(result, 'status', None)
242
- if status and 'completed' not in str(status).lower():
243
- logger.error(f"Upsert not completed for {conversation_id}:{chunk_index}, status={status}")
244
- return 0
245
-
246
- return 1
247
-
248
- except Exception as e:
249
- logger.error(f"Error processing chunk {chunk_index}: {e}")
250
- return 0
251
-
252
- def extract_ast_elements(code_text: str) -> Set[str]:
253
- """Extract function and class names from code using AST parsing."""
254
- elements = set()
255
-
256
- # Try to parse as Python code
257
- try:
258
- tree = ast.parse(code_text)
259
- for node in ast.walk(tree):
260
- if isinstance(node, ast.FunctionDef):
261
- elements.add(f"func:{node.name}")
262
- elif isinstance(node, ast.AsyncFunctionDef):
263
- elements.add(f"func:{node.name}")
264
- elif isinstance(node, ast.ClassDef):
265
- elements.add(f"class:{node.name}")
266
- except SyntaxError:
267
- # Python regex fallback for partial fragments
268
- for m in re.finditer(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
269
- elements.add(f"func:{m.group(1)}")
270
- for m in re.finditer(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
271
- elements.add(f"func:{m.group(1)}")
272
- for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code_text, re.MULTILINE):
273
- elements.add(f"class:{m.group(1)}")
274
- except Exception as e:
275
- logger.debug(f"Unexpected error parsing AST: {e}")
276
-
277
- # Try regex patterns for other languages
278
- # JavaScript/TypeScript functions
279
- js_func_pattern = r'(?:function|const|let|var)\s+(\w+)\s*(?:=\s*)?(?:\([^)]*\)|\s*=>)'
280
- for match in re.finditer(js_func_pattern, code_text):
281
- elements.add(f"func:{match.group(1)}")
282
-
283
- # Class definitions (multiple languages)
284
- class_pattern = r'(?:class|interface|struct)\s+(\w+)'
285
- for match in re.finditer(class_pattern, code_text):
286
- elements.add(f"class:{match.group(1)}")
287
-
288
- return elements
289
-
290
- def extract_concepts(text: str) -> List[str]:
291
- """Extract development concepts from text."""
292
- concepts = []
293
- concept_patterns = {
294
- 'docker': r'\b(?:docker|container|compose|dockerfile)\b',
295
- 'testing': r'\b(?:test|testing|unittest|pytest|jest)\b',
296
- 'database': r'\b(?:database|sql|postgres|mysql|mongodb|qdrant)\b',
297
- 'api': r'\b(?:api|rest|graphql|endpoint)\b',
298
- 'security': r'\b(?:security|auth|authentication|encryption)\b',
299
- 'performance': r'\b(?:performance|optimization|cache|speed)\b',
300
- 'debugging': r'\b(?:debug|debugging|error|bug|trace)\b',
301
- 'deployment': r'\b(?:deploy|deployment|ci\/cd|production)\b',
302
- 'git': r'\b(?:git|commit|branch|merge|pull request)\b',
303
- 'mcp': r'\b(?:mcp|claude-self-reflect|claude code)\b',
304
- 'embeddings': r'\b(?:embedding|vector|semantic|similarity)\b',
305
- }
306
-
307
- text_lower = text.lower()
308
- for concept, pattern in concept_patterns.items():
309
- if re.search(pattern, text_lower, re.IGNORECASE):
310
- if concept not in concepts:
311
- concepts.append(concept)
312
-
313
- return concepts[:MAX_CONCEPTS]
314
-
315
- def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, int]:
316
- """Extract metadata in a single pass, return metadata, first timestamp, and message count."""
317
- metadata = {
318
- "files_analyzed": [],
319
- "files_edited": [],
320
- "tools_used": [],
321
- "concepts": [],
322
- "ast_elements": [],
323
- "has_code_blocks": False,
324
- "total_messages": 0,
325
- "project_path": None # Add project path from cwd
326
- }
327
-
328
- first_timestamp = None
329
- message_count = 0
330
- all_text = []
179
+ points.append(point)
331
180
 
332
- try:
333
- with open(file_path, 'r', encoding='utf-8') as f:
334
- for line in f:
335
- if not line.strip():
336
- continue
181
+ return points
337
182
 
183
+ def _upload_points(self, collection_name: str, points: List[PointStruct]):
184
+ """Upload points to Qdrant with retry logic."""
185
+ max_retries = 3
186
+ for attempt in range(max_retries):
187
+ try:
188
+ self.client.upsert(
189
+ collection_name=collection_name,
190
+ points=points,
191
+ wait=True
192
+ )
193
+ return
194
+ except Exception as e:
195
+ if attempt < max_retries - 1:
196
+ logger.warning(f"Upload attempt {attempt + 1} failed: {e}")
197
+ else:
198
+ raise
199
+
200
+ def should_import_file(self, file_path: Path) -> bool:
201
+ """Check if a file should be imported."""
202
+ if not file_path.exists() or file_path.stat().st_size == 0:
203
+ return False
204
+
205
+ # Check if file was already imported using UnifiedStateManager API
206
+ imported_files = self.state_manager.get_imported_files()
207
+ normalized_path = self.state_manager.normalize_path(str(file_path))
208
+
209
+ # UnifiedStateManager returns files directly, not nested in 'files' key
210
+ file_state = imported_files.get(normalized_path)
211
+ if file_state:
212
+ file_mtime = datetime.fromtimestamp(file_path.stat().st_mtime).replace(tzinfo=None)
213
+ # Handle both old and new timestamp field names
214
+ state_mtime_str = file_state.get('last_modified') or file_state.get('imported_at')
215
+ if state_mtime_str:
338
216
  try:
339
- data = json.loads(line)
340
-
341
- # Extract cwd (current working directory) as project path
342
- if metadata["project_path"] is None and 'cwd' in data:
343
- metadata["project_path"] = data.get('cwd')
344
-
345
- # Get timestamp from first valid entry
346
- if first_timestamp is None and 'timestamp' in data:
347
- first_timestamp = data.get('timestamp')
348
-
349
- # Count messages
350
- if 'message' in data and data['message']:
351
- msg = data['message']
352
- if msg.get('role') in ['user', 'assistant']:
353
- message_count += 1
354
-
355
- if msg.get('content'):
356
- content = msg['content']
357
- text_content = ""
358
-
359
- if isinstance(content, list):
360
- for item in content:
361
- if isinstance(item, dict):
362
- if item.get('type') == 'text':
363
- text_content += item.get('text', '')
364
- # Check for code blocks
365
- if '```' in item.get('text', ''):
366
- metadata['has_code_blocks'] = True
367
- # Extract code for AST analysis with bounds checking
368
- if len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
369
- # Fix: More permissive regex to handle various fence formats
370
- # Handles both ```\n and ```python\n cases, with optional newline
371
- code_blocks = re.findall(r'```[^`\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
372
- for code_block in code_blocks[:MAX_CODE_BLOCKS]: # Use defined constant
373
- if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
374
- break
375
- ast_elems = extract_ast_elements(code_block)
376
- for elem in list(ast_elems)[:MAX_ELEMENTS_PER_BLOCK]: # Use defined constant
377
- if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
378
- metadata['ast_elements'].append(elem)
379
-
380
- elif item.get('type') == 'thinking':
381
- # Also include thinking content in metadata extraction
382
- text_content += item.get('thinking', '')
383
-
384
- elif item.get('type') == 'tool_use':
385
- tool_name = item.get('name', '')
386
- if tool_name and tool_name not in metadata['tools_used']:
387
- metadata['tools_used'].append(tool_name)
388
-
389
- # Extract file references
390
- if 'input' in item:
391
- input_data = item['input']
392
- if isinstance(input_data, dict):
393
- # Determine if it's an edit tool
394
- is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
395
-
396
- if 'file_path' in input_data:
397
- file_ref = input_data['file_path']
398
- if is_edit:
399
- if file_ref not in metadata['files_edited']:
400
- metadata['files_edited'].append(file_ref)
401
- else:
402
- if file_ref not in metadata['files_analyzed']:
403
- metadata['files_analyzed'].append(file_ref)
404
-
405
- if 'path' in input_data:
406
- file_ref = input_data['path']
407
- if file_ref not in metadata['files_analyzed']:
408
- metadata['files_analyzed'].append(file_ref)
409
- elif isinstance(item, str):
410
- text_content += item
411
- elif isinstance(content, str):
412
- text_content = content
413
-
414
- # Collect text for concept extraction
415
- if text_content:
416
- all_text.append(text_content[:1000]) # Limit text per message
417
-
418
- except json.JSONDecodeError:
419
- continue
420
- except Exception:
421
- continue
422
-
423
- except Exception as e:
424
- logger.warning(f"Error extracting metadata: {e}")
425
-
426
- # Extract concepts from collected text
427
- if all_text:
428
- combined_text = ' '.join(all_text[:MAX_CONCEPT_MESSAGES]) # Limit messages for concept extraction
429
- metadata['concepts'] = extract_concepts(combined_text)
430
-
431
- # MANDATORY: AST-GREP Pattern Analysis
432
- # Analyze code quality for files mentioned in conversation
433
- pattern_quality = {}
434
- avg_quality_score = 0.0
435
-
436
- try:
437
- # Update patterns first (uses 24h cache, <100ms)
438
- from update_patterns import check_and_update_patterns
439
- check_and_update_patterns()
440
-
441
- # Import analyzer
442
- from ast_grep_final_analyzer import FinalASTGrepAnalyzer
443
- analyzer = FinalASTGrepAnalyzer()
217
+ state_mtime = datetime.fromisoformat(state_mtime_str).replace(tzinfo=None)
218
+ if file_mtime <= state_mtime:
219
+ logger.debug(f"Skipping {file_path.name} - already imported")
220
+ return False
221
+ except ValueError:
222
+ logger.debug(f"Invalid timestamp in state for {file_path.name}; will re-import")
444
223
 
445
- # Analyze edited and analyzed files
446
- files_to_analyze = list(set(metadata['files_edited'] + metadata['files_analyzed'][:10]))
447
- quality_scores = []
224
+ return True
448
225
 
449
- for file_path in files_to_analyze:
450
- # Only analyze code files
451
- if file_path and any(file_path.endswith(ext) for ext in ['.py', '.ts', '.js', '.tsx', '.jsx']):
452
- try:
453
- # Check if file exists and is accessible
454
- if os.path.exists(file_path):
455
- result = analyzer.analyze_file(file_path)
456
- metrics = result['quality_metrics']
457
- pattern_quality[file_path] = {
458
- 'score': metrics['quality_score'],
459
- 'good_patterns': metrics['good_patterns_found'],
460
- 'bad_patterns': metrics['bad_patterns_found'],
461
- 'issues': metrics['total_issues']
462
- }
463
- quality_scores.append(metrics['quality_score'])
464
- except Exception as e:
465
- logger.debug(f"Could not analyze {file_path}: {e}")
466
-
467
- # Calculate average quality
468
- if quality_scores:
469
- avg_quality_score = sum(quality_scores) / len(quality_scores)
470
-
471
- except Exception as e:
472
- logger.debug(f"AST analysis not available: {e}")
473
-
474
- # Add pattern analysis to metadata
475
- metadata['pattern_analysis'] = pattern_quality
476
- metadata['avg_quality_score'] = round(avg_quality_score, 3)
477
-
478
- # Set total messages
479
- metadata['total_messages'] = message_count
480
-
481
- # Limit arrays
482
- metadata['files_analyzed'] = metadata['files_analyzed'][:MAX_FILES_ANALYZED]
483
- metadata['files_edited'] = metadata['files_edited'][:MAX_FILES_EDITED]
484
- metadata['tools_used'] = metadata['tools_used'][:MAX_TOOLS_USED]
485
- metadata['ast_elements'] = metadata['ast_elements'][:MAX_AST_ELEMENTS]
486
-
487
- return metadata, first_timestamp or datetime.now().isoformat(), message_count
488
-
489
- def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
490
- """Stream import a single JSONL file without loading it into memory."""
491
- logger.info(f"Streaming import of {jsonl_file.name}")
492
-
493
- # Extract conversation ID
494
- conversation_id = jsonl_file.stem
495
-
496
- # Extract metadata in first pass (lightweight)
497
- metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))
498
-
499
- # Track whether we should delete old points (only after successful import)
500
- should_delete_old = False
501
-
502
- # Reset counters for each conversation (critical for correct indexing)
503
- current_message_index = 0 # Must be reset before processing each conversation
504
-
505
- # Stream messages and process in chunks
506
- chunk_buffer = []
507
- chunk_index = 0
508
- total_chunks = 0
509
- conversation_id = jsonl_file.stem
510
-
511
- try:
512
- with open(jsonl_file, 'r', encoding='utf-8') as f:
513
- for line_num, line in enumerate(f, 1):
514
- line = line.strip()
515
- if not line:
516
- continue
517
-
518
- try:
519
- data = json.loads(line)
520
-
521
- # Skip non-message lines
522
- if data.get('type') == 'summary':
523
- continue
524
-
525
- # Extract message if present
526
- if 'message' in data and data['message']:
527
- msg = data['message']
528
- if msg.get('role') and msg.get('content'):
529
- # Extract content from various message types
530
- content = msg['content']
531
- if isinstance(content, list):
532
- text_parts = []
533
- for item in content:
534
- if isinstance(item, dict):
535
- item_type = item.get('type', '')
536
- if item_type == 'text':
537
- text_parts.append(item.get('text', ''))
538
- elif item_type == 'thinking':
539
- # Include thinking content (from Claude's thinking blocks)
540
- thinking_content = item.get('thinking', '')
541
- if thinking_content:
542
- text_parts.append(f"[Thinking] {thinking_content[:1000]}") # Limit size
543
- elif item_type == 'tool_use':
544
- # Include tool use information
545
- tool_name = item.get('name', 'unknown')
546
- tool_input = str(item.get('input', ''))[:500] # Limit size
547
- text_parts.append(f"[Tool: {tool_name}] {tool_input}")
548
- elif item_type == 'tool_result':
549
- # Include tool results
550
- result_content = str(item.get('content', ''))[:1000] # Limit size
551
- text_parts.append(f"[Result] {result_content}")
552
- elif isinstance(item, str):
553
- text_parts.append(item)
554
- content = '\n'.join(text_parts)
555
-
556
- if content:
557
- # Track message index for user/assistant messages
558
- if msg['role'] in ['user', 'assistant']:
559
- message_idx = current_message_index
560
- current_message_index += 1
561
- else:
562
- message_idx = 0
563
-
564
- chunk_buffer.append({
565
- 'role': msg['role'],
566
- 'content': content,
567
- 'message_index': message_idx
568
- })
569
-
570
- # Process chunk when buffer reaches MAX_CHUNK_SIZE
571
- if len(chunk_buffer) >= MAX_CHUNK_SIZE:
572
- chunks = process_and_upload_chunk(
573
- chunk_buffer, chunk_index, conversation_id,
574
- created_at, metadata, collection_name, project_path, total_messages
575
- )
576
- total_chunks += chunks
577
- chunk_buffer = []
578
- chunk_index += 1
579
-
580
- # Force garbage collection after each chunk
581
- gc.collect()
582
-
583
- # Log progress
584
- if chunk_index % 10 == 0:
585
- logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
586
-
587
- # Handle top-level tool_result/tool_use events (no message wrapper)
588
- entry_type = data.get('type')
589
- if entry_type in ('tool_result', 'tool_use'):
590
- text_parts = []
591
- if entry_type == 'tool_use':
592
- tool_name = data.get('name', 'unknown')
593
- tool_input = str(data.get('input', ''))[:500]
594
- text_parts.append(f"[Tool: {tool_name}] {tool_input}")
595
- elif entry_type == 'tool_result':
596
- # Common structures: either 'content' (list/str) or 'result'
597
- result_content = data.get('content')
598
- if isinstance(result_content, list):
599
- # flatten to text
600
- flat = []
601
- for itm in result_content:
602
- if isinstance(itm, dict) and itm.get('type') == 'text':
603
- flat.append(itm.get('text', ''))
604
- elif isinstance(itm, str):
605
- flat.append(itm)
606
- result_content = "\n".join(flat)
607
- if not result_content:
608
- result_content = data.get('result', '') # fallback key used by some tools
609
- text_parts.append(f"[Result] {str(result_content)[:1000]}")
610
-
611
- content = "\n".join([p for p in text_parts if p])
612
- if content:
613
- # Track message index for summary format too
614
- message_idx = current_message_index
615
- current_message_index += 1
616
-
617
- chunk_buffer.append({
618
- 'role': entry_type,
619
- 'content': content,
620
- 'message_index': message_idx
621
- })
622
- if len(chunk_buffer) >= MAX_CHUNK_SIZE:
623
- chunks = process_and_upload_chunk(
624
- chunk_buffer, chunk_index, conversation_id,
625
- created_at, metadata, collection_name, project_path, total_messages
626
- )
627
- total_chunks += chunks
628
- chunk_buffer = []
629
- chunk_index += 1
630
- gc.collect()
631
-
632
- except json.JSONDecodeError:
633
- logger.debug(f"Skipping invalid JSON at line {line_num}")
634
- except Exception as e:
635
- logger.debug(f"Error processing line {line_num}: {e}")
636
-
637
- # Process remaining messages
638
- if chunk_buffer:
639
- chunks = process_and_upload_chunk(
640
- chunk_buffer, chunk_index, conversation_id,
641
- created_at, metadata, collection_name, project_path, total_messages
226
+ def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
227
+ """Import a single JSONL file."""
228
+ # Initialize import strategy if not already done
229
+ if not self.import_strategy:
230
+ self.import_strategy = StreamImportStrategy(
231
+ self.client,
232
+ self.process_and_upload_chunk,
233
+ self.state_manager,
234
+ MAX_CHUNK_SIZE
642
235
  )
643
- total_chunks += chunks
644
236
 
645
- # Only delete old points after successful import verification
646
- if total_chunks > 0:
647
- try:
648
- from qdrant_client.models import Filter, FieldCondition, MatchValue
649
- # Count old points before deletion for verification
650
- old_count_filter = Filter(
651
- must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
652
- )
653
- old_points = client.scroll(
654
- collection_name=collection_name,
655
- scroll_filter=old_count_filter,
656
- limit=1
657
- )[0]
658
-
659
- if len(old_points) > total_chunks + 5: # Allow some tolerance
660
- # Only delete if we have significantly more old points than new
661
- client.delete(
662
- collection_name=collection_name,
663
- points_selector=old_count_filter,
664
- wait=True
665
- )
666
- logger.info(f"Deleted old points for conversation {conversation_id} after verifying new import")
667
- except Exception as e:
668
- logger.warning(f"Could not clean up old points for {conversation_id}: {e}")
237
+ # Use strategy to import file
238
+ chunks = self.import_strategy.import_file(jsonl_file, collection_name, project_path)
669
239
 
670
- logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
671
- return total_chunks
240
+ # Update state if successful
241
+ if chunks > 0:
242
+ self.update_file_state(jsonl_file, chunks, collection_name)
672
243
 
673
- except Exception as e:
674
- logger.error(f"Failed to import {jsonl_file}: {e}")
675
- # Mark file as failed in state manager
676
- try:
677
- state_manager.mark_file_failed(str(jsonl_file), str(e))
678
- except Exception as state_error:
679
- logger.warning(f"Could not mark file as failed in state: {state_error}")
680
- return 0
681
-
682
- def _with_retries(fn, attempts=3, base_sleep=0.5):
683
- """Execute function with retries and exponential backoff."""
684
- for i in range(attempts):
244
+ return chunks
245
+
246
+ def update_file_state(self, file_path: Path, chunks: int, collection_name: str):
247
+ """Update state for successfully imported file."""
685
248
  try:
686
- return fn()
249
+ self.state_manager.add_imported_file(
250
+ file_path=str(file_path),
251
+ chunks=chunks,
252
+ collection=collection_name,
253
+ embedding_mode="local" if "Local" in self.embedding_service.get_provider_name() else "cloud"
254
+ )
255
+ logger.debug(f"Updated state for {file_path.name}")
687
256
  except Exception as e:
688
- if i == attempts - 1:
689
- raise
690
- time.sleep(base_sleep * (2 ** i))
691
- logger.debug(f"Retrying after error: {e}")
257
+ logger.warning(f"Could not update state for {file_path}: {e}")
692
258
 
693
- def should_import_file(file_path: Path) -> bool:
694
- """Check if file should be imported using UnifiedStateManager."""
695
- try:
696
- # Get imported files from state manager
697
- imported_files = state_manager.get_imported_files()
259
+ def import_project(self, project_path: Path, limit: Optional[int] = None) -> Dict[str, Any]:
260
+ """Import all conversations from a project."""
261
+ collection_name = self.get_collection_name(project_path)
262
+ self.ensure_collection(collection_name)
698
263
 
699
- # Normalize the file path for comparison
700
- normalized_path = state_manager.normalize_path(str(file_path))
264
+ # Find JSONL files
265
+ jsonl_files = sorted(project_path.glob("*.jsonl"))
266
+ if not jsonl_files:
267
+ logger.warning(f"No JSONL files found in {project_path}")
268
+ return {"imported": 0, "skipped": 0, "failed": 0}
701
269
 
702
- if normalized_path in imported_files:
703
- file_info = imported_files[normalized_path]
270
+ # Apply limit if specified
271
+ if limit:
272
+ jsonl_files = jsonl_files[:limit]
704
273
 
705
- # Skip if file failed and we haven't reached retry limit
706
- if file_info.get("status") == "failed" and file_info.get("retry_count", 0) >= 3:
707
- logger.info(f"Skipping failed file (max retries reached): {file_path.name}")
708
- return False
274
+ # Import files
275
+ stats = {"imported": 0, "skipped": 0, "failed": 0}
709
276
 
710
- # Get file modification time for comparison
711
- last_modified = file_path.stat().st_mtime
712
- stored_modified = file_info.get("last_modified")
277
+ for jsonl_file in jsonl_files:
278
+ if not self.should_import_file(jsonl_file):
279
+ stats["skipped"] += 1
280
+ continue
713
281
 
714
- # Check if file has been modified (convert stored timestamp to float if needed)
715
- if stored_modified:
716
- try:
717
- # Parse ISO timestamp to float for comparison
718
- stored_time = datetime.fromisoformat(stored_modified.replace("Z", "+00:00")).timestamp()
719
- if abs(last_modified - stored_time) > 1: # Allow 1 second tolerance
720
- logger.info(f"File modified, will re-import: {file_path.name}")
721
- return True
722
- except (ValueError, TypeError):
723
- # If we can't parse the stored time, re-import to be safe
724
- logger.warning(f"Could not parse stored modification time, will re-import: {file_path.name}")
725
- return True
726
-
727
- # Check for suspiciously low chunk counts (likely failed imports)
728
- chunks = file_info.get("chunks", 0)
729
- file_size_kb = file_path.stat().st_size / 1024
730
-
731
- # Heuristic: Files > 10KB should have more than 2 chunks
732
- if file_size_kb > 10 and chunks <= 2 and file_info.get("status") != "failed":
733
- logger.warning(f"File has suspiciously low chunks ({chunks}) for size {file_size_kb:.1f}KB, will re-import: {file_path.name}")
734
- return True
735
-
736
- # Skip if successfully imported
737
- if file_info.get("status") == "completed":
738
- logger.info(f"Skipping successfully imported file: {file_path.name}")
739
- return False
282
+ try:
283
+ # Calculate expected chunks based on file size
284
+ file_size = jsonl_file.stat().st_size
285
+ expected_chunks = max(1, file_size // (1024 * 100)) # Rough estimate
740
286
 
741
- return True
287
+ chunks = self.import_file(jsonl_file, collection_name, project_path)
742
288
 
743
- except Exception as e:
744
- logger.warning(f"Error checking import status for {file_path}: {e}")
745
- return True # Default to importing if we can't check status
289
+ # Validate chunk count is reasonable
290
+ if chunks > 0:
291
+ if chunks > expected_chunks * 10:
292
+ logger.warning(f"Unusual chunk count for {jsonl_file.name}: {chunks} chunks (expected ~{expected_chunks})")
293
+ stats["imported"] += 1
294
+ else:
295
+ stats["failed"] += 1
296
+ except Exception as e:
297
+ logger.error(f"Failed to import {jsonl_file}: {e}")
298
+ stats["failed"] += 1
299
+
300
+ # Force garbage collection periodically
301
+ if (stats["imported"] + stats["failed"]) % 10 == 0:
302
+ gc.collect()
303
+
304
+ return stats
746
305
 
747
- def update_file_state(file_path: Path, chunks: int, collection_name: str):
748
- """Update state for imported file using UnifiedStateManager."""
749
- try:
750
- # Determine embedding mode from collection suffix
751
- embedding_mode = "local" if collection_suffix == "local" else "cloud"
752
-
753
- # Add file to state manager
754
- state_manager.add_imported_file(
755
- file_path=str(file_path),
756
- chunks=chunks,
757
- importer="streaming",
758
- collection=collection_name,
759
- embedding_mode=embedding_mode,
760
- status="completed"
761
- )
762
- logger.debug(f"Updated state for {file_path.name}: {chunks} chunks")
763
- except Exception as e:
764
- logger.error(f"Failed to update state for {file_path}: {e}")
765
306
 
766
307
  def main():
767
- """Main import function."""
768
- # Parse command-line arguments
769
- parser = argparse.ArgumentParser(description='Import conversations with unified embeddings support')
770
- parser.add_argument('--prefer-voyage', action='store_true',
771
- help='Use Voyage AI embeddings instead of local FastEmbed')
772
- parser.add_argument('--limit', type=int,
773
- help='Limit number of files to import')
774
- parser.add_argument('--max-files-per-cycle', type=int,
775
- help='Maximum files to process per cycle')
308
+ """Main entry point."""
309
+ parser = argparse.ArgumentParser(description="Import conversations with reduced complexity")
310
+ parser.add_argument("--project", type=str, help="Specific project path to import")
311
+ parser.add_argument("--limit", type=int, help="Limit number of files to import")
312
+ parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
313
+
776
314
  args = parser.parse_args()
777
-
778
- # Override environment variable if --prefer-voyage is specified
779
- global PREFER_LOCAL_EMBEDDINGS, embedding_provider, embedding_dimension, collection_suffix
780
- if args.prefer_voyage:
781
- if not VOYAGE_API_KEY:
782
- logger.error("--prefer-voyage specified but VOYAGE_KEY environment variable not set")
315
+
316
+ if args.verbose:
317
+ logging.getLogger().setLevel(logging.DEBUG)
318
+
319
+ # Create importer
320
+ importer = ConversationImporter()
321
+
322
+ # Determine project path
323
+ if args.project:
324
+ project_path = Path(args.project).expanduser().resolve()
325
+ if not project_path.exists():
326
+ logger.error(f"Project path does not exist: {project_path}")
783
327
  sys.exit(1)
784
- logger.info("Command-line flag --prefer-voyage detected, switching to Voyage AI embeddings")
785
- PREFER_LOCAL_EMBEDDINGS = False
786
-
787
- # Re-initialize embedding provider with Voyage
788
- import voyageai
789
- embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
790
- embedding_dimension = 1024
791
- collection_suffix = "voyage"
792
- logger.info("Switched to Voyage AI embeddings (dimension: 1024)")
793
-
794
- # Get status from state manager
795
- status = state_manager.get_status()
796
- logger.info(f"Loaded state with {status['indexed_files']} previously imported files")
797
-
798
- # Find all projects
799
- # Use LOGS_DIR env var, or fall back to Claude projects directory, then /logs for Docker
800
- logs_dir_env = os.getenv("LOGS_DIR")
801
- if logs_dir_env:
802
- logs_dir = Path(logs_dir_env)
803
- elif (Path.home() / ".claude" / "projects").exists():
804
- logs_dir = Path.home() / ".claude" / "projects"
328
+ projects = [project_path]
805
329
  else:
806
- logs_dir = Path("/logs") # Docker fallback
807
-
808
- if not logs_dir.exists():
809
- logger.error(f"Projects directory not found: {logs_dir}")
810
- sys.exit(1)
811
-
812
- project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
813
- logger.info(f"Found {len(project_dirs)} projects to import")
814
-
815
- total_imported = 0
816
- files_processed = 0
817
-
818
- for project_dir in project_dirs:
819
- # Get collection name
820
- collection_name = get_collection_name(project_dir)
821
- logger.info(f"Importing project: {project_dir.name} -> {collection_name}")
822
-
823
- # Ensure collection exists
824
- ensure_collection(collection_name)
825
-
826
- # Find JSONL files
827
- jsonl_files = sorted(project_dir.glob("*.jsonl"))
828
-
829
- # Apply limit from command line if specified
830
- if args.limit and files_processed >= args.limit:
831
- logger.info(f"Reached limit of {args.limit} files, stopping import")
832
- break
833
-
834
- # Limit files per cycle if specified
835
- max_files = args.max_files_per_cycle or int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
836
- jsonl_files = jsonl_files[:max_files]
837
-
838
- for jsonl_file in jsonl_files:
839
- # Check limit again per file
840
- if args.limit and files_processed >= args.limit:
841
- logger.info(f"Reached limit of {args.limit} files, stopping import")
842
- break
843
-
844
- if should_import_file(jsonl_file):
845
- chunks = stream_import_file(jsonl_file, collection_name, project_dir)
846
- files_processed += 1
847
- if chunks > 0:
848
- # Verify data is actually in Qdrant before marking as imported
849
- from qdrant_client.models import Filter, FieldCondition, MatchValue
850
- try:
851
- conversation_id = jsonl_file.stem
852
- count_result = _with_retries(lambda: client.count(
853
- collection_name=collection_name,
854
- count_filter=Filter(
855
- must=[FieldCondition(key="conversation_id",
856
- match=MatchValue(value=conversation_id))]
857
- ),
858
- exact=True # Ensure exact count, not approximation
859
- ))
860
- actual_count = count_result.count if hasattr(count_result, 'count') else 0
861
-
862
- if actual_count > 0:
863
- logger.info(f"Verified {actual_count} points in Qdrant for {conversation_id}")
864
- update_file_state(jsonl_file, chunks, collection_name)
865
- total_imported += 1
866
- else:
867
- logger.error(f"No points found in Qdrant for {conversation_id} despite {chunks} chunks processed - not marking as imported")
868
- except Exception as e:
869
- logger.error(f"Failed to verify Qdrant points for {jsonl_file.name}: {e}")
870
- # Don't mark as imported if we can't verify
871
-
872
- # Force GC after each file
873
- gc.collect()
874
- else:
875
- # Critical fix: Don't mark files with 0 chunks as imported
876
- # This allows retry on next run
877
- logger.warning(f"File produced 0 chunks, not marking as imported: {jsonl_file.name}")
878
- # Mark as failed so we don't keep retrying indefinitely
879
- try:
880
- state_manager.mark_file_failed(str(jsonl_file), "File produced 0 chunks during import")
881
- except Exception as state_error:
882
- logger.warning(f"Could not mark file as failed in state: {state_error}")
883
-
884
- logger.info(f"Import complete: processed {total_imported} files")
330
+ # Import all projects
331
+ claude_dir = Path.home() / ".claude" / "projects"
332
+ if not claude_dir.exists():
333
+ logger.error(f"Claude projects directory not found: {claude_dir}")
334
+ sys.exit(1)
335
+ projects = [p for p in claude_dir.iterdir() if p.is_dir()]
336
+
337
+ # Import projects
338
+ total_stats = {"imported": 0, "skipped": 0, "failed": 0}
339
+
340
+ for project in projects:
341
+ logger.info(f"Importing project: {project.name}")
342
+ stats = importer.import_project(project, args.limit)
343
+
344
+ # Aggregate stats
345
+ for key in total_stats:
346
+ total_stats[key] += stats[key]
347
+
348
+ logger.info(f"Project {project.name}: {stats}")
349
+
350
+ # Print summary
351
+ logger.info(f"\nImport complete:")
352
+ logger.info(f" Imported: {total_stats['imported']} conversations")
353
+ logger.info(f" Skipped: {total_stats['skipped']} conversations")
354
+ logger.info(f" Failed: {total_stats['failed']} conversations")
355
+
885
356
 
886
357
  if __name__ == "__main__":
887
358
  main()
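A small but notable change in the chunk-upload path: point IDs move from MD5-derived 63-bit integers to deterministic UUIDv5 strings. A minimal sketch of the two derivations, with illustrative values; both expressions are taken directly from the removed and added code above.

    import hashlib
    import uuid

    conversation_id, chunk_index = "abc123", 0   # illustrative values

    # 5.0.2: fold an MD5 digest of "<conversation_id>_<chunk_index>" into a 63-bit integer ID
    old_id = int(hashlib.md5(f"{conversation_id}_{chunk_index}".encode()).hexdigest()[:16], 16) % (2**63)

    # 5.0.4: deterministic UUIDv5 over "<conversation_id>_chunk_<chunk_index>"
    new_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{conversation_id}_chunk_{chunk_index}"))

Because the derivations differ, a 5.0.4 re-import writes points under new IDs rather than upserting over the 5.0.2 ones; the old in-file cleanup of prior points was removed along with stream_import_file, and if equivalent cleanup exists it would live in StreamImportStrategy, which this diff does not include.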