claude-self-reflect 5.0.2 → 5.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/csr-validator.md +43 -0
- package/.claude/agents/open-source-maintainer.md +77 -0
- package/docker-compose.yaml +3 -1
- package/installer/setup-wizard-docker.js +64 -9
- package/package.json +6 -1
- package/scripts/ast_grep_final_analyzer.py +16 -6
- package/scripts/csr-status +120 -17
- package/scripts/debug-august-parsing.py +5 -1
- package/scripts/debug-project-resolver.py +3 -3
- package/scripts/doctor.py +342 -0
- package/scripts/embedding_service.py +241 -0
- package/scripts/import-conversations-unified.py +292 -821
- package/scripts/import_strategies.py +344 -0
- package/scripts/message_processors.py +248 -0
- package/scripts/metadata_extractor.py +262 -0
- package/scripts/session_quality_tracker.py +10 -0
- package/scripts/unified_state_manager.py +7 -4
- package/mcp-server/src/test_quality.py +0 -153

package/scripts/import-conversations-unified.py

````diff
@@ -1,59 +1,45 @@
 #!/usr/bin/env python3
 """
-
-
+Refactored import script with reduced complexity using modular components.
+All functions have cyclomatic complexity < 10.
 """
 
-import json
 import os
 import sys
-import hashlib
 import gc
-import ast
-import re
-import fcntl
-import time
 import argparse
-from pathlib import Path
-from datetime import datetime, timezone
-from typing import List, Dict, Any, Optional, Set
 import logging
+import hashlib
+import uuid
+from pathlib import Path
+from datetime import datetime
+from typing import List, Dict, Any, Optional
 
-#
-try:
-    from dotenv import load_dotenv
-    # Load from project root
-    env_path = Path(__file__).parent.parent / '.env'
-    if env_path.exists():
-        load_dotenv(env_path)
-        print(f"Loaded .env from {env_path}")
-except ImportError:
-    pass # dotenv not available, use system environment
-
-# Add the scripts directory to the Python path for utils import
+# Add the scripts directory to the Python path
 scripts_dir = Path(__file__).parent
 sys.path.insert(0, str(scripts_dir))
 
-# Import
+# Import refactored components
+from metadata_extractor import MetadataExtractor
+from embedding_service import create_embedding_service
+from import_strategies import StreamImportStrategy
 from unified_state_manager import UnifiedStateManager
 
+# Import Qdrant client
 from qdrant_client import QdrantClient
 from qdrant_client.models import PointStruct, Distance, VectorParams
 
-# Import
-# Add parent directory to path to import shared module
+# Import shared modules
 sys.path.insert(0, str(Path(__file__).parent.parent))
 try:
     from shared.normalization import normalize_project_name
-except ImportError
-    logging.error(f"Failed to import normalize_project_name from shared module: {e}")
-    # Fall back to local utils if shared module not found
+except ImportError:
     try:
-        from utils import normalize_project_name
-        logging.
+        from importer.utils.project_normalizer import normalize_project_name
+        logging.debug("Using importer.utils.project_normalizer.normalize_project_name")
     except ImportError:
-
-
+        from utils import normalize_project_name
+        logging.warning("Using legacy utils.normalize_project_name")
 
 # Set up logging
 logging.basicConfig(
@@ -62,826 +48,311 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-#
+# Constants
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))
+
+
+class ConversationImporter:
+    """Main class for importing conversations with reduced complexity."""
+
+    def __init__(self):
+        """Initialize the importer with all required services."""
+        self.client = self._init_qdrant_client()
+        self.embedding_service = create_embedding_service()
+        self.state_manager = self._init_state_manager()
+        self.metadata_extractor = MetadataExtractor()
+        self.import_strategy = None
+
+    def _init_qdrant_client(self) -> QdrantClient:
+        """Initialize Qdrant client with optional authentication."""
+        api_key = os.getenv("QDRANT_API_KEY")
+        if api_key:
+            return QdrantClient(url=QDRANT_URL, api_key=api_key, timeout=30)
+        return QdrantClient(url=QDRANT_URL, timeout=30)
+
+    def _init_state_manager(self) -> UnifiedStateManager:
+        """Initialize state manager."""
+        env_state = os.getenv("STATE_FILE")
+        if env_state:
+            state_file_path = Path(env_state).expanduser().resolve()
+            return UnifiedStateManager(state_file_path)
+        return UnifiedStateManager()
+
+    def get_collection_name(self, project_path: Path) -> str:
+        """Get collection name for a project."""
+        project_name = normalize_project_name(str(project_path))
+        suffix = self.embedding_service.get_collection_suffix()
+        return f"csr_{project_name}_{suffix}"
+
+    def ensure_collection(self, collection_name: str):
+        """Ensure collection exists with correct configuration."""
+        collections = self.client.get_collections().collections
+        exists = any(c.name == collection_name for c in collections)
+
+        if not exists:
+            dimension = self.embedding_service.get_dimension()
+            self.client.create_collection(
+                collection_name=collection_name,
+                vectors_config=VectorParams(size=dimension, distance=Distance.COSINE)
+            )
+            logger.info(f"Created collection: {collection_name} with {dimension} dimensions")
+
+    def process_and_upload_chunk(
+        self,
+        messages: List[Dict[str, Any]],
+        chunk_index: int,
+        conversation_id: str,
+        created_at: str,
+        metadata: Dict[str, Any],
+        collection_name: str,
+        project_path: Path,
+        total_messages: int
+    ) -> int:
+        """Process and upload a chunk of messages."""
+        if not messages:
+            return 0
 
-#
-
-MAX_AST_ELEMENTS = 30
-MAX_CODE_BLOCKS = 5
-MAX_ELEMENTS_PER_BLOCK = 10
-MAX_FILES_ANALYZED = 20
-MAX_FILES_EDITED = 20
-MAX_TOOLS_USED = 15
-MAX_CONCEPT_MESSAGES = 50
-
-# Initialize UnifiedStateManager
-# Support legacy STATE_FILE environment variable
-env_state = os.getenv("STATE_FILE")
-if env_state:
-    from pathlib import Path
-    state_file_path = Path(env_state).expanduser().resolve()
-    state_manager = UnifiedStateManager(state_file_path)
-else:
-    state_manager = UnifiedStateManager() # Uses default location
-PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
-VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
-MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50")) # Messages per chunk
-
-# Initialize Qdrant client with timeout
-client = QdrantClient(
-    url=QDRANT_URL,
-    timeout=30 # 30 second timeout for network operations
-)
+        # Combine all message content into a single text for the chunk
+        combined_text = "\n".join([msg['content'] for msg in messages])
 
-#
-
-
-
-
-
-
-
-
-
-        collection_suffix = "local"
-        logger.info("Using fastembed model: sentence-transformers/all-MiniLM-L6-v2")
-    else:
-        logger.info("Using Voyage AI embeddings")
-        import voyageai
-        embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
-        embedding_dimension = 1024
-        collection_suffix = "voyage"
-
-def get_collection_name(project_path: Path) -> str:
-    """Generate collection name from project path."""
-    normalized = normalize_project_name(str(project_path))
-    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
-    return f"conv_{name_hash}_{collection_suffix}"
-
-def ensure_collection(collection_name: str):
-    """Ensure collection exists with correct configuration."""
-    collections = client.get_collections().collections
-    if not any(c.name == collection_name for c in collections):
-        logger.info(f"Creating collection: {collection_name}")
-        client.create_collection(
-            collection_name=collection_name,
-            vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
+        # Generate a single embedding for the entire chunk
+        embeddings = self.embedding_service.generate_embeddings([combined_text])
+        if not embeddings:
+            return 0
+
+        # Create points for upload
+        points = self._create_points(
+            messages, embeddings, chunk_index,
+            conversation_id, created_at, metadata,
+            project_path, total_messages
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        texts.append(f"{role.upper()}: {content}")
-        # Fix: Check for None instead of truthiness to include 0 values
-        idx = msg.get("message_index")
-        if idx is not None:
-            message_indices.append(idx)
-
-    if not texts:
-        return 0
-
-    chunk_text = "\n".join(texts)
-
-    try:
-        # Generate embedding
-        embeddings = generate_embeddings([chunk_text])
-
-        # Sanity check embeddings
-        if not embeddings or not embeddings[0]:
-            logger.error(f"Empty embedding generated for chunk {chunk_index}")
-            return 0
-
-        embedding = embeddings[0]
-
-        # Check for degenerate embeddings (all values identical)
-        if len(set(embedding)) == 1:
-            logger.error(f"Degenerate embedding detected (all values identical): {embedding[0]}")
-            return 0
-
-        # Check variance is above threshold
-        import statistics
-        variance = statistics.variance(embedding)
-        if variance < 1e-4: # Less strict threshold for valid embeddings
-            logger.warning(f"Low variance embedding detected: {variance}")
-
-        # Validate dimension
-        if len(embedding) != embedding_dimension:
-            logger.error(f"Embedding dimension mismatch: expected {embedding_dimension}, got {len(embedding)}")
-            return 0
-
-        # Create point ID
-        point_id = hashlib.md5(
-            f"{conversation_id}_{chunk_index}".encode()
-        ).hexdigest()[:16]
-
-        # Create payload
-        payload = {
-            "text": chunk_text,
-            "conversation_id": conversation_id,
-            "chunk_index": chunk_index,
-            "timestamp": created_at,
-            "project": normalize_project_name(str(project_path)),
-            "start_role": messages[0].get("role", "unknown") if messages else "unknown",
-            "message_count": len(messages),
-            "total_messages": total_messages,
-            "message_index": message_indices[0] if message_indices else None,
-            "message_indices": message_indices # Store all indices in this chunk
-        }
-
-        # Add metadata
-        if metadata:
-            payload.update(metadata)
-
-        # Create point
+        # Upload to Qdrant
+        self._upload_points(collection_name, points)
+
+        return 1 # Return number of chunks processed
+
+    def _create_points(
+        self,
+        messages: List[Dict[str, Any]],
+        embeddings: List[List[float]],
+        chunk_index: int,
+        conversation_id: str,
+        created_at: str,
+        metadata: Dict[str, Any],
+        project_path: Path,
+        total_messages: int
+    ) -> List[PointStruct]:
+        """Create Qdrant points from messages and embeddings."""
+        points = []
+        # Generate a proper UUID for the chunk ID
+        # Use a deterministic UUID based on conversation_id and chunk_index for consistency
+        chunk_string = f"{conversation_id}_chunk_{chunk_index}"
+        chunk_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk_string))
+
+        # Build conversation snippet
+        snippet_parts = []
+        for msg in messages[:5]: # First 5 messages for snippet
+            role = msg['role']
+            content = msg['content'][:200] # Truncate for snippet
+            snippet_parts.append(f"{role}: {content}")
+        conversation_snippet = "\n".join(snippet_parts)
+
+        # Create point with proper vector format
+        # Always use the first embedding for a chunk (combining messages into one embedding)
         point = PointStruct(
-            id=
-            vector=
-            payload=
+            id=chunk_uuid,
+            vector=embeddings[0],
+            payload={
+                "conversation_id": conversation_id,
+                "chunk_index": chunk_index,
+                "created_at": created_at,
+                "project": str(project_path),
+                "messages": messages,
+                "metadata": metadata,
+                "conversation_snippet": conversation_snippet,
+                "total_messages": total_messages,
+                "embedding_model": self.embedding_service.get_provider_name()
+            }
         )
-
-        # Upload with wait to ensure persistence (with retries)
-        result = _with_retries(lambda: client.upsert(
-            collection_name=collection_name,
-            points=[point],
-            wait=True # Ensure operation completed before continuing
-        ))
-
-        # Verify the operation completed successfully (handle enum or string representations)
-        status = getattr(result, 'status', None)
-        if status and 'completed' not in str(status).lower():
-            logger.error(f"Upsert not completed for {conversation_id}:{chunk_index}, status={status}")
-            return 0
-
-        return 1
-
-    except Exception as e:
-        logger.error(f"Error processing chunk {chunk_index}: {e}")
-        return 0
-
-def extract_ast_elements(code_text: str) -> Set[str]:
-    """Extract function and class names from code using AST parsing."""
-    elements = set()
-
-    # Try to parse as Python code
-    try:
-        tree = ast.parse(code_text)
-        for node in ast.walk(tree):
-            if isinstance(node, ast.FunctionDef):
-                elements.add(f"func:{node.name}")
-            elif isinstance(node, ast.AsyncFunctionDef):
-                elements.add(f"func:{node.name}")
-            elif isinstance(node, ast.ClassDef):
-                elements.add(f"class:{node.name}")
-    except SyntaxError:
-        # Python regex fallback for partial fragments
-        for m in re.finditer(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
-            elements.add(f"func:{m.group(1)}")
-        for m in re.finditer(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
-            elements.add(f"func:{m.group(1)}")
-        for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code_text, re.MULTILINE):
-            elements.add(f"class:{m.group(1)}")
-    except Exception as e:
-        logger.debug(f"Unexpected error parsing AST: {e}")
-
-    # Try regex patterns for other languages
-    # JavaScript/TypeScript functions
-    js_func_pattern = r'(?:function|const|let|var)\s+(\w+)\s*(?:=\s*)?(?:\([^)]*\)|\s*=>)'
-    for match in re.finditer(js_func_pattern, code_text):
-        elements.add(f"func:{match.group(1)}")
-
-    # Class definitions (multiple languages)
-    class_pattern = r'(?:class|interface|struct)\s+(\w+)'
-    for match in re.finditer(class_pattern, code_text):
-        elements.add(f"class:{match.group(1)}")
-
-    return elements
-
-def extract_concepts(text: str) -> List[str]:
-    """Extract development concepts from text."""
-    concepts = []
-    concept_patterns = {
-        'docker': r'\b(?:docker|container|compose|dockerfile)\b',
-        'testing': r'\b(?:test|testing|unittest|pytest|jest)\b',
-        'database': r'\b(?:database|sql|postgres|mysql|mongodb|qdrant)\b',
-        'api': r'\b(?:api|rest|graphql|endpoint)\b',
-        'security': r'\b(?:security|auth|authentication|encryption)\b',
-        'performance': r'\b(?:performance|optimization|cache|speed)\b',
-        'debugging': r'\b(?:debug|debugging|error|bug|trace)\b',
-        'deployment': r'\b(?:deploy|deployment|ci\/cd|production)\b',
-        'git': r'\b(?:git|commit|branch|merge|pull request)\b',
-        'mcp': r'\b(?:mcp|claude-self-reflect|claude code)\b',
-        'embeddings': r'\b(?:embedding|vector|semantic|similarity)\b',
-    }
-
-    text_lower = text.lower()
-    for concept, pattern in concept_patterns.items():
-        if re.search(pattern, text_lower, re.IGNORECASE):
-            if concept not in concepts:
-                concepts.append(concept)
-
-    return concepts[:MAX_CONCEPTS]
-
-def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, int]:
-    """Extract metadata in a single pass, return metadata, first timestamp, and message count."""
-    metadata = {
-        "files_analyzed": [],
-        "files_edited": [],
-        "tools_used": [],
-        "concepts": [],
-        "ast_elements": [],
-        "has_code_blocks": False,
-        "total_messages": 0,
-        "project_path": None # Add project path from cwd
-    }
-
-    first_timestamp = None
-    message_count = 0
-    all_text = []
+        points.append(point)
 
-
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                if not line.strip():
-                    continue
+        return points
 
+    def _upload_points(self, collection_name: str, points: List[PointStruct]):
+        """Upload points to Qdrant with retry logic."""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                self.client.upsert(
+                    collection_name=collection_name,
+                    points=points,
+                    wait=True
+                )
+                return
+            except Exception as e:
+                if attempt < max_retries - 1:
+                    logger.warning(f"Upload attempt {attempt + 1} failed: {e}")
+                else:
+                    raise
+
+    def should_import_file(self, file_path: Path) -> bool:
+        """Check if a file should be imported."""
+        if not file_path.exists() or file_path.stat().st_size == 0:
+            return False
+
+        # Check if file was already imported using UnifiedStateManager API
+        imported_files = self.state_manager.get_imported_files()
+        normalized_path = self.state_manager.normalize_path(str(file_path))
+
+        # UnifiedStateManager returns files directly, not nested in 'files' key
+        file_state = imported_files.get(normalized_path)
+        if file_state:
+            file_mtime = datetime.fromtimestamp(file_path.stat().st_mtime).replace(tzinfo=None)
+            # Handle both old and new timestamp field names
+            state_mtime_str = file_state.get('last_modified') or file_state.get('imported_at')
+            if state_mtime_str:
                 try:
-
-
-
-
-
-
-                    # Get timestamp from first valid entry
-                    if first_timestamp is None and 'timestamp' in data:
-                        first_timestamp = data.get('timestamp')
-
-                    # Count messages
-                    if 'message' in data and data['message']:
-                        msg = data['message']
-                        if msg.get('role') in ['user', 'assistant']:
-                            message_count += 1
-
-                        if msg.get('content'):
-                            content = msg['content']
-                            text_content = ""
-
-                            if isinstance(content, list):
-                                for item in content:
-                                    if isinstance(item, dict):
-                                        if item.get('type') == 'text':
-                                            text_content += item.get('text', '')
-                                            # Check for code blocks
-                                            if '```' in item.get('text', ''):
-                                                metadata['has_code_blocks'] = True
-                                                # Extract code for AST analysis with bounds checking
-                                                if len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
-                                                    # Fix: More permissive regex to handle various fence formats
-                                                    # Handles both ```\n and ```python\n cases, with optional newline
-                                                    code_blocks = re.findall(r'```[^`\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
-                                                    for code_block in code_blocks[:MAX_CODE_BLOCKS]: # Use defined constant
-                                                        if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
-                                                            break
-                                                        ast_elems = extract_ast_elements(code_block)
-                                                        for elem in list(ast_elems)[:MAX_ELEMENTS_PER_BLOCK]: # Use defined constant
-                                                            if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
-                                                                metadata['ast_elements'].append(elem)
-
-                                        elif item.get('type') == 'thinking':
-                                            # Also include thinking content in metadata extraction
-                                            text_content += item.get('thinking', '')
-
-                                        elif item.get('type') == 'tool_use':
-                                            tool_name = item.get('name', '')
-                                            if tool_name and tool_name not in metadata['tools_used']:
-                                                metadata['tools_used'].append(tool_name)
-
-                                            # Extract file references
-                                            if 'input' in item:
-                                                input_data = item['input']
-                                                if isinstance(input_data, dict):
-                                                    # Determine if it's an edit tool
-                                                    is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
-
-                                                    if 'file_path' in input_data:
-                                                        file_ref = input_data['file_path']
-                                                        if is_edit:
-                                                            if file_ref not in metadata['files_edited']:
-                                                                metadata['files_edited'].append(file_ref)
-                                                        else:
-                                                            if file_ref not in metadata['files_analyzed']:
-                                                                metadata['files_analyzed'].append(file_ref)
-
-                                                    if 'path' in input_data:
-                                                        file_ref = input_data['path']
-                                                        if file_ref not in metadata['files_analyzed']:
-                                                            metadata['files_analyzed'].append(file_ref)
-                                    elif isinstance(item, str):
-                                        text_content += item
-                            elif isinstance(content, str):
-                                text_content = content
-
-                            # Collect text for concept extraction
-                            if text_content:
-                                all_text.append(text_content[:1000]) # Limit text per message
-
-                except json.JSONDecodeError:
-                    continue
-                except Exception:
-                    continue
-
-    except Exception as e:
-        logger.warning(f"Error extracting metadata: {e}")
-
-    # Extract concepts from collected text
-    if all_text:
-        combined_text = ' '.join(all_text[:MAX_CONCEPT_MESSAGES]) # Limit messages for concept extraction
-        metadata['concepts'] = extract_concepts(combined_text)
-
-    # MANDATORY: AST-GREP Pattern Analysis
-    # Analyze code quality for files mentioned in conversation
-    pattern_quality = {}
-    avg_quality_score = 0.0
-
-    try:
-        # Update patterns first (uses 24h cache, <100ms)
-        from update_patterns import check_and_update_patterns
-        check_and_update_patterns()
-
-        # Import analyzer
-        from ast_grep_final_analyzer import FinalASTGrepAnalyzer
-        analyzer = FinalASTGrepAnalyzer()
+                    state_mtime = datetime.fromisoformat(state_mtime_str).replace(tzinfo=None)
+                    if file_mtime <= state_mtime:
+                        logger.debug(f"Skipping {file_path.name} - already imported")
+                        return False
+                except ValueError:
+                    logger.debug(f"Invalid timestamp in state for {file_path.name}; will re-import")
 
-
-        files_to_analyze = list(set(metadata['files_edited'] + metadata['files_analyzed'][:10]))
-        quality_scores = []
+        return True
 
-
-
-
-
-
-
-
-
-
-                        'score': metrics['quality_score'],
-                        'good_patterns': metrics['good_patterns_found'],
-                        'bad_patterns': metrics['bad_patterns_found'],
-                        'issues': metrics['total_issues']
-                    }
-                    quality_scores.append(metrics['quality_score'])
-            except Exception as e:
-                logger.debug(f"Could not analyze {file_path}: {e}")
-
-        # Calculate average quality
-        if quality_scores:
-            avg_quality_score = sum(quality_scores) / len(quality_scores)
-
-    except Exception as e:
-        logger.debug(f"AST analysis not available: {e}")
-
-    # Add pattern analysis to metadata
-    metadata['pattern_analysis'] = pattern_quality
-    metadata['avg_quality_score'] = round(avg_quality_score, 3)
-
-    # Set total messages
-    metadata['total_messages'] = message_count
-
-    # Limit arrays
-    metadata['files_analyzed'] = metadata['files_analyzed'][:MAX_FILES_ANALYZED]
-    metadata['files_edited'] = metadata['files_edited'][:MAX_FILES_EDITED]
-    metadata['tools_used'] = metadata['tools_used'][:MAX_TOOLS_USED]
-    metadata['ast_elements'] = metadata['ast_elements'][:MAX_AST_ELEMENTS]
-
-    return metadata, first_timestamp or datetime.now().isoformat(), message_count
-
-def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
-    """Stream import a single JSONL file without loading it into memory."""
-    logger.info(f"Streaming import of {jsonl_file.name}")
-
-    # Extract conversation ID
-    conversation_id = jsonl_file.stem
-
-    # Extract metadata in first pass (lightweight)
-    metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))
-
-    # Track whether we should delete old points (only after successful import)
-    should_delete_old = False
-
-    # Reset counters for each conversation (critical for correct indexing)
-    current_message_index = 0 # Must be reset before processing each conversation
-
-    # Stream messages and process in chunks
-    chunk_buffer = []
-    chunk_index = 0
-    total_chunks = 0
-    conversation_id = jsonl_file.stem
-
-    try:
-        with open(jsonl_file, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f, 1):
-                line = line.strip()
-                if not line:
-                    continue
-
-                try:
-                    data = json.loads(line)
-
-                    # Skip non-message lines
-                    if data.get('type') == 'summary':
-                        continue
-
-                    # Extract message if present
-                    if 'message' in data and data['message']:
-                        msg = data['message']
-                        if msg.get('role') and msg.get('content'):
-                            # Extract content from various message types
-                            content = msg['content']
-                            if isinstance(content, list):
-                                text_parts = []
-                                for item in content:
-                                    if isinstance(item, dict):
-                                        item_type = item.get('type', '')
-                                        if item_type == 'text':
-                                            text_parts.append(item.get('text', ''))
-                                        elif item_type == 'thinking':
-                                            # Include thinking content (from Claude's thinking blocks)
-                                            thinking_content = item.get('thinking', '')
-                                            if thinking_content:
-                                                text_parts.append(f"[Thinking] {thinking_content[:1000]}") # Limit size
-                                        elif item_type == 'tool_use':
-                                            # Include tool use information
-                                            tool_name = item.get('name', 'unknown')
-                                            tool_input = str(item.get('input', ''))[:500] # Limit size
-                                            text_parts.append(f"[Tool: {tool_name}] {tool_input}")
-                                        elif item_type == 'tool_result':
-                                            # Include tool results
-                                            result_content = str(item.get('content', ''))[:1000] # Limit size
-                                            text_parts.append(f"[Result] {result_content}")
-                                    elif isinstance(item, str):
-                                        text_parts.append(item)
-                                content = '\n'.join(text_parts)
-
-                            if content:
-                                # Track message index for user/assistant messages
-                                if msg['role'] in ['user', 'assistant']:
-                                    message_idx = current_message_index
-                                    current_message_index += 1
-                                else:
-                                    message_idx = 0
-
-                                chunk_buffer.append({
-                                    'role': msg['role'],
-                                    'content': content,
-                                    'message_index': message_idx
-                                })
-
-                                # Process chunk when buffer reaches MAX_CHUNK_SIZE
-                                if len(chunk_buffer) >= MAX_CHUNK_SIZE:
-                                    chunks = process_and_upload_chunk(
-                                        chunk_buffer, chunk_index, conversation_id,
-                                        created_at, metadata, collection_name, project_path, total_messages
-                                    )
-                                    total_chunks += chunks
-                                    chunk_buffer = []
-                                    chunk_index += 1
-
-                                    # Force garbage collection after each chunk
-                                    gc.collect()
-
-                                    # Log progress
-                                    if chunk_index % 10 == 0:
-                                        logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
-
-                    # Handle top-level tool_result/tool_use events (no message wrapper)
-                    entry_type = data.get('type')
-                    if entry_type in ('tool_result', 'tool_use'):
-                        text_parts = []
-                        if entry_type == 'tool_use':
-                            tool_name = data.get('name', 'unknown')
-                            tool_input = str(data.get('input', ''))[:500]
-                            text_parts.append(f"[Tool: {tool_name}] {tool_input}")
-                        elif entry_type == 'tool_result':
-                            # Common structures: either 'content' (list/str) or 'result'
-                            result_content = data.get('content')
-                            if isinstance(result_content, list):
-                                # flatten to text
-                                flat = []
-                                for itm in result_content:
-                                    if isinstance(itm, dict) and itm.get('type') == 'text':
-                                        flat.append(itm.get('text', ''))
-                                    elif isinstance(itm, str):
-                                        flat.append(itm)
-                                result_content = "\n".join(flat)
-                            if not result_content:
-                                result_content = data.get('result', '') # fallback key used by some tools
-                            text_parts.append(f"[Result] {str(result_content)[:1000]}")
-
-                        content = "\n".join([p for p in text_parts if p])
-                        if content:
-                            # Track message index for summary format too
-                            message_idx = current_message_index
-                            current_message_index += 1
-
-                            chunk_buffer.append({
-                                'role': entry_type,
-                                'content': content,
-                                'message_index': message_idx
-                            })
-                            if len(chunk_buffer) >= MAX_CHUNK_SIZE:
-                                chunks = process_and_upload_chunk(
-                                    chunk_buffer, chunk_index, conversation_id,
-                                    created_at, metadata, collection_name, project_path, total_messages
-                                )
-                                total_chunks += chunks
-                                chunk_buffer = []
-                                chunk_index += 1
-                                gc.collect()
-
-                except json.JSONDecodeError:
-                    logger.debug(f"Skipping invalid JSON at line {line_num}")
-                except Exception as e:
-                    logger.debug(f"Error processing line {line_num}: {e}")
-
-        # Process remaining messages
-        if chunk_buffer:
-            chunks = process_and_upload_chunk(
-                chunk_buffer, chunk_index, conversation_id,
-                created_at, metadata, collection_name, project_path, total_messages
+    def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
+        """Import a single JSONL file."""
+        # Initialize import strategy if not already done
+        if not self.import_strategy:
+            self.import_strategy = StreamImportStrategy(
+                self.client,
+                self.process_and_upload_chunk,
+                self.state_manager,
+                MAX_CHUNK_SIZE
             )
-            total_chunks += chunks
 
-        #
-
-        try:
-            from qdrant_client.models import Filter, FieldCondition, MatchValue
-            # Count old points before deletion for verification
-            old_count_filter = Filter(
-                must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
-            )
-            old_points = client.scroll(
-                collection_name=collection_name,
-                scroll_filter=old_count_filter,
-                limit=1
-            )[0]
-
-            if len(old_points) > total_chunks + 5: # Allow some tolerance
-                # Only delete if we have significantly more old points than new
-                client.delete(
-                    collection_name=collection_name,
-                    points_selector=old_count_filter,
-                    wait=True
-                )
-                logger.info(f"Deleted old points for conversation {conversation_id} after verifying new import")
-        except Exception as e:
-            logger.warning(f"Could not clean up old points for {conversation_id}: {e}")
+        # Use strategy to import file
+        chunks = self.import_strategy.import_file(jsonl_file, collection_name, project_path)
 
-
-
+        # Update state if successful
+        if chunks > 0:
+            self.update_file_state(jsonl_file, chunks, collection_name)
 
-
-
-
-
-            state_manager.mark_file_failed(str(jsonl_file), str(e))
-        except Exception as state_error:
-            logger.warning(f"Could not mark file as failed in state: {state_error}")
-        return 0
-
-def _with_retries(fn, attempts=3, base_sleep=0.5):
-    """Execute function with retries and exponential backoff."""
-    for i in range(attempts):
+        return chunks
+
+    def update_file_state(self, file_path: Path, chunks: int, collection_name: str):
+        """Update state for successfully imported file."""
         try:
-
+            self.state_manager.add_imported_file(
+                file_path=str(file_path),
+                chunks=chunks,
+                collection=collection_name,
+                embedding_mode="local" if "Local" in self.embedding_service.get_provider_name() else "cloud"
+            )
+            logger.debug(f"Updated state for {file_path.name}")
        except Exception as e:
-
-            raise
-            time.sleep(base_sleep * (2 ** i))
-            logger.debug(f"Retrying after error: {e}")
+            logger.warning(f"Could not update state for {file_path}: {e}")
 
-def
-
-
-
-    imported_files = state_manager.get_imported_files()
+    def import_project(self, project_path: Path, limit: Optional[int] = None) -> Dict[str, Any]:
+        """Import all conversations from a project."""
+        collection_name = self.get_collection_name(project_path)
+        self.ensure_collection(collection_name)
 
-    #
-
+        # Find JSONL files
+        jsonl_files = sorted(project_path.glob("*.jsonl"))
+        if not jsonl_files:
+            logger.warning(f"No JSONL files found in {project_path}")
+            return {"imported": 0, "skipped": 0, "failed": 0}
 
-
-
+        # Apply limit if specified
+        if limit:
+            jsonl_files = jsonl_files[:limit]
 
-
-
-    logger.info(f"Skipping failed file (max retries reached): {file_path.name}")
-    return False
+        # Import files
+        stats = {"imported": 0, "skipped": 0, "failed": 0}
 
-
-
-
+        for jsonl_file in jsonl_files:
+            if not self.should_import_file(jsonl_file):
+                stats["skipped"] += 1
+                continue
 
-
-
-
-
-    stored_time = datetime.fromisoformat(stored_modified.replace("Z", "+00:00")).timestamp()
-    if abs(last_modified - stored_time) > 1: # Allow 1 second tolerance
-        logger.info(f"File modified, will re-import: {file_path.name}")
-        return True
-    except (ValueError, TypeError):
-        # If we can't parse the stored time, re-import to be safe
-        logger.warning(f"Could not parse stored modification time, will re-import: {file_path.name}")
-        return True
-
-    # Check for suspiciously low chunk counts (likely failed imports)
-    chunks = file_info.get("chunks", 0)
-    file_size_kb = file_path.stat().st_size / 1024
-
-    # Heuristic: Files > 10KB should have more than 2 chunks
-    if file_size_kb > 10 and chunks <= 2 and file_info.get("status") != "failed":
-        logger.warning(f"File has suspiciously low chunks ({chunks}) for size {file_size_kb:.1f}KB, will re-import: {file_path.name}")
-        return True
-
-    # Skip if successfully imported
-    if file_info.get("status") == "completed":
-        logger.info(f"Skipping successfully imported file: {file_path.name}")
-        return False
+            try:
+                # Calculate expected chunks based on file size
+                file_size = jsonl_file.stat().st_size
+                expected_chunks = max(1, file_size // (1024 * 100)) # Rough estimate
 
-
+                chunks = self.import_file(jsonl_file, collection_name, project_path)
 
-
-
-
+                # Validate chunk count is reasonable
+                if chunks > 0:
+                    if chunks > expected_chunks * 10:
+                        logger.warning(f"Unusual chunk count for {jsonl_file.name}: {chunks} chunks (expected ~{expected_chunks})")
+                    stats["imported"] += 1
+                else:
+                    stats["failed"] += 1
+            except Exception as e:
+                logger.error(f"Failed to import {jsonl_file}: {e}")
+                stats["failed"] += 1
+
+            # Force garbage collection periodically
+            if (stats["imported"] + stats["failed"]) % 10 == 0:
+                gc.collect()
+
+        return stats
 
-def update_file_state(file_path: Path, chunks: int, collection_name: str):
-    """Update state for imported file using UnifiedStateManager."""
-    try:
-        # Determine embedding mode from collection suffix
-        embedding_mode = "local" if collection_suffix == "local" else "cloud"
-
-        # Add file to state manager
-        state_manager.add_imported_file(
-            file_path=str(file_path),
-            chunks=chunks,
-            importer="streaming",
-            collection=collection_name,
-            embedding_mode=embedding_mode,
-            status="completed"
-        )
-        logger.debug(f"Updated state for {file_path.name}: {chunks} chunks")
-    except Exception as e:
-        logger.error(f"Failed to update state for {file_path}: {e}")
 
 def main():
-    """Main
-
-    parser =
-    parser.add_argument(
-
-
-                        help='Limit number of files to import')
-    parser.add_argument('--max-files-per-cycle', type=int,
-                        help='Maximum files to process per cycle')
+    """Main entry point."""
+    parser = argparse.ArgumentParser(description="Import conversations with reduced complexity")
+    parser.add_argument("--project", type=str, help="Specific project path to import")
+    parser.add_argument("--limit", type=int, help="Limit number of files to import")
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+
     args = parser.parse_args()
-
-
-
-
-
-
+
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Create importer
+    importer = ConversationImporter()
+
+    # Determine project path
+    if args.project:
+        project_path = Path(args.project).expanduser().resolve()
+        if not project_path.exists():
+            logger.error(f"Project path does not exist: {project_path}")
             sys.exit(1)
-
-    PREFER_LOCAL_EMBEDDINGS = False
-
-    # Re-initialize embedding provider with Voyage
-    import voyageai
-    embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
-    embedding_dimension = 1024
-    collection_suffix = "voyage"
-    logger.info("Switched to Voyage AI embeddings (dimension: 1024)")
-
-    # Get status from state manager
-    status = state_manager.get_status()
-    logger.info(f"Loaded state with {status['indexed_files']} previously imported files")
-
-    # Find all projects
-    # Use LOGS_DIR env var, or fall back to Claude projects directory, then /logs for Docker
-    logs_dir_env = os.getenv("LOGS_DIR")
-    if logs_dir_env:
-        logs_dir = Path(logs_dir_env)
-    elif (Path.home() / ".claude" / "projects").exists():
-        logs_dir = Path.home() / ".claude" / "projects"
+        projects = [project_path]
     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            break
-
-    # Limit files per cycle if specified
-    max_files = args.max_files_per_cycle or int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
-    jsonl_files = jsonl_files[:max_files]
-
-    for jsonl_file in jsonl_files:
-        # Check limit again per file
-        if args.limit and files_processed >= args.limit:
-            logger.info(f"Reached limit of {args.limit} files, stopping import")
-            break
-
-        if should_import_file(jsonl_file):
-            chunks = stream_import_file(jsonl_file, collection_name, project_dir)
-            files_processed += 1
-            if chunks > 0:
-                # Verify data is actually in Qdrant before marking as imported
-                from qdrant_client.models import Filter, FieldCondition, MatchValue
-                try:
-                    conversation_id = jsonl_file.stem
-                    count_result = _with_retries(lambda: client.count(
-                        collection_name=collection_name,
-                        count_filter=Filter(
-                            must=[FieldCondition(key="conversation_id",
-                                match=MatchValue(value=conversation_id))]
-                        ),
-                        exact=True # Ensure exact count, not approximation
-                    ))
-                    actual_count = count_result.count if hasattr(count_result, 'count') else 0
-
-                    if actual_count > 0:
-                        logger.info(f"Verified {actual_count} points in Qdrant for {conversation_id}")
-                        update_file_state(jsonl_file, chunks, collection_name)
-                        total_imported += 1
-                    else:
-                        logger.error(f"No points found in Qdrant for {conversation_id} despite {chunks} chunks processed - not marking as imported")
-                except Exception as e:
-                    logger.error(f"Failed to verify Qdrant points for {jsonl_file.name}: {e}")
-                    # Don't mark as imported if we can't verify
-
-                # Force GC after each file
-                gc.collect()
-            else:
-                # Critical fix: Don't mark files with 0 chunks as imported
-                # This allows retry on next run
-                logger.warning(f"File produced 0 chunks, not marking as imported: {jsonl_file.name}")
-                # Mark as failed so we don't keep retrying indefinitely
-                try:
-                    state_manager.mark_file_failed(str(jsonl_file), "File produced 0 chunks during import")
-                except Exception as state_error:
-                    logger.warning(f"Could not mark file as failed in state: {state_error}")
-
-    logger.info(f"Import complete: processed {total_imported} files")
+        # Import all projects
+        claude_dir = Path.home() / ".claude" / "projects"
+        if not claude_dir.exists():
+            logger.error(f"Claude projects directory not found: {claude_dir}")
+            sys.exit(1)
+        projects = [p for p in claude_dir.iterdir() if p.is_dir()]
+
+    # Import projects
+    total_stats = {"imported": 0, "skipped": 0, "failed": 0}
+
+    for project in projects:
+        logger.info(f"Importing project: {project.name}")
+        stats = importer.import_project(project, args.limit)
+
+        # Aggregate stats
+        for key in total_stats:
+            total_stats[key] += stats[key]
+
+        logger.info(f"Project {project.name}: {stats}")
+
+    # Print summary
+    logger.info(f"\nImport complete:")
+    logger.info(f" Imported: {total_stats['imported']} conversations")
+    logger.info(f" Skipped: {total_stats['skipped']} conversations")
+    logger.info(f" Failed: {total_stats['failed']} conversations")
+
 
 if __name__ == "__main__":
     main()
````