claude-self-reflect 3.0.0 → 3.0.2
This diff shows the published contents of these two package versions as they appear in their public registries; it is provided for informational purposes only.

Changed files:
- package/.claude/agents/claude-self-reflect-test.md +110 -66
- package/README.md +1 -1
- package/installer/setup-wizard.js +4 -2
- package/mcp-server/pyproject.toml +1 -0
- package/mcp-server/src/server.py +84 -0
- package/package.json +2 -1
- package/scripts/import-conversations-unified.py +225 -44
- package/scripts/importer/__init__.py +25 -0
- package/scripts/importer/__main__.py +14 -0
- package/scripts/importer/core/__init__.py +25 -0
- package/scripts/importer/core/config.py +120 -0
- package/scripts/importer/core/exceptions.py +52 -0
- package/scripts/importer/core/models.py +184 -0
- package/scripts/importer/embeddings/__init__.py +22 -0
- package/scripts/importer/embeddings/base.py +141 -0
- package/scripts/importer/embeddings/fastembed_provider.py +164 -0
- package/scripts/importer/embeddings/validator.py +136 -0
- package/scripts/importer/embeddings/voyage_provider.py +251 -0
- package/scripts/importer/main.py +393 -0
- package/scripts/importer/processors/__init__.py +15 -0
- package/scripts/importer/processors/ast_extractor.py +197 -0
- package/scripts/importer/processors/chunker.py +157 -0
- package/scripts/importer/processors/concept_extractor.py +109 -0
- package/scripts/importer/processors/conversation_parser.py +181 -0
- package/scripts/importer/processors/tool_extractor.py +165 -0
- package/scripts/importer/state/__init__.py +5 -0
- package/scripts/importer/state/state_manager.py +190 -0
- package/scripts/importer/storage/__init__.py +5 -0
- package/scripts/importer/storage/qdrant_storage.py +250 -0
- package/scripts/importer/utils/__init__.py +9 -0
- package/scripts/importer/utils/logger.py +87 -0
- package/scripts/importer/utils/project_normalizer.py +120 -0

package/scripts/import-conversations-unified.py
@@ -9,18 +9,27 @@ import os
 import sys
 import hashlib
 import gc
+import ast
+import re
 from pathlib import Path
 from datetime import datetime
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Set
 import logging
 
-# Add the
-
-sys.path.insert(0, str(
+# Add the scripts directory to the Python path for utils import
+scripts_dir = Path(__file__).parent
+sys.path.insert(0, str(scripts_dir))
 
 from qdrant_client import QdrantClient
 from qdrant_client.models import PointStruct, Distance, VectorParams
 
+# Import the correct normalize_project_name from utils
+try:
+    from utils import normalize_project_name
+except ImportError as e:
+    logging.error(f"Failed to import normalize_project_name from utils: {e}")
+    sys.exit(1)
+
 # Set up logging
 logging.basicConfig(
     level=logging.INFO,
@@ -31,6 +40,12 @@ logger = logging.getLogger(__name__)
 # Environment variables
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
 
+# Constants for metadata limits
+MAX_CONCEPTS = 10
+MAX_AST_ELEMENTS = 30
+MAX_CODE_BLOCKS = 5
+MAX_ELEMENTS_PER_BLOCK = 10
+
 # Robust cross-platform state file resolution
 def get_default_state_file():
     """Determine the default state file location with cross-platform support."""
@@ -74,9 +89,11 @@ embedding_dimension = None
 if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
     logger.info("Using local embeddings (fastembed)")
     from fastembed import TextEmbedding
+    # Using the same model as official Qdrant MCP server
     embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
     embedding_dimension = 384
     collection_suffix = "local"
+    logger.info("Using fastembed model: sentence-transformers/all-MiniLM-L6-v2")
 else:
     logger.info("Using Voyage AI embeddings")
     import voyageai
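
Aside: the local path above relies on fastembed's standard TextEmbedding API. A minimal, self-contained sketch of that usage (assumes fastembed is installed; the sample sentence is hypothetical):

    # Standalone sketch of the fastembed call the block above makes.
    from fastembed import TextEmbedding

    model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectors = list(model.embed(["USER: how do I configure qdrant?"]))
    print(len(vectors[0]))  # 384, matching embedding_dimension above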
@@ -84,15 +101,9 @@ else:
     embedding_dimension = 1024
     collection_suffix = "voyage"
 
-def normalize_project_name(project_name: str) -> str:
-    """Normalize project name for consistency."""
-    # For compatibility with delta-metadata-update, just use the project name as-is
-    # This ensures collection names match between import and delta update scripts
-    return project_name
-
 def get_collection_name(project_path: Path) -> str:
     """Generate collection name from project path."""
-    normalized = normalize_project_name(project_path
+    normalized = normalize_project_name(str(project_path))
     name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
     return f"conv_{name_hash}_{collection_suffix}"
 
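
Aside: with the pass-through shim deleted, normalization now comes from scripts/utils, but the naming scheme itself is unchanged. A minimal sketch of how a project path maps to a collection name, assuming normalization returns the string unchanged as the old shim did (`collection_name_for` is our name, not the package's):

    import hashlib

    def collection_name_for(project_path: str, suffix: str = "local") -> str:
        # md5 prefix keeps collection names short and stable across runs
        name_hash = hashlib.md5(project_path.encode()).hexdigest()[:8]
        return f"conv_{name_hash}_{suffix}"

    print(collection_name_for("/home/user/projects/claude-self-reflect"))
    # -> conv_<8 hex chars>_local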
@@ -118,18 +129,23 @@ def generate_embeddings(texts: List[str]) -> List[List[float]]:
 def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
                              conversation_id: str, created_at: str,
                              metadata: Dict[str, Any], collection_name: str,
-                             project_path: Path) -> int:
+                             project_path: Path, total_messages: int) -> int:
     """Process and immediately upload a single chunk."""
     if not messages:
         return 0
 
-    # Extract text content
+    # Extract text content and message indices
     texts = []
+    message_indices = []
     for msg in messages:
         role = msg.get("role", "unknown")
         content = msg.get("content", "")
         if content:
             texts.append(f"{role.upper()}: {content}")
+            # Fix: Check for None instead of truthiness to include 0 values
+            idx = msg.get("message_index")
+            if idx is not None:
+                message_indices.append(idx)
 
     if not texts:
         return 0
@@ -140,6 +156,29 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
     # Generate embedding
     embeddings = generate_embeddings([chunk_text])
 
+    # Sanity check embeddings
+    if not embeddings or not embeddings[0]:
+        logger.error(f"Empty embedding generated for chunk {chunk_index}")
+        return 0
+
+    embedding = embeddings[0]
+
+    # Check for degenerate embeddings (all values identical)
+    if len(set(embedding)) == 1:
+        logger.error(f"Degenerate embedding detected (all values identical): {embedding[0]}")
+        return 0
+
+    # Check variance is above threshold
+    import statistics
+    variance = statistics.variance(embedding)
+    if variance < 1e-6:
+        logger.warning(f"Low variance embedding detected: {variance}")
+
+    # Validate dimension
+    if len(embedding) != embedding_dimension:
+        logger.error(f"Embedding dimension mismatch: expected {embedding_dimension}, got {len(embedding)}")
+        return 0
+
     # Create point ID
     point_id = hashlib.md5(
         f"{conversation_id}_{chunk_index}".encode()
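
Aside: the four checks above (empty, wrong dimension, degenerate, low-variance) condense into one predicate. A sketch for illustration only; embedding_is_usable is not a function in this package:

    import statistics
    from typing import List

    def embedding_is_usable(embedding: List[float], expected_dim: int) -> bool:
        if not embedding:
            return False                 # empty embedding
        if len(embedding) != expected_dim:
            return False                 # dimension mismatch
        if len(set(embedding)) == 1:
            return False                 # degenerate: every component identical
        # Low variance is only warned about above, so it does not fail the check
        if statistics.variance(embedding) < 1e-6:
            print("warning: low-variance embedding")
        return True

    assert embedding_is_usable([0.1, -0.2, 0.3], expected_dim=3)
    assert not embedding_is_usable([0.5, 0.5, 0.5], expected_dim=3)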
@@ -151,9 +190,12 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
         "conversation_id": conversation_id,
         "chunk_index": chunk_index,
         "timestamp": created_at,
-        "project": normalize_project_name(project_path
+        "project": normalize_project_name(str(project_path)),
         "start_role": messages[0].get("role", "unknown") if messages else "unknown",
-        "message_count": len(messages)
+        "message_count": len(messages),
+        "total_messages": total_messages,
+        "message_index": message_indices[0] if message_indices else 0,
+        "message_indices": message_indices  # Store all indices in this chunk
     }
 
     # Add metadata
@@ -180,16 +222,84 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
         logger.error(f"Error processing chunk {chunk_index}: {e}")
         return 0
 
-def
-"""Extract
+def extract_ast_elements(code_text: str) -> Set[str]:
+    """Extract function and class names from code using AST parsing."""
+    elements = set()
+
+    # Try to parse as Python code
+    try:
+        tree = ast.parse(code_text)
+        for node in ast.walk(tree):
+            if isinstance(node, ast.FunctionDef):
+                elements.add(f"func:{node.name}")
+            elif isinstance(node, ast.AsyncFunctionDef):
+                elements.add(f"func:{node.name}")
+            elif isinstance(node, ast.ClassDef):
+                elements.add(f"class:{node.name}")
+    except SyntaxError:
+        # Python regex fallback for partial fragments
+        for m in re.finditer(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
+            elements.add(f"func:{m.group(1)}")
+        for m in re.finditer(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
+            elements.add(f"func:{m.group(1)}")
+        for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code_text, re.MULTILINE):
+            elements.add(f"class:{m.group(1)}")
+    except Exception as e:
+        logger.debug(f"Unexpected error parsing AST: {e}")
+
+    # Try regex patterns for other languages
+    # JavaScript/TypeScript functions
+    js_func_pattern = r'(?:function|const|let|var)\s+(\w+)\s*(?:=\s*)?(?:\([^)]*\)|\s*=>)'
+    for match in re.finditer(js_func_pattern, code_text):
+        elements.add(f"func:{match.group(1)}")
+
+    # Class definitions (multiple languages)
+    class_pattern = r'(?:class|interface|struct)\s+(\w+)'
+    for match in re.finditer(class_pattern, code_text):
+        elements.add(f"class:{match.group(1)}")
+
+    return elements
+
+def extract_concepts(text: str) -> List[str]:
+    """Extract development concepts from text."""
+    concepts = []
+    concept_patterns = {
+        'docker': r'\b(?:docker|container|compose|dockerfile)\b',
+        'testing': r'\b(?:test|testing|unittest|pytest|jest)\b',
+        'database': r'\b(?:database|sql|postgres|mysql|mongodb|qdrant)\b',
+        'api': r'\b(?:api|rest|graphql|endpoint)\b',
+        'security': r'\b(?:security|auth|authentication|encryption)\b',
+        'performance': r'\b(?:performance|optimization|cache|speed)\b',
+        'debugging': r'\b(?:debug|debugging|error|bug|trace)\b',
+        'deployment': r'\b(?:deploy|deployment|ci\/cd|production)\b',
+        'git': r'\b(?:git|commit|branch|merge|pull request)\b',
+        'mcp': r'\b(?:mcp|claude-self-reflect|claude code)\b',
+        'embeddings': r'\b(?:embedding|vector|semantic|similarity)\b',
+    }
+
+    text_lower = text.lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, text_lower, re.IGNORECASE):
+            if concept not in concepts:
+                concepts.append(concept)
+
+    return concepts[:MAX_CONCEPTS]
+
+def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, int]:
+    """Extract metadata in a single pass, return metadata, first timestamp, and message count."""
     metadata = {
         "files_analyzed": [],
         "files_edited": [],
         "tools_used": [],
-        "concepts": []
+        "concepts": [],
+        "ast_elements": [],
+        "has_code_blocks": False,
+        "total_messages": 0
     }
 
     first_timestamp = None
+    message_count = 0
+    all_text = []
 
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
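
Aside: on a small, hypothetical input, the two extractors added above behave like this (assumes both functions are in scope):

    snippet = '''
    def load_state():
        pass

    class StateManager:
        pass
    '''
    print(extract_ast_elements(snippet))
    # {'class:StateManager', 'func:load_state'}  (a set; order varies)

    print(extract_concepts("pytest run failed against the qdrant database"))
    # ['testing', 'database']  (order follows the pattern dict)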
@@ -204,53 +314,107 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
                     if first_timestamp is None and 'timestamp' in data:
                         first_timestamp = data.get('timestamp')
 
-                    #
+                    # Count messages
                     if 'message' in data and data['message']:
                         msg = data['message']
+                        if msg.get('role') in ['user', 'assistant']:
+                            message_count += 1
+
                         if msg.get('content'):
                             content = msg['content']
+                            text_content = ""
+
                             if isinstance(content, list):
                                 for item in content:
-                                    if isinstance(item, dict)
-
-
-
+                                    if isinstance(item, dict):
+                                        if item.get('type') == 'text':
+                                            text_content += item.get('text', '')
+                                            # Check for code blocks
+                                            if '```' in item.get('text', ''):
+                                                metadata['has_code_blocks'] = True
+                                                # Extract code for AST analysis with bounds checking
+                                                if len(metadata['ast_elements']) < 30:
+                                                    # Fix: More permissive regex to handle various fence formats
+                                                    code_blocks = re.findall(r'```[^\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
+                                                    for code_block in code_blocks[:5]:  # Limit to 5 blocks
+                                                        if len(metadata['ast_elements']) >= 30:
+                                                            break
+                                                        ast_elems = extract_ast_elements(code_block)
+                                                        for elem in list(ast_elems)[:10]:  # Limit elements per block
+                                                            if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < 30:
+                                                                metadata['ast_elements'].append(elem)
 
-
-
-
-
-
-
-
-
-                                    if
-
-
-
+                                        elif item.get('type') == 'tool_use':
+                                            tool_name = item.get('name', '')
+                                            if tool_name and tool_name not in metadata['tools_used']:
+                                                metadata['tools_used'].append(tool_name)
+
+                                            # Extract file references
+                                            if 'input' in item:
+                                                input_data = item['input']
+                                                if isinstance(input_data, dict):
+                                                    # Determine if it's an edit tool
+                                                    is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
+
+                                                    if 'file_path' in input_data:
+                                                        file_ref = input_data['file_path']
+                                                        if is_edit:
+                                                            if file_ref not in metadata['files_edited']:
+                                                                metadata['files_edited'].append(file_ref)
+                                                        else:
+                                                            if file_ref not in metadata['files_analyzed']:
+                                                                metadata['files_analyzed'].append(file_ref)
+
+                                                    if 'path' in input_data:
+                                                        file_ref = input_data['path']
+                                                        if file_ref not in metadata['files_analyzed']:
+                                                            metadata['files_analyzed'].append(file_ref)
+                                    elif isinstance(item, str):
+                                        text_content += item
+                            elif isinstance(content, str):
+                                text_content = content
+
+                            # Collect text for concept extraction
+                            if text_content:
+                                all_text.append(text_content[:1000])  # Limit text per message
 
                 except json.JSONDecodeError:
                     continue
                 except Exception:
                     continue
-
+
     except Exception as e:
         logger.warning(f"Error extracting metadata: {e}")
 
-
+    # Extract concepts from collected text
+    if all_text:
+        combined_text = ' '.join(all_text[:50])  # Limit to first 50 messages
+        metadata['concepts'] = extract_concepts(combined_text)
+
+    # Set total messages
+    metadata['total_messages'] = message_count
+
+    # Limit arrays
+    metadata['files_analyzed'] = metadata['files_analyzed'][:20]
+    metadata['files_edited'] = metadata['files_edited'][:20]
+    metadata['tools_used'] = metadata['tools_used'][:15]
+    metadata['ast_elements'] = metadata['ast_elements'][:30]
+
+    return metadata, first_timestamp or datetime.now().isoformat(), message_count
 
 def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
     """Stream import a single JSONL file without loading it into memory."""
     logger.info(f"Streaming import of {jsonl_file.name}")
 
     # Extract metadata in first pass (lightweight)
-    metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
+    metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))
 
     # Stream messages and process in chunks
    chunk_buffer = []
     chunk_index = 0
     total_chunks = 0
     conversation_id = jsonl_file.stem
+    current_message_index = 0
 
     try:
         with open(jsonl_file, 'r', encoding='utf-8') as f:
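
Aside: the "more permissive" fence regex above accepts code fences with or without a language tag. A quick, self-contained check:

    import re

    text = "Intro\n```python\nx = 1\n```\nand\n```\ny = 2\n```"
    print(re.findall(r'```[^\n]*\n?(.*?)```', text, re.DOTALL))
    # ['x = 1\n', 'y = 2\n']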
@@ -282,16 +446,24 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
                     content = '\n'.join(text_parts)
 
                 if content:
+                    # Track message index for user/assistant messages
+                    if msg['role'] in ['user', 'assistant']:
+                        current_message_index += 1
+                        message_idx = current_message_index
+                    else:
+                        message_idx = 0
+
                     chunk_buffer.append({
                         'role': msg['role'],
-                        'content': content
+                        'content': content,
+                        'message_index': message_idx
                     })
 
                 # Process chunk when buffer reaches MAX_CHUNK_SIZE
                 if len(chunk_buffer) >= MAX_CHUNK_SIZE:
                     chunks = process_and_upload_chunk(
                         chunk_buffer, chunk_index, conversation_id,
-                        created_at, metadata, collection_name, project_path
+                        created_at, metadata, collection_name, project_path, total_messages
                     )
                     total_chunks += chunks
                     chunk_buffer = []
@@ -313,7 +485,7 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
     if chunk_buffer:
         chunks = process_and_upload_chunk(
             chunk_buffer, chunk_index, conversation_id,
-            created_at, metadata, collection_name, project_path
+            created_at, metadata, collection_name, project_path, total_messages
         )
         total_chunks += chunks
 
@@ -335,10 +507,19 @@ def load_state() -> dict:
     return {"imported_files": {}}
 
 def save_state(state: dict):
-    """Save import state."""
-
-
+    """Save import state with atomic write."""
+    # Fix: Handle case where STATE_FILE has no directory component
+    state_dir = os.path.dirname(STATE_FILE)
+    if state_dir:
+        os.makedirs(state_dir, exist_ok=True)
+
+    # Use atomic write to prevent corruption during crashes
+    temp_file = f"{STATE_FILE}.tmp"
+    with open(temp_file, 'w') as f:
         json.dump(state, f, indent=2)
+
+    # Atomic rename (on POSIX systems)
+    os.replace(temp_file, STATE_FILE)
 
 def should_import_file(file_path: Path, state: dict) -> bool:
     """Check if file should be imported."""
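
Aside: the save path above is the classic write-temp-then-rename pattern. Generalized as a sketch (the helper name is ours, not the package's):

    import json
    import os

    def atomic_write_json(path: str, payload: dict) -> None:
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        tmp = f"{path}.tmp"
        with open(tmp, "w") as f:
            json.dump(payload, f, indent=2)
        # os.replace is atomic on POSIX and overwrites on Windows,
        # so readers never observe a half-written file
        os.replace(tmp, path)

    atomic_write_json("state/imported-files.json", {"imported_files": {}})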
package/scripts/importer/__init__.py (new file)
@@ -0,0 +1,25 @@
+"""
+Claude Self-Reflect Modular Import System
+==========================================
+
+A pristine, modular conversation import system following SOLID principles
+and clean architecture patterns.
+
+Version: 3.0.0
+Author: Claude Self-Reflect Team
+License: MIT
+"""
+
+from .core.config import ImportConfig
+from .core.models import Message, ConversationChunk, ProcessedPoint
+from .main import ConversationProcessor, ImporterContainer
+
+__version__ = "3.0.0"
+__all__ = [
+    "ImportConfig",
+    "Message",
+    "ConversationChunk",
+    "ProcessedPoint",
+    "ConversationProcessor",
+    "ImporterContainer"
+]
package/scripts/importer/__main__.py (new file)
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+"""Entry point for running the importer as a module."""
+
+import sys
+import logging
+from pathlib import Path
+
+# Add parent directory to path for standalone execution
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from importer.main import main
+
+if __name__ == "__main__":
+    sys.exit(main())
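
Aside: this entry point makes the package runnable as `python -m importer` from the scripts directory. The programmatic equivalent, assuming importer is importable (the CLI flags main() accepts live in importer/main.py and are not shown in this diff):

    from importer.main import main

    raise SystemExit(main())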
package/scripts/importer/core/__init__.py (new file)
@@ -0,0 +1,25 @@
+"""Core domain models and configuration."""
+
+from .config import ImportConfig
+from .models import Message, ConversationChunk, ProcessedPoint, ImportResult, ImportStats
+from .exceptions import (
+    ImportError,
+    ValidationError,
+    EmbeddingError,
+    StorageError,
+    ParseError
+)
+
+__all__ = [
+    "ImportConfig",
+    "Message",
+    "ConversationChunk",
+    "ProcessedPoint",
+    "ImportResult",
+    "ImportStats",
+    "ImportError",
+    "ValidationError",
+    "EmbeddingError",
+    "StorageError",
+    "ParseError"
+]
package/scripts/importer/core/config.py (new file)
@@ -0,0 +1,120 @@
+"""Immutable configuration with validation."""
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import os
+
+
+@dataclass(frozen=True)
+class ImportConfig:
+    """
+    Immutable configuration for the import system.
+
+    All validation happens in __post_init__ to ensure configuration
+    is always in a valid state.
+    """
+
+    # Qdrant settings
+    qdrant_url: str = field(default="http://localhost:6333")
+    qdrant_api_key: Optional[str] = field(default=None)
+
+    # Embedding settings
+    embedding_model: str = field(default="sentence-transformers/all-MiniLM-L6-v2")
+    embedding_dimension: int = field(default=384)
+    use_voyage: bool = field(default=False)
+    voyage_api_key: Optional[str] = field(default=None)
+
+    # Chunking settings
+    chunk_size: int = field(default=3000)
+    chunk_overlap: int = field(default=200)
+
+    # Processing settings
+    batch_size: int = field(default=10)
+    max_ast_elements: int = field(default=100)
+    max_workers: int = field(default=4)
+
+    # State management
+    state_file: str = field(default="~/.claude-self-reflect/config/imported-files.json")
+
+    # Operational settings
+    log_level: str = field(default="INFO")
+    dry_run: bool = field(default=False)
+    force_reimport: bool = field(default=False)
+
+    # Limits
+    file_limit: Optional[int] = field(default=None)
+
+    def __post_init__(self):
+        """Validate configuration on initialization."""
+        # Validate chunk settings
+        if self.chunk_size <= 0:
+            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
+
+        if self.chunk_overlap < 0:
+            raise ValueError(f"chunk_overlap cannot be negative, got {self.chunk_overlap}")
+
+        if self.chunk_overlap >= self.chunk_size:
+            raise ValueError(
+                f"chunk_overlap ({self.chunk_overlap}) must be less than "
+                f"chunk_size ({self.chunk_size})"
+            )
+
+        # Validate batch settings
+        if self.batch_size < 1:
+            raise ValueError(f"batch_size must be at least 1, got {self.batch_size}")
+
+        if self.max_workers < 1:
+            raise ValueError(f"max_workers must be at least 1, got {self.max_workers}")
+
+        # Validate embedding settings
+        if self.embedding_dimension <= 0:
+            raise ValueError(f"embedding_dimension must be positive, got {self.embedding_dimension}")
+
+        if self.use_voyage and not self.voyage_api_key:
+            # Document the limitation of frozen dataclass
+            voyage_key = os.getenv("VOYAGE_KEY")
+            if not voyage_key:
+                raise ValueError(
+                    "voyage_api_key must be provided at initialization when use_voyage=True. "
+                    "Set VOYAGE_KEY environment variable before creating config."
+                )
+
+        # Validate log level
+        valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
+        if self.log_level.upper() not in valid_levels:
+            raise ValueError(f"log_level must be one of {valid_levels}, got {self.log_level}")
+
+    @property
+    def state_file_path(self) -> Path:
+        """Get expanded state file path with fallback."""
+        try:
+            return Path(self.state_file).expanduser()
+        except (RuntimeError, OSError):
+            # Fallback to current directory if expansion fails
+            return Path.cwd() / ".import-state.json"
+
+    @classmethod
+    def from_env(cls) -> "ImportConfig":
+        """Create configuration from environment variables."""
+        return cls(
+            qdrant_url=os.getenv("QDRANT_URL", "http://localhost:6333"),
+            qdrant_api_key=os.getenv("QDRANT_API_KEY"),
+            use_voyage=os.getenv("USE_VOYAGE", "false").lower() == "true",
+            voyage_api_key=os.getenv("VOYAGE_KEY"),
+            chunk_size=int(os.getenv("CHUNK_SIZE", "3000")),
+            chunk_overlap=int(os.getenv("CHUNK_OVERLAP", "200")),
+            batch_size=int(os.getenv("BATCH_SIZE", "10")),
+            max_workers=int(os.getenv("MAX_WORKERS", "4")),
+            log_level=os.getenv("LOG_LEVEL", "INFO"),
+            dry_run=os.getenv("DRY_RUN", "false").lower() == "true",
+            force_reimport=os.getenv("FORCE_REIMPORT", "false").lower() == "true"
+        )
+
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> "ImportConfig":
+        """Create configuration from dictionary."""
+        # Filter out any unknown keys
+        known_fields = {f.name for f in cls.__dataclass_fields__.values()}
+        filtered_dict = {k: v for k, v in config_dict.items() if k in known_fields}
+        return cls(**filtered_dict)
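
Aside: a sketch of how this frozen config behaves in practice (values hypothetical):

    from importer.core.config import ImportConfig

    cfg = ImportConfig.from_env()   # reads QDRANT_URL, VOYAGE_KEY, CHUNK_SIZE, ...
    print(cfg.state_file_path)      # "~" expanded to the user's home directory

    try:
        ImportConfig(chunk_size=100, chunk_overlap=100)
    except ValueError as e:
        print(e)  # chunk_overlap (100) must be less than chunk_size (100)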
package/scripts/importer/core/exceptions.py (new file)
@@ -0,0 +1,52 @@
+"""Custom exception hierarchy for import system."""
+
+from typing import Optional, Any
+
+
+class ImportError(Exception):
+    """Base exception for all import-related errors."""
+
+    def __init__(self, message: str, details: Optional[dict] = None):
+        super().__init__(message)
+        self.details = details or {}
+
+
+class ValidationError(ImportError):
+    """Raised when input validation fails."""
+
+    def __init__(self, field: str, value: Any, reason: str):
+        super().__init__(f"Validation failed for {field}: {reason}")
+        self.field = field
+        self.value = value
+        self.reason = reason
+
+
+class EmbeddingError(ImportError):
+    """Raised when embedding generation or validation fails."""
+
+    def __init__(self, message: str, provider: Optional[str] = None):
+        super().__init__(message)
+        self.provider = provider
+
+
+class StorageError(ImportError):
+    """Raised when storage operations fail."""
+
+    def __init__(self, operation: str, collection: str, reason: str):
+        super().__init__(f"Storage {operation} failed for {collection}: {reason}")
+        self.operation = operation
+        self.collection = collection
+
+
+class ParseError(ImportError):
+    """Raised when parsing conversation files fails."""
+
+    def __init__(self, file_path: str, line_number: Optional[int] = None, reason: str = ""):
+        message = f"Failed to parse {file_path}"
+        if line_number:
+            message += f" at line {line_number}"
+        if reason:
+            message += f": {reason}"
+        super().__init__(message)
+        self.file_path = file_path
+        self.line_number = line_number