claude-self-reflect 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/.claude/agents/claude-self-reflect-test.md +110 -66
  2. package/README.md +1 -1
  3. package/installer/setup-wizard.js +4 -2
  4. package/mcp-server/pyproject.toml +1 -0
  5. package/mcp-server/src/server.py +84 -0
  6. package/package.json +2 -1
  7. package/scripts/import-conversations-unified.py +225 -44
  8. package/scripts/importer/__init__.py +25 -0
  9. package/scripts/importer/__main__.py +14 -0
  10. package/scripts/importer/core/__init__.py +25 -0
  11. package/scripts/importer/core/config.py +120 -0
  12. package/scripts/importer/core/exceptions.py +52 -0
  13. package/scripts/importer/core/models.py +184 -0
  14. package/scripts/importer/embeddings/__init__.py +22 -0
  15. package/scripts/importer/embeddings/base.py +141 -0
  16. package/scripts/importer/embeddings/fastembed_provider.py +164 -0
  17. package/scripts/importer/embeddings/validator.py +136 -0
  18. package/scripts/importer/embeddings/voyage_provider.py +251 -0
  19. package/scripts/importer/main.py +393 -0
  20. package/scripts/importer/processors/__init__.py +15 -0
  21. package/scripts/importer/processors/ast_extractor.py +197 -0
  22. package/scripts/importer/processors/chunker.py +157 -0
  23. package/scripts/importer/processors/concept_extractor.py +109 -0
  24. package/scripts/importer/processors/conversation_parser.py +181 -0
  25. package/scripts/importer/processors/tool_extractor.py +165 -0
  26. package/scripts/importer/state/__init__.py +5 -0
  27. package/scripts/importer/state/state_manager.py +190 -0
  28. package/scripts/importer/storage/__init__.py +5 -0
  29. package/scripts/importer/storage/qdrant_storage.py +250 -0
  30. package/scripts/importer/utils/__init__.py +9 -0
  31. package/scripts/importer/utils/logger.py +87 -0
  32. package/scripts/importer/utils/project_normalizer.py +120 -0
package/scripts/import-conversations-unified.py
@@ -9,18 +9,27 @@ import os
 import sys
 import hashlib
 import gc
+import ast
+import re
 from pathlib import Path
 from datetime import datetime
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Set
 import logging
 
-# Add the project root to the Python path
-project_root = Path(__file__).parent.parent
-sys.path.insert(0, str(project_root))
+# Add the scripts directory to the Python path for utils import
+scripts_dir = Path(__file__).parent
+sys.path.insert(0, str(scripts_dir))
 
 from qdrant_client import QdrantClient
 from qdrant_client.models import PointStruct, Distance, VectorParams
 
+# Import the correct normalize_project_name from utils
+try:
+    from utils import normalize_project_name
+except ImportError as e:
+    logging.error(f"Failed to import normalize_project_name from utils: {e}")
+    sys.exit(1)
+
 # Set up logging
 logging.basicConfig(
     level=logging.INFO,
@@ -31,6 +40,12 @@ logger = logging.getLogger(__name__)
 # Environment variables
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
 
+# Constants for metadata limits
+MAX_CONCEPTS = 10
+MAX_AST_ELEMENTS = 30
+MAX_CODE_BLOCKS = 5
+MAX_ELEMENTS_PER_BLOCK = 10
+
 # Robust cross-platform state file resolution
 def get_default_state_file():
     """Determine the default state file location with cross-platform support."""
@@ -74,9 +89,11 @@ embedding_dimension = None
 if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
     logger.info("Using local embeddings (fastembed)")
     from fastembed import TextEmbedding
+    # Using the same model as official Qdrant MCP server
    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
     embedding_dimension = 384
     collection_suffix = "local"
+    logger.info("Using fastembed model: sentence-transformers/all-MiniLM-L6-v2")
 else:
     logger.info("Using Voyage AI embeddings")
     import voyageai
@@ -84,15 +101,9 @@ else:
     embedding_dimension = 1024
     collection_suffix = "voyage"
 
-def normalize_project_name(project_name: str) -> str:
-    """Normalize project name for consistency."""
-    # For compatibility with delta-metadata-update, just use the project name as-is
-    # This ensures collection names match between import and delta update scripts
-    return project_name
-
 def get_collection_name(project_path: Path) -> str:
     """Generate collection name from project path."""
-    normalized = normalize_project_name(project_path.name)
+    normalized = normalize_project_name(str(project_path))
     name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
     return f"conv_{name_hash}_{collection_suffix}"
 
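The naming scheme above hashes the normalized project path and appends the embedding suffix; normalize_project_name itself now comes from the shared utils module and is not shown in this diff. A minimal sketch of the naming step, using a hypothetical normalized value:

    import hashlib

    normalized = "-Users-alice-projects-claude-self-reflect"  # hypothetical output of normalize_project_name
    collection_suffix = "local"                               # "voyage" when Voyage embeddings are active
    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
    print(f"conv_{name_hash}_{collection_suffix}")            # "conv_" + 8 hex chars + "_" + suffix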
@@ -118,18 +129,23 @@ def generate_embeddings(texts: List[str]) -> List[List[float]]:
 def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
                              conversation_id: str, created_at: str,
                              metadata: Dict[str, Any], collection_name: str,
-                             project_path: Path) -> int:
+                             project_path: Path, total_messages: int) -> int:
     """Process and immediately upload a single chunk."""
     if not messages:
         return 0
 
-    # Extract text content
+    # Extract text content and message indices
     texts = []
+    message_indices = []
     for msg in messages:
         role = msg.get("role", "unknown")
         content = msg.get("content", "")
         if content:
            texts.append(f"{role.upper()}: {content}")
+            # Fix: Check for None instead of truthiness to include 0 values
+            idx = msg.get("message_index")
+            if idx is not None:
+                message_indices.append(idx)
 
     if not texts:
         return 0
@@ -140,6 +156,29 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
     # Generate embedding
     embeddings = generate_embeddings([chunk_text])
 
+    # Sanity check embeddings
+    if not embeddings or not embeddings[0]:
+        logger.error(f"Empty embedding generated for chunk {chunk_index}")
+        return 0
+
+    embedding = embeddings[0]
+
+    # Check for degenerate embeddings (all values identical)
+    if len(set(embedding)) == 1:
+        logger.error(f"Degenerate embedding detected (all values identical): {embedding[0]}")
+        return 0
+
+    # Check variance is above threshold
+    import statistics
+    variance = statistics.variance(embedding)
+    if variance < 1e-6:
+        logger.warning(f"Low variance embedding detected: {variance}")
+
+    # Validate dimension
+    if len(embedding) != embedding_dimension:
+        logger.error(f"Embedding dimension mismatch: expected {embedding_dimension}, got {len(embedding)}")
+        return 0
+
     # Create point ID
     point_id = hashlib.md5(
         f"{conversation_id}_{chunk_index}".encode()
@@ -151,9 +190,12 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
         "conversation_id": conversation_id,
         "chunk_index": chunk_index,
         "timestamp": created_at,
-        "project": normalize_project_name(project_path.name),
+        "project": normalize_project_name(str(project_path)),
         "start_role": messages[0].get("role", "unknown") if messages else "unknown",
-        "message_count": len(messages)
+        "message_count": len(messages),
+        "total_messages": total_messages,
+        "message_index": message_indices[0] if message_indices else 0,
+        "message_indices": message_indices  # Store all indices in this chunk
     }
 
     # Add metadata
@@ -180,16 +222,84 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
         logger.error(f"Error processing chunk {chunk_index}: {e}")
         return 0
 
-def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
-    """Extract metadata in a single pass, return metadata and first timestamp."""
+def extract_ast_elements(code_text: str) -> Set[str]:
+    """Extract function and class names from code using AST parsing."""
+    elements = set()
+
+    # Try to parse as Python code
+    try:
+        tree = ast.parse(code_text)
+        for node in ast.walk(tree):
+            if isinstance(node, ast.FunctionDef):
+                elements.add(f"func:{node.name}")
+            elif isinstance(node, ast.AsyncFunctionDef):
+                elements.add(f"func:{node.name}")
+            elif isinstance(node, ast.ClassDef):
+                elements.add(f"class:{node.name}")
+    except SyntaxError:
+        # Python regex fallback for partial fragments
+        for m in re.finditer(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
+            elements.add(f"func:{m.group(1)}")
+        for m in re.finditer(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
+            elements.add(f"func:{m.group(1)}")
+        for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code_text, re.MULTILINE):
+            elements.add(f"class:{m.group(1)}")
+    except Exception as e:
+        logger.debug(f"Unexpected error parsing AST: {e}")
+
+    # Try regex patterns for other languages
+    # JavaScript/TypeScript functions
+    js_func_pattern = r'(?:function|const|let|var)\s+(\w+)\s*(?:=\s*)?(?:\([^)]*\)|\s*=>)'
+    for match in re.finditer(js_func_pattern, code_text):
+        elements.add(f"func:{match.group(1)}")
+
+    # Class definitions (multiple languages)
+    class_pattern = r'(?:class|interface|struct)\s+(\w+)'
+    for match in re.finditer(class_pattern, code_text):
+        elements.add(f"class:{match.group(1)}")
+
+    return elements
+
+def extract_concepts(text: str) -> List[str]:
+    """Extract development concepts from text."""
+    concepts = []
+    concept_patterns = {
+        'docker': r'\b(?:docker|container|compose|dockerfile)\b',
+        'testing': r'\b(?:test|testing|unittest|pytest|jest)\b',
+        'database': r'\b(?:database|sql|postgres|mysql|mongodb|qdrant)\b',
+        'api': r'\b(?:api|rest|graphql|endpoint)\b',
+        'security': r'\b(?:security|auth|authentication|encryption)\b',
+        'performance': r'\b(?:performance|optimization|cache|speed)\b',
+        'debugging': r'\b(?:debug|debugging|error|bug|trace)\b',
+        'deployment': r'\b(?:deploy|deployment|ci\/cd|production)\b',
+        'git': r'\b(?:git|commit|branch|merge|pull request)\b',
+        'mcp': r'\b(?:mcp|claude-self-reflect|claude code)\b',
+        'embeddings': r'\b(?:embedding|vector|semantic|similarity)\b',
+    }
+
+    text_lower = text.lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, text_lower, re.IGNORECASE):
+            if concept not in concepts:
+                concepts.append(concept)
+
+    return concepts[:MAX_CONCEPTS]
+
+def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, int]:
+    """Extract metadata in a single pass, return metadata, first timestamp, and message count."""
     metadata = {
         "files_analyzed": [],
         "files_edited": [],
         "tools_used": [],
-        "concepts": []
+        "concepts": [],
+        "ast_elements": [],
+        "has_code_blocks": False,
+        "total_messages": 0
     }
 
     first_timestamp = None
+    message_count = 0
+    all_text = []
 
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
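For orientation, a rough sketch of what the two new helpers above produce, assuming they are called directly; the expected values are inferred from the code shown, not from running it:

    code = """
    class QdrantStorage:
        def upsert(self, points):
            pass
    """
    extract_ast_elements(code)
    # should yield a set like {"class:QdrantStorage", "func:upsert"}

    extract_concepts("How do I debug the docker compose setup for Qdrant?")
    # should yield ["docker", "database", "debugging"], capped at MAX_CONCEPTS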
@@ -204,53 +314,107 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
                     if first_timestamp is None and 'timestamp' in data:
                         first_timestamp = data.get('timestamp')
 
-                    # Extract tool usage from messages
+                    # Count messages
                     if 'message' in data and data['message']:
                         msg = data['message']
+                        if msg.get('role') in ['user', 'assistant']:
+                            message_count += 1
+
                         if msg.get('content'):
                             content = msg['content']
+                            text_content = ""
+
                             if isinstance(content, list):
                                 for item in content:
-                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
-                                        tool_name = item.get('name', '')
-                                        if tool_name and tool_name not in metadata['tools_used']:
-                                            metadata['tools_used'].append(tool_name)
+                                    if isinstance(item, dict):
+                                        if item.get('type') == 'text':
+                                            text_content += item.get('text', '')
+                                            # Check for code blocks
+                                            if '```' in item.get('text', ''):
+                                                metadata['has_code_blocks'] = True
+                                                # Extract code for AST analysis with bounds checking
+                                                if len(metadata['ast_elements']) < 30:
+                                                    # Fix: More permissive regex to handle various fence formats
+                                                    code_blocks = re.findall(r'```[^\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
+                                                    for code_block in code_blocks[:5]:  # Limit to 5 blocks
+                                                        if len(metadata['ast_elements']) >= 30:
+                                                            break
+                                                        ast_elems = extract_ast_elements(code_block)
+                                                        for elem in list(ast_elems)[:10]:  # Limit elements per block
+                                                            if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < 30:
+                                                                metadata['ast_elements'].append(elem)
 
-                                    # Extract file references
-                                    if 'input' in item:
-                                        input_data = item['input']
-                                        if isinstance(input_data, dict):
-                                            if 'file_path' in input_data:
-                                                file_ref = input_data['file_path']
-                                                if file_ref not in metadata['files_analyzed']:
-                                                    metadata['files_analyzed'].append(file_ref)
-                                            if 'path' in input_data:
-                                                file_ref = input_data['path']
-                                                if file_ref not in metadata['files_analyzed']:
-                                                    metadata['files_analyzed'].append(file_ref)
+                                        elif item.get('type') == 'tool_use':
+                                            tool_name = item.get('name', '')
+                                            if tool_name and tool_name not in metadata['tools_used']:
+                                                metadata['tools_used'].append(tool_name)
+
+                                            # Extract file references
+                                            if 'input' in item:
+                                                input_data = item['input']
+                                                if isinstance(input_data, dict):
+                                                    # Determine if it's an edit tool
+                                                    is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
+
+                                                    if 'file_path' in input_data:
+                                                        file_ref = input_data['file_path']
+                                                        if is_edit:
+                                                            if file_ref not in metadata['files_edited']:
+                                                                metadata['files_edited'].append(file_ref)
+                                                        else:
+                                                            if file_ref not in metadata['files_analyzed']:
+                                                                metadata['files_analyzed'].append(file_ref)
+
+                                                    if 'path' in input_data:
+                                                        file_ref = input_data['path']
+                                                        if file_ref not in metadata['files_analyzed']:
+                                                            metadata['files_analyzed'].append(file_ref)
+                                    elif isinstance(item, str):
+                                        text_content += item
+                            elif isinstance(content, str):
+                                text_content = content
+
+                            # Collect text for concept extraction
+                            if text_content:
+                                all_text.append(text_content[:1000])  # Limit text per message
 
                 except json.JSONDecodeError:
                     continue
                 except Exception:
                     continue
-
+
     except Exception as e:
         logger.warning(f"Error extracting metadata: {e}")
 
-    return metadata, first_timestamp or datetime.now().isoformat()
+    # Extract concepts from collected text
+    if all_text:
+        combined_text = ' '.join(all_text[:50])  # Limit to first 50 messages
+        metadata['concepts'] = extract_concepts(combined_text)
+
+    # Set total messages
+    metadata['total_messages'] = message_count
+
+    # Limit arrays
+    metadata['files_analyzed'] = metadata['files_analyzed'][:20]
+    metadata['files_edited'] = metadata['files_edited'][:20]
+    metadata['tools_used'] = metadata['tools_used'][:15]
+    metadata['ast_elements'] = metadata['ast_elements'][:30]
+
+    return metadata, first_timestamp or datetime.now().isoformat(), message_count
 
 def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
     """Stream import a single JSONL file without loading it into memory."""
     logger.info(f"Streaming import of {jsonl_file.name}")
 
     # Extract metadata in first pass (lightweight)
-    metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
+    metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))
 
     # Stream messages and process in chunks
     chunk_buffer = []
     chunk_index = 0
     total_chunks = 0
     conversation_id = jsonl_file.stem
+    current_message_index = 0
 
     try:
         with open(jsonl_file, 'r', encoding='utf-8') as f:
@@ -282,16 +446,24 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
                         content = '\n'.join(text_parts)
 
                         if content:
+                            # Track message index for user/assistant messages
+                            if msg['role'] in ['user', 'assistant']:
+                                current_message_index += 1
+                                message_idx = current_message_index
+                            else:
+                                message_idx = 0
+
                             chunk_buffer.append({
                                 'role': msg['role'],
-                                'content': content
+                                'content': content,
+                                'message_index': message_idx
                             })
 
                             # Process chunk when buffer reaches MAX_CHUNK_SIZE
                             if len(chunk_buffer) >= MAX_CHUNK_SIZE:
                                 chunks = process_and_upload_chunk(
                                     chunk_buffer, chunk_index, conversation_id,
-                                    created_at, metadata, collection_name, project_path
+                                    created_at, metadata, collection_name, project_path, total_messages
                                 )
                                 total_chunks += chunks
                                 chunk_buffer = []
@@ -313,7 +485,7 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
        if chunk_buffer:
            chunks = process_and_upload_chunk(
                chunk_buffer, chunk_index, conversation_id,
-                created_at, metadata, collection_name, project_path
+                created_at, metadata, collection_name, project_path, total_messages
            )
            total_chunks += chunks
 
@@ -335,10 +507,19 @@ def load_state() -> dict:
     return {"imported_files": {}}
 
 def save_state(state: dict):
-    """Save import state."""
-    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-    with open(STATE_FILE, 'w') as f:
+    """Save import state with atomic write."""
+    # Fix: Handle case where STATE_FILE has no directory component
+    state_dir = os.path.dirname(STATE_FILE)
+    if state_dir:
+        os.makedirs(state_dir, exist_ok=True)
+
+    # Use atomic write to prevent corruption during crashes
+    temp_file = f"{STATE_FILE}.tmp"
+    with open(temp_file, 'w') as f:
        json.dump(state, f, indent=2)
+
+    # Atomic rename (on POSIX systems)
+    os.replace(temp_file, STATE_FILE)
 
 def should_import_file(file_path: Path, state: dict) -> bool:
     """Check if file should be imported."""
package/scripts/importer/__init__.py
@@ -0,0 +1,25 @@
+"""
+Claude Self-Reflect Modular Import System
+==========================================
+
+A pristine, modular conversation import system following SOLID principles
+and clean architecture patterns.
+
+Version: 3.0.0
+Author: Claude Self-Reflect Team
+License: MIT
+"""
+
+from .core.config import ImportConfig
+from .core.models import Message, ConversationChunk, ProcessedPoint
+from .main import ConversationProcessor, ImporterContainer
+
+__version__ = "3.0.0"
+__all__ = [
+    "ImportConfig",
+    "Message",
+    "ConversationChunk",
+    "ProcessedPoint",
+    "ConversationProcessor",
+    "ImporterContainer"
+]
package/scripts/importer/__main__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+"""Entry point for running the importer as a module."""
+
+import sys
+import logging
+from pathlib import Path
+
+# Add parent directory to path for standalone execution
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from importer.main import main
+
+if __name__ == "__main__":
+    sys.exit(main())
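The entry point above makes the new package runnable as a module. A minimal sketch of both invocation styles, assuming the working directory is package/scripts/ so that `importer` is importable:

    # from a shell, run inside package/scripts/:
    #     python -m importer
    # or call the same entry point from Python (mirrors what __main__.py does):
    import sys
    from importer.main import main  # main() lives in importer/main.py, not shown in this diff

    sys.exit(main())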
package/scripts/importer/core/__init__.py
@@ -0,0 +1,25 @@
+"""Core domain models and configuration."""
+
+from .config import ImportConfig
+from .models import Message, ConversationChunk, ProcessedPoint, ImportResult, ImportStats
+from .exceptions import (
+    ImportError,
+    ValidationError,
+    EmbeddingError,
+    StorageError,
+    ParseError
+)
+
+__all__ = [
+    "ImportConfig",
+    "Message",
+    "ConversationChunk",
+    "ProcessedPoint",
+    "ImportResult",
+    "ImportStats",
+    "ImportError",
+    "ValidationError",
+    "EmbeddingError",
+    "StorageError",
+    "ParseError"
+]
package/scripts/importer/core/config.py
@@ -0,0 +1,120 @@
+"""Immutable configuration with validation."""
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import os
+
+
+@dataclass(frozen=True)
+class ImportConfig:
+    """
+    Immutable configuration for the import system.
+
+    All validation happens in __post_init__ to ensure configuration
+    is always in a valid state.
+    """
+
+    # Qdrant settings
+    qdrant_url: str = field(default="http://localhost:6333")
+    qdrant_api_key: Optional[str] = field(default=None)
+
+    # Embedding settings
+    embedding_model: str = field(default="sentence-transformers/all-MiniLM-L6-v2")
+    embedding_dimension: int = field(default=384)
+    use_voyage: bool = field(default=False)
+    voyage_api_key: Optional[str] = field(default=None)
+
+    # Chunking settings
+    chunk_size: int = field(default=3000)
+    chunk_overlap: int = field(default=200)
+
+    # Processing settings
+    batch_size: int = field(default=10)
+    max_ast_elements: int = field(default=100)
+    max_workers: int = field(default=4)
+
+    # State management
+    state_file: str = field(default="~/.claude-self-reflect/config/imported-files.json")
+
+    # Operational settings
+    log_level: str = field(default="INFO")
+    dry_run: bool = field(default=False)
+    force_reimport: bool = field(default=False)
+
+    # Limits
+    file_limit: Optional[int] = field(default=None)
+
+    def __post_init__(self):
+        """Validate configuration on initialization."""
+        # Validate chunk settings
+        if self.chunk_size <= 0:
+            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
+
+        if self.chunk_overlap < 0:
+            raise ValueError(f"chunk_overlap cannot be negative, got {self.chunk_overlap}")
+
+        if self.chunk_overlap >= self.chunk_size:
+            raise ValueError(
+                f"chunk_overlap ({self.chunk_overlap}) must be less than "
+                f"chunk_size ({self.chunk_size})"
+            )
+
+        # Validate batch settings
+        if self.batch_size < 1:
+            raise ValueError(f"batch_size must be at least 1, got {self.batch_size}")
+
+        if self.max_workers < 1:
+            raise ValueError(f"max_workers must be at least 1, got {self.max_workers}")
+
+        # Validate embedding settings
+        if self.embedding_dimension <= 0:
+            raise ValueError(f"embedding_dimension must be positive, got {self.embedding_dimension}")
+
+        if self.use_voyage and not self.voyage_api_key:
+            # Document the limitation of frozen dataclass
+            voyage_key = os.getenv("VOYAGE_KEY")
+            if not voyage_key:
+                raise ValueError(
+                    "voyage_api_key must be provided at initialization when use_voyage=True. "
+                    "Set VOYAGE_KEY environment variable before creating config."
+                )
+
+        # Validate log level
+        valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
+        if self.log_level.upper() not in valid_levels:
+            raise ValueError(f"log_level must be one of {valid_levels}, got {self.log_level}")
+
+    @property
+    def state_file_path(self) -> Path:
+        """Get expanded state file path with fallback."""
+        try:
+            return Path(self.state_file).expanduser()
+        except (RuntimeError, OSError):
+            # Fallback to current directory if expansion fails
+            return Path.cwd() / ".import-state.json"
+
+    @classmethod
+    def from_env(cls) -> "ImportConfig":
+        """Create configuration from environment variables."""
+        return cls(
+            qdrant_url=os.getenv("QDRANT_URL", "http://localhost:6333"),
+            qdrant_api_key=os.getenv("QDRANT_API_KEY"),
+            use_voyage=os.getenv("USE_VOYAGE", "false").lower() == "true",
+            voyage_api_key=os.getenv("VOYAGE_KEY"),
+            chunk_size=int(os.getenv("CHUNK_SIZE", "3000")),
+            chunk_overlap=int(os.getenv("CHUNK_OVERLAP", "200")),
+            batch_size=int(os.getenv("BATCH_SIZE", "10")),
+            max_workers=int(os.getenv("MAX_WORKERS", "4")),
+            log_level=os.getenv("LOG_LEVEL", "INFO"),
+            dry_run=os.getenv("DRY_RUN", "false").lower() == "true",
+            force_reimport=os.getenv("FORCE_REIMPORT", "false").lower() == "true"
+        )
+
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> "ImportConfig":
+        """Create configuration from dictionary."""
+        # Filter out any unknown keys
+        known_fields = {f.name for f in cls.__dataclass_fields__.values()}
+        filtered_dict = {k: v for k, v in config_dict.items() if k in known_fields}
+        return cls(**filtered_dict)
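A short usage sketch for ImportConfig above; the import path assumes package/scripts/ is on sys.path, and the values are illustrative:

    import os
    from importer.core.config import ImportConfig

    os.environ.setdefault("QDRANT_URL", "http://localhost:6333")
    config = ImportConfig.from_env()   # reads QDRANT_URL, VOYAGE_KEY, CHUNK_SIZE, ...
    print(config.state_file_path)      # expanded ~/.claude-self-reflect/config/imported-files.json

    # Validation runs in __post_init__, so bad settings fail at construction time:
    try:
        ImportConfig(chunk_size=100, chunk_overlap=200)
    except ValueError as exc:
        print(f"rejected: {exc}")      # chunk_overlap must be less than chunk_size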
package/scripts/importer/core/exceptions.py
@@ -0,0 +1,52 @@
+"""Custom exception hierarchy for import system."""
+
+from typing import Optional, Any
+
+
+class ImportError(Exception):
+    """Base exception for all import-related errors."""
+
+    def __init__(self, message: str, details: Optional[dict] = None):
+        super().__init__(message)
+        self.details = details or {}
+
+
+class ValidationError(ImportError):
+    """Raised when input validation fails."""
+
+    def __init__(self, field: str, value: Any, reason: str):
+        super().__init__(f"Validation failed for {field}: {reason}")
+        self.field = field
+        self.value = value
+        self.reason = reason
+
+
+class EmbeddingError(ImportError):
+    """Raised when embedding generation or validation fails."""
+
+    def __init__(self, message: str, provider: Optional[str] = None):
+        super().__init__(message)
+        self.provider = provider
+
+
+class StorageError(ImportError):
+    """Raised when storage operations fail."""
+
+    def __init__(self, operation: str, collection: str, reason: str):
+        super().__init__(f"Storage {operation} failed for {collection}: {reason}")
+        self.operation = operation
+        self.collection = collection
+
+
+class ParseError(ImportError):
+    """Raised when parsing conversation files fails."""
+
+    def __init__(self, file_path: str, line_number: Optional[int] = None, reason: str = ""):
+        message = f"Failed to parse {file_path}"
+        if line_number:
+            message += f" at line {line_number}"
+        if reason:
+            message += f": {reason}"
+        super().__init__(message)
+        self.file_path = file_path
+        self.line_number = line_number
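A brief sketch of how the exception hierarchy above is meant to be caught; the file path and line number are made up. Note that this ImportError shares its name with the Python builtin, so importing it shadows the builtin in the importing module:

    from importer.core.exceptions import ImportError, ParseError  # shadows the builtin ImportError here

    try:
        raise ParseError("conversations/session.jsonl", line_number=42, reason="invalid JSON")
    except ImportError as exc:  # ParseError, ValidationError, EmbeddingError, StorageError all derive from this base
        print(exc)              # Failed to parse conversations/session.jsonl at line 42: invalid JSON
        print(exc.details)      # {} by default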