claude-self-reflect 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/.claude/agents/claude-self-reflect-test.md +110 -66
  2. package/README.md +1 -1
  3. package/installer/setup-wizard.js +4 -2
  4. package/mcp-server/pyproject.toml +1 -0
  5. package/mcp-server/src/server.py +84 -0
  6. package/package.json +2 -1
  7. package/scripts/import-conversations-unified.py +225 -44
  8. package/scripts/importer/__init__.py +25 -0
  9. package/scripts/importer/__main__.py +14 -0
  10. package/scripts/importer/core/__init__.py +25 -0
  11. package/scripts/importer/core/config.py +120 -0
  12. package/scripts/importer/core/exceptions.py +52 -0
  13. package/scripts/importer/core/models.py +184 -0
  14. package/scripts/importer/embeddings/__init__.py +22 -0
  15. package/scripts/importer/embeddings/base.py +141 -0
  16. package/scripts/importer/embeddings/fastembed_provider.py +164 -0
  17. package/scripts/importer/embeddings/validator.py +136 -0
  18. package/scripts/importer/embeddings/voyage_provider.py +251 -0
  19. package/scripts/importer/main.py +393 -0
  20. package/scripts/importer/processors/__init__.py +15 -0
  21. package/scripts/importer/processors/ast_extractor.py +197 -0
  22. package/scripts/importer/processors/chunker.py +157 -0
  23. package/scripts/importer/processors/concept_extractor.py +109 -0
  24. package/scripts/importer/processors/conversation_parser.py +181 -0
  25. package/scripts/importer/processors/tool_extractor.py +165 -0
  26. package/scripts/importer/state/__init__.py +5 -0
  27. package/scripts/importer/state/state_manager.py +190 -0
  28. package/scripts/importer/storage/__init__.py +5 -0
  29. package/scripts/importer/storage/qdrant_storage.py +250 -0
  30. package/scripts/importer/utils/__init__.py +9 -0
  31. package/scripts/importer/utils/logger.py +87 -0
  32. package/scripts/importer/utils/project_normalizer.py +120 -0
package/scripts/import-conversations-unified.py
@@ -9,18 +9,27 @@ import os
 import sys
 import hashlib
 import gc
+import ast
+import re
 from pathlib import Path
 from datetime import datetime
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Set
 import logging
 
-# Add the project root to the Python path
-project_root = Path(__file__).parent.parent
-sys.path.insert(0, str(project_root))
+# Add the scripts directory to the Python path for utils import
+scripts_dir = Path(__file__).parent
+sys.path.insert(0, str(scripts_dir))
 
 from qdrant_client import QdrantClient
 from qdrant_client.models import PointStruct, Distance, VectorParams
 
+# Import the correct normalize_project_name from utils
+try:
+    from utils import normalize_project_name
+except ImportError as e:
+    logging.error(f"Failed to import normalize_project_name from utils: {e}")
+    sys.exit(1)
+
 # Set up logging
 logging.basicConfig(
     level=logging.INFO,
@@ -31,6 +40,12 @@ logger = logging.getLogger(__name__)
 # Environment variables
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
 
+# Constants for metadata limits
+MAX_CONCEPTS = 10
+MAX_AST_ELEMENTS = 30
+MAX_CODE_BLOCKS = 5
+MAX_ELEMENTS_PER_BLOCK = 10
+
 # Robust cross-platform state file resolution
 def get_default_state_file():
     """Determine the default state file location with cross-platform support."""
@@ -74,9 +89,11 @@ embedding_dimension = None
 if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
     logger.info("Using local embeddings (fastembed)")
     from fastembed import TextEmbedding
+    # Using the same model as official Qdrant MCP server
    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
     embedding_dimension = 384
     collection_suffix = "local"
+    logger.info("Using fastembed model: sentence-transformers/all-MiniLM-L6-v2")
 else:
     logger.info("Using Voyage AI embeddings")
     import voyageai
@@ -84,15 +101,9 @@ else:
     embedding_dimension = 1024
     collection_suffix = "voyage"
 
-def normalize_project_name(project_name: str) -> str:
-    """Normalize project name for consistency."""
-    # For compatibility with delta-metadata-update, just use the project name as-is
-    # This ensures collection names match between import and delta update scripts
-    return project_name
-
 def get_collection_name(project_path: Path) -> str:
     """Generate collection name from project path."""
-    normalized = normalize_project_name(project_path.name)
+    normalized = normalize_project_name(str(project_path))
     name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
     return f"conv_{name_hash}_{collection_suffix}"
 
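The naming scheme above hashes the normalized project path and appends the embedding suffix; normalize_project_name itself now comes from the shared utils module and is not shown in this diff. A minimal sketch of the naming step, using a hypothetical normalized value:

    import hashlib

    normalized = "-Users-alice-projects-claude-self-reflect"  # hypothetical output of normalize_project_name
    collection_suffix = "local"                               # "voyage" when Voyage embeddings are active
    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
    print(f"conv_{name_hash}_{collection_suffix}")            # "conv_" + 8 hex chars + "_" + suffix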
@@ -118,18 +129,23 @@ def generate_embeddings(texts: List[str]) -> List[List[float]]:
 def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
                              conversation_id: str, created_at: str,
                              metadata: Dict[str, Any], collection_name: str,
-                             project_path: Path) -> int:
+                             project_path: Path, total_messages: int) -> int:
     """Process and immediately upload a single chunk."""
     if not messages:
         return 0
 
-    # Extract text content
+    # Extract text content and message indices
     texts = []
+    message_indices = []
     for msg in messages:
         role = msg.get("role", "unknown")
         content = msg.get("content", "")
         if content:
            texts.append(f"{role.upper()}: {content}")
+            # Fix: Check for None instead of truthiness to include 0 values
+            idx = msg.get("message_index")
+            if idx is not None:
+                message_indices.append(idx)
 
     if not texts:
         return 0
@@ -140,6 +156,29 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
     # Generate embedding
     embeddings = generate_embeddings([chunk_text])
 
+    # Sanity check embeddings
+    if not embeddings or not embeddings[0]:
+        logger.error(f"Empty embedding generated for chunk {chunk_index}")
+        return 0
+
+    embedding = embeddings[0]
+
+    # Check for degenerate embeddings (all values identical)
+    if len(set(embedding)) == 1:
+        logger.error(f"Degenerate embedding detected (all values identical): {embedding[0]}")
+        return 0
+
+    # Check variance is above threshold
+    import statistics
+    variance = statistics.variance(embedding)
+    if variance < 1e-6:
+        logger.warning(f"Low variance embedding detected: {variance}")
+
+    # Validate dimension
+    if len(embedding) != embedding_dimension:
+        logger.error(f"Embedding dimension mismatch: expected {embedding_dimension}, got {len(embedding)}")
+        return 0
+
     # Create point ID
     point_id = hashlib.md5(
         f"{conversation_id}_{chunk_index}".encode()
@@ -151,9 +190,12 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
         "conversation_id": conversation_id,
         "chunk_index": chunk_index,
         "timestamp": created_at,
-        "project": normalize_project_name(project_path.name),
+        "project": normalize_project_name(str(project_path)),
         "start_role": messages[0].get("role", "unknown") if messages else "unknown",
-        "message_count": len(messages)
+        "message_count": len(messages),
+        "total_messages": total_messages,
+        "message_index": message_indices[0] if message_indices else 0,
+        "message_indices": message_indices  # Store all indices in this chunk
     }
 
     # Add metadata
@@ -180,16 +222,84 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
         logger.error(f"Error processing chunk {chunk_index}: {e}")
         return 0
 
-def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
-    """Extract metadata in a single pass, return metadata and first timestamp."""
+def extract_ast_elements(code_text: str) -> Set[str]:
+    """Extract function and class names from code using AST parsing."""
+    elements = set()
+
+    # Try to parse as Python code
+    try:
+        tree = ast.parse(code_text)
+        for node in ast.walk(tree):
+            if isinstance(node, ast.FunctionDef):
+                elements.add(f"func:{node.name}")
+            elif isinstance(node, ast.AsyncFunctionDef):
+                elements.add(f"func:{node.name}")
+            elif isinstance(node, ast.ClassDef):
+                elements.add(f"class:{node.name}")
+    except SyntaxError:
+        # Python regex fallback for partial fragments
+        for m in re.finditer(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
+            elements.add(f"func:{m.group(1)}")
+        for m in re.finditer(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
+            elements.add(f"func:{m.group(1)}")
+        for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code_text, re.MULTILINE):
+            elements.add(f"class:{m.group(1)}")
+    except Exception as e:
+        logger.debug(f"Unexpected error parsing AST: {e}")
+
+    # Try regex patterns for other languages
+    # JavaScript/TypeScript functions
+    js_func_pattern = r'(?:function|const|let|var)\s+(\w+)\s*(?:=\s*)?(?:\([^)]*\)|\s*=>)'
+    for match in re.finditer(js_func_pattern, code_text):
+        elements.add(f"func:{match.group(1)}")
+
+    # Class definitions (multiple languages)
+    class_pattern = r'(?:class|interface|struct)\s+(\w+)'
+    for match in re.finditer(class_pattern, code_text):
+        elements.add(f"class:{match.group(1)}")
+
+    return elements
+
+def extract_concepts(text: str) -> List[str]:
+    """Extract development concepts from text."""
+    concepts = []
+    concept_patterns = {
+        'docker': r'\b(?:docker|container|compose|dockerfile)\b',
+        'testing': r'\b(?:test|testing|unittest|pytest|jest)\b',
+        'database': r'\b(?:database|sql|postgres|mysql|mongodb|qdrant)\b',
+        'api': r'\b(?:api|rest|graphql|endpoint)\b',
+        'security': r'\b(?:security|auth|authentication|encryption)\b',
+        'performance': r'\b(?:performance|optimization|cache|speed)\b',
+        'debugging': r'\b(?:debug|debugging|error|bug|trace)\b',
+        'deployment': r'\b(?:deploy|deployment|ci\/cd|production)\b',
+        'git': r'\b(?:git|commit|branch|merge|pull request)\b',
+        'mcp': r'\b(?:mcp|claude-self-reflect|claude code)\b',
+        'embeddings': r'\b(?:embedding|vector|semantic|similarity)\b',
+    }
+
+    text_lower = text.lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, text_lower, re.IGNORECASE):
+            if concept not in concepts:
+                concepts.append(concept)
+
+    return concepts[:MAX_CONCEPTS]
+
+def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, int]:
+    """Extract metadata in a single pass, return metadata, first timestamp, and message count."""
     metadata = {
         "files_analyzed": [],
         "files_edited": [],
         "tools_used": [],
-        "concepts": []
+        "concepts": [],
+        "ast_elements": [],
+        "has_code_blocks": False,
+        "total_messages": 0
     }
 
     first_timestamp = None
+    message_count = 0
+    all_text = []
 
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
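For orientation, a rough sketch of what the two new helpers above produce, assuming they are called directly; the expected values are inferred from the code shown, not from running it:

    code = """
    class QdrantStorage:
        def upsert(self, points):
            pass
    """
    extract_ast_elements(code)
    # should yield a set like {"class:QdrantStorage", "func:upsert"}

    extract_concepts("How do I debug the docker compose setup for Qdrant?")
    # should yield ["docker", "database", "debugging"], capped at MAX_CONCEPTS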
@@ -204,53 +314,107 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
                     if first_timestamp is None and 'timestamp' in data:
                         first_timestamp = data.get('timestamp')
 
-                    # Extract tool usage from messages
+                    # Count messages
                     if 'message' in data and data['message']:
                         msg = data['message']
+                        if msg.get('role') in ['user', 'assistant']:
+                            message_count += 1
+
                         if msg.get('content'):
                             content = msg['content']
+                            text_content = ""
+
                             if isinstance(content, list):
                                 for item in content:
-                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
-                                        tool_name = item.get('name', '')
-                                        if tool_name and tool_name not in metadata['tools_used']:
-                                            metadata['tools_used'].append(tool_name)
+                                    if isinstance(item, dict):
+                                        if item.get('type') == 'text':
+                                            text_content += item.get('text', '')
+                                            # Check for code blocks
+                                            if '```' in item.get('text', ''):
+                                                metadata['has_code_blocks'] = True
+                                                # Extract code for AST analysis with bounds checking
+                                                if len(metadata['ast_elements']) < 30:
+                                                    # Fix: More permissive regex to handle various fence formats
+                                                    code_blocks = re.findall(r'```[^\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
+                                                    for code_block in code_blocks[:5]:  # Limit to 5 blocks
+                                                        if len(metadata['ast_elements']) >= 30:
+                                                            break
+                                                        ast_elems = extract_ast_elements(code_block)
+                                                        for elem in list(ast_elems)[:10]:  # Limit elements per block
+                                                            if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < 30:
+                                                                metadata['ast_elements'].append(elem)
 
-                                    # Extract file references
-                                    if 'input' in item:
-                                        input_data = item['input']
-                                        if isinstance(input_data, dict):
-                                            if 'file_path' in input_data:
-                                                file_ref = input_data['file_path']
-                                                if file_ref not in metadata['files_analyzed']:
-                                                    metadata['files_analyzed'].append(file_ref)
-                                            if 'path' in input_data:
-                                                file_ref = input_data['path']
-                                                if file_ref not in metadata['files_analyzed']:
-                                                    metadata['files_analyzed'].append(file_ref)
+                                        elif item.get('type') == 'tool_use':
+                                            tool_name = item.get('name', '')
+                                            if tool_name and tool_name not in metadata['tools_used']:
+                                                metadata['tools_used'].append(tool_name)
+
+                                            # Extract file references
+                                            if 'input' in item:
+                                                input_data = item['input']
+                                                if isinstance(input_data, dict):
+                                                    # Determine if it's an edit tool
+                                                    is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
+
+                                                    if 'file_path' in input_data:
+                                                        file_ref = input_data['file_path']
+                                                        if is_edit:
+                                                            if file_ref not in metadata['files_edited']:
+                                                                metadata['files_edited'].append(file_ref)
+                                                        else:
+                                                            if file_ref not in metadata['files_analyzed']:
+                                                                metadata['files_analyzed'].append(file_ref)
+
+                                                    if 'path' in input_data:
+                                                        file_ref = input_data['path']
+                                                        if file_ref not in metadata['files_analyzed']:
+                                                            metadata['files_analyzed'].append(file_ref)
+                                    elif isinstance(item, str):
+                                        text_content += item
+                            elif isinstance(content, str):
+                                text_content = content
+
+                            # Collect text for concept extraction
+                            if text_content:
+                                all_text.append(text_content[:1000])  # Limit text per message
 
                 except json.JSONDecodeError:
                     continue
                 except Exception:
                     continue
-
+
     except Exception as e:
         logger.warning(f"Error extracting metadata: {e}")
 
-    return metadata, first_timestamp or datetime.now().isoformat()
+    # Extract concepts from collected text
+    if all_text:
+        combined_text = ' '.join(all_text[:50])  # Limit to first 50 messages
+        metadata['concepts'] = extract_concepts(combined_text)
+
+    # Set total messages
+    metadata['total_messages'] = message_count
+
+    # Limit arrays
+    metadata['files_analyzed'] = metadata['files_analyzed'][:20]
+    metadata['files_edited'] = metadata['files_edited'][:20]
+    metadata['tools_used'] = metadata['tools_used'][:15]
+    metadata['ast_elements'] = metadata['ast_elements'][:30]
+
+    return metadata, first_timestamp or datetime.now().isoformat(), message_count
 
 def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
     """Stream import a single JSONL file without loading it into memory."""
     logger.info(f"Streaming import of {jsonl_file.name}")
 
     # Extract metadata in first pass (lightweight)
-    metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
+    metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))
 
     # Stream messages and process in chunks
     chunk_buffer = []
     chunk_index = 0
     total_chunks = 0
     conversation_id = jsonl_file.stem
+    current_message_index = 0
 
     try:
         with open(jsonl_file, 'r', encoding='utf-8') as f:
@@ -282,16 +446,24 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
                         content = '\n'.join(text_parts)
 
                         if content:
+                            # Track message index for user/assistant messages
+                            if msg['role'] in ['user', 'assistant']:
+                                current_message_index += 1
+                                message_idx = current_message_index
+                            else:
+                                message_idx = 0
+
                             chunk_buffer.append({
                                 'role': msg['role'],
-                                'content': content
+                                'content': content,
+                                'message_index': message_idx
                             })
 
                             # Process chunk when buffer reaches MAX_CHUNK_SIZE
                             if len(chunk_buffer) >= MAX_CHUNK_SIZE:
                                 chunks = process_and_upload_chunk(
                                     chunk_buffer, chunk_index, conversation_id,
-                                    created_at, metadata, collection_name, project_path
+                                    created_at, metadata, collection_name, project_path, total_messages
                                 )
                                 total_chunks += chunks
                                 chunk_buffer = []
@@ -313,7 +485,7 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
        if chunk_buffer:
            chunks = process_and_upload_chunk(
                chunk_buffer, chunk_index, conversation_id,
-                created_at, metadata, collection_name, project_path
+                created_at, metadata, collection_name, project_path, total_messages
            )
            total_chunks += chunks
 
@@ -335,10 +507,19 @@ def load_state() -> dict:
     return {"imported_files": {}}
 
 def save_state(state: dict):
-    """Save import state."""
-    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-    with open(STATE_FILE, 'w') as f:
+    """Save import state with atomic write."""
+    # Fix: Handle case where STATE_FILE has no directory component
+    state_dir = os.path.dirname(STATE_FILE)
+    if state_dir:
+        os.makedirs(state_dir, exist_ok=True)
+
+    # Use atomic write to prevent corruption during crashes
+    temp_file = f"{STATE_FILE}.tmp"
+    with open(temp_file, 'w') as f:
        json.dump(state, f, indent=2)
+
+    # Atomic rename (on POSIX systems)
+    os.replace(temp_file, STATE_FILE)
 
 def should_import_file(file_path: Path, state: dict) -> bool:
     """Check if file should be imported."""
package/scripts/importer/__init__.py
@@ -0,0 +1,25 @@
+"""
+Claude Self-Reflect Modular Import System
+==========================================
+
+A pristine, modular conversation import system following SOLID principles
+and clean architecture patterns.
+
+Version: 3.0.0
+Author: Claude Self-Reflect Team
+License: MIT
+"""
+
+from .core.config import ImportConfig
+from .core.models import Message, ConversationChunk, ProcessedPoint
+from .main import ConversationProcessor, ImporterContainer
+
+__version__ = "3.0.0"
+__all__ = [
+    "ImportConfig",
+    "Message",
+    "ConversationChunk",
+    "ProcessedPoint",
+    "ConversationProcessor",
+    "ImporterContainer"
+]
package/scripts/importer/__main__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+"""Entry point for running the importer as a module."""
+
+import sys
+import logging
+from pathlib import Path
+
+# Add parent directory to path for standalone execution
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from importer.main import main
+
+if __name__ == "__main__":
+    sys.exit(main())
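The entry point above makes the new package runnable as a module. A minimal sketch of both invocation styles, assuming the working directory is package/scripts/ so that `importer` is importable:

    # from a shell, run inside package/scripts/:
    #     python -m importer
    # or call the same entry point from Python (mirrors what __main__.py does):
    import sys
    from importer.main import main  # main() lives in importer/main.py, not shown in this diff

    sys.exit(main())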
package/scripts/importer/core/__init__.py
@@ -0,0 +1,25 @@
+"""Core domain models and configuration."""
+
+from .config import ImportConfig
+from .models import Message, ConversationChunk, ProcessedPoint, ImportResult, ImportStats
+from .exceptions import (
+    ImportError,
+    ValidationError,
+    EmbeddingError,
+    StorageError,
+    ParseError
+)
+
+__all__ = [
+    "ImportConfig",
+    "Message",
+    "ConversationChunk",
+    "ProcessedPoint",
+    "ImportResult",
+    "ImportStats",
+    "ImportError",
+    "ValidationError",
+    "EmbeddingError",
+    "StorageError",
+    "ParseError"
+]
package/scripts/importer/core/config.py
@@ -0,0 +1,120 @@
+"""Immutable configuration with validation."""
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import os
+
+
+@dataclass(frozen=True)
+class ImportConfig:
+    """
+    Immutable configuration for the import system.
+
+    All validation happens in __post_init__ to ensure configuration
+    is always in a valid state.
+    """
+
+    # Qdrant settings
+    qdrant_url: str = field(default="http://localhost:6333")
+    qdrant_api_key: Optional[str] = field(default=None)
+
+    # Embedding settings
+    embedding_model: str = field(default="sentence-transformers/all-MiniLM-L6-v2")
+    embedding_dimension: int = field(default=384)
+    use_voyage: bool = field(default=False)
+    voyage_api_key: Optional[str] = field(default=None)
+
+    # Chunking settings
+    chunk_size: int = field(default=3000)
+    chunk_overlap: int = field(default=200)
+
+    # Processing settings
+    batch_size: int = field(default=10)
+    max_ast_elements: int = field(default=100)
+    max_workers: int = field(default=4)
+
+    # State management
+    state_file: str = field(default="~/.claude-self-reflect/config/imported-files.json")
+
+    # Operational settings
+    log_level: str = field(default="INFO")
+    dry_run: bool = field(default=False)
+    force_reimport: bool = field(default=False)
+
+    # Limits
+    file_limit: Optional[int] = field(default=None)
+
+    def __post_init__(self):
+        """Validate configuration on initialization."""
+        # Validate chunk settings
+        if self.chunk_size <= 0:
+            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
+
+        if self.chunk_overlap < 0:
+            raise ValueError(f"chunk_overlap cannot be negative, got {self.chunk_overlap}")
+
+        if self.chunk_overlap >= self.chunk_size:
+            raise ValueError(
+                f"chunk_overlap ({self.chunk_overlap}) must be less than "
+                f"chunk_size ({self.chunk_size})"
+            )
+
+        # Validate batch settings
+        if self.batch_size < 1:
+            raise ValueError(f"batch_size must be at least 1, got {self.batch_size}")
+
+        if self.max_workers < 1:
+            raise ValueError(f"max_workers must be at least 1, got {self.max_workers}")
+
+        # Validate embedding settings
+        if self.embedding_dimension <= 0:
+            raise ValueError(f"embedding_dimension must be positive, got {self.embedding_dimension}")
+
+        if self.use_voyage and not self.voyage_api_key:
+            # Document the limitation of frozen dataclass
+            voyage_key = os.getenv("VOYAGE_KEY")
+            if not voyage_key:
+                raise ValueError(
+                    "voyage_api_key must be provided at initialization when use_voyage=True. "
+                    "Set VOYAGE_KEY environment variable before creating config."
+                )
+
+        # Validate log level
+        valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
+        if self.log_level.upper() not in valid_levels:
+            raise ValueError(f"log_level must be one of {valid_levels}, got {self.log_level}")
+
+    @property
+    def state_file_path(self) -> Path:
+        """Get expanded state file path with fallback."""
+        try:
+            return Path(self.state_file).expanduser()
+        except (RuntimeError, OSError):
+            # Fallback to current directory if expansion fails
+            return Path.cwd() / ".import-state.json"
+
+    @classmethod
+    def from_env(cls) -> "ImportConfig":
+        """Create configuration from environment variables."""
+        return cls(
+            qdrant_url=os.getenv("QDRANT_URL", "http://localhost:6333"),
+            qdrant_api_key=os.getenv("QDRANT_API_KEY"),
+            use_voyage=os.getenv("USE_VOYAGE", "false").lower() == "true",
+            voyage_api_key=os.getenv("VOYAGE_KEY"),
+            chunk_size=int(os.getenv("CHUNK_SIZE", "3000")),
+            chunk_overlap=int(os.getenv("CHUNK_OVERLAP", "200")),
+            batch_size=int(os.getenv("BATCH_SIZE", "10")),
+            max_workers=int(os.getenv("MAX_WORKERS", "4")),
+            log_level=os.getenv("LOG_LEVEL", "INFO"),
+            dry_run=os.getenv("DRY_RUN", "false").lower() == "true",
+            force_reimport=os.getenv("FORCE_REIMPORT", "false").lower() == "true"
+        )
+
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> "ImportConfig":
+        """Create configuration from dictionary."""
+        # Filter out any unknown keys
+        known_fields = {f.name for f in cls.__dataclass_fields__.values()}
+        filtered_dict = {k: v for k, v in config_dict.items() if k in known_fields}
+        return cls(**filtered_dict)
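A short usage sketch for ImportConfig above; the import path assumes package/scripts/ is on sys.path, and the values are illustrative:

    import os
    from importer.core.config import ImportConfig

    os.environ.setdefault("QDRANT_URL", "http://localhost:6333")
    config = ImportConfig.from_env()   # reads QDRANT_URL, VOYAGE_KEY, CHUNK_SIZE, ...
    print(config.state_file_path)      # expanded ~/.claude-self-reflect/config/imported-files.json

    # Validation runs in __post_init__, so bad settings fail at construction time:
    try:
        ImportConfig(chunk_size=100, chunk_overlap=200)
    except ValueError as exc:
        print(f"rejected: {exc}")      # chunk_overlap must be less than chunk_size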
package/scripts/importer/core/exceptions.py
@@ -0,0 +1,52 @@
+"""Custom exception hierarchy for import system."""
+
+from typing import Optional, Any
+
+
+class ImportError(Exception):
+    """Base exception for all import-related errors."""
+
+    def __init__(self, message: str, details: Optional[dict] = None):
+        super().__init__(message)
+        self.details = details or {}
+
+
+class ValidationError(ImportError):
+    """Raised when input validation fails."""
+
+    def __init__(self, field: str, value: Any, reason: str):
+        super().__init__(f"Validation failed for {field}: {reason}")
+        self.field = field
+        self.value = value
+        self.reason = reason
+
+
+class EmbeddingError(ImportError):
+    """Raised when embedding generation or validation fails."""
+
+    def __init__(self, message: str, provider: Optional[str] = None):
+        super().__init__(message)
+        self.provider = provider
+
+
+class StorageError(ImportError):
+    """Raised when storage operations fail."""
+
+    def __init__(self, operation: str, collection: str, reason: str):
+        super().__init__(f"Storage {operation} failed for {collection}: {reason}")
+        self.operation = operation
+        self.collection = collection
+
+
+class ParseError(ImportError):
+    """Raised when parsing conversation files fails."""
+
+    def __init__(self, file_path: str, line_number: Optional[int] = None, reason: str = ""):
+        message = f"Failed to parse {file_path}"
+        if line_number:
+            message += f" at line {line_number}"
+        if reason:
+            message += f": {reason}"
+        super().__init__(message)
+        self.file_path = file_path
+        self.line_number = line_number
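A brief sketch of how the exception hierarchy above is meant to be caught; the file path and line number are made up. Note that this ImportError shares its name with the Python builtin, so importing it shadows the builtin in the importing module:

    from importer.core.exceptions import ImportError, ParseError  # shadows the builtin ImportError here

    try:
        raise ParseError("conversations/session.jsonl", line_number=42, reason="invalid JSON")
    except ImportError as exc:  # ParseError, ValidationError, EmbeddingError, StorageError all derive from this base
        print(exc)              # Failed to parse conversations/session.jsonl at line 42: invalid JSON
        print(exc.details)      # {} by default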