claude-self-reflect 3.0.0 → 3.0.1

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -0,0 +1,194 @@
+ """State management with atomic writes."""
+
+ import json
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Dict, Any, Optional, Set
+ from datetime import datetime
+ import tempfile
+
+ try:
+     import fcntl
+ except ImportError:  # fcntl is POSIX-only; locking degrades to a no-op on Windows
+     fcntl = None
+
+ logger = logging.getLogger(__name__)
+
+
+ class StateManager:
+     """
+     Manage import state with atomic writes and crash recovery.
+
+     Implements the critical STATE_FILE fixes:
+     1. Handle the case where STATE_FILE has no directory component
+     2. Atomic writes to prevent corruption
+     3. File locking for concurrent access (POSIX only)
+     """
+
+     def __init__(self, state_file: Path):
+         self.state_file = Path(state_file)
+         self.state: Dict[str, Any] = self._load_state()
+         self._lock_file: Optional[Any] = None
+
+     def _load_state(self) -> Dict[str, Any]:
+         """Load state from file or create new."""
+         if self.state_file.exists():
+             try:
+                 with open(self.state_file, 'r') as f:
+                     state = json.load(f)
+                 logger.debug(f"Loaded state with {len(state.get('processed', []))} processed files")
+                 return state
+             except Exception as e:
+                 logger.error(f"Failed to load state: {e}")
+                 # Create a backup of the corrupted state for later inspection
+                 backup = self.state_file.with_suffix('.corrupt')
+                 try:
+                     self.state_file.rename(backup)
+                     logger.info(f"Backed up corrupted state to {backup}")
+                 except OSError:
+                     pass
+
+         # Return default state
+         return {
+             "processed": {},
+             "failed": {},
+             "last_updated": None,
+             "version": "3.0.0"
+         }
+
+     def save_state(self) -> None:
+         """
+         Save state with an atomic write.
+
+         Implements the critical fix for STATE_FILE directory handling.
+         """
+         try:
+             # FIX: Handle case where STATE_FILE has no directory component
+             state_dir = self.state_file.parent
+             if state_dir and state_dir != Path('.'):
+                 state_dir.mkdir(parents=True, exist_ok=True)
+
+             # Update timestamp
+             self.state["last_updated"] = datetime.now().isoformat()
+
+             # Use an atomic write to prevent corruption during crashes:
+             # create the temp file in the same directory so the rename is atomic
+             temp_fd, temp_path = tempfile.mkstemp(
+                 dir=str(state_dir) if state_dir else '.',
+                 prefix='.tmp_state_',
+                 suffix='.json'
+             )
+
+             try:
+                 # Write to temp file
+                 with os.fdopen(temp_fd, 'w') as f:
+                     json.dump(self.state, f, indent=2)
+
+                 # Atomic rename (on POSIX systems)
+                 Path(temp_path).replace(self.state_file)
+
+                 logger.debug("State saved successfully")
+
+             except Exception:
+                 # Clean up the temp file, then re-raise with the traceback intact
+                 try:
+                     os.unlink(temp_path)
+                 except OSError:
+                     pass
+                 raise
+
+         except Exception as e:
+             logger.error(f"Failed to save state: {e}")
+             raise
+
+     def is_processed(self, file_path: Path) -> bool:
+         """Check if a file has been processed."""
+         return str(file_path) in self.state.get("processed", {})
+
+     def mark_processed(self, file_path: Path, points_created: int) -> None:
+         """Mark a file as processed."""
+         if "processed" not in self.state:
+             self.state["processed"] = {}
+
+         self.state["processed"][str(file_path)] = {
+             "timestamp": datetime.now().isoformat(),
+             "points_created": points_created
+         }
+
+         # Remove from failed if present
+         if str(file_path) in self.state.get("failed", {}):
+             del self.state["failed"][str(file_path)]
+
+         self.save_state()
+
+     def mark_failed(self, file_path: Path, error: str) -> None:
+         """Mark a file as failed."""
+         if "failed" not in self.state:
+             self.state["failed"] = {}
+
+         self.state["failed"][str(file_path)] = {
+             "timestamp": datetime.now().isoformat(),
+             "error": error
+         }
+
+         self.save_state()
+
+     def get_processed_files(self) -> Set[str]:
+         """Get the set of processed file paths."""
+         return set(self.state.get("processed", {}).keys())
+
+     def get_failed_files(self) -> Set[str]:
+         """Get the set of failed file paths."""
+         return set(self.state.get("failed", {}).keys())
+
+     def reset(self) -> None:
+         """Reset state to empty."""
+         self.state = {
+             "processed": {},
+             "failed": {},
+             "last_updated": datetime.now().isoformat(),
+             "version": "3.0.0"
+         }
+         self.save_state()
+
+     def get_statistics(self) -> Dict[str, Any]:
+         """Get import statistics."""
+         processed = self.state.get("processed", {})
+         failed = self.state.get("failed", {})
+
+         total_points = sum(
+             info.get("points_created", 0)
+             for info in processed.values()
+         )
+
+         return {
+             "processed_count": len(processed),
+             "failed_count": len(failed),
+             "total_points": total_points,
+             "last_updated": self.state.get("last_updated"),
+             "version": self.state.get("version")
+         }
+
+     def acquire_lock(self) -> bool:
+         """Acquire an exclusive lock on the state file (POSIX only)."""
+         if fcntl is None:  # Windows
+             logger.warning("File locking not implemented for Windows")
+             return True  # Allow the operation to proceed without locking on Windows
+         try:
+             lock_path = self.state_file.with_suffix('.lock')
+             self._lock_file = open(lock_path, 'w')
+             fcntl.flock(self._lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+             return True
+         except (IOError, OSError):
+             return False
+
+     def release_lock(self) -> None:
+         """Release the state file lock."""
+         if self._lock_file:
+             try:
+                 fcntl.flock(self._lock_file.fileno(), fcntl.LOCK_UN)
+                 self._lock_file.close()
+             except OSError:
+                 pass
+             self._lock_file = None
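Taken together, the new StateManager is small enough to exercise end to end. A minimal usage sketch follows; the import path claude_self_reflect.state is a guess for illustration, since the diff does not show the package layout:

    from pathlib import Path

    from claude_self_reflect.state import StateManager  # hypothetical import path

    manager = StateManager(Path("import_state.json"))
    if manager.acquire_lock():  # returns True without locking on Windows
        try:
            conversation = Path("conversations/session-001.jsonl")
            if not manager.is_processed(conversation):
                # points_created would come from the real import step
                manager.mark_processed(conversation, points_created=42)
            print(manager.get_statistics())
        finally:
            manager.release_lock()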
@@ -0,0 +1,5 @@
+ """Storage backend for vector database operations."""
+
+ from .qdrant_storage import QdrantStorage
+
+ __all__ = ["QdrantStorage"]
@@ -0,0 +1,250 @@
+ """Qdrant vector database storage implementation."""
+
+ import logging
+ from typing import List, Optional, Dict, Any
+ from qdrant_client import QdrantClient
+ from qdrant_client.models import (
+     Distance,
+     VectorParams,
+     PointStruct,
+     CollectionInfo
+ )
+
+ from ..core import ProcessedPoint
+ from ..core.exceptions import StorageError
+
+ logger = logging.getLogger(__name__)
+
+
+ class QdrantStorage:
+     """
+     Qdrant storage backend implementation.
+
+     Handles all interactions with the Qdrant vector database.
+     """
+
+     def __init__(self, url: str = "http://localhost:6333", api_key: Optional[str] = None):
+         self.url = url
+         self.api_key = api_key
+         self.client: Optional[QdrantClient] = None
+         self._initialized = False
+
+     def initialize(self) -> None:
+         """Initialize the connection to Qdrant."""
+         try:
+             self.client = QdrantClient(
+                 url=self.url,
+                 api_key=self.api_key,
+                 timeout=30
+             )
+
+             # Test the connection
+             self.client.get_collections()
+             self._initialized = True
+             logger.info(f"Connected to Qdrant at {self.url}")
+
+         except Exception as e:
+             raise StorageError(
+                 operation="initialize",
+                 collection="N/A",
+                 reason=f"Failed to connect to Qdrant: {e}"
+             )
+
+     def create_collection(self, name: str, dimension: int) -> bool:
+         """
+         Create a new collection if it doesn't exist.
+
+         Args:
+             name: Collection name
+             dimension: Vector dimension
+
+         Returns:
+             True if created, False if it already exists
+         """
+         if not self._initialized:
+             raise StorageError(
+                 operation="create_collection",
+                 collection=name,
+                 reason="Storage not initialized"
+             )
+
+         try:
+             if self.collection_exists(name):
+                 logger.debug(f"Collection {name} already exists")
+                 return False
+
+             self.client.create_collection(
+                 collection_name=name,
+                 vectors_config=VectorParams(
+                     size=dimension,
+                     distance=Distance.COSINE
+                 )
+             )
+
+             logger.info(f"Created collection {name} with dimension {dimension}")
+             return True
+
+         except Exception as e:
+             raise StorageError(
+                 operation="create_collection",
+                 collection=name,
+                 reason=str(e)
+             )
+
+     def collection_exists(self, name: str) -> bool:
+         """Check if a collection exists."""
+         if not self._initialized:
+             raise StorageError(
+                 operation="collection_exists",
+                 collection=name,
+                 reason="Storage not initialized"
+             )
+
+         try:
+             collections = self.client.get_collections().collections
+             return any(c.name == name for c in collections)
+         except Exception as e:
+             logger.error(f"Failed to check collection existence: {e}")
+             return False
+
+     def upsert_points(self, collection: str, points: List[ProcessedPoint]) -> int:
+         """
+         Insert or update points in a collection.
+
+         Args:
+             collection: Collection name
+             points: List of points to upsert
+
+         Returns:
+             Number of points upserted
+         """
+         if not self._initialized:
+             raise StorageError(
+                 operation="upsert_points",
+                 collection=collection,
+                 reason="Storage not initialized"
+             )
+
+         if not points:
+             return 0
+
+         try:
+             # Ensure the collection exists with the correct dimension
+             if not self.collection_exists(collection):
+                 dimension = len(points[0].vector)
+                 self.create_collection(collection, dimension)
+
+             # Convert to Qdrant points
+             qdrant_points = [
+                 PointStruct(
+                     id=self._generate_point_id(point.id),
+                     vector=point.vector,
+                     payload=point.payload
+                 )
+                 for point in points
+             ]
+
+             # Batch upsert
+             self.client.upsert(
+                 collection_name=collection,
+                 points=qdrant_points
+             )
+
+             logger.debug(f"Upserted {len(points)} points to {collection}")
+             return len(points)
+
+         except Exception as e:
+             raise StorageError(
+                 operation="upsert_points",
+                 collection=collection,
+                 reason=str(e)
+             )
+
+     def get_collection_info(self, name: str) -> Optional[CollectionInfo]:
+         """Get information about a collection."""
+         if not self._initialized:
+             return None
+
+         try:
+             return self.client.get_collection(collection_name=name)
+         except Exception as e:
+             logger.error(f"Failed to get collection info: {e}")
+             return None
+
+     def delete_collection(self, name: str) -> bool:
+         """Delete a collection."""
+         if not self._initialized:
+             raise StorageError(
+                 operation="delete_collection",
+                 collection=name,
+                 reason="Storage not initialized"
+             )
+
+         try:
+             if not self.collection_exists(name):
+                 return False
+
+             self.client.delete_collection(collection_name=name)
+             logger.info(f"Deleted collection {name}")
+             return True
+
+         except Exception as e:
+             raise StorageError(
+                 operation="delete_collection",
+                 collection=name,
+                 reason=str(e)
+             )
+
+     def get_collections(self) -> List[str]:
+         """Get a list of all collection names."""
+         if not self._initialized:
+             return []
+
+         try:
+             collections = self.client.get_collections().collections
+             return [c.name for c in collections]
+         except Exception as e:
+             logger.error(f"Failed to get collections: {e}")
+             return []
+
+     def count_points(self, collection: str) -> int:
+         """Count points in a collection."""
+         if not self._initialized:
+             return 0
+
+         try:
+             info = self.get_collection_info(collection)
+             return info.points_count if info else 0
+         except Exception as e:
+             logger.error(f"Failed to count points: {e}")
+             return 0
+
+     def _generate_point_id(self, string_id: str) -> str:
+         """
+         Generate a valid point ID from a string.
+
+         Qdrant accepts unsigned integers and UUID strings as point IDs;
+         arbitrary strings are rejected, so callers should supply UUIDs here.
+         """
+         # Pass UUID strings through unchanged
+         # If needed, can hash instead: int(hashlib.md5(string_id.encode()).hexdigest()[:16], 16)
+         return string_id
+
+     def health_check(self) -> Dict[str, Any]:
+         """Check the health of the Qdrant connection."""
+         if not self._initialized:
+             return {"healthy": False, "reason": "Not initialized"}
+
+         try:
+             collections = self.client.get_collections()
+             return {
+                 "healthy": True,
+                 "url": self.url,
+                 "collections_count": len(collections.collections),
+                 "total_points": sum(
+                     self.count_points(c.name)
+                     for c in collections.collections
+                 )
+             }
+         except Exception as e:
+             return {"healthy": False, "reason": str(e)}
@@ -0,0 +1,9 @@
+ """Utility functions for the import system."""
+
+ from .project_normalizer import ProjectNormalizer
+ from .logger import setup_logging
+
+ __all__ = [
+     "ProjectNormalizer",
+     "setup_logging"
+ ]
@@ -0,0 +1,85 @@
+ """Centralized logging configuration."""
+
+ import logging
+ import sys
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Optional
+
+
+ def setup_logging(
+     level: str = "INFO",
+     log_file: Optional[str] = None,
+     format_string: Optional[str] = None
+ ) -> logging.Logger:
+     """
+     Set up logging configuration.
+
+     Args:
+         level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+         log_file: Optional log file path
+         format_string: Optional custom format string
+
+     Returns:
+         Configured root logger
+     """
+     # Default format
+     if not format_string:
+         format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+     # Configure the root logger directly and clear any existing handlers
+     # so that repeated calls do not attach duplicates
+     logger = logging.getLogger()
+     logger.setLevel(getattr(logging, level.upper()))
+     logger.handlers.clear()
+
+     # Console handler
+     console_handler = logging.StreamHandler(sys.stdout)
+     console_handler.setFormatter(logging.Formatter(format_string))
+     logger.addHandler(console_handler)
+
+     # File handler if specified
+     if log_file:
+         log_path = Path(log_file)
+         log_path.parent.mkdir(parents=True, exist_ok=True)
+
+         file_handler = logging.FileHandler(log_path)
+         file_handler.setFormatter(logging.Formatter(format_string))
+         logger.addHandler(file_handler)
+
+     return logger
+
+
+ class ProgressLogger:
+     """Logger for tracking import progress."""
+
+     def __init__(self, total: int, logger: Optional[logging.Logger] = None):
+         self.total = total
+         self.current = 0
+         self.logger = logger or logging.getLogger(__name__)
+         self.start_time = datetime.now()
+
+     def update(self, increment: int = 1, message: Optional[str] = None) -> None:
+         """Update progress."""
+         self.current += increment
+         percentage = (self.current / self.total * 100) if self.total > 0 else 0
+
+         elapsed = (datetime.now() - self.start_time).total_seconds()
+         rate = self.current / elapsed if elapsed > 0 else 0
+         eta = (self.total - self.current) / rate if rate > 0 else 0
+
+         log_message = f"Progress: {self.current}/{self.total} ({percentage:.1f}%)"
+         if message:
+             log_message += f" - {message}"
+         log_message += f" - Rate: {rate:.1f}/s - ETA: {eta:.0f}s"
+
+         self.logger.info(log_message)
+
+     def complete(self) -> None:
+         """Mark as complete."""
+         elapsed = (datetime.now() - self.start_time).total_seconds()
+         rate = self.total / elapsed if elapsed > 0 else 0.0  # Guard against a zero elapsed time
+         self.logger.info(
+             f"Completed {self.total} items in {elapsed:.1f}s "
+             f"({rate:.1f} items/s)"
+         )
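A brief sketch of how these two utilities compose; the flat utils.logger import path is a guess for illustration, and ProgressLogger is imported from the module directly since the package's __init__.py above re-exports only setup_logging:

    import logging
    from utils.logger import setup_logging, ProgressLogger  # hypothetical import path

    setup_logging(level="INFO", log_file="logs/import.log")

    progress = ProgressLogger(total=200, logger=logging.getLogger("import"))
    for batch_end in range(50, 201, 50):
        # ... process a batch of 50 files here ...
        progress.update(increment=50, message="batch flushed")
    progress.complete()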
@@ -0,0 +1,120 @@
+ """Project name normalization utilities."""
+
+ import hashlib
+ import logging
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+
+ class ProjectNormalizer:
+     """
+     Normalize project names and generate collection names.
+
+     This is the CRITICAL component that was broken before.
+     Must correctly extract project names from Claude's dash-separated format.
+     """
+
+     @staticmethod
+     def normalize_project_name(project_path: str) -> str:
+         """
+         Normalize a project path to a consistent project name.
+
+         CRITICAL: This must match the implementation in utils.py
+
+         Examples:
+             - "-Users-name-projects-claude-self-reflect" -> "claude-self-reflect"
+             - "claude-self-reflect" -> "claude-self-reflect"
+             - "/path/to/-Users-name-projects-myapp" -> "myapp"
+         """
+         # Get the final component of the path
+         if '/' in project_path:
+             final_component = project_path.split('/')[-1]
+         else:
+             final_component = project_path
+
+         # Handle Claude's dash-separated format
+         if final_component.startswith('-') and 'projects' in final_component:
+             # Find the last occurrence of 'projects-'
+             idx = final_component.rfind('projects-')
+             if idx != -1:
+                 # Extract everything after 'projects-'
+                 project_name = final_component[idx + len('projects-'):]
+                 logger.debug(f"Normalized '{project_path}' to '{project_name}'")
+                 return project_name
+
+         # Already normalized or in a different format
+         logger.debug(f"Project path '{project_path}' already normalized")
+         return final_component
+
+     def get_project_name(self, file_path: Path) -> str:
+         """
+         Extract the project name from a file path.
+
+         Args:
+             file_path: Path to a conversation file
+
+         Returns:
+             Normalized project name
+         """
+         # Get the parent directory name
+         parent_name = file_path.parent.name
+
+         # Normalize it
+         return self.normalize_project_name(parent_name)
+
+     def get_collection_name(self, file_path: Path) -> str:
+         """
+         Generate a collection name for a file.
+
+         Format: conv_HASH_local
+         where HASH is the first 8 chars of the MD5 hash of the normalized project name.
+         """
+         project_name = self.get_project_name(file_path)
+
+         # Generate hash
+         project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
+
+         # Generate collection name
+         collection_name = f"conv_{project_hash}_local"
+
+         logger.debug(f"Collection for project '{project_name}': {collection_name}")
+         return collection_name
+
+     @staticmethod
+     def validate_normalization() -> bool:
+         """
+         Self-test to ensure normalization is working correctly.
+
+         Returns:
+             True if all tests pass
+         """
+         test_cases = [
+             ("-Users-name-projects-claude-self-reflect", "claude-self-reflect", "7f6df0fc"),
+             ("claude-self-reflect", "claude-self-reflect", "7f6df0fc"),
+             ("/Users/name/.claude/projects/-Users-name-projects-myapp", "myapp", None),
+             ("-Users-name-projects-procsolve-website", "procsolve-website", "9f2f312b")
+         ]
+
+         normalizer = ProjectNormalizer()
+         all_passed = True
+
+         for input_path, expected_name, expected_hash in test_cases:
+             normalized = normalizer.normalize_project_name(input_path)
+             if normalized != expected_name:
+                 logger.error(
+                     f"Normalization failed: '{input_path}' -> '{normalized}' "
+                     f"(expected '{expected_name}')"
+                 )
+                 all_passed = False
+
+             if expected_hash:
+                 actual_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
+                 if actual_hash != expected_hash:
+                     logger.error(
+                         f"Hash mismatch for '{normalized}': "
+                         f"{actual_hash} != {expected_hash}"
+                     )
+                     all_passed = False
+
+         return all_passed
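Finally, a sketch of the normalizer in use, with a hypothetical conversation path in Claude's dash-separated layout; the hash prefix in the printed collection name depends on the MD5 of the normalized project name:

    from pathlib import Path

    normalizer = ProjectNormalizer()

    assert normalizer.normalize_project_name(
        "-Users-name-projects-claude-self-reflect"
    ) == "claude-self-reflect"

    collection = normalizer.get_collection_name(
        Path("/Users/name/.claude/projects/-Users-name-projects-myapp/session.jsonl")
    )
    print(collection)  # conv_<md5-prefix>_local

    # Run the built-in self-test; logs details for any case that fails
    print(ProjectNormalizer.validate_normalization())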