claude-self-reflect 3.0.0 → 3.0.1
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- package/mcp-server/pyproject.toml +1 -0
- package/package.json +2 -1
- package/scripts/importer/__init__.py +25 -0
- package/scripts/importer/__main__.py +14 -0
- package/scripts/importer/core/__init__.py +25 -0
- package/scripts/importer/core/config.py +120 -0
- package/scripts/importer/core/exceptions.py +52 -0
- package/scripts/importer/core/models.py +184 -0
- package/scripts/importer/embeddings/__init__.py +22 -0
- package/scripts/importer/embeddings/base.py +141 -0
- package/scripts/importer/embeddings/fastembed_provider.py +164 -0
- package/scripts/importer/embeddings/validator.py +136 -0
- package/scripts/importer/embeddings/voyage_provider.py +251 -0
- package/scripts/importer/main.py +393 -0
- package/scripts/importer/processors/__init__.py +15 -0
- package/scripts/importer/processors/ast_extractor.py +197 -0
- package/scripts/importer/processors/chunker.py +157 -0
- package/scripts/importer/processors/concept_extractor.py +109 -0
- package/scripts/importer/processors/conversation_parser.py +181 -0
- package/scripts/importer/processors/tool_extractor.py +165 -0
- package/scripts/importer/state/__init__.py +5 -0
- package/scripts/importer/state/state_manager.py +190 -0
- package/scripts/importer/storage/__init__.py +5 -0
- package/scripts/importer/storage/qdrant_storage.py +250 -0
- package/scripts/importer/utils/__init__.py +9 -0
- package/scripts/importer/utils/logger.py +87 -0
- package/scripts/importer/utils/project_normalizer.py +120 -0
--- /dev/null
+++ b/package/scripts/importer/state/state_manager.py
@@ -0,0 +1,190 @@
+"""State management with atomic writes."""
+
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional, Set
+from datetime import datetime
+import fcntl
+import tempfile
+
+logger = logging.getLogger(__name__)
+
+
+class StateManager:
+    """
+    Manage import state with atomic writes and crash recovery.
+
+    Implements the critical STATE_FILE fixes:
+    1. Handle empty dirname case
+    2. Atomic writes to prevent corruption
+    3. File locking for concurrent access
+    """
+
+    def __init__(self, state_file: Path):
+        self.state_file = Path(state_file)
+        self.state: Dict[str, Any] = self._load_state()
+        self._lock_file = None
+
+    def _load_state(self) -> Dict[str, Any]:
+        """Load state from file or create new."""
+        if self.state_file.exists():
+            try:
+                with open(self.state_file, 'r') as f:
+                    state = json.load(f)
+                logger.debug(f"Loaded state with {len(state.get('processed', []))} processed files")
+                return state
+            except Exception as e:
+                logger.error(f"Failed to load state: {e}")
+                # Create backup of corrupted state
+                backup = self.state_file.with_suffix('.corrupt')
+                try:
+                    self.state_file.rename(backup)
+                    logger.info(f"Backed up corrupted state to {backup}")
+                except Exception:
+                    pass
+
+        # Return default state
+        return {
+            "processed": {},
+            "failed": {},
+            "last_updated": None,
+            "version": "3.0.0"
+        }
+
+    def save_state(self) -> None:
+        """
+        Save state with atomic write.
+
+        Implements the critical fix for STATE_FILE directory handling.
+        """
+        try:
+            # FIX: Handle case where STATE_FILE has no directory component
+            state_dir = self.state_file.parent
+            if state_dir and state_dir != Path('.'):
+                state_dir.mkdir(parents=True, exist_ok=True)
+
+            # Update timestamp
+            self.state["last_updated"] = datetime.now().isoformat()
+
+            # Use atomic write to prevent corruption during crashes
+            # Create temp file in same directory for atomic rename
+            temp_fd, temp_path = tempfile.mkstemp(
+                dir=str(state_dir) if state_dir else '.',
+                prefix='.tmp_state_',
+                suffix='.json'
+            )
+
+            try:
+                # Write to temp file
+                with os.fdopen(temp_fd, 'w') as f:
+                    json.dump(self.state, f, indent=2)
+
+                # Atomic rename (on POSIX systems)
+                Path(temp_path).replace(self.state_file)
+
+                logger.debug("State saved successfully")
+
+            except Exception as e:
+                # Clean up temp file on error
+                try:
+                    os.unlink(temp_path)
+                except:
+                    pass
+                raise e
+
+        except Exception as e:
+            logger.error(f"Failed to save state: {e}")
+            raise
+
+    def is_processed(self, file_path: Path) -> bool:
+        """Check if file has been processed."""
+        return str(file_path) in self.state.get("processed", {})
+
+    def mark_processed(self, file_path: Path, points_created: int) -> None:
+        """Mark file as processed."""
+        if "processed" not in self.state:
+            self.state["processed"] = {}
+
+        self.state["processed"][str(file_path)] = {
+            "timestamp": datetime.now().isoformat(),
+            "points_created": points_created
+        }
+
+        # Remove from failed if present
+        if str(file_path) in self.state.get("failed", {}):
+            del self.state["failed"][str(file_path)]
+
+        self.save_state()
+
+    def mark_failed(self, file_path: Path, error: str) -> None:
+        """Mark file as failed."""
+        if "failed" not in self.state:
+            self.state["failed"] = {}
+
+        self.state["failed"][str(file_path)] = {
+            "timestamp": datetime.now().isoformat(),
+            "error": error
+        }
+
+        self.save_state()
+
+    def get_processed_files(self) -> Set[str]:
+        """Get set of processed file paths."""
+        return set(self.state.get("processed", {}).keys())
+
+    def get_failed_files(self) -> Set[str]:
+        """Get set of failed file paths."""
+        return set(self.state.get("failed", {}).keys())
+
+    def reset(self) -> None:
+        """Reset state to empty."""
+        self.state = {
+            "processed": {},
+            "failed": {},
+            "last_updated": datetime.now().isoformat(),
+            "version": "3.0.0"
+        }
+        self.save_state()
+
+    def get_statistics(self) -> Dict[str, Any]:
+        """Get import statistics."""
+        processed = self.state.get("processed", {})
+        failed = self.state.get("failed", {})
+
+        total_points = sum(
+            info.get("points_created", 0)
+            for info in processed.values()
+        )
+
+        return {
+            "processed_count": len(processed),
+            "failed_count": len(failed),
+            "total_points": total_points,
+            "last_updated": self.state.get("last_updated"),
+            "version": self.state.get("version")
+        }
+
+    def acquire_lock(self) -> bool:
+        """Acquire exclusive lock for state file (Unix/Linux only)."""
+        if os.name == 'nt':  # Windows
+            logger.warning("File locking not implemented for Windows")
+            return True  # Allow operation to proceed without locking on Windows
+        try:
+            lock_path = self.state_file.with_suffix('.lock')
+            self._lock_file = open(lock_path, 'w')
+            fcntl.flock(self._lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+            return True
+        except (IOError, OSError):
+            return False
+
+    def release_lock(self) -> None:
+        """Release state file lock."""
+        if self._lock_file:
+            try:
+                fcntl.flock(self._lock_file.fileno(), fcntl.LOCK_UN)
+                self._lock_file.close()
+            except:
+                pass
+            self._lock_file = None
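For orientation, a minimal sketch of how the new StateManager might be driven. The state-file and input paths are hypothetical, and the import assumes package/scripts is on sys.path so the code is importable as the importer package:

    from pathlib import Path
    from importer.state.state_manager import StateManager

    manager = StateManager(Path("config/import_state.json"))  # hypothetical location
    if manager.acquire_lock():  # non-blocking exclusive flock on POSIX; always True on Windows
        try:
            conversation = Path("conversations/session-001.jsonl")  # hypothetical input
            if not manager.is_processed(conversation):
                # ... import the file, then record the outcome ...
                manager.mark_processed(conversation, points_created=42)
            print(manager.get_statistics())
        finally:
            manager.release_lock()

Each mark_processed/mark_failed call persists immediately via the temp-file-plus-rename in save_state, so a crash mid-import loses at most the file currently being processed.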
--- /dev/null
+++ b/package/scripts/importer/storage/qdrant_storage.py
@@ -0,0 +1,250 @@
+"""Qdrant vector database storage implementation."""
+
+import logging
+from typing import List, Optional, Dict, Any
+from qdrant_client import QdrantClient
+from qdrant_client.models import (
+    Distance,
+    VectorParams,
+    PointStruct,
+    CollectionInfo
+)
+
+from ..core import ProcessedPoint
+from ..core.exceptions import StorageError
+
+logger = logging.getLogger(__name__)
+
+
+class QdrantStorage:
+    """
+    Qdrant storage backend implementation.
+
+    Handles all interactions with the Qdrant vector database.
+    """
+
+    def __init__(self, url: str = "http://localhost:6333", api_key: Optional[str] = None):
+        self.url = url
+        self.api_key = api_key
+        self.client = None
+        self._initialized = False
+
+    def initialize(self) -> None:
+        """Initialize connection to Qdrant."""
+        try:
+            self.client = QdrantClient(
+                url=self.url,
+                api_key=self.api_key,
+                timeout=30
+            )
+
+            # Test connection
+            self.client.get_collections()
+            self._initialized = True
+            logger.info(f"Connected to Qdrant at {self.url}")
+
+        except Exception as e:
+            raise StorageError(
+                operation="initialize",
+                collection="N/A",
+                reason=f"Failed to connect to Qdrant: {e}"
+            )
+
+    def create_collection(self, name: str, dimension: int) -> bool:
+        """
+        Create a new collection if it doesn't exist.
+
+        Args:
+            name: Collection name
+            dimension: Vector dimension
+
+        Returns:
+            True if created, False if already exists
+        """
+        if not self._initialized:
+            raise StorageError(
+                operation="create_collection",
+                collection=name,
+                reason="Storage not initialized"
+            )
+
+        try:
+            if self.collection_exists(name):
+                logger.debug(f"Collection {name} already exists")
+                return False
+
+            self.client.create_collection(
+                collection_name=name,
+                vectors_config=VectorParams(
+                    size=dimension,
+                    distance=Distance.COSINE
+                )
+            )
+
+            logger.info(f"Created collection {name} with dimension {dimension}")
+            return True
+
+        except Exception as e:
+            raise StorageError(
+                operation="create_collection",
+                collection=name,
+                reason=str(e)
+            )
+
+    def collection_exists(self, name: str) -> bool:
+        """Check if a collection exists."""
+        if not self._initialized:
+            raise StorageError(
+                operation="collection_exists",
+                collection=name,
+                reason="Storage not initialized"
+            )
+
+        try:
+            collections = self.client.get_collections().collections
+            return any(c.name == name for c in collections)
+        except Exception as e:
+            logger.error(f"Failed to check collection existence: {e}")
+            return False
+
+    def upsert_points(self, collection: str, points: List[ProcessedPoint]) -> int:
+        """
+        Insert or update points in a collection.
+
+        Args:
+            collection: Collection name
+            points: List of points to upsert
+
+        Returns:
+            Number of points upserted
+        """
+        if not self._initialized:
+            raise StorageError(
+                operation="upsert_points",
+                collection=collection,
+                reason="Storage not initialized"
+            )
+
+        if not points:
+            return 0
+
+        try:
+            # Ensure collection exists with correct dimension
+            if not self.collection_exists(collection):
+                dimension = len(points[0].vector)
+                self.create_collection(collection, dimension)
+
+            # Convert to Qdrant points
+            qdrant_points = [
+                PointStruct(
+                    id=self._generate_point_id(point.id),
+                    vector=point.vector,
+                    payload=point.payload
+                )
+                for point in points
+            ]
+
+            # Batch upsert
+            operation_info = self.client.upsert(
+                collection_name=collection,
+                points=qdrant_points
+            )
+
+            logger.debug(f"Upserted {len(points)} points to {collection}")
+            return len(points)
+
+        except Exception as e:
+            raise StorageError(
+                operation="upsert_points",
+                collection=collection,
+                reason=str(e)
+            )
+
+    def get_collection_info(self, name: str) -> Optional[CollectionInfo]:
+        """Get information about a collection."""
+        if not self._initialized:
+            return None
+
+        try:
+            return self.client.get_collection(collection_name=name)
+        except Exception as e:
+            logger.error(f"Failed to get collection info: {e}")
+            return None
+
+    def delete_collection(self, name: str) -> bool:
+        """Delete a collection."""
+        if not self._initialized:
+            raise StorageError(
+                operation="delete_collection",
+                collection=name,
+                reason="Storage not initialized"
+            )
+
+        try:
+            if not self.collection_exists(name):
+                return False
+
+            self.client.delete_collection(collection_name=name)
+            logger.info(f"Deleted collection {name}")
+            return True
+
+        except Exception as e:
+            raise StorageError(
+                operation="delete_collection",
+                collection=name,
+                reason=str(e)
+            )
+
+    def get_collections(self) -> List[str]:
+        """Get list of all collection names."""
+        if not self._initialized:
+            return []
+
+        try:
+            collections = self.client.get_collections().collections
+            return [c.name for c in collections]
+        except Exception as e:
+            logger.error(f"Failed to get collections: {e}")
+            return []
+
+    def count_points(self, collection: str) -> int:
+        """Count points in a collection."""
+        if not self._initialized:
+            return 0
+
+        try:
+            info = self.get_collection_info(collection)
+            return info.points_count if info else 0
+        except Exception as e:
+            logger.error(f"Failed to count points: {e}")
+            return 0
+
+    def _generate_point_id(self, string_id: str) -> str:
+        """
+        Generate a valid point ID from a string.
+
+        Qdrant accepts string IDs directly in newer versions.
+        For older versions, we might need to hash to integer.
+        """
+        # For now, use string IDs directly
+        # If needed, can hash: int(hashlib.md5(string_id.encode()).hexdigest()[:16], 16)
+        return string_id
+
+    def health_check(self) -> Dict[str, Any]:
+        """Check health of Qdrant connection."""
+        if not self._initialized:
+            return {"healthy": False, "reason": "Not initialized"}
+
+        try:
+            collections = self.client.get_collections()
+            return {
+                "healthy": True,
+                "url": self.url,
+                "collections_count": len(collections.collections),
+                "total_points": sum(
+                    self.count_points(c.name)
+                    for c in collections.collections
+                )
+            }
+        except Exception as e:
+            return {"healthy": False, "reason": str(e)}
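A sketch of driving this wrapper against a local Qdrant instance, under the same import assumption as above. The collection name follows the conv_<hash>_local convention from project_normalizer.py further down, and the 384 dimension is a placeholder; ProcessedPoint construction is omitted because its model definition lives in core/models.py, which this section does not show:

    from importer.storage.qdrant_storage import QdrantStorage

    storage = QdrantStorage(url="http://localhost:6333")
    storage.initialize()  # raises StorageError if Qdrant is unreachable

    created = storage.create_collection("conv_7f6df0fc_local", dimension=384)
    print("created" if created else "already existed")

    # upsert_points() takes ProcessedPoint objects (id, vector, payload) from
    # importer.core and auto-creates the collection from the first point's
    # vector length if it doesn't exist yet.

    print(storage.count_points("conv_7f6df0fc_local"))
    print(storage.health_check())  # e.g. {"healthy": True, "url": ..., "collections_count": ...}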
--- /dev/null
+++ b/package/scripts/importer/utils/logger.py
@@ -0,0 +1,87 @@
+"""Centralized logging configuration."""
+
+import logging
+import sys
+from pathlib import Path
+from datetime import datetime
+from typing import Optional
+
+
+def setup_logging(
+    level: str = "INFO",
+    log_file: Optional[str] = None,
+    format_string: Optional[str] = None
+) -> logging.Logger:
+    """
+    Set up logging configuration.
+
+    Args:
+        level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+        log_file: Optional log file path
+        format_string: Optional custom format string
+
+    Returns:
+        Configured root logger
+    """
+    # Default format
+    if not format_string:
+        format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+    # Configure root logger
+    logging.basicConfig(
+        level=getattr(logging, level.upper()),
+        format=format_string,
+        handlers=[]  # Clear default handlers
+    )
+
+    logger = logging.getLogger()
+
+    # Console handler
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(logging.Formatter(format_string))
+    logger.addHandler(console_handler)
+
+    # File handler if specified
+    if log_file:
+        log_path = Path(log_file)
+        log_path.parent.mkdir(parents=True, exist_ok=True)
+
+        file_handler = logging.FileHandler(log_path)
+        file_handler.setFormatter(logging.Formatter(format_string))
+        logger.addHandler(file_handler)
+
+    return logger
+
+
+class ProgressLogger:
+    """Logger for tracking import progress."""
+
+    def __init__(self, total: int, logger: Optional[logging.Logger] = None):
+        self.total = total
+        self.current = 0
+        self.logger = logger or logging.getLogger(__name__)
+        self.start_time = datetime.now()
+
+    def update(self, increment: int = 1, message: Optional[str] = None) -> None:
+        """Update progress."""
+        self.current += increment
+        percentage = (self.current / self.total * 100) if self.total > 0 else 0
+
+        elapsed = (datetime.now() - self.start_time).total_seconds()
+        rate = self.current / elapsed if elapsed > 0 else 0
+        eta = (self.total - self.current) / rate if rate > 0 else 0
+
+        log_message = f"Progress: {self.current}/{self.total} ({percentage:.1f}%)"
+        if message:
+            log_message += f" - {message}"
+        log_message += f" - Rate: {rate:.1f}/s - ETA: {eta:.0f}s"
+
+        self.logger.info(log_message)
+
+    def complete(self) -> None:
+        """Mark as complete."""
+        elapsed = (datetime.now() - self.start_time).total_seconds()
+        self.logger.info(
+            f"Completed {self.total} items in {elapsed:.1f}s "
+            f"({self.total/elapsed:.1f} items/s)"
+        )
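Usage is a two-step affair: configure the root logger once, then track long-running work with ProgressLogger. A sketch with a hypothetical log path and item count; in a real import you would call update() once per processed file:

    from importer.utils.logger import setup_logging, ProgressLogger

    setup_logging(level="INFO", log_file="logs/import.log")  # hypothetical path

    progress = ProgressLogger(total=120)
    for _ in range(120):
        # ... process one conversation file ...
        progress.update(message="imported file")  # logs count, percentage, rate, ETA
    progress.complete()  # logs total elapsed time and throughput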
--- /dev/null
+++ b/package/scripts/importer/utils/project_normalizer.py
@@ -0,0 +1,120 @@
+"""Project name normalization utilities."""
+
+import hashlib
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class ProjectNormalizer:
+    """
+    Normalize project names and generate collection names.
+
+    This is the CRITICAL component that was broken before.
+    Must correctly extract project names from Claude's dash-separated format.
+    """
+
+    @staticmethod
+    def normalize_project_name(project_path: str) -> str:
+        """
+        Normalize a project path to a consistent project name.
+
+        CRITICAL: This must match the implementation in utils.py
+
+        Examples:
+        - "-Users-name-projects-claude-self-reflect" -> "claude-self-reflect"
+        - "claude-self-reflect" -> "claude-self-reflect"
+        - "/path/to/-Users-name-projects-myapp" -> "myapp"
+        """
+        # Get the final component of the path
+        if '/' in project_path:
+            final_component = project_path.split('/')[-1]
+        else:
+            final_component = project_path
+
+        # Handle Claude's dash-separated format
+        if final_component.startswith('-') and 'projects' in final_component:
+            # Find the last occurrence of 'projects-'
+            idx = final_component.rfind('projects-')
+            if idx != -1:
+                # Extract everything after 'projects-'
+                project_name = final_component[idx + len('projects-'):]
+                logger.debug(f"Normalized '{project_path}' to '{project_name}'")
+                return project_name
+
+        # Already normalized or different format
+        logger.debug(f"Project path '{project_path}' already normalized")
+        return final_component
+
+    def get_project_name(self, file_path: Path) -> str:
+        """
+        Extract project name from a file path.
+
+        Args:
+            file_path: Path to a conversation file
+
+        Returns:
+            Normalized project name
+        """
+        # Get the parent directory name
+        parent_name = file_path.parent.name
+
+        # Normalize it
+        return self.normalize_project_name(parent_name)
+
+    def get_collection_name(self, file_path: Path) -> str:
+        """
+        Generate collection name for a file.
+
+        Format: conv_HASH_local
+        Where HASH is first 8 chars of MD5 hash of normalized project name.
+        """
+        project_name = self.get_project_name(file_path)
+
+        # Generate hash
+        project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
+
+        # Generate collection name
+        collection_name = f"conv_{project_hash}_local"
+
+        logger.debug(f"Collection for project '{project_name}': {collection_name}")
+        return collection_name
+
+    @staticmethod
+    def validate_normalization() -> bool:
+        """
+        Self-test to ensure normalization is working correctly.
+
+        Returns:
+            True if all tests pass
+        """
+        test_cases = [
+            ("-Users-name-projects-claude-self-reflect", "claude-self-reflect", "7f6df0fc"),
+            ("claude-self-reflect", "claude-self-reflect", "7f6df0fc"),
+            ("/Users/name/.claude/projects/-Users-name-projects-myapp", "myapp", None),
+            ("-Users-name-projects-procsolve-website", "procsolve-website", "9f2f312b")
+        ]
+
+        normalizer = ProjectNormalizer()
+        all_passed = True
+
+        for input_path, expected_name, expected_hash in test_cases:
+            normalized = normalizer.normalize_project_name(input_path)
+            if normalized != expected_name:
+                logger.error(
+                    f"Normalization failed: '{input_path}' -> '{normalized}' "
+                    f"(expected '{expected_name}')"
+                )
+                all_passed = False
+
+            if expected_hash:
+                actual_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
+                if actual_hash != expected_hash:
+                    logger.error(
+                        f"Hash mismatch for '{normalized}': "
+                        f"{actual_hash} != {expected_hash}"
+                    )
+                    all_passed = False
+
+        return all_passed
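A short sketch tying the normalizer together, under the same import assumption as the earlier examples. The expected name and hash come straight from the test cases in validate_normalization(); the session file path is hypothetical:

    from pathlib import Path
    from importer.utils.project_normalizer import ProjectNormalizer

    normalizer = ProjectNormalizer()

    # Claude stores transcripts under a dash-encoded project directory;
    # everything after the last 'projects-' is the real project name.
    assert normalizer.normalize_project_name(
        "-Users-name-projects-claude-self-reflect"
    ) == "claude-self-reflect"

    # Collection name = "conv_" + first 8 hex chars of md5(project name) + "_local"
    session = Path(
        "/Users/name/.claude/projects/-Users-name-projects-claude-self-reflect/session.jsonl"
    )
    print(normalizer.get_collection_name(session))  # conv_7f6df0fc_local

    assert ProjectNormalizer.validate_normalization()  # built-in self-test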