claude-self-reflect 3.0.0 → 3.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/claude-self-reflect-test.md +110 -66
- package/README.md +1 -1
- package/installer/setup-wizard.js +4 -2
- package/mcp-server/pyproject.toml +1 -0
- package/mcp-server/src/server.py +84 -0
- package/package.json +2 -1
- package/scripts/import-conversations-unified.py +225 -44
- package/scripts/importer/__init__.py +25 -0
- package/scripts/importer/__main__.py +14 -0
- package/scripts/importer/core/__init__.py +25 -0
- package/scripts/importer/core/config.py +120 -0
- package/scripts/importer/core/exceptions.py +52 -0
- package/scripts/importer/core/models.py +184 -0
- package/scripts/importer/embeddings/__init__.py +22 -0
- package/scripts/importer/embeddings/base.py +141 -0
- package/scripts/importer/embeddings/fastembed_provider.py +164 -0
- package/scripts/importer/embeddings/validator.py +136 -0
- package/scripts/importer/embeddings/voyage_provider.py +251 -0
- package/scripts/importer/main.py +393 -0
- package/scripts/importer/processors/__init__.py +15 -0
- package/scripts/importer/processors/ast_extractor.py +197 -0
- package/scripts/importer/processors/chunker.py +157 -0
- package/scripts/importer/processors/concept_extractor.py +109 -0
- package/scripts/importer/processors/conversation_parser.py +181 -0
- package/scripts/importer/processors/tool_extractor.py +165 -0
- package/scripts/importer/state/__init__.py +5 -0
- package/scripts/importer/state/state_manager.py +190 -0
- package/scripts/importer/storage/__init__.py +5 -0
- package/scripts/importer/storage/qdrant_storage.py +250 -0
- package/scripts/importer/utils/__init__.py +9 -0
- package/scripts/importer/utils/logger.py +87 -0
- package/scripts/importer/utils/project_normalizer.py +120 -0

package/scripts/importer/processors/tool_extractor.py
@@ -0,0 +1,165 @@
+"""Extract tool usage and file references from conversations."""
+
+import re
+import logging
+from typing import Dict, Any, Set, List
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class ToolUsageExtractor:
+    """Extract files analyzed, edited, and tools used from conversations."""
+
+    def __init__(self):
+        # Patterns for file operations
+        self.file_patterns = {
+            'analyzed': [
+                re.compile(r'(?:reading|analyzing|examining|looking at|checking)\s+(?:file\s+)?([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                re.compile(r'(?:Read|read)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                re.compile(r'(?:in|from)\s+file\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
+            ],
+            'edited': [
+                re.compile(r'(?:editing|modifying|updating|changing|writing to)\s+(?:file\s+)?([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                re.compile(r'(?:Edit|Write)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                re.compile(r'(?:changes to|modified|updated)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
+            ],
+            'created': [
+                re.compile(r'(?:creating|created|new file)\s+([/\w\-\.]+\.\w+)', re.IGNORECASE),
+                re.compile(r'(?:Write|Create)\s+new\s+file\s+([/\w\-\.]+\.\w+)', re.IGNORECASE)
+            ]
+        }
+
+        # Tool patterns
+        self.tool_patterns = [
+            re.compile(r'(?:using|running|executing)\s+(\w+)\s+(?:tool|command)', re.IGNORECASE),
+            re.compile(r'(?:Tool:|Command:)\s*(\w+)', re.IGNORECASE),
+            re.compile(r'```(?:bash|shell|sh)\n([a-z]+)', re.IGNORECASE),
+            re.compile(r'\$\s+([a-z]+)\s+', re.IGNORECASE),  # Command line
+            re.compile(r'(?:npm|yarn|pip|cargo|go)\s+([\w\-]+)', re.IGNORECASE)
+        ]
+
+        # MCP tool pattern
+        self.mcp_pattern = re.compile(r'mcp__([a-zA-Z0-9\-_]+)__([a-zA-Z0-9\-_]+)')
+
+    def extract(self, text: str) -> Dict[str, Any]:
+        """
+        Extract tool usage information from text.
+
+        Returns:
+            Dictionary with files_analyzed, files_edited, tools_used
+        """
+        files_analyzed = set()
+        files_edited = set()
+        files_created = set()
+        tools_used = set()
+
+        # Extract file operations
+        for pattern in self.file_patterns['analyzed']:
+            matches = pattern.findall(text)
+            for match in matches:
+                file_path = self._normalize_file_path(match)
+                if file_path:
+                    files_analyzed.add(file_path)
+
+        for pattern in self.file_patterns['edited']:
+            matches = pattern.findall(text)
+            for match in matches:
+                file_path = self._normalize_file_path(match)
+                if file_path:
+                    files_edited.add(file_path)
+
+        for pattern in self.file_patterns['created']:
+            matches = pattern.findall(text)
+            for match in matches:
+                file_path = self._normalize_file_path(match)
+                if file_path:
+                    files_created.add(file_path)
+
+        # Extract tools
+        for pattern in self.tool_patterns:
+            matches = pattern.findall(text)
+            for match in matches:
+                tool = match.lower().strip()
+                if self._is_valid_tool(tool):
+                    tools_used.add(tool)
+
+        # Extract MCP tools specifically
+        mcp_matches = self.mcp_pattern.findall(text)
+        for server, tool in mcp_matches:
+            tools_used.add(f"mcp:{server}:{tool}")
+
+        # Look for common CLI tools
+        common_tools = [
+            'git', 'npm', 'yarn', 'pip', 'python', 'node', 'docker',
+            'kubectl', 'aws', 'gcloud', 'az', 'terraform', 'ansible',
+            'make', 'gradle', 'maven', 'cargo', 'go', 'rustc'
+        ]
+        for tool in common_tools:
+            if re.search(rf'\b{tool}\b', text, re.IGNORECASE):
+                tools_used.add(tool)
+
+        # Combine all files for backward compatibility
+        all_files = files_analyzed | files_edited | files_created
+
+        return {
+            "files_analyzed": list(files_analyzed)[:50],
+            "files_edited": list(files_edited)[:50],
+            "files_created": list(files_created)[:50],
+            "files": list(all_files)[:50],  # Legacy field
+            "tools_used": list(tools_used)[:30],
+            "file_count": len(all_files),
+            "tool_count": len(tools_used)
+        }
+
+    def _normalize_file_path(self, path: str) -> str:
+        """Normalize and validate file path."""
+        # Remove quotes and whitespace
+        path = path.strip('\'"` \n')
+
+        # Skip if too short or too long
+        if len(path) < 3 or len(path) > 200:
+            return ""
+
+        # Must have an extension
+        if '.' not in path:
+            return ""
+
+        # Extract just the filename if it's a full path
+        if '/' in path:
+            # Get the last component
+            path = path.split('/')[-1]
+
+        # Validate extension
+        valid_extensions = {
+            'py', 'js', 'ts', 'jsx', 'tsx', 'java', 'go', 'rs', 'cpp', 'c', 'h',
+            'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'html', 'css', 'scss',
+            'sql', 'sh', 'bash', 'dockerfile', 'makefile', 'toml', 'ini', 'cfg'
+        }
+
+        ext = path.split('.')[-1].lower()
+        if ext not in valid_extensions:
+            return ""
+
+        return path
+
+    def _is_valid_tool(self, tool: str) -> bool:
+        """Check if a string is a valid tool name."""
+        # Skip common words
+        skip_words = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+            'this', 'that', 'with', 'from', 'as', 'is', 'was', 'are', 'were'
+        }
+
+        if tool in skip_words:
+            return False
+
+        # Must be alphanumeric with possible hyphens/underscores
+        if not re.match(r'^[a-z0-9\-_]+$', tool):
+            return False
+
+        # Reasonable length
+        if len(tool) < 2 or len(tool) > 30:
+            return False
+
+        return True
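
A minimal usage sketch of ToolUsageExtractor (the import path assumes scripts/ is on sys.path; the sample text is invented):

from importer.processors.tool_extractor import ToolUsageExtractor

extractor = ToolUsageExtractor()
text = (
    "I'm reading config.yaml and then editing server.py to fix the bug. "
    "Running git status next, followed by mcp__claude-self-reflect__reflect_on_past."
)
result = extractor.extract(text)

print(result["files_analyzed"])      # ['config.yaml'] -- matched by the 'analyzed' patterns
print(result["files_edited"])        # ['server.py']   -- matched by the 'edited' patterns
print(sorted(result["tools_used"]))  # ['git', 'mcp:claude-self-reflect:reflect_on_past']

Note that "git" is picked up by the common_tools scan rather than the verb patterns, and the MCP call is reported in the mcp:server:tool form produced above.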

package/scripts/importer/state/state_manager.py
@@ -0,0 +1,190 @@
+"""State management with atomic writes."""
+
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional, Set
+from datetime import datetime
+if os.name != 'nt':  # fcntl is POSIX-only; locking is skipped on Windows
+    import fcntl
+import tempfile
+
+logger = logging.getLogger(__name__)
+
+
+class StateManager:
+    """
+    Manage import state with atomic writes and crash recovery.
+
+    Implements the critical STATE_FILE fixes:
+    1. Handle empty dirname case
+    2. Atomic writes to prevent corruption
+    3. File locking for concurrent access
+    """
+
+    def __init__(self, state_file: Path):
+        self.state_file = Path(state_file)
+        self.state: Dict[str, Any] = self._load_state()
+        self._lock_file = None
+
+    def _load_state(self) -> Dict[str, Any]:
+        """Load state from file or create new."""
+        if self.state_file.exists():
+            try:
+                with open(self.state_file, 'r') as f:
+                    state = json.load(f)
+                logger.debug(f"Loaded state with {len(state.get('processed', []))} processed files")
+                return state
+            except Exception as e:
+                logger.error(f"Failed to load state: {e}")
+                # Create backup of corrupted state
+                backup = self.state_file.with_suffix('.corrupt')
+                try:
+                    self.state_file.rename(backup)
+                    logger.info(f"Backed up corrupted state to {backup}")
+                except Exception:
+                    pass
+
+        # Return default state
+        return {
+            "processed": {},
+            "failed": {},
+            "last_updated": None,
+            "version": "3.0.0"
+        }
+
+    def save_state(self) -> None:
+        """
+        Save state with atomic write.
+
+        Implements the critical fix for STATE_FILE directory handling.
+        """
+        try:
+            # FIX: Handle case where STATE_FILE has no directory component
+            state_dir = self.state_file.parent
+            if state_dir and state_dir != Path('.'):
+                state_dir.mkdir(parents=True, exist_ok=True)
+
+            # Update timestamp
+            self.state["last_updated"] = datetime.now().isoformat()
+
+            # Use atomic write to prevent corruption during crashes
+            # Create temp file in same directory for atomic rename
+            temp_fd, temp_path = tempfile.mkstemp(
+                dir=str(state_dir) if state_dir else '.',
+                prefix='.tmp_state_',
+                suffix='.json'
+            )
+
+            try:
+                # Write to temp file
+                with os.fdopen(temp_fd, 'w') as f:
+                    json.dump(self.state, f, indent=2)
+
+                # Atomic rename (on POSIX systems)
+                Path(temp_path).replace(self.state_file)
+
+                logger.debug("State saved successfully")
+
+            except Exception as e:
+                # Clean up temp file on error
+                try:
+                    os.unlink(temp_path)
+                except OSError:
+                    pass
+                raise e
+
+        except Exception as e:
+            logger.error(f"Failed to save state: {e}")
+            raise
+
+    def is_processed(self, file_path: Path) -> bool:
+        """Check if file has been processed."""
+        return str(file_path) in self.state.get("processed", {})
+
+    def mark_processed(self, file_path: Path, points_created: int) -> None:
+        """Mark file as processed."""
+        if "processed" not in self.state:
+            self.state["processed"] = {}
+
+        self.state["processed"][str(file_path)] = {
+            "timestamp": datetime.now().isoformat(),
+            "points_created": points_created
+        }
+
+        # Remove from failed if present
+        if str(file_path) in self.state.get("failed", {}):
+            del self.state["failed"][str(file_path)]
+
+        self.save_state()
+
+    def mark_failed(self, file_path: Path, error: str) -> None:
+        """Mark file as failed."""
+        if "failed" not in self.state:
+            self.state["failed"] = {}
+
+        self.state["failed"][str(file_path)] = {
+            "timestamp": datetime.now().isoformat(),
+            "error": error
+        }
+
+        self.save_state()
+
+    def get_processed_files(self) -> Set[str]:
+        """Get set of processed file paths."""
+        return set(self.state.get("processed", {}).keys())
+
+    def get_failed_files(self) -> Set[str]:
+        """Get set of failed file paths."""
+        return set(self.state.get("failed", {}).keys())
+
+    def reset(self) -> None:
+        """Reset state to empty."""
+        self.state = {
+            "processed": {},
+            "failed": {},
+            "last_updated": datetime.now().isoformat(),
+            "version": "3.0.0"
+        }
+        self.save_state()
+
+    def get_statistics(self) -> Dict[str, Any]:
+        """Get import statistics."""
+        processed = self.state.get("processed", {})
+        failed = self.state.get("failed", {})
+
+        total_points = sum(
+            info.get("points_created", 0)
+            for info in processed.values()
+        )
+
+        return {
+            "processed_count": len(processed),
+            "failed_count": len(failed),
+            "total_points": total_points,
+            "last_updated": self.state.get("last_updated"),
+            "version": self.state.get("version")
+        }
+
+    def acquire_lock(self) -> bool:
+        """Acquire exclusive lock for state file (Unix/Linux only)."""
+        if os.name == 'nt':  # Windows
+            logger.warning("File locking not implemented for Windows")
+            return True  # Allow operation to proceed without locking on Windows
+        try:
+            lock_path = self.state_file.with_suffix('.lock')
+            self._lock_file = open(lock_path, 'w')
+            fcntl.flock(self._lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+            return True
+        except (IOError, OSError):
+            return False
+
+    def release_lock(self) -> None:
+        """Release state file lock."""
+        if self._lock_file:
+            try:
+                fcntl.flock(self._lock_file.fileno(), fcntl.LOCK_UN)
+                self._lock_file.close()
+            except OSError:
+                pass
+            self._lock_file = None
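
A minimal driver sketch for StateManager, assuming the same import layout; the state-file and conversation paths are invented:

from pathlib import Path
from importer.state.state_manager import StateManager

# A bare filename works: save_state() handles the empty-dirname case.
manager = StateManager(Path("import_state.json"))

if manager.acquire_lock():  # always True on Windows, where locking is skipped
    try:
        conv = Path("conversations/session-001.jsonl")  # invented path
        if not manager.is_processed(conv):
            # Stand-in for the real import step; each mark_* call persists atomically.
            manager.mark_processed(conv, points_created=42)
        print(manager.get_statistics())
    finally:
        manager.release_lock()
else:
    print("another importer holds the lock; exiting")

The temp-file-plus-rename in save_state() means a crash mid-write leaves either the old state file or the new one, never a half-written JSON document.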

package/scripts/importer/storage/qdrant_storage.py
@@ -0,0 +1,250 @@
+"""Qdrant vector database storage implementation."""
+
+import logging
+from typing import List, Optional, Dict, Any
+from qdrant_client import QdrantClient
+from qdrant_client.models import (
+    Distance,
+    VectorParams,
+    PointStruct,
+    CollectionInfo
+)
+
+from ..core import ProcessedPoint
+from ..core.exceptions import StorageError
+
+logger = logging.getLogger(__name__)
+
+
+class QdrantStorage:
+    """
+    Qdrant storage backend implementation.
+
+    Handles all interactions with the Qdrant vector database.
+    """
+
+    def __init__(self, url: str = "http://localhost:6333", api_key: Optional[str] = None):
+        self.url = url
+        self.api_key = api_key
+        self.client = None
+        self._initialized = False
+
+    def initialize(self) -> None:
+        """Initialize connection to Qdrant."""
+        try:
+            self.client = QdrantClient(
+                url=self.url,
+                api_key=self.api_key,
+                timeout=30
+            )
+
+            # Test connection
+            self.client.get_collections()
+            self._initialized = True
+            logger.info(f"Connected to Qdrant at {self.url}")
+
+        except Exception as e:
+            raise StorageError(
+                operation="initialize",
+                collection="N/A",
+                reason=f"Failed to connect to Qdrant: {e}"
+            )
+
+    def create_collection(self, name: str, dimension: int) -> bool:
+        """
+        Create a new collection if it doesn't exist.
+
+        Args:
+            name: Collection name
+            dimension: Vector dimension
+
+        Returns:
+            True if created, False if already exists
+        """
+        if not self._initialized:
+            raise StorageError(
+                operation="create_collection",
+                collection=name,
+                reason="Storage not initialized"
+            )
+
+        try:
+            if self.collection_exists(name):
+                logger.debug(f"Collection {name} already exists")
+                return False
+
+            self.client.create_collection(
+                collection_name=name,
+                vectors_config=VectorParams(
+                    size=dimension,
+                    distance=Distance.COSINE
+                )
+            )
+
+            logger.info(f"Created collection {name} with dimension {dimension}")
+            return True
+
+        except Exception as e:
+            raise StorageError(
+                operation="create_collection",
+                collection=name,
+                reason=str(e)
+            )
+
+    def collection_exists(self, name: str) -> bool:
+        """Check if a collection exists."""
+        if not self._initialized:
+            raise StorageError(
+                operation="collection_exists",
+                collection=name,
+                reason="Storage not initialized"
+            )
+
+        try:
+            collections = self.client.get_collections().collections
+            return any(c.name == name for c in collections)
+        except Exception as e:
+            logger.error(f"Failed to check collection existence: {e}")
+            return False
+
+    def upsert_points(self, collection: str, points: List[ProcessedPoint]) -> int:
+        """
+        Insert or update points in a collection.
+
+        Args:
+            collection: Collection name
+            points: List of points to upsert
+
+        Returns:
+            Number of points upserted
+        """
+        if not self._initialized:
+            raise StorageError(
+                operation="upsert_points",
+                collection=collection,
+                reason="Storage not initialized"
+            )
+
+        if not points:
+            return 0
+
+        try:
+            # Ensure collection exists with correct dimension
+            if not self.collection_exists(collection):
+                dimension = len(points[0].vector)
+                self.create_collection(collection, dimension)
+
+            # Convert to Qdrant points
+            qdrant_points = [
+                PointStruct(
+                    id=self._generate_point_id(point.id),
+                    vector=point.vector,
+                    payload=point.payload
+                )
+                for point in points
+            ]
+
+            # Batch upsert
+            operation_info = self.client.upsert(
+                collection_name=collection,
+                points=qdrant_points
+            )
+
+            logger.debug(f"Upserted {len(points)} points to {collection}")
+            return len(points)
+
+        except Exception as e:
+            raise StorageError(
+                operation="upsert_points",
+                collection=collection,
+                reason=str(e)
+            )
+
+    def get_collection_info(self, name: str) -> Optional[CollectionInfo]:
+        """Get information about a collection."""
+        if not self._initialized:
+            return None
+
+        try:
+            return self.client.get_collection(collection_name=name)
+        except Exception as e:
+            logger.error(f"Failed to get collection info: {e}")
+            return None
+
+    def delete_collection(self, name: str) -> bool:
+        """Delete a collection."""
+        if not self._initialized:
+            raise StorageError(
+                operation="delete_collection",
+                collection=name,
+                reason="Storage not initialized"
+            )
+
+        try:
+            if not self.collection_exists(name):
+                return False
+
+            self.client.delete_collection(collection_name=name)
+            logger.info(f"Deleted collection {name}")
+            return True
+
+        except Exception as e:
+            raise StorageError(
+                operation="delete_collection",
+                collection=name,
+                reason=str(e)
+            )
+
+    def get_collections(self) -> List[str]:
+        """Get list of all collection names."""
+        if not self._initialized:
+            return []
+
+        try:
+            collections = self.client.get_collections().collections
+            return [c.name for c in collections]
+        except Exception as e:
+            logger.error(f"Failed to get collections: {e}")
+            return []
+
+    def count_points(self, collection: str) -> int:
+        """Count points in a collection."""
+        if not self._initialized:
+            return 0
+
+        try:
+            info = self.get_collection_info(collection)
+            return info.points_count if info else 0
+        except Exception as e:
+            logger.error(f"Failed to count points: {e}")
+            return 0
+
+    def _generate_point_id(self, string_id: str) -> str:
+        """
+        Generate a valid point ID from a string.
+
+        Qdrant accepts string IDs directly in newer versions.
+        For older versions, we might need to hash to integer.
+        """
+        # For now, use string IDs directly
+        # If needed, can hash: int(hashlib.md5(string_id.encode()).hexdigest()[:16], 16)
+        return string_id
+
+    def health_check(self) -> Dict[str, Any]:
+        """Check health of Qdrant connection."""
+        if not self._initialized:
+            return {"healthy": False, "reason": "Not initialized"}
+
+        try:
+            collections = self.client.get_collections()
+            return {
+                "healthy": True,
+                "url": self.url,
+                "collections_count": len(collections.collections),
+                "total_points": sum(
+                    self.count_points(c.name)
+                    for c in collections.collections
+                )
+            }
+        except Exception as e:
+            return {"healthy": False, "reason": str(e)}
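
A hypothetical round trip through QdrantStorage against a local Qdrant instance. ProcessedPoint's constructor is assumed from the .id/.vector/.payload attributes read in upsert_points (its definition lives in core/models.py, not shown here), and the collection name, dimension, and payload are invented:

import uuid

from importer.core import ProcessedPoint
from importer.storage.qdrant_storage import QdrantStorage

storage = QdrantStorage(url="http://localhost:6333")
storage.initialize()  # raises StorageError if Qdrant is unreachable

point = ProcessedPoint(              # field names assumed from upsert_points
    id=str(uuid.uuid4()),            # Qdrant string IDs must be UUID-formatted
    vector=[0.1] * 384,              # this dimension sizes the auto-created collection
    payload={"text": "example chunk", "project": "demo"},
)
storage.upsert_points("conv_demo_local", [point])  # creates the collection on first use

print(storage.count_points("conv_demo_local"))  # 1
print(storage.health_check()["healthy"])        # True

Because upsert_points creates a missing collection from the first point's vector length, callers never need to pre-provision collections as long as all points for a collection share one embedding dimension.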