footprinter-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +444 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/cli/__init__.py +128 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +332 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +579 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +1836 -0
- footprinter/cli/status.py +729 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +610 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +741 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +515 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +328 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +261 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +125 -0
- footprinter/ingest/pipe_runner.py +217 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +201 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +57 -0
- footprinter/mcp/errors.py +102 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +15 -0
- footprinter/paths.py +91 -0
- footprinter/permissions.py +1160 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1272 -0
- footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0.dist-info/METADATA +229 -0
- footprinter_cli-1.0.0.dist-info/RECORD +134 -0
- footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
"""Unified VectorStore: single entry point for all ChromaDB operations."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import threading
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from footprinter.paths import get_chroma_path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _semantic_available() -> bool:
|
|
12
|
+
"""Check whether chromadb and onnxruntime are importable.
|
|
13
|
+
|
|
14
|
+
Evaluated lazily (on first call, not at import time) so that test
|
|
15
|
+
stubs injected into sys.modules are picked up regardless of pytest
|
|
16
|
+
collection order.
|
|
17
|
+
"""
|
|
18
|
+
try:
|
|
19
|
+
import chromadb # noqa: F401
|
|
20
|
+
import onnxruntime # noqa: F401
|
|
21
|
+
|
|
22
|
+
return True
|
|
23
|
+
except ImportError:
|
|
24
|
+
return False
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# File vectorization at ingest time. Off by default — opt-in via config.
|
|
31
|
+
def _file_vectorization_enabled() -> bool:
|
|
32
|
+
try:
|
|
33
|
+
from footprinter.source_registry import get_config
|
|
34
|
+
|
|
35
|
+
return bool(get_config().get("semantic", {}).get("file_vectorization", False))
|
|
36
|
+
except Exception as e:
|
|
37
|
+
# Intentional broad catch: ConfigError, ImportError, AttributeError all realistic
|
|
38
|
+
logger.debug("Config unavailable for file_vectorization check: %s", e)
|
|
39
|
+
return False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Chat vectorization at ingest time. Off by default — opt-in via config.
|
|
43
|
+
def _chat_vectorization_enabled() -> bool:
|
|
44
|
+
try:
|
|
45
|
+
from footprinter.source_registry import get_config
|
|
46
|
+
|
|
47
|
+
return bool(get_config().get("semantic", {}).get("chat_vectorization", False))
|
|
48
|
+
except Exception as e:
|
|
49
|
+
# Intentional broad catch: ConfigError, ImportError, AttributeError all realistic
|
|
50
|
+
logger.debug("Config unavailable for chat_vectorization check: %s", e)
|
|
51
|
+
return False
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class VectorStore:
|
|
55
|
+
"""Unified vector store managing both files and chats collections."""
|
|
56
|
+
|
|
57
|
+
_instance: Optional["VectorStore"] = None
|
|
58
|
+
_lock = threading.Lock()
|
|
59
|
+
|
|
60
|
+
def __init__(self, chroma_path: Optional[str] = None):
    """Open (or create) the persistent Chroma store and its two collections.

    Args:
        chroma_path: Directory for the Chroma database. When falsy, the
            location returned by ``get_chroma_path()`` is used. The directory
            is created if it does not exist.

    Raises:
        ImportError: If ``_semantic_available()`` reports that the optional
            semantic-search libraries cannot be imported.
    """
    if not _semantic_available():
        raise ImportError("Semantic search libraries required.\nInstall with: pip install footprinter-cli[semantic]")

    if chroma_path:
        self.chroma_path = Path(chroma_path)
    else:
        self.chroma_path = get_chroma_path()
    self.chroma_path.mkdir(parents=True, exist_ok=True)

    # Imported inside the constructor, after the availability check above.
    import chromadb
    from chromadb.config import Settings

    telemetry_off = Settings(anonymized_telemetry=False)
    self.client = chromadb.PersistentClient(path=str(self.chroma_path), settings=telemetry_off)

    self._files = self.client.get_or_create_collection(
        name="footprinter_files",
        metadata={"description": "Footprinter file contents"},
    )
    self._chats = self.client.get_or_create_collection(
        name="footprinter_chats",
        metadata={"description": "Footprinter chats"},
    )

    file_total = self._files.count()
    chat_total = self._chats.count()
    nothing_indexed = file_total == 0 and chat_total == 0
    # Only warn about an empty store when some vectorization is switched on.
    if nothing_indexed and (_file_vectorization_enabled() or _chat_vectorization_enabled()):
        logger.warning(
            "VectorStore initialized with 0 documents in both collections "
            "(chroma_path=%s). Searches will return empty results.",
            self.chroma_path,
        )

    from footprinter.semantic.embeddings import EMBEDDING_DIM, get_embedding_function

    self.ef = get_embedding_function()
    self._embedding_dim = EMBEDDING_DIM
|
|
96
|
+
|
|
97
|
+
# ------------------------------------------------------------------
|
|
98
|
+
# Singleton
|
|
99
|
+
# ------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
@classmethod
def get_instance(cls, chroma_path: Optional[str] = None) -> "VectorStore":
    """Return the shared VectorStore, building it on first use.

    The whole check-and-create sequence runs while holding ``cls._lock``.
    A cached instance whose ``chroma_path`` no longer exists on disk (e.g.
    deleted by _rebuild_vectors in another process) is discarded and a fresh
    instance is created in its place.

    Args:
        chroma_path: Passed to the constructor only when a new instance has
            to be built; an existing cached instance is returned as-is.
    """
    with cls._lock:
        cached = cls._instance
        if cached is not None and not cached.chroma_path.exists():
            logger.warning(
                "Chroma path %s no longer exists — resetting stale VectorStore singleton",
                cached.chroma_path,
            )
            cls._instance = cached = None
        if cached is None:
            cls._instance = cls(chroma_path=chroma_path)
        return cls._instance
|
|
119
|
+
|
|
120
|
+
@classmethod
def reset_instance(cls) -> None:
    """Drop the cached singleton, under ``cls._lock``.

    The next ``get_instance()`` call will therefore construct a new store.
    """
    with cls._lock:
        cls._instance = None
|
|
125
|
+
|
|
126
|
+
def check_integrity(self) -> Dict:
    """Verify that both Chroma collections can be counted and queried.

    Returns:
        {"status": "empty", "files": 0, "chats": 0} when neither collection
        holds any documents;
        {"status": "corrupted", "error": "..."} when counting or probing fails
        with a message that matches one of the corruption keywords;
        {"status": "ok", "files": N, "chats": M} otherwise.

    Raises:
        Exception: Any failure whose message matches none of the corruption
            keywords is re-raised unchanged.
    """
    try:
        totals = {"files": self._files.count(), "chats": self._chats.count()}
    except Exception as exc:
        message = str(exc).lower()
        if "malformed" not in message and "corrupt" not in message:
            raise
        return {"status": "corrupted", "error": str(exc)}

    if totals["files"] == 0 and totals["chats"] == 0:
        return {"status": "empty", "files": 0, "chats": 0}

    # Run a throwaway nearest-neighbour query against every non-empty
    # collection; an all-zero vector of the right width is enough for that.
    try:
        probe_vector = [0.0] * self._embedding_dim
        for collection, total in ((self._files, totals["files"]), (self._chats, totals["chats"])):
            if total > 0:
                collection.query(query_embeddings=[probe_vector], n_results=1)
    except Exception as exc:
        message = str(exc).lower()
        if not any(marker in message for marker in ("malformed", "corrupt", "fts5")):
            raise
        return {"status": "corrupted", "error": str(exc)}

    return {"status": "ok", "files": totals["files"], "chats": totals["chats"]}
|
|
160
|
+
|
|
161
|
+
# ------------------------------------------------------------------
|
|
162
|
+
# File operations
|
|
163
|
+
# ------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
def index_file(
    self,
    file_id: int,
    file_path: str,
    chunks: List[Dict],
    metadata: Optional[Dict] = None,
) -> int:
    """
    Embed a file's chunks and add them to the files collection.

    Args:
        file_id: Database ID of the file; also part of each chunk's document ID.
        file_path: Path to the file, stored in every chunk's metadata.
        chunks: List of chunk dicts with 'content', 'chunk_index', 'total_chunks'.
        metadata: Additional metadata copied onto each chunk. The per-chunk
            keys written here (file_id, file_path, chunk_index, total_chunks,
            content_length) take precedence over same-named keys in it.

    Returns:
        Number of chunks indexed (0 when ``chunks`` is empty).
    """
    if not chunks:
        return 0

    shared_meta = metadata or {}
    doc_ids = [f"file_{file_id}_chunk_{piece['chunk_index']}" for piece in chunks]
    texts = [piece["content"] for piece in chunks]
    chunk_metas = [
        {
            **shared_meta,
            "file_id": file_id,
            "file_path": file_path,
            "chunk_index": piece["chunk_index"],
            "total_chunks": piece["total_chunks"],
            "content_length": len(piece["content"]),
        }
        for piece in chunks
    ]

    # All chunks are embedded and stored in one batch.
    vectors = self.ef(texts)
    self._files.add(
        ids=doc_ids,
        embeddings=vectors,
        documents=texts,
        metadatas=chunk_metas,
    )

    return len(chunks)
|
|
214
|
+
|
|
215
|
+
def upsert_file(self, file_id, file_path, chunks, metadata=None):
    """Replace a file's stored vectors with freshly indexed chunks.

    Existing chunks for ``file_id`` are deleted first via ``delete_file``;
    the given chunks are then indexed via ``index_file``.

    Returns:
        The value returned by ``index_file`` (number of chunks indexed).
    """
    self.delete_file(file_id)
    indexed = self.index_file(file_id, file_path, chunks, metadata)
    return indexed
|
|
219
|
+
|
|
220
|
+
def search_files(
    self,
    query: str,
    n_results: int = 10,
    filter_metadata: Optional[Dict] = None,
) -> List[Dict]:
    """
    Semantic search across indexed file content.

    Args:
        query: Free-text query; embedded with ``self.ef`` before searching.
        n_results: Maximum number of chunks to request from the collection.
        filter_metadata: Optional metadata filter passed through as the
            collection query's ``where`` argument.

    Returns:
        List of dicts with file_id, file_path, chunk_index, total_chunks,
        content_snippet (first 500 characters of the chunk) and distance
        (None when the query result carries no distances).
    """
    query_embedding = self.ef([query])[0]
    results = self._files.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        where=filter_metadata,
    )

    id_batches = results.get("ids")
    if not id_batches:
        return []

    # Exactly one query embedding was sent, so only batch 0 is relevant.
    metas = results["metadatas"][0]
    docs = results["documents"][0]
    # The "distances" key can be present with a None value, so a bare
    # membership test does not protect the [0] lookup; check truthiness.
    distance_batches = results.get("distances")
    distances = distance_batches[0] if distance_batches else [None] * len(id_batches[0])

    return [
        {
            "file_id": meta.get("file_id"),
            "file_path": meta["file_path"],
            "chunk_index": meta["chunk_index"],
            "total_chunks": meta["total_chunks"],
            "content_snippet": doc[:500],
            "distance": distance,
        }
        for meta, doc, distance in zip(metas, docs, distances)
    ]
|
|
254
|
+
|
|
255
|
+
def delete_file(self, file_id: int) -> None:
    """Remove every chunk in the files collection whose ``file_id`` metadata matches."""
    matching_file = {"file_id": file_id}
    self._files.delete(where=matching_file)
|
|
258
|
+
|
|
259
|
+
def get_file_stats(self) -> Dict:
    """Return chunk count, collection name and embedding size for files.

    Returns:
        Dict with ``total_chunks``, ``collection_name``, ``model`` and
        ``embedding_dimensions``. ``model`` holds the embedding dimension
        (not a model name) and is kept for existing callers;
        ``embedding_dimensions`` carries the same value under the key that
        ``get_chat_stats`` uses.
    """
    embedding_dim = self._embedding_dim
    return {
        "total_chunks": self._files.count(),
        "collection_name": self._files.name,
        "model": embedding_dim,
        "embedding_dimensions": embedding_dim,
    }
|
|
266
|
+
|
|
267
|
+
# ------------------------------------------------------------------
|
|
268
|
+
# Chat operations
|
|
269
|
+
# ------------------------------------------------------------------
|
|
270
|
+
|
|
271
|
+
def index_chat_message(
    self,
    message_id: int,
    chat_id: int,
    content: str,
    metadata: Dict,
) -> int:
    """
    Index a single chat message, auto-chunking if needed.

    The message text is split with ``chunk_content``; all resulting chunks
    are embedded in one call and added to the chats collection in one batch.

    Args:
        message_id: ID of the message; part of every chunk's document ID.
        chat_id: ID of the chat the message belongs to.
        content: Message text. Empty or whitespace-only text is skipped.
        metadata: Extra metadata copied onto every chunk (None is treated as
            empty). The keys written here (message_id, chat_id, chunk_index,
            total_chunks, content_length) take precedence over same-named
            keys in it, so the delete-by-message/chat filters keep matching.

    Returns:
        Number of chunks indexed.
    """
    if not content or not content.strip():
        return 0

    from footprinter.semantic.chunking import chunk_content

    # Each chunk is a (text, chunk_index, total_chunks) triple.
    chunks = chunk_content(content)
    if not chunks:
        return 0

    extra_meta = metadata or {}
    texts = [text for text, _index, _total in chunks]
    doc_ids = [f"msg_{message_id}_chunk_{index}" for _text, index, _total in chunks]
    metas = [
        {
            **extra_meta,
            "message_id": message_id,
            "chat_id": chat_id,
            "chunk_index": index,
            "total_chunks": total,
            "content_length": len(text),
        }
        for text, index, total in chunks
    ]

    self._chats.add(
        ids=doc_ids,
        embeddings=self.ef(texts),
        documents=texts,
        metadatas=metas,
    )

    return len(chunks)
|
|
312
|
+
|
|
313
|
+
def upsert_chat_message(self, message_id, chat_id, content, metadata):
    """Re-index a message after a best-effort purge of its existing vectors.

    A failure while deleting the old vectors is logged (with traceback) and
    does not prevent the re-index.

    Returns:
        The value returned by ``index_chat_message`` (number of chunks indexed).
    """
    stale_filter = {"message_id": message_id}
    try:
        self._chats.delete(where=stale_filter)
    except Exception:  # Intentional broad catch: ChromaDB operations are best-effort cleanup
        logger.warning(
            "Failed to delete existing vectors for message_id=%s before re-index",
            message_id,
            exc_info=True,
        )
    indexed = self.index_chat_message(message_id, chat_id, content, metadata)
    return indexed
|
|
324
|
+
|
|
325
|
+
def index_chat_info(
    self,
    chat_id: int,
    title: str,
    summary: Optional[str],
    source: str,
    created_at: str,
    message_count: int,
) -> bool:
    """
    Index chat title+summary as a searchable document.

    Uses upsert so it can be re-indexed safely.

    Args:
        chat_id: ID of the chat; the document ID is ``chat_info_<chat_id>``.
        title: Chat title; a falsy title is indexed as "(untitled)".
        summary: Optional summary; included in the text only when truthy.
        source: Origin of the chat; a falsy source is indexed as "unknown"
            in both the searchable text and the metadata.
        created_at: Creation timestamp stored in metadata ("" when falsy).
        message_count: Message count stored in metadata (0 when falsy).

    Returns:
        Always True once the upsert call has returned.
    """
    # Normalise once so the embedded text and the metadata agree (and the
    # text never contains a literal "None").
    display_title = title or "(untitled)"
    source_label = source or "unknown"

    text_parts = [f"Chat: {display_title}"]
    if summary:
        text_parts.append(f"Summary: {summary}")
    text_parts.append(f"Source: {source_label}")

    searchable_text = "\n\n".join(text_parts)
    doc_id = f"chat_info_{chat_id}"
    embedding = self.ef([searchable_text])[0]

    metadata = {
        "chat_id": chat_id,
        "chat_title": display_title[:200],
        "source": source_label,
        "created_at": created_at or "",
        "message_count": message_count or 0,
        "chunk_type": "chat_info",
        "has_summary": bool(summary),
    }

    self._chats.upsert(
        ids=[doc_id],
        embeddings=[embedding],
        documents=[searchable_text],
        metadatas=[metadata],
    )
    return True
|
|
365
|
+
|
|
366
|
+
def search_chats(
    self,
    query: str,
    n_results: int = 20,
    source: Optional[str] = None,
    role: Optional[str] = None,
    min_score: float = 0.3,
) -> List[Dict]:
    """
    Hybrid search combining semantic + FTS5 keyword search via RRF.

    Args:
        query: Free-text query; queries shorter than 3 characters return [].
        n_results: Maximum number of results to return. The semantic stage
            requests three times this many chunks and the keyword stage twice
            this many, before fusion.
        source: Optional filter on the ``source`` metadata; also passed to
            the keyword search as ``account``.
        role: Optional filter on the ``role`` metadata (semantic stage only).
        min_score: Fused results whose ``relevance_score`` is below this
            value are dropped.

    Returns:
        List of result dicts with chat_id, chat_title,
        relevance_score, snippet, etc.
    """
    if not query or len(query) < 3:
        return []

    from footprinter.paths import get_db_path
    from footprinter.semantic.hybrid_search import (
        extract_snippet,
        keyword_search,
        reciprocal_rank_fusion,
    )

    # 1. Semantic search via ChromaDB
    query_embedding = self.ef([query])[0]

    conditions = []
    if source:
        conditions.append({"source": source})
    if role:
        conditions.append({"role": role})
    if not conditions:
        where_filter = None
    elif len(conditions) == 1:
        where_filter = conditions[0]
    else:
        where_filter = {"$and": conditions}

    semantic_raw = self._chats.query(
        query_embeddings=[query_embedding],
        n_results=n_results * 3,
        where=where_filter,
    )

    semantic_results = []
    best_by_chat = {}

    id_batches = semantic_raw.get("ids")
    if id_batches:
        # Exactly one query embedding was sent, so only batch 0 is relevant.
        metas = semantic_raw["metadatas"][0]
        docs = semantic_raw["documents"][0]
        # The "distances" key can be present with a None value, so test
        # truthiness rather than membership; fall back to distance 0.
        distance_batches = semantic_raw.get("distances")
        distances = distance_batches[0] if distance_batches else [0] * len(id_batches[0])

        for meta, content, distance in zip(metas, docs, distances):
            relevance = max(0, 1 - (distance / 2))
            chat_id = meta.get("chat_id")

            # Skip a chunk unless it beats the best relevance already
            # recorded for its chat.
            if chat_id in best_by_chat and relevance <= best_by_chat[chat_id]:
                continue
            best_by_chat[chat_id] = relevance

            chunk_type = meta.get("chunk_type", "message")
            snippet = extract_snippet(content, query)

            semantic_results.append(
                {
                    "chat_id": chat_id,
                    "chat_title": meta.get("chat_title", "(untitled)"),
                    "message_id": meta.get("message_id"),
                    "role": meta.get(
                        "role",
                        "info" if chunk_type == "chat_info" else "unknown",
                    ),
                    "source": meta.get("source", "unknown"),
                    "created_at": meta.get("created_at", ""),
                    "snippet": snippet,
                    "relevance_score": round(relevance, 3),
                    "chunk_type": chunk_type,
                    "chunk_index": meta.get("chunk_index", 0),
                    "total_chunks": meta.get("total_chunks", 1),
                }
            )

    # 2. Keyword search via FTS5
    keyword_results = keyword_search(query, db_path=str(get_db_path()), account=source, limit=n_results * 2)

    # 3. Combine with RRF
    combined = reciprocal_rank_fusion(semantic_results, keyword_results)

    filtered = [r for r in combined if r["relevance_score"] >= min_score]
    return filtered[:n_results]
|
|
456
|
+
|
|
457
|
+
def delete_message(self, message_id: int) -> None:
    """Remove every chunk in the chats collection whose ``message_id`` metadata matches."""
    matching_message = {"message_id": message_id}
    self._chats.delete(where=matching_message)
|
|
460
|
+
|
|
461
|
+
def delete_chat(self, chat_id: int) -> None:
    """Remove every document in the chats collection whose ``chat_id`` metadata matches.

    Message chunks and the chat-info document both carry ``chat_id`` in their
    metadata, so one filtered delete covers them all.
    """
    matching_chat = {"chat_id": chat_id}
    self._chats.delete(where=matching_chat)
|
|
464
|
+
|
|
465
|
+
def get_chat_stats(self) -> Dict:
    """Summarise the chats collection: document count, name and embedding size."""
    chats = self._chats
    stats = {"total_documents": chats.count()}
    stats["collection_name"] = chats.name
    stats["embedding_dimensions"] = self._embedding_dim
    return stats
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""footprinter.services — shared service layer between interfaces and repository.
|
|
2
|
+
|
|
3
|
+
Service function signature pattern:
|
|
4
|
+
|
|
5
|
+
def get_thing(conn: sqlite3.Connection, *, role: Role = Role.ADMIN, ...) -> dict:
|
|
6
|
+
...
|
|
7
|
+
|
|
8
|
+
Every service function:
|
|
9
|
+
- Takes ``conn: sqlite3.Connection`` as first positional arg
|
|
10
|
+
- Takes ``role: Role`` as a keyword arg (default ``Role.ADMIN``)
|
|
11
|
+
- Returns plain ``dict`` (matching the repository layer convention in footprinter.db)
|
|
12
|
+
- Uses keyword-only args for filters and options
|
|
13
|
+
|
|
14
|
+
Interface layers assign the role:
|
|
15
|
+
- CLI passes ``Role.ADMIN`` (full access, local user)
|
|
16
|
+
- MCP passes ``Role.VIEWER`` (read-only, filtered metadata)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from footprinter.services import (
|
|
20
|
+
access_service,
|
|
21
|
+
chat_service,
|
|
22
|
+
client_service,
|
|
23
|
+
content_service,
|
|
24
|
+
email_service,
|
|
25
|
+
file_service,
|
|
26
|
+
folder_service,
|
|
27
|
+
project_service,
|
|
28
|
+
search_service,
|
|
29
|
+
semantic_service,
|
|
30
|
+
status_service,
|
|
31
|
+
visit_service,
|
|
32
|
+
)
|
|
33
|
+
from footprinter.services.roles import Role
|
|
34
|
+
|
|
35
|
+
# Names exported by ``from footprinter.services import *``: ``Role`` plus the
# twelve service modules imported above.
__all__ = [
    "Role",
    "access_service",
    "client_service",
    "project_service",
    "file_service",
    "folder_service",
    "chat_service",
    "content_service",
    "email_service",
    "visit_service",
    "status_service",
    "search_service",
    "semantic_service",
]
|