footprinter-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +444 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/cli/__init__.py +128 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +332 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +579 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +1836 -0
- footprinter/cli/status.py +729 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +610 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +741 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +515 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +328 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +261 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +125 -0
- footprinter/ingest/pipe_runner.py +217 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +201 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +57 -0
- footprinter/mcp/errors.py +102 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +15 -0
- footprinter/paths.py +91 -0
- footprinter/permissions.py +1160 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1272 -0
- footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0.dist-info/METADATA +229 -0
- footprinter_cli-1.0.0.dist-info/RECORD +134 -0
- footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text extraction from document formats for MCP read tool.
|
|
3
|
+
|
|
4
|
+
Extracts plaintext from binary document data (.docx, .pdf, .xlsx, .pptx, .csv, .tsv).
|
|
5
|
+
Sanitizes output to only include visible text (no comments, tracked changes, formulas).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import csv
|
|
9
|
+
import io
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Optional, Tuple
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# Map file extensions to extractor types
EXTENSION_MAP = {
    ".pdf": "pdf",
    ".docx": "docx",
    ".xlsx": "xlsx",
    ".pptx": "pptx",
    ".csv": "csv",
    ".tsv": "tsv",
}

# Map MIME types to extractor types
MIME_MAP = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
    "text/csv": "csv",
    "text/tab-separated-values": "tsv",
    # Google Workspace types (handled as text after Drive export)
    "application/vnd.google-apps.document": "text",
    "application/vnd.google-apps.spreadsheet": "text",
    "application/vnd.google-apps.presentation": "text",
}


def get_extractor_for_file(name: str, mime_type: str = "") -> Optional[str]:
    """
    Determine the extractor type for a file.

    The MIME type is consulted first; when it is absent or not listed in
    MIME_MAP, the text after the last dot of ``name`` (lower-cased) is looked
    up in EXTENSION_MAP.

    Args:
        name: Filename with extension
        mime_type: MIME type hint (optional). Matched case-insensitively and
            without parameters, so "Text/CSV; charset=utf-8" selects "csv".

    Returns:
        Extractor type string ('pdf', 'docx', etc.) or None if no extraction needed
    """
    if mime_type:
        # Media types are case-insensitive and may carry parameters such as
        # "; charset=utf-8" -- compare on the bare type/subtype only.
        media_type = mime_type.split(";", 1)[0].strip().lower()
        extractor = MIME_MAP.get(media_type)
        if extractor is not None:
            return extractor

    # Fall back to extension
    if "." not in name:
        return None
    ext = "." + name.rsplit(".", 1)[-1].lower()
    return EXTENSION_MAP.get(ext)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def extract_text(data: bytes, extractor_type: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract text from binary document data.

    Dispatches to the private extractor registered for ``extractor_type`` and
    converts any exception it raises into an error string, so this function
    reports failures through its return value.

    Args:
        data: Raw file bytes
        extractor_type: Type of extractor to use ('pdf', 'docx', etc.)

    Returns:
        Tuple of (extracted_text, error_message)
        - On success: (text, None)
        - On failure: (None, error_description); the description is never empty
    """
    extractors = {
        "pdf": _extract_pdf,
        "docx": _extract_docx,
        "xlsx": _extract_xlsx,
        "pptx": _extract_pptx,
        "csv": _extract_csv,
        "tsv": _extract_tsv,
        "text": _extract_text,
    }

    extractor = extractors.get(extractor_type)
    if not extractor:
        return None, f"Unknown extractor type: {extractor_type}"

    try:
        return extractor(data), None
    except ImportError as e:
        # The format libraries are imported inside each extractor, so a missing
        # optional dependency surfaces here rather than at module import time.
        return None, f"Missing dependency: {e}"
    except Exception as e:
        logger.error("Extraction error (%s): %s", extractor_type, e, exc_info=True)
        # Some exceptions stringify to "" (e.g. a bare KeyError()); fall back to
        # the class name so callers testing the error slot still see a failure.
        return None, str(e) or type(e).__name__
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _extract_pdf(data: bytes) -> str:
    """Extract text from PDF bytes.

    Calls ``extract_text()`` on every page in order and joins the non-empty
    results with a blank line between pages.
    """
    import pypdf

    pdf = pypdf.PdfReader(io.BytesIO(data))
    per_page = (page.extract_text() for page in pdf.pages)
    # Pages that yield no text are dropped so they add no blank blocks.
    return "\n\n".join(chunk for chunk in per_page if chunk)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _extract_docx(data: bytes) -> str:
    """
    Extract text from DOCX bytes.

    Collects the non-blank entries of ``doc.paragraphs`` first, then one line
    per table row from ``doc.tables`` (non-blank cells, stripped and joined
    with " | "). The collected pieces are separated by blank lines. Nothing
    else in the document is read here.
    """
    import docx

    document = docx.Document(io.BytesIO(data))

    blocks = [para.text for para in document.paragraphs if para.text.strip()]

    # Also extract text from tables
    for table in document.tables:
        for row in table.rows:
            stripped_cells = (cell.text.strip() for cell in row.cells)
            cells = [value for value in stripped_cells if value]
            if cells:
                blocks.append(" | ".join(cells))

    return "\n\n".join(blocks)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _extract_xlsx(data: bytes) -> str:
    """
    Extract text from XLSX bytes.

    Every sheet listed in ``wb.sheetnames`` is emitted as a "=== name ==="
    header followed by one line per non-empty row, with the non-None cell
    values joined by " | ". The workbook is loaded with ``data_only=True`` so
    that cell values are read rather than formula text.

    NOTE(review): this function does not check sheet, row or column visibility
    itself -- confirm whether hidden data needs to be filtered out here.
    """
    import openpyxl

    wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True, data_only=True)
    text_parts = []

    try:
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            text_parts.append(f"=== {sheet_name} ===")

            for row in sheet.iter_rows(values_only=True):
                # Filter out None values and convert to strings
                row_values = [str(cell) for cell in row if cell is not None]
                if row_values:
                    text_parts.append(" | ".join(row_values))
    finally:
        # Read-only workbooks must be closed explicitly; do it even when a
        # sheet fails to parse so the workbook is not left open.
        wb.close()

    return "\n".join(text_parts)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _extract_pptx(data: bytes) -> str:
    """
    Extract text from PPTX bytes.

    For each slide a "--- Slide N ---" header is emitted (N starts at 1),
    followed by the text of every shape in ``slide.shapes`` that exposes a
    non-blank ``text`` attribute. Only ``slide.shapes`` is read here.
    """
    from pptx import Presentation

    deck = Presentation(io.BytesIO(data))
    sections = []

    for number, slide in enumerate(deck.slides, start=1):
        # The header is written even when the slide turns out to have no text.
        sections.append(f"--- Slide {number} ---")

        shape_texts = [
            shape.text
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]
        if shape_texts:
            sections.append("\n".join(shape_texts))

    return "\n\n".join(sections)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _extract_csv(data: bytes) -> str:
|
|
188
|
+
"""Extract text from CSV bytes."""
|
|
189
|
+
# Try to decode as UTF-8 first, fall back to latin-1
|
|
190
|
+
try:
|
|
191
|
+
text = data.decode("utf-8")
|
|
192
|
+
except UnicodeDecodeError:
|
|
193
|
+
text = data.decode("latin-1")
|
|
194
|
+
|
|
195
|
+
# Parse and format as readable text
|
|
196
|
+
reader = csv.reader(io.StringIO(text))
|
|
197
|
+
rows = []
|
|
198
|
+
for row in reader:
|
|
199
|
+
if any(cell.strip() for cell in row):
|
|
200
|
+
rows.append(" | ".join(row))
|
|
201
|
+
|
|
202
|
+
return "\n".join(rows)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _extract_tsv(data: bytes) -> str:
|
|
206
|
+
"""Extract text from TSV bytes."""
|
|
207
|
+
try:
|
|
208
|
+
text = data.decode("utf-8")
|
|
209
|
+
except UnicodeDecodeError:
|
|
210
|
+
text = data.decode("latin-1")
|
|
211
|
+
|
|
212
|
+
reader = csv.reader(io.StringIO(text), delimiter="\t")
|
|
213
|
+
rows = []
|
|
214
|
+
for row in reader:
|
|
215
|
+
if any(cell.strip() for cell in row):
|
|
216
|
+
rows.append(" | ".join(row))
|
|
217
|
+
|
|
218
|
+
return "\n".join(rows)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _extract_text(data: bytes) -> str:
|
|
222
|
+
"""Pass-through for already-text content (e.g., Google Workspace exports)."""
|
|
223
|
+
try:
|
|
224
|
+
return data.decode("utf-8")
|
|
225
|
+
except UnicodeDecodeError:
|
|
226
|
+
return data.decode("latin-1")
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Footprinter MCP server — permission-gated access to indexed data and content."""
|
|
2
|
+
|
|
3
|
+
from mcp.server.fastmcp import FastMCP
|
|
4
|
+
|
|
5
|
+
from footprinter.mcp.tools.navigation import footprinter_client, footprinter_folder, footprinter_project
|
|
6
|
+
from footprinter.mcp.tools.read import footprinter_read
|
|
7
|
+
from footprinter.mcp.tools.search import footprinter_search
|
|
8
|
+
from footprinter.mcp.tools.status import footprinter_status
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from footprinter.mcp.tools.semantic import _SEMANTIC_AVAILABLE, footprinter_semantic
|
|
12
|
+
except ImportError:
|
|
13
|
+
_SEMANTIC_AVAILABLE = False
|
|
14
|
+
|
|
15
|
+
_server = None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _build_server():
    """Create the FastMCP instance, register every tool on it and return it.

    The instance is also stored in the module-level ``_server``. The semantic
    tool is registered only when ``_SEMANTIC_AVAILABLE`` is true.
    """
    global _server
    _server = FastMCP("footprinter")

    # Registration order is kept: status, search, navigation, semantic, read.
    tools = [
        footprinter_status,
        footprinter_search,
        footprinter_project,
        footprinter_client,
        footprinter_folder,
    ]
    if _SEMANTIC_AVAILABLE:
        tools.append(footprinter_semantic)
    tools.append(footprinter_read)

    for tool in tools:
        _server.tool()(tool)
    return _server
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def main():
    """Launch the Footprinter MCP server."""
    # _build_server() returns the same object it stores in the module global.
    server = _build_server()
    server.run()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
if __name__ == "__main__":
|
|
39
|
+
main()
|
|
File without changes
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Navigation tools: projects, clients, folders.
|
|
2
|
+
|
|
3
|
+
Thin MCP adapters — all query/filtering logic lives in the service layer.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from footprinter.mcp.db import get_db, handle_db_errors
|
|
9
|
+
from footprinter.mcp.errors import mcp_error
|
|
10
|
+
from footprinter.services import client_service, folder_service, project_service
|
|
11
|
+
from footprinter.services.roles import Role
|
|
12
|
+
|
|
13
|
+
HOME = str(Path.home())
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _shorten(path: str) -> str:
|
|
17
|
+
if path and path.startswith(HOME):
|
|
18
|
+
return "~" + path[len(HOME) :]
|
|
19
|
+
return path or ""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@handle_db_errors
def footprinter_project(project_name: str) -> dict:
    """Get project metadata, file counts, and basic stats."""
    with get_db() as conn:
        project = project_service.resolve_by_name(conn, project_name, role=Role.VIEWER)
        if project is None:
            return mcp_error("NOT_FOUND", internal_message=f"project search: {project_name}")

        # A result flagged "disambiguation" is handed back untouched; only a
        # resolved project has its paths shortened for MCP display.
        if not project.get("disambiguation"):
            if "root_path" in project:
                project["root_path"] = _shorten(project["root_path"])
            for folder in project.get("folders", []):
                if "path" in folder:
                    folder["path"] = _shorten(folder["path"])
        return project
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@handle_db_errors
def footprinter_client(client_name: str) -> dict:
    """Get group info with all projects and aggregate stats."""
    with get_db() as conn:
        client = client_service.resolve_by_name(conn, client_name, role=Role.VIEWER)
        if client is None:
            return mcp_error("NOT_FOUND", internal_message=f"client search: {client_name}")

        # A result flagged "disambiguation" is handed back untouched; otherwise
        # each listed project's root path is shortened for MCP display.
        if not client.get("disambiguation"):
            for project in client.get("projects", []):
                if "root_path" in project:
                    project["root_path"] = _shorten(project["root_path"])
        return client
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@handle_db_errors
def footprinter_folder(path: str) -> dict:
    """Get folder contents and metadata."""
    # Undo the "~" abbreviation before the lookup.
    # NOTE(review): any leading "~" is replaced, including "~user/..." forms --
    # confirm that is intended.
    absolute = HOME + path[1:] if path.startswith("~") else path

    with get_db() as conn:
        folder = folder_service.get_by_path(conn, absolute, role=Role.VIEWER)
        if folder is None:
            return mcp_error("NOT_FOUND", internal_message=f"folder: {absolute}")

        if "path" in folder:
            folder["path"] = _shorten(folder["path"])
        # Subfolders always end up with a "path" key, "" when none was present.
        for child in folder.get("subfolders", []):
            child["path"] = _shorten(child.get("path", ""))
        return folder
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Read tool: fetch content for permitted items."""
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
from footprinter.mcp.db import get_db, handle_db_errors
|
|
6
|
+
from footprinter.mcp.errors import mcp_error
|
|
7
|
+
from footprinter.services import access_service, content_service
|
|
8
|
+
from footprinter.services.roles import Role
|
|
9
|
+
|
|
10
|
+
# Map service status codes to MCP error codes
_STATUS_TO_MCP = {
    "hidden": "NOT_FOUND",
    "not_found": "NOT_FOUND",
    "opaque": "VISIBILITY_RESTRICTED",
    "denied": "PERMISSION_DENIED",
    "invalid_type": "INVALID_TYPE",
    "read_failed": "READ_FAILED",
    "decode_failed": "DECODE_FAILED",
    "extraction_failed": "EXTRACTION_FAILED",
}


@handle_db_errors
def footprinter_read(
    item_type: str,
    item_id: int,
    format: Literal["text", "raw"] = "text",
) -> dict:
    """Read content for a file, email, or chat if Claude has permission.

    Args:
        item_type: 'file', 'email', or 'chat'
        item_id: Row ID of the item.
        format: 'text' (default) extracts plaintext from documents,
            'raw' returns raw decoded content without extraction.

    Returns:
        Dict with 'content' and 'metadata' on success,
        or 'error', 'error_code', and 'metadata' on denial/failure.
    """
    with get_db() as conn:
        gate = access_service.gate_access(conn, item_type, item_id, role=Role.VIEWER)
        gate_status = gate.get("status")

        # Anything other than "ok" becomes an MCP error; statuses missing from
        # the table are reported as READ_FAILED.
        if gate_status != "ok":
            return mcp_error(
                _STATUS_TO_MCP.get(gate_status, "READ_FAILED"),
                metadata=gate.get("metadata"),
                internal_message=gate.get("message"),
            )

        item_metadata = gate["metadata"]

        # Email/chat: content already in result
        if item_type != "file":
            return {"content": gate.get("content", ""), "metadata": item_metadata}

        # File: read content via service I/O
        outcome = content_service.read_file(conn, item_metadata, format=format)
        if outcome.get("status") == "ok":
            return {"content": outcome["content"], "metadata": outcome["metadata"]}

        return mcp_error(
            _STATUS_TO_MCP.get(outcome["status"], "READ_FAILED"),
            # Prefer metadata from the read attempt, else the gate's.
            metadata=outcome.get("metadata", item_metadata),
            internal_message=outcome.get("message"),
        )
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""Search tool: query across data sources."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from footprinter.mcp.db import get_db, handle_db_errors
|
|
7
|
+
from footprinter.services import search_service
|
|
8
|
+
from footprinter.services.roles import Role
|
|
9
|
+
|
|
10
|
+
HOME = str(Path.home())
|
|
11
|
+
|
|
12
|
+
# Display names for source keys in summary text
|
|
13
|
+
_SOURCE_LABELS = {
|
|
14
|
+
"files": ("file", "files"),
|
|
15
|
+
"emails": ("email", "emails"),
|
|
16
|
+
"chats": ("chat", "chats"),
|
|
17
|
+
"browser": ("browser result", "browser results"),
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _build_search_summary(results: dict, query: str, sources: list[str]) -> str:
|
|
22
|
+
"""Build a human-readable summary of search results."""
|
|
23
|
+
found_parts = []
|
|
24
|
+
empty_parts = []
|
|
25
|
+
|
|
26
|
+
for source in sources:
|
|
27
|
+
items = results.get(source, [])
|
|
28
|
+
singular, plural = _SOURCE_LABELS.get(source, (source, source))
|
|
29
|
+
count = len(items)
|
|
30
|
+
if count > 0:
|
|
31
|
+
label = singular if count == 1 else plural
|
|
32
|
+
found_parts.append(f"{count} {label}")
|
|
33
|
+
else:
|
|
34
|
+
empty_parts.append(plural)
|
|
35
|
+
|
|
36
|
+
total_suppressed = results.get("suppressed", 0)
|
|
37
|
+
|
|
38
|
+
if found_parts:
|
|
39
|
+
query_part = f" matching '{query}'" if query and query.strip() else ""
|
|
40
|
+
summary = f"Found {', '.join(found_parts)}{query_part}."
|
|
41
|
+
if empty_parts:
|
|
42
|
+
summary += f" No {' or '.join(empty_parts)} matched."
|
|
43
|
+
else:
|
|
44
|
+
query_part = f" for '{query}'" if query and query.strip() else ""
|
|
45
|
+
summary = (
|
|
46
|
+
f"No results{query_part}. "
|
|
47
|
+
f"Tips: try single keywords, use footprinter_semantic "
|
|
48
|
+
f"for semantic matching, or browse recent items with date_from/date_to "
|
|
49
|
+
f"and no query."
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
if total_suppressed > 0:
|
|
53
|
+
item_word = "item" if total_suppressed == 1 else "items"
|
|
54
|
+
summary += f" ({total_suppressed} {item_word} hidden by visibility policy)"
|
|
55
|
+
|
|
56
|
+
return summary
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _shorten_path(path: str) -> str:
    """Abbreviate a path under the user's home directory to "~"-relative form.

    Args:
        path: Path string; may be empty or None.

    Returns:
        "~" plus the remainder when ``path`` is HOME itself or lies beneath it,
        the path unchanged when it is elsewhere, and "" for an empty/None path.
    """
    import os

    if not path:
        return ""
    if path.startswith(HOME):
        remainder = path[len(HOME) :]
        separators = (os.sep,) if os.altsep is None else (os.sep, os.altsep)
        # Only shorten on a path-component boundary: with HOME "/home/al" a
        # plain prefix test would turn "/home/alice/x" into "~ice/x".
        if not remainder or remainder.startswith(separators):
            return "~" + remainder
    return path
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@handle_db_errors
def footprinter_search(
    query: str = "",
    sources: Optional[list[str]] = None,
    project: Optional[str] = None,
    client: Optional[str] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    limit: int = 50,
    account: Optional[str] = None,
    sender: Optional[str] = None,
    days_back: Optional[int] = None,
    folder: Optional[str] = None,
    mime_type: Optional[str] = None,
) -> dict:
    """Search across indexed sources by keyword. Returns metadata only, no file content.

    SEARCH BEHAVIOR:
    - Matches against file names, email subjects/senders, chat titles, and
      browser page titles/URLs depending on which sources are included.
    - Multi-word queries use AND logic: every term must appear. "project report" only returns
      items containing both "project" AND "report".
    - Terms shorter than 2 characters are ignored.
    - When query is empty, returns the most recent items (sorted by date descending).
      Combine with date_from/date_to to browse a specific time range.

    QUERY TIPS:
    - Use 1-3 short, specific keywords. Each additional word narrows results further.
    - Avoid long natural-language phrases — they produce too many AND terms and return nothing.
    - Good: "salesforce proposal" — Bad: "nonprofit technology consulting Salesforce partner"
    - To search by time period, leave query empty and use date_from/date_to.

    WHEN TO USE THIS vs footprinter_semantic:
    - Use THIS tool for keyword/metadata lookups: finding files by name, emails by subject,
      chats by title, or browsing recent activity across all sources.
    - Use footprinter_semantic when you want meaning-based search across
      chat or file content (e.g., "discussions about authentication architecture").

    SOURCE-SPECIFIC FILTERS:
    - account: Filter by account name. Applies to emails and files.
    - sender: Partial match on email sender name or address. Applies to emails only.
    - days_back: Only include emails from the last N days. Applies to emails only.
    - folder: Filter files by path prefix (e.g. "~/Work/projects"). Applies to files only.
    - mime_type: Exact MIME type match (e.g. "application/pdf"). Applies to files only.
    Source-specific filters are silently ignored when the relevant source is not being searched.

    Args:
        query: Keyword(s) matched against names, titles, subjects. Empty = list recent.
        sources: Which sources to search. Default: all.
            Options: "files", "emails", "chats", "browser".
        project: Filter to a project name (exact match, applies to files, emails, chats).
        client: Filter to a client name (exact match, applies to files, emails, chats).
        date_from: ISO date string lower bound (e.g. "2026-02-01").
        date_to: ISO date string upper bound (e.g. "2026-02-14").
        limit: Max results per source (default 50).
        account: Filter by account (e.g. "personal", "work"). Applies to emails and files.
        sender: Partial match on sender name or address (e.g. "alice"). Emails only.
        days_back: Only include emails from the last N days. Emails only.
        folder: Filter files under this path prefix (e.g. "~/Work/projects"). Files only.
        mime_type: Exact MIME type filter (e.g. "application/pdf"). Files only.

    Returns:
        Dict with keys per source (e.g. "files", "emails", "chats", "browser"),
        each containing a list of result dicts. Includes a "summary" key with
        a human-readable overview of what was found.
    """
    # A missing or empty list means "search every source".
    sources = sources or ["files", "emails", "chats", "browser"]

    # NOTE(review): `folder` is forwarded exactly as given; confirm that
    # search_service expands a leading "~" as the docstring examples suggest.
    filters = {
        "query": query,
        "sources": sources,
        "project": project,
        "client": client,
        "date_from": date_from,
        "date_to": date_to,
        "limit": limit,
        "account": account,
        "sender": sender,
        "days_back": days_back,
        "folder": folder,
        "mime_type": mime_type,
    }

    with get_db() as conn:
        found = search_service.search(conn, role=Role.VIEWER, **filters)

        # Shorten file paths for MCP display
        for entry in found.get("files", []):
            if "path" in entry:
                entry["path"] = _shorten_path(entry["path"])

        found["summary"] = _build_search_summary(found, query, sources)
        return found
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Semantic search tool — thin adapter over semantic_service."""
|
|
2
|
+
|
|
3
|
+
from footprinter.mcp.db import get_db, handle_db_errors
|
|
4
|
+
from footprinter.mcp.errors import mcp_error
|
|
5
|
+
from footprinter.services import semantic_service
|
|
6
|
+
from footprinter.services.roles import Role
|
|
7
|
+
|
|
8
|
+
_SEMANTIC_AVAILABLE = True
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@handle_db_errors
def footprinter_semantic(query: str, source: str = "all", limit: int = 10) -> dict:
    """Search chats and/or files by meaning using semantic (embedding-based) search.

    SEARCH BEHAVIOR:
    - Uses vector embeddings to find items with similar meaning, not exact keyword
      matches. "auth problems" can find chats about "login failures"; "revenue
      forecast" can find files about "financial projections".
    - Falls back to FTS5 keyword search per-collection when ML dependencies
      (ChromaDB, sentence-transformers) are unavailable. A note is included when
      fallback is active.
    - The ``source`` parameter controls which collections to search:
      "chats" (conversations only), "files" (file content only), or "all" (both).
    - ``limit`` applies per-collection: source="all" with limit=10 returns up to
      10 chats + 10 files.
    - Files are chunked during indexing; results are deduplicated so each file
      appears at most once (best-matching chunk).

    QUERY TIPS:
    - Natural language works best: describe what you're looking for conceptually.
    - Ideal query length is 3-10 words. Very short (1 word) or very long (10+ words)
      queries reduce relevance.
    - Good: "database migration strategies" — Bad: "database"
    - Good: "client onboarding process" — Bad: "how did we onboard that new client
      who came in last month through the partner referral program"

    WHEN TO USE THIS vs footprinter_search:
    - Use THIS tool for meaning-based search — finding items by topic even when you
      don't know the exact words used.
    - Use footprinter_search for exact keyword matches in names/subjects/titles, or
      to search non-semantic sources (emails, browser history).

    Args:
        query: Natural language search query (minimum 3 characters).
        source: Which collection(s) to search: "chats", "files", or "all" (default).
        limit: Max results per collection (default 10).

    Returns:
        Dict with source-specific keys. "chats" and/or "files" lists are present
        based on the source parameter. Only items with both visible access AND
        read permission appear (semantic matches are content-derived, so
        presence in results reveals content — per decision D2). Visible+allowed
        chats have: chat_id, chat_title, snippet (text excerpt showing why it
        matched), relevance_score, source, created_at, message_id.
        Visible+allowed files have: id, name, path, content_type, size_bytes,
        modified_at, relevance_score, snippet (from best-matching chunk).
        Opaque items have minimal fields only. Hidden and permission-denied
        items are excluded. Includes "summary" with a human-readable overview.
    """
    # Validate before opening DB connection
    query_length = len(query) if query else 0
    if query_length < 3:
        return mcp_error(
            "QUERY_INVALID",
            internal_message=f"query too short: {query_length}",
        )

    allowed_sources = ("chats", "files", "all")
    if source not in allowed_sources:
        return mcp_error(
            "INVALID_INPUT",
            hint="source must be 'chats', 'files', or 'all'",
        )

    with get_db() as conn:
        outcome = semantic_service.semantic_search(
            conn,
            query,
            role=Role.VIEWER,
            source=source,
            limit=limit,
        )
        return outcome
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Status tool: system overview via status_service.
|
|
2
|
+
|
|
3
|
+
Thin MCP adapter — all query logic lives in the service layer.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from footprinter.mcp.db import get_db, handle_db_errors
|
|
7
|
+
from footprinter.services import status_service
|
|
8
|
+
from footprinter.services.roles import Role
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@handle_db_errors
def footprinter_status() -> dict:
    """System status: record counts, sync times, and breakdowns for all data sources."""
    with get_db() as conn:
        # The service result is returned as-is; nothing is reshaped here.
        overview = status_service.get_status(conn, role=Role.VIEWER)
        return overview
|