footprinter-cli 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +431 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/bundled/samples/hidden-client-file-sample.txt +2 -0
  19. footprinter/bundled/samples/opaque-project-file-sample.txt +2 -0
  20. footprinter/bundled/samples/visible-file-sample.txt +2 -0
  21. footprinter/cli/__init__.py +135 -0
  22. footprinter/cli/__main__.py +6 -0
  23. footprinter/cli/_common.py +327 -0
  24. footprinter/cli/_policy_helpers.py +646 -0
  25. footprinter/cli/_prompt.py +220 -0
  26. footprinter/cli/_sample_seed.py +204 -0
  27. footprinter/cli/api_cmd.py +32 -0
  28. footprinter/cli/connect.py +591 -0
  29. footprinter/cli/data.py +879 -0
  30. footprinter/cli/delete.py +128 -0
  31. footprinter/cli/ingest.py +543 -0
  32. footprinter/cli/mcp_cmd.py +750 -0
  33. footprinter/cli/mcp_setup.py +306 -0
  34. footprinter/cli/search.py +393 -0
  35. footprinter/cli/search_cmd.py +69 -0
  36. footprinter/cli/setup.py +2001 -0
  37. footprinter/cli/status.py +747 -0
  38. footprinter/cli/status_cmd.py +104 -0
  39. footprinter/cli/upsert.py +794 -0
  40. footprinter/cli/vectorize_cmd.py +215 -0
  41. footprinter/cli/view.py +322 -0
  42. footprinter/connectors/__init__.py +171 -0
  43. footprinter/connectors/config_utils.py +141 -0
  44. footprinter/db/__init__.py +37 -0
  45. footprinter/db/browser.py +198 -0
  46. footprinter/db/chats.py +602 -0
  47. footprinter/db/clients.py +307 -0
  48. footprinter/db/emails.py +279 -0
  49. footprinter/db/files.py +724 -0
  50. footprinter/db/folders.py +659 -0
  51. footprinter/db/messages.py +192 -0
  52. footprinter/db/policies.py +151 -0
  53. footprinter/db/projects.py +673 -0
  54. footprinter/db/search.py +573 -0
  55. footprinter/db/sql_utils.py +168 -0
  56. footprinter/db/status.py +320 -0
  57. footprinter/db/uploads.py +70 -0
  58. footprinter/ingest/__init__.py +0 -0
  59. footprinter/ingest/adapters/__init__.py +33 -0
  60. footprinter/ingest/adapters/browser.py +54 -0
  61. footprinter/ingest/adapters/chat.py +57 -0
  62. footprinter/ingest/adapters/ingest.py +146 -0
  63. footprinter/ingest/adapters/local_files.py +68 -0
  64. footprinter/ingest/adapters/local_folders.py +52 -0
  65. footprinter/ingest/adapters/protocol.py +174 -0
  66. footprinter/ingest/browser_indexer.py +216 -0
  67. footprinter/ingest/chat_dedup.py +156 -0
  68. footprinter/ingest/chat_indexer.py +487 -0
  69. footprinter/ingest/chat_parsers/__init__.py +8 -0
  70. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  71. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  72. footprinter/ingest/cli.py +827 -0
  73. footprinter/ingest/content_extractors.py +117 -0
  74. footprinter/ingest/database.py +36 -0
  75. footprinter/ingest/db/__init__.py +1 -0
  76. footprinter/ingest/db/connector_schema.py +47 -0
  77. footprinter/ingest/db/migration.py +315 -0
  78. footprinter/ingest/db/schema.py +1043 -0
  79. footprinter/ingest/db/security.py +6 -0
  80. footprinter/ingest/file_indexer.py +223 -0
  81. footprinter/ingest/file_scanner.py +277 -0
  82. footprinter/ingest/folder_indexer.py +226 -0
  83. footprinter/ingest/full_content_extractor.py +321 -0
  84. footprinter/ingest/orchestrator.py +112 -0
  85. footprinter/ingest/pipe_runner.py +200 -0
  86. footprinter/ingest/processing.py +165 -0
  87. footprinter/ingest/registry.py +186 -0
  88. footprinter/ingest/run_record.py +91 -0
  89. footprinter/ingest/status.py +346 -0
  90. footprinter/mcp/__init__.py +0 -0
  91. footprinter/mcp/__main__.py +5 -0
  92. footprinter/mcp/db.py +67 -0
  93. footprinter/mcp/errors.py +105 -0
  94. footprinter/mcp/extraction.py +226 -0
  95. footprinter/mcp/server.py +39 -0
  96. footprinter/mcp/tools/__init__.py +0 -0
  97. footprinter/mcp/tools/navigation.py +70 -0
  98. footprinter/mcp/tools/read.py +75 -0
  99. footprinter/mcp/tools/search.py +158 -0
  100. footprinter/mcp/tools/semantic.py +79 -0
  101. footprinter/mcp/tools/status.py +19 -0
  102. footprinter/paths.py +117 -0
  103. footprinter/permissions.py +1152 -0
  104. footprinter/semantic/__init__.py +13 -0
  105. footprinter/semantic/chunking.py +52 -0
  106. footprinter/semantic/embeddings.py +23 -0
  107. footprinter/semantic/hybrid_search.py +273 -0
  108. footprinter/semantic/vector_store.py +471 -0
  109. footprinter/services/__init__.py +49 -0
  110. footprinter/services/access_service.py +342 -0
  111. footprinter/services/chat_service.py +85 -0
  112. footprinter/services/client_service.py +267 -0
  113. footprinter/services/content_service.py +181 -0
  114. footprinter/services/email_service.py +89 -0
  115. footprinter/services/file_service.py +83 -0
  116. footprinter/services/folder_service.py +122 -0
  117. footprinter/services/includes.py +19 -0
  118. footprinter/services/ingest_service.py +231 -0
  119. footprinter/services/project_service.py +262 -0
  120. footprinter/services/roles.py +25 -0
  121. footprinter/services/search_service.py +177 -0
  122. footprinter/services/semantic_service.py +360 -0
  123. footprinter/services/status_service.py +18 -0
  124. footprinter/services/visit_service.py +65 -0
  125. footprinter/source_registry.py +194 -0
  126. footprinter/utils/__init__.py +7 -0
  127. footprinter/utils/hash_utils.py +59 -0
  128. footprinter/utils/logging_config.py +68 -0
  129. footprinter/utils/mime.py +30 -0
  130. footprinter/utils/text.py +6 -0
  131. footprinter/utils/time.py +11 -0
  132. footprinter/visibility.py +1264 -0
  133. footprinter_cli-1.0.0rc1.dist-info/LICENSE +21 -0
  134. footprinter_cli-1.0.0rc1.dist-info/METADATA +223 -0
  135. footprinter_cli-1.0.0rc1.dist-info/RECORD +138 -0
  136. footprinter_cli-1.0.0rc1.dist-info/WHEEL +5 -0
  137. footprinter_cli-1.0.0rc1.dist-info/entry_points.txt +2 -0
  138. footprinter_cli-1.0.0rc1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,226 @@
1
+ """
2
+ Text extraction from document formats for MCP read tool.
3
+
4
+ Extracts plaintext from binary document data (.docx, .pdf, .xlsx, .pptx, .csv, .tsv).
5
+ Sanitizes output to only include visible text (no comments, tracked changes, formulas).
6
+ """
7
+
8
+ import csv
9
+ import io
10
+ import logging
11
+ from typing import Optional, Tuple
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# Map file extensions (lowercase, with leading dot) to extractor types
EXTENSION_MAP = {
    ".pdf": "pdf",
    ".docx": "docx",
    ".xlsx": "xlsx",
    ".pptx": "pptx",
    ".csv": "csv",
    ".tsv": "tsv",
}

# Map MIME types (lowercase, without parameters) to extractor types
MIME_MAP = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
    "text/csv": "csv",
    "text/tab-separated-values": "tsv",
    # Google Workspace types (handled as text after Drive export)
    "application/vnd.google-apps.document": "text",
    "application/vnd.google-apps.spreadsheet": "text",
    "application/vnd.google-apps.presentation": "text",
}


def get_extractor_for_file(name: str, mime_type: str = "") -> Optional[str]:
    """
    Determine the extractor type for a file.

    The MIME type is consulted first; if it is missing or not in MIME_MAP the
    filename extension is looked up in EXTENSION_MAP instead.

    Args:
        name: Filename with extension
        mime_type: MIME type hint (optional). Parameters such as
            "; charset=utf-8" and letter case are ignored.

    Returns:
        Extractor type string ('pdf', 'docx', etc.) or None if no extraction needed
    """
    if mime_type:
        # Media types are case-insensitive and may carry parameters, so reduce
        # the hint to its bare lowercase "type/subtype" before the lookup.
        mime_key = mime_type.split(";", 1)[0].strip().lower()
        if mime_key in MIME_MAP:
            return MIME_MAP[mime_key]

    # Fall back to extension (text after the last dot, case-insensitive)
    ext = "." + name.rsplit(".", 1)[-1].lower() if "." in name else ""
    return EXTENSION_MAP.get(ext)
58
+
59
+
60
def extract_text(data: bytes, extractor_type: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract text from binary document data.

    Args:
        data: Raw file bytes
        extractor_type: Type of extractor to use ('pdf', 'docx', etc.)

    Returns:
        Tuple of (extracted_text, error_message)
        - On success: (text, None)
        - On failure: (None, error_description); the description is never empty
    """
    # Built per call because the extractor functions are defined further down
    # in the module.
    extractors = {
        "pdf": _extract_pdf,
        "docx": _extract_docx,
        "xlsx": _extract_xlsx,
        "pptx": _extract_pptx,
        "csv": _extract_csv,
        "tsv": _extract_tsv,
        "text": _extract_text,
    }

    extractor = extractors.get(extractor_type)
    if not extractor:
        return None, f"Unknown extractor type: {extractor_type}"

    try:
        text = extractor(data)
    except ImportError as e:
        # The format libraries are imported inside each extractor, so a missing
        # optional dependency surfaces here.
        return None, f"Missing dependency: {e}"
    except Exception as e:
        # Boundary handler: any parser failure becomes an error tuple.
        logger.error("Extraction error (%s): %s", extractor_type, e)
        # Some exceptions stringify to ""; fall back to the class name so the
        # error slot is always truthy for callers that test it.
        return None, str(e) or type(e).__name__
    return text, None
95
+
96
+
97
def _extract_pdf(data: bytes) -> str:
    """Return the text of a PDF, one chunk per page, separated by blank lines.

    Pages for which ``extract_text()`` returns nothing are left out.
    """
    import pypdf

    document = pypdf.PdfReader(io.BytesIO(data))
    per_page = (page.extract_text() for page in document.pages)
    return "\n\n".join(chunk for chunk in per_page if chunk)
110
+
111
+
112
def _extract_docx(data: bytes) -> str:
    """
    Pull paragraph and table text out of DOCX bytes.

    Reads only ``paragraphs`` and ``tables`` of the document. Non-blank
    paragraphs come first, then one line per table row with its non-blank
    cells joined by " | ". Chunks are separated by blank lines.
    """
    import docx

    document = docx.Document(io.BytesIO(data))

    chunks = [para.text for para in document.paragraphs if para.text.strip()]

    # Table rows are appended after all paragraphs
    for table in document.tables:
        for row in table.rows:
            stripped_cells = (cell.text.strip() for cell in row.cells)
            filled = [value for value in stripped_cells if value]
            if filled:
                chunks.append(" | ".join(filled))

    return "\n\n".join(chunks)
135
+
136
+
137
def _extract_xlsx(data: bytes) -> str:
    """
    Extract text from XLSX bytes.

    The workbook is opened read-only with ``data_only=True``, so cell values
    are read rather than formulas. Each visible worksheet contributes a
    "=== name ===" header followed by one line per non-empty row, with cells
    joined by " | ". Sheets whose state is not "visible" are skipped, as are
    sheets without a cell grid (e.g. chart sheets).
    """
    import openpyxl

    wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True, data_only=True)
    text_parts = []

    try:
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]

            # Honor the "no hidden data" contract: hidden / veryHidden sheets
            # are not emitted.
            if getattr(sheet, "sheet_state", "visible") != "visible":
                continue
            # Chart sheets appear in sheetnames but cannot be iterated by row.
            if not hasattr(sheet, "iter_rows"):
                continue

            text_parts.append(f"=== {sheet_name} ===")

            for row in sheet.iter_rows(values_only=True):
                # Filter out None values and convert to strings
                row_values = [str(cell) for cell in row if cell is not None]
                if row_values:
                    text_parts.append(" | ".join(row_values))
    finally:
        # Read-only workbooks keep their source open until closed explicitly;
        # release it even when a sheet fails to parse.
        wb.close()

    return "\n".join(text_parts)
160
+
161
+
162
def _extract_pptx(data: bytes) -> str:
    """
    Collect shape text from every slide of a PPTX file.

    Each slide contributes a "--- Slide N ---" header (numbered from 1) and,
    when any shape on it exposes non-blank ``text``, those texts joined by
    newlines. Only ``slide.shapes`` is read.
    """
    from pptx import Presentation

    deck = Presentation(io.BytesIO(data))
    sections = []

    for number, slide in enumerate(deck.slides, start=1):
        sections.append(f"--- Slide {number} ---")

        # Shapes without a ``text`` attribute are ignored
        shape_texts = [
            shape.text
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]
        if shape_texts:
            sections.append("\n".join(shape_texts))

    return "\n\n".join(sections)
185
+
186
+
187
def _extract_csv(data: bytes) -> str:
    """Render CSV bytes as readable text, one line per non-blank row.

    The bytes are decoded as UTF-8 (a leading byte-order mark is dropped),
    falling back to Latin-1 when they are not valid UTF-8. Rows whose cells
    are all empty or whitespace are skipped; the remaining rows have their
    cells joined with " | ".
    """
    try:
        # "utf-8-sig" also decodes BOM-less UTF-8, but strips a leading BOM so
        # it does not end up glued to the first cell.
        text = data.decode("utf-8-sig")
    except UnicodeDecodeError:
        text = data.decode("latin-1")

    # newline="" hands line endings to the csv module untouched, which lets it
    # cope with "\r", "\n" and "\r\n" terminated rows alike.
    reader = csv.reader(io.StringIO(text, newline=""))
    rows = [" | ".join(row) for row in reader if any(cell.strip() for cell in row)]
    return "\n".join(rows)
203
+
204
+
205
def _extract_tsv(data: bytes) -> str:
    """Render tab-separated bytes as readable text, one line per non-blank row.

    The bytes are decoded as UTF-8 (a leading byte-order mark is dropped),
    falling back to Latin-1 when they are not valid UTF-8. Rows whose cells
    are all empty or whitespace are skipped; the remaining rows have their
    cells joined with " | ".
    """
    try:
        # "utf-8-sig" also decodes BOM-less UTF-8, but strips a leading BOM so
        # it does not end up glued to the first cell.
        text = data.decode("utf-8-sig")
    except UnicodeDecodeError:
        text = data.decode("latin-1")

    # newline="" hands line endings to the csv module untouched, which lets it
    # cope with "\r", "\n" and "\r\n" terminated rows alike.
    reader = csv.reader(io.StringIO(text, newline=""), delimiter="\t")
    rows = [" | ".join(row) for row in reader if any(cell.strip() for cell in row)]
    return "\n".join(rows)
219
+
220
+
221
def _extract_text(data: bytes) -> str:
    """Pass-through for already-text content (e.g., Google Workspace exports).

    Decodes as UTF-8, dropping a leading byte-order mark if present, and falls
    back to Latin-1 when the bytes are not valid UTF-8.
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but strips a leading BOM, which
        # would otherwise show up as an invisible first character.
        return data.decode("utf-8-sig")
    except UnicodeDecodeError:
        return data.decode("latin-1")
@@ -0,0 +1,39 @@
1
+ """Footprinter MCP server — permission-gated access to indexed data and content."""
2
+
3
+ from mcp.server.fastmcp import FastMCP
4
+
5
+ from footprinter.mcp.tools.navigation import footprinter_client, footprinter_folder, footprinter_project
6
+ from footprinter.mcp.tools.read import footprinter_read
7
+ from footprinter.mcp.tools.search import footprinter_search
8
+ from footprinter.mcp.tools.status import footprinter_status
9
+
10
+ try:
11
+ from footprinter.mcp.tools.semantic import _SEMANTIC_AVAILABLE, footprinter_semantic
12
+ except ImportError:
13
+ _SEMANTIC_AVAILABLE = False
14
+
15
# Module-level handle to the FastMCP instance; populated by _build_server().
_server = None


def _build_server():
    """Create the FastMCP server, register the tool functions, and return it.

    The instance is also stored in the module-level ``_server``. The semantic
    tool is registered only when ``_SEMANTIC_AVAILABLE`` is true.
    """
    global _server
    _server = FastMCP("footprinter")

    # Registration order is kept: status, search, navigation, [semantic], read.
    tools = [
        footprinter_status,
        footprinter_search,
        footprinter_project,
        footprinter_client,
        footprinter_folder,
    ]
    if _SEMANTIC_AVAILABLE:
        tools.append(footprinter_semantic)
    tools.append(footprinter_read)

    for tool_fn in tools:
        _server.tool()(tool_fn)
    return _server
30
+
31
+
32
def main():
    """Build the Footprinter MCP server and run it."""
    # _build_server() returns the same object it stores in ``_server``.
    server = _build_server()
    server.run()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,70 @@
1
+ """Navigation tools: projects, clients, folders.
2
+
3
+ Thin MCP adapters — all query/filtering logic lives in the service layer.
4
+ """
5
+
6
+ from pathlib import Path
7
+
8
+ from footprinter.mcp.db import get_db, handle_db_errors
9
+ from footprinter.mcp.errors import mcp_error
10
+ from footprinter.services import client_service, folder_service, project_service
11
+ from footprinter.services.roles import Role
12
+
13
# Current user's home directory as a string; used to abbreviate paths to "~".
HOME = str(Path.home())


def _shorten(path: str) -> str:
    """Abbreviate a path under the home directory to ``~`` form for display.

    Returns "" for an empty or None path, "~" for the home directory itself,
    "~<rest>" for paths inside it, and the path unchanged otherwise.
    """
    import os

    if not path:
        return ""
    if path == HOME:
        return "~"
    # Require a separator right after HOME so a sibling directory such as
    # "/home/alice" is not mistaken for something inside "/home/al".
    if path.startswith(HOME + os.sep):
        return "~" + path[len(HOME) :]
    return path
20
+
21
+
22
@handle_db_errors
def footprinter_project(project_name: str) -> dict:
    """Get project metadata, file counts, and basic stats."""
    with get_db() as conn:
        project = project_service.resolve_by_name(conn, project_name, role=Role.VIEWER)
        if project is None:
            return mcp_error("NOT_FOUND", internal_message=f"project search: {project_name}")

        # A disambiguation payload is passed through untouched; otherwise
        # abbreviate home-relative paths for MCP display.
        if not project.get("disambiguation"):
            if "root_path" in project:
                project["root_path"] = _shorten(project["root_path"])
            for folder in project.get("folders", []):
                if "path" in folder:
                    folder["path"] = _shorten(folder["path"])
        return project
38
+
39
+
40
@handle_db_errors
def footprinter_client(client_name: str) -> dict:
    """Get group info with all projects and aggregate stats."""
    with get_db() as conn:
        client = client_service.resolve_by_name(conn, client_name, role=Role.VIEWER)
        if client is None:
            return mcp_error("NOT_FOUND", internal_message=f"client search: {client_name}")

        # A disambiguation payload is passed through untouched; otherwise
        # abbreviate each project's root path for MCP display.
        if not client.get("disambiguation"):
            for project in client.get("projects", []):
                if "root_path" in project:
                    project["root_path"] = _shorten(project["root_path"])
        return client
54
+
55
+
56
@handle_db_errors
def footprinter_folder(path: str) -> dict:
    """Get folder contents and metadata."""
    # Expand only a bare "~" or a "~/..." prefix. Anything else starting with
    # "~" (e.g. "~other/x") is not the current user's home and is looked up
    # as given instead of being spliced onto HOME.
    if path == "~" or path.startswith(("~/", "~\\")):
        path = HOME + path[1:]

    with get_db() as conn:
        result = folder_service.get_by_path(conn, path, role=Role.VIEWER)
        if result is None:
            return mcp_error("NOT_FOUND", internal_message=f"folder: {path}")
        # Abbreviate home-relative paths for MCP display
        if "path" in result:
            result["path"] = _shorten(result["path"])
        for sf in result.get("subfolders", []):
            # Every subfolder entry ends up with a "path" key ("" if it had none)
            sf["path"] = _shorten(sf.get("path", ""))
        return result
@@ -0,0 +1,75 @@
1
+ """Read tool: fetch content for permitted items."""
2
+
3
+ from typing import Literal
4
+
5
+ from footprinter.mcp.db import get_db, handle_db_errors
6
+ from footprinter.mcp.errors import mcp_error
7
+ from footprinter.services import access_service, content_service
8
+ from footprinter.services.roles import Role
9
+
10
# Map service status codes to MCP error codes.
# "hidden" and "not_found" deliberately share NOT_FOUND, so the two cases
# produce the same error code.
_STATUS_TO_MCP = {
    "hidden": "NOT_FOUND",
    "not_found": "NOT_FOUND",
    "opaque": "VISIBILITY_RESTRICTED",
    "denied": "PERMISSION_DENIED",
    "invalid_type": "INVALID_TYPE",
    "read_failed": "READ_FAILED",
    "decode_failed": "DECODE_FAILED",
    "extraction_failed": "EXTRACTION_FAILED",
}


@handle_db_errors
def footprinter_read(
    item_type: str,
    item_id: int,
    format: Literal["text", "raw"] = "text",
) -> dict:
    """Read content for a file, email, or chat if Claude has permission.

    Args:
        item_type: 'file', 'email', or 'chat'
        item_id: Row ID of the item.
        format: 'text' (default) extracts plaintext from documents,
            'raw' returns raw decoded content without extraction.

    Returns:
        Dict with 'content' and 'metadata' on success,
        or 'error', 'error_code', and 'metadata' on denial/failure.
    """
    with get_db() as conn:
        result = access_service.gate_access(
            conn,
            item_type,
            item_id,
            role=Role.VIEWER,
        )
        status = result.get("status")

        if status != "ok":
            # Unrecognized statuses fall back to READ_FAILED
            return mcp_error(
                _STATUS_TO_MCP.get(status, "READ_FAILED"),
                metadata=result.get("metadata"),
                internal_message=result.get("message"),
            )

        # Email/chat: content already in result
        if item_type != "file":
            return {
                "content": result.get("content", ""),
                "metadata": result["metadata"],
            }

        # File: read content via service I/O
        file_result = content_service.read_file(conn, result["metadata"], format=format)
        # Read the status once with .get() so a result lacking the key is
        # reported as READ_FAILED instead of raising KeyError.
        file_status = file_result.get("status")
        if file_status != "ok":
            return mcp_error(
                _STATUS_TO_MCP.get(file_status, "READ_FAILED"),
                metadata=file_result.get("metadata", result["metadata"]),
                internal_message=file_result.get("message"),
            )
        return {
            "content": file_result["content"],
            "metadata": file_result["metadata"],
        }
@@ -0,0 +1,158 @@
1
+ """Search tool: query across data sources."""
2
+
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ from footprinter.mcp.db import get_db, handle_db_errors
7
+ from footprinter.services import search_service
8
+ from footprinter.services.roles import Role
9
+
10
# Current user's home directory as a string; used to abbreviate paths to "~".
HOME = str(Path.home())
11
+
12
+ # Display names for source keys in summary text
13
+ _SOURCE_LABELS = {
14
+ "files": ("file", "files"),
15
+ "emails": ("email", "emails"),
16
+ "chats": ("chat", "chats"),
17
+ "browser": ("browser result", "browser results"),
18
+ }
19
+
20
+
21
+ def _build_search_summary(results: dict, query: str, sources: list[str]) -> str:
22
+ """Build a human-readable summary of search results."""
23
+ found_parts = []
24
+ empty_parts = []
25
+
26
+ for source in sources:
27
+ items = results.get(source, [])
28
+ singular, plural = _SOURCE_LABELS.get(source, (source, source))
29
+ count = len(items)
30
+ if count > 0:
31
+ label = singular if count == 1 else plural
32
+ found_parts.append(f"{count} {label}")
33
+ else:
34
+ empty_parts.append(plural)
35
+
36
+ total_suppressed = results.get("suppressed", 0)
37
+
38
+ if found_parts:
39
+ query_part = f" matching '{query}'" if query and query.strip() else ""
40
+ summary = f"Found {', '.join(found_parts)}{query_part}."
41
+ if empty_parts:
42
+ summary += f" No {' or '.join(empty_parts)} matched."
43
+ else:
44
+ query_part = f" for '{query}'" if query and query.strip() else ""
45
+ summary = (
46
+ f"No results{query_part}. "
47
+ f"Tips: try single keywords, use footprinter_semantic "
48
+ f"for semantic matching, or browse recent items with date_from/date_to "
49
+ f"and no query."
50
+ )
51
+
52
+ if total_suppressed > 0:
53
+ item_word = "item" if total_suppressed == 1 else "items"
54
+ summary += f" ({total_suppressed} {item_word} hidden by visibility policy)"
55
+
56
+ return summary
57
+
58
+
59
def _shorten_path(path: str) -> str:
    """Abbreviate a path under the home directory to ``~`` form for display.

    Returns "" for an empty or None path, "~" for the home directory itself,
    "~<rest>" for paths inside it, and the path unchanged otherwise.
    """
    import os

    if not path:
        return ""
    if path == HOME:
        return "~"
    # Require a separator right after HOME so a sibling directory such as
    # "/home/alice" is not mistaken for something inside "/home/al".
    if path.startswith(HOME + os.sep):
        return "~" + path[len(HOME) :]
    return path
63
+
64
+
65
@handle_db_errors
def footprinter_search(
    query: str = "",
    sources: Optional[list[str]] = None,
    project: Optional[str] = None,
    client: Optional[str] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    limit: int = 50,
    account: Optional[str] = None,
    sender: Optional[str] = None,
    days_back: Optional[int] = None,
    folder: Optional[str] = None,
    mime_type: Optional[str] = None,
) -> dict:
    """Search across indexed sources by keyword. Returns metadata only, no file content.

    SEARCH BEHAVIOR:
    - Matches against file names, email subjects/senders, chat titles, and
      browser page titles/URLs depending on which sources are included.
    - Multi-word queries use AND logic: every term must appear. "project report" only returns
      items containing both "project" AND "report".
    - Terms shorter than 2 characters are ignored.
    - When query is empty, returns the most recent items (sorted by date descending).
      Combine with date_from/date_to to browse a specific time range.

    QUERY TIPS:
    - Use 1-3 short, specific keywords. Each additional word narrows results further.
    - Avoid long natural-language phrases — they produce too many AND terms and return nothing.
    - Good: "salesforce proposal" — Bad: "nonprofit technology consulting Salesforce partner"
    - To search by time period, leave query empty and use date_from/date_to.

    WHEN TO USE THIS vs footprinter_semantic:
    - Use THIS tool for keyword/metadata lookups: finding files by name, emails by subject,
      chats by title, or browsing recent activity across all sources.
    - Use footprinter_semantic when you want meaning-based search across
      chat or file content (e.g., "discussions about authentication architecture").

    SOURCE-SPECIFIC FILTERS:
    - account: Filter by account name. Applies to emails and files.
    - sender: Partial match on email sender name or address. Applies to emails only.
    - days_back: Only include emails from the last N days. Applies to emails only.
    - folder: Filter files by path prefix (e.g. "~/Work/projects"). Applies to files only.
    - mime_type: Exact MIME type match (e.g. "application/pdf"). Applies to files only.
    Source-specific filters are silently ignored when the relevant source is not being searched.

    Args:
        query: Keyword(s) matched against names, titles, subjects. Empty = list recent.
        sources: Which sources to search. Default: all.
            Options: "files", "emails", "chats", "browser".
        project: Filter to a project name (exact match, applies to files, emails, chats).
        client: Filter to a client name (exact match, applies to files, emails, chats).
        date_from: ISO date string lower bound (e.g. "2026-02-01").
        date_to: ISO date string upper bound (e.g. "2026-02-14").
        limit: Max results per source (default 50).
        account: Filter by account (e.g. "personal", "work"). Applies to emails and files.
        sender: Partial match on sender name or address (e.g. "alice"). Emails only.
        days_back: Only include emails from the last N days. Emails only.
        folder: Filter files under this path prefix (e.g. "~/Work/projects"). Files only.
        mime_type: Exact MIME type filter (e.g. "application/pdf"). Files only.

    Returns:
        Dict with keys per source (e.g. "files", "emails", "chats", "browser"),
        each containing a list of result dicts. Includes a "summary" key with
        a human-readable overview of what was found.
    """
    # None or an empty list both mean "search everything"
    searched = sources or ["files", "emails", "chats", "browser"]

    # NOTE(review): ``folder`` is forwarded exactly as given; whether a "~"
    # prefix is expanded is up to search_service — confirm.
    filters = {
        "project": project,
        "client": client,
        "date_from": date_from,
        "date_to": date_to,
        "limit": limit,
        "account": account,
        "sender": sender,
        "days_back": days_back,
        "folder": folder,
        "mime_type": mime_type,
    }

    with get_db() as conn:
        results = search_service.search(
            conn,
            role=Role.VIEWER,
            query=query,
            sources=searched,
            **filters,
        )

        # Abbreviate home-relative file paths for MCP display
        for file_hit in results.get("files", []):
            if "path" in file_hit:
                file_hit["path"] = _shorten_path(file_hit["path"])

        results["summary"] = _build_search_summary(results, query, searched)
        return results
@@ -0,0 +1,79 @@
1
+ """Semantic search tool — thin adapter over semantic_service."""
2
+
3
+ from footprinter.mcp.db import get_db, handle_db_errors
4
+ from footprinter.mcp.errors import mcp_error
5
+ from footprinter.services import semantic_service
6
+ from footprinter.services.roles import Role
7
+
8
# Flag imported by the MCP server module to decide whether to register the
# semantic tool.
_SEMANTIC_AVAILABLE = True


@handle_db_errors
def footprinter_semantic(query: str, source: str = "all", limit: int = 10) -> dict:
    """Search chats and/or files by meaning using semantic (embedding-based) search.

    SEARCH BEHAVIOR:
    - Uses vector embeddings to find items with similar meaning, not exact keyword
      matches. "auth problems" can find chats about "login failures"; "revenue
      forecast" can find files about "financial projections".
    - Falls back to FTS5 keyword search per-collection when ML dependencies
      (ChromaDB, sentence-transformers) are unavailable. A note is included when
      fallback is active.
    - The ``source`` parameter controls which collections to search:
      "chats" (conversations only), "files" (file content only), or "all" (both).
    - ``limit`` applies per-collection: source="all" with limit=10 returns up to
      10 chats + 10 files.
    - Files are chunked during indexing; results are deduplicated so each file
      appears at most once (best-matching chunk).

    QUERY TIPS:
    - Natural language works best: describe what you're looking for conceptually.
    - Ideal query length is 3-10 words. Very short (1 word) or very long (10+ words)
      queries reduce relevance.
    - Good: "database migration strategies" — Bad: "database"
    - Good: "client onboarding process" — Bad: "how did we onboard that new client
      who came in last month through the partner referral program"

    WHEN TO USE THIS vs footprinter_search:
    - Use THIS tool for meaning-based search — finding items by topic even when you
      don't know the exact words used.
    - Use footprinter_search for exact keyword matches in names/subjects/titles, or
      to search non-semantic sources (emails, browser history).

    Args:
        query: Natural language search query (minimum 3 characters).
        source: Which collection(s) to search: "chats", "files", or "all" (default).
        limit: Max results per collection (default 10).

    Returns:
        Dict with source-specific keys. "chats" and/or "files" lists are present
        based on the source parameter. Only items with both visible access AND
        read permission appear (semantic matches are content-derived, so
        presence in results reveals content — per decision D2). Visible+allowed
        chats have: chat_id, chat_title, snippet (text excerpt showing why it
        matched), relevance_score, source, created_at, message_id.
        Visible+allowed files have: id, name, path, content_type, size_bytes,
        modified_at, relevance_score, snippet (from best-matching chunk).
        Opaque items have minimal fields only. Hidden and permission-denied
        items are excluded. Includes "summary" with a human-readable overview.
    """
    # Validate before opening DB connection. Surrounding whitespace does not
    # count toward the 3-character minimum, so a blank or padded query cannot
    # slip through; the query itself is still forwarded unmodified.
    meaningful = query.strip() if query else ""
    if len(meaningful) < 3:
        return mcp_error(
            "QUERY_INVALID",
            internal_message=f"query too short: {len(meaningful)}",
        )
    if source not in ("chats", "files", "all"):
        return mcp_error(
            "INVALID_INPUT",
            hint="source must be 'chats', 'files', or 'all'",
        )

    with get_db() as conn:
        return semantic_service.semantic_search(
            conn,
            query,
            role=Role.VIEWER,
            source=source,
            limit=limit,
        )
@@ -0,0 +1,19 @@
1
+ """Status tool: system overview via status_service.
2
+
3
+ Thin MCP adapter — all query logic lives in the service layer.
4
+ """
5
+
6
+ from footprinter.mcp.db import get_db, handle_db_errors
7
+ from footprinter.paths import is_test_mode
8
+ from footprinter.services import status_service
9
+ from footprinter.services.roles import Role
10
+
11
+
12
@handle_db_errors
def footprinter_status() -> dict:
    """System status: record counts, sync times, and breakdowns for all data sources."""
    with get_db() as conn:
        status = status_service.get_status(conn, role=Role.VIEWER)
        # Flag the payload when running in test mode
        if is_test_mode():
            status["_sandbox"] = True
        return status