footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +444 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/cli/__init__.py +128 -0
  19. footprinter/cli/__main__.py +6 -0
  20. footprinter/cli/_common.py +332 -0
  21. footprinter/cli/_policy_helpers.py +646 -0
  22. footprinter/cli/_prompt.py +220 -0
  23. footprinter/cli/api_cmd.py +32 -0
  24. footprinter/cli/connect.py +591 -0
  25. footprinter/cli/data.py +879 -0
  26. footprinter/cli/delete.py +128 -0
  27. footprinter/cli/ingest.py +579 -0
  28. footprinter/cli/mcp_cmd.py +750 -0
  29. footprinter/cli/mcp_setup.py +306 -0
  30. footprinter/cli/search.py +393 -0
  31. footprinter/cli/search_cmd.py +69 -0
  32. footprinter/cli/setup.py +1836 -0
  33. footprinter/cli/status.py +729 -0
  34. footprinter/cli/status_cmd.py +104 -0
  35. footprinter/cli/upsert.py +794 -0
  36. footprinter/cli/vectorize_cmd.py +215 -0
  37. footprinter/cli/view.py +322 -0
  38. footprinter/connectors/__init__.py +171 -0
  39. footprinter/connectors/config_utils.py +141 -0
  40. footprinter/db/__init__.py +37 -0
  41. footprinter/db/browser.py +198 -0
  42. footprinter/db/chats.py +610 -0
  43. footprinter/db/clients.py +307 -0
  44. footprinter/db/emails.py +279 -0
  45. footprinter/db/files.py +741 -0
  46. footprinter/db/folders.py +659 -0
  47. footprinter/db/messages.py +192 -0
  48. footprinter/db/policies.py +151 -0
  49. footprinter/db/projects.py +673 -0
  50. footprinter/db/search.py +573 -0
  51. footprinter/db/sql_utils.py +168 -0
  52. footprinter/db/status.py +320 -0
  53. footprinter/db/uploads.py +70 -0
  54. footprinter/ingest/__init__.py +0 -0
  55. footprinter/ingest/adapters/__init__.py +33 -0
  56. footprinter/ingest/adapters/browser.py +54 -0
  57. footprinter/ingest/adapters/chat.py +57 -0
  58. footprinter/ingest/adapters/ingest.py +146 -0
  59. footprinter/ingest/adapters/local_files.py +68 -0
  60. footprinter/ingest/adapters/local_folders.py +52 -0
  61. footprinter/ingest/adapters/protocol.py +174 -0
  62. footprinter/ingest/browser_indexer.py +216 -0
  63. footprinter/ingest/chat_dedup.py +156 -0
  64. footprinter/ingest/chat_indexer.py +515 -0
  65. footprinter/ingest/chat_parsers/__init__.py +8 -0
  66. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  67. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  68. footprinter/ingest/cli.py +827 -0
  69. footprinter/ingest/content_extractors.py +117 -0
  70. footprinter/ingest/database.py +36 -0
  71. footprinter/ingest/db/__init__.py +1 -0
  72. footprinter/ingest/db/connector_schema.py +47 -0
  73. footprinter/ingest/db/migration.py +328 -0
  74. footprinter/ingest/db/schema.py +1043 -0
  75. footprinter/ingest/db/security.py +6 -0
  76. footprinter/ingest/file_indexer.py +261 -0
  77. footprinter/ingest/file_scanner.py +277 -0
  78. footprinter/ingest/folder_indexer.py +226 -0
  79. footprinter/ingest/full_content_extractor.py +321 -0
  80. footprinter/ingest/orchestrator.py +125 -0
  81. footprinter/ingest/pipe_runner.py +217 -0
  82. footprinter/ingest/processing.py +165 -0
  83. footprinter/ingest/registry.py +201 -0
  84. footprinter/ingest/run_record.py +91 -0
  85. footprinter/ingest/status.py +346 -0
  86. footprinter/mcp/__init__.py +0 -0
  87. footprinter/mcp/__main__.py +5 -0
  88. footprinter/mcp/db.py +57 -0
  89. footprinter/mcp/errors.py +102 -0
  90. footprinter/mcp/extraction.py +226 -0
  91. footprinter/mcp/server.py +39 -0
  92. footprinter/mcp/tools/__init__.py +0 -0
  93. footprinter/mcp/tools/navigation.py +70 -0
  94. footprinter/mcp/tools/read.py +75 -0
  95. footprinter/mcp/tools/search.py +158 -0
  96. footprinter/mcp/tools/semantic.py +79 -0
  97. footprinter/mcp/tools/status.py +15 -0
  98. footprinter/paths.py +91 -0
  99. footprinter/permissions.py +1160 -0
  100. footprinter/semantic/__init__.py +13 -0
  101. footprinter/semantic/chunking.py +52 -0
  102. footprinter/semantic/embeddings.py +23 -0
  103. footprinter/semantic/hybrid_search.py +273 -0
  104. footprinter/semantic/vector_store.py +471 -0
  105. footprinter/services/__init__.py +49 -0
  106. footprinter/services/access_service.py +342 -0
  107. footprinter/services/chat_service.py +85 -0
  108. footprinter/services/client_service.py +267 -0
  109. footprinter/services/content_service.py +181 -0
  110. footprinter/services/email_service.py +89 -0
  111. footprinter/services/file_service.py +83 -0
  112. footprinter/services/folder_service.py +122 -0
  113. footprinter/services/includes.py +19 -0
  114. footprinter/services/ingest_service.py +231 -0
  115. footprinter/services/project_service.py +262 -0
  116. footprinter/services/roles.py +25 -0
  117. footprinter/services/search_service.py +177 -0
  118. footprinter/services/semantic_service.py +360 -0
  119. footprinter/services/status_service.py +18 -0
  120. footprinter/services/visit_service.py +65 -0
  121. footprinter/source_registry.py +194 -0
  122. footprinter/utils/__init__.py +7 -0
  123. footprinter/utils/hash_utils.py +59 -0
  124. footprinter/utils/logging_config.py +68 -0
  125. footprinter/utils/mime.py +30 -0
  126. footprinter/utils/text.py +6 -0
  127. footprinter/utils/time.py +11 -0
  128. footprinter/visibility.py +1272 -0
  129. footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
  130. footprinter_cli-1.0.0.dist-info/METADATA +229 -0
  131. footprinter_cli-1.0.0.dist-info/RECORD +134 -0
  132. footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
  133. footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
  134. footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,13 @@
"""Semantic search module for Footprinter."""

# Export only the names that were actually imported. Both imports below are
# allowed to fail silently, and listing an unbound name in __all__ would make
# a star-import of this package raise AttributeError.
__all__ = []

try:
    from .embeddings import get_embedding_function
except ImportError:
    # Import failed; the name is simply not exported.
    pass
else:
    __all__.append("get_embedding_function")

try:
    from .vector_store import VectorStore
except ImportError:
    # Import failed; the name is simply not exported.
    pass
else:
    __all__.append("VectorStore")
@@ -0,0 +1,52 @@
1
+ """Pure chunking function for splitting content into overlapping chunks."""
2
+
3
+ from typing import List, Tuple
4
+
DEFAULT_CHUNK_SIZE = 1000  # chars — tuned for MiniLM-L6-v2 (256-token window)
DEFAULT_CHUNK_OVERLAP = 150  # chars (15% of default chunk size)


def chunk_content(
    content: str,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> List[Tuple[str, int, int]]:
    """
    Split content into overlapping chunks with word-boundary awareness.

    Content no longer than ``chunk_size`` is returned unchanged as a single
    chunk. Longer content is cut every ``chunk_size`` characters, preferring
    to end a chunk on a space found in the last 200 characters of the chunk.
    Each chunk is stripped of surrounding whitespace and empty chunks are
    dropped.

    Args:
        content: Text to split.
        chunk_size: Maximum characters per chunk. Must be positive.
        chunk_overlap: Character overlap between consecutive chunks. Must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        List of (chunk_text, chunk_index, total_chunks) tuples.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap`` is
            negative or not smaller than ``chunk_size`` (either would prevent
            the scan from advancing).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    if len(content) <= chunk_size:
        return [(content, 0, 1)]

    boundary_window = 200  # how far back from the cut to look for a space
    content_len = len(content)
    pieces = []
    start = 0

    while start < content_len:
        end = start + chunk_size

        # Try to break at a word boundary (never for the final chunk).
        if end < content_len:
            # The lower bound keeps the cut more than chunk_overlap characters
            # past start, so the next start always moves forward, and it also
            # stops a negative index from reaching rfind. With the default
            # sizes this is exactly "the last 200 chars of the chunk".
            search_from = max(start + chunk_overlap + 1, end - boundary_window)
            space_pos = content.rfind(" ", search_from, end)
            if space_pos != -1:
                end = space_pos

        piece = content[start:end].strip()
        if piece:
            pieces.append(piece)

        # Step back by the overlap, except after the final chunk.
        start = end - chunk_overlap if end < content_len else end

    # The total is only known once every chunk has been collected.
    total = len(pieces)
    return [(piece, index, total) for index, piece in enumerate(pieces)]
@@ -0,0 +1,23 @@
1
+ """Shared embedding function using ChromaDB's built-in ONNX backend."""
2
+
try:
    from chromadb.utils.embedding_functions import ONNXMiniLM_L6_V2
except ImportError:
    # chromadb is not importable; get_embedding_function() will raise.
    _SEMANTIC_AVAILABLE = False
else:
    _SEMANTIC_AVAILABLE = True

EMBEDDING_DIM = 384


def get_embedding_function():
    """Build ChromaDB's built-in ONNX embedding function (ONNXMiniLM_L6_V2).

    A new ONNXMiniLM_L6_V2 instance is created on every call.

    Returns a callable: ef(texts: list[str]) -> list[list[float]]

    Raises:
        ImportError: If chromadb could not be imported when this module loaded.
    """
    if _SEMANTIC_AVAILABLE:
        return ONNXMiniLM_L6_V2()
    raise ImportError("chromadb is required for embeddings. Install with: pip install footprinter-cli[semantic]")
@@ -0,0 +1,273 @@
1
+ """Hybrid search functions: FTS5 keyword search, snippet extraction, and RRF fusion."""
2
+
3
+ import logging
4
+ import sqlite3
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
def extract_snippet(content: str, query: str, window: int = 250) -> str:
    """
    Extract the most relevant snippet from content based on query.

    Content no longer than ``2 * window`` is returned unchanged. Otherwise the
    snippet is anchored on the first query term of at least three characters
    that occurs in the content (case-insensitive), or on the start of the
    content when no term matches. The cut points are nudged to a nearby space
    when one lies within 20 characters, so the snippet stays close to
    ``2 * window`` characters even when the text contains very long unbroken
    tokens.

    Args:
        content: Full message content.
        query: Search query.
        window: Character window around match.

    Returns:
        Snippet with ellipsis and context.
    """
    if len(content) <= window * 2:
        return content

    # Find first query term match
    content_lower = content.lower()
    best_pos = 0
    for term in query.lower().split():
        if len(term) < 3:  # Skip short terms
            continue
        pos = content_lower.find(term)
        if pos >= 0:
            best_pos = pos
            break

    # Calculate window: half a window before the match, 1.5 windows after.
    start = max(0, best_pos - window // 2)
    end = min(len(content), best_pos + window + window // 2)

    # Snap to word boundaries, looking at most boundary_slack characters to
    # either side. An unbounded search would let one long token (no spaces
    # nearby) stretch the snippet across most of the content.
    boundary_slack = 20

    if start > 0:
        space = content.rfind(" ", max(0, start - boundary_slack), start + boundary_slack)
        if space > 0:
            start = space + 1

    if end < len(content):
        # Clamp the lower bound: a negative index would be read as an offset
        # from the end of the string.
        space = content.find(" ", max(0, end - boundary_slack), end + boundary_slack)
        if space > start:
            end = space

    snippet = content[start:end].strip()

    # Add ellipsis on each side that was cut
    if start > 0:
        snippet = "..." + snippet
    if end < len(content):
        snippet = snippet + "..."

    return snippet
61
+
62
+
def chat_snippet(row: Dict) -> str:
    """Return display text for a keyword_search result row.

    Uses the row's summary, cut to 300 characters plus "..." when longer.
    Rows with no (or an empty) summary fall back to a "Title match: ..." line
    built from the row's chat title.
    """
    text = row.get("summary")
    if not text:
        return f"Title match: {row['chat_title']}"
    return text if len(text) <= 300 else text[:300] + "..."
71
+
72
+
def reciprocal_rank_fusion(semantic_results: List[Dict], keyword_results: List[Dict], k: int = 60) -> List[Dict]:
    """
    Combine semantic and keyword results using Reciprocal Rank Fusion.

    RRF score = sum(1 / (k + rank)) for each result list.
    Higher k reduces the impact of high-ranking items.

    Results are grouped by ``chat_id``. Every occurrence of a chat in either
    list adds its own 1 / (k + rank) term, so a chat that appears several
    times in one list accumulates several contributions. The first semantic
    occurrence supplies the result fields; chats found only by keyword get a
    synthesized entry in the same shape.

    The input lists and the dicts inside them are not modified: each returned
    entry is a fresh dict.

    Args:
        semantic_results: Results from semantic search.
        keyword_results: Results from FTS5 keyword search.
        k: RRF constant (default 60, standard value).

    Returns:
        Combined and re-ranked results, best first. Each entry carries
        ``semantic_rank`` and ``keyword_rank`` (1-based, or None), the raw
        ``rrf_score`` and a scaled ``relevance_score`` capped at 1.0.
    """
    rrf_scores: Dict = {}
    result_data: Dict = {}

    # Process semantic results
    for rank, result in enumerate(semantic_results, start=1):
        chat_id = result["chat_id"]
        rrf_scores[chat_id] = rrf_scores.get(chat_id, 0) + 1.0 / (k + rank)
        if chat_id not in result_data:
            # Copy so the rank/score keys added here never leak into the
            # caller's dicts.
            entry = dict(result)
            entry["semantic_rank"] = rank
            entry["keyword_rank"] = None
            result_data[chat_id] = entry

    # Process keyword results
    for rank, result in enumerate(keyword_results, start=1):
        chat_id = result["chat_id"]
        rrf_scores[chat_id] = rrf_scores.get(chat_id, 0) + 1.0 / (k + rank)

        if chat_id in result_data:
            result_data[chat_id]["keyword_rank"] = rank
            continue

        # Keyword-only result — use "source" if present, fall back to "account"
        source = result.get("source", result.get("account", "unknown"))
        result_data[chat_id] = {
            "chat_id": chat_id,
            "chat_title": result["chat_title"],
            "message_id": None,
            "role": "keyword",
            "source": source,
            "created_at": result["created_at"],
            "snippet": chat_snippet(result),
            "relevance_score": 0,
            "chunk_type": "keyword_match",
            "chunk_index": 0,
            "total_chunks": 1,
            "semantic_rank": None,
            "keyword_rank": rank,
        }

    # Sort by RRF score (stable, so ties keep first-seen order) and update
    # relevance_score
    combined = []
    for chat_id in sorted(rrf_scores, key=rrf_scores.get, reverse=True):
        entry = result_data[chat_id]
        score = rrf_scores[chat_id]
        entry["rrf_score"] = round(score, 4)
        # Use RRF score as the display relevance (scaled for readability)
        entry["relevance_score"] = round(min(1.0, score * 30), 3)
        combined.append(entry)

    return combined
138
+
139
+
def keyword_search(
    query: str,
    db_path: str,
    account: Optional[str] = None,
    limit: int = 50,
) -> List[Dict]:
    """
    FTS5 keyword search on chat titles and summaries.

    The query is matched as a single quoted phrase with a trailing prefix
    wildcard against the ``chats_fts`` table, joined to ``chats``. Chats whose
    status is 'removed' are excluded. Errors raised while running the search
    are logged and produce an empty result list rather than propagating.

    Args:
        query: Search query.
        db_path: Path to SQLite database.
        account: Filter by account ('claude', 'chatgpt'). Falsy means no filter.
        limit: Maximum results.

    Returns:
        List of chat matches with FTS5 rank scores.
        Uses 'source' key (not 'account') for consistency with semantic results.
    """
    # Escape special FTS5 characters and build query: embedded double quotes
    # are doubled, then the text is wrapped as one phrase.
    safe_query = query.replace('"', '""')
    fts_query = f'"{safe_query}"*'

    # Only fixed SQL fragments are concatenated here; every user-supplied
    # value is passed as a bound parameter.
    sql = (
        "SELECT chat.id, chat.title, chat.summary, chat.account, chat.created_at, chat.message_count,"
        " fts.rank as fts_rank"
        " FROM chats_fts fts"
        " JOIN chats chat ON chat.id = fts.rowid"
        " WHERE chats_fts MATCH ?"
    )
    params: Tuple = (fts_query,)
    if account:
        sql += " AND chat.account = ?"
        params += (account,)
    sql += " AND chat.status != 'removed' ORDER BY fts.rank LIMIT ?"
    params += (limit,)

    conn = sqlite3.connect(str(db_path), timeout=10)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA busy_timeout=5000")
        conn.execute("PRAGMA foreign_keys=ON")

        try:
            rows = conn.execute(sql, params).fetchall()
            return [
                {
                    "chat_id": row["id"],
                    "chat_title": row["title"] or "(untitled)",
                    "source": row["account"] or "unknown",
                    "created_at": row["created_at"] or "",
                    "message_count": row["message_count"] or 0,
                    "summary": row["summary"] or "",
                    # FTS5 rank is negative (more negative = better match)
                    "fts_score": min(1.0, abs(row["fts_rank"]) / 10.0),
                    "match_type": "keyword",
                }
                for row in rows
            ]
        except Exception as e:
            # Deliberately broad: search is best-effort, so any failure here
            # is logged and reported as "no results".
            logger.error("FTS5 search error: %s", e)
            return []
    finally:
        # Always release the connection, including when a PRAGMA above raises.
        conn.close()
224
+
225
+
def fts5_fallback_search(
    query: str,
    n_results: int = 20,
    source: Optional[str] = None,
    db_path: Optional[str] = None,
) -> Tuple[List[Dict], bool]:
    """
    Keyword-only search used when ML dependencies are unavailable.

    Runs keyword_search and reshapes every hit into the same dict layout that
    hybrid search produces, so callers can treat both modes alike.

    Args:
        query: Search query.
        n_results: Max results.
        source: Filter by source/account.
        db_path: Path to SQLite database (auto-detected if None).

    Returns:
        (results, True) — True indicates fallback mode.
    """
    if db_path is None:
        # Imported lazily, only when the path has to be auto-detected.
        from footprinter.paths import get_db_path

        db_path = str(get_db_path())

    hits = keyword_search(query, db_path=db_path, account=source, limit=n_results)

    # Reshape each keyword hit into the hybrid-search result layout.
    normalized = [
        {
            "chat_id": hit["chat_id"],
            "chat_title": hit["chat_title"],
            "message_id": None,
            "role": "keyword",
            "source": hit["source"],
            "created_at": hit["created_at"],
            "snippet": chat_snippet(hit),
            "relevance_score": round(hit["fts_score"], 3),
            "chunk_type": "keyword_match",
            "chunk_index": 0,
            "total_chunks": 1,
        }
        for hit in hits
    ]

    return normalized, True