footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +444 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/cli/__init__.py +128 -0
  19. footprinter/cli/__main__.py +6 -0
  20. footprinter/cli/_common.py +332 -0
  21. footprinter/cli/_policy_helpers.py +646 -0
  22. footprinter/cli/_prompt.py +220 -0
  23. footprinter/cli/api_cmd.py +32 -0
  24. footprinter/cli/connect.py +591 -0
  25. footprinter/cli/data.py +879 -0
  26. footprinter/cli/delete.py +128 -0
  27. footprinter/cli/ingest.py +579 -0
  28. footprinter/cli/mcp_cmd.py +750 -0
  29. footprinter/cli/mcp_setup.py +306 -0
  30. footprinter/cli/search.py +393 -0
  31. footprinter/cli/search_cmd.py +69 -0
  32. footprinter/cli/setup.py +1836 -0
  33. footprinter/cli/status.py +729 -0
  34. footprinter/cli/status_cmd.py +104 -0
  35. footprinter/cli/upsert.py +794 -0
  36. footprinter/cli/vectorize_cmd.py +215 -0
  37. footprinter/cli/view.py +322 -0
  38. footprinter/connectors/__init__.py +171 -0
  39. footprinter/connectors/config_utils.py +141 -0
  40. footprinter/db/__init__.py +37 -0
  41. footprinter/db/browser.py +198 -0
  42. footprinter/db/chats.py +610 -0
  43. footprinter/db/clients.py +307 -0
  44. footprinter/db/emails.py +279 -0
  45. footprinter/db/files.py +741 -0
  46. footprinter/db/folders.py +659 -0
  47. footprinter/db/messages.py +192 -0
  48. footprinter/db/policies.py +151 -0
  49. footprinter/db/projects.py +673 -0
  50. footprinter/db/search.py +573 -0
  51. footprinter/db/sql_utils.py +168 -0
  52. footprinter/db/status.py +320 -0
  53. footprinter/db/uploads.py +70 -0
  54. footprinter/ingest/__init__.py +0 -0
  55. footprinter/ingest/adapters/__init__.py +33 -0
  56. footprinter/ingest/adapters/browser.py +54 -0
  57. footprinter/ingest/adapters/chat.py +57 -0
  58. footprinter/ingest/adapters/ingest.py +146 -0
  59. footprinter/ingest/adapters/local_files.py +68 -0
  60. footprinter/ingest/adapters/local_folders.py +52 -0
  61. footprinter/ingest/adapters/protocol.py +174 -0
  62. footprinter/ingest/browser_indexer.py +216 -0
  63. footprinter/ingest/chat_dedup.py +156 -0
  64. footprinter/ingest/chat_indexer.py +515 -0
  65. footprinter/ingest/chat_parsers/__init__.py +8 -0
  66. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  67. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  68. footprinter/ingest/cli.py +827 -0
  69. footprinter/ingest/content_extractors.py +117 -0
  70. footprinter/ingest/database.py +36 -0
  71. footprinter/ingest/db/__init__.py +1 -0
  72. footprinter/ingest/db/connector_schema.py +47 -0
  73. footprinter/ingest/db/migration.py +328 -0
  74. footprinter/ingest/db/schema.py +1043 -0
  75. footprinter/ingest/db/security.py +6 -0
  76. footprinter/ingest/file_indexer.py +261 -0
  77. footprinter/ingest/file_scanner.py +277 -0
  78. footprinter/ingest/folder_indexer.py +226 -0
  79. footprinter/ingest/full_content_extractor.py +321 -0
  80. footprinter/ingest/orchestrator.py +125 -0
  81. footprinter/ingest/pipe_runner.py +217 -0
  82. footprinter/ingest/processing.py +165 -0
  83. footprinter/ingest/registry.py +201 -0
  84. footprinter/ingest/run_record.py +91 -0
  85. footprinter/ingest/status.py +346 -0
  86. footprinter/mcp/__init__.py +0 -0
  87. footprinter/mcp/__main__.py +5 -0
  88. footprinter/mcp/db.py +57 -0
  89. footprinter/mcp/errors.py +102 -0
  90. footprinter/mcp/extraction.py +226 -0
  91. footprinter/mcp/server.py +39 -0
  92. footprinter/mcp/tools/__init__.py +0 -0
  93. footprinter/mcp/tools/navigation.py +70 -0
  94. footprinter/mcp/tools/read.py +75 -0
  95. footprinter/mcp/tools/search.py +158 -0
  96. footprinter/mcp/tools/semantic.py +79 -0
  97. footprinter/mcp/tools/status.py +15 -0
  98. footprinter/paths.py +91 -0
  99. footprinter/permissions.py +1160 -0
  100. footprinter/semantic/__init__.py +13 -0
  101. footprinter/semantic/chunking.py +52 -0
  102. footprinter/semantic/embeddings.py +23 -0
  103. footprinter/semantic/hybrid_search.py +273 -0
  104. footprinter/semantic/vector_store.py +471 -0
  105. footprinter/services/__init__.py +49 -0
  106. footprinter/services/access_service.py +342 -0
  107. footprinter/services/chat_service.py +85 -0
  108. footprinter/services/client_service.py +267 -0
  109. footprinter/services/content_service.py +181 -0
  110. footprinter/services/email_service.py +89 -0
  111. footprinter/services/file_service.py +83 -0
  112. footprinter/services/folder_service.py +122 -0
  113. footprinter/services/includes.py +19 -0
  114. footprinter/services/ingest_service.py +231 -0
  115. footprinter/services/project_service.py +262 -0
  116. footprinter/services/roles.py +25 -0
  117. footprinter/services/search_service.py +177 -0
  118. footprinter/services/semantic_service.py +360 -0
  119. footprinter/services/status_service.py +18 -0
  120. footprinter/services/visit_service.py +65 -0
  121. footprinter/source_registry.py +194 -0
  122. footprinter/utils/__init__.py +7 -0
  123. footprinter/utils/hash_utils.py +59 -0
  124. footprinter/utils/logging_config.py +68 -0
  125. footprinter/utils/mime.py +30 -0
  126. footprinter/utils/text.py +6 -0
  127. footprinter/utils/time.py +11 -0
  128. footprinter/visibility.py +1272 -0
  129. footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
  130. footprinter_cli-1.0.0.dist-info/METADATA +229 -0
  131. footprinter_cli-1.0.0.dist-info/RECORD +134 -0
  132. footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
  133. footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
  134. footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,168 @@
1
+ """Shared SQL helper functions for building dynamic CASE/WHEN clauses,
2
+ pagination utilities, and chunked query execution."""
3
+
4
+ import sqlite3
5
+
6
# SQLite caps the number of bound variables per statement (999 on older
# builds, 32766 on newer ones); batches are kept comfortably below both.
_SQLITE_VAR_LIMIT = 500


def chunked_query(cursor, sql_template: str, item_ids: list[int]) -> list:
    """Run *sql_template* once per batch of ids and concatenate the rows.

    *sql_template* must contain a ``{placeholders}`` marker; it is replaced
    with a comma-separated run of ``?`` marks, one per id in the current
    batch. Batches hold at most ``_SQLITE_VAR_LIMIT`` ids. An empty
    *item_ids* executes nothing and returns an empty list.
    """
    collected: list = []
    start = 0
    total = len(item_ids)
    while start < total:
        batch = item_ids[start : start + _SQLITE_VAR_LIMIT]
        marks = ",".join(["?"] * len(batch))
        cursor.execute(sql_template.format(placeholders=marks), batch)
        collected += cursor.fetchall()
        start += _SQLITE_VAR_LIMIT
    return collected
24
+
25
+
26
def paginate(
    conn: sqlite3.Connection,
    count_sql: str,
    fetch_sql: str,
    params,
    *,
    page: int = 1,
    limit: int = 50,
) -> tuple[list, dict]:
    """Execute a count + paginated fetch and return (rows, pagination_dict).

    Parameters
    ----------
    conn : sqlite3.Connection
        Connection used for both statements.
    count_sql : str
        Query whose first column of the first row is the total row count.
        It is executed with *params* only.
    fetch_sql : str
        Query that takes the same *params* followed by two extra bound
        values: the page size and the offset, in that order.
    params : iterable or None
        Bound values shared by both queries. ``None`` means "no parameters".
    page : int
        1-based page number. Values below 1 are clamped to 1.
    limit : int
        Page size. Values below 1 are clamped to 1.

    Returns
    -------
    (rows, pagination) where pagination has the keys ``page``, ``limit``,
    ``total`` and ``total_pages`` (always at least 1). The reported ``page``
    and ``limit`` are the clamped values actually used.
    """
    # A zero limit would divide by zero below and a page < 1 would produce a
    # negative OFFSET, so normalise both before doing any arithmetic.
    page = max(1, page)
    limit = max(1, limit)
    bound = list(params) if params is not None else []

    total = conn.execute(count_sql, bound).fetchone()[0]
    total_pages = max(1, (total + limit - 1) // limit)
    offset = (page - 1) * limit
    rows = conn.execute(fetch_sql, bound + [limit, offset]).fetchall()
    return rows, {"page": page, "limit": limit, "total": total, "total_pages": total_pages}
41
+
42
+
43
+ def paginated_response(entity_key: str, items, pagination: dict, **extras) -> dict:
44
+ """Build a standard paginated response envelope."""
45
+ return {entity_key: items, "pagination": pagination, **extras}
46
+
47
+
48
def build_location_case_clauses(
    config: dict, home: str, path_col: str = "path", prefix: str = ""
) -> tuple[list[str], list]:
    """Build CASE/WHEN SQL clauses from config directories.

    For every entry in ``config["directories"]`` one
    ``WHEN <path_col> LIKE ? THEN ?`` line is produced, together with two
    bound values: a ``<directory>/%`` prefix pattern and a display label.

    Parameters
    ----------
    config : dict
        Configuration mapping; a missing or ``None`` ``"directories"`` entry
        yields no clauses.
    home : str
        Replacement for a leading ``~`` in a directory.
    path_col : str
        Column reference interpolated into each WHEN line.
    prefix : str
        Text prepended to every label.

    Returns
    -------
    (case_lines: list[str], params: list)
    """
    case_lines: list[str] = []
    params: list = []
    for directory in config.get("directories") or []:
        # Drop trailing separators first; otherwise "~/Docs/" would become
        # the pattern ".../Docs//%", which matches no stored path.
        trimmed = directory.rstrip("/")
        # Only a leading "~" refers to the home directory; a tilde elsewhere
        # in the path (e.g. "backup~1") is a literal character.
        if trimmed == "~" or trimmed.startswith("~/"):
            expanded = home + trimmed[1:]
        else:
            expanded = trimmed
        label = trimmed.split("/")[-1]
        # Dot-directories keep their exact spelling; others are title-cased.
        if not label.startswith("."):
            label = label.title()
        case_lines.append(f"WHEN {path_col} LIKE ? THEN ?")
        params.extend([expanded + "/%", prefix + label])
    return case_lines, params
64
+
65
+
66
def build_remote_source_label_clauses(
    sources_data: list[dict],
) -> tuple[list[str], list]:
    """Produce CASE/WHEN lines that label files coming from remote sources.

    Only entries whose ``"source_type"`` equals ``"remote"`` contribute. Each
    contributes one ``WHEN source = ? THEN ?`` line plus two bound values:
    the source ``"name"`` and the label ``"Drive (indexed): <account>"``.

    Returns (case_lines: list[str], params: list).
    """
    remote_entries = [entry for entry in sources_data if entry["source_type"] == "remote"]
    case_lines = ["WHEN source = ? THEN ?" for _ in remote_entries]
    params: list = []
    for entry in remote_entries:
        params += [entry["name"], f"Drive (indexed): {entry['account']}"]
    return case_lines, params
78
+
79
+
80
def build_status_filter(
    status: "str | list[str] | None",
    *,
    column: str,
    default_exclude: "list[str] | None" = None,
    default_include: "list[str] | None" = None,
) -> tuple[list[str], list]:
    """Translate a status argument into a WHERE fragment and its parameters.

    The result is ``(conditions, params)``; ``conditions`` holds either no
    fragment or exactly one, ready to be appended to a list of AND-ed
    conditions.

    Parameters
    ----------
    status : str, list[str], or None
        ``None`` → fall back to the defaults below.
        ``"all"`` → no filtering at all, defaults ignored.
        Any other string → ``column = ?``.
        Non-empty list → ``column IN (...)``. Empty list → no filtering.
    column : str
        Column reference placed in the fragment (e.g. ``"file.status"``).
    default_exclude : list[str], optional
        Used when ``status is None``: produces ``column NOT IN (...)``.
    default_include : list[str], optional
        Used when ``status is None`` and ``default_exclude`` is empty or
        missing: produces ``column IN (...)``.

    When both defaults are supplied, ``default_exclude`` wins.
    """

    def membership(values, operator: str) -> tuple[list[str], list]:
        # One "?" per value, joined without spaces.
        bound = list(values)
        marks = ",".join(["?"] * len(bound))
        return [f"{column} {operator} ({marks})"], bound

    if status is None:
        if default_exclude:
            return membership(default_exclude, "NOT IN")
        if default_include:
            return membership(default_include, "IN")
        return [], []

    if status == "all":
        return [], []

    if isinstance(status, list):
        return membership(status, "IN") if status else ([], [])

    # Anything else is treated as a single status value.
    return [f"{column} = ?"], [status]
129
+
130
+
131
def split_query_terms(query: str) -> list[str]:
    """Return the whitespace-separated tokens of *query* that have 2+ characters."""
    return list(filter(lambda token: len(token) > 1, query.split()))
134
+
135
+
136
def build_fts5_query(terms: list[str]) -> str:
    """Turn *terms* into an FTS5 query of space-separated quoted prefix terms.

    Double quotes are removed from each term; terms left with fewer than two
    characters are skipped. Each remaining term is emitted as ``"term"*``.
    """
    pieces = []
    for raw in terms:
        cleaned = raw.replace('"', "")
        if len(cleaned) < 2:
            continue
        pieces.append('"' + cleaned + '"*')
    return " ".join(pieces)
140
+
141
+
142
def build_term_conditions(
    columns: list[str],
    terms: list[str],
) -> tuple[str, list[str]]:
    """Build AND-ed LIKE conditions: every term must appear in at least one column.

    Each term becomes one parenthesised group of ``col LIKE ? ESCAPE '\\'``
    tests OR-ed across *columns*; the groups are joined with ``AND``. The
    bound value for every test is ``%term%`` with the LIKE metacharacters
    ``%`` and ``_`` (and the escape character itself) backslash-escaped, so
    a term is matched literally rather than as a wildcard pattern.

    Returns
    -------
    (sql, params) — ``sql`` is an empty string when *terms* is empty;
    ``params`` holds ``len(columns)`` values per term, in term order.
    """
    # The SQL for one group does not depend on the term, so build it once.
    group = "(" + " OR ".join(f"{col} LIKE ? ESCAPE '\\'" for col in columns) + ")"
    params: list[str] = []
    for term in terms:
        # Escape the escape character first so later escapes are not doubled.
        literal = term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        params.extend([f"%{literal}%"] * len(columns))
    return " AND ".join([group] * len(terms)), params
155
+
156
+
157
def build_remote_account_case_clauses(
    sources_data: list[dict],
) -> tuple[list[str], list]:
    """Produce ``WHEN ? THEN ?`` lines pairing remote source names with accounts.

    Entries whose ``"source_type"`` is not ``"remote"`` are ignored. For each
    remaining entry the bound values are its ``"name"`` followed by its
    ``"account"``.

    Returns (case_lines: list[str], params: list).
    """
    case_lines: list[str] = []
    params: list = []
    for source in sources_data:
        if source["source_type"] != "remote":
            continue
        case_lines.append("WHEN ? THEN ?")
        params += [source["name"], source["account"]]
    return case_lines, params
@@ -0,0 +1,320 @@
1
+ """System status queries — table counts, config presence, last-indexed timestamp.
2
+
3
+ Includes both MCP-oriented status and the legacy get_stats.
4
+ """
5
+
6
+ import sqlite3
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Optional
9
+
10
+
11
def _safe_count(cursor: sqlite3.Cursor, query: str) -> int:
    """Run a COUNT query on *cursor*; yield 0 when SQLite reports an OperationalError.

    A missing table is the expected cause of that error.
    """
    try:
        first_row = cursor.execute(query).fetchone()
        return first_row[0]
    except sqlite3.OperationalError:
        return 0
18
+
19
+
20
def _safe_query(conn: sqlite3.Connection, query: str, *, default: Any = None) -> Any:
    """Return the first column of the first row produced by *query*.

    ``default`` is returned instead when the query yields no row or when
    SQLite raises an OperationalError (e.g. the table doesn't exist).
    """
    try:
        first_row = conn.execute(query).fetchone()
    except sqlite3.OperationalError:
        return default
    if first_row:
        return first_row[0]
    return default
30
+
31
+
32
def _safe_fetchall(conn: sqlite3.Connection, query: str) -> list[sqlite3.Row]:
    """Fetch every row of *query*; an OperationalError (missing table) gives []."""
    try:
        cursor = conn.execute(query)
        return cursor.fetchall()
    except sqlite3.OperationalError:
        return []
38
+
39
+
40
# -- Hidden-client NOT EXISTS clause (reused across source queries) -----------
# ``{alias}`` is replaced with the alias of the table whose ``client_id``
# column is compared against ``clients.id``.
_NOT_HIDDEN_CLIENT = (
    "NOT EXISTS ( SELECT 1 FROM clients client WHERE client.id = {alias}.client_id AND client.mcp_view = 'hidden')"
)


def get_mcp_status(conn: sqlite3.Connection) -> dict:
    """Return MCP-oriented status: per-source counts excluding hidden clients.

    Designed for ``role=Role.VIEWER`` callers — hidden-client rows are
    excluded from emails, chats, messages, and browser counts.

    Parameters
    ----------
    conn : sqlite3.Connection
        Database connection. The breakdown rows are read by column name, so
        the connection needs a name-addressable row factory such as
        ``sqlite3.Row``.

    Returns
    -------
    dict with keys:
        ``sources`` — per table ``{"count", "last_sync"}``, plus
        ``"error": "table not found"`` when the count query itself fails;
        ``files_by_source``, ``files_by_status``, ``projects_by_status``,
        ``emails_by_client``, ``chats_by_client`` — grouped counts.
    """
    # -- Shared visibility predicates -----------------------------------------
    # Each predicate is used by both the count and the latest-timestamp query
    # of its table so the two can never drift apart.
    visible_files = "status != 'removed' AND COALESCE(mcp_view, 'inherit') != 'hidden'"
    visible_emails = f"email.status != 'removed' AND {_NOT_HIDDEN_CLIENT.format(alias='email')}"
    visible_chats = (
        "COALESCE(chat.status, 'active') != 'removed' "
        f"AND {_NOT_HIDDEN_CLIENT.format(alias='chat')}"
    )
    # Messages have no client_id of their own; visibility comes from the chat.
    visible_messages = (
        "NOT EXISTS ("
        " SELECT 1 FROM chats chat"
        " JOIN clients client ON client.id = chat.client_id"
        " WHERE chat.id = message.chat_id AND client.mcp_view = 'hidden'"
        ")"
    )
    visible_visits = f"bv.status != 'removed' AND {_NOT_HIDDEN_CLIENT.format(alias='bv')}"

    # -- Per-table counts and last-sync timestamps ----------------------------
    tables: Dict[str, Dict[str, Optional[str]]] = {
        "files": {
            "count": f"SELECT COUNT(*) FROM files WHERE {visible_files}",
            "latest": f"SELECT MAX(indexed_at) FROM files WHERE {visible_files}",
        },
        "emails": {
            "count": f"SELECT COUNT(*) FROM emails email WHERE {visible_emails}",
            "latest": f"SELECT MAX(indexed_at) FROM emails email WHERE {visible_emails}",
        },
        "chats": {
            "count": f"SELECT COUNT(*) FROM chats chat WHERE {visible_chats}",
            "latest": f"SELECT MAX(modified_at) FROM chats chat WHERE {visible_chats}",
        },
        "messages": {
            "count": f"SELECT COUNT(*) FROM messages message WHERE {visible_messages}",
            "latest": f"SELECT MAX(created_at) FROM messages message WHERE {visible_messages}",
        },
        "browser": {
            "count": f"SELECT COUNT(*) FROM visits bv WHERE {visible_visits}",
            "latest": f"SELECT MAX(visit_time) FROM visits bv WHERE {visible_visits}",
        },
        "projects": {
            "count": "SELECT COUNT(*) FROM projects",
            "latest": "SELECT MAX(created_at) FROM projects",
        },
        "clients": {
            "count": "SELECT COUNT(*) FROM clients WHERE COALESCE(mcp_view, 'inherit') != 'hidden'",
            "latest": None,
        },
    }

    sources: Dict[str, dict] = {}
    for table, queries in tables.items():
        count = _safe_query(conn, queries["count"], default=0)
        latest_sql = queries["latest"]
        latest = _safe_query(conn, latest_sql) if latest_sql else None
        if count == 0 and latest is None:
            # _safe_query hides OperationalError, so an empty table and a
            # missing one look the same; re-run the count to tell them apart.
            try:
                conn.execute(queries["count"])
            except sqlite3.OperationalError:
                sources[table] = {"count": 0, "last_sync": None, "error": "table not found"}
                continue
        sources[table] = {"count": count, "last_sync": latest}

    # -- File breakdown by source ---------------------------------------------
    rows = _safe_fetchall(
        conn,
        """
        SELECT source, COUNT(*) as count, COALESCE(SUM(size_bytes), 0) as size
        FROM files WHERE status != 'removed'
        AND COALESCE(mcp_view, 'inherit') != 'hidden'
        GROUP BY source
        """,
    )
    files_by_source = {r["source"]: {"count": r["count"], "size_bytes": r["size"]} for r in rows}

    # -- File breakdown by status ---------------------------------------------
    rows = _safe_fetchall(
        conn,
        """
        SELECT status, COUNT(*) as count FROM files GROUP BY status
        """,
    )
    files_by_status = {r["status"]: r["count"] for r in rows}

    # -- Project count by status ----------------------------------------------
    rows = _safe_fetchall(
        conn,
        """
        SELECT status, COUNT(*) as count FROM projects GROUP BY status
        """,
    )
    projects_by_status = {r["status"]: r["count"] for r in rows}

    # -- Emails by client (excludes hidden clients and removed emails) --------
    rows = _safe_fetchall(
        conn,
        """
        SELECT COALESCE(client.name, '(unassigned)') AS client_name, COUNT(*) as count
        FROM emails email LEFT JOIN clients client ON email.client_id = client.id
        WHERE email.status != 'removed'
        AND (client.mcp_view IS NULL OR client.mcp_view != 'hidden')
        GROUP BY client_name
        """,
    )
    emails_by_client = {r["client_name"]: r["count"] for r in rows}

    # -- Chats by client (excludes hidden clients and removed chats) ----------
    # A NULL chat.status is treated as 'active', exactly as in the chats
    # count above; a bare "status != 'removed'" would silently drop those
    # rows and make this breakdown disagree with sources["chats"]["count"].
    rows = _safe_fetchall(
        conn,
        """
        SELECT COALESCE(client.name, '(unassigned)') AS client_name, COUNT(*) as count
        FROM chats chat LEFT JOIN clients client ON chat.client_id = client.id
        WHERE COALESCE(chat.status, 'active') != 'removed'
        AND (client.mcp_view IS NULL OR client.mcp_view != 'hidden')
        GROUP BY client_name
        """,
    )
    chats_by_client = {r["client_name"]: r["count"] for r in rows}

    return {
        "sources": sources,
        "files_by_source": files_by_source,
        "files_by_status": files_by_status,
        "projects_by_status": projects_by_status,
        "emails_by_client": emails_by_client,
        "chats_by_client": chats_by_client,
    }
205
+
206
+
207
def get_system_status(conn: sqlite3.Connection, config_path: Path) -> dict:
    """Summarise what the database holds and whether a config file exists.

    Parameters
    ----------
    conn : sqlite3.Connection
        Database connection with row_factory = sqlite3.Row.
    config_path : Path
        Absolute path to config.yaml (caller decides where it lives).

    Returns
    -------
    dict with keys: has_data, config_exists, counts, total, last_indexed
    """
    cur = conn.cursor()

    # Files and folders skip rows marked 'removed'; the other tables are
    # counted in full. A missing table simply counts as 0.
    count_queries = (
        ("files", "SELECT COUNT(*) FROM files WHERE status != 'removed'"),
        ("folders", "SELECT COUNT(*) FROM folders WHERE status != 'removed'"),
        ("visits", "SELECT COUNT(*) FROM visits"),
        ("emails", "SELECT COUNT(*) FROM emails"),
        ("messages", "SELECT COUNT(*) FROM messages"),
        ("projects", "SELECT COUNT(*) FROM projects"),
    )
    counts = {name: _safe_count(cur, sql) for name, sql in count_queries}

    try:
        last_indexed = cur.execute("SELECT MAX(indexed_at) FROM files").fetchone()[0]
    except sqlite3.OperationalError:
        last_indexed = None

    total = sum(counts.values())
    status = {"has_data": total > 0, "config_exists": config_path.exists()}
    status["counts"] = counts
    status["total"] = total
    status["last_indexed"] = last_indexed
    return status
247
+
248
+
249
+ # ---------------------------------------------------------------------------
250
+ # Write/aggregate operations
251
+ # ---------------------------------------------------------------------------
252
+
253
+
254
def get_stats(conn: sqlite3.Connection) -> Dict[str, Any]:
    """Collect headline statistics about files, visits, remote sources and emails.

    Result rows are read by column name, so *conn* must use a
    name-addressable row factory such as ``sqlite3.Row``. File figures ignore
    rows whose status is ``'removed'``.
    """
    cursor = conn.cursor()

    def single(sql: str, column: str, params=()):
        # Run a one-row query and pick the named column from that row.
        return cursor.execute(sql, params).fetchone()[column]

    stats: Dict[str, Any] = {}
    stats["total_files"] = single("SELECT COUNT(*) as count FROM files WHERE status != 'removed'", "count")
    stats["content_types"] = single(
        "SELECT COUNT(DISTINCT content_type) as count FROM files WHERE status != 'removed'", "count"
    )
    stats["total_urls"] = single("SELECT COUNT(*) as count FROM visits", "count")
    stats["browsers"] = single("SELECT COUNT(DISTINCT browser) as count FROM visits", "count")
    stats["latest_file_modified"] = single(
        """
        SELECT MAX(modified_at) as latest_file
        FROM files
        WHERE modified_at IS NOT NULL AND status != 'removed'
        """,
        "latest_file",
    )
    stats["latest_visit"] = single("SELECT MAX(visit_time) as latest_visit FROM visits", "latest_visit")

    # Remote source names come straight from the sources table.
    remote_sources = [
        row["name"] for row in cursor.execute("SELECT name FROM sources WHERE source_type = 'remote'").fetchall()
    ]

    if not remote_sources:
        stats["remote_files"] = 0
        stats["remote_sources"] = 0
    else:
        remote_ph = ",".join(["?"] * len(remote_sources))
        stats["remote_files"] = single(
            f"SELECT COUNT(*) as count FROM files WHERE source IN ({remote_ph}) AND status != 'removed'",
            "count",
            remote_sources,
        )
        stats["remote_sources"] = single(
            f"SELECT COUNT(DISTINCT source) as count FROM files WHERE source IN ({remote_ph}) AND status != 'removed'",
            "count",
            remote_sources,
        )

    stats["total_emails"] = single("SELECT COUNT(*) as count FROM emails", "count")
    stats["email_accounts"] = single("SELECT COUNT(DISTINCT account) as count FROM emails", "count")
    return stats
@@ -0,0 +1,70 @@
1
+ """Upload tracking — CRUD for the uploads table."""
2
+
3
+ import json
4
+ import sqlite3
5
+ from typing import Any, Dict, List, Optional
6
+
7
+
8
def create_upload(conn: sqlite3.Connection, data: Dict[str, Any]) -> int:
    """Insert one row into ``uploads`` and return its row id.

    ``filename``, ``file_hash`` and ``type`` are required keys of *data*.
    ``file_size`` and ``source`` default to NULL, ``status`` to
    ``"pending"``, and ``metadata`` (default ``{}``) is stored as JSON text.
    The insert is not committed here.
    """
    cursor = conn.cursor()
    record = (
        data["filename"],
        data["file_hash"],
        data.get("file_size"),
        data["type"],
        data.get("source"),
        data.get("status", "pending"),
        json.dumps(data.get("metadata", {})),
    )
    cursor.execute(
        """
        INSERT INTO uploads
        (filename, file_hash, file_size, type, source, status, metadata)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        """,
        record,
    )
    return cursor.lastrowid
28
+
29
+
30
def update_upload(conn: sqlite3.Connection, upload_id: int, **kwargs) -> None:
    """Write result fields onto the upload row with id *upload_id*.

    Only the whitelisted columns below can be set; any other keyword is
    silently ignored. When at least one accepted field is given the UPDATE
    is executed and committed; otherwise nothing happens.
    """
    allowed = {
        "status",
        "items_added",
        "items_updated",
        "items_total",
        "completed_at",
        "error_message",
    }
    accepted = [(name, value) for name, value in kwargs.items() if name in allowed]
    if not accepted:
        return

    # Column names come from the whitelist, so interpolating them is safe;
    # the values themselves are always bound as parameters.
    assignments = ", ".join(f"{name} = ?" for name, _ in accepted)
    values = [value for _, value in accepted]
    values.append(upload_id)
    conn.cursor().execute(f"UPDATE uploads SET {assignments} WHERE id = ?", values)
    conn.commit()
50
+
51
+
52
def get_upload_by_hash(conn: sqlite3.Connection, file_hash: str) -> Optional[Dict]:
    """Look up an upload by its file hash.

    Returns the first matching row converted to a dict, or ``None`` when no
    upload has that hash.
    """
    match = conn.cursor().execute("SELECT * FROM uploads WHERE file_hash = ?", (file_hash,)).fetchone()
    if not match:
        return None
    return dict(match)
58
+
59
+
60
def get_recent_uploads(conn: sqlite3.Connection, upload_type: Optional[str] = None, limit: int = 10) -> List[Dict]:
    """List the newest uploads as dicts, newest ``uploaded_at`` first.

    At most *limit* rows are returned. A truthy *upload_type* restricts the
    result to rows whose ``type`` equals it.
    """
    sql = "SELECT * FROM uploads"
    bound: list = []
    if upload_type:
        sql += " WHERE type = ?"
        bound.append(upload_type)
    sql += " ORDER BY uploaded_at DESC LIMIT ?"
    bound.append(limit)

    cursor = conn.cursor()
    cursor.execute(sql, tuple(bound))
    return list(map(dict, cursor.fetchall()))
File without changes
@@ -0,0 +1,33 @@
1
+ """Pipeline adapter types and concrete adapters.
2
+
3
+ Re-exports the core types and built-in source adapters:
4
+
5
+ from footprinter.ingest.adapters import PipeAdapter, PipeResult
6
+ from footprinter.ingest.adapters import LocalFoldersAdapter, BrowserAdapter
7
+
8
+ Connector adapters (Drive, Gmail) live in connectors/google/adapters/.
9
+ """
10
+
11
+ from footprinter.ingest.adapters.browser import BrowserAdapter
12
+ from footprinter.ingest.adapters.chat import ChatAdapter
13
+ from footprinter.ingest.adapters.local_files import LocalFilesAdapter
14
+ from footprinter.ingest.adapters.local_folders import LocalFoldersAdapter
15
+ from footprinter.ingest.adapters.protocol import (
16
+ ErrorType,
17
+ PipeAdapter,
18
+ PipeContext,
19
+ PipeResult,
20
+ PipeStatus,
21
+ )
22
+
23
+ __all__ = [
24
+ "BrowserAdapter",
25
+ "ChatAdapter",
26
+ "PipeAdapter",
27
+ "PipeContext",
28
+ "ErrorType",
29
+ "LocalFilesAdapter",
30
+ "LocalFoldersAdapter",
31
+ "PipeResult",
32
+ "PipeStatus",
33
+ ]
@@ -0,0 +1,54 @@
1
+ """Browser adapter.
2
+
3
+ Wraps BrowserManager to conform to PipeAdapter protocol.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from functools import partial
10
+ from typing import Any, Dict, List
11
+
12
+ from footprinter.db import browser as browser_db
13
+ from footprinter.ingest.adapters.ingest import ingest_entries
14
+ from footprinter.ingest.adapters.protocol import ErrorType, PipeContext, PipeResult
15
+ from footprinter.ingest.browser_indexer import BrowserManager
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class BrowserAdapter:
    """Adapter wrapping BrowserManager for the browser stage."""

    name = "browser"
    pipe_name = "browser"
    required_extras: List[str] = []

    def run(self, db: Any, ctx: PipeContext) -> PipeResult:
        """Index browser history into visits table.

        Entries produced by ``BrowserManager.parse_all()`` are passed to
        ``ingest_entries``, which stores them through
        ``browser_db.insert_visit`` bound to ``db.conn``. When
        ``ctx.full_mode`` is set the manager is created with ``since=None``;
        otherwise ``ctx.last_run`` is used.

        Any exception is logged with its traceback and converted into an
        error ``PipeResult`` of type ``ErrorType.RUNTIME`` rather than being
        propagated to the caller.
        """
        try:
            since = None if ctx.full_mode else ctx.last_run
            manager = BrowserManager(ctx.source_config, since=since)
            return ingest_entries(
                "browser",
                manager.parse_all(),
                partial(browser_db.insert_visit, db.conn),
                count_label="urls_indexed",
                conn=db.conn,
                on_progress=ctx.on_progress,
            )
        except Exception as e:
            # Stage boundary: keep the pipeline alive, but record the full
            # traceback so the failure can actually be diagnosed.
            logger.exception("browser stage failed: %s", e)
            return PipeResult.make_error(
                "browser",
                error=str(e),
                error_type=ErrorType.RUNTIME,
            )

    def status(self, db: Any) -> Dict[str, Any]:
        """Return browser visit entry count as ``{"visits": <count>}``."""
        cursor = db.conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM visits")
        count = cursor.fetchone()[0]
        return {"visits": count}