footprinter-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +444 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/cli/__init__.py +128 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +332 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +579 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +1836 -0
- footprinter/cli/status.py +729 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +610 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +741 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +515 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +328 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +261 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +125 -0
- footprinter/ingest/pipe_runner.py +217 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +201 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +57 -0
- footprinter/mcp/errors.py +102 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +15 -0
- footprinter/paths.py +91 -0
- footprinter/permissions.py +1160 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1272 -0
- footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0.dist-info/METADATA +229 -0
- footprinter_cli-1.0.0.dist-info/RECORD +134 -0
- footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Shared SQL helper functions for building dynamic CASE/WHEN clauses,
|
|
2
|
+
pagination utilities, and chunked query execution."""
|
|
3
|
+
|
|
4
|
+
import sqlite3
|
|
5
|
+
|
|
6
|
+
# Stay well under SQLite's variable limit (999 on older builds, 32766 on newer).
_SQLITE_VAR_LIMIT = 500


def chunked_query(cursor, sql_template: str, item_ids: list[int]) -> list:
    """Execute a query in chunks to stay under SQLite's variable limit.

    The *sql_template* must contain a ``{placeholders}`` marker where the
    ``IN (?, ?, ...)`` list will be inserted.  The template is rendered with
    ``str.format``, so any other literal braces in it must be doubled.

    Parameters
    ----------
    cursor
        DB-API cursor providing ``execute`` and ``fetchall``.
    sql_template : str
        SQL text containing the ``{placeholders}`` marker.
    item_ids
        Values to bind.  Any iterable is accepted (list, tuple, set,
        generator); it is materialised once so it can be sliced.

    Returns
    -------
    list
        Rows from every chunk, concatenated in chunk order.  Empty when
        *item_ids* is empty (no query is executed).
    """
    # Materialise once: sets and generators support neither len() nor slicing.
    ids = list(item_ids)
    results = []
    for start in range(0, len(ids), _SQLITE_VAR_LIMIT):
        chunk = ids[start : start + _SQLITE_VAR_LIMIT]
        placeholders = ",".join("?" * len(chunk))
        cursor.execute(sql_template.format(placeholders=placeholders), chunk)
        results.extend(cursor.fetchall())
    return results
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def paginate(
    conn: sqlite3.Connection,
    count_sql: str,
    fetch_sql: str,
    params,
    *,
    page: int = 1,
    limit: int = 50,
) -> tuple[list, dict]:
    """Execute a count + paginated fetch and return (rows, pagination_dict).

    Parameters
    ----------
    conn : sqlite3.Connection
        Connection used for both queries.
    count_sql : str
        Query whose first column of the first row is the total row count.
    fetch_sql : str
        Query that takes the same parameters as *count_sql* followed by two
        extra placeholders, bound to ``limit`` and ``offset`` in that order.
    params
        Iterable of bind values shared by both queries.
    page : int
        1-based page number.  Values below 1 are treated as 1.
    limit : int
        Page size.  Values below 1 are treated as 1.

    Returns
    -------
    (rows, pagination) where ``pagination`` has the keys ``page``, ``limit``,
    ``total`` and ``total_pages`` (``total_pages`` is at least 1).  ``page``
    and ``limit`` report the values actually used.
    """
    # Clamp instead of failing: limit=0 would divide by zero below, and a
    # negative LIMIT makes SQLite return an unbounded result set.
    page = max(1, page)
    limit = max(1, limit)

    bind_values = list(params)
    total = conn.execute(count_sql, bind_values).fetchone()[0]
    total_pages = max(1, (total + limit - 1) // limit)
    offset = (page - 1) * limit
    rows = conn.execute(fetch_sql, bind_values + [limit, offset]).fetchall()
    return rows, {"page": page, "limit": limit, "total": total, "total_pages": total_pages}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def paginated_response(entity_key: str, items, pagination: dict, **extras) -> dict:
    """Wrap *items* and *pagination* in the standard paginated envelope.

    The result maps *entity_key* to *items*, ``"pagination"`` to
    *pagination*, and then merges in any keyword *extras* (which win on
    key collisions).
    """
    envelope = {entity_key: items, "pagination": pagination}
    envelope.update(extras)
    return envelope
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def build_location_case_clauses(
    config: dict, home: str, path_col: str = "path", prefix: str = ""
) -> tuple[list[str], list]:
    """Build CASE/WHEN SQL clauses from config directories.

    For every entry in ``config["directories"]`` one
    ``WHEN <path_col> LIKE ? THEN ?`` line is produced, bound to the
    directory's ``<dir>/%`` pattern and a display label.

    Parameters
    ----------
    config : dict
        Configuration mapping; the ``"directories"`` key may be missing or
        ``None``, in which case no clauses are produced.
    home : str
        Replacement for a leading ``~`` in a directory entry.
    path_col : str
        Column reference interpolated into the SQL fragment.
    prefix : str
        Text prepended to each label.

    Returns (case_lines: list[str], params: list).
    """
    # ``or []`` also covers a key that is present but set to None.
    dirs = config.get("directories") or []
    case_lines: list[str] = []
    params: list = []
    for d in dirs:
        # Strip trailing slashes first, otherwise the pattern becomes
        # "dir//%" and never matches a stored path.
        trimmed = d.rstrip("/")
        # Expand only a leading "~"; a tilde elsewhere in the path is literal.
        if trimmed == "~" or trimmed.startswith("~/"):
            expanded = home + trimmed[1:]
        else:
            expanded = trimmed
        label = trimmed.split("/")[-1]
        # Dot-directories keep their exact name; everything else is title-cased.
        if not label.startswith("."):
            label = label.title()
        case_lines.append(f"WHEN {path_col} LIKE ? THEN ?")
        params.extend([expanded + "/%", prefix + label])
    return case_lines, params
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def build_remote_source_label_clauses(
    sources_data: list[dict],
) -> tuple[list[str], list]:
    """Produce CASE/WHEN lines that label rows coming from remote sources.

    Only entries whose ``"source_type"`` is ``"remote"`` contribute; each
    adds one ``WHEN source = ? THEN ?`` line bound to the source name and a
    ``Drive (indexed): <account>`` label.

    Returns (case_lines: list[str], params: list)."""
    when_lines: list[str] = []
    bind_values: list = []
    for src in sources_data:
        if src["source_type"] != "remote":
            continue
        when_lines.append("WHEN source = ? THEN ?")
        bind_values += [src["name"], f"Drive (indexed): {src['account']}"]
    return when_lines, bind_values
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def build_status_filter(
    status: "str | list[str] | None",
    *,
    column: str,
    default_exclude: "list[str] | None" = None,
    default_include: "list[str] | None" = None,
) -> tuple[list[str], list]:
    """Translate a status selector into a WHERE fragment plus bind values.

    Returns (conditions, params); ``conditions`` holds zero or one SQL
    fragment that the caller can append to its WHERE clause list.

    Parameters
    ----------
    status : str, list[str], or None
        ``"all"`` → no filter at all, defaults are bypassed.
        ``None`` → fall back to the default filter described below.
        List of strings → ``IN`` clause; an empty list means no filter.
        Any other string → exact match.
    column : str
        Fully qualified column reference (e.g. ``"file.status"``).
    default_exclude : list[str], optional
        Statuses rejected (``NOT IN``) when ``status is None``.
    default_include : list[str], optional
        Statuses accepted (``IN``) when ``status is None``.

    ``default_exclude`` and ``default_include`` are meant to be mutually
    exclusive; when both are given, ``default_exclude`` wins.
    """

    def _membership(values, operator):
        # One "?" per value, e.g. "col IN (?,?,?)".
        marks = ",".join("?" for _ in values)
        return [f"{column} {operator} ({marks})"], list(values)

    no_filter = ([], [])

    if status == "all":
        return no_filter

    if status is None:
        if default_exclude:
            return _membership(default_exclude, "NOT IN")
        if default_include:
            return _membership(default_include, "IN")
        return no_filter

    if isinstance(status, list):
        return _membership(status, "IN") if status else no_filter

    # Anything else is treated as a single status value.
    return [f"{column} = ?"], [status]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def split_query_terms(query: str) -> list[str]:
    """Return the whitespace-separated terms of *query* that have 2+ characters."""
    return list(filter(lambda term: len(term) >= 2, query.split()))
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def build_fts5_query(terms: list[str]) -> str:
    """Render *terms* as a space-joined list of quoted FTS5 prefix tokens.

    Double quotes are stripped from every term; terms left with fewer than
    two characters are dropped.  Each survivor becomes ``"term"*``.
    """
    cleaned = [raw.replace('"', "") for raw in terms]
    tokens = []
    for word in cleaned:
        if len(word) >= 2:
            tokens.append(f'"{word}"*')
    return " ".join(tokens)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def build_term_conditions(
    columns: list[str],
    terms: list[str],
) -> tuple[str, list[str]]:
    """Build the LIKE filter requiring every term to match at least one column.

    Each term yields one parenthesised ``col LIKE ? OR ...`` group; groups
    are joined with ``AND``.  The bind list repeats ``%term%`` once per
    column, in group order.
    """
    # The group text is the same for every term; only the bind values differ.
    any_column = "(" + " OR ".join(f"{col} LIKE ?" for col in columns) + ")"
    where_sql = " AND ".join(any_column for _ in terms)
    bind_values: list[str] = [f"%{term}%" for term in terms for _ in columns]
    return where_sql, bind_values
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def build_remote_account_case_clauses(
    sources_data: list[dict],
) -> tuple[list[str], list]:
    """Produce simple-CASE lines that map remote source names to accounts.

    Only entries whose ``"source_type"`` is ``"remote"`` contribute; each
    adds one ``WHEN ? THEN ?`` line bound to the source name and account.

    Returns (case_lines: list[str], params: list)."""
    when_lines: list[str] = []
    bind_values: list = []
    for src in sources_data:
        if src["source_type"] != "remote":
            continue
        when_lines.append("WHEN ? THEN ?")
        bind_values += [src["name"], src["account"]]
    return when_lines, bind_values
|
footprinter/db/status.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
"""System status queries — table counts, config presence, last-indexed timestamp.
|
|
2
|
+
|
|
3
|
+
Includes both MCP-oriented status and the legacy get_stats.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import sqlite3
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _safe_count(cursor: sqlite3.Cursor, query: str) -> int:
    """Run a COUNT query and return its value, or 0 on ``sqlite3.OperationalError``.

    The operational error is what SQLite raises when the table does not exist.
    """
    try:
        cursor.execute(query)
        first_row = cursor.fetchone()
    except sqlite3.OperationalError:
        return 0
    return first_row[0]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _safe_query(conn: sqlite3.Connection, query: str, *, default: Any = None) -> Any:
    """Return the first column of the first row produced by *query*.

    Falls back to ``default`` when the query yields no rows or raises
    ``sqlite3.OperationalError`` (e.g. the table doesn't exist).
    """
    try:
        first_row = conn.execute(query).fetchone()
    except sqlite3.OperationalError:
        return default
    if first_row:
        return first_row[0]
    return default
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _safe_fetchall(conn: sqlite3.Connection, query: str) -> list[sqlite3.Row]:
    """Return every row produced by *query*, or ``[]`` on ``sqlite3.OperationalError``.

    The operational error is what SQLite raises when the table is missing.
    """
    try:
        all_rows = conn.execute(query).fetchall()
    except sqlite3.OperationalError:
        return []
    return all_rows
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# -- Hidden-client NOT EXISTS clause (reused across source queries) -----------
# SQL template: fill ``{alias}`` (via ``str.format``) with the alias of the
# outer table whose ``client_id`` column should be checked. The resulting
# predicate is true for rows that are not linked to a client whose
# ``mcp_view`` is 'hidden'.
_NOT_HIDDEN_CLIENT = (
    "NOT EXISTS ( SELECT 1 FROM clients client WHERE client.id = {alias}.client_id AND client.mcp_view = 'hidden')"
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_mcp_status(conn: sqlite3.Connection) -> dict:
    """Return MCP-oriented status: per-source counts excluding hidden clients.

    Designed for ``role=Role.VIEWER`` callers — hidden-client rows are
    excluded from emails, chats, messages, and browser counts.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open database connection.  Breakdown rows are read by column name
        (e.g. ``r["source"]``), so the connection needs a row factory that
        supports that, such as ``sqlite3.Row``.

    Returns
    -------
    dict with keys:
        sources            – per table ``{"count", "last_sync"}``; a table
                             whose count query fails gets
                             ``{"count": 0, "last_sync": None,
                             "error": "table not found"}``.
        files_by_source    – ``{source: {"count", "size_bytes"}}``.
        files_by_status    – ``{status: count}``.
        projects_by_status – ``{status: count}``.
        emails_by_client   – ``{client_name: count}``.
        chats_by_client    – ``{client_name: count}``.

    Breakdowns whose query raises ``sqlite3.OperationalError`` come back as
    empty dicts.
    """
    # -- Per-table counts and last-sync timestamps ----------------------------
    tables: Dict[str, Dict[str, Optional[str]]] = {
        "files": {
            "count": (
                "SELECT COUNT(*) FROM files WHERE status != 'removed' AND COALESCE(mcp_view, 'inherit') != 'hidden'"
            ),
            "latest": (
                "SELECT MAX(indexed_at) FROM files "
                "WHERE status != 'removed' "
                "AND COALESCE(mcp_view, 'inherit') != 'hidden'"
            ),
        },
        "emails": {
            "count": (
                "SELECT COUNT(*) FROM emails email "
                "WHERE email.status != 'removed' "
                f"AND {_NOT_HIDDEN_CLIENT.format(alias='email')}"
            ),
            "latest": (
                "SELECT MAX(indexed_at) FROM emails email "
                "WHERE email.status != 'removed' "
                f"AND {_NOT_HIDDEN_CLIENT.format(alias='email')}"
            ),
        },
        "chats": {
            "count": (
                "SELECT COUNT(*) FROM chats chat "
                "WHERE COALESCE(chat.status, 'active') != 'removed' "
                f"AND {_NOT_HIDDEN_CLIENT.format(alias='chat')}"
            ),
            "latest": (
                "SELECT MAX(modified_at) FROM chats chat "
                "WHERE COALESCE(chat.status, 'active') != 'removed' "
                f"AND {_NOT_HIDDEN_CLIENT.format(alias='chat')}"
            ),
        },
        "messages": {
            "count": (
                "SELECT COUNT(*) FROM messages message "
                "WHERE NOT EXISTS ("
                " SELECT 1 FROM chats chat"
                " JOIN clients client ON client.id = chat.client_id"
                " WHERE chat.id = message.chat_id AND client.mcp_view = 'hidden'"
                ")"
            ),
            "latest": (
                "SELECT MAX(created_at) FROM messages message "
                "WHERE NOT EXISTS ("
                " SELECT 1 FROM chats chat"
                " JOIN clients client ON client.id = chat.client_id"
                " WHERE chat.id = message.chat_id AND client.mcp_view = 'hidden'"
                ")"
            ),
        },
        "browser": {
            "count": (
                "SELECT COUNT(*) FROM visits bv "
                "WHERE bv.status != 'removed' "
                f"AND {_NOT_HIDDEN_CLIENT.format(alias='bv')}"
            ),
            "latest": (
                "SELECT MAX(visit_time) FROM visits bv "
                "WHERE bv.status != 'removed' "
                f"AND {_NOT_HIDDEN_CLIENT.format(alias='bv')}"
            ),
        },
        "projects": {
            "count": "SELECT COUNT(*) FROM projects",
            "latest": "SELECT MAX(created_at) FROM projects",
        },
        "clients": {
            "count": ("SELECT COUNT(*) FROM clients WHERE COALESCE(mcp_view, 'inherit') != 'hidden'"),
            "latest": None,
        },
    }

    sources: Dict[str, dict] = {}
    for table, queries in tables.items():
        count = _safe_query(conn, queries["count"], default=0)
        latest = _safe_query(conn, queries["latest"]) if queries["latest"] else None
        if count == 0 and latest is None:
            # _safe_query swallows OperationalError, so a zero count may really
            # be a failed query; re-run it unguarded to tell the two apart.
            try:
                conn.execute(queries["count"])
            except sqlite3.OperationalError:
                sources[table] = {"count": 0, "last_sync": None, "error": "table not found"}
                continue
        sources[table] = {"count": count, "last_sync": latest}

    # -- File breakdown by source ---------------------------------------------
    rows = _safe_fetchall(
        conn,
        """
        SELECT source, COUNT(*) as count, COALESCE(SUM(size_bytes), 0) as size
        FROM files WHERE status != 'removed'
        AND COALESCE(mcp_view, 'inherit') != 'hidden'
        GROUP BY source
        """,
    )
    files_by_source = {r["source"]: {"count": r["count"], "size_bytes": r["size"]} for r in rows}

    # -- File breakdown by status ---------------------------------------------
    # NOTE(review): unlike the other file queries this one is not filtered by
    # status or mcp_view — confirm that including hidden files is intended.
    rows = _safe_fetchall(
        conn,
        """
        SELECT status, COUNT(*) as count FROM files GROUP BY status
        """,
    )
    files_by_status = {r["status"]: r["count"] for r in rows}

    # -- Project count by status ----------------------------------------------
    rows = _safe_fetchall(
        conn,
        """
        SELECT status, COUNT(*) as count FROM projects GROUP BY status
        """,
    )
    projects_by_status = {r["status"]: r["count"] for r in rows}

    # -- Emails by client (excludes hidden clients and removed emails) --------
    rows = _safe_fetchall(
        conn,
        """
        SELECT COALESCE(client.name, '(unassigned)') AS client_name, COUNT(*) as count
        FROM emails email LEFT JOIN clients client ON email.client_id = client.id
        WHERE email.status != 'removed'
        AND (client.mcp_view IS NULL OR client.mcp_view != 'hidden')
        GROUP BY client_name
        """,
    )
    emails_by_client = {r["client_name"]: r["count"] for r in rows}

    # -- Chats by client (excludes hidden clients and removed chats) ----------
    # COALESCE keeps chats with a NULL status, matching the "chats" count
    # above; a bare `chat.status != 'removed'` silently drops them.
    rows = _safe_fetchall(
        conn,
        """
        SELECT COALESCE(client.name, '(unassigned)') AS client_name, COUNT(*) as count
        FROM chats chat LEFT JOIN clients client ON chat.client_id = client.id
        WHERE COALESCE(chat.status, 'active') != 'removed'
        AND (client.mcp_view IS NULL OR client.mcp_view != 'hidden')
        GROUP BY client_name
        """,
    )
    chats_by_client = {r["client_name"]: r["count"] for r in rows}

    return {
        "sources": sources,
        "files_by_source": files_by_source,
        "files_by_status": files_by_status,
        "projects_by_status": projects_by_status,
        "emails_by_client": emails_by_client,
        "chats_by_client": chats_by_client,
    }
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def get_system_status(conn: sqlite3.Connection, config_path: Path) -> dict:
    """Summarise what the database holds and whether the config file exists.

    Parameters
    ----------
    conn : sqlite3.Connection
        Database connection with row_factory = sqlite3.Row.
    config_path : Path
        Absolute path to config.yaml (caller decides where it lives).

    Returns
    -------
    dict with keys: has_data, config_exists, counts, total, last_indexed
    """
    cur = conn.cursor()

    # One COUNT query per table; _safe_count turns a failed query into 0.
    count_queries = {
        "files": "SELECT COUNT(*) FROM files WHERE status != 'removed'",
        "folders": "SELECT COUNT(*) FROM folders WHERE status != 'removed'",
        "visits": "SELECT COUNT(*) FROM visits",
        "emails": "SELECT COUNT(*) FROM emails",
        "messages": "SELECT COUNT(*) FROM messages",
        "projects": "SELECT COUNT(*) FROM projects",
    }
    counts = {table: _safe_count(cur, sql) for table, sql in count_queries.items()}

    last_indexed = None
    try:
        cur.execute("SELECT MAX(indexed_at) FROM files")
        last_indexed = cur.fetchone()[0]
    except sqlite3.OperationalError:
        # Same fallback as the counts: treat a failed query as "never indexed".
        last_indexed = None

    total = sum(counts.values())
    has_data = total > 0

    return {
        "has_data": has_data,
        "config_exists": config_path.exists(),
        "counts": counts,
        "total": total,
        "last_indexed": last_indexed,
    }
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# ---------------------------------------------------------------------------
|
|
250
|
+
# Legacy aggregate statistics (read-only)
|
|
251
|
+
# ---------------------------------------------------------------------------
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def get_stats(conn: sqlite3.Connection) -> Dict[str, Any]:
    """Collect headline statistics about files, visits, remote sources and emails.

    Results are read by column name, so *conn* must use a row factory that
    supports key access (e.g. ``sqlite3.Row``).  Unlike the guarded status
    helpers, a missing table raises ``sqlite3.OperationalError``.
    """
    cursor = conn.cursor()

    def fetch_value(sql: str, column: str, params=()):
        # Run one single-row query and pull the named column out of it.
        cursor.execute(sql, params)
        return cursor.fetchone()[column]

    files_count = fetch_value("SELECT COUNT(*) as count FROM files WHERE status != 'removed'", "count")
    content_types_count = fetch_value(
        "SELECT COUNT(DISTINCT content_type) as count FROM files WHERE status != 'removed'", "count"
    )
    urls_count = fetch_value("SELECT COUNT(*) as count FROM visits", "count")
    browsers_count = fetch_value("SELECT COUNT(DISTINCT browser) as count FROM visits", "count")
    latest_file = fetch_value(
        """
        SELECT MAX(modified_at) as latest_file
        FROM files
        WHERE modified_at IS NOT NULL AND status != 'removed'
        """,
        "latest_file",
    )
    latest_visit = fetch_value("SELECT MAX(visit_time) as latest_visit FROM visits", "latest_visit")

    # Query remote source names directly from the sources table
    source_rows = cursor.execute("SELECT name FROM sources WHERE source_type = 'remote'").fetchall()
    remote_sources = [row["name"] for row in source_rows]

    remote_files_count = 0
    remote_sources_count = 0
    if remote_sources:
        # One placeholder per remote source name for the IN (...) lists below.
        remote_ph = ",".join("?" * len(remote_sources))
        remote_files_count = fetch_value(
            f"SELECT COUNT(*) as count FROM files WHERE source IN ({remote_ph}) AND status != 'removed'",
            "count",
            remote_sources,
        )
        remote_sources_count = fetch_value(
            f"SELECT COUNT(DISTINCT source) as count FROM files WHERE source IN ({remote_ph}) AND status != 'removed'",
            "count",
            remote_sources,
        )

    emails_count = fetch_value("SELECT COUNT(*) as count FROM emails", "count")
    email_accounts_count = fetch_value("SELECT COUNT(DISTINCT account) as count FROM emails", "count")

    return {
        "total_files": files_count,
        "content_types": content_types_count,
        "total_urls": urls_count,
        "browsers": browsers_count,
        "latest_file_modified": latest_file,
        "latest_visit": latest_visit,
        "remote_files": remote_files_count,
        "remote_sources": remote_sources_count,
        "total_emails": emails_count,
        "email_accounts": email_accounts_count,
    }
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Upload tracking — CRUD for the uploads table."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sqlite3
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create_upload(conn: sqlite3.Connection, data: Dict[str, Any]) -> int:
    """Insert one row into ``uploads`` and return its row id.

    ``filename``, ``file_hash`` and ``type`` are required keys of *data*.
    ``file_size`` and ``source`` default to NULL, ``status`` to
    ``"pending"``, and ``metadata`` (stored as JSON text) to ``{}``.
    This function does not commit.
    """
    cursor = conn.cursor()
    metadata_json = json.dumps(data.get("metadata", {}))
    row_values = (
        data["filename"],
        data["file_hash"],
        data.get("file_size"),
        data["type"],
        data.get("source"),
        data.get("status", "pending"),
        metadata_json,
    )
    cursor.execute(
        """
        INSERT INTO uploads
        (filename, file_hash, file_size, type, source, status, metadata)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        """,
        row_values,
    )
    return cursor.lastrowid
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def update_upload(conn: sqlite3.Connection, upload_id: int, **kwargs) -> None:
    """Write the given result fields onto an upload row and commit.

    Only ``status``, ``items_added``, ``items_updated``, ``items_total``,
    ``completed_at`` and ``error_message`` are accepted; any other keyword
    is ignored.  When nothing acceptable is passed, no statement runs and
    nothing is committed.
    """
    allowed = {
        "status",
        "items_added",
        "items_updated",
        "items_total",
        "completed_at",
        "error_message",
    }
    accepted = {field: value for field, value in kwargs.items() if field in allowed}
    if not accepted:
        return

    # Column names come from the allow-list above; values are always bound.
    assignments = ", ".join(f"{field} = ?" for field in accepted)
    bind_values = [*accepted.values(), upload_id]
    cursor = conn.cursor()
    cursor.execute(f"UPDATE uploads SET {assignments} WHERE id = ?", bind_values)
    conn.commit()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_upload_by_hash(conn: sqlite3.Connection, file_hash: str) -> Optional[Dict]:
    """Look up an upload by file hash.

    Returns the first matching row as a dict, or ``None`` when no upload
    has that hash.  Converting the row with ``dict()`` needs a mapping-like
    row factory such as ``sqlite3.Row``.
    """
    lookup = conn.cursor()
    lookup.execute("SELECT * FROM uploads WHERE file_hash = ?", (file_hash,))
    match = lookup.fetchone()
    if match:
        return dict(match)
    return None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_recent_uploads(conn: sqlite3.Connection, upload_type: Optional[str] = None, limit: int = 10) -> List[Dict]:
    """Return up to *limit* uploads as dicts, newest ``uploaded_at`` first.

    A truthy *upload_type* restricts the result to that type; ``None`` or
    an empty string returns uploads of every type.
    """
    if upload_type:
        sql = "SELECT * FROM uploads WHERE type = ? ORDER BY uploaded_at DESC LIMIT ?"
        bind_values = (upload_type, limit)
    else:
        sql = "SELECT * FROM uploads ORDER BY uploaded_at DESC LIMIT ?"
        bind_values = (limit,)

    cursor = conn.cursor()
    cursor.execute(sql, bind_values)
    return list(map(dict, cursor.fetchall()))
|
|
File without changes
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Pipeline adapter types and concrete adapters.
|
|
2
|
+
|
|
3
|
+
Re-exports the core types and built-in source adapters:
|
|
4
|
+
|
|
5
|
+
from footprinter.ingest.adapters import PipeAdapter, PipeResult
|
|
6
|
+
from footprinter.ingest.adapters import LocalFoldersAdapter, BrowserAdapter
|
|
7
|
+
|
|
8
|
+
Connector adapters (Drive, Gmail) live in connectors/google/adapters/.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from footprinter.ingest.adapters.browser import BrowserAdapter
|
|
12
|
+
from footprinter.ingest.adapters.chat import ChatAdapter
|
|
13
|
+
from footprinter.ingest.adapters.local_files import LocalFilesAdapter
|
|
14
|
+
from footprinter.ingest.adapters.local_folders import LocalFoldersAdapter
|
|
15
|
+
from footprinter.ingest.adapters.protocol import (
|
|
16
|
+
ErrorType,
|
|
17
|
+
PipeAdapter,
|
|
18
|
+
PipeContext,
|
|
19
|
+
PipeResult,
|
|
20
|
+
PipeStatus,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Names exported by ``from footprinter.ingest.adapters import *``: the
# built-in source adapters plus the core types imported from ``protocol``.
__all__ = [
    "BrowserAdapter",
    "ChatAdapter",
    "PipeAdapter",
    "PipeContext",
    "ErrorType",
    "LocalFilesAdapter",
    "LocalFoldersAdapter",
    "PipeResult",
    "PipeStatus",
]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Browser adapter.
|
|
2
|
+
|
|
3
|
+
Wraps BrowserManager to conform to PipeAdapter protocol.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Any, Dict, List
|
|
11
|
+
|
|
12
|
+
from footprinter.db import browser as browser_db
|
|
13
|
+
from footprinter.ingest.adapters.ingest import ingest_entries
|
|
14
|
+
from footprinter.ingest.adapters.protocol import ErrorType, PipeContext, PipeResult
|
|
15
|
+
from footprinter.ingest.browser_indexer import BrowserManager
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BrowserAdapter:
    """Adapter wrapping BrowserManager for the browser stage."""

    # Identifiers for this adapter/pipe and the optional extras it needs (none).
    name = "browser"
    pipe_name = "browser"
    required_extras: List[str] = []

    def run(self, db: Any, ctx: PipeContext) -> PipeResult:
        """Index browser history into visits table.

        Parameters
        ----------
        db
            Database wrapper exposing the open connection as ``db.conn``.
        ctx : PipeContext
            Run context; this method reads ``full_mode``, ``last_run``,
            ``source_config`` and ``on_progress`` from it.

        Returns
        -------
        PipeResult
            The result of ``ingest_entries`` on success.  Any exception is
            logged and converted into ``PipeResult.make_error`` with
            ``ErrorType.RUNTIME`` instead of propagating.
        """
        try:
            # A full run ignores the previous run marker and passes since=None.
            last_run = None if ctx.full_mode else ctx.last_run
            manager = BrowserManager(ctx.source_config, since=last_run)
            return ingest_entries(
                "browser",
                manager.parse_all(),
                partial(browser_db.insert_visit, db.conn),
                count_label="urls_indexed",
                conn=db.conn,
                on_progress=ctx.on_progress,
            )
        except Exception as e:
            # logger.exception keeps the traceback, which the plain error
            # string returned to the caller does not carry.
            logger.exception("browser stage failed: %s", e)
            return PipeResult.make_error(
                "browser",
                error=str(e),
                error_type=ErrorType.RUNTIME,
            )

    def status(self, db: Any) -> Dict[str, Any]:
        """Return browser visit entry count.

        The result is ``{"visits": <number of rows in the visits table>}``.
        """
        cursor = db.conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM visits")
        count = cursor.fetchone()[0]
        return {"visits": count}
|