footprinter-cli 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +431 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/bundled/samples/hidden-client-file-sample.txt +2 -0
- footprinter/bundled/samples/opaque-project-file-sample.txt +2 -0
- footprinter/bundled/samples/visible-file-sample.txt +2 -0
- footprinter/cli/__init__.py +135 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +327 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/_sample_seed.py +204 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +543 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +2001 -0
- footprinter/cli/status.py +747 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +602 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +724 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +487 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +315 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +223 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +112 -0
- footprinter/ingest/pipe_runner.py +200 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +186 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +67 -0
- footprinter/mcp/errors.py +105 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +19 -0
- footprinter/paths.py +117 -0
- footprinter/permissions.py +1152 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1264 -0
- footprinter_cli-1.0.0rc1.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0rc1.dist-info/METADATA +223 -0
- footprinter_cli-1.0.0rc1.dist-info/RECORD +138 -0
- footprinter_cli-1.0.0rc1.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0rc1.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0rc1.dist-info/top_level.txt +1 -0
footprinter/db/files.py
ADDED
|
@@ -0,0 +1,724 @@
|
|
|
1
|
+
"""File queries and write operations.
|
|
2
|
+
|
|
3
|
+
Provides list, detail, status-update, and insert functions for indexed files.
|
|
4
|
+
All functions take a raw ``sqlite3.Connection`` and return plain dicts.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import sqlite3
|
|
11
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
12
|
+
|
|
13
|
+
from footprinter.db.sql_utils import build_status_filter, paginate, paginated_response
|
|
14
|
+
|
|
15
|
+
# Status values accepted by update_file_status(); anything else raises ValueError there.
VALID_FILE_STATUSES = frozenset({"active", "hidden", "removed"})
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def list_files(
    conn: sqlite3.Connection,
    *,
    project_id: Optional[int] = None,
    source: Optional[list[str]] = None,
    status: Optional[str | list[str]] = None,
    content_type: Optional[str] = None,
    limit: int = 50,
    page: int = 1,
) -> dict:
    """Return one page of indexed files, narrowed by the given filters.

    Every filter is optional and all supplied filters are AND-ed together.

    Parameters
    ----------
    conn : sqlite3.Connection
        Connection whose rows support access by column name.
    project_id : int, optional
        Keep only files assigned to this project.
    source : list[str], optional
        Keep only files whose ``source`` is one of these names
        (e.g. ``["local"]``, ``["workdrive"]``).
    status : str, list[str], or None
        Forwarded to ``build_status_filter`` with ``"removed"`` as the
        default exclusion:
        ``None`` → exclude removed (default);
        ``"all"`` → no status filter;
        single string → exact match;
        list of strings → ``WHERE status IN (...)``.
    content_type : str, optional
        Exact match on ``files.content_type``.
    limit, page : int
        Pagination controls, forwarded to ``paginate``.

    Returns
    -------
    dict
        ``{"files": [...], "pagination": {page, limit, total, total_pages}}``
    """
    select_sql = """
        SELECT file.id, file.name, file.path, file.source, file.status, file.content_type,
               file.size_bytes, file.modified_at, project.project_name,
               file.mcp_view, file.mcp_read
        FROM files file
        LEFT JOIN projects project ON file.project_id = project.id
    """

    # Seed the WHERE clause with whatever the shared status helper produced.
    status_clauses, status_values = build_status_filter(
        status,
        column="file.status",
        default_exclude=["removed"],
    )
    clauses: list[str] = list(status_clauses)
    bind_values: list = list(status_values)

    if project_id is not None:
        clauses.append("file.project_id = ?")
        bind_values.append(project_id)

    if source:
        marks = ",".join("?" for _ in source)
        clauses.append(f"file.source IN ({marks})")
        bind_values.extend(source)

    if content_type:
        clauses.append("file.content_type = ?")
        bind_values.append(content_type)

    where_sql = ""
    if clauses:
        where_sql = " WHERE " + " AND ".join(clauses)
    filtered_sql = select_sql + where_sql

    # The count wraps the filtered query so both statements share one parameter list.
    rows, pagination = paginate(
        conn,
        f"SELECT COUNT(*) FROM ({filtered_sql}) _c",
        filtered_sql + " ORDER BY file.id LIMIT ? OFFSET ?",
        bind_values,
        page=page,
        limit=limit,
    )

    files = []
    for record in rows:
        files.append(
            {
                "id": record["id"],
                "name": record["name"],
                "path": record["path"],
                "source": record["source"],
                "status": record["status"],
                # NULL content type / project name are reported as empty strings.
                "content_type": record["content_type"] or "",
                "size_bytes": record["size_bytes"],
                "modified_at": record["modified_at"],
                "project_name": record["project_name"] or "",
                "mcp_view": record["mcp_view"],
                "mcp_read": record["mcp_read"],
            }
        )

    return paginated_response("files", files, pagination)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def get_file(
    conn: sqlite3.Connection,
    file_id: int,
) -> Optional[dict]:
    """Fetch the detail record for one file.

    The ``projects`` table is left-joined so the result carries
    ``project_name`` (``None`` when the file has no matching project).

    Returns:
        A plain dict of the file's columns, or ``None`` when no file has
        the given id.  ``mcp_view`` and ``mcp_read`` fall back to
        ``"inherit"`` when the stored value is NULL or empty.
    """
    record = conn.execute(
        """
        SELECT file.id, file.name, file.path, file.source, file.status, file.status_reason,
               file.content_type, file.mime_type, file.size_bytes, file.created_at,
               file.modified_at, file.indexed_at, file.project_id, file.md5_hash,
               file.external_id, file.account,
               file.mcp_view, file.mcp_read,
               project.project_name
        FROM files file
        LEFT JOIN projects project ON file.project_id = project.id
        WHERE file.id = ?
        """,
        (file_id,),
    ).fetchone()

    if record is None:
        return None

    # Columns copied through verbatim, in the order callers receive them.
    passthrough = (
        "id",
        "name",
        "path",
        "source",
        "status",
        "status_reason",
        "content_type",
        "mime_type",
        "size_bytes",
        "created_at",
        "modified_at",
        "indexed_at",
        "project_id",
        "md5_hash",
        "external_id",
        "account",
        "project_name",
    )
    detail = {column: record[column] for column in passthrough}

    # The two MCP flags default to "inherit" when unset.
    for flag in ("mcp_view", "mcp_read"):
        detail[flag] = record[flag] or "inherit"

    return detail
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def update_file_status(
    conn: sqlite3.Connection,
    file_id: int,
    status: str,
    reason: Optional[str] = None,
) -> Optional[bool]:
    """Set a file's status and reason, stamping the change time.

    The change is committed on the given connection before returning.

    Returns:
        ``True`` when the file exists and was updated, ``None`` when no
        file has the given id.

    Raises:
        ValueError: if *status* is not one of ``VALID_FILE_STATUSES``.
    """
    if status not in VALID_FILE_STATUSES:
        allowed = ", ".join(sorted(VALID_FILE_STATUSES))
        raise ValueError(f"Invalid status '{status}'. Must be one of: {allowed}")

    # Probe for the row first so a missing file can be reported as None.
    found = conn.execute("SELECT id FROM files WHERE id = ?", (file_id,)).fetchone()
    if found is None:
        return None

    update_sql = """
        UPDATE files
        SET status = ?, status_reason = ?, status_changed_at = CURRENT_TIMESTAMP
        WHERE id = ?
    """
    conn.execute(update_sql, (status, reason, file_id))
    conn.commit()
    return True
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def update_file_relationships(
    conn: sqlite3.Connection,
    file_id: int,
    *,
    project_id: Optional[int] = None,
    client_id: Optional[int] = None,
) -> Optional[bool]:
    """Change which project and/or client a file is assigned to.

    An argument left as ``None`` leaves that column untouched; ``0`` clears
    it (stores NULL); any other value must be the id of an existing
    project/client.  Whenever at least one column is written, the statement
    also sets ``assignment_source = 'user'``.  If the database reports that a
    column does not exist (the original comment attributes this to tool-only
    DBs lacking ``assignment_source``), the update is retried without that
    stamp.  The change is committed before returning.

    Returns:
        ``True`` on success (including when nothing was requested),
        ``None`` when no file has the given id.

    Raises:
        ValueError: if a non-zero project or client id does not exist.
    """
    file_row = conn.execute("SELECT id FROM files WHERE id = ?", (file_id,)).fetchone()
    if file_row is None:
        return None

    # (column to write, existence query, label used in errors, requested value)
    requests = (
        ("project_id", "SELECT id FROM projects WHERE id = ?", "project", project_id),
        ("client_id", "SELECT id FROM clients WHERE id = ?", "client", client_id),
    )

    # Validate every referenced id up front so nothing is written on failure.
    for _column, exists_sql, label, wanted in requests:
        if wanted is None or wanted == 0:
            continue
        if not conn.execute(exists_sql, (wanted,)).fetchone():
            raise ValueError(f"No {label} with id {wanted}")

    assignments: list[str] = []
    values: list = []
    for column, _exists_sql, _label, wanted in requests:
        if wanted is None:
            continue
        if wanted == 0:
            assignments.append(f"{column} = NULL")
        else:
            assignments.append(f"{column} = ?")
            values.append(wanted)

    if not assignments:
        return True

    assignments.append("assignment_source = 'user'")
    values.append(file_id)
    try:
        conn.execute(f"UPDATE files SET {', '.join(assignments)} WHERE id = ?", values)
    except sqlite3.OperationalError as exc:
        if "no such column" not in str(exc):
            raise
        # assignment_source not present (tool-only DB): retry without the stamp.
        assignments.pop()
        conn.execute(f"UPDATE files SET {', '.join(assignments)} WHERE id = ?", values)
    conn.commit()
    return True
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def list_file_ids_under_path(
    conn: sqlite3.Connection,
    folder_path: str,
) -> List[int]:
    """Return IDs of all non-removed files whose path is under *folder_path*.

    "Under" means the stored path starts with ``folder_path`` followed by a
    ``/``; files in nested sub-folders are included.  A trailing ``/`` on
    *folder_path* is ignored, so ``"/a/b"`` and ``"/a/b/"`` are equivalent
    (previously the latter produced the pattern ``/a/b//%`` and matched
    nothing).

    Rows must be addressable by column name (``row["id"]``).
    """
    # Drop trailing separators so exactly one "/" sits between folder and child.
    normalized = folder_path.rstrip("/")

    # Escape the escape character first, then the LIKE wildcards, so that
    # literal "%" and "_" in folder names are matched literally.
    escaped = normalized.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    cursor = conn.execute(
        "SELECT id FROM files WHERE path LIKE ? ESCAPE '\\' AND status != 'removed'",
        (escaped + "/%",),
    )
    return [row["id"] for row in cursor.fetchall()]
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# ---------------------------------------------------------------------------
|
|
263
|
+
# Write operations
|
|
264
|
+
# ---------------------------------------------------------------------------
|
|
265
|
+
|
|
266
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _determine_file_status(name: str, path: str) -> tuple:
    """Classify a file as hidden or active from its name and path.

    A name beginning with ``.`` wins first (reason ``"dot_file"``); otherwise
    any ``/``-separated path segment beginning with ``.`` (other than a lone
    ``.``) marks the file as living in a dot folder.

    Returns:
        Tuple of (status, status_reason); the reason is ``None`` for
        active files.
    """
    if name.startswith("."):
        return "hidden", "dot_file"

    inside_dot_folder = any(
        segment.startswith(".") and segment not in ("", ".")
        for segment in path.split("/")
    )
    if inside_dot_folder:
        return "hidden", "in_dot_folder"

    return "active", None
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _find_project_for_path(
    conn: sqlite3.Connection,
    file_path: str,
    project_prefix_map: Optional[List[Tuple[str, int]]] = None,
) -> Optional[int]:
    """Find the most specific project whose ``root_path`` prefixes *file_path*.

    Args:
        conn: Database connection; only queried when no map is supplied.
            Rows must be addressable by column name.
        file_path: Path of the file being classified.
        project_prefix_map: Optional pre-loaded ``(root_path, project_id)``
            pairs.  The first pair whose root is a prefix wins, so the list
            is expected to be ordered longest root first (as
            ``build_project_prefix_map`` produces).

    Returns:
        The matching project id, or ``None`` when no root is a prefix.

    Both branches perform the same exact, case-sensitive string-prefix test.
    The SQL branch previously used ``? LIKE root_path || '%'``, which treated
    ``_`` and ``%`` inside a stored root as wildcards and ignored ASCII case,
    so it could disagree with the in-memory branch (e.g. root ``/w/my_proj``
    matching ``/w/myXproj/...``).
    """
    if project_prefix_map is not None:
        for root_path, project_id in project_prefix_map:
            if file_path.startswith(root_path):
                return project_id
        return None

    # substr/length give a literal prefix comparison with no pattern
    # interpretation of the stored root_path.
    row = conn.execute(
        """
        SELECT id FROM projects
        WHERE root_path IS NOT NULL
          AND substr(?, 1, length(root_path)) = root_path
        ORDER BY LENGTH(root_path) DESC
        LIMIT 1
        """,
        (file_path,),
    ).fetchone()
    return row["id"] if row else None
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _get_folder_project_id(
    conn: sqlite3.Connection,
    folder_id: int,
    folder_project_map: Optional[Dict[int, int]] = None,
) -> Optional[int]:
    """Return the project a folder belongs to, if any.

    When *folder_project_map* is supplied it is the only source consulted
    (a missing key yields ``None``); otherwise the ``folders`` table is
    queried.  Returns ``None`` when the folder is unknown or has no project.
    """
    if folder_project_map is None:
        record = conn.cursor().execute(
            "SELECT project_id FROM folders WHERE id = ?", (folder_id,)
        ).fetchone()
        if not record:
            return None
        return record["project_id"]

    return folder_project_map.get(folder_id)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _is_remote_source(conn: sqlite3.Connection, source: str) -> bool:
    """Tell whether *source* is registered in ``sources`` with type ``"remote"``.

    Unknown source names are reported as not remote.
    """
    record = conn.execute(
        "SELECT source_type FROM sources WHERE name = ?",
        (source,),
    ).fetchone()
    if record is None:
        return False
    return record["source_type"] == "remote"
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _find_folder_in_map(
    conn: sqlite3.Connection,
    source: str,
    path: str,
    folder_path_map: Dict[Tuple[str, str], int],
    remote_source_names: Optional[frozenset] = None,
) -> Optional[int]:
    """Resolve a file's folder id from a pre-loaded map, walking up ancestors.

    The file's parent directory is tried first, then each ancestor in turn,
    until one is present in *folder_path_map* (keyed by ``(source, path)``).
    For remote sources the looked-up path is prefixed as
    ``"<source>:<dir>"``; for everything else it is the bare directory.

    Args:
        conn: Only used to query remoteness when *remote_source_names* is
            not supplied.
        source: Source name of the file.
        path: Path of the file itself.
        folder_path_map: ``(source, folder_path) -> folder_id``.
        remote_source_names: Optional set of names to treat as remote.

    Returns:
        The nearest matching folder id, or ``None`` if no ancestor is known.
    """
    directory = os.path.dirname(path)

    if remote_source_names is None:
        remote = _is_remote_source(conn, source)
    else:
        remote = source in remote_source_names

    while True:
        candidate = f"{source}:{directory}" if remote else directory
        match = folder_path_map.get((source, candidate))
        if match is not None:
            return match

        # Stop once the directory is empty, the root, or a single character;
        # otherwise climb one level and try again.
        if not directory or directory == "/" or len(directory) <= 1:
            return None
        directory = os.path.dirname(directory)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _find_folder_for_path(
    conn: sqlite3.Connection,
    source: str,
    path: str,
    folder_path_map: Optional[Dict[Tuple[str, str], int]] = None,
    remote_source_names: Optional[frozenset] = None,
) -> Optional[int]:
    """Find the folder a file belongs to by matching its path to ``folders``.

    An empty *path* resolves to ``None``.  When *folder_path_map* is given
    the lookup is delegated to ``_find_folder_in_map``; otherwise the
    ``folders`` table is queried, starting with the file's parent directory
    and climbing ancestors until a row matches.  For remote sources the
    queried path is ``"<source>:<dir>"``.

    Returns:
        The nearest matching folder id, or ``None`` if none is found.
    """
    if not path:
        return None

    if folder_path_map is not None:
        return _find_folder_in_map(
            conn,
            source,
            path,
            folder_path_map,
            remote_source_names=remote_source_names,
        )

    remote = _is_remote_source(conn, source)
    cur = conn.cursor()
    directory = os.path.dirname(path)
    lookup_sql = "SELECT id FROM folders WHERE source = ? AND path = ?"

    while True:
        candidate = f"{source}:{directory}" if remote else directory
        hit = cur.execute(lookup_sql, (source, candidate)).fetchone()
        if hit:
            return hit["id"]

        # Stop once the directory is empty, the root, or a single character;
        # otherwise climb one level and query again.
        if not directory or directory == "/" or len(directory) <= 1:
            return None
        directory = os.path.dirname(directory)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def build_project_prefix_map(conn: sqlite3.Connection) -> List[Tuple[str, int]]:
    """Load every project that has a ``root_path``, longest root first.

    Returns:
        List of (root_path, project_id) tuples ordered by descending
        root-path length, so a first-match scan finds the most specific
        project.
    """
    query = """
        SELECT id, root_path FROM projects
        WHERE root_path IS NOT NULL
        ORDER BY LENGTH(root_path) DESC
    """
    records = conn.cursor().execute(query).fetchall()

    prefix_map: List[Tuple[str, int]] = []
    for record in records:
        prefix_map.append((record["root_path"], record["id"]))
    return prefix_map
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def build_folder_maps(
    conn: sqlite3.Connection,
) -> Tuple[Dict[Tuple[str, str], int], Dict[int, int]]:
    """Load the whole ``folders`` table into two lookup dictionaries.

    Returns:
        Tuple of (folder_path_map, folder_project_map):
        ``folder_path_map`` maps ``(source, path)`` to folder id;
        ``folder_project_map`` maps folder id to project id and contains
        only folders that have a project.
    """
    records = conn.cursor().execute(
        "SELECT id, source, path, project_id FROM folders"
    ).fetchall()

    folder_path_map: Dict[Tuple[str, str], int] = {
        (record["source"], record["path"]): record["id"] for record in records
    }
    folder_project_map: Dict[int, int] = {
        record["id"]: record["project_id"]
        for record in records
        if record["project_id"] is not None
    }
    return folder_path_map, folder_project_map
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def insert_file(
    conn: sqlite3.Connection,
    file_data: Dict[str, Any],
    relationship_maps: Optional[Dict[str, Any]] = None,
) -> Optional[Tuple[str, int]]:
    """Insert or update local file with project auto-linking and status assignment.

    The row is always written with ``source = 'local'``.  An INSERT is
    attempted first; if it violates an integrity constraint, the existing
    ``(source='local', path)`` row is updated instead.  This function does
    not commit.

    Args:
        conn: Database connection; rows must be addressable by column name.
        file_data: Scanner record.  Several fields accept two spellings, the
            first taking precedence: ``file_path``/``path``,
            ``file_name``/``name``, ``file_type``/``content_type``,
            ``file_size``/``size_bytes``.  Also read: ``mime_type``,
            ``created_at``, ``modified_at``, ``accessed_at``,
            ``content_preview``, ``sha256_hash``, ``md5_hash``, ``metadata``
            (JSON-encoded before storing; defaults to ``{}``).
        relationship_maps: Optional pre-loaded lookups with keys
            ``project_prefix_map``, ``folder_path_map``,
            ``folder_project_map`` and ``remote_source_names``.  Any missing
            map falls back to querying the database.

    Returns:
        ('inserted', file_id) on new insert or reactivation,
        ('updated', file_id) on update

    On update, an already-assigned ``project_id`` is kept, and
    ``status``/``status_reason``/``status_changed_at`` are only rewritten
    when the stored status is ``'removed'`` or NULL, so a status set on an
    existing row is preserved.  ``mime_type``, ``created_at`` and
    ``metadata`` are not touched by the update.
    """
    cursor = conn.cursor()

    file_path = file_data.get("file_path") or file_data.get("path")

    # Remember the pre-existing row (if any) so the return value can tell an
    # update from an insert/reactivation.
    cursor.execute(
        "SELECT id, status FROM files WHERE source = 'local' AND path = ?",
        (file_path,),
    )
    existing = cursor.fetchone()

    maps = relationship_maps or {}
    proj_map = maps.get("project_prefix_map")
    fpath_map = maps.get("folder_path_map")
    fproj_map = maps.get("folder_project_map")
    remote_names = maps.get("remote_source_names")

    # Project comes from the longest matching root path; failing that, from
    # the folder the file lives in.
    project_id = _find_project_for_path(conn, file_path, project_prefix_map=proj_map)
    folder_id = _find_folder_for_path(
        conn,
        "local",
        file_path,
        folder_path_map=fpath_map,
        remote_source_names=remote_names,
    )
    if project_id is None and folder_id is not None:
        project_id = _get_folder_project_id(conn, folder_id, folder_project_map=fproj_map)

    name = file_data.get("file_name") or file_data.get("name")
    content_type = file_data.get("file_type") or file_data.get("content_type")

    # Use an explicit None check: a size of 0 (empty file) is a real value
    # and must not be discarded the way ``a or b`` would.
    size_bytes = file_data.get("file_size")
    if size_bytes is None:
        size_bytes = file_data.get("size_bytes")

    status, status_reason = _determine_file_status(name, file_path)

    try:
        cursor.execute(
            """
            INSERT INTO files (
                source, name, path, content_type, mime_type, size_bytes,
                created_at, modified_at, accessed_at,
                indexed_at, updated_at,
                content_preview, sha256_hash, md5_hash, project_id, folder_id, metadata,
                status, status_reason, status_changed_at
            ) VALUES ('local', ?, ?, ?, ?, ?, ?, ?, ?,
                      CURRENT_TIMESTAMP, CURRENT_TIMESTAMP,
                      ?, ?, ?, ?, ?, ?,
                      ?, ?, CURRENT_TIMESTAMP)
            """,
            (
                name,
                file_path,
                content_type,
                file_data.get("mime_type"),
                size_bytes,
                file_data.get("created_at"),
                file_data.get("modified_at"),
                file_data.get("accessed_at"),
                file_data.get("content_preview"),
                file_data.get("sha256_hash"),
                file_data.get("md5_hash"),
                project_id,
                folder_id,
                json.dumps(file_data.get("metadata", {})),
                status,
                status_reason,
            ),
        )
    except sqlite3.IntegrityError:
        # The row already exists: refresh it in place.  SQLite evaluates every
        # SET expression before assigning, so each CASE below sees the row's
        # previous status.
        cursor.execute(
            """
            UPDATE files SET
                name = ?,
                content_type = ?,
                size_bytes = ?,
                modified_at = ?,
                accessed_at = ?,
                updated_at = CURRENT_TIMESTAMP,
                content_preview = ?,
                sha256_hash = ?,
                md5_hash = ?,
                project_id = CASE WHEN project_id IS NULL THEN ? ELSE project_id END,
                folder_id = ?,
                status = CASE
                    WHEN status = 'removed' THEN ?
                    WHEN status IS NULL THEN ?
                    ELSE status
                END,
                status_reason = CASE
                    WHEN status = 'removed' THEN ?
                    WHEN status IS NULL THEN ?
                    ELSE status_reason
                END,
                status_changed_at = CASE
                    WHEN status = 'removed' OR status IS NULL THEN CURRENT_TIMESTAMP
                    ELSE status_changed_at
                END
            WHERE source = 'local' AND path = ?
            """,
            (
                name,
                content_type,
                size_bytes,
                file_data.get("modified_at"),
                file_data.get("accessed_at"),
                file_data.get("content_preview"),
                file_data.get("sha256_hash"),
                file_data.get("md5_hash"),
                project_id,
                folder_id,
                status,
                status,
                status_reason,
                status_reason,
                file_path,
            ),
        )
        if existing:
            # A previously removed file coming back counts as an insert.
            action = "updated" if existing["status"] != "removed" else "inserted"
            return (action, existing["id"])
        # NOTE(review): reaching here means the INSERT conflicted although no
        # matching row was seen above; lastrowid may not identify the row in
        # that case — confirm whether this path can occur.
    return ("inserted", cursor.lastrowid)
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def insert_drive_file(
    conn: sqlite3.Connection,
    data: Dict[str, Any],
    relationship_maps: Optional[Dict[str, Any]] = None,
) -> Optional[int]:
    """Insert or update a Drive file with folder auto-linking.

    An existing row is identified by ``(source, external_id, account)``.
    When found it is updated in place (its ``status`` is left untouched and
    an already-assigned ``project_id`` is kept); otherwise a new row is
    inserted with status ``'active'``.  This function does not commit.

    Args:
        conn: Database connection; rows must be addressable by column name.
        data: File record.  Required keys: ``source``, ``external_id``,
            ``account``, ``name``, ``path``.  Optional keys:
            ``content_type``, ``mime_type``, ``size_bytes``, ``created_at``,
            ``modified_at``, ``md5_hash``, ``metadata``.
        relationship_maps: Optional pre-loaded lookups with keys
            ``folder_path_map``, ``folder_project_map`` and
            ``remote_source_names``.  Any missing map falls back to querying
            the database.

    Returns:
        File ID on success

    Raises:
        KeyError: if a required key is missing from *data*.

    ``metadata`` given as a dict or list is JSON-encoded before storing,
    matching ``insert_file``; such values cannot be bound to a SQLite
    parameter directly.  Strings and ``None`` are stored unchanged.
    """
    cursor = conn.cursor()

    maps = relationship_maps or {}
    fpath_map = maps.get("folder_path_map")
    fproj_map = maps.get("folder_project_map")
    remote_names = maps.get("remote_source_names")

    # The project is inherited from whichever folder the file resolves to.
    folder_id = _find_folder_for_path(
        conn,
        data["source"],
        data["path"],
        folder_path_map=fpath_map,
        remote_source_names=remote_names,
    )
    project_id = _get_folder_project_id(conn, folder_id, folder_project_map=fproj_map) if folder_id else None

    metadata = data.get("metadata")
    if isinstance(metadata, (dict, list)):
        metadata = json.dumps(metadata)

    cursor.execute(
        "SELECT id, status FROM files WHERE source = ? AND external_id = ? AND account = ?",
        (data["source"], data["external_id"], data["account"]),
    )
    existing = cursor.fetchone()

    if existing:
        cursor.execute(
            """
            UPDATE files SET
                name = ?,
                path = ?,
                content_type = ?,
                mime_type = ?,
                size_bytes = ?,
                created_at = ?,
                modified_at = ?,
                md5_hash = ?,
                metadata = ?,
                folder_id = ?,
                project_id = CASE WHEN project_id IS NULL THEN ? ELSE project_id END,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = ?
            """,
            (
                data["name"],
                data["path"],
                data.get("content_type"),
                data.get("mime_type"),
                data.get("size_bytes"),
                data.get("created_at"),
                data.get("modified_at"),
                data.get("md5_hash"),
                metadata,
                folder_id,
                project_id,
                existing["id"],
            ),
        )
        return existing["id"]

    cursor.execute(
        """
        INSERT INTO files (
            source, external_id, account, name, path,
            content_type, mime_type, size_bytes,
            created_at, modified_at, md5_hash, metadata,
            folder_id, project_id, indexed_at, updated_at, status
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 'active')
        """,
        (
            data["source"],
            data["external_id"],
            data["account"],
            data["name"],
            data["path"],
            data.get("content_type"),
            data.get("mime_type"),
            data.get("size_bytes"),
            data.get("created_at"),
            data.get("modified_at"),
            data.get("md5_hash"),
            metadata,
            folder_id,
            project_id,
        ),
    )
    return cursor.lastrowid
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
def mark_removed_files(conn: sqlite3.Connection, indexed_paths: set) -> List[int]:
    """Flag local files that were not seen in the latest scan as 'removed'.

    Every non-removed ``source = 'local'`` row whose path is absent from
    *indexed_paths* gets status ``'removed'`` with reason ``'file_deleted'``,
    and its vectorization bookkeeping (``vectorized_at``,
    ``vectorized_chunks``) is reset.  An empty *indexed_paths* is a no-op,
    so nothing is marked removed.  The change is committed only when at
    least one row was updated.

    Returns:
        List of file IDs that were marked as removed
    """
    if not indexed_paths:
        return []

    cursor = conn.cursor()
    cursor.execute("SELECT id, path FROM files WHERE source = 'local' AND status != 'removed'")
    removed_ids = [
        record["id"]
        for record in cursor.fetchall()
        if record["path"] not in indexed_paths
    ]

    if not removed_ids:
        return removed_ids

    # Update in batches of 500 ids to keep the IN (...) placeholder list bounded.
    batch_size = 500
    for start in range(0, len(removed_ids), batch_size):
        chunk = removed_ids[start : start + batch_size]
        marks = ",".join("?" * len(chunk))
        cursor.execute(
            f"""
            UPDATE files
            SET status = 'removed',
                status_reason = 'file_deleted',
                status_changed_at = CURRENT_TIMESTAMP,
                vectorized_at = NULL,
                vectorized_chunks = 0
            WHERE id IN ({marks})
            """,
            chunk,
        )
    conn.commit()

    return removed_ids
|