footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +444 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/cli/__init__.py +128 -0
  19. footprinter/cli/__main__.py +6 -0
  20. footprinter/cli/_common.py +332 -0
  21. footprinter/cli/_policy_helpers.py +646 -0
  22. footprinter/cli/_prompt.py +220 -0
  23. footprinter/cli/api_cmd.py +32 -0
  24. footprinter/cli/connect.py +591 -0
  25. footprinter/cli/data.py +879 -0
  26. footprinter/cli/delete.py +128 -0
  27. footprinter/cli/ingest.py +579 -0
  28. footprinter/cli/mcp_cmd.py +750 -0
  29. footprinter/cli/mcp_setup.py +306 -0
  30. footprinter/cli/search.py +393 -0
  31. footprinter/cli/search_cmd.py +69 -0
  32. footprinter/cli/setup.py +1836 -0
  33. footprinter/cli/status.py +729 -0
  34. footprinter/cli/status_cmd.py +104 -0
  35. footprinter/cli/upsert.py +794 -0
  36. footprinter/cli/vectorize_cmd.py +215 -0
  37. footprinter/cli/view.py +322 -0
  38. footprinter/connectors/__init__.py +171 -0
  39. footprinter/connectors/config_utils.py +141 -0
  40. footprinter/db/__init__.py +37 -0
  41. footprinter/db/browser.py +198 -0
  42. footprinter/db/chats.py +610 -0
  43. footprinter/db/clients.py +307 -0
  44. footprinter/db/emails.py +279 -0
  45. footprinter/db/files.py +741 -0
  46. footprinter/db/folders.py +659 -0
  47. footprinter/db/messages.py +192 -0
  48. footprinter/db/policies.py +151 -0
  49. footprinter/db/projects.py +673 -0
  50. footprinter/db/search.py +573 -0
  51. footprinter/db/sql_utils.py +168 -0
  52. footprinter/db/status.py +320 -0
  53. footprinter/db/uploads.py +70 -0
  54. footprinter/ingest/__init__.py +0 -0
  55. footprinter/ingest/adapters/__init__.py +33 -0
  56. footprinter/ingest/adapters/browser.py +54 -0
  57. footprinter/ingest/adapters/chat.py +57 -0
  58. footprinter/ingest/adapters/ingest.py +146 -0
  59. footprinter/ingest/adapters/local_files.py +68 -0
  60. footprinter/ingest/adapters/local_folders.py +52 -0
  61. footprinter/ingest/adapters/protocol.py +174 -0
  62. footprinter/ingest/browser_indexer.py +216 -0
  63. footprinter/ingest/chat_dedup.py +156 -0
  64. footprinter/ingest/chat_indexer.py +515 -0
  65. footprinter/ingest/chat_parsers/__init__.py +8 -0
  66. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  67. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  68. footprinter/ingest/cli.py +827 -0
  69. footprinter/ingest/content_extractors.py +117 -0
  70. footprinter/ingest/database.py +36 -0
  71. footprinter/ingest/db/__init__.py +1 -0
  72. footprinter/ingest/db/connector_schema.py +47 -0
  73. footprinter/ingest/db/migration.py +328 -0
  74. footprinter/ingest/db/schema.py +1043 -0
  75. footprinter/ingest/db/security.py +6 -0
  76. footprinter/ingest/file_indexer.py +261 -0
  77. footprinter/ingest/file_scanner.py +277 -0
  78. footprinter/ingest/folder_indexer.py +226 -0
  79. footprinter/ingest/full_content_extractor.py +321 -0
  80. footprinter/ingest/orchestrator.py +125 -0
  81. footprinter/ingest/pipe_runner.py +217 -0
  82. footprinter/ingest/processing.py +165 -0
  83. footprinter/ingest/registry.py +201 -0
  84. footprinter/ingest/run_record.py +91 -0
  85. footprinter/ingest/status.py +346 -0
  86. footprinter/mcp/__init__.py +0 -0
  87. footprinter/mcp/__main__.py +5 -0
  88. footprinter/mcp/db.py +57 -0
  89. footprinter/mcp/errors.py +102 -0
  90. footprinter/mcp/extraction.py +226 -0
  91. footprinter/mcp/server.py +39 -0
  92. footprinter/mcp/tools/__init__.py +0 -0
  93. footprinter/mcp/tools/navigation.py +70 -0
  94. footprinter/mcp/tools/read.py +75 -0
  95. footprinter/mcp/tools/search.py +158 -0
  96. footprinter/mcp/tools/semantic.py +79 -0
  97. footprinter/mcp/tools/status.py +15 -0
  98. footprinter/paths.py +91 -0
  99. footprinter/permissions.py +1160 -0
  100. footprinter/semantic/__init__.py +13 -0
  101. footprinter/semantic/chunking.py +52 -0
  102. footprinter/semantic/embeddings.py +23 -0
  103. footprinter/semantic/hybrid_search.py +273 -0
  104. footprinter/semantic/vector_store.py +471 -0
  105. footprinter/services/__init__.py +49 -0
  106. footprinter/services/access_service.py +342 -0
  107. footprinter/services/chat_service.py +85 -0
  108. footprinter/services/client_service.py +267 -0
  109. footprinter/services/content_service.py +181 -0
  110. footprinter/services/email_service.py +89 -0
  111. footprinter/services/file_service.py +83 -0
  112. footprinter/services/folder_service.py +122 -0
  113. footprinter/services/includes.py +19 -0
  114. footprinter/services/ingest_service.py +231 -0
  115. footprinter/services/project_service.py +262 -0
  116. footprinter/services/roles.py +25 -0
  117. footprinter/services/search_service.py +177 -0
  118. footprinter/services/semantic_service.py +360 -0
  119. footprinter/services/status_service.py +18 -0
  120. footprinter/services/visit_service.py +65 -0
  121. footprinter/source_registry.py +194 -0
  122. footprinter/utils/__init__.py +7 -0
  123. footprinter/utils/hash_utils.py +59 -0
  124. footprinter/utils/logging_config.py +68 -0
  125. footprinter/utils/mime.py +30 -0
  126. footprinter/utils/text.py +6 -0
  127. footprinter/utils/time.py +11 -0
  128. footprinter/visibility.py +1272 -0
  129. footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
  130. footprinter_cli-1.0.0.dist-info/METADATA +229 -0
  131. footprinter_cli-1.0.0.dist-info/RECORD +134 -0
  132. footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
  133. footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
  134. footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,741 @@
1
+ """File queries and write operations.
2
+
3
+ Provides list, detail, status-update, and insert functions for indexed files.
4
+ All functions take a raw ``sqlite3.Connection`` and return plain dicts.
5
+ """
6
+
7
+ import json
8
+ import logging
9
+ import os
10
+ import sqlite3
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ from footprinter.db.sql_utils import build_status_filter, paginate, paginated_response
14
+
15
# Closed set of status values that update_file_status() accepts; any other
# value is rejected there with a ValueError.
VALID_FILE_STATUSES = frozenset({"active", "hidden", "removed"})
16
+
17
+
18
def list_files(
    conn: sqlite3.Connection,
    *,
    project_id: Optional[int] = None,
    source: Optional[list[str]] = None,
    status: Optional[str | list[str]] = None,
    content_type: Optional[str] = None,
    limit: int = 50,
    page: int = 1,
) -> dict:
    """Return one page of indexed files, narrowed by the given filters.

    Parameters
    ----------
    conn : sqlite3.Connection
        Connection whose rows support access by column name.
    project_id : int, optional
        Restrict the listing to a single project.
    source : list[str], optional
        Restrict the listing to these source names
        (e.g. ``["local"]``, ``["workdrive"]``).
    status : str, list[str], or None
        ``None`` → exclude removed (default).
        ``"all"`` → no status filter.
        Single string → exact match (``"active"``, ``"hidden"``, ``"removed"``).
        List of strings → ``WHERE status IN (...)``.
    content_type : str, optional
        Exact match on ``files.content_type``.
    limit, page : int
        Pagination controls, forwarded to ``paginate``.

    Returns
    -------
    dict
        ``{"files": [...], "pagination": {page, limit, total, total_pages}}``
    """
    select_sql = """
        SELECT file.id, file.name, file.path, file.source, file.status, file.content_type,
               file.size_bytes, file.modified_at, project.project_name,
               file.mcp_view, file.mcp_read
        FROM files file
        LEFT JOIN projects project ON file.project_id = project.id
    """

    # The status clause always comes first; the remaining filters are
    # appended in a fixed order so placeholders and arguments stay aligned.
    status_clauses, status_args = build_status_filter(
        status,
        column="file.status",
        default_exclude=["removed"],
    )
    clauses: list[str] = list(status_clauses)
    args: list = list(status_args)

    if project_id is not None:
        clauses.append("file.project_id = ?")
        args.append(project_id)

    if source:
        marks = ",".join("?" for _ in source)
        clauses.append(f"file.source IN ({marks})")
        args.extend(source)

    if content_type:
        clauses.append("file.content_type = ?")
        args.append(content_type)

    filtered_sql = select_sql
    if clauses:
        filtered_sql += " WHERE " + " AND ".join(clauses)

    rows, pagination = paginate(
        conn,
        f"SELECT COUNT(*) FROM ({filtered_sql}) _c",
        filtered_sql + " ORDER BY file.id LIMIT ? OFFSET ?",
        args,
        page=page,
        limit=limit,
    )

    files = []
    for record in rows:
        files.append(
            {
                "id": record["id"],
                "name": record["name"],
                "path": record["path"],
                "source": record["source"],
                "status": record["status"],
                # NULL content type / project name are surfaced as empty strings.
                "content_type": record["content_type"] or "",
                "size_bytes": record["size_bytes"],
                "modified_at": record["modified_at"],
                "project_name": record["project_name"] or "",
                "mcp_view": record["mcp_view"],
                "mcp_read": record["mcp_read"],
            }
        )

    return paginated_response("files", files, pagination)
109
+
110
+
111
def get_file(
    conn: sqlite3.Connection,
    file_id: int,
) -> Optional[dict]:
    """Fetch the detail record for one file.

    The ``projects`` table is left-joined so the result carries
    ``project_name`` (``None`` when the file has no project).
    Returns ``None`` when no file has the given id.
    """
    detail_sql = """
        SELECT file.id, file.name, file.path, file.source, file.status, file.status_reason,
               file.content_type, file.mime_type, file.size_bytes, file.created_at,
               file.modified_at, file.indexed_at, file.project_id, file.md5_hash,
               file.external_id, file.account,
               file.mcp_view, file.mcp_read,
               project.project_name
        FROM files file
        LEFT JOIN projects project ON file.project_id = project.id
        WHERE file.id = ?
    """
    record = conn.execute(detail_sql, (file_id,)).fetchone()
    if record is None:
        return None

    # Columns copied through unchanged, in the order they appear in the result.
    passthrough = (
        "id",
        "name",
        "path",
        "source",
        "status",
        "status_reason",
        "content_type",
        "mime_type",
        "size_bytes",
        "created_at",
        "modified_at",
        "indexed_at",
        "project_id",
        "md5_hash",
        "external_id",
        "account",
        "project_name",
    )
    detail = {column: record[column] for column in passthrough}

    # An empty/NULL MCP flag is reported as "inherit".
    detail["mcp_view"] = record["mcp_view"] or "inherit"
    detail["mcp_read"] = record["mcp_read"] or "inherit"
    return detail
158
+
159
+
160
def update_file_status(
    conn: sqlite3.Connection,
    file_id: int,
    status: str,
    reason: Optional[str] = None,
) -> Optional[bool]:
    """Set the status (and optional reason) of a single file.

    The status is validated before the database is touched. The change
    is committed on success and ``status_changed_at`` is stamped with the
    current timestamp.

    Returns True on success, None if no file has the given id.
    Raises ValueError when *status* is not one of ``VALID_FILE_STATUSES``.
    """
    if status not in VALID_FILE_STATUSES:
        raise ValueError(f"Invalid status '{status}'. Must be one of: {', '.join(sorted(VALID_FILE_STATUSES))}")

    # Look the row up first so a miss returns without issuing any write.
    found = conn.execute("SELECT id FROM files WHERE id = ?", (file_id,)).fetchone()
    if found is None:
        return None

    update_sql = """
        UPDATE files
        SET status = ?, status_reason = ?, status_changed_at = CURRENT_TIMESTAMP
        WHERE id = ?
    """
    conn.execute(update_sql, (status, reason, file_id))
    conn.commit()
    return True
188
+
189
+
190
def update_file_relationships(
    conn: sqlite3.Connection,
    file_id: int,
    *,
    project_id: Optional[int] = None,
    client_id: Optional[int] = None,
) -> Optional[bool]:
    """Assign or clear the project and/or client linked to a file.

    An argument left as ``None`` is not touched; ``0`` clears the column
    (sets it to NULL); any other value must reference an existing row.
    The update also stamps ``assignment_source = 'user'``; when the column
    is missing the same update is retried without the stamp.

    Returns True on success (including the no-op case where neither
    argument was given), None if the file does not exist.
    Raises ValueError when the referenced project or client does not exist.
    """
    if conn.execute("SELECT id FROM files WHERE id = ?", (file_id,)).fetchone() is None:
        return None

    wants_project = project_id is not None
    wants_client = client_id is not None

    # Validate references up front so nothing is written on bad input.
    if wants_project and project_id != 0:
        if not conn.execute("SELECT id FROM projects WHERE id = ?", (project_id,)).fetchone():
            raise ValueError(f"No project with id {project_id}")
    if wants_client and client_id != 0:
        if not conn.execute("SELECT id FROM clients WHERE id = ?", (client_id,)).fetchone():
            raise ValueError(f"No client with id {client_id}")

    assignments: list[str] = []
    bound: list = []
    if wants_project:
        if project_id == 0:
            assignments.append("project_id = NULL")
        else:
            assignments.append("project_id = ?")
            bound.append(project_id)
    if wants_client:
        if client_id == 0:
            assignments.append("client_id = NULL")
        else:
            assignments.append("client_id = ?")
            bound.append(client_id)

    if not assignments:
        return True

    stamped = assignments + ["assignment_source = 'user'"]
    bound.append(file_id)
    try:
        conn.execute(f"UPDATE files SET {', '.join(stamped)} WHERE id = ?", bound)
    except sqlite3.OperationalError as exc:
        if "no such column" not in str(exc):
            raise
        # assignment_source not present (tool-only DB): retry without the stamp.
        conn.execute(f"UPDATE files SET {', '.join(assignments)} WHERE id = ?", bound)
    conn.commit()
    return True
247
+
248
+
249
def list_file_ids_under_path(
    conn: sqlite3.Connection,
    folder_path: str,
) -> List[int]:
    """Return IDs of all non-removed files whose path is under *folder_path*.

    Trailing ``/`` separators on *folder_path* are ignored, so ``"/a/b"``
    and ``"/a/b/"`` select the same files. ``%``, ``_`` and ``\\`` in the
    folder path are escaped and therefore matched literally.
    """
    # Normalise the separator first: without this a trailing slash produced
    # the pattern ".../b//%", which matches no stored path.
    base = folder_path.rstrip("/")
    escaped = base.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    # NOTE(review): SQLite's LIKE is case-insensitive for ASCII by default,
    # so folders differing only in case are not told apart here — confirm
    # whether that is intended.
    cursor = conn.execute(
        "SELECT id FROM files WHERE path LIKE ? ESCAPE '\\' AND status != 'removed'",
        (escaped + "/%",),
    )
    return [row["id"] for row in cursor.fetchall()]
260
+
261
+
262
+ # ---------------------------------------------------------------------------
263
+ # Write operations
264
+ # ---------------------------------------------------------------------------
265
+
266
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
267
+
268
+
269
def _determine_file_status(name: str, path: str) -> Tuple[str, Optional[str]]:
    """Determine the initial status for a file based on its name and path.

    A file is ``hidden`` when its own name starts with a dot
    (reason ``dot_file``) or when any ``/``-separated component of its path
    starts with a dot (reason ``in_dot_folder``). The relative-path markers
    ``.`` and ``..`` are not dot-folders and never hide a file.

    Returns:
        Tuple of (status, status_reason); the reason is None for ``active``.
    """
    # Guard against a missing name so the path check can still run.
    if name and name.startswith("."):
        return "hidden", "dot_file"

    for part in (path or "").split("/"):
        # "." and ".." are navigation markers, not hidden directories.
        if part.startswith(".") and part not in (".", ".."):
            return "hidden", "in_dot_folder"

    return "active", None
284
+
285
+
286
def _is_within_root(file_path: str, root_path: str) -> bool:
    """Return True when *file_path* is *root_path* itself or lies beneath it."""
    if not root_path:
        # An empty root matched every path under plain prefix matching;
        # keep that behaviour.
        return True
    if file_path == root_path:
        return True
    # Compare on a directory boundary so "/work/app" does not claim
    # "/work/app2/...". A root of "/" reduces to the prefix "/".
    return file_path.startswith(root_path.rstrip("/") + "/")


def _find_project_for_path(
    conn: sqlite3.Connection,
    file_path: str,
    project_prefix_map: Optional[List[Tuple[str, int]]] = None,
) -> Optional[int]:
    """Find the most specific project whose root_path contains *file_path*.

    When *project_prefix_map* is given it is scanned in order (callers are
    expected to pass it longest-root-first) and the database is not
    touched. Otherwise the projects table is read, longest root first.
    Both branches use the same case-sensitive, directory-boundary-aware
    comparison, so they always agree on the result.

    Returns:
        The matching project id, or None when no root contains the path
        (or *file_path* is empty).
    """
    if not file_path:
        return None

    if project_prefix_map is not None:
        candidates = project_prefix_map
    else:
        rows = conn.execute(
            """
            SELECT id, root_path FROM projects
            WHERE root_path IS NOT NULL
            ORDER BY LENGTH(root_path) DESC
            """
        ).fetchall()
        candidates = [(row["root_path"], row["id"]) for row in rows]

    for root_path, project_id in candidates:
        if _is_within_root(file_path, root_path):
            return project_id
    return None
311
+
312
+
313
def _get_folder_project_id(
    conn: sqlite3.Connection,
    folder_id: int,
    folder_project_map: Optional[Dict[int, int]] = None,
) -> Optional[int]:
    """Return the project_id attached to a folder, or None.

    A supplied *folder_project_map* is authoritative and the database is
    not queried; without one the ``folders`` row is read directly.
    """
    if folder_project_map is None:
        found = conn.cursor().execute(
            "SELECT project_id FROM folders WHERE id = ?",
            (folder_id,),
        ).fetchone()
        if not found:
            return None
        return found["project_id"]

    return folder_project_map.get(folder_id)
326
+
327
+
328
def _is_remote_source(conn: sqlite3.Connection, source: str) -> bool:
    """Report whether *source* is registered in ``sources`` with type ``remote``.

    An unknown source name is treated as not remote.
    """
    match = conn.execute("SELECT source_type FROM sources WHERE name = ?", (source,)).fetchone()
    if match is None:
        return False
    return match["source_type"] == "remote"
332
+
333
+
334
def _find_folder_in_map(
    conn: sqlite3.Connection,
    source: str,
    path: str,
    folder_path_map: Dict[Tuple[str, str], int],
    remote_source_names: Optional[frozenset] = None,
) -> Optional[int]:
    """Resolve a file's folder_id from the in-memory map, walking up ancestors.

    The file's parent directory is tried first, then each ancestor in
    turn; the first key present in *folder_path_map* wins. For remote
    sources the key path is ``"<source>:<directory>"``, otherwise it is
    the bare directory. Remoteness comes from *remote_source_names* when
    supplied, else from the ``sources`` table.
    """
    directory = os.path.dirname(path)

    if remote_source_names is None:
        remote = _is_remote_source(conn, source)
    else:
        remote = source in remote_source_names
    key_prefix = f"{source}:" if remote else ""

    while True:
        hit = folder_path_map.get((source, key_prefix + directory))
        if hit is not None:
            return hit
        # Stop once the walk has reached the root or run out of components.
        if not directory or directory == "/" or len(directory) <= 1:
            return None
        directory = os.path.dirname(directory)
370
+
371
+
372
def _find_folder_for_path(
    conn: sqlite3.Connection,
    source: str,
    path: str,
    folder_path_map: Optional[Dict[Tuple[str, str], int]] = None,
    remote_source_names: Optional[frozenset] = None,
) -> Optional[int]:
    """Find the folder_id for a file by matching its directory to ``folders``.

    An empty *path* yields None. With a *folder_path_map* the lookup is
    delegated to the in-memory resolver. Otherwise the ``folders`` table
    is queried for the file's parent directory and then each ancestor in
    turn, returning the first match. Remote sources are keyed as
    ``"<source>:<directory>"``.
    """
    if not path:
        return None

    if folder_path_map is not None:
        return _find_folder_in_map(
            conn,
            source,
            path,
            folder_path_map,
            remote_source_names=remote_source_names,
        )

    key_prefix = f"{source}:" if _is_remote_source(conn, source) else ""
    cur = conn.cursor()
    directory = os.path.dirname(path)

    while True:
        cur.execute(
            "SELECT id FROM folders WHERE source = ? AND path = ?",
            (source, key_prefix + directory),
        )
        hit = cur.fetchone()
        if hit:
            return hit["id"]
        # Stop once the walk has reached the root or run out of components.
        if not directory or directory == "/" or len(directory) <= 1:
            return None
        directory = os.path.dirname(directory)
425
+
426
+
427
def build_project_prefix_map(conn: sqlite3.Connection) -> List[Tuple[str, int]]:
    """Load every project root for in-memory prefix matching.

    Projects without a root_path are skipped.

    Returns:
        List of (root_path, project_id) tuples, longest path first.
    """
    query = """
        SELECT id, root_path FROM projects
        WHERE root_path IS NOT NULL
        ORDER BY LENGTH(root_path) DESC
    """
    prefixes: List[Tuple[str, int]] = []
    for project in conn.cursor().execute(query).fetchall():
        prefixes.append((project["root_path"], project["id"]))
    return prefixes
442
+
443
+
444
def build_folder_maps(
    conn: sqlite3.Connection,
) -> Tuple[Dict[Tuple[str, str], int], Dict[int, int]]:
    """Load the folder lookup maps used for bulk relationship resolution.

    Returns:
        Tuple of (folder_path_map, folder_project_map): the first maps
        ``(source, path)`` to folder id; the second maps folder id to
        project id and only contains folders that have a project.
    """
    folders = conn.cursor().execute("SELECT id, source, path, project_id FROM folders").fetchall()

    path_map: Dict[Tuple[str, str], int] = {
        (folder["source"], folder["path"]): folder["id"] for folder in folders
    }
    project_map: Dict[int, int] = {
        folder["id"]: folder["project_id"] for folder in folders if folder["project_id"] is not None
    }
    return path_map, project_map
461
+
462
+
463
def insert_file(
    conn: sqlite3.Connection,
    file_data: Dict[str, Any],
    relationship_maps: Optional[Dict[str, Any]] = None,
) -> Optional[Tuple[str, int]]:
    """Insert or update local file with project auto-linking and status assignment.

    *file_data* may use either the scanner key names (``file_path``,
    ``file_name``, ``file_type``, ``file_size``) or the column names
    (``path``, ``name``, ``content_type``, ``size_bytes``). A size of ``0``
    is a real value and is stored as such. *relationship_maps* may carry
    ``project_prefix_map``, ``folder_path_map``, ``folder_project_map`` and
    ``remote_source_names`` to avoid per-file lookups. This function does
    not commit.

    Returns:
        ('inserted', file_id) on new insert or reactivation,
        ('updated', file_id) on content/metadata change,
        ('unchanged', file_id) when the existing active row's sha256 and size match
        the incoming payload (no SQL UPDATE is issued)
    """
    cursor = conn.cursor()

    file_path = file_data.get("file_path") or file_data.get("path")

    # Resolve the size with an explicit None check: ``a or b`` would treat
    # the legitimate size 0 of an empty file as "missing".
    size_bytes = file_data.get("file_size")
    if size_bytes is None:
        size_bytes = file_data.get("size_bytes")

    cursor.execute(
        "SELECT id, status, sha256_hash, size_bytes, project_id FROM files WHERE source = 'local' AND path = ?",
        (file_path,),
    )
    existing = cursor.fetchone()

    # Fast path: unchanged active row → skip project/folder resolution and the UPDATE.
    # Requires a non-None sha256 on both sides so missing hashes never short-circuit,
    # and a non-NULL project_id so we don't strand files waiting on late-binding
    # project detection (the UPDATE's CASE WHEN project_id IS NULL THEN ? backfill path).
    if existing is not None and existing["status"] != "removed" and existing["project_id"] is not None:
        incoming_sha = file_data.get("sha256_hash")
        if (
            incoming_sha is not None
            and existing["sha256_hash"] is not None
            and incoming_sha == existing["sha256_hash"]
            and size_bytes == existing["size_bytes"]
        ):
            return ("unchanged", existing["id"])

    proj_map = relationship_maps.get("project_prefix_map") if relationship_maps else None
    fpath_map = relationship_maps.get("folder_path_map") if relationship_maps else None
    fproj_map = relationship_maps.get("folder_project_map") if relationship_maps else None
    dsn = relationship_maps.get("remote_source_names") if relationship_maps else None

    # A project root match wins; the containing folder's project is the fallback.
    project_id = _find_project_for_path(conn, file_path, project_prefix_map=proj_map)
    folder_id = _find_folder_for_path(
        conn,
        "local",
        file_path,
        folder_path_map=fpath_map,
        remote_source_names=dsn,
    )
    if project_id is None and folder_id is not None:
        project_id = _get_folder_project_id(conn, folder_id, folder_project_map=fproj_map)

    name = file_data.get("file_name") or file_data.get("name")
    content_type = file_data.get("file_type") or file_data.get("content_type")

    status, status_reason = _determine_file_status(name, file_path)

    try:
        cursor.execute(
            """
            INSERT INTO files (
                source, name, path, content_type, mime_type, size_bytes,
                created_at, modified_at, accessed_at,
                indexed_at, updated_at,
                content_preview, sha256_hash, md5_hash, project_id, folder_id, metadata,
                status, status_reason, status_changed_at
            ) VALUES ('local', ?, ?, ?, ?, ?, ?, ?, ?,
                      CURRENT_TIMESTAMP, CURRENT_TIMESTAMP,
                      ?, ?, ?, ?, ?, ?,
                      ?, ?, CURRENT_TIMESTAMP)
            """,
            (
                name,
                file_path,
                content_type,
                file_data.get("mime_type"),
                size_bytes,
                file_data.get("created_at"),
                file_data.get("modified_at"),
                file_data.get("accessed_at"),
                file_data.get("content_preview"),
                file_data.get("sha256_hash"),
                file_data.get("md5_hash"),
                project_id,
                folder_id,
                json.dumps(file_data.get("metadata", {})),
                status,
                status_reason,
            ),
        )
    except sqlite3.IntegrityError:
        # The row already exists: refresh it in place. An existing project
        # assignment is kept, and the status is only recomputed when the row
        # was 'removed' (reactivation) or had no status at all.
        cursor.execute(
            """
            UPDATE files SET
                name = ?,
                content_type = ?,
                size_bytes = ?,
                modified_at = ?,
                accessed_at = ?,
                updated_at = CURRENT_TIMESTAMP,
                content_preview = ?,
                sha256_hash = ?,
                md5_hash = ?,
                project_id = CASE WHEN project_id IS NULL THEN ? ELSE project_id END,
                folder_id = ?,
                status = CASE
                    WHEN status = 'removed' THEN ?
                    WHEN status IS NULL THEN ?
                    ELSE status
                END,
                status_reason = CASE
                    WHEN status = 'removed' THEN ?
                    WHEN status IS NULL THEN ?
                    ELSE status_reason
                END,
                status_changed_at = CASE
                    WHEN status = 'removed' OR status IS NULL THEN CURRENT_TIMESTAMP
                    ELSE status_changed_at
                END
            WHERE source = 'local' AND path = ?
            """,
            (
                name,
                content_type,
                size_bytes,
                file_data.get("modified_at"),
                file_data.get("accessed_at"),
                file_data.get("content_preview"),
                file_data.get("sha256_hash"),
                file_data.get("md5_hash"),
                project_id,
                folder_id,
                status,
                status,
                status_reason,
                status_reason,
                file_path,
            ),
        )
        if existing:
            # Bringing back a previously removed file counts as an insert.
            action = "updated" if existing["status"] != "removed" else "inserted"
            return (action, existing["id"])
    return ("inserted", cursor.lastrowid)
609
+
610
+
611
def insert_drive_file(
    conn: sqlite3.Connection,
    data: Dict[str, Any],
    relationship_maps: Optional[Dict[str, Any]] = None,
) -> Optional[int]:
    """Insert or update a Drive file with folder auto-linking.

    The row is identified by ``(source, external_id, account)``. *data*
    must contain ``source``, ``external_id``, ``account``, ``name`` and
    ``path``; the remaining fields are optional. ``metadata`` may be a
    pre-serialised string or a dict/list, which is stored as JSON text.
    On update an existing project assignment is kept and the status is
    left untouched; new rows start as ``active``. This function does not
    commit.

    Returns:
        File ID on success
    """
    cursor = conn.cursor()

    fpath_map = relationship_maps.get("folder_path_map") if relationship_maps else None
    fproj_map = relationship_maps.get("folder_project_map") if relationship_maps else None
    dsn = relationship_maps.get("remote_source_names") if relationship_maps else None

    folder_id = _find_folder_for_path(
        conn,
        data["source"],
        data["path"],
        folder_path_map=fpath_map,
        remote_source_names=dsn,
    )
    project_id = _get_folder_project_id(conn, folder_id, folder_project_map=fproj_map) if folder_id else None

    # sqlite3 cannot bind dict/list parameters; serialise them the same way
    # insert_file() does. Strings (already-encoded JSON) and None pass through.
    metadata = data.get("metadata")
    if isinstance(metadata, (dict, list)):
        metadata = json.dumps(metadata)

    cursor.execute(
        "SELECT id, status FROM files WHERE source = ? AND external_id = ? AND account = ?",
        (data["source"], data["external_id"], data["account"]),
    )
    existing = cursor.fetchone()

    if existing:
        cursor.execute(
            """
            UPDATE files SET
                name = ?,
                path = ?,
                content_type = ?,
                mime_type = ?,
                size_bytes = ?,
                created_at = ?,
                modified_at = ?,
                md5_hash = ?,
                metadata = ?,
                folder_id = ?,
                project_id = CASE WHEN project_id IS NULL THEN ? ELSE project_id END,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = ?
            """,
            (
                data["name"],
                data["path"],
                data.get("content_type"),
                data.get("mime_type"),
                data.get("size_bytes"),
                data.get("created_at"),
                data.get("modified_at"),
                data.get("md5_hash"),
                metadata,
                folder_id,
                project_id,
                existing["id"],
            ),
        )
        return existing["id"]
    else:
        cursor.execute(
            """
            INSERT INTO files (
                source, external_id, account, name, path,
                content_type, mime_type, size_bytes,
                created_at, modified_at, md5_hash, metadata,
                folder_id, project_id, indexed_at, updated_at, status
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 'active')
            """,
            (
                data["source"],
                data["external_id"],
                data["account"],
                data["name"],
                data["path"],
                data.get("content_type"),
                data.get("mime_type"),
                data.get("size_bytes"),
                data.get("created_at"),
                data.get("modified_at"),
                data.get("md5_hash"),
                metadata,
                folder_id,
                project_id,
            ),
        )
        return cursor.lastrowid
704
+
705
+
706
def mark_removed_files(conn: sqlite3.Connection, indexed_paths: set) -> List[int]:
    """Mark local files as 'removed' when their path is absent from *indexed_paths*.

    An empty *indexed_paths* is a no-op. Affected rows get
    ``status_reason = 'file_deleted'`` and their vectorization fields are
    reset; the change is committed only when at least one row was marked.

    Returns:
        List of file IDs that were marked as removed
    """
    if not indexed_paths:
        return []

    cursor = conn.cursor()
    cursor.execute("SELECT id, path FROM files WHERE source = 'local' AND status != 'removed'")
    stale_ids = [entry["id"] for entry in cursor.fetchall() if entry["path"] not in indexed_paths]

    if not stale_ids:
        return stale_ids

    # Update in chunks of 500 ids per statement.
    chunk_size = 500
    for start in range(0, len(stale_ids), chunk_size):
        chunk = stale_ids[start : start + chunk_size]
        marks = ",".join("?" * len(chunk))
        cursor.execute(
            f"""
            UPDATE files
            SET status = 'removed',
                status_reason = 'file_deleted',
                status_changed_at = CURRENT_TIMESTAMP,
                vectorized_at = NULL,
                vectorized_chunks = 0
            WHERE id IN ({marks})
            """,
            chunk,
        )
    conn.commit()

    return stale_ids