footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +444 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/cli/__init__.py +128 -0
  19. footprinter/cli/__main__.py +6 -0
  20. footprinter/cli/_common.py +332 -0
  21. footprinter/cli/_policy_helpers.py +646 -0
  22. footprinter/cli/_prompt.py +220 -0
  23. footprinter/cli/api_cmd.py +32 -0
  24. footprinter/cli/connect.py +591 -0
  25. footprinter/cli/data.py +879 -0
  26. footprinter/cli/delete.py +128 -0
  27. footprinter/cli/ingest.py +579 -0
  28. footprinter/cli/mcp_cmd.py +750 -0
  29. footprinter/cli/mcp_setup.py +306 -0
  30. footprinter/cli/search.py +393 -0
  31. footprinter/cli/search_cmd.py +69 -0
  32. footprinter/cli/setup.py +1836 -0
  33. footprinter/cli/status.py +729 -0
  34. footprinter/cli/status_cmd.py +104 -0
  35. footprinter/cli/upsert.py +794 -0
  36. footprinter/cli/vectorize_cmd.py +215 -0
  37. footprinter/cli/view.py +322 -0
  38. footprinter/connectors/__init__.py +171 -0
  39. footprinter/connectors/config_utils.py +141 -0
  40. footprinter/db/__init__.py +37 -0
  41. footprinter/db/browser.py +198 -0
  42. footprinter/db/chats.py +610 -0
  43. footprinter/db/clients.py +307 -0
  44. footprinter/db/emails.py +279 -0
  45. footprinter/db/files.py +741 -0
  46. footprinter/db/folders.py +659 -0
  47. footprinter/db/messages.py +192 -0
  48. footprinter/db/policies.py +151 -0
  49. footprinter/db/projects.py +673 -0
  50. footprinter/db/search.py +573 -0
  51. footprinter/db/sql_utils.py +168 -0
  52. footprinter/db/status.py +320 -0
  53. footprinter/db/uploads.py +70 -0
  54. footprinter/ingest/__init__.py +0 -0
  55. footprinter/ingest/adapters/__init__.py +33 -0
  56. footprinter/ingest/adapters/browser.py +54 -0
  57. footprinter/ingest/adapters/chat.py +57 -0
  58. footprinter/ingest/adapters/ingest.py +146 -0
  59. footprinter/ingest/adapters/local_files.py +68 -0
  60. footprinter/ingest/adapters/local_folders.py +52 -0
  61. footprinter/ingest/adapters/protocol.py +174 -0
  62. footprinter/ingest/browser_indexer.py +216 -0
  63. footprinter/ingest/chat_dedup.py +156 -0
  64. footprinter/ingest/chat_indexer.py +515 -0
  65. footprinter/ingest/chat_parsers/__init__.py +8 -0
  66. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  67. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  68. footprinter/ingest/cli.py +827 -0
  69. footprinter/ingest/content_extractors.py +117 -0
  70. footprinter/ingest/database.py +36 -0
  71. footprinter/ingest/db/__init__.py +1 -0
  72. footprinter/ingest/db/connector_schema.py +47 -0
  73. footprinter/ingest/db/migration.py +328 -0
  74. footprinter/ingest/db/schema.py +1043 -0
  75. footprinter/ingest/db/security.py +6 -0
  76. footprinter/ingest/file_indexer.py +261 -0
  77. footprinter/ingest/file_scanner.py +277 -0
  78. footprinter/ingest/folder_indexer.py +226 -0
  79. footprinter/ingest/full_content_extractor.py +321 -0
  80. footprinter/ingest/orchestrator.py +125 -0
  81. footprinter/ingest/pipe_runner.py +217 -0
  82. footprinter/ingest/processing.py +165 -0
  83. footprinter/ingest/registry.py +201 -0
  84. footprinter/ingest/run_record.py +91 -0
  85. footprinter/ingest/status.py +346 -0
  86. footprinter/mcp/__init__.py +0 -0
  87. footprinter/mcp/__main__.py +5 -0
  88. footprinter/mcp/db.py +57 -0
  89. footprinter/mcp/errors.py +102 -0
  90. footprinter/mcp/extraction.py +226 -0
  91. footprinter/mcp/server.py +39 -0
  92. footprinter/mcp/tools/__init__.py +0 -0
  93. footprinter/mcp/tools/navigation.py +70 -0
  94. footprinter/mcp/tools/read.py +75 -0
  95. footprinter/mcp/tools/search.py +158 -0
  96. footprinter/mcp/tools/semantic.py +79 -0
  97. footprinter/mcp/tools/status.py +15 -0
  98. footprinter/paths.py +91 -0
  99. footprinter/permissions.py +1160 -0
  100. footprinter/semantic/__init__.py +13 -0
  101. footprinter/semantic/chunking.py +52 -0
  102. footprinter/semantic/embeddings.py +23 -0
  103. footprinter/semantic/hybrid_search.py +273 -0
  104. footprinter/semantic/vector_store.py +471 -0
  105. footprinter/services/__init__.py +49 -0
  106. footprinter/services/access_service.py +342 -0
  107. footprinter/services/chat_service.py +85 -0
  108. footprinter/services/client_service.py +267 -0
  109. footprinter/services/content_service.py +181 -0
  110. footprinter/services/email_service.py +89 -0
  111. footprinter/services/file_service.py +83 -0
  112. footprinter/services/folder_service.py +122 -0
  113. footprinter/services/includes.py +19 -0
  114. footprinter/services/ingest_service.py +231 -0
  115. footprinter/services/project_service.py +262 -0
  116. footprinter/services/roles.py +25 -0
  117. footprinter/services/search_service.py +177 -0
  118. footprinter/services/semantic_service.py +360 -0
  119. footprinter/services/status_service.py +18 -0
  120. footprinter/services/visit_service.py +65 -0
  121. footprinter/source_registry.py +194 -0
  122. footprinter/utils/__init__.py +7 -0
  123. footprinter/utils/hash_utils.py +59 -0
  124. footprinter/utils/logging_config.py +68 -0
  125. footprinter/utils/mime.py +30 -0
  126. footprinter/utils/text.py +6 -0
  127. footprinter/utils/time.py +11 -0
  128. footprinter/visibility.py +1272 -0
  129. footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
  130. footprinter_cli-1.0.0.dist-info/METADATA +229 -0
  131. footprinter_cli-1.0.0.dist-info/RECORD +134 -0
  132. footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
  133. footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
  134. footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,827 @@
1
+ """CLI utilities — _rebuild_vectors with phase isolation and interrupt safety."""
2
+
3
+ import logging
4
+ import signal
5
+ import sqlite3
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ from footprinter.paths import get_db_path
10
+
11
logger = logging.getLogger(__name__)

# Number of pending vector-store documents that triggers a batch flush in the
# message and chat-info phases.
_BATCH_SIZE = 100

# Graceful shutdown flag — set by SIGINT/SIGTERM handler
_shutdown = False
17
+
18
+
19
def _handle_shutdown(signum, frame):
    """Record a shutdown request so the running phase can stop at a safe point.

    Only flips the module-level ``_shutdown`` flag and logs which signal
    arrived; the phase loops poll that flag between items. ``frame`` is part
    of the signal-handler signature and is not used.
    """
    global _shutdown
    _shutdown = True
    logger.warning(
        "Received %s — finishing current batch...",
        signal.Signals(signum).name,
    )
25
+
26
+
27
def _repair_fts(quiet: bool = False):
    """Rebuild the FTS search indexes by delegating to ``Database.repair_fts()``.

    Args:
        quiet: When true, no Rich console is created, the "before" health
            check is skipped, and nothing is printed.
    """
    from rich.console import Console

    from footprinter.ingest.database import Database

    console = Console() if not quiet else None

    db = Database(str(get_db_path()))
    try:
        if console:
            # The health snapshot is only used for display, so it is not
            # collected in quiet mode.
            before = db.check_fts_health()
            console.print()
            console.print("[bold]FTS Repair[/bold]")
            console.print()
            for table, info in before.items():
                status = info["status"]
                icon = {"ok": "[green]ok[/green]", "error": "[red]error[/red]"}.get(status, status)
                console.print(f" Before: {table} {icon}")

        result = db.repair_fts()
    finally:
        # Release the database handle even when the health check or the
        # repair itself raises.
        db.close()

    if console:
        console.print()
        for table, counts in result.items():
            console.print(f" After: {table} [green]{counts['after']}[/green] rows")
        console.print()
        console.print("[bold green]FTS repair complete[/bold green]")
        console.print()
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # Pre-flight validation
61
+ # ---------------------------------------------------------------------------
62
+
63
+
64
def _preflight_check(conn, cursor, files_enabled, chats_enabled, console, mode: str = "full") -> dict:
    """Probe the connection for writes and count the rows a rebuild would touch.

    Args:
        conn: Open database connection (not used by the checks themselves).
        cursor: Cursor used for the write probe and the COUNT queries.
        files_enabled: Count candidate rows in ``files`` when true.
        chats_enabled: Count candidate rows in ``messages`` and ``chats`` when true.
        console: Rich console for the one-line summary, or ``None`` to stay silent.
        mode: ``"incremental"``/``"sync"`` count only rows still pending
            vectorization; any other value counts every eligible row.

    Returns:
        ``{"files": N, "messages": M, "chats": K}``; disabled categories stay 0.

    Raises:
        RuntimeError: If the write probe fails with ``sqlite3.OperationalError``.
    """
    # Write probe against a scratch temp table.
    # NOTE(review): temp tables live outside the main database file, so this
    # may not detect a read-only main database — confirm.
    try:
        for probe_sql in (
            "CREATE TEMP TABLE IF NOT EXISTS _fp_preflight (x INTEGER)",
            "INSERT INTO _fp_preflight VALUES (1)",
            "DELETE FROM _fp_preflight",
        ):
            cursor.execute(probe_sql)
    except sqlite3.OperationalError as e:
        raise RuntimeError(f"Database is not writable: {e}") from e

    incremental = mode in ("incremental", "sync")

    def _count(base_sql: str, pending_clause: str) -> int:
        """Run a COUNT query, narrowed to pending rows in incremental/sync mode."""
        cursor.execute((base_sql + pending_clause) if incremental else base_sql)
        return cursor.fetchone()[0]

    counts = {"files": 0, "messages": 0, "chats": 0}

    if files_enabled:
        counts["files"] = _count(
            "SELECT COUNT(*) FROM files "
            "WHERE source = 'local' AND status != 'removed' AND path IS NOT NULL"
            " AND COALESCE(json_extract(metadata, '$.vectorize'), 1) != 0",
            " AND (vectorized_at IS NULL OR modified_at > vectorized_at)",
        )

    if chats_enabled:
        counts["messages"] = _count(
            "SELECT COUNT(*) FROM messages "
            "WHERE content IS NOT NULL AND TRIM(content) != ''"
            " AND status != 'removed'"
            " AND COALESCE(json_extract(metadata, '$.vectorize'), 1) != 0",
            " AND vectorized_at IS NULL",
        )
        counts["chats"] = _count(
            "SELECT COUNT(*) FROM chats "
            "WHERE status != 'removed'"
            " AND COALESCE(json_extract(metadata, '$.vectorize'), 1) != 0",
            " AND metadata_vectorized_at IS NULL",
        )

    if console:
        parts = []
        if files_enabled:
            parts.append(f"{counts['files']} files")
        if chats_enabled:
            parts.extend((f"{counts['messages']} messages", f"{counts['chats']} chats"))
        label = "Will process" if incremental else "Will vectorize"
        console.print(f" {label}: {', '.join(parts)}")

    return counts
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Phase functions
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
def _cleanup_removed_vectors(conn, cursor, store, *, clean_files=True, clean_messages=True, clean_chats=True) -> dict:
    """Delete vector-store entries for rows marked ``status = 'removed'``.

    For each selected category, every removed row that still carries a
    vectorization timestamp has its vectors deleted through ``store`` and its
    vectorization columns reset to NULL. A failure on one row is logged and
    does not stop the remaining rows. A single commit is issued at the end.

    Args:
        conn: Database connection; committed once after all categories.
        cursor: Cursor used for the selects and the column resets.
        store: Object exposing ``delete_file``, ``delete_message`` and
            ``delete_chat``.
        clean_files: Process the ``files`` table.
        clean_messages: Process the ``messages`` table.
        clean_chats: Process the ``chats`` table.

    Returns:
        ``{"removed": N, "removed_messages": M, "removed_chats": C}``.
    """

    def _purge(select_sql, drop_vectors, reset_sqls, warn_fmt) -> int:
        """Drop vectors and reset DB state for each selected row; return successes."""
        cursor.execute(select_sql)
        purged = 0
        for row in cursor.fetchall():
            try:
                drop_vectors(row["id"])
                for reset_sql in reset_sqls:
                    cursor.execute(reset_sql, (row["id"],))
            except Exception as e:
                logger.warning(warn_fmt, row["id"], e)
            else:
                purged += 1
        return purged

    # --- Files ---
    removed_count = 0
    if clean_files:
        removed_count = _purge(
            "SELECT id FROM files WHERE status = 'removed' AND vectorized_at IS NOT NULL",
            lambda row_id: store.delete_file(row_id),
            ("UPDATE files SET vectorized_at = NULL, vectorized_chunks = NULL WHERE id = ?",),
            "Failed to remove vectors for file %s: %s",
        )

    # --- Messages ---
    msg_count = 0
    if clean_messages:
        msg_count = _purge(
            "SELECT id FROM messages WHERE status = 'removed' AND vectorized_at IS NOT NULL",
            lambda row_id: store.delete_message(row_id),
            ("UPDATE messages SET vectorized_at = NULL, vectorized_chunks = NULL WHERE id = ?",),
            "Failed to remove vectors for message %s: %s",
        )

    # --- Chats ---
    chat_count = 0
    if clean_chats:
        chat_count = _purge(
            "SELECT id FROM chats WHERE status = 'removed' AND metadata_vectorized_at IS NOT NULL",
            lambda row_id: store.delete_chat(row_id),
            (
                "UPDATE chats SET metadata_vectorized_at = NULL WHERE id = ?",
                # delete_chat also removes message chunks for this chat —
                # clear their vectorization state to keep DB in sync
                "UPDATE messages SET vectorized_at = NULL,"
                " vectorized_chunks = NULL"
                " WHERE chat_id = ? AND vectorized_at IS NOT NULL",
            ),
            "Failed to remove vectors for chat %s: %s",
        )

    conn.commit()
    return {"removed": removed_count, "removed_messages": msg_count, "removed_chats": chat_count}
208
+
209
+
210
def _vectorize_files(conn, cursor, store, extractor, vec_config, console, mode: str = "full") -> dict:
    """Vectorize local files.

    Selects eligible rows from ``files`` (local, not removed, with a path, not
    opted out via ``metadata.vectorize``), pre-filtered in SQL by the
    configured file types and exclude patterns, then extracts chunks for each
    file that exists on disk and writes them to the vector store. The SQLite
    row is stamped only after the vector-store write succeeded.

    Args:
        conn: Database connection; committed every 100 files and at the end.
        cursor: Cursor used for the select and the per-file updates.
        store: Vector store exposing ``index_file`` and ``upsert_file``.
        extractor: Object exposing ``extract_with_chunking(path)``.
        vec_config: Mapping read for ``file_types`` (path suffixes) and
            ``exclude_patterns`` (globs using ``*`` / ``**``).
        console: Rich console for progress lines, or ``None``.
        mode: ``"incremental"``/``"sync"`` process only new or modified files
            and upsert; any other value processes every eligible file.

    Returns:
        ``{"done": N, "chunks": M, "interrupted": bool}``.
    """
    commit_every = 100

    def _like_literal(text: str) -> str:
        """Escape LIKE metacharacters so ``text`` is matched literally."""
        return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    # Incremental/sync also switches the store call to upsert, which handles
    # both new and modified files.
    incremental = mode in ("incremental", "sync")

    file_types = vec_config.get("file_types")
    exclude_patterns = vec_config.get("exclude_patterns", [])
    where_parts = [
        "source = 'local'",
        "status != 'removed'",
        "path IS NOT NULL",
        "COALESCE(json_extract(metadata, '$.vectorize'), 1) != 0",
    ]
    params: list = []

    if incremental:
        where_parts.append("(vectorized_at IS NULL OR modified_at > vectorized_at)")

    if file_types:
        like_clauses = " OR ".join("LOWER(path) LIKE ? ESCAPE '\\'" for _ in file_types)
        where_parts.append(f"({like_clauses})")
        params.extend(f"%{_like_literal(ext.lower())}" for ext in file_types)
    for pat in exclude_patterns:
        # Escape first so that '_' and '%' in the pattern stay literal, then
        # translate the glob stars into the LIKE wildcard.
        sql_pat = _like_literal(pat).replace("**", "%").replace("*", "%")
        where_parts.append("path NOT LIKE ? ESCAPE '\\'")
        params.append(sql_pat)

    cursor.execute(
        f"SELECT id, path as file_path FROM files WHERE {' AND '.join(where_parts)} ORDER BY id",
        params,
    )
    files = cursor.fetchall()
    total = len(files)
    done = 0
    chunks = 0

    for f in files:
        if _shutdown:
            # Persist what has been stamped so far before bailing out.
            conn.commit()
            if console:
                console.print(f" [yellow]Interrupted[/yellow] at {done}/{total} files")
            return {"done": done, "chunks": chunks, "interrupted": True}

        try:
            fpath = Path(f["file_path"])
            if not fpath.exists():
                continue
            file_chunks = extractor.extract_with_chunking(fpath)
            if file_chunks:
                metadata = {"file_type": fpath.suffix.lower(), "file_name": fpath.name}
                try:
                    if incremental:
                        store.upsert_file(f["id"], f["file_path"], file_chunks, metadata)
                    else:
                        store.index_file(f["id"], f["file_path"], file_chunks, metadata)
                except Exception as e:
                    logger.warning("Chroma write failed for file %s: %s", f["file_path"], e)
                    continue
                # SQLite update only after chroma success
                cursor.execute(
                    "UPDATE files SET vectorized_at = CURRENT_TIMESTAMP, vectorized_chunks = ? WHERE id = ?",
                    (len(file_chunks), f["id"]),
                )
                done += 1
                chunks += len(file_chunks)
                if done % commit_every == 0:
                    conn.commit()
                    if console:
                        console.print(f" Vectorizing files: {done}/{total}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole phase.
            logger.debug("Skipped file %s: %s", f["file_path"], e)

    conn.commit()
    return {"done": done, "chunks": chunks, "interrupted": False}
290
+
291
+
292
def _vectorize_messages(conn, cursor, store, console, mode: str = "full") -> dict:
    """Vectorize chat messages.

    Selects non-empty, non-removed messages (joined to their chat for title
    and account), splits each into chunks with ``chunk_content`` and writes
    the chunks to the chat collection in batches. SQLite rows are stamped
    only after the vector-store write for their batch succeeded; a failed
    batch is logged and dropped.

    Args:
        conn: Database connection; committed after every flushed batch.
        cursor: Cursor used for the select and the per-message updates.
        store: Vector store exposing ``ef`` (embedding callable) and the
            ``_chats`` collection.
        console: Rich console for progress lines, or ``None``.
        mode: ``"incremental"``/``"sync"`` restrict the run to messages that
            have no ``vectorized_at`` yet; any other value processes all.

    Returns:
        ``{"done": N, "interrupted": bool}`` where N counts messages whose
        batch was written successfully.
    """
    from footprinter.semantic.chunking import chunk_content

    incremental_filter = ""
    if mode in ("incremental", "sync"):
        incremental_filter = "AND message.vectorized_at IS NULL"

    cursor.execute(
        f"""
        SELECT message.id, message.chat_id, message.role, message.content,
               message.created_at, chat.title, chat.account as source
        FROM messages message
        JOIN chats chat ON message.chat_id = chat.id
        WHERE message.content IS NOT NULL AND message.content != ''
          AND message.status != 'removed'
          AND COALESCE(json_extract(message.metadata, '$.vectorize'), 1) != 0
          {incremental_filter}
        ORDER BY message.id
        """
    )
    messages = cursor.fetchall()
    total = len(messages)
    done = 0

    batch_ids: list = []
    batch_texts: list = []
    batch_metas: list = []
    batch_msg_ids: list = []
    batch_msg_chunks: dict = {}  # msg_id -> chunk count

    def flush_batch() -> int:
        """Flush current batch to chroma then SQLite. Returns count flushed."""
        if not batch_ids:
            return 0
        try:
            embeddings = store.ef(batch_texts)
            # Always upsert: a single-phase "full" run keeps the existing
            # collection, where a plain add would not replace documents that
            # already exist under the same ids.
            store._chats.upsert(
                ids=list(batch_ids),
                embeddings=embeddings,
                documents=list(batch_texts),
                metadatas=list(batch_metas),
            )
        except Exception as e:
            logger.warning("Chroma write failed for message batch: %s", e)
            flushed = 0
        else:
            # SQLite update only after chroma success
            for mid in batch_msg_ids:
                cursor.execute(
                    "UPDATE messages SET vectorized_at = CURRENT_TIMESTAMP, vectorized_chunks = ? WHERE id = ?",
                    (batch_msg_chunks.get(mid, 0), mid),
                )
            flushed = len(batch_msg_ids)
        for buf in (batch_ids, batch_texts, batch_metas, batch_msg_ids, batch_msg_chunks):
            buf.clear()
        return flushed

    for msg in messages:
        if _shutdown:
            done += flush_batch()
            conn.commit()
            if console:
                console.print(f" [yellow]Interrupted[/yellow] at {done}/{total} messages")
            return {"done": done, "interrupted": True}

        try:
            content = msg["content"]
            if not content or not content.strip():
                continue
            msg_chunks = chunk_content(content)

            # Stage this message's entries locally and append them to the
            # shared batch in one step, so an error part-way through cannot
            # leave chunks in the batch without their message id.
            new_ids: list = []
            new_texts: list = []
            new_metas: list = []
            for chunk_text, chunk_index, total_chunks in msg_chunks:
                new_ids.append(f"msg_{msg['id']}_chunk_{chunk_index}")
                new_texts.append(chunk_text)
                new_metas.append(
                    {
                        "message_id": msg["id"],
                        "chat_id": msg["chat_id"],
                        "chunk_index": chunk_index,
                        "total_chunks": total_chunks,
                        "content_length": len(chunk_text),
                        "source": msg["source"] or "unknown",
                        "role": msg["role"] or "unknown",
                        "chat_title": (msg["title"] or "(untitled)")[:200],
                        "created_at": msg["created_at"] or "",
                        "message_position": 0,
                    }
                )
            batch_ids.extend(new_ids)
            batch_texts.extend(new_texts)
            batch_metas.extend(new_metas)
            batch_msg_chunks[msg["id"]] = len(msg_chunks)
            batch_msg_ids.append(msg["id"])

            if len(batch_ids) >= _BATCH_SIZE:
                done += flush_batch()
                conn.commit()
                if console:
                    console.print(f" Vectorizing messages: {done}/{total}")
        except Exception as e:
            # Best effort: one bad message must not abort the whole phase.
            logger.debug("Skipped message %s: %s", msg["id"], e)

    done += flush_batch()
    conn.commit()
    return {"done": done, "interrupted": False}
407
+
408
+
409
def _vectorize_chat_info(conn, cursor, store, console, mode: str = "full") -> dict:
    """Vectorize chat info records.

    Builds one searchable document per non-removed chat (title, optional
    summary, source) and upserts the documents into the chat collection in
    batches. ``metadata_vectorized_at`` is stamped only after the
    vector-store write for the batch succeeded; a failed batch is logged and
    dropped.

    Args:
        conn: Database connection; committed after every flushed batch.
        cursor: Cursor used for the select and the per-chat updates.
        store: Vector store exposing ``ef`` (embedding callable) and the
            ``_chats`` collection.
        console: Rich console for progress lines, or ``None``.
        mode: ``"incremental"``/``"sync"`` restrict the run to chats with no
            ``metadata_vectorized_at`` yet; any other value processes all.

    Returns:
        ``{"done": N, "interrupted": bool}``.
    """
    global _shutdown

    pending_only = "AND metadata_vectorized_at IS NULL" if mode in ("incremental", "sync") else ""

    cursor.execute(
        f"""
        SELECT id, title, summary, account as source, created_at, message_count
        FROM chats
        WHERE status != 'removed'
          AND COALESCE(json_extract(metadata, '$.vectorize'), 1) != 0
          {pending_only}
        ORDER BY id
        """
    )
    chats = cursor.fetchall()
    total = len(chats)
    done = 0

    # Each entry: (document id, searchable text, metadata dict, chat id)
    pending: list = []

    def flush_batch() -> int:
        """Flush current batch to chroma then SQLite. Returns count flushed."""
        if not pending:
            return 0
        doc_ids, texts, metas, chat_ids = (list(column) for column in zip(*pending))
        pending.clear()
        try:
            embeddings = store.ef(texts)
            store._chats.upsert(
                ids=doc_ids,
                embeddings=embeddings,
                documents=texts,
                metadatas=metas,
            )
        except Exception as e:
            logger.warning("Chroma write failed for chat info batch: %s", e)
            return 0
        # SQLite update only after chroma success
        for chat_id in chat_ids:
            cursor.execute(
                "UPDATE chats SET metadata_vectorized_at = CURRENT_TIMESTAMP WHERE id = ?",
                (chat_id,),
            )
        return len(chat_ids)

    for conv in chats:
        if _shutdown:
            done += flush_batch()
            conn.commit()
            if console:
                console.print(f" [yellow]Interrupted[/yellow] at {done}/{total} chats")
            return {"done": done, "interrupted": True}

        try:
            title = conv["title"] or "(untitled)"
            summary = conv["summary"]

            sections = [f"Chat: {title}"]
            if summary:
                sections.append(f"Summary: {summary}")
            sections.append(f"Source: {conv['source']}")

            info = {
                "chat_id": conv["id"],
                "chat_title": title[:200],
                "source": conv["source"] or "unknown",
                "created_at": conv["created_at"] or "",
                "message_count": conv["message_count"] or 0,
                "chunk_type": "chat_info",
                "has_summary": bool(summary),
            }
            pending.append((f"chat_info_{conv['id']}", "\n\n".join(sections), info, conv["id"]))

            if len(pending) >= _BATCH_SIZE:
                done += flush_batch()
                conn.commit()
                if console:
                    console.print(f" Vectorizing chat info: {done}/{total}")
        except Exception as e:
            logger.debug("Skipped chat %s: %s", conv["id"], e)

    done += flush_batch()
    conn.commit()
    return {"done": done, "interrupted": False}
511
+
512
+
513
+ # ---------------------------------------------------------------------------
514
+ # Sync verification
515
+ # ---------------------------------------------------------------------------
516
+
517
+
518
def _sync_verify(cursor, store, files_enabled, chats_enabled, console) -> None:
    """Report whether SQLite's chunk bookkeeping matches the vector store.

    For files, the sum of ``vectorized_chunks`` is compared with the store's
    ``total_chunks``. For chats, message chunks plus one document per
    vectorized chat are compared with the store's ``total_documents``.
    Results are only printed; nothing is returned or modified.

    Args:
        cursor: Cursor used for the aggregate queries.
        store: Vector store exposing ``get_file_stats()`` and
            ``get_chat_stats()``.
        files_enabled: Verify the files collection.
        chats_enabled: Verify the chat collection.
        console: Rich console to print to, or ``None`` to print nothing.
    """

    def _scalar(sql: str):
        """Execute ``sql`` and return the first column of its first row."""
        cursor.execute(sql)
        return cursor.fetchone()[0]

    if console:
        console.print()
        console.print("[bold]Sync verification[/bold]")

    if files_enabled:
        db_file_chunks = _scalar(
            "SELECT COALESCE(SUM(vectorized_chunks), 0) FROM files"
            " WHERE vectorized_at IS NOT NULL AND status != 'removed'"
        )
        chroma_file_chunks = store.get_file_stats().get("total_chunks", 0)

        if console:
            if db_file_chunks != chroma_file_chunks:
                console.print(
                    f" [yellow]\u26a0[/yellow] Files: DB={db_file_chunks}"
                    f" chunks, chroma={chroma_file_chunks} chunks"
                    f" (discrepancy: {abs(db_file_chunks - chroma_file_chunks)})"
                )
            else:
                console.print(
                    f" [green]\u2713[/green] Files: DB={db_file_chunks} chunks, chroma={chroma_file_chunks} chunks"
                )

    if not chats_enabled:
        return

    # Chroma stores message chunks + chat_info docs in one collection.
    # DB side: SUM(vectorized_chunks) for messages (chunk count),
    # COUNT(*) for chat_info (each chat = exactly 1 chroma doc).
    db_msg_chunks = _scalar(
        "SELECT COALESCE(SUM(vectorized_chunks), 0) FROM messages"
        " WHERE status != 'removed' AND vectorized_at IS NOT NULL"
    )

    # Detect stale chunk counts: messages vectorized before the
    # vectorized_chunks column was added will have chunks=0.
    stale_count = _scalar(
        "SELECT COUNT(*) FROM messages"
        " WHERE status != 'removed' AND vectorized_at IS NOT NULL AND vectorized_chunks = 0"
    )

    db_chat_info_count = _scalar(
        "SELECT COUNT(*) FROM chats WHERE status != 'removed' AND metadata_vectorized_at IS NOT NULL"
    )
    chroma_chat_docs = store.get_chat_stats().get("total_documents", 0)

    if not console:
        return

    if stale_count > 0:
        # Totals cannot be compared meaningfully while some chunk counts are
        # missing, so only the remedy is reported.
        console.print(
            f" [yellow]\u26a0[/yellow] Chats: {stale_count} messages"
            " missing chunk counts — re-run"
            " [bold]fp ingest --rebuild-vectors[/bold] to populate"
        )
        return

    db_total = db_msg_chunks + db_chat_info_count
    if db_total != chroma_chat_docs:
        console.print(
            f" [yellow]\u26a0[/yellow] Chats:"
            f" DB={db_msg_chunks} message chunks"
            f" + {db_chat_info_count} chat info,"
            f" chroma={chroma_chat_docs} documents"
            f" (discrepancy:"
            f" {abs(db_total - chroma_chat_docs)})"
        )
    else:
        console.print(
            f" [green]\u2713[/green] Chats:"
            f" DB={db_msg_chunks} message chunks"
            f" + {db_chat_info_count} chat info,"
            f" chroma={chroma_chat_docs} documents"
        )
592
+
593
+
594
+ # ---------------------------------------------------------------------------
595
+ # Main entry point
596
+ # ---------------------------------------------------------------------------
597
+
598
+
599
def _print_rebuild_summary(
    console,
    results,
    mode,
    *,
    phase,
    do_files,
    do_chats,
    run_files,
    run_messages,
    run_chat_info,
    interrupted,
):
    """Print the final status line and one summary line per vector category.

    A category that was scheduled reports its counts (zero when an interrupt
    arrived before it ran). A category that was not scheduled reports why:
    "disabled" when the source selection asked for it but vectorization is
    off, or the ``--source`` value that excluded it. When a single phase was
    requested, the other categories are not mentioned.
    """
    console.print()
    console.print(
        "[bold yellow]Rebuild interrupted[/bold yellow]"
        if interrupted
        else "[bold green]Rebuild complete[/bold green]"
    )

    cleanup = results.get("cleanup", {})
    # Skip reasons are only meaningful when the run was scoped by --source.
    source_scoped = phase is None

    # --- Files ---
    if run_files:
        files_r = results.get("files", {})
        done = files_r.get("done", 0)
        chunks = files_r.get("chunks", 0)
        if mode in ("incremental", "sync"):
            removed = cleanup.get("removed", 0)
            console.print(f" Files: {done} new/modified ({chunks} chunks), {removed} removed")
        else:
            console.print(f" Files: {done} vectorized ({chunks} chunks)")
    elif source_scoped and do_files:
        console.print(" Files: skipped (disabled)")
    elif source_scoped:
        console.print(" Files: skipped (--source chats)")

    # --- Messages ---
    if run_messages:
        msg_line = f" Messages: {results.get('messages', {}).get('done', 0)} vectorized"
        msg_removed = cleanup.get("removed_messages", 0)
        if msg_removed:
            msg_line += f", {msg_removed} removed"
        console.print(msg_line)
    elif source_scoped and do_chats:
        console.print(" Messages: skipped (disabled)")

    # --- Chats ---
    if run_chat_info:
        chat_line = f" Chats: {results.get('chat_info', {}).get('done', 0)} info indexed"
        chat_removed = cleanup.get("removed_chats", 0)
        if chat_removed:
            chat_line += f", {chat_removed} removed"
        console.print(chat_line)
    elif source_scoped and do_chats:
        console.print(" Chats: skipped (disabled)")
    elif source_scoped:
        console.print(" Chats: skipped (--source files)")

    console.print()


def _rebuild_vectors(
    quiet: bool = False,
    source: str = "all",
    phase: Optional[str] = None,
    mode: str = "incremental",
):
    """Rebuild the vector store from database contents.

    Installs SIGINT/SIGTERM handlers for the duration of the run so a phase
    can stop between items, and restores the previous handlers afterwards.

    Args:
        quiet: Suppress Rich output.
        source: Which vectors to rebuild — "files", "chats", or "all".
        phase: Run a single phase — "files", "messages", or "chat_info".
            Overrides source. When set, chroma is not deleted.
        mode: Rebuild mode — "incremental" (default, new/modified/removed only),
            "sync" (incremental + count verification), or "full" (delete all, rebuild).
    """
    global _shutdown
    _shutdown = False

    import shutil

    from rich.console import Console

    from footprinter.paths import get_chroma_path
    from footprinter.source_registry import get_config

    from ..semantic.vector_store import (
        VectorStore,
        _chat_vectorization_enabled,
        _file_vectorization_enabled,
    )
    from .full_content_extractor import FullContentExtractor

    config = get_config()
    console = Console() if not quiet else None

    # Determine which phases are enabled based on phase/source and flags
    do_files = source in ("files", "all")
    do_chats = source in ("chats", "all")

    if phase == "files":
        files_enabled = _file_vectorization_enabled()
        chats_enabled = False
    elif phase in ("messages", "chat_info"):
        files_enabled = False
        chats_enabled = _chat_vectorization_enabled()
    else:
        files_enabled = do_files and _file_vectorization_enabled()
        chats_enabled = do_chats and _chat_vectorization_enabled()

    # Guard: refuse when vectorization is disabled for requested phases
    if not files_enabled and not chats_enabled:
        if console:
            console.print()
            console.print(
                "[bold yellow]Vectorization is not enabled[/bold yellow] — "
                "run [bold]fp setup[/bold] or add a [bold]semantic:[/bold] "
                "section to config."
            )
            console.print()
        return

    # Install signal handlers for graceful shutdown
    old_sigint = signal.getsignal(signal.SIGINT)
    old_sigterm = signal.getsignal(signal.SIGTERM)
    signal.signal(signal.SIGINT, _handle_shutdown)
    signal.signal(signal.SIGTERM, _handle_shutdown)

    conn = None
    try:
        # Open DB connection — before any destructive action
        conn = sqlite3.connect(str(get_db_path()), timeout=10)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA busy_timeout=5000")
        conn.execute("PRAGMA foreign_keys=ON")
        cursor = conn.cursor()

        # Pre-flight validation (also prints the planned work when not quiet)
        try:
            _preflight_check(conn, cursor, files_enabled, chats_enabled, console, mode=mode)
        except RuntimeError as e:
            if console:
                console.print(f"\n[bold red]Pre-flight failed:[/bold red] {e}")
            return

        # Chroma handling: full mode (no phase) deletes everything;
        # incremental/sync and single-phase preserve existing chroma.
        if mode == "full" and phase is None:
            # Full rebuild — delete chroma and start fresh
            VectorStore.reset_instance()
            chroma_path = get_chroma_path()
            if chroma_path.exists():
                shutil.rmtree(chroma_path)
                if console:
                    console.print(f"[dim]Deleted {chroma_path}[/dim]")
            store = VectorStore.get_instance()
        else:
            # Incremental/sync/single-phase — check existing chroma integrity
            store = VectorStore.get_instance()
            integrity = store.check_integrity()
            if integrity["status"] == "corrupted":
                if console:
                    console.print()
                    console.print(f"[bold red]Chroma is corrupted:[/bold red] {integrity['error']}")
                    console.print("Run [bold]fp ingest --rebuild-vectors full[/bold] to rebuild from scratch.")
                return

        # Build extractor (vec_config also used by _vectorize_files for SQL pre-filtering)
        vec_config = config.get("vectorization", {})
        extractor = FullContentExtractor.from_config(config)

        if console:
            console.print()
            console.print(f"[bold]Rebuilding vectors[/bold] [dim]({mode})[/dim]")
            console.print()

        # Determine which phases to run
        run_files = files_enabled and (phase is None or phase == "files")
        run_messages = chats_enabled and (phase is None or phase == "messages")
        run_chat_info = chats_enabled and (phase is None or phase == "chat_info")

        results = {}

        # Cleanup removed vectors (incremental/sync only)
        if mode in ("incremental", "sync") and not _shutdown:
            results["cleanup"] = _cleanup_removed_vectors(
                conn,
                cursor,
                store,
                clean_files=run_files,
                clean_messages=run_messages,
                clean_chats=run_chat_info,
            )

        if run_files and not _shutdown:
            results["files"] = _vectorize_files(
                conn,
                cursor,
                store,
                extractor,
                vec_config,
                console,
                mode=mode,
            )

        if run_messages and not _shutdown:
            results["messages"] = _vectorize_messages(
                conn,
                cursor,
                store,
                console,
                mode=mode,
            )

        if run_chat_info and not _shutdown:
            results["chat_info"] = _vectorize_chat_info(
                conn,
                cursor,
                store,
                console,
                mode=mode,
            )

        # Sync verification — compare DB and chroma counts
        if mode == "sync" and not _shutdown:
            _sync_verify(cursor, store, files_enabled, chats_enabled, console)

        # Summary
        if console:
            # The flag also covers an interrupt that arrived during cleanup or
            # between phases, where no phase result carries "interrupted".
            interrupted = _shutdown or any(r.get("interrupted") for r in results.values())
            _print_rebuild_summary(
                console,
                results,
                mode,
                phase=phase,
                do_files=do_files,
                do_chats=do_chats,
                run_files=run_files,
                run_messages=run_messages,
                run_chat_info=run_chat_info,
                interrupted=interrupted,
            )
    finally:
        if conn is not None:
            conn.close()
        # Restore original signal handlers
        signal.signal(signal.SIGINT, old_sigint)
        signal.signal(signal.SIGTERM, old_sigterm)