footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +444 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/cli/__init__.py +128 -0
  19. footprinter/cli/__main__.py +6 -0
  20. footprinter/cli/_common.py +332 -0
  21. footprinter/cli/_policy_helpers.py +646 -0
  22. footprinter/cli/_prompt.py +220 -0
  23. footprinter/cli/api_cmd.py +32 -0
  24. footprinter/cli/connect.py +591 -0
  25. footprinter/cli/data.py +879 -0
  26. footprinter/cli/delete.py +128 -0
  27. footprinter/cli/ingest.py +579 -0
  28. footprinter/cli/mcp_cmd.py +750 -0
  29. footprinter/cli/mcp_setup.py +306 -0
  30. footprinter/cli/search.py +393 -0
  31. footprinter/cli/search_cmd.py +69 -0
  32. footprinter/cli/setup.py +1836 -0
  33. footprinter/cli/status.py +729 -0
  34. footprinter/cli/status_cmd.py +104 -0
  35. footprinter/cli/upsert.py +794 -0
  36. footprinter/cli/vectorize_cmd.py +215 -0
  37. footprinter/cli/view.py +322 -0
  38. footprinter/connectors/__init__.py +171 -0
  39. footprinter/connectors/config_utils.py +141 -0
  40. footprinter/db/__init__.py +37 -0
  41. footprinter/db/browser.py +198 -0
  42. footprinter/db/chats.py +610 -0
  43. footprinter/db/clients.py +307 -0
  44. footprinter/db/emails.py +279 -0
  45. footprinter/db/files.py +741 -0
  46. footprinter/db/folders.py +659 -0
  47. footprinter/db/messages.py +192 -0
  48. footprinter/db/policies.py +151 -0
  49. footprinter/db/projects.py +673 -0
  50. footprinter/db/search.py +573 -0
  51. footprinter/db/sql_utils.py +168 -0
  52. footprinter/db/status.py +320 -0
  53. footprinter/db/uploads.py +70 -0
  54. footprinter/ingest/__init__.py +0 -0
  55. footprinter/ingest/adapters/__init__.py +33 -0
  56. footprinter/ingest/adapters/browser.py +54 -0
  57. footprinter/ingest/adapters/chat.py +57 -0
  58. footprinter/ingest/adapters/ingest.py +146 -0
  59. footprinter/ingest/adapters/local_files.py +68 -0
  60. footprinter/ingest/adapters/local_folders.py +52 -0
  61. footprinter/ingest/adapters/protocol.py +174 -0
  62. footprinter/ingest/browser_indexer.py +216 -0
  63. footprinter/ingest/chat_dedup.py +156 -0
  64. footprinter/ingest/chat_indexer.py +515 -0
  65. footprinter/ingest/chat_parsers/__init__.py +8 -0
  66. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  67. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  68. footprinter/ingest/cli.py +827 -0
  69. footprinter/ingest/content_extractors.py +117 -0
  70. footprinter/ingest/database.py +36 -0
  71. footprinter/ingest/db/__init__.py +1 -0
  72. footprinter/ingest/db/connector_schema.py +47 -0
  73. footprinter/ingest/db/migration.py +328 -0
  74. footprinter/ingest/db/schema.py +1043 -0
  75. footprinter/ingest/db/security.py +6 -0
  76. footprinter/ingest/file_indexer.py +261 -0
  77. footprinter/ingest/file_scanner.py +277 -0
  78. footprinter/ingest/folder_indexer.py +226 -0
  79. footprinter/ingest/full_content_extractor.py +321 -0
  80. footprinter/ingest/orchestrator.py +125 -0
  81. footprinter/ingest/pipe_runner.py +217 -0
  82. footprinter/ingest/processing.py +165 -0
  83. footprinter/ingest/registry.py +201 -0
  84. footprinter/ingest/run_record.py +91 -0
  85. footprinter/ingest/status.py +346 -0
  86. footprinter/mcp/__init__.py +0 -0
  87. footprinter/mcp/__main__.py +5 -0
  88. footprinter/mcp/db.py +57 -0
  89. footprinter/mcp/errors.py +102 -0
  90. footprinter/mcp/extraction.py +226 -0
  91. footprinter/mcp/server.py +39 -0
  92. footprinter/mcp/tools/__init__.py +0 -0
  93. footprinter/mcp/tools/navigation.py +70 -0
  94. footprinter/mcp/tools/read.py +75 -0
  95. footprinter/mcp/tools/search.py +158 -0
  96. footprinter/mcp/tools/semantic.py +79 -0
  97. footprinter/mcp/tools/status.py +15 -0
  98. footprinter/paths.py +91 -0
  99. footprinter/permissions.py +1160 -0
  100. footprinter/semantic/__init__.py +13 -0
  101. footprinter/semantic/chunking.py +52 -0
  102. footprinter/semantic/embeddings.py +23 -0
  103. footprinter/semantic/hybrid_search.py +273 -0
  104. footprinter/semantic/vector_store.py +471 -0
  105. footprinter/services/__init__.py +49 -0
  106. footprinter/services/access_service.py +342 -0
  107. footprinter/services/chat_service.py +85 -0
  108. footprinter/services/client_service.py +267 -0
  109. footprinter/services/content_service.py +181 -0
  110. footprinter/services/email_service.py +89 -0
  111. footprinter/services/file_service.py +83 -0
  112. footprinter/services/folder_service.py +122 -0
  113. footprinter/services/includes.py +19 -0
  114. footprinter/services/ingest_service.py +231 -0
  115. footprinter/services/project_service.py +262 -0
  116. footprinter/services/roles.py +25 -0
  117. footprinter/services/search_service.py +177 -0
  118. footprinter/services/semantic_service.py +360 -0
  119. footprinter/services/status_service.py +18 -0
  120. footprinter/services/visit_service.py +65 -0
  121. footprinter/source_registry.py +194 -0
  122. footprinter/utils/__init__.py +7 -0
  123. footprinter/utils/hash_utils.py +59 -0
  124. footprinter/utils/logging_config.py +68 -0
  125. footprinter/utils/mime.py +30 -0
  126. footprinter/utils/text.py +6 -0
  127. footprinter/utils/time.py +11 -0
  128. footprinter/visibility.py +1272 -0
  129. footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
  130. footprinter_cli-1.0.0.dist-info/METADATA +229 -0
  131. footprinter_cli-1.0.0.dist-info/RECORD +134 -0
  132. footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
  133. footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
  134. footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1043 @@
1
+ """Database schema initialization."""
2
+
3
+ import logging
4
+ import sqlite3
5
+ from typing import Any
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ # Standard Entity Column Set
11
+ # ─────────────────────────
12
+ # All 8 entity tables (files, folders, visits, projects, chats,
13
+ # messages, emails, clients) share these baseline columns:
14
+ #
15
+ # id INTEGER PRIMARY KEY AUTOINCREMENT
16
+ # status TEXT DEFAULT 'active' CHECK (active|hidden|removed)
17
+ # created_at DATETIME (DEFAULT CURRENT_TIMESTAMP, except files, chats and messages, which declare it without a default)
18
+ # display_name TEXT (auto-populated via trigger)
19
+ # mcp_read TEXT DEFAULT 'inherit' CHECK (allow|deny|inherit)
20
+ # mcp_view TEXT DEFAULT 'inherit' CHECK (hidden|opaque|visible|inherit)
21
+ #
22
+ # Data-source entities (files, folders, emails, chats, visits, messages)
23
+ # also have audit timestamp and association columns:
24
+ # indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP (immutable first-seen)
25
+ # updated_at DATETIME DEFAULT CURRENT_TIMESTAMP (refreshed on re-process)
26
+ # project_id INTEGER REFERENCES projects(id) (not on messages, which link via chat_id)
27
+ # client_id INTEGER REFERENCES clients(id) (not on messages, which link via chat_id)
28
+ #
29
+ # Timestamp format: YYYY-MM-DD HH:MM:SS (UTC, matches SQLite CURRENT_TIMESTAMP).
30
+ # Python code uses utils.time.UTC_FMT / utc_now_iso() for the same format.
31
+ #
32
+ # Source-specific metadata is stored in the `metadata` TEXT column
33
+ # (JSON) on tables that need it: files, projects, chats, messages,
34
+ # emails, clients.
35
+ #
36
+ # Columns populated by app or future scope
37
+ # ─────────────────────────────────────────
38
+ # summary TEXT — AI-generated summary (files, emails, chats)
39
+ # summarized_at DATETIME — when summary was generated (files only)
40
+ #
41
+ # files_fts and chats_fts reference the summary column via FTS5
42
+ # triggers, so summary stays in the standard schema. emails also
43
+ # has summary for consistency. Tool-only installs leave them NULL.
44
+
45
+
46
+ # Single source of truth for FTS5 virtual table definitions.
47
+ # All CREATE TABLE, backfill, and trigger SQL is derived from this.
48
# Keyed by FTS5 virtual table name; iteration order determines the order in
# which tables, triggers and backfills are processed.  Per-table keys:
#   base_table       -- table named in the FTS5 ``content=`` option and read
#                       by the backfill SELECT.
#   columns          -- indexed columns, in declaration order.
#   content_columns  -- subset of ``columns`` that the backfill replaces with
#                       NULL when the row's mcp_view is 'opaque' or 'hidden'.
_FTS_DEFINITIONS: dict[str, dict[str, Any]] = {
    "files_fts": {
        "base_table": "files",
        "columns": ["name", "content_preview", "summary"],
        "content_columns": ["content_preview", "summary"],
    },
    "emails_fts": {
        "base_table": "emails",
        "columns": ["subject", "from_name", "from_address", "body_preview"],
        "content_columns": ["body_preview"],
    },
    "chats_fts": {
        "base_table": "chats",
        "columns": ["title", "summary"],
        "content_columns": ["summary"],
    },
}
65
+
66
+ # Single source of truth for the ingests table DDL.
67
+ # Referenced by both migration.py (early creation for last-run migration)
68
+ # and init_db() (canonical DDL).
69
# One row per ingest run.  Built from adjacent string literals so the
# statement stays a single line of SQL; IF NOT EXISTS makes it safe to
# execute repeatedly.
_INGESTS_DDL = (
    "CREATE TABLE IF NOT EXISTS ingests ("
    "id INTEGER PRIMARY KEY AUTOINCREMENT, "
    "pipe TEXT NOT NULL, "
    "started_at DATETIME NOT NULL, "
    "completed_at DATETIME, "
    # A new row starts as 'running'; the CHECK restricts the allowed states.
    "status TEXT NOT NULL DEFAULT 'running' "
    " CHECK (status IN ('running', 'completed', 'failed', 'interrupted')), "
    "mode TEXT, "
    "trigger TEXT, "
    # Per-run counters, all defaulting to zero.
    "items_processed INTEGER DEFAULT 0, "
    "items_new INTEGER DEFAULT 0, "
    "items_updated INTEGER DEFAULT 0, "
    "items_skipped INTEGER DEFAULT 0, "
    "errors INTEGER DEFAULT 0, "
    "elapsed_seconds REAL, "
    "metadata TEXT)"
)
87
+
88
+
89
+ # All 8 entity tables that carry mcp_read / mcp_view columns.
90
+ # Shared by init_db() (display_name triggers) and migration.py.
91
# NOTE(review): init_db() builds its display_name triggers from its own
# hard-coded mapping whose keys are these same eight names; keep the two in
# sync when adding or removing an entity table.
ACCESS_CONTROL_TABLES: tuple[str, ...] = (
    "files",
    "folders",
    "visits",
    "projects",
    "chats",
    "messages",
    "emails",
    "clients",
)
101
+
102
+
103
+ class SchemaMixin:
104
+ """Mixin providing database schema initialization."""
105
+
106
def init_db(self):
    """Open the SQLite database at ``self.db_path`` and create its schema.

    Steps, in order:

    1. Connect (stored on ``self.conn``), enable WAL and set the busy timeout.
    2. If the database already contains user tables, run ``migrate_schema``.
    3. Turn on foreign-key enforcement (after migrations, see below).
    4. Create every table and index with ``IF NOT EXISTS``.
    5. Create the FTS5 tables, (re)create their triggers through
       ``self.create_fts_triggers()`` and backfill any empty index.
    6. Create the ``display_name`` AFTER INSERT triggers and commit.
    7. Best-effort seeding of the ``sources`` registry from config; a
       failure here is logged as a warning and does not propagate.

    Raises:
        sqlite3.OperationalError: for any DDL failure other than the FTS5
            module being unavailable (which only logs a warning).
    """
    self.conn = sqlite3.connect(self.db_path, timeout=10)
    self.conn.row_factory = sqlite3.Row
    self.conn.execute("PRAGMA journal_mode=WAL")
    # NOTE(review): this pragma sets a 5 s busy timeout after connect() was
    # given timeout=10 -- confirm which of the two values is intended.
    self.conn.execute("PRAGMA busy_timeout=5000")

    cursor = self.conn.cursor()

    # Only run migration on existing databases (not fresh installs).
    cursor.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
    if cursor.fetchone() is not None:
        from footprinter.ingest.db.migration import migrate_schema

        migrate_schema(cursor)

    # Enable FK enforcement AFTER migrations. The browser_visits →
    # visits rename triggers SQLite's schema rewriter which recompiles
    # FK references. The messages table's FK was originally REFERENCES
    # chat_conversations(id); with foreign_keys ON the rewriter
    # validates the stale compiled reference and fails.
    self.conn.execute("PRAGMA foreign_keys=ON")

    # ========================================
    # Files Table (unified content metadata)
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,

            -- Source identification
            source TEXT NOT NULL,
            external_id TEXT,
            account TEXT,

            -- Core file info
            name TEXT NOT NULL,
            path TEXT,
            content_type TEXT,
            mime_type TEXT,
            size_bytes INTEGER,

            -- Origin timestamps
            created_at DATETIME,
            modified_at DATETIME,
            accessed_at DATETIME,

            -- Audit timestamps
            indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            -- Content
            content_preview TEXT,
            sha256_hash TEXT,

            -- Vectorization status
            vectorized_at DATETIME,
            vectorized_chunks INTEGER DEFAULT 0,

            -- Project/client association
            project_id INTEGER REFERENCES projects(id),
            client_id INTEGER REFERENCES clients(id),

            -- Flexible metadata (source-specific fields as JSON)
            metadata TEXT,

            -- Folder linkage
            folder_id INTEGER REFERENCES folders(id),

            -- Hash for Drive linking
            md5_hash TEXT,

            -- Status tracking
            status TEXT DEFAULT 'active'
                CHECK (status IN ('active', 'hidden', 'removed')),
            status_reason TEXT,
            status_changed_at DATETIME,

            -- MCP access control
            mcp_read TEXT DEFAULT 'inherit'
                CHECK (mcp_read IN ('allow', 'deny', 'inherit')),
            mcp_view TEXT DEFAULT 'inherit'
                CHECK (mcp_view IN ('hidden', 'opaque', 'visible', 'inherit')),

            -- AI-generated summaries
            summary TEXT,
            summarized_at DATETIME,

            -- Display
            display_name TEXT
        )
        """
    )

    # Files indexes
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_source ON files(source)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_modified ON files(modified_at)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_type ON files(content_type)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_project ON files(project_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_hash ON files(sha256_hash)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_account ON files(account)")

    # Local files are unique per path; remote files per (external_id, account).
    cursor.execute(
        """
        CREATE UNIQUE INDEX IF NOT EXISTS idx_files_local_unique
        ON files(source, path)
        WHERE source = 'local' AND path IS NOT NULL
        """
    )

    cursor.execute(
        """
        CREATE UNIQUE INDEX IF NOT EXISTS idx_files_drive_unique
        ON files(source, external_id, account)
        WHERE source != 'local' AND external_id IS NOT NULL
        """
    )

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_folder ON files(folder_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_md5 ON files(md5_hash)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_status ON files(status)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_visibility ON files(mcp_view)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_client ON files(client_id)")

    # ========================================
    # Folders Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS folders (
            id INTEGER PRIMARY KEY AUTOINCREMENT,

            -- Core folder info
            path TEXT NOT NULL,
            relative_path TEXT NOT NULL,
            name TEXT NOT NULL,
            parent_path TEXT,

            -- Stats
            file_count INTEGER DEFAULT 0,

            -- Timestamps
            scanned_at DATETIME,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            -- Project association
            project_id INTEGER REFERENCES projects(id),

            -- Source identification (for remote folders)
            source TEXT DEFAULT 'local',
            external_id TEXT,
            account TEXT,

            -- Hierarchy
            parent_folder_id INTEGER REFERENCES folders(id),

            -- Pre-computed counts
            direct_file_count INTEGER DEFAULT 0,
            total_file_count INTEGER DEFAULT 0,
            total_size_bytes INTEGER DEFAULT 0,

            -- Status tracking
            status TEXT DEFAULT 'active'
                CHECK (status IN ('active', 'hidden', 'removed')),

            -- Audit timestamps
            indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            -- Client association
            client_id INTEGER REFERENCES clients(id),

            -- MCP access control
            mcp_view TEXT DEFAULT 'inherit'
                CHECK (mcp_view IN ('hidden', 'opaque', 'visible', 'inherit')),
            mcp_read TEXT DEFAULT 'inherit'
                CHECK (mcp_read IN ('allow', 'deny', 'inherit')),

            -- Display
            display_name TEXT
        )
        """
    )

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_folders_path ON folders(path)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_folders_project ON folders(project_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_folders_source ON folders(source)")
    cursor.execute(
        "CREATE UNIQUE INDEX IF NOT EXISTS idx_folders_unique_path ON folders(path) WHERE source = 'local'"
    )
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_folders_visibility ON folders(mcp_view)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_folders_status ON folders(status)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_folders_client ON folders(client_id)")

    # ========================================
    # Visits Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS visits (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT NOT NULL,
            title TEXT,
            visit_time DATETIME NOT NULL,
            browser TEXT NOT NULL,
            visit_count INTEGER DEFAULT 1,

            -- Audit timestamps
            indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            -- Status tracking
            status TEXT DEFAULT 'active'
                CHECK (status IN ('active', 'hidden', 'removed')),

            -- MCP access control
            mcp_read TEXT DEFAULT 'inherit'
                CHECK (mcp_read IN ('allow', 'deny', 'inherit')),
            mcp_view TEXT DEFAULT 'inherit'
                CHECK (mcp_view IN ('hidden', 'opaque', 'visible', 'inherit')),

            -- Origin timestamps
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            -- Client/project association
            client_id INTEGER REFERENCES clients(id),
            project_id INTEGER REFERENCES projects(id),

            -- Display
            display_name TEXT
        )
        """
    )

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_visits_time ON visits(visit_time)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_visits_browser ON visits(browser)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_visits_project ON visits(project_id)")
    cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_visits_unique ON visits(url, visit_time, browser)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_visits_client ON visits(client_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_visits_status ON visits(status)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_visits_visibility ON visits(mcp_view)")

    # ========================================
    # Projects Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS projects (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            project_name TEXT NOT NULL,
            description TEXT,
            status TEXT DEFAULT 'active'
                CHECK (status IN ('active', 'hidden', 'removed',
                                  'paused', 'completed', 'abandoned',
                                  'archived', 'merged')),
            status_reason TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            metadata TEXT,

            -- Code project info (app-scope adds more columns)
            root_path TEXT,
            project_type TEXT,

            -- Client association
            client_id INTEGER REFERENCES clients(id),
            client TEXT,
            github_url TEXT,
            root_folder_id INTEGER REFERENCES folders(id),

            -- MCP access control
            mcp_read TEXT DEFAULT 'inherit'
                CHECK (mcp_read IN ('allow', 'deny', 'inherit')),
            mcp_view TEXT DEFAULT 'inherit'
                CHECK (mcp_view IN ('hidden', 'opaque', 'visible', 'inherit')),

            -- Display
            display_name TEXT
        )
        """
    )

    cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_projects_root ON projects(root_path)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_projects_client ON projects(client_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_projects_visibility ON projects(mcp_view)")

    # ========================================
    # Chats Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS chats (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            external_id TEXT UNIQUE NOT NULL,
            account TEXT NOT NULL,
            title TEXT,
            summary TEXT,

            -- Origin timestamps
            created_at DATETIME,
            modified_at DATETIME,

            message_count INTEGER DEFAULT 0,

            -- Audit timestamps
            indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            metadata TEXT,

            -- Vectorization
            metadata_vectorized_at DATETIME,

            -- Status tracking
            status TEXT DEFAULT 'active'
                CHECK (status IN ('active', 'hidden', 'removed', 'merged')),

            -- MCP access control
            mcp_read TEXT DEFAULT 'inherit'
                CHECK (mcp_read IN ('allow', 'deny', 'inherit')),
            mcp_view TEXT DEFAULT 'inherit'
                CHECK (mcp_view IN ('hidden', 'opaque', 'visible', 'inherit')),

            -- Client/project association
            client_id INTEGER REFERENCES clients(id),
            project_id INTEGER REFERENCES projects(id),

            -- Merge tracking
            merged_into_id INTEGER REFERENCES chats(id),

            -- Display
            display_name TEXT
        )
        """
    )

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_chat_conv_created ON chats(created_at)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_chat_conv_account ON chats(account)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_chat_conv_status ON chats(status)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_chats_client ON chats(client_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_chats_project ON chats(project_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_chats_visibility ON chats(mcp_view)")

    # ========================================
    # Messages Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS messages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            chat_id INTEGER NOT NULL,
            message_id TEXT,
            role TEXT NOT NULL,
            content TEXT,
            created_at DATETIME,
            metadata TEXT,
            vectorized_at DATETIME,
            vectorized_chunks INTEGER DEFAULT 0,

            -- Audit timestamps
            indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            -- Status tracking
            status TEXT DEFAULT 'active'
                CHECK (status IN ('active', 'hidden', 'removed')),

            -- MCP access control
            mcp_read TEXT DEFAULT 'inherit'
                CHECK (mcp_read IN ('allow', 'deny', 'inherit')),
            mcp_view TEXT DEFAULT 'inherit'
                CHECK (mcp_view IN ('hidden', 'opaque', 'visible', 'inherit')),

            -- Display
            display_name TEXT,

            FOREIGN KEY (chat_id) REFERENCES chats(id)
        )
        """
    )

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_chat_msg_conv ON messages(chat_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_chat_msg_created ON messages(created_at)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_messages_visibility ON messages(mcp_view)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_messages_status ON messages(status)")

    # ========================================
    # Emails Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS emails (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            message_id TEXT NOT NULL,
            thread_id TEXT NOT NULL,
            account TEXT NOT NULL,
            from_address TEXT,
            from_name TEXT,
            to_addresses TEXT,
            cc_addresses TEXT,
            subject TEXT,
            body_preview TEXT,
            received_at DATETIME NOT NULL,
            labels TEXT,
            has_attachments BOOLEAN DEFAULT 0,
            is_read BOOLEAN DEFAULT 1,

            -- Audit timestamps
            indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            metadata TEXT,

            -- Status tracking
            status TEXT DEFAULT 'active'
                CHECK (status IN ('active', 'hidden', 'removed')),

            -- MCP access control
            mcp_read TEXT DEFAULT 'inherit'
                CHECK (mcp_read IN ('allow', 'deny', 'inherit')),
            mcp_view TEXT DEFAULT 'inherit'
                CHECK (mcp_view IN ('hidden', 'opaque', 'visible', 'inherit')),

            -- AI-generated summaries
            summary TEXT,

            -- Timestamps
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,

            -- Client/project association
            client_id INTEGER REFERENCES clients(id),
            project_id INTEGER REFERENCES projects(id),

            -- Display
            display_name TEXT,

            UNIQUE(message_id, account)
        )
        """
    )

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_email_account ON emails(account)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_email_received ON emails(received_at)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_email_from ON emails(from_address)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_email_thread ON emails(thread_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_emails_client ON emails(client_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_emails_project ON emails(project_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_emails_visibility ON emails(mcp_view)")

    # ========================================
    # Clients Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS clients (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL UNIQUE,
            slug TEXT NOT NULL UNIQUE,
            client_type TEXT NOT NULL,
            path_pattern TEXT,
            status TEXT DEFAULT 'active'
                CHECK (status IN ('active', 'hidden', 'removed')),
            status_reason TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            metadata TEXT,

            -- MCP access control
            mcp_read TEXT DEFAULT 'inherit'
                CHECK (mcp_read IN ('allow', 'deny', 'inherit')),
            mcp_view TEXT DEFAULT 'inherit'
                CHECK (mcp_view IN ('hidden', 'opaque', 'visible', 'inherit')),

            -- Display
            display_name TEXT
        )
        """
    )

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_clients_slug ON clients(slug)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_clients_type ON clients(client_type)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_clients_visibility ON clients(mcp_view)")

    # ========================================
    # Sources Table (runtime registry)
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS sources (
            name TEXT PRIMARY KEY,
            source_type TEXT NOT NULL,
            adapter TEXT,
            account TEXT,
            label TEXT,
            icon TEXT,
            enabled INTEGER DEFAULT 1,
            config TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
        """
    )
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_sources_type ON sources(source_type)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_sources_enabled ON sources(enabled)")

    # ========================================
    # Uploads Table (generic upload log)
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS uploads (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            file_hash TEXT NOT NULL UNIQUE,
            file_size INTEGER,
            type TEXT NOT NULL,
            source TEXT,
            items_added INTEGER DEFAULT 0,
            items_updated INTEGER DEFAULT 0,
            items_total INTEGER DEFAULT 0,
            status TEXT DEFAULT 'pending',
            error_message TEXT,
            uploaded_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            completed_at DATETIME,
            metadata TEXT
        )
        """
    )
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_uploads_type ON uploads(type)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_uploads_hash ON uploads(file_hash)")

    # ========================================
    # Permission Policies Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS permission_policies (
            scope TEXT PRIMARY KEY,
            setting TEXT NOT NULL CHECK (setting IN ('allow', 'deny')),
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
        """
    )

    # ========================================
    # Visibility Policies Table
    # ========================================
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS visibility_policies (
            scope TEXT PRIMARY KEY,
            setting TEXT NOT NULL CHECK (setting IN ('hidden', 'opaque', 'visible')),
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
        """
    )

    # ========================================
    # Ingests Table (per-pipe run history)
    # ========================================
    cursor.execute(_INGESTS_DDL)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_ingests_pipe_status ON ingests(pipe, status)")

    # ========================================
    # FTS5 Full-Text Search Indexes
    # ========================================
    for fts_table in _FTS_DEFINITIONS:
        try:
            cursor.execute(self._fts_create_sql(fts_table, if_not_exists=True))
        except sqlite3.OperationalError as e:
            # A SQLite build without FTS5 is tolerated; anything else is fatal.
            if "no such module: fts5" in str(e):
                logger.warning(
                    "FTS5 not available — %s keyword search will use LIKE fallback",
                    _FTS_DEFINITIONS[fts_table]["base_table"],
                )
            else:
                raise

    # Drop old FTS _au triggers so they can be recreated with WHEN
    # clauses (prevents spurious re-indexing on non-FTS column updates).
    for fts_table in _FTS_DEFINITIONS:
        cursor.execute(f"DROP TRIGGER IF EXISTS {fts_table}_au")

    # Create all FTS triggers (shared with rebuild_fts_indexes)
    self.create_fts_triggers()

    # ========================================
    # FTS5 Backfill (idempotent)
    # ========================================
    try:
        for fts_table in _FTS_DEFINITIONS:
            # These are external-content FTS5 tables: a plain (non-MATCH)
            # scan such as COUNT(*) is answered from the base table, so it
            # cannot tell whether the index itself holds anything.  The
            # <name>_docsize shadow table has one row per indexed document,
            # so probe that to decide whether a backfill is needed.
            cursor.execute(f"SELECT EXISTS(SELECT 1 FROM {fts_table}_docsize)")
            if not cursor.fetchone()[0]:
                cursor.execute(self._fts_backfill_sql(fts_table))
    except sqlite3.OperationalError:
        # Raised when the FTS tables (and hence their shadow tables) are absent.
        logger.debug("FTS5 backfill skipped — FTS tables do not exist")

    # ========================================
    # display_name AFTER INSERT triggers
    # ========================================
    # Maps each entity table to the SQL expression copied into display_name
    # when a row is inserted without one.
    _DISPLAY_NAME_SOURCES = {
        "files": "NEW.name",
        "folders": "NEW.name",
        "visits": "NEW.title",
        "projects": "NEW.project_name",
        "chats": "NEW.title",
        "messages": "SUBSTR(NEW.content, 1, 100)",
        "emails": "NEW.subject",
        "clients": "NEW.name",
    }
    for table, source_expr in _DISPLAY_NAME_SOURCES.items():
        cursor.execute(f"""
            CREATE TRIGGER IF NOT EXISTS set_display_name_{table}
            AFTER INSERT ON {table}
            FOR EACH ROW
            WHEN NEW.display_name IS NULL
            BEGIN
                UPDATE {table} SET display_name = {source_expr}
                WHERE id = NEW.id;
            END
        """)

    self.conn.commit()

    # Seed the sources registry from config
    try:
        from footprinter.source_registry import SourceRegistry

        registry = SourceRegistry(self.conn)
        registry.seed_from_config()
    except Exception as e:
        # Deliberately best-effort: schema setup has already been committed.
        logger.warning("Could not seed sources from config: %s", e)
739
+
740
+ # ========================================
741
+ # FTS Trigger Management
742
+ # ========================================
743
+
744
# Every sync trigger name: one after-insert (ai), after-delete (ad) and
# after-update (au) trigger per FTS table, in definition order.
_FTS_TRIGGER_NAMES = [
    f"{table_name}_{kind}"
    for table_name in _FTS_DEFINITIONS
    for kind in ("ai", "ad", "au")
]
745
+
746
@staticmethod
def _fts_create_sql(fts_table: str, *, if_not_exists: bool = False) -> str:
    """Build the CREATE VIRTUAL TABLE statement for one FTS5 table.

    The column list and the backing table are read from
    ``_FTS_DEFINITIONS[fts_table]``; the statement always declares
    ``content_rowid='id'``.

    Args:
        fts_table: Key into ``_FTS_DEFINITIONS`` and name of the virtual table.
        if_not_exists: When true, emit ``IF NOT EXISTS`` so that re-running
            the statement is harmless.

    Returns:
        The SQL text; nothing is executed here.
    """
    spec = _FTS_DEFINITIONS[fts_table]
    column_list = ", ".join(spec["columns"])
    source_table = spec["base_table"]

    if if_not_exists:
        guard = "IF NOT EXISTS "
    else:
        guard = ""

    head = f"CREATE VIRTUAL TABLE {guard}{fts_table} USING fts5("
    tail = f"{column_list}, content='{source_table}', content_rowid='id')"
    return head + tail
756
+
757
@staticmethod
def _fts_backfill_sql(fts_table: str) -> str:
    """Build the INSERT...SELECT that fills an FTS table from its base table.

    Columns listed under ``content_columns`` in the definition are selected
    through a CASE expression that yields NULL when the row's ``mcp_view``
    is ``'opaque'`` or ``'hidden'`` (a NULL ``mcp_view`` is treated as
    ``'inherit'``); all other columns are selected as-is.

    Args:
        fts_table: Key into ``_FTS_DEFINITIONS``.

    Returns:
        The SQL text; nothing is executed here.
    """
    spec = _FTS_DEFINITIONS[fts_table]
    indexed = spec["columns"]
    masked = set(spec.get("content_columns", []))

    select_list = ", ".join(
        f"CASE WHEN COALESCE(mcp_view, 'inherit') IN ('opaque', 'hidden') THEN NULL ELSE {name} END"
        if name in masked
        else name
        for name in indexed
    )
    target_list = ", ".join(indexed)
    source_table = spec["base_table"]
    return f"INSERT INTO {fts_table}(rowid, {target_list}) SELECT id, {select_list} FROM {source_table}"
773
+
774
@staticmethod
def _fts_col_expr(col: str, prefix: str, content_columns: set[str]) -> str:
    """Build the SQL expression an FTS trigger uses for one column value.

    Args:
        col: Column name.
        prefix: Row alias the column is read from (the triggers pass
            ``"new"`` or ``"old"``).
        content_columns: Names of columns that must be masked.

    Returns:
        ``prefix.col`` for a column outside ``content_columns``. For a
        content column, a CASE expression that evaluates to NULL when the
        row's ``mcp_view`` is ``'opaque'`` or ``'hidden'`` (NULL counts as
        ``'inherit'``), so that such content is not written to the FTS index.
    """
    qualified = f"{prefix}.{col}"
    if col not in content_columns:
        # Metadata columns pass through unchanged.
        return qualified

    visibility = f"COALESCE({prefix}.mcp_view, 'inherit')"
    return f"CASE WHEN {visibility} IN ('opaque', 'hidden') THEN NULL ELSE {qualified} END"
788
+
789
@staticmethod
def _fts_trigger_sql(fts_table: str) -> list[str]:
    """Build the three sync-trigger statements for one FTS table.

    Args:
        fts_table: Key into ``_FTS_DEFINITIONS``.

    Returns:
        ``CREATE TRIGGER IF NOT EXISTS`` statements in the order
        after-insert (``_ai``), after-delete (``_ad``), after-update (``_au``).
    """
    spec = _FTS_DEFINITIONS[fts_table]
    base = spec["base_table"]
    columns = spec["columns"]
    masked = set(spec.get("content_columns", []))
    column_list = ", ".join(columns)

    new_values = ", ".join(SchemaMixin._fts_col_expr(name, "new", masked) for name in columns)
    old_values = ", ".join(SchemaMixin._fts_col_expr(name, "old", masked) for name in columns)

    # The update trigger fires only when an indexed column or mcp_view
    # changes. mcp_view is included because it decides whether content
    # columns are stored as NULL. The guard keeps unrelated updates (e.g.
    # display_name) from re-indexing, and avoids corruption when an
    # AFTER INSERT trigger issues an UPDATE on the row it just inserted.
    changed = " OR ".join(f"OLD.{name} IS NOT NEW.{name}" for name in [*columns, "mcp_view"])

    # Statement fragments shared between the triggers.
    add_row = f"INSERT INTO {fts_table}(rowid, {column_list}) VALUES (new.id, {new_values});"
    remove_row = (
        f"INSERT INTO {fts_table}({fts_table}, rowid, {column_list}) "
        f"VALUES ('delete', old.id, {old_values});"
    )

    on_insert = f"CREATE TRIGGER IF NOT EXISTS {fts_table}_ai AFTER INSERT ON {base} BEGIN {add_row} END"
    on_delete = f"CREATE TRIGGER IF NOT EXISTS {fts_table}_ad AFTER DELETE ON {base} BEGIN {remove_row} END"
    on_update = (
        f"CREATE TRIGGER IF NOT EXISTS {fts_table}_au AFTER UPDATE ON {base} "
        f"WHEN {changed} BEGIN {remove_row} {add_row} END"
    )
    return [on_insert, on_delete, on_update]
826
+
827
def check_fts_triggers(self) -> list[str]:
    """List the expected FTS triggers that the database does not have.

    Looks up every name in ``_FTS_TRIGGER_NAMES`` in ``sqlite_master``.

    Returns:
        The missing trigger names, in ``_FTS_TRIGGER_NAMES`` order; an
        empty list when every trigger is present.
    """
    expected = self._FTS_TRIGGER_NAMES
    marks = ", ".join(["?"] * len(expected))
    query = f"SELECT name FROM sqlite_master WHERE type='trigger' AND name IN ({marks})"

    rows = self.conn.cursor().execute(query, expected).fetchall()
    found = {row[0] for row in rows}
    return [name for name in expected if name not in found]
842
+
843
def drop_fts_triggers(self) -> None:
    """Drop every trigger named in ``_FTS_TRIGGER_NAMES`` and commit.

    A "no such module: fts5" error is logged at debug level and swallowed,
    so the call is safe when FTS5 is unavailable.

    Raises:
        sqlite3.OperationalError: For any other operational failure.
    """
    try:
        cur = self.conn.cursor()
        for trigger_name in self._FTS_TRIGGER_NAMES:
            cur.execute(f"DROP TRIGGER IF EXISTS {trigger_name}")
        self.conn.commit()
        logger.info("Dropped FTS triggers for bulk ingest")
    except sqlite3.OperationalError as e:
        if "no such module: fts5" not in str(e):
            raise
        logger.debug("drop_fts_triggers skipped — FTS5 not available")
856
+
857
def create_fts_triggers(self) -> None:
    """Create the sync triggers for every FTS table that currently exists.

    FTS tables absent from ``sqlite_master`` are skipped; when none exist
    the method returns without committing. A "no such module: fts5" error
    is logged at debug level and swallowed, so the call is safe when FTS5
    is unavailable.

    Raises:
        sqlite3.OperationalError: For any other operational failure.
    """
    try:
        cur = self.conn.cursor()

        # Find which of the defined FTS tables are actually present.
        marks = ", ".join(["?"] * len(_FTS_DEFINITIONS))
        cur.execute(
            f"SELECT name FROM sqlite_master WHERE type='table' AND name IN ({marks})",
            list(_FTS_DEFINITIONS),
        )
        existing = {row[0] for row in cur.fetchall()}
        if not existing:
            logger.debug("create_fts_triggers skipped — no FTS tables exist")
            return

        for table_name in _FTS_DEFINITIONS:
            if table_name not in existing:
                continue
            for statement in self._fts_trigger_sql(table_name):
                cur.execute(statement)

        self.conn.commit()
    except sqlite3.OperationalError as e:
        if "no such module: fts5" not in str(e):
            raise
        logger.debug("create_fts_triggers skipped — FTS5 not available")
884
+
885
def rebuild_fts_indexes(self) -> None:
    """Recreate every FTS table from its base table, then restore triggers.

    Each table is dropped, created and backfilled rather than rebuilt with
    the FTS5 ``rebuild`` command, so that ``_fts_backfill_sql`` can store
    NULL for the content columns of opaque/hidden records.

    "no such table" and "no such module" errors are logged at debug level
    and swallowed, so the call is safe when FTS5 is unavailable. Triggers
    are recreated in a ``finally`` block whether or not the rebuild worked.

    Raises:
        sqlite3.OperationalError: For any other operational failure.
    """
    try:
        cur = self.conn.cursor()

        # Triggers reference the FTS tables, so they have to go first.
        for trigger_name in self._FTS_TRIGGER_NAMES:
            cur.execute(f"DROP TRIGGER IF EXISTS {trigger_name}")

        # Drop, recreate and refill each table with the filtered backfill.
        for table_name in _FTS_DEFINITIONS:
            cur.execute(f"DROP TABLE IF EXISTS {table_name}")
            cur.execute(self._fts_create_sql(table_name))
            cur.execute(self._fts_backfill_sql(table_name))

        row_counts = {}
        for table_name in _FTS_DEFINITIONS:
            row_counts[table_name] = cur.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]

        self.conn.commit()
        summary = ", ".join(f"{name}={count}" for name, count in row_counts.items())
        logger.info("Rebuilt FTS indexes: %s", summary)
    except sqlite3.OperationalError as e:
        reason = str(e)
        if not ("no such table" in reason or "no such module" in reason):
            raise
        logger.debug("rebuild_fts_indexes skipped: %s", e)
    finally:
        # Restore triggers on every exit path, including a raised error.
        self.create_fts_triggers()
923
+
924
+ # ========================================
925
+ # FTS Health Check & Repair
926
+ # ========================================
927
+
928
# FTS table name -> name of the base table it indexes.
_FTS_TABLE_MAP = {
    table_name: spec["base_table"]
    for table_name, spec in _FTS_DEFINITIONS.items()
}
929
+
930
def check_fts_health(self) -> dict:
    """Check FTS table health: existence and queryability.

    All three FTS tables are external content tables, so
    ``SELECT COUNT(*)`` delegates to the content table and row counts
    always match. Drift detection via row counts is therefore a no-op.
    Real drift protection comes from sync triggers and
    auto-recovery on pipeline startup.

    We don't use FTS5 ``integrity-check`` because our triggers
    intentionally NULL content columns for opaque/hidden records.

    Safe to call when FTS5 is unavailable — returns all tables as
    ``"error"`` with an explanatory message.

    Returns a dict keyed by FTS table name, each with:
        status: "ok" | "error"
        fts_rows: int (or None if table missing)
        base_rows: int
        message: str (only on error)
        triggers_missing: list[str] (trigger names missing for this table,
            in the order ``check_fts_triggers()`` reports them)

    Raises:
        sqlite3.OperationalError: If counting an FTS table fails for a
            reason other than a missing table or missing fts5 module.
            Errors from counting a base table are not caught here.
    """
    cursor = self.conn.cursor()
    result = {}
    # Filter the ordered list directly. Converting it to a set first would
    # make the order of ``triggers_missing`` depend on string hashing and
    # therefore vary from one process to the next.
    all_missing = self.check_fts_triggers()

    for fts_table, base_table in self._FTS_TABLE_MAP.items():
        trigger_prefix = f"{fts_table}_"
        table_triggers_missing = [t for t in all_missing if t.startswith(trigger_prefix)]
        base_rows = cursor.execute(f"SELECT COUNT(*) FROM {base_table}").fetchone()[0]

        try:
            fts_rows = cursor.execute(f"SELECT COUNT(*) FROM {fts_table}").fetchone()[0]
        except sqlite3.OperationalError as e:
            if "no such module: fts5" in str(e) or "no such table" in str(e):
                result[fts_table] = {
                    "status": "error",
                    "fts_rows": None,
                    "base_rows": base_rows,
                    "message": f"{fts_table} is missing or corrupted",
                    "triggers_missing": table_triggers_missing,
                }
                continue
            raise
        except sqlite3.DatabaseError:
            # OperationalError is a DatabaseError subclass and is handled
            # above; this branch covers the remaining database-level errors.
            result[fts_table] = {
                "status": "error",
                "fts_rows": None,
                "base_rows": base_rows,
                "message": f"{fts_table} is corrupted or unreadable",
                "triggers_missing": table_triggers_missing,
            }
            continue

        result[fts_table] = {
            "status": "ok",
            "fts_rows": fts_rows,
            "base_rows": base_rows,
            "triggers_missing": table_triggers_missing,
        }

    return result
991
+
992
def repair_fts(self) -> dict:
    """Drop and rebuild all FTS tables from base table data.

    Safe to call when FTS5 is unavailable — logs a debug message
    and returns empty dict. Always restores triggers in a finally
    block, matching the safety pattern of ``rebuild_fts_indexes()``.

    Returns a dict keyed by FTS table name with before/after row counts.
    ``before`` is ``None`` when the table could not be counted prior to
    the repair.

    Raises:
        sqlite3.OperationalError: For operational failures other than
            "no such module: fts5" during the drop/recreate/backfill steps.
    """
    try:
        cursor = self.conn.cursor()

        # Capture before state. A table that needs repairing may be
        # unreadable, so any DatabaseError (not only its OperationalError
        # subclass) is recorded as None instead of aborting the repair.
        # check_fts_health() reports this same query failing with
        # DatabaseError as "corrupted or unreadable".
        before = {}
        for fts_table in self._FTS_TABLE_MAP:
            try:
                before[fts_table] = cursor.execute(f"SELECT COUNT(*) FROM {fts_table}").fetchone()[0]
            except sqlite3.DatabaseError:
                before[fts_table] = None

        # Drop triggers and FTS tables
        self.drop_fts_triggers()
        for fts_table in self._FTS_TABLE_MAP:
            cursor.execute(f"DROP TABLE IF EXISTS {fts_table}")

        # Recreate FTS virtual tables and backfill from base tables
        for fts_table in _FTS_DEFINITIONS:
            cursor.execute(self._fts_create_sql(fts_table))
            cursor.execute(self._fts_backfill_sql(fts_table))

        self.conn.commit()

        # Capture after state
        result = {}
        for fts_table in self._FTS_TABLE_MAP:
            after = cursor.execute(f"SELECT COUNT(*) FROM {fts_table}").fetchone()[0]
            result[fts_table] = {"before": before[fts_table], "after": after}

        logger.info(
            "Repaired FTS indexes: %s",
            ", ".join(f"{t}={r['after']}" for t, r in result.items()),
        )
        return result

    except sqlite3.OperationalError as e:
        if "no such module: fts5" in str(e):
            logger.debug("repair_fts skipped — FTS5 not available")
            return {}
        raise
    finally:
        # Always restore triggers — even if repair raised
        self.create_fts_triggers()