footprinter-cli 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +431 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/bundled/samples/hidden-client-file-sample.txt +2 -0
- footprinter/bundled/samples/opaque-project-file-sample.txt +2 -0
- footprinter/bundled/samples/visible-file-sample.txt +2 -0
- footprinter/cli/__init__.py +135 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +327 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/_sample_seed.py +204 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +543 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +2001 -0
- footprinter/cli/status.py +747 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +602 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +724 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +487 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +315 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +223 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +112 -0
- footprinter/ingest/pipe_runner.py +200 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +186 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +67 -0
- footprinter/mcp/errors.py +105 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +19 -0
- footprinter/paths.py +117 -0
- footprinter/permissions.py +1152 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1264 -0
- footprinter_cli-1.0.0rc1.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0rc1.dist-info/METADATA +223 -0
- footprinter_cli-1.0.0rc1.dist-info/RECORD +138 -0
- footprinter_cli-1.0.0rc1.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0rc1.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0rc1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Content extraction from various file types.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
# Module-level logger used for extraction diagnostics.
logger = logging.getLogger(__name__)
# Raise the threshold of the "pypdf" logger so only ERROR and above get through.
logging.getLogger("pypdf").setLevel(logging.ERROR)

# Set to True once the "pypdf not installed" warning has been logged, so the
# warning is emitted only the first time the import fails.
_pypdf_warned = False
# Same one-time guard for the "python-docx not installed" warning.
_docx_warned = False
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ContentExtractor:
    """Extract a short text preview from files of various types.

    Plain-text formats are read directly; PDF and DOCX files are read through
    the optional ``pypdf`` and ``python-docx`` packages, which are imported
    lazily so the extractor still works when they are not installed.
    """

    # Suffixes (lower-case, with leading dot) that are read as plain text.
    _TEXT_SUFFIXES = (".txt", ".md", ".py", ".js", ".json", ".yaml", ".yml")
    # Upper bound on how many leading PDF pages are scanned for a preview.
    _MAX_PDF_PAGES = 3

    def __init__(self, max_preview_length: int = 1000):
        """Store the maximum number of characters a preview may contain."""
        self.max_preview_length = max_preview_length

    def extract(self, file_path: Path) -> Optional[str]:
        """Return a preview of the file's content, chosen by file suffix.

        Args:
            file_path: Path of the file to read; the suffix is matched
                case-insensitively.

        Returns:
            At most ``max_preview_length`` characters of text, an empty
            string when the type-specific reader failed, or ``None`` when
            there is no extractor for the suffix, the optional library is
            missing, or an unexpected error occurred.
        """
        try:
            file_type = file_path.suffix.lower()

            if file_type in self._TEXT_SUFFIXES:
                return self._extract_text(file_path)
            if file_type == ".pdf":
                return self._extract_pdf(file_path)
            if file_type == ".docx":
                return self._extract_docx(file_path)

            logger.debug("No extractor for %s", file_type)
            return None

        except (
            Exception
        ) as e:  # Intentional broad catch: extraction is inherently brittle (encoding, corrupt files, library bugs)
            logger.error("Error extracting content from %s: %s", file_path, e)
            return None

    def _build_preview(self, pieces) -> str:
        """Join newline-terminated pieces until the preview length is reached.

        ``pieces`` is consumed lazily and iteration stops as soon as enough
        characters have been collected, so callers may pass a generator that
        performs expensive per-item work.
        """
        collected = []
        size = 0
        for piece in pieces:
            collected.append(piece + "\n")
            size += len(piece) + 1
            if size >= self.max_preview_length:
                break
        return "".join(collected)[: self.max_preview_length]

    def _extract_text(self, file_path: Path) -> str:
        """Read a preview from a plain-text file; return "" on any error."""
        try:
            # "utf-8-sig" decodes exactly like UTF-8 but drops a leading byte
            # order mark, so the preview does not start with "\ufeff".
            with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
                content = f.read(self.max_preview_length * 2)  # Read a bit more
            return content[: self.max_preview_length]
        except (
            Exception
        ) as e:  # Intentional broad catch: extraction is inherently brittle (encoding, corrupt files, library bugs)
            logger.error("Error reading text file %s: %s", file_path, e)
            return ""

    def _extract_pdf(self, file_path: Path) -> Optional[str]:
        """Read a preview from the first pages of a PDF.

        Returns ``None`` when ``pypdf`` is not installed (a warning is logged
        once per process) and "" when reading the file fails.
        """
        global _pypdf_warned
        try:
            import pypdf

            with open(file_path, "rb") as f:
                reader = pypdf.PdfReader(f)
                page_count = min(self._MAX_PDF_PAGES, len(reader.pages))
                # Evaluated lazily inside the ``with`` block: pages past the
                # preview length are never parsed.  ``or ""`` treats a page
                # that yields no text as empty instead of failing the file.
                page_texts = (
                    reader.pages[page_num].extract_text() or ""
                    for page_num in range(page_count)
                )
                return self._build_preview(page_texts)

        except ImportError:
            if not _pypdf_warned:
                logger.warning("pypdf not installed, skipping PDF extraction")
                _pypdf_warned = True
            return None
        except (
            Exception
        ) as e:  # Intentional broad catch: extraction is inherently brittle (encoding, corrupt files, library bugs)
            logger.error("Error reading PDF %s: %s", file_path, e)
            return ""

    def _extract_docx(self, file_path: Path) -> Optional[str]:
        """Read a preview from the leading paragraphs of a DOCX file.

        Returns ``None`` when ``python-docx`` is not installed (a warning is
        logged once per process) and "" when reading the file fails.
        """
        global _docx_warned
        try:
            import docx

            doc = docx.Document(file_path)
            return self._build_preview(para.text for para in doc.paragraphs)

        except ImportError:
            if not _docx_warned:
                logger.warning("python-docx not installed, skipping DOCX extraction")
                _docx_warned = True
            return None
        except (
            Exception
        ) as e:  # Intentional broad catch: extraction is inherently brittle (encoding, corrupt files, library bugs)
            logger.error("Error reading DOCX %s: %s", file_path, e)
            return ""
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""SQLite database schema and operations for Footprinter."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from footprinter.ingest.db.connector_schema import init_connector_schemas
|
|
7
|
+
from footprinter.ingest.db.schema import SchemaMixin
|
|
8
|
+
from footprinter.paths import get_db_path
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Database(SchemaMixin):
    """SQLite database connection and schema manager for Footprinter.

    Usable as a context manager: leaving the ``with`` block closes the
    connection.
    """

    def __init__(self, db_path: str = None, connector_specs: list = None):
        """Open the database and bring its schema up to date.

        Args:
            db_path: Location of the SQLite file. When ``None`` the path
                returned by ``get_db_path()`` is used. The parent directory
                is created if it does not exist.
            connector_specs: Connector specs whose schema extensions should
                be applied after the base schema; ``None`` means none.

        Raises:
            Any exception raised while initializing the schema; the
            connection is closed before the exception propagates.
        """
        if db_path is None:
            db_path = str(get_db_path())
        self.db_path = db_path
        Path(db_path).parent.mkdir(parents=True, exist_ok=True)
        self.conn = None
        try:
            # init_db() is not defined in this class (presumably provided by
            # SchemaMixin); it is expected to leave an open connection in
            # self.conn, which is handed to the connector schema step below.
            self.init_db()
            init_connector_schemas(self.conn, connector_specs or [])
        except BaseException:
            # The caller never receives this object when __init__ fails, so
            # nobody else could close the connection: release it here.
            self.close()
            raise

    def close(self):
        """Close database connection."""
        if self.conn:
            self.conn.close()

    def __enter__(self):
        """Enter context manager, returning self."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit context manager, closing the database connection."""
        self.close()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Database sub-package."""
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Connector-scope schema extensions.
|
|
2
|
+
|
|
3
|
+
Connectors declare extra columns via ConnectorSpec.schema_extensions.
|
|
4
|
+
This module applies those declarations using idempotent ALTER TABLE,
|
|
5
|
+
mirroring the pattern in app_schema.py.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import sqlite3
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def init_connector_schemas(conn: sqlite3.Connection, connector_specs: list) -> None:
    """Apply the schema extensions declared by each connector spec.

    Specs whose ``schema_extensions`` attribute is falsy are skipped; every
    other spec has its extensions forwarded to register_connector_schema(),
    which adds the columns via ALTER TABLE. Filtering the list down to
    installed connectors is the caller's job.
    """
    with_extensions = (spec for spec in connector_specs if spec.schema_extensions)
    for spec in with_extensions:
        register_connector_schema(conn, spec.schema_extensions)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def register_connector_schema(
    conn: sqlite3.Connection,
    extensions: dict[str, list[tuple[str, str]]],
) -> None:
    """Add connector-declared columns to existing tables.

    Each column is added with its own ``ALTER TABLE ... ADD COLUMN``. A
    "duplicate column" error means the column is already present and is
    ignored, which makes repeated calls harmless; any other
    ``sqlite3.OperationalError`` propagates and the final commit is skipped.

    Args:
        conn: An open sqlite3 connection with base schema already initialized.
        extensions: Mapping of table_name → [(col_name, col_definition), ...].
            Example: {"folders": [("web_link", "TEXT")]}
    """
    cur = conn.cursor()
    # Built lazily so statements are produced and executed in the same
    # table-by-table, column-by-column order as the mapping declares them.
    statements = (
        f"ALTER TABLE {table_name} ADD COLUMN {column} {definition}"
        for table_name, column_specs in extensions.items()
        for column, definition in column_specs
    )
    for ddl in statements:
        try:
            cur.execute(ddl)
        except sqlite3.OperationalError as err:
            already_present = "duplicate column" in str(err).lower()
            if not already_present:
                raise
    conn.commit()
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""Database schema migration for pre-existing Footprinter databases.
|
|
2
|
+
|
|
3
|
+
Extracted for separation of concerns. Contains all
|
|
4
|
+
ALTER TABLE, RENAME, DROP, and data-migration logic needed to upgrade
|
|
5
|
+
databases created before the current DDL.
|
|
6
|
+
|
|
7
|
+
Only runs on databases that already have tables — fresh installs skip
|
|
8
|
+
this entirely (init_db handles everything via CREATE TABLE IF NOT EXISTS).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import sqlite3
|
|
13
|
+
|
|
14
|
+
from footprinter.ingest.db.schema import _INGESTS_DDL, ACCESS_CONTROL_TABLES
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
# Maps each entity table to the SQL expression used to backfill its
# display_name column. The value is interpolated verbatim into
# "UPDATE <table> SET display_name = <expr> WHERE display_name IS NULL",
# so it may be a plain column name or an expression such as SUBSTR(...).
_DISPLAY_NAME_BACKFILL = {
    "files": "name",
    "folders": "name",
    "visits": "title",
    "projects": "project_name",
    "chats": "title",
    "messages": "SUBSTR(content, 1, 100)",
    "emails": "subject",
    "clients": "name",
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _try_execute(cursor: sqlite3.Cursor, sql: str) -> None:
    """Run one best-effort migration statement.

    ``sqlite3.OperationalError`` is how SQLite reports that the target table
    or column is missing, or that the change was already applied. Such a step
    is skipped, but the reason is logged at debug level so that unexpected
    failures can still be diagnosed.
    """
    try:
        cursor.execute(sql)
    except sqlite3.OperationalError as exc:
        logger.debug("Skipped migration step %r: %s", sql, exc)


def migrate_schema(cursor: sqlite3.Cursor) -> None:
    """Upgrade a pre-existing database to the current schema.

    Adds missing columns, renames legacy columns, drops stale artefacts,
    and migrates data where needed. Steps that target tables which don't
    exist yet, or columns that already exist, are skipped (and logged at
    debug level). The order of the steps is significant.

    Must run BEFORE ``PRAGMA foreign_keys=ON`` — the browser_visits →
    visits rename triggers SQLite's schema rewriter which recompiles FK
    references and fails on stale compiled references with FK enforcement.

    Args:
        cursor: Cursor on the database to migrate. Nothing is committed here.
    """
    mcp_columns = (
        ("mcp_read", "TEXT DEFAULT 'inherit'"),
        ("mcp_view", "TEXT DEFAULT 'inherit'"),
    )

    # ── mcp_read / mcp_view on all entity tables ──
    for table in ACCESS_CONTROL_TABLES:
        for col, col_def in mcp_columns:
            _try_execute(cursor, f"ALTER TABLE {table} ADD COLUMN {col} {col_def}")

    # Drop stale artefacts from chat_conversations → chats rename.
    for name in (
        "chat_conversations_ai",
        "chat_conversations_ad",
        "chat_conversations_au",
        "chats_ai",
        "chats_ad",
        "chats_au",
    ):
        cursor.execute(f"DROP TRIGGER IF EXISTS {name}")
    cursor.execute("DROP TABLE IF EXISTS chat_conversations_fts")

    # If browser_visits still exists, the RENAME TO visits below will
    # trigger SQLite's schema rewriter. With foreign_keys ON, the
    # rewriter recompiles FK references and hits the stale compiled
    # chat_conversations FK. Dropping chats_fts BEFORE the rename
    # prevents corruption. Skip on fresh/already-migrated DBs.
    cursor.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name='browser_visits'")
    if cursor.fetchone() is not None:
        for name in ("chats_fts_ai", "chats_fts_ad", "chats_fts_au"):
            cursor.execute(f"DROP TRIGGER IF EXISTS {name}")
        cursor.execute("DROP TABLE IF EXISTS chats_fts")

    # Column renames, applied in this order: (old, new, table).
    for old, new, table in [
        # indexed_drive_* → remote_*
        ("indexed_drive_id", "remote_file_id", "files"),
        ("indexed_drive_folder_id", "remote_folder_id", "folders"),
        # Standardize column naming conventions
        ("last_scanned_at", "scanned_at", "folders"),
        ("info_vectorized_at", "metadata_vectorized_at", "chats"),
        ("direct_in_drive", "remote_file_count", "folders"),
        ("total_in_drive", "remote_file_count_recursive", "folders"),
        ("last_drive_check", "remote_checked_at", "folders"),
        # artifact_count → file_count (missed in artifacts → files rename)
        ("direct_artifact_count", "direct_file_count", "folders"),
        ("total_artifact_count", "total_file_count", "folders"),
        # files.content_hash → sha256_hash
        ("content_hash", "sha256_hash", "files"),
    ]:
        _try_execute(cursor, f"ALTER TABLE {table} RENAME COLUMN {old} TO {new}")

    # Data migration: Drive files stored MD5 in content_hash — move to md5_hash
    # NOTE(review): this UPDATE runs on every migration pass; it assumes a
    # non-local row never holds a genuine SHA-256 in sha256_hash while
    # md5_hash is NULL — confirm against the ingest code.
    _try_execute(
        cursor,
        "UPDATE files SET md5_hash = sha256_hash, sha256_hash = NULL "
        "WHERE source != 'local' AND sha256_hash IS NOT NULL AND md5_hash IS NULL",
    )

    # Column drops: (column, table).
    for col, table in [
        # Duplicate of total_size_bytes, which is canonical.
        ("total_size", "folders"),
        # Dead columns: written but never read. The old name
        # (counts_updated_at) is included for DBs that were never
        # migrated through the rename step.
        ("stats_updated_at", "folders"),
        ("counts_updated_at", "folders"),
        ("summarized_at", "emails"),
    ]:
        _try_execute(cursor, f"ALTER TABLE {table} DROP COLUMN {col}")

    # Drop orphan tables from old schema.
    for table in ("artifact_sync_state", "file_ai_analysis", "permission_defaults", "visibility_defaults"):
        cursor.execute(f"DROP TABLE IF EXISTS {table}")

    # Retire dead tracking tables.
    # Migrate the live watermark row before dropping the table.
    # Create ingests early if needed — the main DDL is idempotent.
    cursor.execute(_INGESTS_DDL)
    try:
        cursor.execute("SELECT stage, last_completed_at FROM pipeline_watermarks")
        for row in cursor.fetchall():
            # Rows are plain tuples unless the connection uses a mapping-style
            # row factory, in which case they are read by column name.
            stage = row[0] if isinstance(row, tuple) else row["stage"]
            ts = row[1] if isinstance(row, tuple) else row["last_completed_at"]
            if ts:
                # NOT EXISTS keeps the copy idempotent across repeated runs.
                cursor.execute(
                    "INSERT INTO ingests (pipe, started_at, completed_at, status) "
                    "SELECT ?, ?, ?, 'completed' "
                    "WHERE NOT EXISTS ("
                    "  SELECT 1 FROM ingests WHERE pipe = ? AND completed_at = ?"
                    ")",
                    (stage, ts, ts, stage, ts),
                )
    except sqlite3.OperationalError as exc:
        # table doesn't exist on fresh install
        logger.debug("Skipped pipeline_watermarks migration: %s", exc)
    cursor.execute("DROP TABLE IF EXISTS pipeline_watermarks")
    cursor.execute("DROP TABLE IF EXISTS runs")

    # Column additions made before the browser_visits → visits rename:
    # (table, column, definition).
    for table, col, col_def in [
        # browser_visits columns added with status/client/project support
        ("browser_visits", "status", "TEXT DEFAULT 'active'"),
        ("browser_visits", "client_id", "INTEGER"),
        ("browser_visits", "project_id", "INTEGER"),
        ("emails", "status", "TEXT DEFAULT 'active'"),
        ("files", "client_id", "INTEGER"),
    ]:
        _try_execute(cursor, f"ALTER TABLE {table} ADD COLUMN {col} {col_def}")

    # Rename browser_visits → visits
    # Add mcp columns to old table first — the ACCESS_CONTROL_TABLES loop
    # above targets "visits" which doesn't exist yet on legacy DBs.
    for col, col_def in mcp_columns:
        _try_execute(cursor, f"ALTER TABLE browser_visits ADD COLUMN {col} {col_def}")
    try:
        # One try block on purpose: a failure while dropping an index also
        # skips the remaining drops and the rename.
        for idx in [
            "idx_browser_time",
            "idx_browser_browser",
            "idx_browser_visits_project",
            "idx_browser_unique",
            "idx_browser_visits_client",
            "idx_browser_visits_status",
            "idx_browser_visits_visibility",
        ]:
            cursor.execute(f"DROP INDEX IF EXISTS {idx}")
        cursor.execute("ALTER TABLE browser_visits RENAME TO visits")
    except sqlite3.OperationalError as exc:
        # already renamed or fresh install
        logger.debug("Skipped browser_visits → visits rename: %s", exc)

    # clients/projects: add status_reason column
    for table in ("clients", "projects"):
        _try_execute(cursor, f"ALTER TABLE {table} ADD COLUMN status_reason TEXT")

    # ── standard entity column set ──

    # folders: add status, client_id, indexed_at
    for col, col_def in [
        ("status", "TEXT DEFAULT 'active'"),
        ("client_id", "INTEGER"),
        ("indexed_at", "DATETIME"),
    ]:
        _try_execute(cursor, f"ALTER TABLE folders ADD COLUMN {col} {col_def}")

    # messages: add status
    _try_execute(cursor, "ALTER TABLE messages ADD COLUMN status TEXT DEFAULT 'active'")

    # created_at on emails and visits / browser_visits.
    # Note: ALTER TABLE cannot use CURRENT_TIMESTAMP as default
    # (non-constant), so the columns are added without default. The
    # CREATE TABLE DDL has the default for fresh DBs.
    _try_execute(cursor, "ALTER TABLE emails ADD COLUMN created_at DATETIME")
    for table in ("visits", "browser_visits"):
        _try_execute(cursor, f"ALTER TABLE {table} ADD COLUMN created_at DATETIME")

    # display_name on all entity tables
    for table in ACCESS_CONTROL_TABLES:
        _try_execute(cursor, f"ALTER TABLE {table} ADD COLUMN display_name TEXT")

    # Backfill display_name from source columns for existing rows
    for table, source_col in _DISPLAY_NAME_BACKFILL.items():
        _try_execute(cursor, f"UPDATE {table} SET display_name = {source_col} WHERE display_name IS NULL")

    # ── Timestamp column standardization ──

    # Rename chats.updated_at → modified_at (origin timestamp). This must
    # precede the ADD COLUMN updated_at below, which re-creates the name as
    # an audit column.
    _try_execute(cursor, "ALTER TABLE chats RENAME COLUMN updated_at TO modified_at")

    # Add updated_at audit column to all 6 entity tables
    for table in ("files", "folders", "visits", "chats", "messages", "emails"):
        _try_execute(cursor, f"ALTER TABLE {table} ADD COLUMN updated_at DATETIME")

    # Add indexed_at to messages (was missing)
    _try_execute(cursor, "ALTER TABLE messages ADD COLUMN indexed_at DATETIME")

    # Add vectorized_chunks to messages (matches files pattern)
    _try_execute(cursor, "ALTER TABLE messages ADD COLUMN vectorized_chunks INTEGER DEFAULT 0")

    # Backfill new audit columns from existing data
    for sql in (
        "UPDATE files SET updated_at = indexed_at WHERE updated_at IS NULL",
        "UPDATE folders SET updated_at = indexed_at WHERE updated_at IS NULL",
        "UPDATE visits SET updated_at = indexed_at WHERE updated_at IS NULL",
        "UPDATE chats SET updated_at = indexed_at WHERE updated_at IS NULL",
        "UPDATE emails SET updated_at = indexed_at WHERE updated_at IS NULL",
        "UPDATE messages SET indexed_at = created_at WHERE indexed_at IS NULL",
        "UPDATE messages SET updated_at = created_at WHERE updated_at IS NULL",
    ):
        _try_execute(cursor, sql)
|