footprinter-cli 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +431 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/bundled/samples/hidden-client-file-sample.txt +2 -0
- footprinter/bundled/samples/opaque-project-file-sample.txt +2 -0
- footprinter/bundled/samples/visible-file-sample.txt +2 -0
- footprinter/cli/__init__.py +135 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +327 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/_sample_seed.py +204 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +543 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +2001 -0
- footprinter/cli/status.py +747 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +602 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +724 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +487 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +315 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +223 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +112 -0
- footprinter/ingest/pipe_runner.py +200 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +186 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +67 -0
- footprinter/mcp/errors.py +105 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +19 -0
- footprinter/paths.py +117 -0
- footprinter/permissions.py +1152 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1264 -0
- footprinter_cli-1.0.0rc1.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0rc1.dist-info/METADATA +223 -0
- footprinter_cli-1.0.0rc1.dist-info/RECORD +138 -0
- footprinter_cli-1.0.0rc1.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0rc1.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0rc1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""Pipe runner — pipe dispatch, iteration, timing, error aggregation.
|
|
2
|
+
|
|
3
|
+
Runs pipes in order, delegates data-source pipes to adapters and
|
|
4
|
+
processing pipes to ProcessingPipeline. Handles timing, error
|
|
5
|
+
classification, and fatal-error halting.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import sqlite3
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from typing import TYPE_CHECKING, Callable, Dict, List, Optional
|
|
14
|
+
|
|
15
|
+
from footprinter.ingest.adapters.protocol import PipeContext
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from footprinter.ingest.processing import ProcessingPipeline
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PipeRunner:
    """Run ingest pipes in order with timing and error classification.

    The adapter registry, pipeline definitions and the list of valid pipe
    names are handed in by the orchestrator (composition root); this class
    does not import pipe definitions itself.

    Attributes:
        processing: Object consulted for processing pipes through
            ``is_processing_pipe`` and ``run_phase``.
        config: Configuration mapping forwarded to adapters as
            ``source_config``.
        config_path: Configuration file path forwarded to adapters.
        adapter_registry: Pipe name -> adapter class for data-source pipes.
        pipelines: Pipeline name -> ordered pipe names (stored only).
        all_pipes: Every pipe name accepted by ``run_pipes``.
        full_mode: Forwarded to adapters through ``PipeContext``;
            initialised to ``False``.
    """

    def __init__(
        self,
        processing: ProcessingPipeline,
        get_db: Callable,
        config: Dict,
        config_path: str,
        adapter_registry: Dict[str, type],
        pipelines: Dict[str, List[str]],
        all_pipes: List[str],
        connector_pipe_map: Optional[Dict[str, str]] = None,
    ):
        self.processing = processing
        self._get_db = get_db
        self.config = config
        self.config_path = config_path
        self.adapter_registry = adapter_registry
        self.pipelines = pipelines
        self.all_pipes = all_pipes
        # Pipe name -> owning connector name; used to build install hints.
        self._connector_pipe_map = connector_pipe_map or {}
        self.full_mode = False

    def run_pipe(
        self,
        pipe: str,
        on_progress: Optional[Callable] = None,
        last_run: Optional[datetime] = None,
    ) -> Dict:
        """Run a single pipe and return its result as a dict.

        Dispatch order: a registered adapter class, then a processing
        phase, then the connector map (pipe known but connector not
        installed -> ``skipped`` with an install hint), otherwise an
        ``error`` result for an unknown pipe.

        Exceptions never propagate; they are classified instead:
        ``ImportError`` -> skipped / ``missing_dependency``,
        ``sqlite3.OperationalError`` -> error / ``database``,
        ``FileNotFoundError`` -> error / ``config``,
        anything else -> error / ``runtime``.

        Args:
            pipe: Name of the pipe to run.
            on_progress: Optional callback forwarded to the adapter.
            last_run: Optional timestamp forwarded to the adapter.

        Returns:
            Dict that always carries ``stage``, ``status`` and
            ``elapsed_seconds`` (rounded to one decimal).
        """
        # Function-scope import: elapsed time must come from a monotonic
        # clock so DST shifts or wall-clock adjustments cannot skew it.
        import time

        logger.info("Running pipe: %s", pipe)
        started = time.monotonic()

        result = {"stage": pipe, "status": "unknown"}

        try:
            adapter_cls = self.adapter_registry.get(pipe)
            if adapter_cls is not None:
                adapter = adapter_cls()
                db = self._get_db()
                ctx = PipeContext(
                    source_config=self.config,
                    config_path=self.config_path,
                    full_mode=self.full_mode,
                    last_run=last_run,
                    on_progress=on_progress,
                )
                pipe_result = adapter.run(db, ctx)
                pipe_result.elapsed_seconds = round(time.monotonic() - started, 1)
                result = pipe_result.to_dict()
            elif self.processing.is_processing_pipe(pipe):
                db = self._get_db()
                pipe_result = self.processing.run_phase(pipe, db)
                pipe_result.elapsed_seconds = round(time.monotonic() - started, 1)
                result = pipe_result.to_dict()
            else:
                # Check if this pipe belongs to an uninstalled connector
                connector_name = self._find_connector_for_pipe(pipe)
                if connector_name:
                    result = {
                        "stage": pipe,
                        "status": "skipped",
                        "reason": "not installed",
                        "hint": f"run: fp connect install {connector_name}",
                    }
                else:
                    logger.error("Unknown pipe: %s", pipe)
                    result = {
                        "stage": pipe,
                        "status": "error",
                        "error": f"Unknown pipe: {pipe}",
                    }

            # Normalise: the stage is always the requested pipe, and a result
            # dict without a status counts as completed.
            result["stage"] = pipe
            result["status"] = result.get("status", "completed")

        except ImportError as e:
            logger.warning("Pipe %s skipped — missing dependency: %s", pipe, e)
            result = {
                "stage": pipe,
                "status": "skipped",
                "reason": f"Not installed: {e}",
                "error_type": "missing_dependency",
            }
        except sqlite3.OperationalError as e:
            logger.error("Database error in pipe %s: %s", pipe, e)
            result = {
                "stage": pipe,
                "status": "error",
                "error": str(e),
                "error_type": "database",
            }
        except FileNotFoundError as e:
            logger.error("Config/file error in pipe %s: %s", pipe, e)
            result = {
                "stage": pipe,
                "status": "error",
                "error": str(e),
                "error_type": "config",
            }
        # Intentional broad catch: last-resort after specific
        # ImportError, OperationalError, FileNotFoundError handlers
        except Exception as e:
            # Unexpected failure: keep the traceback so it can be diagnosed.
            logger.error("Error in pipe %s: %s", pipe, e, exc_info=True)
            result = {
                "stage": pipe,
                "status": "error",
                "error": str(e),
                "error_type": "runtime",
            }

        elapsed = time.monotonic() - started
        result["elapsed_seconds"] = round(elapsed, 1)
        logger.info("Pipe %s completed in %.1fs", pipe, elapsed)

        return result

    def run_pipes(
        self,
        pipes: List[str],
        on_pipe_start: Optional[Callable] = None,
        on_pipe_end: Optional[Callable] = None,
        on_progress: Optional[Callable] = None,
        pipe_hook: Optional[Callable] = None,
        last_run: Optional[datetime] = None,
    ) -> List[Dict]:
        """Run several pipes in the given order.

        Args:
            pipes: Pipe names to run; each must be in ``self.all_pipes``.
            on_pipe_start: Optional callback invoked as ``cb(pipe)`` before
                each pipe.
            on_pipe_end: Optional callback invoked as ``cb(pipe, result)``
                after each pipe.
            on_progress: Optional progress callback forwarded to the pipe.
            pipe_hook: Optional replacement for ``run_pipe``; called as
                ``pipe_hook(pipe, on_progress=on_progress)`` (``last_run``
                is not passed to the hook).
            last_run: Optional timestamp forwarded to ``run_pipe``.

        Returns:
            Result dicts in execution order. The list ends early after a
            result with status ``error`` and error_type ``database`` or
            ``config``; other errors do not stop the run.

        Raises:
            ValueError: If any requested pipe name is not in ``all_pipes``.
        """
        unknown = [s for s in pipes if s not in self.all_pipes]
        if unknown:
            raise ValueError(f"Unknown pipe(s): {', '.join(unknown)}. Valid pipes: {', '.join(self.all_pipes)}")

        results = []

        for pipe in pipes:
            if on_pipe_start:
                on_pipe_start(pipe)

            if pipe_hook:
                result = pipe_hook(pipe, on_progress=on_progress)
            else:
                result = self.run_pipe(pipe, on_progress=on_progress, last_run=last_run)
            results.append(result)

            if on_pipe_end:
                on_pipe_end(pipe, result)

            # Stop pipeline on fatal errors (database/config); runtime errors continue
            if result.get("status") == "error":
                if result.get("error_type") in ("database", "config"):
                    logger.error("Fatal error in %s: %s", pipe, result.get("error", "unknown"))
                    break

        return results

    def _find_connector_for_pipe(self, pipe: str) -> Optional[str]:
        """Return the connector name that owns *pipe*, or None if unmapped."""
        return self._connector_pipe_map.get(pipe)
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Processing module — access resolution and pipeline framework.
|
|
3
|
+
|
|
4
|
+
Primary role: ``run_access_resolution`` stamps visibility and permissions
|
|
5
|
+
on ingested entities, with last-run-based incremental processing.
|
|
6
|
+
Also provides the ``ProcessingPipeline`` framework for phase registration
|
|
7
|
+
and dispatch.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import sqlite3
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import TYPE_CHECKING, Callable, Dict, List, Optional
|
|
16
|
+
|
|
17
|
+
from footprinter.ingest.adapters.protocol import ErrorType, PipeResult
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from footprinter.ingest.database import Database
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Last-run helpers (backed by ingests table)
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _read_last_run(conn: sqlite3.Connection, pipe: str) -> Optional[str]:
|
|
31
|
+
"""Read the last-completed timestamp for a pipe."""
|
|
32
|
+
row = conn.execute(
|
|
33
|
+
"SELECT completed_at FROM ingests WHERE pipe = ? AND status = 'completed' ORDER BY completed_at DESC LIMIT 1",
|
|
34
|
+
(pipe,),
|
|
35
|
+
).fetchone()
|
|
36
|
+
if row is None:
|
|
37
|
+
return None
|
|
38
|
+
return row[0] if isinstance(row, tuple) else row["completed_at"]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Access resolution runner
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def run_access_resolution(db: "Database", full_mode: bool = False) -> PipeResult:
    """Stamp visibility and permissions on ingested entities.

    A full recalculation runs when ``full_mode`` is set or when no
    completed ``access_resolution`` run is recorded. Otherwise only rows
    whose ``indexed_at`` is later than the last completed run are
    collected (per entity type) and stamped.

    Args:
        db: Database instance; only ``db.conn`` is used.
        full_mode: Force a full recalculation.

    Returns:
        A completed PipeResult built from the stats mapping, or an error
        PipeResult (``ErrorType.RUNTIME``) if resolution raised.
    """
    from footprinter.access import ENTITY_META, recalculate_access, stamp_entities

    conn = db.conn
    last_run = _read_last_run(conn, "access_resolution")
    needs_full_pass = full_mode or last_run is None

    try:
        if needs_full_pass:
            stats = recalculate_access(conn, "global")
        else:
            changed_ids: Dict[str, list] = {}
            for entity_type, meta in ENTITY_META.items():
                table = meta["table"]

                # Probe for the column: tables without indexed_at
                # (folders, projects, clients) are skipped.
                try:
                    conn.execute(f"SELECT indexed_at FROM {table} LIMIT 0")
                except sqlite3.OperationalError:
                    continue

                clauses = ["indexed_at > ?"]
                if meta["has_status"]:
                    clauses.append("status != 'removed'")
                where = " AND ".join(clauses)

                ids = []
                for row in conn.execute(f"SELECT id FROM {table} WHERE {where}", (last_run,)).fetchall():
                    ids.append(row["id"] if isinstance(row, sqlite3.Row) else row[0])

                if ids:
                    changed_ids[entity_type] = ids

            stats = stamp_entities(conn, changed_ids)
    except Exception as e:  # Intentional broad catch: last-resort for access resolution; pipeline must continue
        logger.error("Access resolution error: %s", e, exc_info=True)
        return PipeResult.make_error("access_resolution", str(e), ErrorType.RUNTIME)

    # Built outside the try so a failure here is not reported as a
    # resolution error.
    return PipeResult.completed("access_resolution", **stats)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Pipe-name -> phase-name overrides. Left empty: lookups fall back to the
# pipe name itself, so every pipe maps to the phase of the same name.
|
|
97
|
+
PIPE_TO_PHASE: Dict[str, str] = {}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class PhaseSpec:
    """Describe one registered processing phase.

    Field order is part of the positional constructor signature.
    """

    # Name the phase is registered and looked up under.
    name: str
    # Optional predicate called with the database; a truthy result skips
    # the phase.
    skip_guard: Optional[Callable[["Database"], bool]] = None
    # Optional callable that performs the phase and returns its PipeResult.
    runner: Optional[Callable[["Database"], PipeResult]] = None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class ProcessingPipeline:
    """Registry and dispatcher for processing phases.

    Each phase is stored with an optional runner callable and an optional
    skip guard. Phase names are reported in registration order.
    """

    def __init__(self) -> None:
        # Insertion-ordered: phase name -> spec.
        self._phases: Dict[str, PhaseSpec] = {}

    def register(
        self,
        name: str,
        runner: Optional[Callable[["Database"], PipeResult]] = None,
        skip_guard: Optional[Callable[["Database"], bool]] = None,
    ) -> None:
        """Store a phase under *name*, replacing any earlier entry."""
        spec = PhaseSpec(name=name, skip_guard=skip_guard, runner=runner)
        self._phases[name] = spec

    def is_processing_pipe(self, pipe_name: str) -> bool:
        """Tell whether *pipe_name* resolves to a registered phase."""
        return PIPE_TO_PHASE.get(pipe_name, pipe_name) in self._phases

    @property
    def phase_names(self) -> List[str]:
        """Registered phase names, in registration order."""
        return [*self._phases]

    def run_phase(self, pipe_name: str, db: "Database") -> PipeResult:
        """Run the phase that *pipe_name* resolves to.

        Returns an error result when the phase is unknown or has no
        runner, a skipped result when the skip guard returns a truthy
        value, and otherwise whatever the runner returns. A skip guard
        that raises is logged and treated as "do not skip".
        """
        phase_name = PIPE_TO_PHASE.get(pipe_name, pipe_name)
        spec = self._phases.get(phase_name)

        if spec is None:
            return PipeResult.make_error(pipe_name, f"Unknown processing phase: {phase_name}")
        if spec.runner is None:
            return PipeResult.make_error(pipe_name, f"No runner registered for phase: {phase_name}")

        if spec.skip_guard is not None:
            try:
                if spec.skip_guard(db):
                    return PipeResult.skipped(pipe_name, f"Skip guard triggered for {phase_name}")
            except Exception as e:
                # Best effort: a broken guard must not block the phase.
                logger.warning(f"Skip guard for {phase_name} raised {type(e).__name__}: {e}; proceeding")

        return spec.runner(db)
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""Pipe registry — the "phone book" for pipes.
|
|
2
|
+
|
|
3
|
+
Knows what pipes exist, which adapter classes implement the core
|
|
4
|
+
data-source pipes, and provides functions to compute pipeline and
|
|
5
|
+
refresh pipe definitions dynamically. Does NOT run anything — that's
|
|
6
|
+
the orchestrator's job.
|
|
7
|
+
|
|
8
|
+
v1.0 pipe set
|
|
9
|
+
--------------
|
|
10
|
+
Core (always available): local_folders, local_files, browser, chat.
|
|
11
|
+
Connector pipes are resolved dynamically from installed ConnectorSpecs.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from footprinter.ingest.adapters import (
|
|
15
|
+
BrowserAdapter,
|
|
16
|
+
ChatAdapter,
|
|
17
|
+
LocalFilesAdapter,
|
|
18
|
+
LocalFoldersAdapter,
|
|
19
|
+
)
|
|
20
|
+
from footprinter.ingest.adapters.protocol import ErrorType, PipeResult, PipeStatus
|
|
21
|
+
|
|
22
|
+
# ── Source catalogue ────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
# Core v1.0 sources (work out of the box)
|
|
25
|
+
CORE_PIPES = [
|
|
26
|
+
"local_folders", # Scan ~/Work, ~/Personal folder structure
|
|
27
|
+
"local_files", # Index local files
|
|
28
|
+
"browser", # Browser history
|
|
29
|
+
"chat", # Claude/ChatGPT exports (status only - manual import)
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# Not valid CLI targets; excluded from get_all_pipes()
|
|
33
|
+
FUTURE_PIPES = [
|
|
34
|
+
"project_links",
|
|
35
|
+
"summaries",
|
|
36
|
+
"drive_links",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
# Post-processing pipes — appended to every pipeline, run after all data-source pipes
|
|
40
|
+
POST_PIPES = [
|
|
41
|
+
"access_resolution", # Stamp visibility + permissions on ingested entities
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
# ── Core source registry (data-source adapters only) ─────────────────
|
|
45
|
+
|
|
46
|
+
CORE_PIPE_REGISTRY = {
|
|
47
|
+
"local_folders": LocalFoldersAdapter,
|
|
48
|
+
"local_files": LocalFilesAdapter,
|
|
49
|
+
"browser": BrowserAdapter,
|
|
50
|
+
"chat": ChatAdapter,
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
# ── Dynamic resolution functions ─────────────────────────────────────
|
|
54
|
+
#
|
|
55
|
+
# These replace the former static PIPELINES, REFRESH_PIPES, and ALL_PIPES
|
|
56
|
+
# dicts. They accept connector_pipelines — a dict mapping connector names
|
|
57
|
+
# to their pipe lists (e.g., {"google": ["drive_folders", "drive_files", "gmail"]}).
|
|
58
|
+
# The orchestrator builds this from ConnectorSpec metadata and passes it in,
|
|
59
|
+
# so this module never imports from connectors/.
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_pipelines(
|
|
63
|
+
connector_pipes: dict[str, type],
|
|
64
|
+
connector_pipelines: dict[str, list[str]] | None = None,
|
|
65
|
+
) -> dict[str, list[str]]:
|
|
66
|
+
"""Compute pipeline definitions from core + installed connector pipes.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
connector_pipes: Merged adapter registry from get_connector_pipes().
|
|
70
|
+
connector_pipelines: Connector name → adapter pipe names. Built by
|
|
71
|
+
the orchestrator from ConnectorSpec.adapter_entries.
|
|
72
|
+
|
|
73
|
+
Returns pipeline name → ordered pipe list.
|
|
74
|
+
"""
|
|
75
|
+
pipelines: dict[str, list[str]] = {
|
|
76
|
+
"local": list(CORE_PIPES),
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
# Add a pipeline per connector whose pipes are in connector_pipes
|
|
80
|
+
for name, pipes in (connector_pipelines or {}).items():
|
|
81
|
+
installed = [s for s in pipes if s in connector_pipes]
|
|
82
|
+
if installed:
|
|
83
|
+
pipelines[name] = installed
|
|
84
|
+
|
|
85
|
+
# "all" = core + all installed connector data-source pipes
|
|
86
|
+
all_pipe_names = list(CORE_PIPES)
|
|
87
|
+
for name, pipes in pipelines.items():
|
|
88
|
+
if name == "local":
|
|
89
|
+
continue
|
|
90
|
+
for s in pipes:
|
|
91
|
+
if s not in all_pipe_names:
|
|
92
|
+
all_pipe_names.append(s)
|
|
93
|
+
pipelines["all"] = all_pipe_names
|
|
94
|
+
|
|
95
|
+
# Append post-processing pipes to every pipeline
|
|
96
|
+
for name in pipelines:
|
|
97
|
+
pipelines[name] = pipelines[name] + POST_PIPES
|
|
98
|
+
|
|
99
|
+
return pipelines
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def get_refresh_pipes(
|
|
103
|
+
connector_pipes: dict[str, type],
|
|
104
|
+
connector_pipelines: dict[str, list[str]] | None = None,
|
|
105
|
+
) -> dict[str, list[str]]:
|
|
106
|
+
"""Compute refresh pipe mappings from core + installed connector pipes.
|
|
107
|
+
|
|
108
|
+
Args:
|
|
109
|
+
connector_pipes: Merged adapter registry from get_connector_pipes().
|
|
110
|
+
connector_pipelines: Connector name → adapter pipe names.
|
|
111
|
+
|
|
112
|
+
Returns source name → pipe list. Each core source group gets a key,
|
|
113
|
+
each connector gets a key, and individual connector pipes get keys.
|
|
114
|
+
"""
|
|
115
|
+
refresh: dict[str, list[str]] = {
|
|
116
|
+
"local": ["local_folders", "local_files"],
|
|
117
|
+
"browser": ["browser"],
|
|
118
|
+
"chat": ["chat"],
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
# Per-connector and per-pipe entries
|
|
122
|
+
for name, pipes in (connector_pipelines or {}).items():
|
|
123
|
+
installed = [s for s in pipes if s in connector_pipes]
|
|
124
|
+
if not installed:
|
|
125
|
+
continue
|
|
126
|
+
|
|
127
|
+
# Connector-level key (e.g., "google")
|
|
128
|
+
refresh[name] = installed
|
|
129
|
+
|
|
130
|
+
# Per-pipe keys and grouped keys (e.g., "gmail", "drive")
|
|
131
|
+
drive_pipes = []
|
|
132
|
+
for pipe in installed:
|
|
133
|
+
if pipe.startswith("drive_"):
|
|
134
|
+
drive_pipes.append(pipe)
|
|
135
|
+
else:
|
|
136
|
+
# Individual pipe key (e.g., "gmail")
|
|
137
|
+
refresh[pipe] = [pipe]
|
|
138
|
+
|
|
139
|
+
if drive_pipes:
|
|
140
|
+
refresh["drive"] = drive_pipes
|
|
141
|
+
|
|
142
|
+
# "all" = everything
|
|
143
|
+
all_pipe_names = list(CORE_PIPES)
|
|
144
|
+
for name, pipes in (connector_pipelines or {}).items():
|
|
145
|
+
for s in pipes:
|
|
146
|
+
if s in connector_pipes and s not in all_pipe_names:
|
|
147
|
+
all_pipe_names.append(s)
|
|
148
|
+
refresh["all"] = all_pipe_names
|
|
149
|
+
|
|
150
|
+
# Append post-processing pipes to every refresh group
|
|
151
|
+
for name in refresh:
|
|
152
|
+
refresh[name] = refresh[name] + POST_PIPES
|
|
153
|
+
|
|
154
|
+
return refresh
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_all_pipes(connector_pipes: dict[str, type]) -> list[str]:
    """Return every valid pipe name, without duplicates.

    Order is: core pipes, then connector pipes in the iteration order of
    ``connector_pipes``, then post-processing pipes. ``FUTURE_PIPES`` are
    never included.
    """
    names = list(CORE_PIPES)
    for candidate in (*connector_pipes, *POST_PIPES):
        if candidate not in names:
            names.append(candidate)
    return names
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# ── Convenience re-exports ───────────────────────────────────────────
|
|
174
|
+
|
|
175
|
+
__all__ = [
|
|
176
|
+
"CORE_PIPES",
|
|
177
|
+
"FUTURE_PIPES",
|
|
178
|
+
"POST_PIPES",
|
|
179
|
+
"CORE_PIPE_REGISTRY",
|
|
180
|
+
"get_pipelines",
|
|
181
|
+
"get_refresh_pipes",
|
|
182
|
+
"get_all_pipes",
|
|
183
|
+
"PipeResult",
|
|
184
|
+
"PipeStatus",
|
|
185
|
+
"ErrorType",
|
|
186
|
+
]
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Pure persistence for pipeline run records.
|
|
2
|
+
|
|
3
|
+
Saves and loads a JSON record of each pipeline run. No heuristics or
|
|
4
|
+
config awareness — warning logic lives in the display layer.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from datetime import datetime, timedelta, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from footprinter.paths import get_last_run_path
|
|
13
|
+
|
|
14
|
+
SESSION_WINDOW_MINUTES = 10
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def save_run_record(
    results: List[Dict],
    mode: str,
    started_at: datetime,
    *,
    interrupted: bool = False,
    path: Optional[Path] = None,
) -> Path:
    """Write a run record to JSON, merging with a recent record.

    If the record already on disk started within SESSION_WINDOW_MINUTES
    of ``started_at``, the new stages are appended to it: its
    ``started_at`` and ``mode`` are kept, while ``completed_at``,
    ``interrupted`` and ``total_elapsed_seconds`` are refreshed.
    Otherwise the record is replaced.

    A mergeable record that is malformed (not a JSON object, or without a
    ``stages`` list) no longer aborts the save: a non-object record is
    replaced, and a missing or invalid ``stages`` entry is treated as an
    empty list.

    Args:
        results: List of per-stage result dicts from PipeRunner.
        mode: Run mode string (e.g. "incremental", "full").
        started_at: When the pipeline started.
        interrupted: Whether the run was interrupted (e.g. KeyboardInterrupt).
        path: Override output path (default: get_last_run_path()).

    Returns:
        The path the record was written to.
    """
    if path is None:
        path = get_last_run_path()

    completed_at = datetime.now(timezone.utc)

    # Merge with existing record if within session window
    existing = load_run_record(path=path)
    if isinstance(existing, dict) and existing and _within_session_window(existing, started_at):
        stages = existing.get("stages")
        if not isinstance(stages, list):
            # Hand-edited or older record: start the stage list afresh
            # instead of failing the whole save.
            stages = []
        stages.extend(results)
        existing["stages"] = stages
        existing["completed_at"] = completed_at.isoformat()
        existing["total_elapsed_seconds"] = sum(r.get("elapsed_seconds", 0) for r in stages)
        existing["interrupted"] = interrupted
        record = existing
    else:
        record = {
            "started_at": started_at.isoformat(),
            "completed_at": completed_at.isoformat(),
            "mode": mode,
            "interrupted": interrupted,
            "total_elapsed_seconds": sum(r.get("elapsed_seconds", 0) for r in results),
            "stages": results,
        }

    path.parent.mkdir(parents=True, exist_ok=True)
    # default=str: values json cannot encode natively are written as str().
    path.write_text(json.dumps(record, indent=2, default=str))
    return path
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _within_session_window(existing: Dict, new_started_at: datetime) -> bool:
|
|
71
|
+
"""Check if an existing record is within the merge window."""
|
|
72
|
+
try:
|
|
73
|
+
existing_start = datetime.fromisoformat(existing["started_at"])
|
|
74
|
+
return abs(new_started_at - existing_start) <= timedelta(minutes=SESSION_WINDOW_MINUTES)
|
|
75
|
+
except (KeyError, ValueError):
|
|
76
|
+
return False
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def load_run_record(path: Optional[Path] = None) -> Optional[Dict]:
    """Read a run record from JSON.

    Args:
        path: File to read (default: get_last_run_path()).

    Returns:
        The parsed record dict, or None if the file doesn't exist, cannot
        be decoded as JSON (for example a truncated write), or does not
        contain a JSON object.
    """
    if path is None:
        path = get_last_run_path()

    # Read directly instead of checking exists() first, so a file removed
    # between the check and the read cannot raise.
    try:
        record = json.loads(path.read_text())
    except FileNotFoundError:
        return None
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; an unreadable record is treated as absent so the
        # next save can replace it.
        return None

    return record if isinstance(record, dict) else None
|