footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +444 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/cli/__init__.py +128 -0
  19. footprinter/cli/__main__.py +6 -0
  20. footprinter/cli/_common.py +332 -0
  21. footprinter/cli/_policy_helpers.py +646 -0
  22. footprinter/cli/_prompt.py +220 -0
  23. footprinter/cli/api_cmd.py +32 -0
  24. footprinter/cli/connect.py +591 -0
  25. footprinter/cli/data.py +879 -0
  26. footprinter/cli/delete.py +128 -0
  27. footprinter/cli/ingest.py +579 -0
  28. footprinter/cli/mcp_cmd.py +750 -0
  29. footprinter/cli/mcp_setup.py +306 -0
  30. footprinter/cli/search.py +393 -0
  31. footprinter/cli/search_cmd.py +69 -0
  32. footprinter/cli/setup.py +1836 -0
  33. footprinter/cli/status.py +729 -0
  34. footprinter/cli/status_cmd.py +104 -0
  35. footprinter/cli/upsert.py +794 -0
  36. footprinter/cli/vectorize_cmd.py +215 -0
  37. footprinter/cli/view.py +322 -0
  38. footprinter/connectors/__init__.py +171 -0
  39. footprinter/connectors/config_utils.py +141 -0
  40. footprinter/db/__init__.py +37 -0
  41. footprinter/db/browser.py +198 -0
  42. footprinter/db/chats.py +610 -0
  43. footprinter/db/clients.py +307 -0
  44. footprinter/db/emails.py +279 -0
  45. footprinter/db/files.py +741 -0
  46. footprinter/db/folders.py +659 -0
  47. footprinter/db/messages.py +192 -0
  48. footprinter/db/policies.py +151 -0
  49. footprinter/db/projects.py +673 -0
  50. footprinter/db/search.py +573 -0
  51. footprinter/db/sql_utils.py +168 -0
  52. footprinter/db/status.py +320 -0
  53. footprinter/db/uploads.py +70 -0
  54. footprinter/ingest/__init__.py +0 -0
  55. footprinter/ingest/adapters/__init__.py +33 -0
  56. footprinter/ingest/adapters/browser.py +54 -0
  57. footprinter/ingest/adapters/chat.py +57 -0
  58. footprinter/ingest/adapters/ingest.py +146 -0
  59. footprinter/ingest/adapters/local_files.py +68 -0
  60. footprinter/ingest/adapters/local_folders.py +52 -0
  61. footprinter/ingest/adapters/protocol.py +174 -0
  62. footprinter/ingest/browser_indexer.py +216 -0
  63. footprinter/ingest/chat_dedup.py +156 -0
  64. footprinter/ingest/chat_indexer.py +515 -0
  65. footprinter/ingest/chat_parsers/__init__.py +8 -0
  66. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  67. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  68. footprinter/ingest/cli.py +827 -0
  69. footprinter/ingest/content_extractors.py +117 -0
  70. footprinter/ingest/database.py +36 -0
  71. footprinter/ingest/db/__init__.py +1 -0
  72. footprinter/ingest/db/connector_schema.py +47 -0
  73. footprinter/ingest/db/migration.py +328 -0
  74. footprinter/ingest/db/schema.py +1043 -0
  75. footprinter/ingest/db/security.py +6 -0
  76. footprinter/ingest/file_indexer.py +261 -0
  77. footprinter/ingest/file_scanner.py +277 -0
  78. footprinter/ingest/folder_indexer.py +226 -0
  79. footprinter/ingest/full_content_extractor.py +321 -0
  80. footprinter/ingest/orchestrator.py +125 -0
  81. footprinter/ingest/pipe_runner.py +217 -0
  82. footprinter/ingest/processing.py +165 -0
  83. footprinter/ingest/registry.py +201 -0
  84. footprinter/ingest/run_record.py +91 -0
  85. footprinter/ingest/status.py +346 -0
  86. footprinter/mcp/__init__.py +0 -0
  87. footprinter/mcp/__main__.py +5 -0
  88. footprinter/mcp/db.py +57 -0
  89. footprinter/mcp/errors.py +102 -0
  90. footprinter/mcp/extraction.py +226 -0
  91. footprinter/mcp/server.py +39 -0
  92. footprinter/mcp/tools/__init__.py +0 -0
  93. footprinter/mcp/tools/navigation.py +70 -0
  94. footprinter/mcp/tools/read.py +75 -0
  95. footprinter/mcp/tools/search.py +158 -0
  96. footprinter/mcp/tools/semantic.py +79 -0
  97. footprinter/mcp/tools/status.py +15 -0
  98. footprinter/paths.py +91 -0
  99. footprinter/permissions.py +1160 -0
  100. footprinter/semantic/__init__.py +13 -0
  101. footprinter/semantic/chunking.py +52 -0
  102. footprinter/semantic/embeddings.py +23 -0
  103. footprinter/semantic/hybrid_search.py +273 -0
  104. footprinter/semantic/vector_store.py +471 -0
  105. footprinter/services/__init__.py +49 -0
  106. footprinter/services/access_service.py +342 -0
  107. footprinter/services/chat_service.py +85 -0
  108. footprinter/services/client_service.py +267 -0
  109. footprinter/services/content_service.py +181 -0
  110. footprinter/services/email_service.py +89 -0
  111. footprinter/services/file_service.py +83 -0
  112. footprinter/services/folder_service.py +122 -0
  113. footprinter/services/includes.py +19 -0
  114. footprinter/services/ingest_service.py +231 -0
  115. footprinter/services/project_service.py +262 -0
  116. footprinter/services/roles.py +25 -0
  117. footprinter/services/search_service.py +177 -0
  118. footprinter/services/semantic_service.py +360 -0
  119. footprinter/services/status_service.py +18 -0
  120. footprinter/services/visit_service.py +65 -0
  121. footprinter/source_registry.py +194 -0
  122. footprinter/utils/__init__.py +7 -0
  123. footprinter/utils/hash_utils.py +59 -0
  124. footprinter/utils/logging_config.py +68 -0
  125. footprinter/utils/mime.py +30 -0
  126. footprinter/utils/text.py +6 -0
  127. footprinter/utils/time.py +11 -0
  128. footprinter/visibility.py +1272 -0
  129. footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
  130. footprinter_cli-1.0.0.dist-info/METADATA +229 -0
  131. footprinter_cli-1.0.0.dist-info/RECORD +134 -0
  132. footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
  133. footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
  134. footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,217 @@
1
+ """Pipe runner — pipe dispatch, iteration, timing, error aggregation.
2
+
3
+ Runs pipes in order, delegates data-source pipes to adapters and
4
+ processing pipes to ProcessingPipeline. Handles timing, error
5
+ classification, and fatal-error halting.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import sqlite3
12
+ from datetime import datetime
13
+ from typing import TYPE_CHECKING, Callable, Dict, List, Optional
14
+
15
+ from footprinter.ingest.adapters.protocol import PipeContext
16
+
17
+ if TYPE_CHECKING:
18
+ from footprinter.ingest.processing import ProcessingPipeline
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class PipeRunner:
    """Run ingest pipes in order, time them, and classify their failures.

    Receives its adapter registry, pipeline definitions, and valid pipe
    list from the orchestrator (composition root). Does not import pipe
    definitions directly.

    Every pipe outcome is reported as a plain dict that always carries
    ``stage``, ``status`` and ``elapsed_seconds``. Outcomes produced by an
    exception handler additionally carry ``error_type``.
    """

    # error_type values that make run_pipes() stop; any other failure lets
    # the remaining pipes run.
    _FATAL_ERROR_TYPES = ("database", "config")

    def __init__(
        self,
        processing: ProcessingPipeline,
        get_db: Callable,
        config: Dict,
        config_path: str,
        adapter_registry: Dict[str, type],
        pipelines: Dict[str, List[str]],
        all_pipes: List[str],
        user_pipes: Optional[List[str]] = None,
        connector_pipe_map: Optional[Dict[str, str]] = None,
    ):
        """Store the collaborators handed in by the caller.

        Args:
            processing: Object answering ``is_processing_pipe(name)`` and
                ``run_phase(name, db)`` for processing pipes.
            get_db: Zero-argument callable; its return value is passed to
                adapters and processing phases as the database handle.
            config: Forwarded to adapters as ``PipeContext.source_config``.
            config_path: Forwarded to adapters as ``PipeContext.config_path``.
            adapter_registry: Pipe name -> adapter class. The class is
                instantiated without arguments and its ``run(db, ctx)``
                method is called.
            pipelines: Pipeline name -> ordered pipe names (stored as-is).
            all_pipes: Every name accepted by ``validate_pipes``.
            user_pipes: Names listed in the "Valid pipes" part of the
                validation error. Defaults to ``all_pipes``.
            connector_pipe_map: Pipe name -> connector name, used to build
                the install hint for pipes without a registered adapter.
        """
        self.processing = processing
        self._get_db = get_db
        self.config = config
        self.config_path = config_path
        self.adapter_registry = adapter_registry
        self.pipelines = pipelines
        self.all_pipes = all_pipes
        # User-selectable subset for error messages. Falls back to all_pipes
        # when omitted (legacy call sites) — error messages then show every
        # pipe including post-processing, as before.
        self.user_pipes = user_pipes if user_pipes is not None else all_pipes
        self._connector_pipe_map = connector_pipe_map or {}
        # Forwarded to adapters as PipeContext.full_mode; callers flip it
        # on the instance.
        self.full_mode = False

    @staticmethod
    def _error_result(pipe: str, exc: BaseException, error_type: str) -> Dict:
        """Build the result dict for a pipe that failed with *exc*."""
        return {
            "stage": pipe,
            "status": "error",
            "error": str(exc),
            "error_type": error_type,
        }

    def run_pipe(
        self,
        pipe: str,
        on_progress: Optional[Callable] = None,
        last_run: Optional[datetime] = None,
    ) -> Dict:
        """Run a single pipe and report its outcome as a dict.

        Dispatch order: a registered adapter wins; otherwise the pipe is
        offered to the processing pipeline; otherwise it is reported as
        "skipped / not installed" when ``connector_pipe_map`` knows it, or
        as an "Unknown pipe" error.

        Exceptions never escape. They are mapped to results as follows:
        ``ImportError`` -> skipped (``missing_dependency``),
        ``sqlite3.OperationalError`` -> error (``database``),
        ``FileNotFoundError`` -> error (``config``), anything else ->
        error (``runtime``).

        Args:
            pipe: Name of the pipe to run.
            on_progress: Forwarded to the adapter through ``PipeContext``.
            last_run: Forwarded to the adapter through ``PipeContext``.

        Returns:
            Dict with pipe results including ``stage``, ``status`` and
            ``elapsed_seconds``.
        """
        # Function-scope import: only this method needs the monotonic clock.
        import time

        logger.info(f"Running pipe: {pipe}")
        # A monotonic clock keeps the duration correct (and non-negative)
        # even if the system clock is adjusted while the pipe runs.
        started = time.monotonic()

        def elapsed_seconds() -> float:
            return round(time.monotonic() - started, 1)

        result: Dict = {"stage": pipe, "status": "unknown"}

        try:
            adapter_cls = self.adapter_registry.get(pipe)
            if adapter_cls is not None:
                adapter = adapter_cls()
                db = self._get_db()
                ctx = PipeContext(
                    source_config=self.config,
                    config_path=self.config_path,
                    full_mode=self.full_mode,
                    last_run=last_run,
                    on_progress=on_progress,
                )
                pipe_result = adapter.run(db, ctx)
                pipe_result.elapsed_seconds = elapsed_seconds()
                result = pipe_result.to_dict()
            elif self.processing.is_processing_pipe(pipe):
                db = self._get_db()
                pipe_result = self.processing.run_phase(pipe, db)
                pipe_result.elapsed_seconds = elapsed_seconds()
                result = pipe_result.to_dict()
            else:
                # Check if this pipe belongs to an uninstalled connector
                connector_name = self._find_connector_for_pipe(pipe)
                if connector_name:
                    result = {
                        "stage": pipe,
                        "status": "skipped",
                        "reason": "not installed",
                        "hint": f"run: fp connect install {connector_name}",
                    }
                else:
                    logger.error(f"Unknown pipe: {pipe}")
                    result = {
                        "stage": pipe,
                        "status": "error",
                        "error": f"Unknown pipe: {pipe}",
                    }

            # Normalise: the requested name always wins, and a result that
            # carries no status is treated as completed.
            result["stage"] = pipe
            result["status"] = result.get("status", "completed")

        except ImportError as e:
            logger.warning(f"Pipe {pipe} skipped — missing dependency: {e}")
            result = {
                "stage": pipe,
                "status": "skipped",
                "reason": f"Not installed: {e}",
                "error_type": "missing_dependency",
            }
        except sqlite3.OperationalError as e:
            logger.error(f"Database error in pipe {pipe}: {e}")
            result = self._error_result(pipe, e, "database")
        except FileNotFoundError as e:
            logger.error(f"Config/file error in pipe {pipe}: {e}")
            result = self._error_result(pipe, e, "config")
        # Intentional broad catch: last-resort after specific
        # ImportError, OperationalError, FileNotFoundError handlers
        except Exception as e:
            # Unexpected failure: keep the traceback so it can be diagnosed.
            logger.error(f"Error in pipe {pipe}: {e}", exc_info=True)
            result = self._error_result(pipe, e, "runtime")

        elapsed = time.monotonic() - started
        result["elapsed_seconds"] = round(elapsed, 1)
        logger.info(f"Pipe {pipe} completed in {elapsed:.1f}s")

        return result

    def validate_pipes(self, pipes: List[str]) -> None:
        """Raise ValueError for unknown pipe names. Pure check, no side effects.

        Exposed so callers that need to fail before starting UI output
        (progress bars, headers) can pre-flight without duplicating the
        unknown-pipe rule.

        Raises:
            ValueError: If any name is missing from ``all_pipes``. The
                message lists the offending names and ``user_pipes``.
        """
        unknown = [s for s in pipes if s not in self.all_pipes]
        if unknown:
            raise ValueError(
                f"Unknown pipe(s): {', '.join(unknown)}. "
                f"Valid pipes: {', '.join(self.user_pipes)}"
            )

    def run_pipes(
        self,
        pipes: List[str],
        on_pipe_start: Optional[Callable] = None,
        on_pipe_end: Optional[Callable] = None,
        on_progress: Optional[Callable] = None,
        pipe_hook: Optional[Callable] = None,
        last_run: Optional[datetime] = None,
    ) -> List[Dict]:
        """Run multiple pipes in order.

        Raises ValueError for unknown pipe names. Stops on fatal errors
        (database/config error_type), continues on runtime errors.

        Args:
            pipes: Pipe names, in execution order.
            on_pipe_start: Called as ``on_pipe_start(pipe)`` before each pipe.
            on_pipe_end: Called as ``on_pipe_end(pipe, result)`` after each pipe.
            on_progress: Forwarded to ``run_pipe`` or ``pipe_hook``.
            pipe_hook: When given, called as
                ``pipe_hook(pipe, on_progress=on_progress)`` instead of
                ``run_pipe``; ``last_run`` is not passed to it.
            last_run: Forwarded to ``run_pipe``.

        Returns:
            One result dict per pipe that was started, in order.
        """
        self.validate_pipes(pipes)

        results = []

        for pipe in pipes:
            if on_pipe_start:
                on_pipe_start(pipe)

            if pipe_hook:
                result = pipe_hook(pipe, on_progress=on_progress)
            else:
                result = self.run_pipe(pipe, on_progress=on_progress, last_run=last_run)
            results.append(result)

            if on_pipe_end:
                on_pipe_end(pipe, result)

            # Stop pipeline on fatal errors (database/config); runtime errors continue
            if result.get("status") == "error":
                if result.get("error_type") in self._FATAL_ERROR_TYPES:
                    logger.error(f"Fatal error in {pipe}: {result.get('error', 'unknown')}")
                    break

        return results

    def _find_connector_for_pipe(self, pipe: str) -> str | None:
        """Find the connector name that owns a given pipe, if any."""
        return self._connector_pipe_map.get(pipe)
@@ -0,0 +1,165 @@
1
+ """
2
+ Processing module — access resolution and pipeline framework.
3
+
4
+ Primary role: ``run_access_resolution`` stamps visibility and permissions
5
+ on ingested entities, with last-run-based incremental processing.
6
+ Also provides the ``ProcessingPipeline`` framework for phase registration
7
+ and dispatch.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import sqlite3
14
+ from dataclasses import dataclass
15
+ from typing import TYPE_CHECKING, Callable, Dict, List, Optional
16
+
17
+ from footprinter.ingest.adapters.protocol import ErrorType, PipeResult
18
+
19
+ if TYPE_CHECKING:
20
+ from footprinter.ingest.database import Database
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Last-run helpers (backed by ingests table)
27
+ # ---------------------------------------------------------------------------
28
+
29
+
30
def _read_last_run(conn: sqlite3.Connection, pipe: str) -> Optional[str]:
    """Return ``completed_at`` of the newest completed ``ingests`` row for *pipe*.

    Args:
        conn: Open SQLite connection with an ``ingests`` table.
        pipe: Pipe name to look up.

    Returns:
        The stored ``completed_at`` value, or None when the pipe has no
        row with status ``'completed'``.
    """
    query = (
        "SELECT completed_at FROM ingests "
        "WHERE pipe = ? AND status = 'completed' "
        "ORDER BY completed_at DESC LIMIT 1"
    )
    latest = conn.execute(query, (pipe,)).fetchone()
    if latest is None:
        return None
    # Plain tuples are indexed by position; other row objects
    # (e.g. sqlite3.Row) are indexed by column name.
    if isinstance(latest, tuple):
        return latest[0]
    return latest["completed_at"]
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Access resolution runner
43
+ # ---------------------------------------------------------------------------
44
+
45
+
46
def run_access_resolution(db: "Database", full_mode: bool = False) -> PipeResult:
    """Stamp visibility and permissions on entities.

    A full recalculation is performed when ``full_mode`` is set or when no
    completed ``access_resolution`` run is recorded. Otherwise only rows
    whose ``indexed_at`` is later than the last completed run are stamped;
    tables without an ``indexed_at`` column are left out of that pass.

    Args:
        db: Database instance (needs db.conn).
        full_mode: If True, recalculate everything. If False, only
            entities added/modified since the last run.

    Returns:
        A completed PipeResult built from the stats returned by the access
        helpers, or an error PipeResult (runtime error type) if resolution
        raised.
    """
    from footprinter.access import ENTITY_META, recalculate_access, stamp_entities

    conn = db.conn
    last_run = _read_last_run(conn, "access_resolution")
    incremental = not full_mode and last_run is not None

    try:
        if not incremental:
            stats = recalculate_access(conn, "global")
        else:
            changed_ids: Dict[str, list] = {}
            for entity_type, meta in ENTITY_META.items():
                table = meta["table"]

                # Probe for the column: not all tables have indexed_at
                # (folders, projects, clients don't).
                try:
                    conn.execute(f"SELECT indexed_at FROM {table} LIMIT 0")
                except sqlite3.OperationalError:
                    continue

                status_filter = " AND status != 'removed'" if meta["has_status"] else ""
                query = f"SELECT id FROM {table} WHERE indexed_at > ?{status_filter}"
                fetched = conn.execute(query, (last_run,)).fetchall()

                ids = [row["id"] if isinstance(row, sqlite3.Row) else row[0] for row in fetched]
                if ids:
                    changed_ids[entity_type] = ids

            stats = stamp_entities(conn, changed_ids)
    except Exception as exc:  # Intentional broad catch: last-resort for access resolution; pipeline must continue
        logger.error("Access resolution error: %s", exc, exc_info=True)
        return PipeResult.make_error("access_resolution", str(exc), ErrorType.RUNTIME)

    # Built outside the try block so a failure here is not reported as an
    # access-resolution error.
    return PipeResult.completed("access_resolution", **stats)
94
+
95
+
96
+ # Pipe-name → phase-name overrides. Empty on purpose: lookups use .get(name, name), so every pipe name doubles as its phase name.
97
+ PIPE_TO_PHASE: Dict[str, str] = {}
98
+
99
+
100
@dataclass
class PhaseSpec:
    """Registration record for one processing phase.

    Holds the phase name together with two optional callables that each
    receive the database object: ``skip_guard`` (returns a bool) and
    ``runner`` (returns a PipeResult). Both default to None.
    """

    # Name the phase is registered under.
    name: str
    # Optional predicate taking the database and returning a bool.
    skip_guard: Optional[Callable[["Database"], bool]] = None
    # Optional callable taking the database and returning the phase's PipeResult.
    runner: Optional[Callable[["Database"], PipeResult]] = None
107
+
108
+
109
class ProcessingPipeline:
    """Registry and dispatcher for processing phases.

    Each phase is registered under a name with an optional runner and an
    optional skip guard. Phase names are reported in registration order.
    """

    def __init__(self) -> None:
        # Phase name -> spec; dict insertion order is the registration order.
        self._phases: Dict[str, PhaseSpec] = {}

    def register(
        self,
        name: str,
        runner: Optional[Callable[["Database"], PipeResult]] = None,
        skip_guard: Optional[Callable[["Database"], bool]] = None,
    ) -> None:
        """Register (or replace) the phase stored under *name*."""
        spec = PhaseSpec(name=name, skip_guard=skip_guard, runner=runner)
        self._phases[name] = spec

    def is_processing_pipe(self, pipe_name: str) -> bool:
        """Return True when *pipe_name* resolves to a registered phase."""
        return PIPE_TO_PHASE.get(pipe_name, pipe_name) in self._phases

    @property
    def phase_names(self) -> List[str]:
        """Names of all registered phases, in registration order."""
        return [*self._phases]

    def run_phase(self, pipe_name: str, db: "Database") -> PipeResult:
        """Run the phase that *pipe_name* resolves to.

        Returns an error result when the phase is unknown or has no
        runner, and a skipped result when its skip guard returns a truthy
        value. A skip guard that raises is logged and otherwise ignored,
        so the runner still executes.
        """
        phase_name = PIPE_TO_PHASE.get(pipe_name, pipe_name)
        spec = self._phases.get(phase_name)

        if spec is None:
            return PipeResult.make_error(pipe_name, f"Unknown processing phase: {phase_name}")
        if spec.runner is None:
            return PipeResult.make_error(pipe_name, f"No runner registered for phase: {phase_name}")

        guard = spec.skip_guard
        if guard is not None:
            try:
                if guard(db):
                    return PipeResult.skipped(pipe_name, f"Skip guard triggered for {phase_name}")
            except Exception as e:
                logger.warning(f"Skip guard for {phase_name} raised {type(e).__name__}: {e}; proceeding")

        return spec.runner(db)
@@ -0,0 +1,201 @@
1
+ """Pipe registry — the "phone book" for pipes.
2
+
3
+ Knows what pipes exist, which adapter classes implement the core
4
+ data-source pipes, and provides functions to compute pipeline and
5
+ refresh pipe definitions dynamically. Does NOT run anything — that's
6
+ the orchestrator's job.
7
+
8
+ v1.0 pipe set
9
+ --------------
10
+ Core (always available): local_folders, local_files, browser, chat.
11
+ Connector pipes are resolved dynamically from installed ConnectorSpecs.
12
+ """
13
+
14
+ from footprinter.ingest.adapters import (
15
+ BrowserAdapter,
16
+ ChatAdapter,
17
+ LocalFilesAdapter,
18
+ LocalFoldersAdapter,
19
+ )
20
+ from footprinter.ingest.adapters.protocol import ErrorType, PipeResult, PipeStatus
21
+
22
+ # ── Source catalogue ────────────────────────────────────────────────
23
+
24
# Core v1.0 sources (work out of the box)
CORE_PIPES = [
    # Scan ~/Work, ~/Personal folder structure
    "local_folders",
    # Index local files
    "local_files",
    # Browser history
    "browser",
    # Claude/ChatGPT exports (status only - manual import)
    "chat",
]

# Not valid CLI targets; excluded from get_all_pipes()
FUTURE_PIPES = ["project_links", "summaries", "drive_links"]

# Post-processing pipes — appended to every pipeline, run after all data-source pipes
POST_PIPES = [
    # Stamp visibility + permissions on ingested entities
    "access_resolution",
]

# ── Core source registry (data-source adapters only) ─────────────────

# Pipe name -> adapter class; one entry per name in CORE_PIPES.
CORE_PIPE_REGISTRY = {
    "local_folders": LocalFoldersAdapter,
    "local_files": LocalFilesAdapter,
    "browser": BrowserAdapter,
    "chat": ChatAdapter,
}
52
+
53
+ # ── Dynamic resolution functions ─────────────────────────────────────
54
+ #
55
+ # These replace the former static PIPELINES, REFRESH_PIPES, and ALL_PIPES
56
+ # dicts. They accept connector_pipelines — a dict mapping connector names
57
+ # to their pipe lists (e.g., {"google": ["drive_folders", "drive_files", "gmail"]}).
58
+ # The orchestrator builds this from ConnectorSpec metadata and passes it in,
59
+ # so this module never imports from connectors/.
60
+
61
+
62
def get_pipelines(
    connector_pipes: dict[str, type],
    connector_pipelines: dict[str, list[str]] | None = None,
) -> dict[str, list[str]]:
    """Build the pipeline table from core pipes plus installed connector pipes.

    The result always has a ``"local"`` pipeline (the core pipes) and an
    ``"all"`` pipeline (core pipes followed by every installed connector
    pipe, without duplicates). Each connector with at least one installed
    pipe gets a pipeline under its own name. POST_PIPES are appended to
    every pipeline.

    Args:
        connector_pipes: Adapter registry; a connector pipe counts as
            installed when its name is present here.
        connector_pipelines: Connector name → the pipe names it declares.

    Returns:
        Pipeline name → ordered pipe list.
    """
    declared_by_connector = connector_pipelines or {}

    pipelines: dict[str, list[str]] = {"local": list(CORE_PIPES)}
    for connector, declared in declared_by_connector.items():
        available = [pipe for pipe in declared if pipe in connector_pipes]
        if available:
            pipelines[connector] = available

    # "all" = core pipes first, then connector pipes in first-seen order.
    combined = list(CORE_PIPES)
    for pipeline_name, members in pipelines.items():
        if pipeline_name != "local":
            for pipe in members:
                if pipe not in combined:
                    combined.append(pipe)
    pipelines["all"] = combined

    # Post-processing runs at the tail of every pipeline.
    return {pipeline_name: members + POST_PIPES for pipeline_name, members in pipelines.items()}
100
+
101
+
102
def get_refresh_pipes(
    connector_pipes: dict[str, type],
    connector_pipelines: dict[str, list[str]] | None = None,
) -> dict[str, list[str]]:
    """Build the refresh-group table from core pipes plus installed connector pipes.

    Fixed groups are ``"local"``, ``"browser"`` and ``"chat"``. For every
    connector with installed pipes there is a group under the connector's
    name, a single-pipe group for each installed pipe that does not start
    with ``"drive_"``, and a shared ``"drive"`` group for those that do.
    ``"all"`` holds the core pipes followed by every installed connector
    pipe. POST_PIPES are appended to every group.

    Args:
        connector_pipes: Adapter registry; a connector pipe counts as
            installed when its name is present here.
        connector_pipelines: Connector name → the pipe names it declares.

    Returns:
        Group name → ordered pipe list.
    """
    declared_by_connector = connector_pipelines or {}

    groups: dict[str, list[str]] = {
        "local": ["local_folders", "local_files"],
        "browser": ["browser"],
        "chat": ["chat"],
    }

    for connector, declared in declared_by_connector.items():
        available = [pipe for pipe in declared if pipe in connector_pipes]
        if available:
            # Connector-level key (e.g., "google")
            groups[connector] = available
            # Individual keys for non-drive pipes (e.g., "gmail")
            groups.update({pipe: [pipe] for pipe in available if not pipe.startswith("drive_")})
            # Grouped key for the drive_* pipes
            drive_group = [pipe for pipe in available if pipe.startswith("drive_")]
            if drive_group:
                groups["drive"] = drive_group

    # "all" = core pipes first, then installed connector pipes in first-seen order.
    everything = list(CORE_PIPES)
    for declared in declared_by_connector.values():
        for pipe in declared:
            if pipe in connector_pipes and pipe not in everything:
                everything.append(pipe)
    groups["all"] = everything

    # Post-processing runs at the tail of every refresh group.
    return {group: members + POST_PIPES for group, members in groups.items()}
155
+
156
+
157
def get_all_pipes(connector_pipes: dict[str, type]) -> list[str]:
    """Return every valid pipe name: core, then connector, then post-processing.

    Names already present are not repeated. FUTURE_PIPES entries are never
    included — they are not registered pipes.
    """
    names = list(CORE_PIPES)
    for candidate in (*connector_pipes, *POST_PIPES):
        if candidate not in names:
            names.append(candidate)
    return names
171
+
172
+
173
def get_user_pipes(connector_pipes: dict[str, type]) -> list[str]:
    """Return the user-selectable pipe names used in CLI error messages.

    Core pipes come first, followed by connector pipes not already listed.
    POST_PIPES are left out — they run implicitly after every pipeline and
    aren't meant to be invoked directly via ``fp ingest --pipe``.
    """
    selectable = list(CORE_PIPES)
    for pipe_name in connector_pipes:
        if pipe_name in selectable:
            continue
        selectable.append(pipe_name)
    return selectable
185
+
186
+
187
+ # ── Convenience re-exports ───────────────────────────────────────────
188
+
189
+ __all__ = [
190
+ "CORE_PIPES",
191
+ "FUTURE_PIPES",
192
+ "POST_PIPES",
193
+ "CORE_PIPE_REGISTRY",
194
+ "get_pipelines",
195
+ "get_refresh_pipes",
196
+ "get_all_pipes",
197
+ "get_user_pipes",
198
+ "PipeResult",
199
+ "PipeStatus",
200
+ "ErrorType",
201
+ ]
@@ -0,0 +1,91 @@
1
+ """Pure persistence for pipeline run records.
2
+
3
+ Saves and loads a JSON record of each pipeline run. No heuristics or
4
+ config awareness — warning logic lives in the display layer.
5
+ """
6
+
7
+ import json
8
+ from datetime import datetime, timedelta, timezone
9
+ from pathlib import Path
10
+ from typing import Dict, List, Optional
11
+
12
+ from footprinter.paths import get_last_run_path
13
+
14
+ SESSION_WINDOW_MINUTES = 10
15
+
16
+
17
def save_run_record(
    results: List[Dict],
    mode: str,
    started_at: datetime,
    *,
    interrupted: bool = False,
    path: Optional[Path] = None,
) -> Path:
    """Persist a pipeline run as JSON, merging into a recent record if any.

    When the record already on disk started within SESSION_WINDOW_MINUTES
    of ``started_at``, the new stages are appended to it and its
    ``completed_at``, ``total_elapsed_seconds`` and ``interrupted`` fields
    are refreshed; its ``started_at`` and ``mode`` are kept. Otherwise a
    fresh record replaces whatever was there.

    Args:
        results: List of per-stage result dicts from PipeRunner.
        mode: Run mode string (e.g. "incremental", "full").
        started_at: When the pipeline started.
        interrupted: Whether the run was interrupted (e.g. KeyboardInterrupt).
        path: Override output path (default: get_last_run_path()).

    Returns:
        The path the record was written to.
    """
    target = get_last_run_path() if path is None else path

    finished_at = datetime.now(timezone.utc)
    run_elapsed = sum(stage.get("elapsed_seconds", 0) for stage in results)

    previous = load_run_record(path=target)
    if previous and _within_session_window(previous, started_at):
        # Same session: extend the stored stage list in place and re-total it.
        stages = previous["stages"]
        stages.extend(results)
        previous["completed_at"] = finished_at.isoformat()
        previous["total_elapsed_seconds"] = sum(stage.get("elapsed_seconds", 0) for stage in stages)
        previous["interrupted"] = interrupted
        record = previous
    else:
        record = {
            "started_at": started_at.isoformat(),
            "completed_at": finished_at.isoformat(),
            "mode": mode,
            "interrupted": interrupted,
            "total_elapsed_seconds": run_elapsed,
            "stages": results,
        }

    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(record, indent=2, default=str)
    target.write_text(serialized)
    return target
68
+
69
+
70
def _within_session_window(existing: Dict, new_started_at: datetime) -> bool:
    """Check if an existing record is within the merge window.

    Args:
        existing: Previously saved run record; its ``started_at`` entry is
            expected to be an ISO-8601 string.
        new_started_at: Start time of the run about to be saved.

    Returns:
        True when the two start times are at most SESSION_WINDOW_MINUTES
        apart (in either direction). False when they are further apart or
        when the stored start time cannot be compared at all.
    """
    try:
        existing_start = datetime.fromisoformat(existing["started_at"])
        return abs(new_started_at - existing_start) <= timedelta(minutes=SESSION_WINDOW_MINUTES)
    except (KeyError, ValueError, TypeError):
        # KeyError: no started_at; ValueError: not ISO-8601;
        # TypeError: started_at is not a string, or one timestamp is
        # timezone-aware and the other naive. An uncomparable record is
        # treated as a different session rather than aborting the save.
        return False
77
+
78
+
79
def load_run_record(path: Optional[Path] = None) -> Optional[Dict]:
    """Read a run record from JSON.

    Args:
        path: File to read (default: get_last_run_path()).

    Returns:
        The parsed record dict, or None if the file doesn't exist, cannot
        be decoded as JSON, or does not contain a JSON object. Unreadable
        content is logged as a warning instead of raised, so a truncated
        file is simply replaced by the next save.
    """
    # Function-scope import: no module-level logger is visible in this module.
    import logging

    if path is None:
        path = get_last_run_path()

    try:
        # Read directly instead of checking exists() first, so a file
        # removed between the check and the read cannot raise.
        record = json.loads(path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return None
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        logging.getLogger(__name__).warning("Ignoring unreadable run record %s: %s", path, exc)
        return None

    if not isinstance(record, dict):
        logging.getLogger(__name__).warning(
            "Ignoring run record %s: expected a JSON object, got %s", path, type(record).__name__
        )
        return None
    return record