footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +444 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/cli/__init__.py +128 -0
  19. footprinter/cli/__main__.py +6 -0
  20. footprinter/cli/_common.py +332 -0
  21. footprinter/cli/_policy_helpers.py +646 -0
  22. footprinter/cli/_prompt.py +220 -0
  23. footprinter/cli/api_cmd.py +32 -0
  24. footprinter/cli/connect.py +591 -0
  25. footprinter/cli/data.py +879 -0
  26. footprinter/cli/delete.py +128 -0
  27. footprinter/cli/ingest.py +579 -0
  28. footprinter/cli/mcp_cmd.py +750 -0
  29. footprinter/cli/mcp_setup.py +306 -0
  30. footprinter/cli/search.py +393 -0
  31. footprinter/cli/search_cmd.py +69 -0
  32. footprinter/cli/setup.py +1836 -0
  33. footprinter/cli/status.py +729 -0
  34. footprinter/cli/status_cmd.py +104 -0
  35. footprinter/cli/upsert.py +794 -0
  36. footprinter/cli/vectorize_cmd.py +215 -0
  37. footprinter/cli/view.py +322 -0
  38. footprinter/connectors/__init__.py +171 -0
  39. footprinter/connectors/config_utils.py +141 -0
  40. footprinter/db/__init__.py +37 -0
  41. footprinter/db/browser.py +198 -0
  42. footprinter/db/chats.py +610 -0
  43. footprinter/db/clients.py +307 -0
  44. footprinter/db/emails.py +279 -0
  45. footprinter/db/files.py +741 -0
  46. footprinter/db/folders.py +659 -0
  47. footprinter/db/messages.py +192 -0
  48. footprinter/db/policies.py +151 -0
  49. footprinter/db/projects.py +673 -0
  50. footprinter/db/search.py +573 -0
  51. footprinter/db/sql_utils.py +168 -0
  52. footprinter/db/status.py +320 -0
  53. footprinter/db/uploads.py +70 -0
  54. footprinter/ingest/__init__.py +0 -0
  55. footprinter/ingest/adapters/__init__.py +33 -0
  56. footprinter/ingest/adapters/browser.py +54 -0
  57. footprinter/ingest/adapters/chat.py +57 -0
  58. footprinter/ingest/adapters/ingest.py +146 -0
  59. footprinter/ingest/adapters/local_files.py +68 -0
  60. footprinter/ingest/adapters/local_folders.py +52 -0
  61. footprinter/ingest/adapters/protocol.py +174 -0
  62. footprinter/ingest/browser_indexer.py +216 -0
  63. footprinter/ingest/chat_dedup.py +156 -0
  64. footprinter/ingest/chat_indexer.py +515 -0
  65. footprinter/ingest/chat_parsers/__init__.py +8 -0
  66. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  67. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  68. footprinter/ingest/cli.py +827 -0
  69. footprinter/ingest/content_extractors.py +117 -0
  70. footprinter/ingest/database.py +36 -0
  71. footprinter/ingest/db/__init__.py +1 -0
  72. footprinter/ingest/db/connector_schema.py +47 -0
  73. footprinter/ingest/db/migration.py +328 -0
  74. footprinter/ingest/db/schema.py +1043 -0
  75. footprinter/ingest/db/security.py +6 -0
  76. footprinter/ingest/file_indexer.py +261 -0
  77. footprinter/ingest/file_scanner.py +277 -0
  78. footprinter/ingest/folder_indexer.py +226 -0
  79. footprinter/ingest/full_content_extractor.py +321 -0
  80. footprinter/ingest/orchestrator.py +125 -0
  81. footprinter/ingest/pipe_runner.py +217 -0
  82. footprinter/ingest/processing.py +165 -0
  83. footprinter/ingest/registry.py +201 -0
  84. footprinter/ingest/run_record.py +91 -0
  85. footprinter/ingest/status.py +346 -0
  86. footprinter/mcp/__init__.py +0 -0
  87. footprinter/mcp/__main__.py +5 -0
  88. footprinter/mcp/db.py +57 -0
  89. footprinter/mcp/errors.py +102 -0
  90. footprinter/mcp/extraction.py +226 -0
  91. footprinter/mcp/server.py +39 -0
  92. footprinter/mcp/tools/__init__.py +0 -0
  93. footprinter/mcp/tools/navigation.py +70 -0
  94. footprinter/mcp/tools/read.py +75 -0
  95. footprinter/mcp/tools/search.py +158 -0
  96. footprinter/mcp/tools/semantic.py +79 -0
  97. footprinter/mcp/tools/status.py +15 -0
  98. footprinter/paths.py +91 -0
  99. footprinter/permissions.py +1160 -0
  100. footprinter/semantic/__init__.py +13 -0
  101. footprinter/semantic/chunking.py +52 -0
  102. footprinter/semantic/embeddings.py +23 -0
  103. footprinter/semantic/hybrid_search.py +273 -0
  104. footprinter/semantic/vector_store.py +471 -0
  105. footprinter/services/__init__.py +49 -0
  106. footprinter/services/access_service.py +342 -0
  107. footprinter/services/chat_service.py +85 -0
  108. footprinter/services/client_service.py +267 -0
  109. footprinter/services/content_service.py +181 -0
  110. footprinter/services/email_service.py +89 -0
  111. footprinter/services/file_service.py +83 -0
  112. footprinter/services/folder_service.py +122 -0
  113. footprinter/services/includes.py +19 -0
  114. footprinter/services/ingest_service.py +231 -0
  115. footprinter/services/project_service.py +262 -0
  116. footprinter/services/roles.py +25 -0
  117. footprinter/services/search_service.py +177 -0
  118. footprinter/services/semantic_service.py +360 -0
  119. footprinter/services/status_service.py +18 -0
  120. footprinter/services/visit_service.py +65 -0
  121. footprinter/source_registry.py +194 -0
  122. footprinter/utils/__init__.py +7 -0
  123. footprinter/utils/hash_utils.py +59 -0
  124. footprinter/utils/logging_config.py +68 -0
  125. footprinter/utils/mime.py +30 -0
  126. footprinter/utils/text.py +6 -0
  127. footprinter/utils/time.py +11 -0
  128. footprinter/visibility.py +1272 -0
  129. footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
  130. footprinter_cli-1.0.0.dist-info/METADATA +229 -0
  131. footprinter_cli-1.0.0.dist-info/RECORD +134 -0
  132. footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
  133. footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
  134. footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,226 @@
1
+ """
2
+ Folder structure indexer for Footprinter.
3
+
4
+ Scans ~/Work and ~/Personal to discover folder structure before file indexing.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import os
11
+ import sqlite3
12
+ from typing import TYPE_CHECKING, Dict, List, Tuple
13
+
14
+ from footprinter.utils.time import utc_now_iso
15
+
16
+ if TYPE_CHECKING:
17
+ from footprinter.ingest.database import Database
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Home directory
22
+ HOME = os.path.expanduser("~")
23
+
24
+
25
class FolderIndexer:
    """Walk configured root directories and persist what is found to ``folders``.

    Directories named in ``SKIP_DIRS`` are pruned during the walk. Hidden
    directories are deliberately kept so that their files can later be indexed
    with ``status='hidden'``.
    """

    # Directories to always skip (system/build caches)
    SKIP_DIRS = {
        "node_modules",
        "__pycache__",
        ".git",
        ".venv",
        "venv",
        "site-packages",
        ".next",
        ".sfdx",
        ".sf",
        ".pytest_cache",
        ".mypy_cache",
        ".eggs",
        ".tox",
        ".nox",
        ".cache",
        "dist",
        "build",
    }

    def __init__(self, config: Dict, db: "Database"):
        """
        Initialize folder scanner.

        Args:
            config: Configuration dictionary
            db: Shared Database instance; its ``conn`` attribute is used for
                all SQL issued by this class.
        """
        self.config = config
        self.db = db

    def should_skip_dir(self, dir_name: str) -> bool:
        """Return True when ``dir_name`` is a regeneratable build/cache directory.

        The comparison is case-insensitive against ``SKIP_DIRS``.

        v3 Architecture (2026-01): hidden directories (leading ``.``) are NOT
        skipped here; they are scanned so their files can be indexed with
        status='hidden' and filtered in the Web UI instead of at scan time.
        """
        return dir_name.lower() in self.SKIP_DIRS

    @staticmethod
    def _relative_to_home(dirpath: str, home: str) -> str:
        """Strip ``home`` from the front of ``dirpath`` when it really is inside it.

        A plain ``startswith(home)`` also matches sibling directories that
        merely share a prefix (``/home/bobby`` vs ``/home/bob``), so the match
        is required to end on a path-separator boundary. Paths outside
        ``home`` are returned unchanged.
        """
        boundary = home.rstrip(os.sep) + os.sep
        if dirpath == home or dirpath.startswith(boundary):
            return dirpath[len(home):]
        return dirpath

    def scan_folders(self, root_paths: List[str]) -> List[Dict]:
        """
        Scan folder structure starting from root paths.

        Args:
            root_paths: List of root paths to scan (e.g., ['~/Work', '~/Personal']).
                ``~`` is expanded; roots that are not directories are logged
                and skipped.

        Returns:
            List of folder dictionaries with the keys ``path``,
            ``relative_path``, ``name``, ``parent_path`` and ``scanned_at``.
        """
        folders = []

        for root_path in root_paths:
            expanded_root = os.path.expanduser(root_path)
            if not os.path.isdir(expanded_root):
                logger.warning("Root path does not exist: %s", expanded_root)
                continue

            logger.info("Scanning folders in %s...", expanded_root)

            for dirpath, dirnames, _ in os.walk(expanded_root):
                # Prune in place so os.walk never descends into skipped directories.
                dirnames[:] = [d for d in dirnames if not self.should_skip_dir(d)]

                relative_path = self._relative_to_home(dirpath, HOME)
                parent_path = os.path.dirname(dirpath)

                folders.append(
                    {
                        "path": dirpath,
                        "relative_path": relative_path,
                        # basename is empty when dirpath ends with a separator
                        "name": os.path.basename(dirpath) or relative_path,
                        # dirname() of a filesystem root is the root itself
                        "parent_path": parent_path if parent_path != dirpath else None,
                        "scanned_at": utc_now_iso(),
                    }
                )

        logger.info("Found %d folders", len(folders))
        return folders

    def save_folders(self, folders: List[Dict]) -> Tuple[int, int]:
        """
        Save folders to database.

        Each folder is inserted; when the insert raises
        ``sqlite3.IntegrityError`` the row with the same ``path`` is updated
        instead. A single commit is issued after all folders are processed.

        Args:
            folders: List of folder dictionaries as produced by ``scan_folders``

        Returns:
            Tuple of (inserted_count, updated_count)
        """
        cursor = self.db.conn.cursor()

        inserted = 0
        updated = 0

        for folder in folders:
            try:
                # Try insert first
                cursor.execute(
                    """
                    INSERT INTO folders
                    (path, relative_path, name, parent_path, scanned_at,
                    indexed_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
                    """,
                    (
                        folder["path"],
                        folder["relative_path"],
                        folder["name"],
                        folder["parent_path"],
                        folder["scanned_at"],
                    ),
                )
                inserted += 1
            except sqlite3.IntegrityError:
                # Row already present: refresh it but leave indexed_at untouched.
                cursor.execute(
                    """
                    UPDATE folders
                    SET relative_path = ?,
                    name = ?,
                    parent_path = ?,
                    scanned_at = ?,
                    updated_at = CURRENT_TIMESTAMP
                    WHERE path = ?
                    """,
                    (
                        folder["relative_path"],
                        folder["name"],
                        folder["parent_path"],
                        folder["scanned_at"],
                        folder["path"],
                    ),
                )
                updated += 1

        self.db.conn.commit()

        logger.info("Saved folders: %d inserted, %d updated", inserted, updated)
        return inserted, updated

    def get_folder_stats(self) -> Dict:
        """Return ``{"total_folders": <row count of the folders table>}``."""
        cursor = self.db.conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM folders")
        total = cursor.fetchone()[0]
        return {"total_folders": total}
193
+
194
+
195
def main():
    """Run folder indexer from command line.

    Loads the application config, scans every entry of its ``directories``
    list, stores the result and logs the total folder count.

    Raises:
        ValueError: if the config has no ``directories`` entries.
    """
    from footprinter.ingest.database import Database
    from footprinter.paths import get_db_path
    from footprinter.source_registry import get_config

    # Load config
    config = get_config()

    # Database
    db = Database(str(get_db_path()))
    try:
        # Create indexer
        indexer = FolderIndexer(config, db)

        # Scan folders
        root_paths = config.get("directories", [])
        if not root_paths:
            raise ValueError("No directories configured. Add directories to config/config.yaml.")
        folders = indexer.scan_folders(root_paths)

        # Save to database (save_folders logs the insert/update counts itself)
        indexer.save_folders(folders)

        # Log stats
        stats = indexer.get_folder_stats()
        logger.info("Folder Scan Complete:")
        logger.info(" Total folders: %s", stats["total_folders"])
    finally:
        # Release the handle on every path, including the ValueError above.
        db.close()


if __name__ == "__main__":
    main()
@@ -0,0 +1,321 @@
1
+ """
2
+ Full content extraction with chunking for semantic search.
3
+ """
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional
8
+
9
+ from .content_extractors import ContentExtractor
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ _pypdf_warned = False
14
+ _docx_warned = False
15
+
16
+
17
class FullContentExtractor(ContentExtractor):
    """Extract the full text of a file and split it into overlapping chunks."""

    # Suffixes whose files are read verbatim as UTF-8 text.
    # NOTE(review): ``Path.suffix`` is empty for dotfiles such as ``.gitignore``
    # or ``.env``, so those entries only ever match names like ``prod.env``.
    # Confirm whether bare dotfiles are meant to be extracted before changing it.
    _FULL_TEXT_SUFFIXES = frozenset(
        {
            ".txt", ".md", ".py", ".js", ".json", ".yaml", ".yml", ".html",
            ".css", ".jsx", ".tsx",
            ".xml", ".svg", ".rst", ".toml", ".ini", ".cfg", ".conf",
            ".sh", ".bash", ".zsh", ".fish", ".sql", ".graphql", ".proto",
            ".ts", ".vue", ".svelte", ".astro", ".java", ".kt", ".scala",
            ".go", ".rs", ".rb", ".php", ".c", ".h", ".cpp", ".hpp", ".cs",
            ".swift", ".m", ".r", ".jl", ".lua", ".pl", ".pm", ".tf", ".hcl",
            ".dockerfile", ".log", ".env", ".gitignore", ".editorconfig",
            ".tex", ".bib", ".org",
        }
    )

    # Suffixes handed to python-docx.
    _FULL_DOCX_SUFFIXES = frozenset({".docx", ".doc"})

    # CSV extraction keeps the first row plus up to this many further rows.
    _CSV_EXTRA_ROWS = 1000

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: float = 0.15,
        max_file_size_bytes: int = 50 * 1024 * 1024,
        file_types: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
    ):
        """
        Initialize full content extractor.

        Args:
            chunk_size: Size of each chunk in characters
            chunk_overlap: Fractional overlap between chunks (0.0 to 1.0).
                Note: chunking.py uses absolute chars; this uses a fraction.
            max_file_size_bytes: Maximum file size to read (0 = no limit)
            file_types: Allowlist of file extensions (e.g. [".md", ".txt"]).
                Compared case-insensitively. None means all supported types
                are extracted.
            exclude_patterns: fnmatch patterns for file paths to skip.
        """
        super().__init__(max_preview_length=1000)  # Keep small preview for DB
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_file_size_bytes = max_file_size_bytes
        self.file_types = [t.lower() for t in file_types] if file_types is not None else None
        self.exclude_patterns = exclude_patterns

    @classmethod
    def from_config(cls, config: dict) -> "FullContentExtractor":
        """Build a FullContentExtractor from the application config dict.

        Reads ``config["indexing"]["max_file_size_mb"]`` (default 0 = no
        limit) and ``config["vectorization"]`` (chunk_size, chunk_overlap,
        file_types, exclude_patterns). Missing vectorization keys fall back
        to constructor defaults. Sections or a size value that are present
        but ``None`` (an empty YAML mapping, for instance) are treated as
        missing instead of raising.
        """
        indexing = config.get("indexing") or {}
        vec_config = config.get("vectorization") or {}
        max_mb = indexing.get("max_file_size_mb") or 0

        forwarded = ("chunk_size", "chunk_overlap", "file_types", "exclude_patterns")
        vec_kwargs: dict = {key: vec_config[key] for key in forwarded if key in vec_config}

        return cls(
            max_file_size_bytes=int(max_mb * 1024 * 1024),
            **vec_kwargs,
        )

    def extract_with_chunking(self, file_path: Path) -> List[Dict[str, str]]:
        """
        Extract content and split into chunks if necessary.

        Consecutive chunks share ``int(chunk_size * chunk_overlap)`` characters.
        The overlap is clamped to ``[0, chunk_size - 1]`` so the window always
        moves forward; without the clamp an overlap fraction of 1.0 or more
        never terminates. A non-positive ``chunk_size`` cannot be windowed at
        all and yields the whole content as one chunk.

        Returns:
            List of chunk dictionaries with 'content', 'chunk_index', 'total_chunks'.
            Empty when nothing could be extracted.
        """
        full_content = self._extract_full_content(file_path)
        if not full_content:
            return []

        size = self.chunk_size
        length = len(full_content)

        # Small enough (or un-chunkable configuration): single chunk
        if size <= 0 or length <= size:
            return [{"content": full_content, "chunk_index": 0, "total_chunks": 1}]

        overlap_size = max(0, min(int(size * self.chunk_overlap), size - 1))
        step = size - overlap_size  # always >= 1 after clamping

        pieces = []
        start = 0
        while True:
            end = start + size
            pieces.append(full_content[start:end])
            if end >= length:
                break
            start += step

        total = len(pieces)
        logger.info("Split %s into %s chunks", file_path.name, total)

        return [
            {"content": text, "chunk_index": index, "total_chunks": total}
            for index, text in enumerate(pieces)
        ]

    def _extract_full_content(self, file_path: Path) -> Optional[str]:
        """Extract full content from file.

        Gates checked in order: file type allowlist, exclude patterns,
        file size limit. Returns None if any gate rejects the file, if the
        suffix is not one of the supported formats, or if extraction fails.
        """
        # File type allowlist gate (cheap — check before stat())
        file_type = file_path.suffix.lower()
        if self.file_types is not None and file_type not in self.file_types:
            logger.debug("Skipping %s: extension %s not in file_types allowlist", file_path.name, file_type)
            return None

        # Exclude patterns gate (fnmatch against the path exactly as given)
        if self.exclude_patterns:
            from fnmatch import fnmatch

            path_str = str(file_path)
            if any(fnmatch(path_str, pat) for pat in self.exclude_patterns):
                logger.debug("Skipping %s: matched exclude pattern", file_path.name)
                return None

        # File size guard
        if self.max_file_size_bytes > 0:
            try:
                file_size = file_path.stat().st_size
            except OSError:
                file_size = None  # stat failed — let the read attempt handle the error
            if file_size is not None and file_size > self.max_file_size_bytes:
                logger.warning(
                    "Skipping %s: %s bytes exceeds content extraction limit of %s bytes",
                    file_path.name,
                    file_size,
                    self.max_file_size_bytes,
                )
                return None

        try:
            if file_type in self._FULL_TEXT_SUFFIXES:
                return self._extract_full_text(file_path)
            if file_type == ".pdf":
                return self._extract_full_pdf(file_path)
            if file_type in self._FULL_DOCX_SUFFIXES:
                return self._extract_full_docx(file_path)
            if file_type == ".csv":
                return self._extract_csv_full(file_path)
            # Skip binary/unknown files (images, video, audio, archives, etc.)
            return None
        except Exception as e:
            # Best-effort: one unreadable file must not abort a whole run.
            logger.debug("Could not extract content from %s: %s", file_path, e)
            return None

    def _extract_full_text(self, file_path: Path) -> Optional[str]:
        """Read the file as UTF-8 text, dropping undecodable bytes; None on error."""
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        except Exception as e:
            logger.debug("Error reading text file %s: %s", file_path, e)
            return None

    def _extract_full_pdf(self, file_path: Path) -> Optional[str]:
        """Return the text of every PDF page, each followed by a newline.

        Returns None when pypdf is not installed (warned once per process)
        or when the file cannot be parsed.
        """
        global _pypdf_warned
        try:
            import pypdf

            with open(file_path, "rb") as f:
                reader = pypdf.PdfReader(f)
                # ``or ""`` keeps a page without extractable text from raising.
                return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

        except ImportError:
            if not _pypdf_warned:
                logger.warning("pypdf not installed, skipping PDF extraction")
                _pypdf_warned = True
            return None
        except Exception as e:
            logger.debug("Error reading PDF %s: %s", file_path, e)
            return None

    def _extract_full_docx(self, file_path: Path) -> Optional[str]:
        """Return the text of every paragraph, each followed by a newline.

        Returns None when python-docx is not installed (warned once per
        process) or when the document cannot be opened.
        """
        global _docx_warned
        try:
            import docx

            # Pass a plain string: the documented argument is a path string
            # or a file-like object.
            doc = docx.Document(str(file_path))
            return "".join(para.text + "\n" for para in doc.paragraphs)

        except ImportError:
            if not _docx_warned:
                logger.warning("python-docx not installed, skipping DOCX extraction")
                _docx_warned = True
            return None
        except Exception as e:
            logger.debug("Error reading DOCX %s: %s", file_path, e)
            return None

    def _extract_csv_full(self, file_path: Path) -> Optional[str]:
        """Extract CSV content (headers + sample rows).

        Keeps the first row and up to 1000 rows after it. Fields are re-joined
        with plain commas, one row per line. Returns None on any read error.
        """
        try:
            import csv

            # newline="" lets the csv module handle line endings inside
            # quoted fields itself, as its documentation requires.
            with open(file_path, "r", encoding="utf-8", errors="ignore", newline="") as f:
                lines = []
                for i, row in enumerate(csv.reader(f)):
                    lines.append(",".join(row))

                    # Limit to reasonable size
                    if i >= self._CSV_EXTRA_ROWS:
                        break

            return "\n".join(lines)

        except Exception as e:
            logger.debug("Error reading CSV %s: %s", file_path, e)
            return None
@@ -0,0 +1,125 @@
1
+ """Thin facade — coordinates pipeline pipes via delegation to extracted modules."""
2
+
3
+ import logging
4
+ from typing import Dict, List
5
+
6
+ from footprinter.connectors import discover_connectors, get_connector_pipes, get_schema_specs
7
+ from footprinter.ingest.pipe_runner import PipeRunner
8
+ from footprinter.ingest.registry import (
9
+ CORE_PIPE_REGISTRY,
10
+ POST_PIPES,
11
+ get_all_pipes,
12
+ get_pipelines,
13
+ get_refresh_pipes,
14
+ get_user_pipes,
15
+ )
16
+ from footprinter.paths import get_config_path, get_db_path
17
+ from footprinter.services.ingest_service import IngestService
18
+ from footprinter.source_registry import get_config
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class DataPipelineOrchestrator:
    """Composition root — merges core + connector sources, delegates to PipeRunner."""

    def __init__(self, config_path: str = None):
        """Load config, discover connectors and wire the runner and ingest service.

        Args:
            config_path: Optional path handed to ``get_config``; when omitted
                ``get_config_path()`` is recorded as the effective path.

        Constructing an orchestrator opens the database (see the final
        statement), so callers should pair it with ``close()``.
        """
        self.config = get_config(config_path)
        self.config_path = config_path or str(get_config_path())
        self.db = None
        self.full_mode = False
        from footprinter.source_registry import remote_accounts
        self.remote_accounts = remote_accounts()
        self._connectors = discover_connectors()
        connector_pipes = get_connector_pipes(self._connectors)
        # Connector entries win over core entries with the same key.
        self.adapter_registry = {**CORE_PIPE_REGISTRY, **connector_pipes}

        # Build connector metadata for pipeline resolution and skip hints
        connector_pipelines: dict[str, list[str]] = {}
        connector_pipe_map: dict[str, str] = {}
        for name, spec in self._connectors.items():
            connector_pipelines[name] = list(spec.adapter_entries.keys())
            for pipe in spec.pipes:
                connector_pipe_map[pipe] = name
            for pipe in spec.adapter_entries:
                connector_pipe_map[pipe] = name

        self.pipelines = get_pipelines(connector_pipes, connector_pipelines)
        self.refresh_pipes = get_refresh_pipes(connector_pipes, connector_pipelines)
        self.all_pipes = get_all_pipes(connector_pipes)
        self.user_pipes = get_user_pipes(connector_pipes)

        # Ensure DB schema exists (fresh installs need tables before pipes run)
        from .database import Database
        Database(str(get_db_path()), connector_specs=get_schema_specs(self._connectors)).close()
        from .processing import ProcessingPipeline, run_access_resolution
        self.processing = ProcessingPipeline()
        self.processing.register(
            "access_resolution",
            # full_mode is read when the stage runs, not when it is registered
            runner=lambda db: run_access_resolution(db, full_mode=self.full_mode),
        )
        self.runner = PipeRunner(
            processing=self.processing, get_db=self._get_db,
            config=self.config, config_path=self.config_path,
            adapter_registry=self.adapter_registry, pipelines=self.pipelines,
            all_pipes=self.all_pipes, user_pipes=self.user_pipes,
            connector_pipe_map=connector_pipe_map,
        )
        self.ingest_service = IngestService(self._get_db().conn, get_db=self._get_db)

    def _get_db(self):
        """Return the shared Database, opening it on first use or after ``close()``."""
        if self.db is None:
            from .database import Database
            self.db = Database(str(get_db_path()), connector_specs=get_schema_specs(self._connectors))
        return self.db

    def run_pipe(self, pipe: str) -> Dict:
        """Execute a single pipe by name."""
        self.runner.full_mode = self.full_mode
        mode = "full" if self.full_mode else "incremental"
        return self.ingest_service.run_pipe(pipe, mode=mode, trigger="cli", runner=self.runner)

    def run_pipeline(self, pipeline_name: str, on_pipe_start=None, on_pipe_end=None, on_progress=None) -> List[Dict]:
        """Execute all pipes in a named pipeline. Bypasses the user-facing post-pipe guard.

        Raises:
            ValueError: if ``pipeline_name`` is not a known pipeline.
        """
        known = self.runner.pipelines
        if pipeline_name not in known:
            raise ValueError(f"Unknown pipeline: {pipeline_name}. Available: {', '.join(known.keys())}")
        return self._dispatch_pipes(known[pipeline_name], on_pipe_start, on_pipe_end, on_progress)

    def run_pipes(self, pipes: List[str], on_pipe_start=None, on_pipe_end=None, on_progress=None) -> List[Dict]:
        """Execute a user-supplied pipe list. Rejects POST_PIPES (post-processing stages).

        Raises:
            ValueError: naming the first requested pipe that is in POST_PIPES.
        """
        post = [p for p in pipes if p in POST_PIPES]
        if post:
            raise ValueError(
                f"{post[0]} is a post-processing stage, not a user-selectable pipe. "
                f"Use 'fp ingest' or 'fp ingest --pipe <source>' to trigger it implicitly."
            )
        return self._dispatch_pipes(pipes, on_pipe_start, on_pipe_end, on_progress)

    def run_refresh(self, source: str, on_pipe_start=None, on_pipe_end=None, on_progress=None) -> List[Dict]:
        """Execute a refresh group. Shares _dispatch_pipes with run_pipeline so POST_PIPES run inline.

        Raises:
            ValueError: if ``source`` is not a known refresh group.
        """
        if source not in self.refresh_pipes:
            raise ValueError(f"Unknown refresh source: {source}. Available: {', '.join(self.refresh_pipes.keys())}")
        return self._dispatch_pipes(self.refresh_pipes[source], on_pipe_start, on_pipe_end, on_progress)

    def _dispatch_pipes(self, pipes, on_pipe_start, on_pipe_end, on_progress) -> List[Dict]:
        """Run ``pipes`` through the ingest service using the current ``full_mode``."""
        self.runner.full_mode = self.full_mode
        mode = "full" if self.full_mode else "incremental"

        def hook(pipe, on_progress=None):
            # Per-pipe callback passed to run_pipes as ``pipe_hook``; runs one pipe.
            return self.ingest_service.run_pipe(
                pipe, mode=mode, trigger="cli", runner=self.runner, on_progress=on_progress,
            )

        return self.ingest_service.run_pipes(
            pipes, runner=self.runner, full_mode=self.full_mode,
            on_pipe_start=on_pipe_start, on_pipe_end=on_pipe_end,
            on_progress=on_progress, pipe_hook=hook,
        )

    def get_status(self) -> Dict:
        """Return current data counts and pipeline health."""
        from footprinter.ingest.status import get_status
        return get_status(str(get_db_path()))

    def close(self):
        """Close the database connection and release resources.

        Safe to call repeatedly. The handle is detached before it is closed
        so that a failing ``close()`` cannot leave ``self.db`` pointing at a
        connection that ``_get_db()`` would keep handing out.
        """
        db, self.db = self.db, None
        if db is not None:
            db.close()