footprinter-cli 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +431 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/bundled/samples/hidden-client-file-sample.txt +2 -0
- footprinter/bundled/samples/opaque-project-file-sample.txt +2 -0
- footprinter/bundled/samples/visible-file-sample.txt +2 -0
- footprinter/cli/__init__.py +135 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +327 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/_sample_seed.py +204 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +543 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +2001 -0
- footprinter/cli/status.py +747 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +602 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +724 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +487 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +315 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +223 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +112 -0
- footprinter/ingest/pipe_runner.py +200 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +186 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +67 -0
- footprinter/mcp/errors.py +105 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +19 -0
- footprinter/paths.py +117 -0
- footprinter/permissions.py +1152 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1264 -0
- footprinter_cli-1.0.0rc1.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0rc1.dist-info/METADATA +223 -0
- footprinter_cli-1.0.0rc1.dist-info/RECORD +138 -0
- footprinter_cli-1.0.0rc1.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0rc1.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0rc1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Folder structure indexer for Footprinter.
|
|
3
|
+
|
|
4
|
+
Scans ~/Work and ~/Personal to discover folder structure before file indexing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import sqlite3
|
|
12
|
+
from typing import TYPE_CHECKING, Dict, List, Tuple
|
|
13
|
+
|
|
14
|
+
from footprinter.utils.time import utc_now_iso
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from footprinter.ingest.database import Database
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Home directory
|
|
22
|
+
HOME = os.path.expanduser("~")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class FolderIndexer:
    """Indexes folder structure for Footprinter.

    Walks the configured root directories, builds one dictionary per
    directory found, and upserts those dictionaries into the ``folders``
    table through the shared database connection (``db.conn``).
    """

    # Directories to always skip (system/build caches)
    SKIP_DIRS = {
        "node_modules",
        "__pycache__",
        ".git",
        ".venv",
        "venv",
        "site-packages",
        ".next",
        ".sfdx",
        ".sf",
        ".pytest_cache",
        ".mypy_cache",
        ".eggs",
        ".tox",
        ".nox",
        ".cache",
        "dist",
        "build",
    }

    def __init__(self, config: Dict, db: Database):
        """
        Initialize folder scanner.

        Args:
            config: Configuration dictionary
            db: Shared Database instance
        """
        self.config = config
        self.db = db

    def should_skip_dir(self, dir_name: str) -> bool:
        """Check if directory should be skipped.

        v3 Architecture (2026-01): Scan ALL folders including hidden ones.
        Hidden folders are scanned so their files can be indexed with status='hidden'.

        Only skip regeneratable build/cache directories (node_modules, venv, etc.)

        Args:
            dir_name: Bare directory name (not a full path); compared
                case-insensitively against ``SKIP_DIRS``.

        Returns:
            True when the directory must not be descended into.
        """
        # NOTE: Hidden directories (starting with .) are NOT skipped
        # They are scanned so their files can be indexed with status='hidden'
        # Filter hidden folders in the Web UI, not at scan time
        return dir_name.lower() in self.SKIP_DIRS

    @staticmethod
    def _relative_to_home(dirpath: str, home: str) -> str:
        """Return *dirpath* with the *home* prefix removed when it lies under *home*.

        The prefix is only stripped on a path-component boundary, so a
        sibling directory such as ``/home/alice`` is not mistaken for a child
        of ``/home/al``. The result keeps its leading separator (``/Work``);
        a path outside *home* is returned unchanged.
        """
        prefix = home.rstrip(os.sep)
        if dirpath == prefix or dirpath.startswith(prefix + os.sep):
            return dirpath[len(prefix):]
        return dirpath

    def scan_folders(self, root_paths: List[str]) -> List[Dict]:
        """
        Scan folder structure starting from root paths.

        Args:
            root_paths: List of root paths to scan (e.g., ['~/Work', '~/Personal'])

        Returns:
            List of folder dictionaries with the keys ``path``,
            ``relative_path``, ``name``, ``parent_path`` and ``scanned_at``.
            Roots that are not existing directories are logged and skipped.
        """
        folders = []

        for root_path in root_paths:
            expanded_root = os.path.expanduser(root_path)
            if not os.path.isdir(expanded_root):
                logger.warning(f"Root path does not exist: {expanded_root}")
                continue

            logger.info(f"Scanning folders in {expanded_root}...")

            for dirpath, dirnames, _ in os.walk(expanded_root):
                # Prune in place: os.walk only honours the filter when the
                # very same list object is mutated.
                dirnames[:] = [d for d in dirnames if not self.should_skip_dir(d)]

                relative_path = self._relative_to_home(dirpath, HOME)
                parent_path = os.path.dirname(dirpath)

                folders.append(
                    {
                        "path": dirpath,
                        "relative_path": relative_path,
                        # basename is empty for a path ending in a separator
                        "name": os.path.basename(dirpath) or relative_path,
                        # dirname() returns the path itself at the filesystem root
                        "parent_path": parent_path if parent_path != dirpath else None,
                        "scanned_at": utc_now_iso(),
                    }
                )

        logger.info(f"Found {len(folders)} folders")
        return folders

    def save_folders(self, folders: List[Dict]) -> Tuple[int, int]:
        """
        Save folders to database.

        Each folder is inserted; when the insert raises
        ``sqlite3.IntegrityError`` the row with the same ``path`` is updated
        instead. All changes are committed once at the end.

        Args:
            folders: List of folder dictionaries

        Returns:
            Tuple of (inserted_count, updated_count)
        """
        cursor = self.db.conn.cursor()

        inserted = 0
        updated = 0

        for folder in folders:
            try:
                # Try insert first
                cursor.execute(
                    """
                    INSERT INTO folders
                    (path, relative_path, name, parent_path, scanned_at,
                    indexed_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
                    """,
                    (
                        folder["path"],
                        folder["relative_path"],
                        folder["name"],
                        folder["parent_path"],
                        folder["scanned_at"],
                    ),
                )
                inserted += 1
            except sqlite3.IntegrityError:
                # Update existing
                # NOTE(review): presumably a uniqueness conflict on ``path``;
                # any other constraint failure is also counted as an update.
                cursor.execute(
                    """
                    UPDATE folders
                    SET relative_path = ?,
                    name = ?,
                    parent_path = ?,
                    scanned_at = ?,
                    updated_at = CURRENT_TIMESTAMP
                    WHERE path = ?
                    """,
                    (
                        folder["relative_path"],
                        folder["name"],
                        folder["parent_path"],
                        folder["scanned_at"],
                        folder["path"],
                    ),
                )
                updated += 1

        self.db.conn.commit()

        logger.info(f"Saved folders: {inserted} inserted, {updated} updated")
        return inserted, updated

    def get_folder_stats(self) -> Dict:
        """Get statistics about indexed folders.

        Returns:
            ``{"total_folders": <row count of the folders table>}``
        """
        cursor = self.db.conn.cursor()

        cursor.execute("SELECT COUNT(*) FROM folders")
        total = cursor.fetchone()[0]

        return {"total_folders": total}
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def main():
    """Run folder indexer from command line.

    Loads the application config, scans every path listed under the
    ``directories`` key, saves the discovered folders and logs the total
    number of rows in the ``folders`` table.

    Raises:
        ValueError: If the config lists no directories.
    """
    from footprinter.ingest.database import Database
    from footprinter.paths import get_db_path
    from footprinter.source_registry import get_config

    # Load config
    config = get_config()

    # Database
    db = Database(str(get_db_path()))
    try:
        # Create indexer
        indexer = FolderIndexer(config, db)

        # Scan folders
        root_paths = config.get("directories", [])
        if not root_paths:
            raise ValueError("No directories configured. Add directories to config/config.yaml.")
        folders = indexer.scan_folders(root_paths)

        # Save to database
        indexer.save_folders(folders)

        # Log stats
        stats = indexer.get_folder_stats()
        logger.info("Folder Scan Complete:")
        logger.info(f" Total folders: {stats['total_folders']}")
    finally:
        # Release the connection on success and on every error path.
        db.close()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Full content extraction with chunking for semantic search.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from .content_extractors import ContentExtractor
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
_pypdf_warned = False
|
|
14
|
+
_docx_warned = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FullContentExtractor(ContentExtractor):
    """Extract full content from files with intelligent chunking.

    Files pass through three gates (extension allowlist, exclude patterns,
    size limit) before their text is read; the text is then split into
    overlapping fixed-size character chunks.
    """

    # Extensions whose content is read verbatim as UTF-8 text.
    # NOTE(review): ``Path.suffix`` is empty for bare dotfiles such as
    # ``.env``, ``.gitignore`` or ``.editorconfig``, so those entries only
    # match names like ``prod.env`` — confirm whether the dotfiles themselves
    # are meant to be extracted before changing the lookup.
    _TEXT_FILE_TYPES = frozenset(
        {
            ".txt",
            ".md",
            ".py",
            ".js",
            ".json",
            ".yaml",
            ".yml",
            ".html",
            ".css",
            ".jsx",
            ".tsx",
            # Other text-like formats
            ".xml",
            ".svg",
            ".rst",
            ".toml",
            ".ini",
            ".cfg",
            ".conf",
            ".sh",
            ".bash",
            ".zsh",
            ".fish",
            ".sql",
            ".graphql",
            ".proto",
            ".ts",
            ".vue",
            ".svelte",
            ".astro",
            ".java",
            ".kt",
            ".scala",
            ".go",
            ".rs",
            ".rb",
            ".php",
            ".c",
            ".h",
            ".cpp",
            ".hpp",
            ".cs",
            ".swift",
            ".m",
            ".r",
            ".jl",
            ".lua",
            ".pl",
            ".pm",
            ".tf",
            ".hcl",
            ".dockerfile",
            ".log",
            ".env",
            ".gitignore",
            ".editorconfig",
            ".tex",
            ".bib",
            ".org",
        }
    )

    # Vectorization config keys forwarded to the constructor by from_config().
    _VECTORIZATION_KEYS = ("chunk_size", "chunk_overlap", "file_types", "exclude_patterns")

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: float = 0.15,
        max_file_size_bytes: int = 50 * 1024 * 1024,
        file_types: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
    ):
        """
        Initialize full content extractor.

        Args:
            chunk_size: Size of each chunk in characters
            chunk_overlap: Fractional overlap between chunks (0.0 to 1.0).
                Note: chunking.py uses absolute chars; this uses a fraction.
            max_file_size_bytes: Maximum file size to read (0 = no limit)
            file_types: Allowlist of file extensions (e.g. [".md", ".txt"]).
                None means all supported types are extracted.
            exclude_patterns: fnmatch patterns for file paths to skip.
        """
        super().__init__(max_preview_length=1000)  # Keep small preview for DB
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_file_size_bytes = max_file_size_bytes
        self.file_types = [t.lower() for t in file_types] if file_types is not None else None
        self.exclude_patterns = exclude_patterns

    @classmethod
    def from_config(cls, config: dict) -> "FullContentExtractor":
        """Build a FullContentExtractor from the application config dict.

        Reads ``config["indexing"]["max_file_size_mb"]`` (default 0 = no
        limit) and ``config["vectorization"]`` (chunk_size, chunk_overlap,
        file_types, exclude_patterns). Missing vectorization keys fall back
        to constructor defaults. Sections or keys that are present but set
        to ``None`` (an empty YAML value) are treated as missing.
        """
        indexing_config = config.get("indexing") or {}
        max_mb = indexing_config.get("max_file_size_mb") or 0
        vec_config = config.get("vectorization") or {}
        vec_kwargs: dict = {
            key: vec_config[key]
            for key in cls._VECTORIZATION_KEYS
            if vec_config.get(key) is not None
        }
        return cls(
            max_file_size_bytes=int(max_mb * 1024 * 1024),
            **vec_kwargs,
        )

    def extract_with_chunking(self, file_path: Path) -> List[Dict[str, str]]:
        """
        Extract content and split into chunks if necessary.

        Consecutive chunks overlap by ``chunk_size * chunk_overlap``
        characters. The overlap is clamped to ``[0, chunk_size - 1]`` and the
        chunk size to at least 1, so the window always advances and the loop
        terminates even for out-of-range settings.

        Returns:
            List of chunk dictionaries with 'content' (str), 'chunk_index'
            (int) and 'total_chunks' (int); empty when nothing was extracted.
        """
        # Extract full content
        full_content = self._extract_full_content(file_path)

        if not full_content:
            return []

        chunk_size = max(1, int(self.chunk_size))
        content_length = len(full_content)

        # If content is small enough, return single chunk
        if content_length <= chunk_size:
            return [{"content": full_content, "chunk_index": 0, "total_chunks": 1}]

        # An overlap >= chunk_size would stop the window from moving forward.
        overlap_size = int(chunk_size * self.chunk_overlap)
        overlap_size = min(max(overlap_size, 0), chunk_size - 1)
        stride = chunk_size - overlap_size

        chunks = []
        for chunk_index, start in enumerate(range(0, content_length, stride)):
            end = min(start + chunk_size, content_length)
            chunks.append(
                {
                    "content": full_content[start:end],
                    "chunk_index": chunk_index,
                    "total_chunks": 0,  # Will update after loop
                }
            )
            # The chunk that reaches the end of the text is the last one.
            if end >= content_length:
                break

        # Update total_chunks count
        total = len(chunks)
        for chunk in chunks:
            chunk["total_chunks"] = total

        if total > 1:
            logger.info(f"Split {file_path.name} into {total} chunks")

        return chunks

    def _extract_full_content(self, file_path: Path) -> Optional[str]:
        """Extract full content from file.

        Gates checked in order: file type allowlist, exclude patterns,
        file size limit. Returns None if any gate rejects the file, if the
        extension is not a supported type, or if extraction fails.
        """
        # File type allowlist gate (cheap — check before stat())
        file_type = file_path.suffix.lower()
        if self.file_types is not None and file_type not in self.file_types:
            logger.debug("Skipping %s: extension %s not in file_types allowlist", file_path.name, file_type)
            return None

        # Exclude patterns gate (fnmatch against full absolute path)
        if self.exclude_patterns:
            from fnmatch import fnmatch

            path_str = str(file_path)
            if any(fnmatch(path_str, pat) for pat in self.exclude_patterns):
                logger.debug("Skipping %s: matched exclude pattern", file_path.name)
                return None

        # File size guard
        if self.max_file_size_bytes > 0:
            try:
                file_size = file_path.stat().st_size
                if file_size > self.max_file_size_bytes:
                    logger.warning(
                        f"Skipping {file_path.name}: {file_size} bytes "
                        f"exceeds content extraction limit of {self.max_file_size_bytes} bytes"
                    )
                    return None
            except OSError:
                pass  # stat failed — let the read attempt handle the error

        try:
            # Text-based files
            if file_type in self._TEXT_FILE_TYPES:
                return self._extract_full_text(file_path)

            # Documents
            if file_type == ".pdf":
                return self._extract_full_pdf(file_path)
            if file_type in (".docx", ".doc"):
                return self._extract_full_docx(file_path)

            # Data files
            if file_type == ".csv":
                return self._extract_csv_full(file_path)

            # Skip binary/unknown files (images, video, audio, archives, etc.)
            return None

        except Exception as e:
            logger.debug(f"Could not extract content from {file_path}: {e}")
            return None

    def _extract_full_text(self, file_path: Path) -> Optional[str]:
        """Extract full text content, decoding as UTF-8 and dropping undecodable bytes.

        Returns None when the file cannot be read.
        """
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        except Exception as e:
            logger.debug(f"Error reading text file {file_path}: {e}")
            return None

    def _extract_full_pdf(self, file_path: Path) -> Optional[str]:
        """Extract full PDF content, one newline-terminated block per page.

        Returns None when pypdf is not installed (warned once per process)
        or when the file cannot be parsed.
        """
        global _pypdf_warned
        try:
            import pypdf

            with open(file_path, "rb") as f:
                reader = pypdf.PdfReader(f)
                # Pages must be read while the file handle is still open.
                return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

        except ImportError:
            if not _pypdf_warned:
                logger.warning("pypdf not installed, skipping PDF extraction")
                _pypdf_warned = True
            return None
        except Exception as e:
            logger.debug(f"Error reading PDF {file_path}: {e}")
            return None

    def _extract_full_docx(self, file_path: Path) -> Optional[str]:
        """Extract full DOCX content, one newline-terminated line per paragraph.

        Returns None when python-docx is not installed (warned once per
        process) or when the file cannot be parsed.
        """
        global _docx_warned
        try:
            import docx

            doc = docx.Document(file_path)
            return "".join(para.text + "\n" for para in doc.paragraphs)

        except ImportError:
            if not _docx_warned:
                logger.warning("python-docx not installed, skipping DOCX extraction")
                _docx_warned = True
            return None
        except Exception as e:
            logger.debug(f"Error reading DOCX {file_path}: {e}")
            return None

    def _extract_csv_full(self, file_path: Path) -> Optional[str]:
        """Extract CSV content (headers + sample rows).

        Reads at most the first 1001 rows (indices 0 through 1000) and
        re-joins each row's fields with commas, one row per line. Returns
        None when the file cannot be read.
        """
        try:
            import csv

            # newline="" lets the csv module handle line endings itself.
            with open(file_path, "r", encoding="utf-8", errors="ignore", newline="") as f:
                reader = csv.reader(f)

                lines = []
                for i, row in enumerate(reader):
                    lines.append(",".join(row))

                    # Limit to reasonable size
                    if i >= 1000:
                        break

                return "\n".join(lines)

        except Exception as e:
            logger.debug(f"Error reading CSV {file_path}: {e}")
            return None
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Thin facade — coordinates pipeline pipes via delegation to extracted modules."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Dict, List
|
|
5
|
+
|
|
6
|
+
from footprinter.connectors import discover_connectors, get_connector_pipes, get_schema_specs
|
|
7
|
+
from footprinter.ingest.pipe_runner import PipeRunner
|
|
8
|
+
from footprinter.ingest.registry import (
|
|
9
|
+
CORE_PIPE_REGISTRY,
|
|
10
|
+
get_all_pipes,
|
|
11
|
+
get_pipelines,
|
|
12
|
+
get_refresh_pipes,
|
|
13
|
+
)
|
|
14
|
+
from footprinter.paths import get_config_path, get_db_path
|
|
15
|
+
from footprinter.services.ingest_service import IngestService
|
|
16
|
+
from footprinter.source_registry import get_config
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DataPipelineOrchestrator:
    """Composition root — merges core + connector sources, delegates to PipeRunner.

    Construction loads the config, discovers connectors, merges their pipes
    with the core registry, makes sure the database schema exists and wires
    up the processing pipeline, the pipe runner and the ingest service.
    """

    def __init__(self, config_path: str = None):
        """Build the orchestrator.

        Args:
            config_path: Path to the config file; when falsy the path from
                ``get_config_path()`` is recorded instead.
        """
        self.config = get_config(config_path)
        self.config_path = config_path or str(get_config_path())
        self.db = None
        self.full_mode = False
        from footprinter.source_registry import remote_accounts
        self.remote_accounts = remote_accounts()
        self._connectors = discover_connectors()
        connector_pipes = get_connector_pipes(self._connectors)

        # Connector pipes are layered over the core registry (connector wins).
        merged_registry = dict(CORE_PIPE_REGISTRY)
        merged_registry.update(connector_pipes)
        self.adapter_registry = merged_registry

        # Build connector metadata for pipeline resolution and skip hints
        pipelines_by_connector: dict[str, list[str]] = {}
        connector_of_pipe: dict[str, str] = {}
        for connector_name, spec in self._connectors.items():
            pipelines_by_connector[connector_name] = list(spec.adapter_entries.keys())
            connector_of_pipe.update(dict.fromkeys(spec.pipes, connector_name))
            connector_of_pipe.update(dict.fromkeys(spec.adapter_entries, connector_name))

        self.pipelines = get_pipelines(connector_pipes, pipelines_by_connector)
        self.refresh_pipes = get_refresh_pipes(connector_pipes, pipelines_by_connector)
        self.all_pipes = get_all_pipes(connector_pipes)

        # Ensure DB schema exists (fresh installs need tables before pipes run)
        from .database import Database
        bootstrap_db = Database(
            str(get_db_path()),
            connector_specs=get_schema_specs(self._connectors),
        )
        bootstrap_db.close()

        from .processing import ProcessingPipeline, run_access_resolution

        def resolve_access(db):
            # full_mode is read at call time, so later changes are honoured.
            return run_access_resolution(db, full_mode=self.full_mode)

        self.processing = ProcessingPipeline()
        self.processing.register("access_resolution", runner=resolve_access)
        self.runner = PipeRunner(
            processing=self.processing,
            get_db=self._get_db,
            config=self.config,
            config_path=self.config_path,
            adapter_registry=self.adapter_registry,
            pipelines=self.pipelines,
            all_pipes=self.all_pipes,
            connector_pipe_map=connector_of_pipe,
        )
        shared_conn = self._get_db().conn
        self.ingest_service = IngestService(shared_conn, get_db=self._get_db)

    def _get_db(self):
        """Return the shared Database, opening it on first use."""
        if self.db is not None:
            return self.db
        from .database import Database
        self.db = Database(str(get_db_path()), connector_specs=get_schema_specs(self._connectors))
        return self.db

    def run_pipe(self, pipe: str) -> Dict:
        """Execute a single pipe by name."""
        self.runner.full_mode = self.full_mode
        if self.full_mode:
            mode = "full"
        else:
            mode = "incremental"
        return self.ingest_service.run_pipe(pipe, mode=mode, trigger="cli", runner=self.runner)

    def run_pipeline(self, pipeline_name: str, on_pipe_start=None, on_pipe_end=None, on_progress=None) -> List[Dict]:
        """Execute all pipes in a named pipeline.

        Raises:
            ValueError: If ``pipeline_name`` is not one of the runner's pipelines.
        """
        known_pipelines = self.runner.pipelines
        if pipeline_name not in known_pipelines:
            raise ValueError(f"Unknown pipeline: {pipeline_name}. Available: {', '.join(self.runner.pipelines.keys())}")
        return self.run_pipes(
            known_pipelines[pipeline_name],
            on_pipe_start=on_pipe_start,
            on_pipe_end=on_pipe_end,
            on_progress=on_progress,
        )

    def run_pipes(self, pipes: List[str], on_pipe_start=None, on_pipe_end=None, on_progress=None) -> List[Dict]:
        """Execute a list of specific pipes in order."""
        self.runner.full_mode = self.full_mode
        mode = "full" if self.full_mode else "incremental"

        def run_single(pipe, on_progress=None):
            # Per-pipe hook handed to the ingest service.
            return self.ingest_service.run_pipe(
                pipe,
                mode=mode,
                trigger="cli",
                runner=self.runner,
                on_progress=on_progress,
            )

        return self.ingest_service.run_pipes(
            pipes,
            runner=self.runner,
            full_mode=self.full_mode,
            on_pipe_start=on_pipe_start,
            on_pipe_end=on_pipe_end,
            on_progress=on_progress,
            pipe_hook=run_single,
        )

    def get_status(self) -> Dict:
        """Return current data counts and pipeline health."""
        from footprinter.ingest.status import get_status
        db_location = str(get_db_path())
        return get_status(db_location)

    def close(self):
        """Close the database connection and release resources."""
        if not self.db:
            return
        self.db.close()
        self.db = None
|