footprinter-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +444 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/cli/__init__.py +128 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +332 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +579 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +1836 -0
- footprinter/cli/status.py +729 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +610 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +741 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +515 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +328 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +261 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +125 -0
- footprinter/ingest/pipe_runner.py +217 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +201 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +57 -0
- footprinter/mcp/errors.py +102 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +15 -0
- footprinter/paths.py +91 -0
- footprinter/permissions.py +1160 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1272 -0
- footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0.dist-info/METADATA +229 -0
- footprinter_cli-1.0.0.dist-info/RECORD +134 -0
- footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File indexer that coordinates file scanning and content extraction.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Callable, Dict, Optional
|
|
9
|
+
|
|
10
|
+
from footprinter.db import files as files_db
|
|
11
|
+
from footprinter.source_registry import get_config
|
|
12
|
+
|
|
13
|
+
from .content_extractors import ContentExtractor
|
|
14
|
+
from .database import Database
|
|
15
|
+
from .file_scanner import FileScanner
|
|
16
|
+
|
|
17
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FileIndexer:
    """Coordinate file scanning, database upserts and optional vectorization.

    The indexer pulls file metadata from a ``FileScanner``, optionally attaches
    a content preview, writes rows through ``files_db.insert_file`` in batches
    and, when vectorization is enabled, pushes content chunks to the vector
    store.
    """

    def __init__(
        self,
        config_path: Optional[str] = None,
        last_run: Optional[datetime] = None,
        db: Optional["Database"] = None,
    ):
        """
        Initialize the indexer.

        Args:
            config_path: Path to the config YAML file, forwarded to
                ``get_config``. None lets ``get_config`` pick its default.
            last_run: Timestamp of the last successful run. If set, only files
                modified after this time are indexed. None means full scan.
            db: Optional shared Database handle. If None, a new ``Database()``
                is created and ``_owns_db`` is set to True.
        """
        self.config = get_config(config_path)
        self.db = db if db is not None else Database()
        self._owns_db = db is None
        self.incremental = last_run is not None

        if last_run:
            logger.info(f"Incremental mode: indexing files modified since {last_run}")
        else:
            logger.info("Full scan mode (no last_run provided)")

        self._vector_store = None  # lazy; False is used as a "do not retry" sentinel
        self._full_extractor = None  # lazy
        self._vec_counts = self._fresh_vec_counts()
        self._indexed_paths = set()  # repopulated on every index_files() call

        self.file_scanner = FileScanner(self.config, since_datetime=last_run)
        self.content_extractor = ContentExtractor()

    @staticmethod
    def _fresh_vec_counts() -> Dict[str, int]:
        """Return a zeroed set of per-run vectorization counters."""
        return {
            "vectorized_new": 0,
            "vectorized_refreshed": 0,
            "vectorized_skipped_unchanged": 0,
        }

    def index_files(
        self,
        relationship_maps: Optional[Dict[str, Any]] = None,
        on_progress: Optional[Callable[[int], None]] = None,
    ) -> dict:
        """Index all files from configured directories to files table.

        Args:
            relationship_maps: Optional pre-built maps, passed through
                unchanged to ``files_db.insert_file``.
            on_progress: Optional callback fired with the cumulative number of
                scanned files after each one is handled, whether it succeeded
                or raised.

        Returns:
            Dict with keys: inserted, updated, skipped, unchanged, errors,
            plus the three ``vectorized_*`` counters for this run.
        """
        logger.info("Starting file indexing to files...")

        self._vec_counts = self._fresh_vec_counts()

        inserted_count = 0
        updated_count = 0
        skipped_count = 0
        unchanged_count = 0
        error_count = 0
        total_processed = 0
        batch = []
        batch_size = 1000  # Commit every 1000 files for performance
        self._indexed_paths = set()  # Track all indexed paths for stale detection

        # Loop-invariant: read the opt-in flag once instead of once per file.
        snippets_enabled = (self.config.get("indexing") or {}).get("content_snippets", False)

        for file_metadata in self.file_scanner.scan_all_directories():
            try:
                # Record the path before any work that can fail: the scanner
                # found it on disk, so a later extraction/insert error must
                # not let the stale sweep below flag it as removed.
                file_path_str = file_metadata["file_path"]
                self._indexed_paths.add(file_path_str)

                # Extract content preview (only when opt-in enabled)
                if snippets_enabled:
                    file_metadata["content_preview"] = self.content_extractor.extract(Path(file_path_str))
                else:
                    file_metadata["content_preview"] = None

                batch.append(file_metadata)

                if len(batch) >= batch_size:
                    # Detach the batch *before* flushing. If the flush raises,
                    # the handler below must not leave the same rows queued,
                    # otherwise every following file would resubmit the whole
                    # failing batch again.
                    pending, batch = batch, []
                    bi, bu, bs, bun = self._insert_batch(pending, relationship_maps)
                    inserted_count += bi
                    updated_count += bu
                    skipped_count += bs
                    unchanged_count += bun
                    logger.info(
                        f"Progress: {inserted_count:,} inserted,"
                        f" {updated_count:,} updated,"
                        f" {unchanged_count:,} unchanged,"
                        f" {skipped_count:,} skipped..."
                    )

            except Exception as e:  # Intentional broad catch: batch loop must not abort on single-item errors
                logger.error(f"Error indexing file {file_metadata.get('file_path')}: {e}")
                error_count += 1
            finally:
                total_processed += 1
                if on_progress is not None:
                    on_progress(total_processed)

        # Insert remaining files (errors here propagate to the caller)
        if batch:
            bi, bu, bs, bun = self._insert_batch(batch, relationship_maps)
            inserted_count += bi
            updated_count += bu
            skipped_count += bs
            unchanged_count += bun

        # Mark stale files (no longer on disk)
        # Only do this in full mode - incremental mode only scans modified files
        if not self.incremental:
            removed_ids = files_db.mark_removed_files(self.db.conn, self._indexed_paths)
            if removed_ids:
                store = self._get_vector_store()
                if store:
                    for file_id in removed_ids:
                        try:
                            store.delete_file(file_id)
                        except Exception:  # Intentional broad catch: vector cleanup is best-effort
                            logger.warning("Failed to delete vectors for removed file_id=%s", file_id, exc_info=True)
                logger.info(f"Marked {len(removed_ids):,} files as stale (no longer on disk)")
        else:
            logger.info("Skipping stale detection in incremental mode")

        logger.info(
            f"File indexing complete: {inserted_count:,} inserted,"
            f" {updated_count:,} updated,"
            f" {unchanged_count:,} unchanged,"
            f" {skipped_count:,} skipped,"
            f" {error_count:,} errors"
            f" | vectors: {self._vec_counts['vectorized_new']:,} new,"
            f" {self._vec_counts['vectorized_refreshed']:,} refreshed,"
            f" {self._vec_counts['vectorized_skipped_unchanged']:,} skipped-unchanged"
        )
        return {
            "inserted": inserted_count,
            "updated": updated_count,
            "skipped": skipped_count,
            "unchanged": unchanged_count,
            "errors": error_count,
            **self._vec_counts,
        }

    def _get_vector_store(self):
        """Return the shared vector store, or None when it cannot be loaded.

        The first call imports and instantiates the store. Any failure is
        logged once and remembered (``False`` sentinel) so later calls return
        None without retrying.
        """
        if self._vector_store is None:
            try:
                from footprinter.semantic.vector_store import VectorStore

                self._vector_store = VectorStore.get_instance()
            except Exception as e:  # Intentional broad catch: vector store is optional; any init failure disables it
                logger.warning("Vector store unavailable: %s", e)
                self._vector_store = False  # sentinel: don't retry
        return self._vector_store if self._vector_store is not False else None

    def _vectorize_file(self, file_id, file_path, result_type="updated"):
        """Chunk one file and upsert it into the vector store (best-effort).

        Args:
            file_id: Primary key of the row in ``files``.
            file_path: Path of the file on disk.
            result_type: Result label returned by ``files_db.insert_file``;
                ``"unchanged"`` lets already-vectorized files be skipped.

        Returns nothing. Silently does nothing when vectorization is
        unavailable or disabled, when the row's metadata has ``vectorize``
        set to 0, when the store cannot be loaded, when the file is missing,
        or when extraction yields no chunks.
        """
        try:
            from footprinter.semantic.vector_store import _file_vectorization_enabled
        except ImportError:
            return
        if not _file_vectorization_enabled():
            return

        row = self.db.conn.execute(
            "SELECT COALESCE(json_extract(metadata, '$.vectorize'), 1) as vec,"
            " vectorized_at, vectorized_chunks FROM files WHERE id = ?",
            (file_id,),
        ).fetchone()
        # Per-file opt-out stored in the row's JSON metadata.
        if row and row["vec"] == 0:
            return

        already_vectorized = (
            row is not None
            and row["vectorized_at"] is not None
            and (row["vectorized_chunks"] or 0) > 0
        )
        # Unchanged content that already has vectors needs no work.
        if result_type == "unchanged" and already_vectorized:
            self._vec_counts["vectorized_skipped_unchanged"] += 1
            return

        store = self._get_vector_store()
        if not store:
            return

        try:
            path = Path(file_path)
            if not path.exists():
                return
            if self._full_extractor is None:
                from .full_content_extractor import FullContentExtractor

                self._full_extractor = FullContentExtractor.from_config(self.config)
            chunks = self._full_extractor.extract_with_chunking(path)
            if not chunks:
                return
            metadata = {"file_type": path.suffix.lower(), "file_name": path.name}
            store.upsert_file(file_id, file_path, chunks, metadata)
            # Not committed here; _insert_batch commits after the whole batch.
            self.db.conn.execute(
                "UPDATE files SET vectorized_at = CURRENT_TIMESTAMP, vectorized_chunks = ? WHERE id = ?",
                (len(chunks), file_id),
            )
            if already_vectorized:
                self._vec_counts["vectorized_refreshed"] += 1
            else:
                self._vec_counts["vectorized_new"] += 1
        except Exception as e:  # Intentional broad catch: file vectorization is optional enhancement
            logger.debug(f"Vectorization skipped for {file_path}: {e}")

    def _insert_batch(
        self,
        batch,
        relationship_maps: Optional[Dict[str, Any]] = None,
    ) -> tuple:
        """Insert a batch of files into files table and commit once.

        Each row goes through ``files_db.insert_file``. A ``None`` result is
        counted as skipped; otherwise the result is a ``(result_type,
        file_id)`` pair and the file is handed to ``_vectorize_file``.

        Returns:
            Tuple of (inserted, updated, skipped, unchanged) counts
        """
        inserted = 0
        updated = 0
        skipped = 0
        unchanged = 0
        for file_metadata in batch:
            result = files_db.insert_file(self.db.conn, file_metadata, relationship_maps=relationship_maps)
            if result is None:
                skipped += 1
                continue
            result_type, file_id = result
            if result_type == "inserted":
                inserted += 1
            elif result_type == "unchanged":
                unchanged += 1
            else:
                # Any other label is counted as an update.
                updated += 1
            self._vectorize_file(
                file_id,
                file_metadata.get("file_path") or file_metadata.get("path"),
                result_type=result_type,
            )
        self.db.conn.commit()
        return inserted, updated, skipped, unchanged
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File system scanner with exclusion support.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Dict, Generator, List, Optional
|
|
11
|
+
|
|
12
|
+
from ..utils.hash_utils import compute_md5, compute_sha256
|
|
13
|
+
from ..utils.time import UTC_FMT
|
|
14
|
+
|
|
15
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_creation_time(stat_result: os.stat_result) -> float:
    """Pick the most trustworthy creation timestamp from a stat result.

    ``st_birthtime`` is used when the stat result exposes it with a positive
    value. Otherwise ``st_mtime`` is returned as a proxy, because ``st_ctime``
    on Linux is the inode-change time (chmod, chown), not the creation time.
    """
    birthtime = getattr(stat_result, "st_birthtime", 0)
    if birthtime > 0:
        return birthtime
    return stat_result.st_mtime
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _expand_home(pattern: str) -> str:
    """Substitute the user's home directory for a leading ``^~/`` in a regex.

    Only patterns that begin with ``^~/`` are rewritten: the ``~`` is replaced
    by the regex-escaped home directory and the ``^`` anchor is kept. Every
    other pattern is returned untouched.
    """
    if not pattern.startswith("^~/"):
        return pattern
    escaped_home = re.escape(os.path.expanduser("~"))
    remainder = pattern[2:]  # keeps the "/" that followed "~"
    return "^" + escaped_home + remainder
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class FileScanner:
    """File system scanner with configurable exclusion patterns."""

    def __init__(self, config: Dict, since_datetime: Optional[datetime] = None):
        """
        Initialize file scanner.

        Args:
            config: Configuration dictionary. The keys read here are
                ``exclusions.always``, ``exclusions.sensitive``,
                ``indexing.supported_extensions`` and
                ``indexing.max_file_size_mb``. A section or value that is
                missing or null is treated as unset.
            since_datetime: If provided, only scan files modified after this datetime
                (for incremental indexing)
        """
        self.config = config
        self.since_datetime = since_datetime
        self.always_exclusions = self._compile_always_exclusions()
        self.sensitive_exclusions = self._compile_sensitive_exclusions()

        # "or" guards against keys that are present but null in the YAML
        # (e.g. a blank "max_file_size_mb:"), which would otherwise raise.
        indexing = config.get("indexing") or {}
        self.supported_extensions = set(indexing.get("supported_extensions") or [])
        # 0 = no size limit (matches config.example.yaml)
        self.max_file_size = (indexing.get("max_file_size_mb") or 0) * 1024 * 1024

    def _compile_exclusions(self, group: str) -> List[re.Pattern]:
        """Compile the regex list stored under ``exclusions.<group>``."""
        exclusions = self.config.get("exclusions") or {}
        return [re.compile(_expand_home(pattern)) for pattern in exclusions.get(group) or []]

    def _compile_always_exclusions(self) -> List[re.Pattern]:
        """Compile 'always' exclusion patterns (apply to all folders)."""
        return self._compile_exclusions("always")

    def _compile_sensitive_exclusions(self) -> List[re.Pattern]:
        """Compile sensitive exclusion patterns (apply to all folders)."""
        return self._compile_exclusions("sensitive")

    def should_exclude(self, file_path: str) -> bool:
        """
        Check if file should be excluded based on patterns.

        v3 Architecture (2026-01): Index ALL files in ~/Work and ~/Personal.
        Hidden files are indexed with status='hidden', not excluded.

        Only exclude:
        - always: Regeneratable dependencies (node_modules, venv, .git internals)
        - sensitive: Credentials and keys (.aws, .ssh, .kube)

        REMOVED: client_hidden exclusions - hidden files are now indexed with
        status='hidden' and filtered in the Web UI, not at scan time.

        Args:
            file_path: Path string searched (``re.search``) by every pattern.

        Returns:
            True if any 'always' or 'sensitive' pattern matches, else False.
        """
        # Check 'always' exclusions (node_modules, venv, .git internals, etc.)
        # These are regeneratable dependencies and system noise
        for pattern in self.always_exclusions:
            if pattern.search(file_path):
                logger.debug(f"Excluding {file_path} (always pattern)")
                return True

        # Check sensitive exclusions (.aws, .ssh, .kube - credentials)
        # These should never be indexed for security
        for pattern in self.sensitive_exclusions:
            if pattern.search(file_path):
                logger.debug(f"Excluding {file_path} (sensitive pattern)")
                return True

        # NOTE: Hidden files (starting with .) are NOT excluded
        # They are indexed with status='hidden' in the database

        return False

    def is_supported_file(self, file_path: Path) -> bool:
        """Check if file type is supported.

        An empty ``supported_extensions`` set means no filtering; otherwise
        the lower-cased suffix must be in the set.
        """
        if not self.supported_extensions:
            return True  # No filter = support all
        return file_path.suffix.lower() in self.supported_extensions

    def get_file_metadata(self, file_path: Path) -> Optional[Dict]:
        """Extract file metadata.

        Returns:
            A dict with path, name, type, size, UTC-formatted timestamps,
            SHA-256 and MD5 hashes and the permission bits, or None when the
            file exceeds ``max_file_size`` or reading it raises
            ``OSError``/``OverflowError``.
        """
        try:
            stat = file_path.stat()

            # Skip if too large (0 = no limit)
            if self.max_file_size > 0 and stat.st_size > self.max_file_size:
                logger.debug(f"Skipping {file_path} (too large: {stat.st_size} bytes)")
                return None

            file_path_str = str(file_path.absolute())

            # Calculate both hashes:
            # - SHA-256 for content deduplication
            # - MD5 for Google Drive matching (Drive uses MD5)
            sha256_hash = compute_sha256(file_path_str)
            md5_hash = compute_md5(file_path_str)

            return {
                "file_path": file_path_str,
                "file_name": file_path.name,
                "file_type": file_path.suffix.lower() or "no_extension",
                "file_size": stat.st_size,
                "created_at": datetime.fromtimestamp(_get_creation_time(stat), tz=timezone.utc).strftime(UTC_FMT),
                "modified_at": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).strftime(UTC_FMT),
                "accessed_at": datetime.fromtimestamp(stat.st_atime, tz=timezone.utc).strftime(UTC_FMT),
                "sha256_hash": sha256_hash,
                "md5_hash": md5_hash,
                "metadata": {
                    # Last three octal digits of the mode (rwx for user/group/other).
                    "permissions": oct(stat.st_mode)[-3:],
                },
            }
        except (OSError, OverflowError) as e:
            logger.error(f"Error reading metadata for {file_path}: {e}")
            return None

    def scan_directory(self, directory: str) -> Generator[Dict, None, None]:
        """
        Scan directory and yield file metadata.

        Walks ``directory`` (following symlinks), skipping excluded paths,
        broken symlinks, paths whose resolved target was already seen, and
        unsupported extensions. Yields one metadata dict per remaining file.
        If since_datetime is set, only yields files modified after that time.
        """
        directory_path = Path(directory).expanduser().resolve()

        if not directory_path.exists():
            logger.error(f"Directory does not exist: {directory_path}")
            return

        if not directory_path.is_dir():
            logger.error(f"Path is not a directory: {directory_path}")
            return

        # Normalize the cutoff once: a naive datetime is interpreted as UTC so
        # it can be compared with the timezone-aware mtimes built below.
        since = None
        if self.since_datetime:
            logger.info(f"Scanning directory: {directory_path} (incremental since {self.since_datetime})")
            since = (
                self.since_datetime.replace(tzinfo=timezone.utc)
                if self.since_datetime.tzinfo is None
                else self.since_datetime
            )
        else:
            logger.info(f"Scanning directory: {directory_path}")

        file_count = 0
        excluded_count = 0
        skipped_unchanged = 0
        error_count = 0
        seen_real_paths: set[str] = set()

        try:
            for root, dirs, files in os.walk(directory_path, followlinks=True):
                # Check if current directory should be excluded
                # Also check resolved path for directory symlinks
                root_path = Path(root)
                if self.should_exclude(root):
                    dirs[:] = []  # Don't recurse into this directory
                    excluded_count += len(files)
                    continue
                if root_path.is_symlink():
                    resolved_root = str(root_path.resolve())
                    # A repeated target also breaks symlink cycles, since
                    # followlinks=True would otherwise keep descending.
                    if self.should_exclude(resolved_root) or resolved_root in seen_real_paths:
                        dirs[:] = []
                        excluded_count += len(files)
                        continue
                    seen_real_paths.add(resolved_root)

                for file_name in files:
                    file_path = root_path / file_name

                    # Skip broken symlinks early (before stat() call)
                    if file_path.is_symlink() and not file_path.exists():
                        logger.debug(f"Skipping broken symlink: {file_path}")
                        continue

                    # Skip excluded files (check symlink path)
                    if self.should_exclude(str(file_path)):
                        excluded_count += 1
                        continue

                    # For symlinks, also check the resolved target against exclusions
                    is_link = file_path.is_symlink()
                    real_path = str(file_path.resolve())
                    if is_link and self.should_exclude(real_path):
                        excluded_count += 1
                        continue

                    # Dedup: skip if we've already seen this real path
                    if real_path in seen_real_paths:
                        logger.debug(f"Skipping duplicate path: {file_path} -> {real_path}")
                        continue
                    seen_real_paths.add(real_path)

                    # Skip unsupported file types
                    if not self.is_supported_file(file_path):
                        continue

                    # For incremental indexing, skip files not modified since last run
                    if since is not None:
                        try:
                            mtime = datetime.fromtimestamp(file_path.stat().st_mtime, tz=timezone.utc)
                            if mtime <= since:
                                skipped_unchanged += 1
                                continue
                        except OSError:
                            logger.debug("Could not check mtime for %s, processing anyway", file_path)

                    # Get metadata
                    metadata = self.get_file_metadata(file_path)
                    if metadata:
                        file_count += 1
                        yield metadata
                    else:
                        # NOTE(review): get_file_metadata also returns None for
                        # files over the size limit, so those are counted as
                        # errors in the summary below.
                        error_count += 1

        except OSError as e:
            logger.error(f"Error scanning directory {directory_path}: {e}")
            error_count += 1

        if self.since_datetime:
            logger.info(
                f"Scan complete: {file_count} new/modified,"
                f" {skipped_unchanged} unchanged,"
                f" {excluded_count} excluded,"
                f" {error_count} errors"
            )
        else:
            logger.info(f"Scan complete: {file_count} files, {excluded_count} excluded, {error_count} errors")

    def scan_all_directories(self) -> Generator[Dict, None, None]:
        """Scan all configured directories.

        Yields the metadata dicts of ``scan_directory`` for every entry of the
        ``directories`` config list, in order. A missing or null list yields
        nothing.
        """
        for directory in self.config.get("directories") or []:
            yield from self.scan_directory(directory)
|