footprinter-cli 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138):
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +431 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/bundled/samples/hidden-client-file-sample.txt +2 -0
  19. footprinter/bundled/samples/opaque-project-file-sample.txt +2 -0
  20. footprinter/bundled/samples/visible-file-sample.txt +2 -0
  21. footprinter/cli/__init__.py +135 -0
  22. footprinter/cli/__main__.py +6 -0
  23. footprinter/cli/_common.py +327 -0
  24. footprinter/cli/_policy_helpers.py +646 -0
  25. footprinter/cli/_prompt.py +220 -0
  26. footprinter/cli/_sample_seed.py +204 -0
  27. footprinter/cli/api_cmd.py +32 -0
  28. footprinter/cli/connect.py +591 -0
  29. footprinter/cli/data.py +879 -0
  30. footprinter/cli/delete.py +128 -0
  31. footprinter/cli/ingest.py +543 -0
  32. footprinter/cli/mcp_cmd.py +750 -0
  33. footprinter/cli/mcp_setup.py +306 -0
  34. footprinter/cli/search.py +393 -0
  35. footprinter/cli/search_cmd.py +69 -0
  36. footprinter/cli/setup.py +2001 -0
  37. footprinter/cli/status.py +747 -0
  38. footprinter/cli/status_cmd.py +104 -0
  39. footprinter/cli/upsert.py +794 -0
  40. footprinter/cli/vectorize_cmd.py +215 -0
  41. footprinter/cli/view.py +322 -0
  42. footprinter/connectors/__init__.py +171 -0
  43. footprinter/connectors/config_utils.py +141 -0
  44. footprinter/db/__init__.py +37 -0
  45. footprinter/db/browser.py +198 -0
  46. footprinter/db/chats.py +602 -0
  47. footprinter/db/clients.py +307 -0
  48. footprinter/db/emails.py +279 -0
  49. footprinter/db/files.py +724 -0
  50. footprinter/db/folders.py +659 -0
  51. footprinter/db/messages.py +192 -0
  52. footprinter/db/policies.py +151 -0
  53. footprinter/db/projects.py +673 -0
  54. footprinter/db/search.py +573 -0
  55. footprinter/db/sql_utils.py +168 -0
  56. footprinter/db/status.py +320 -0
  57. footprinter/db/uploads.py +70 -0
  58. footprinter/ingest/__init__.py +0 -0
  59. footprinter/ingest/adapters/__init__.py +33 -0
  60. footprinter/ingest/adapters/browser.py +54 -0
  61. footprinter/ingest/adapters/chat.py +57 -0
  62. footprinter/ingest/adapters/ingest.py +146 -0
  63. footprinter/ingest/adapters/local_files.py +68 -0
  64. footprinter/ingest/adapters/local_folders.py +52 -0
  65. footprinter/ingest/adapters/protocol.py +174 -0
  66. footprinter/ingest/browser_indexer.py +216 -0
  67. footprinter/ingest/chat_dedup.py +156 -0
  68. footprinter/ingest/chat_indexer.py +487 -0
  69. footprinter/ingest/chat_parsers/__init__.py +8 -0
  70. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  71. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  72. footprinter/ingest/cli.py +827 -0
  73. footprinter/ingest/content_extractors.py +117 -0
  74. footprinter/ingest/database.py +36 -0
  75. footprinter/ingest/db/__init__.py +1 -0
  76. footprinter/ingest/db/connector_schema.py +47 -0
  77. footprinter/ingest/db/migration.py +315 -0
  78. footprinter/ingest/db/schema.py +1043 -0
  79. footprinter/ingest/db/security.py +6 -0
  80. footprinter/ingest/file_indexer.py +223 -0
  81. footprinter/ingest/file_scanner.py +277 -0
  82. footprinter/ingest/folder_indexer.py +226 -0
  83. footprinter/ingest/full_content_extractor.py +321 -0
  84. footprinter/ingest/orchestrator.py +112 -0
  85. footprinter/ingest/pipe_runner.py +200 -0
  86. footprinter/ingest/processing.py +165 -0
  87. footprinter/ingest/registry.py +186 -0
  88. footprinter/ingest/run_record.py +91 -0
  89. footprinter/ingest/status.py +346 -0
  90. footprinter/mcp/__init__.py +0 -0
  91. footprinter/mcp/__main__.py +5 -0
  92. footprinter/mcp/db.py +67 -0
  93. footprinter/mcp/errors.py +105 -0
  94. footprinter/mcp/extraction.py +226 -0
  95. footprinter/mcp/server.py +39 -0
  96. footprinter/mcp/tools/__init__.py +0 -0
  97. footprinter/mcp/tools/navigation.py +70 -0
  98. footprinter/mcp/tools/read.py +75 -0
  99. footprinter/mcp/tools/search.py +158 -0
  100. footprinter/mcp/tools/semantic.py +79 -0
  101. footprinter/mcp/tools/status.py +19 -0
  102. footprinter/paths.py +117 -0
  103. footprinter/permissions.py +1152 -0
  104. footprinter/semantic/__init__.py +13 -0
  105. footprinter/semantic/chunking.py +52 -0
  106. footprinter/semantic/embeddings.py +23 -0
  107. footprinter/semantic/hybrid_search.py +273 -0
  108. footprinter/semantic/vector_store.py +471 -0
  109. footprinter/services/__init__.py +49 -0
  110. footprinter/services/access_service.py +342 -0
  111. footprinter/services/chat_service.py +85 -0
  112. footprinter/services/client_service.py +267 -0
  113. footprinter/services/content_service.py +181 -0
  114. footprinter/services/email_service.py +89 -0
  115. footprinter/services/file_service.py +83 -0
  116. footprinter/services/folder_service.py +122 -0
  117. footprinter/services/includes.py +19 -0
  118. footprinter/services/ingest_service.py +231 -0
  119. footprinter/services/project_service.py +262 -0
  120. footprinter/services/roles.py +25 -0
  121. footprinter/services/search_service.py +177 -0
  122. footprinter/services/semantic_service.py +360 -0
  123. footprinter/services/status_service.py +18 -0
  124. footprinter/services/visit_service.py +65 -0
  125. footprinter/source_registry.py +194 -0
  126. footprinter/utils/__init__.py +7 -0
  127. footprinter/utils/hash_utils.py +59 -0
  128. footprinter/utils/logging_config.py +68 -0
  129. footprinter/utils/mime.py +30 -0
  130. footprinter/utils/text.py +6 -0
  131. footprinter/utils/time.py +11 -0
  132. footprinter/visibility.py +1264 -0
  133. footprinter_cli-1.0.0rc1.dist-info/LICENSE +21 -0
  134. footprinter_cli-1.0.0rc1.dist-info/METADATA +223 -0
  135. footprinter_cli-1.0.0rc1.dist-info/RECORD +138 -0
  136. footprinter_cli-1.0.0rc1.dist-info/WHEEL +5 -0
  137. footprinter_cli-1.0.0rc1.dist-info/entry_points.txt +2 -0
  138. footprinter_cli-1.0.0rc1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,6 @@
1
+ """Security and permission database methods.
2
+
3
+ Permission and visibility tables (permission_policies, visibility_policies)
4
+ and per-row columns (mcp_read, mcp_view) are defined in
5
+ schema.py. Resolution logic lives in permissions.py and visibility.py.
6
+ """
@@ -0,0 +1,223 @@
1
+ """
2
+ File indexer that coordinates file scanning and content extraction.
3
+ """
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Any, Callable, Dict, Optional
9
+
10
+ from footprinter.db import files as files_db
11
+ from footprinter.source_registry import get_config
12
+
13
+ from .content_extractors import ContentExtractor
14
+ from .database import Database
15
+ from .file_scanner import FileScanner
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class FileIndexer:
    """Coordinate file scanning, optional content previews, and DB writes.

    Files yielded by ``FileScanner`` are buffered and written to the files
    table in batches; each written file is then offered to the optional
    vector store on a best-effort basis.
    """

    def __init__(
        self,
        config_path: Optional[str] = None,
        last_run: Optional[datetime] = None,
        db: Optional["Database"] = None,
    ):
        """
        Initialize the indexer.

        Args:
            config_path: Path to the config YAML file, passed straight to
                ``get_config()``. None lets ``get_config()`` pick its default.
            last_run: Timestamp of last successful run. If set, only index files
                modified after this time. None means full scan.
            db: Optional shared Database handle. If None, creates its own.
        """
        self.config = get_config(config_path)
        self.db = db if db is not None else Database()
        # Remember whether this instance created the handle itself.
        self._owns_db = db is None
        self.incremental = last_run is not None

        if last_run:
            logger.info(f"Incremental mode: indexing files modified since {last_run}")
        else:
            logger.info("Full scan mode (no last_run provided)")

        self._vector_store = None  # lazy; False means "tried and unavailable"
        self._full_extractor = None  # lazy

        self.file_scanner = FileScanner(self.config, since_datetime=last_run)
        self.content_extractor = ContentExtractor()

    def index_files(
        self,
        relationship_maps: Optional[Dict[str, Any]] = None,
        on_progress: Optional[Callable[[int], None]] = None,
    ) -> dict:
        """Index all files from configured directories to files table.

        Args:
            relationship_maps: Optional pre-built maps forwarded unchanged to
                ``files_db.insert_file`` for project/folder resolution.
            on_progress: Optional callback fired with the cumulative number of
                scanned files after each one is handled (including files
                that raised an error).

        Returns:
            Dict with keys: inserted, updated, skipped, errors
        """
        logger.info("Starting file indexing to files...")

        inserted_count = 0
        updated_count = 0
        skipped_count = 0
        error_count = 0
        total_processed = 0
        batch = []
        batch_size = 1000  # Commit every 1000 files for performance
        self._indexed_paths = set()  # Track all indexed paths for stale detection

        # The opt-in flag cannot change mid-scan, so read it once. ``or {}``
        # tolerates an ``indexing:`` key that is present but empty (None).
        snippets_enabled = bool((self.config.get("indexing") or {}).get("content_snippets", False))

        for file_metadata in self.file_scanner.scan_all_directories():
            try:
                # Extract content preview (only when opt-in enabled)
                if snippets_enabled:
                    file_path = Path(file_metadata["file_path"])
                    file_metadata["content_preview"] = self.content_extractor.extract(file_path)
                else:
                    file_metadata["content_preview"] = None

                batch.append(file_metadata)
                self._indexed_paths.add(file_metadata["file_path"])

                # Batch insert for performance
                if len(batch) >= batch_size:
                    # Detach the batch BEFORE flushing. If the flush raises,
                    # the buffer must not be kept: otherwise the same
                    # (growing) batch would be re-inserted and fail again
                    # for every subsequent file.
                    pending, batch = batch, []
                    bi, bu, bs = self._insert_batch(pending, relationship_maps)
                    inserted_count += bi
                    updated_count += bu
                    skipped_count += bs
                    logger.info(
                        f"Progress: {inserted_count:,} inserted,"
                        f" {updated_count:,} updated,"
                        f" {skipped_count:,} skipped..."
                    )

            except Exception as e:  # Intentional broad catch: batch loop must not abort on single-item errors
                logger.error(f"Error indexing file {file_metadata.get('file_path')}: {e}")
                error_count += 1
            finally:
                total_processed += 1
                if on_progress is not None:
                    on_progress(total_processed)

        # Insert remaining files (errors here propagate to the caller)
        if batch:
            bi, bu, bs = self._insert_batch(batch, relationship_maps)
            inserted_count += bi
            updated_count += bu
            skipped_count += bs

        # Mark stale files (no longer on disk)
        # Only do this in full mode - incremental mode only scans modified files
        if not self.incremental:
            removed_ids = files_db.mark_removed_files(self.db.conn, self._indexed_paths)
            if removed_ids:
                store = self._get_vector_store()
                if store:
                    for file_id in removed_ids:
                        try:
                            store.delete_file(file_id)
                        except Exception:  # Intentional broad catch: vector cleanup is best-effort
                            logger.warning("Failed to delete vectors for removed file_id=%s", file_id, exc_info=True)
                logger.info(f"Marked {len(removed_ids):,} files as stale (no longer on disk)")
        else:
            logger.info("Skipping stale detection in incremental mode")

        logger.info(
            f"File indexing complete: {inserted_count:,} inserted,"
            f" {updated_count:,} updated,"
            f" {skipped_count:,} skipped,"
            f" {error_count:,} errors"
        )
        return {
            "inserted": inserted_count,
            "updated": updated_count,
            "skipped": skipped_count,
            "errors": error_count,
        }

    def _get_vector_store(self):
        """Return the shared VectorStore, or None when it cannot be loaded.

        The first failure is cached as ``False`` so later calls do not retry
        the import/initialisation.
        """
        if self._vector_store is None:
            try:
                from footprinter.semantic.vector_store import VectorStore

                self._vector_store = VectorStore.get_instance()
            except Exception as e:  # Intentional broad catch: vector store is optional; any init failure disables it
                logger.warning("Vector store unavailable: %s", e)
                self._vector_store = False  # sentinel: don't retry
        return self._vector_store if self._vector_store is not False else None

    def _vectorize_file(self, file_id, file_path):
        """Best-effort: chunk one file and upsert its chunks into the vector store.

        Returns silently when vectorization is unavailable or disabled, when
        the file's metadata sets ``vectorize`` to 0, when the path no longer
        exists, or when extraction yields no chunks. On success the files row
        gets ``vectorized_at`` and ``vectorized_chunks``; the caller commits.

        Args:
            file_id: Primary key of the row in the files table.
            file_path: Path of the file on disk.
        """
        try:
            from footprinter.semantic.vector_store import _file_vectorization_enabled
        except ImportError:
            return
        if not _file_vectorization_enabled():
            return
        # Check per-record vectorize flag (missing flag defaults to 1 = enabled)
        # NOTE(review): row["vec"] assumes the connection returns mapping-style rows.
        row = self.db.conn.execute(
            "SELECT COALESCE(json_extract(metadata, '$.vectorize'), 1) as vec FROM files WHERE id = ?",
            (file_id,),
        ).fetchone()
        if row and row["vec"] == 0:
            return
        store = self._get_vector_store()
        if not store:
            return
        try:
            path = Path(file_path)
            if not path.exists():
                return
            if self._full_extractor is None:
                from .full_content_extractor import FullContentExtractor

                self._full_extractor = FullContentExtractor.from_config(self.config)
            chunks = self._full_extractor.extract_with_chunking(path)
            if not chunks:
                return
            metadata = {"file_type": path.suffix.lower(), "file_name": path.name}
            store.upsert_file(file_id, file_path, chunks, metadata)
            self.db.conn.execute(
                "UPDATE files SET vectorized_at = CURRENT_TIMESTAMP, vectorized_chunks = ? WHERE id = ?",
                (len(chunks), file_id),
            )
        except Exception as e:  # Intentional broad catch: file vectorization is optional enhancement
            logger.debug(f"Vectorization skipped for {file_path}: {e}")

    def _insert_batch(
        self,
        batch,
        relationship_maps: Optional[Dict[str, Any]] = None,
    ) -> tuple:
        """Insert a batch of files into files table and commit once.

        ``files_db.insert_file`` returning None is counted as skipped;
        otherwise it returns ``(result_type, file_id)`` and the file is
        offered to the vector store.

        Returns:
            Tuple of (inserted, updated, skipped) counts
        """
        inserted = 0
        updated = 0
        skipped = 0
        for file_metadata in batch:
            result = files_db.insert_file(self.db.conn, file_metadata, relationship_maps=relationship_maps)
            if result is None:
                skipped += 1
                continue
            result_type, file_id = result
            if result_type == "inserted":
                inserted += 1
            else:
                updated += 1
            self._vectorize_file(
                file_id,
                file_metadata.get("file_path") or file_metadata.get("path"),
            )
        self.db.conn.commit()
        return inserted, updated, skipped
@@ -0,0 +1,277 @@
1
+ """
2
+ File system scanner with exclusion support.
3
+ """
4
+
5
+ import logging
6
+ import os
7
+ import re
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Dict, Generator, List, Optional
11
+
12
+ from ..utils.hash_utils import compute_md5, compute_sha256
13
+ from ..utils.time import UTC_FMT
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def _get_creation_time(stat_result: os.stat_result) -> float:
    """Pick the most trustworthy creation timestamp from a stat result.

    ``st_birthtime`` is used when the platform exposes it and it is
    positive. Otherwise ``st_mtime`` is returned: ``st_ctime`` is avoided
    because on Linux it records inode changes (chmod, chown), not creation.
    """
    birthtime = getattr(stat_result, "st_birthtime", 0)
    if birthtime > 0:
        return birthtime
    return stat_result.st_mtime
29
+
30
+
31
def _expand_home(pattern: str) -> str:
    """Substitute the user's home directory for a leading ``^~/`` in a regex.

    Only patterns anchored as ``^~/`` are rewritten: the ``~`` is replaced
    with the regex-escaped home path and the ``^`` anchor is kept. Every
    other pattern is returned unchanged.
    """
    if not pattern.startswith("^~/"):
        return pattern
    escaped_home = re.escape(os.path.expanduser("~"))
    remainder = pattern[2:]  # keeps the "/" that followed the tilde
    return f"^{escaped_home}{remainder}"
41
+
42
+
43
class FileScanner:
    """File system scanner with configurable exclusion patterns."""

    def __init__(self, config: Dict, since_datetime: Optional[datetime] = None):
        """
        Initialize file scanner.

        Config sections or keys that are present but empty (None, as an
        empty YAML key parses) are treated the same as missing ones.

        Args:
            config: Configuration dictionary
            since_datetime: If provided, only scan files modified after this datetime
                (for incremental indexing)
        """
        self.config = config
        self.since_datetime = since_datetime
        self.always_exclusions = self._compile_always_exclusions()
        self.sensitive_exclusions = self._compile_sensitive_exclusions()
        indexing = config.get("indexing") or {}
        self.supported_extensions = set(indexing.get("supported_extensions") or [])
        # 0 = no size limit (matches config.example.yaml)
        self.max_file_size = (indexing.get("max_file_size_mb") or 0) * 1024 * 1024

    def _compile_exclusions(self, group: str) -> List[re.Pattern]:
        """Compile the regex list stored under ``exclusions.<group>`` in the config."""
        exclusions = self.config.get("exclusions") or {}
        return [re.compile(_expand_home(pattern)) for pattern in exclusions.get(group) or []]

    def _compile_always_exclusions(self) -> List[re.Pattern]:
        """Compile 'always' exclusion patterns (apply to all folders)."""
        return self._compile_exclusions("always")

    def _compile_sensitive_exclusions(self) -> List[re.Pattern]:
        """Compile sensitive exclusion patterns (apply to all folders)."""
        return self._compile_exclusions("sensitive")

    def should_exclude(self, file_path: str) -> bool:
        """
        Check if a path matches any exclusion pattern.

        Two pattern groups are consulted, in order:
        - always: e.g. regeneratable dependencies (node_modules, venv, .git internals)
        - sensitive: e.g. credentials and keys (.aws, .ssh, .kube)

        Hidden (dot-prefixed) files are NOT filtered here; only the
        configured patterns decide.

        Args:
            file_path: Path string; patterns are applied with ``re.search``.

        Returns:
            True if any pattern matches, otherwise False.
        """
        for pattern in self.always_exclusions:
            if pattern.search(file_path):
                logger.debug(f"Excluding {file_path} (always pattern)")
                return True

        for pattern in self.sensitive_exclusions:
            if pattern.search(file_path):
                logger.debug(f"Excluding {file_path} (sensitive pattern)")
                return True

        return False

    def is_supported_file(self, file_path: Path) -> bool:
        """Check if file type is supported (an empty extension set allows everything)."""
        if not self.supported_extensions:
            return True  # No filter = support all
        return file_path.suffix.lower() in self.supported_extensions

    def get_file_metadata(self, file_path: Path) -> Optional[Dict]:
        """Extract file metadata.

        Returns:
            A metadata dict (path, name, type, size, UTC timestamps, hashes,
            permission bits), or None when the file exceeds the size limit
            or its metadata cannot be read.
        """
        try:
            stat = file_path.stat()

            # Skip if too large (0 = no limit)
            if self.max_file_size > 0 and stat.st_size > self.max_file_size:
                logger.debug(f"Skipping {file_path} (too large: {stat.st_size} bytes)")
                return None

            file_path_str = str(file_path.absolute())

            # Calculate both hashes:
            # - SHA-256 for content deduplication
            # - MD5 for Google Drive matching (Drive uses MD5)
            sha256_hash = compute_sha256(file_path_str)
            md5_hash = compute_md5(file_path_str)

            return {
                "file_path": file_path_str,
                "file_name": file_path.name,
                "file_type": file_path.suffix.lower() or "no_extension",
                "file_size": stat.st_size,
                "created_at": datetime.fromtimestamp(_get_creation_time(stat), tz=timezone.utc).strftime(UTC_FMT),
                "modified_at": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).strftime(UTC_FMT),
                "accessed_at": datetime.fromtimestamp(stat.st_atime, tz=timezone.utc).strftime(UTC_FMT),
                "sha256_hash": sha256_hash,
                "md5_hash": md5_hash,
                "metadata": {
                    # Last three octal digits of the mode, e.g. "644"
                    "permissions": oct(stat.st_mode)[-3:],
                },
            }
        except (OSError, OverflowError, ValueError) as e:
            # ValueError: fromtimestamp() rejects timestamps whose year is out
            # of range; without this one corrupt timestamp aborts the scan.
            logger.error(f"Error reading metadata for {file_path}: {e}")
            return None

    def scan_directory(self, directory: str) -> Generator[Dict, None, None]:
        """
        Scan directory and yield file metadata.

        Walks the tree following symlinks, prunes excluded directories,
        skips broken symlinks, excluded files, already-seen real paths and
        unsupported types, then yields one metadata dict per remaining file.
        If since_datetime is set, only yields files modified after that time.
        """
        directory_path = Path(directory).expanduser().resolve()

        if not directory_path.exists():
            logger.error(f"Directory does not exist: {directory_path}")
            return

        if not directory_path.is_dir():
            logger.error(f"Path is not a directory: {directory_path}")
            return

        if self.since_datetime:
            logger.info(f"Scanning directory: {directory_path} (incremental since {self.since_datetime})")
        else:
            logger.info(f"Scanning directory: {directory_path}")

        # Normalise the cutoff once: a naive datetime is taken to be UTC.
        since = None
        if self.since_datetime:
            since = (
                self.since_datetime.replace(tzinfo=timezone.utc)
                if self.since_datetime.tzinfo is None
                else self.since_datetime
            )

        file_count = 0
        excluded_count = 0
        skipped_unchanged = 0
        error_count = 0
        seen_real_paths: set[str] = set()

        try:
            for root, dirs, files in os.walk(directory_path, followlinks=True):
                # Check if current directory should be excluded
                root_path = Path(root)
                if self.should_exclude(root):
                    dirs[:] = []  # Don't recurse into this directory
                    excluded_count += len(files)
                    continue
                # For directory symlinks, also check the resolved target and
                # refuse to enter the same target twice.
                if root_path.is_symlink():
                    resolved_root = str(root_path.resolve())
                    if self.should_exclude(resolved_root) or resolved_root in seen_real_paths:
                        dirs[:] = []
                        excluded_count += len(files)
                        continue
                    seen_real_paths.add(resolved_root)

                for file_name in files:
                    file_path = root_path / file_name

                    # Skip broken symlinks early (before stat() call)
                    if file_path.is_symlink() and not file_path.exists():
                        logger.debug(f"Skipping broken symlink: {file_path}")
                        continue

                    # Skip excluded files (check symlink path)
                    if self.should_exclude(str(file_path)):
                        excluded_count += 1
                        continue

                    # For symlinks, also check the resolved target against exclusions
                    real_path = str(file_path.resolve())
                    if file_path.is_symlink() and self.should_exclude(real_path):
                        excluded_count += 1
                        continue

                    # Dedup: skip if we've already seen this real path
                    if real_path in seen_real_paths:
                        logger.debug(f"Skipping duplicate path: {file_path} -> {real_path}")
                        continue
                    seen_real_paths.add(real_path)

                    # Skip unsupported file types
                    if not self.is_supported_file(file_path):
                        continue

                    # For incremental indexing, skip files not modified since last run
                    if since is not None:
                        try:
                            mtime = datetime.fromtimestamp(file_path.stat().st_mtime, tz=timezone.utc)
                            if mtime <= since:
                                skipped_unchanged += 1
                                continue
                        except (OSError, OverflowError, ValueError):
                            # Unreadable or out-of-range mtime: fall through and
                            # let get_file_metadata() decide.
                            logger.debug("Could not check mtime for %s, processing anyway", file_path)

                    # Get metadata
                    metadata = self.get_file_metadata(file_path)
                    if metadata:
                        file_count += 1
                        yield metadata
                    else:
                        error_count += 1

        except OSError as e:
            logger.error(f"Error scanning directory {directory_path}: {e}")
            error_count += 1

        if self.since_datetime:
            logger.info(
                f"Scan complete: {file_count} new/modified,"
                f" {skipped_unchanged} unchanged,"
                f" {excluded_count} excluded,"
                f" {error_count} errors"
            )
        else:
            logger.info(f"Scan complete: {file_count} files, {excluded_count} excluded, {error_count} errors")

    def scan_all_directories(self) -> Generator[Dict, None, None]:
        """Scan all configured directories (a missing or empty list yields nothing)."""
        directories = self.config.get("directories") or []

        for directory in directories:
            yield from self.scan_directory(directory)