basic-memory 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. basic_memory/__init__.py +7 -0
  2. basic_memory/alembic/alembic.ini +119 -0
  3. basic_memory/alembic/env.py +185 -0
  4. basic_memory/alembic/migrations.py +24 -0
  5. basic_memory/alembic/script.py.mako +26 -0
  6. basic_memory/alembic/versions/314f1ea54dc4_add_postgres_full_text_search_support_.py +131 -0
  7. basic_memory/alembic/versions/3dae7c7b1564_initial_schema.py +93 -0
  8. basic_memory/alembic/versions/502b60eaa905_remove_required_from_entity_permalink.py +51 -0
  9. basic_memory/alembic/versions/5fe1ab1ccebe_add_projects_table.py +120 -0
  10. basic_memory/alembic/versions/647e7a75e2cd_project_constraint_fix.py +112 -0
  11. basic_memory/alembic/versions/9d9c1cb7d8f5_add_mtime_and_size_columns_to_entity_.py +49 -0
  12. basic_memory/alembic/versions/a1b2c3d4e5f6_fix_project_foreign_keys.py +49 -0
  13. basic_memory/alembic/versions/a2b3c4d5e6f7_add_search_index_entity_cascade.py +56 -0
  14. basic_memory/alembic/versions/b3c3938bacdb_relation_to_name_unique_index.py +44 -0
  15. basic_memory/alembic/versions/cc7172b46608_update_search_index_schema.py +113 -0
  16. basic_memory/alembic/versions/e7e1f4367280_add_scan_watermark_tracking_to_project.py +37 -0
  17. basic_memory/alembic/versions/f8a9b2c3d4e5_add_pg_trgm_for_fuzzy_link_resolution.py +239 -0
  18. basic_memory/api/__init__.py +5 -0
  19. basic_memory/api/app.py +131 -0
  20. basic_memory/api/routers/__init__.py +11 -0
  21. basic_memory/api/routers/directory_router.py +84 -0
  22. basic_memory/api/routers/importer_router.py +152 -0
  23. basic_memory/api/routers/knowledge_router.py +318 -0
  24. basic_memory/api/routers/management_router.py +80 -0
  25. basic_memory/api/routers/memory_router.py +90 -0
  26. basic_memory/api/routers/project_router.py +448 -0
  27. basic_memory/api/routers/prompt_router.py +260 -0
  28. basic_memory/api/routers/resource_router.py +249 -0
  29. basic_memory/api/routers/search_router.py +36 -0
  30. basic_memory/api/routers/utils.py +169 -0
  31. basic_memory/api/template_loader.py +292 -0
  32. basic_memory/api/v2/__init__.py +35 -0
  33. basic_memory/api/v2/routers/__init__.py +21 -0
  34. basic_memory/api/v2/routers/directory_router.py +93 -0
  35. basic_memory/api/v2/routers/importer_router.py +182 -0
  36. basic_memory/api/v2/routers/knowledge_router.py +413 -0
  37. basic_memory/api/v2/routers/memory_router.py +130 -0
  38. basic_memory/api/v2/routers/project_router.py +342 -0
  39. basic_memory/api/v2/routers/prompt_router.py +270 -0
  40. basic_memory/api/v2/routers/resource_router.py +286 -0
  41. basic_memory/api/v2/routers/search_router.py +73 -0
  42. basic_memory/cli/__init__.py +1 -0
  43. basic_memory/cli/app.py +84 -0
  44. basic_memory/cli/auth.py +277 -0
  45. basic_memory/cli/commands/__init__.py +18 -0
  46. basic_memory/cli/commands/cloud/__init__.py +6 -0
  47. basic_memory/cli/commands/cloud/api_client.py +112 -0
  48. basic_memory/cli/commands/cloud/bisync_commands.py +110 -0
  49. basic_memory/cli/commands/cloud/cloud_utils.py +101 -0
  50. basic_memory/cli/commands/cloud/core_commands.py +195 -0
  51. basic_memory/cli/commands/cloud/rclone_commands.py +371 -0
  52. basic_memory/cli/commands/cloud/rclone_config.py +110 -0
  53. basic_memory/cli/commands/cloud/rclone_installer.py +263 -0
  54. basic_memory/cli/commands/cloud/upload.py +233 -0
  55. basic_memory/cli/commands/cloud/upload_command.py +124 -0
  56. basic_memory/cli/commands/command_utils.py +77 -0
  57. basic_memory/cli/commands/db.py +44 -0
  58. basic_memory/cli/commands/format.py +198 -0
  59. basic_memory/cli/commands/import_chatgpt.py +84 -0
  60. basic_memory/cli/commands/import_claude_conversations.py +87 -0
  61. basic_memory/cli/commands/import_claude_projects.py +86 -0
  62. basic_memory/cli/commands/import_memory_json.py +87 -0
  63. basic_memory/cli/commands/mcp.py +76 -0
  64. basic_memory/cli/commands/project.py +889 -0
  65. basic_memory/cli/commands/status.py +174 -0
  66. basic_memory/cli/commands/telemetry.py +81 -0
  67. basic_memory/cli/commands/tool.py +341 -0
  68. basic_memory/cli/main.py +28 -0
  69. basic_memory/config.py +616 -0
  70. basic_memory/db.py +394 -0
  71. basic_memory/deps.py +705 -0
  72. basic_memory/file_utils.py +478 -0
  73. basic_memory/ignore_utils.py +297 -0
  74. basic_memory/importers/__init__.py +27 -0
  75. basic_memory/importers/base.py +79 -0
  76. basic_memory/importers/chatgpt_importer.py +232 -0
  77. basic_memory/importers/claude_conversations_importer.py +180 -0
  78. basic_memory/importers/claude_projects_importer.py +148 -0
  79. basic_memory/importers/memory_json_importer.py +108 -0
  80. basic_memory/importers/utils.py +61 -0
  81. basic_memory/markdown/__init__.py +21 -0
  82. basic_memory/markdown/entity_parser.py +279 -0
  83. basic_memory/markdown/markdown_processor.py +160 -0
  84. basic_memory/markdown/plugins.py +242 -0
  85. basic_memory/markdown/schemas.py +70 -0
  86. basic_memory/markdown/utils.py +117 -0
  87. basic_memory/mcp/__init__.py +1 -0
  88. basic_memory/mcp/async_client.py +139 -0
  89. basic_memory/mcp/project_context.py +141 -0
  90. basic_memory/mcp/prompts/__init__.py +19 -0
  91. basic_memory/mcp/prompts/ai_assistant_guide.py +70 -0
  92. basic_memory/mcp/prompts/continue_conversation.py +62 -0
  93. basic_memory/mcp/prompts/recent_activity.py +188 -0
  94. basic_memory/mcp/prompts/search.py +57 -0
  95. basic_memory/mcp/prompts/utils.py +162 -0
  96. basic_memory/mcp/resources/ai_assistant_guide.md +283 -0
  97. basic_memory/mcp/resources/project_info.py +71 -0
  98. basic_memory/mcp/server.py +81 -0
  99. basic_memory/mcp/tools/__init__.py +48 -0
  100. basic_memory/mcp/tools/build_context.py +120 -0
  101. basic_memory/mcp/tools/canvas.py +152 -0
  102. basic_memory/mcp/tools/chatgpt_tools.py +190 -0
  103. basic_memory/mcp/tools/delete_note.py +242 -0
  104. basic_memory/mcp/tools/edit_note.py +324 -0
  105. basic_memory/mcp/tools/list_directory.py +168 -0
  106. basic_memory/mcp/tools/move_note.py +551 -0
  107. basic_memory/mcp/tools/project_management.py +201 -0
  108. basic_memory/mcp/tools/read_content.py +281 -0
  109. basic_memory/mcp/tools/read_note.py +267 -0
  110. basic_memory/mcp/tools/recent_activity.py +534 -0
  111. basic_memory/mcp/tools/search.py +385 -0
  112. basic_memory/mcp/tools/utils.py +540 -0
  113. basic_memory/mcp/tools/view_note.py +78 -0
  114. basic_memory/mcp/tools/write_note.py +230 -0
  115. basic_memory/models/__init__.py +15 -0
  116. basic_memory/models/base.py +10 -0
  117. basic_memory/models/knowledge.py +226 -0
  118. basic_memory/models/project.py +87 -0
  119. basic_memory/models/search.py +85 -0
  120. basic_memory/repository/__init__.py +11 -0
  121. basic_memory/repository/entity_repository.py +503 -0
  122. basic_memory/repository/observation_repository.py +73 -0
  123. basic_memory/repository/postgres_search_repository.py +379 -0
  124. basic_memory/repository/project_info_repository.py +10 -0
  125. basic_memory/repository/project_repository.py +128 -0
  126. basic_memory/repository/relation_repository.py +146 -0
  127. basic_memory/repository/repository.py +385 -0
  128. basic_memory/repository/search_index_row.py +95 -0
  129. basic_memory/repository/search_repository.py +94 -0
  130. basic_memory/repository/search_repository_base.py +241 -0
  131. basic_memory/repository/sqlite_search_repository.py +439 -0
  132. basic_memory/schemas/__init__.py +86 -0
  133. basic_memory/schemas/base.py +297 -0
  134. basic_memory/schemas/cloud.py +50 -0
  135. basic_memory/schemas/delete.py +37 -0
  136. basic_memory/schemas/directory.py +30 -0
  137. basic_memory/schemas/importer.py +35 -0
  138. basic_memory/schemas/memory.py +285 -0
  139. basic_memory/schemas/project_info.py +212 -0
  140. basic_memory/schemas/prompt.py +90 -0
  141. basic_memory/schemas/request.py +112 -0
  142. basic_memory/schemas/response.py +229 -0
  143. basic_memory/schemas/search.py +117 -0
  144. basic_memory/schemas/sync_report.py +72 -0
  145. basic_memory/schemas/v2/__init__.py +27 -0
  146. basic_memory/schemas/v2/entity.py +129 -0
  147. basic_memory/schemas/v2/resource.py +46 -0
  148. basic_memory/services/__init__.py +8 -0
  149. basic_memory/services/context_service.py +601 -0
  150. basic_memory/services/directory_service.py +308 -0
  151. basic_memory/services/entity_service.py +864 -0
  152. basic_memory/services/exceptions.py +37 -0
  153. basic_memory/services/file_service.py +541 -0
  154. basic_memory/services/initialization.py +216 -0
  155. basic_memory/services/link_resolver.py +121 -0
  156. basic_memory/services/project_service.py +880 -0
  157. basic_memory/services/search_service.py +404 -0
  158. basic_memory/services/service.py +15 -0
  159. basic_memory/sync/__init__.py +6 -0
  160. basic_memory/sync/background_sync.py +26 -0
  161. basic_memory/sync/sync_service.py +1259 -0
  162. basic_memory/sync/watch_service.py +510 -0
  163. basic_memory/telemetry.py +249 -0
  164. basic_memory/templates/prompts/continue_conversation.hbs +110 -0
  165. basic_memory/templates/prompts/search.hbs +101 -0
  166. basic_memory/utils.py +468 -0
  167. basic_memory-0.17.1.dist-info/METADATA +617 -0
  168. basic_memory-0.17.1.dist-info/RECORD +171 -0
  169. basic_memory-0.17.1.dist-info/WHEEL +4 -0
  170. basic_memory-0.17.1.dist-info/entry_points.txt +3 -0
  171. basic_memory-0.17.1.dist-info/licenses/LICENSE +661 -0
basic_memory/sync/sync_service.py
@@ -0,0 +1,1259 @@
1
+ """Service for syncing files between filesystem and database."""
2
+
3
+ import asyncio
4
+ import os
5
+ import sys
6
+ import time
7
+ from collections import OrderedDict
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import AsyncIterator, Dict, List, Optional, Set, Tuple
12
+
13
+ import aiofiles.os
14
+
15
+ from loguru import logger
16
+ from sqlalchemy.exc import IntegrityError
17
+
18
+ from basic_memory import db
19
+ from basic_memory.config import BasicMemoryConfig, ConfigManager
20
+ from basic_memory.file_utils import has_frontmatter
21
+ from basic_memory.ignore_utils import load_bmignore_patterns, should_ignore_path
22
+ from basic_memory.markdown import EntityParser, MarkdownProcessor
23
+ from basic_memory.models import Entity, Project
24
+ from basic_memory.repository import (
25
+ EntityRepository,
26
+ RelationRepository,
27
+ ObservationRepository,
28
+ ProjectRepository,
29
+ )
30
+ from basic_memory.repository.search_repository import create_search_repository
31
+ from basic_memory.services import EntityService, FileService
32
+ from basic_memory.services.exceptions import SyncFatalError
33
+ from basic_memory.services.link_resolver import LinkResolver
34
+ from basic_memory.services.search_service import SearchService
35
+
36
+ # Circuit breaker configuration
37
+ MAX_CONSECUTIVE_FAILURES = 3
38
+
39
+
40
+ @dataclass
41
+ class FileFailureInfo:
42
+ """Track failure information for a file that repeatedly fails to sync.
43
+
44
+ Attributes:
45
+ count: Number of consecutive failures
46
+ first_failure: Timestamp of first failure in current sequence
47
+ last_failure: Timestamp of most recent failure
48
+ last_error: Error message from most recent failure
49
+ last_checksum: Checksum of file when it last failed (for detecting file changes)
50
+ """
51
+
52
+ count: int
53
+ first_failure: datetime
54
+ last_failure: datetime
55
+ last_error: str
56
+ last_checksum: str
57
+
58
+
59
+ @dataclass
60
+ class SkippedFile:
61
+ """Information about a file that was skipped due to repeated failures.
62
+
63
+ Attributes:
64
+ path: File path relative to project root
65
+ reason: Error message from last failure
66
+ failure_count: Number of consecutive failures
67
+ first_failed: Timestamp of first failure
68
+ """
69
+
70
+ path: str
71
+ reason: str
72
+ failure_count: int
73
+ first_failed: datetime
74
+
75
+
76
+ @dataclass
77
+ class SyncReport:
78
+ """Report of file changes found compared to database state.
79
+
80
+ Attributes:
81
+ total: Total number of changes detected (new + modified + deleted + moves)
82
+ new: Files that exist on disk but not in database
83
+ modified: Files that exist in both but have different checksums
84
+ deleted: Files that exist in database but not on disk
85
+ moves: Files that have been moved from one location to another
86
+ checksums: Checksums for new and modified files found during the scan
87
+ skipped_files: Files that were skipped due to repeated failures
88
+ """
89
+
90
+ # We keep paths as strings in sets/dicts for easier serialization
91
+ new: Set[str] = field(default_factory=set)
92
+ modified: Set[str] = field(default_factory=set)
93
+ deleted: Set[str] = field(default_factory=set)
94
+ moves: Dict[str, str] = field(default_factory=dict) # old_path -> new_path
95
+ checksums: Dict[str, str] = field(default_factory=dict) # path -> checksum
96
+ skipped_files: List[SkippedFile] = field(default_factory=list)
97
+
98
+ @property
99
+ def total(self) -> int:
100
+ """Total number of changes."""
101
+ return len(self.new) + len(self.modified) + len(self.deleted) + len(self.moves)
102
+
103
+
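For orientation, this is how a caller might consume a SyncReport; a minimal sketch using the dataclasses defined above, with hypothetical report values:

from datetime import datetime

from basic_memory.sync.sync_service import SkippedFile, SyncReport

# Hypothetical report, shaped the way scan()/sync() would populate it.
report = SyncReport(
    new={"notes/todo.md"},
    modified={"notes/ideas.md"},
    deleted=set(),
    moves={"old/plan.md": "new/plan.md"},
    checksums={"notes/todo.md": "abc123", "notes/ideas.md": "def456"},
    skipped_files=[
        SkippedFile(
            path="broken/corrupt.md",
            reason="invalid frontmatter",
            failure_count=3,
            first_failed=datetime.now(),
        )
    ],
)

assert report.total == 3  # new + modified + deleted + moves
for skipped in report.skipped_files:
    print(f"skipped {skipped.path}: {skipped.reason} ({skipped.failure_count} failures)")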
104
+ @dataclass
105
+ class ScanResult:
106
+ """Result of scanning a directory."""
107
+
108
+ # file_path -> checksum
109
+ files: Dict[str, str] = field(default_factory=dict)
110
+
111
+ # checksum -> file_path
112
+ checksums: Dict[str, str] = field(default_factory=dict)
113
+
114
+ # file_path -> error message
115
+ errors: Dict[str, str] = field(default_factory=dict)
116
+
117
+
118
+ class SyncService:
119
+ """Syncs documents and knowledge files with database."""
120
+
121
+ def __init__(
122
+ self,
123
+ app_config: BasicMemoryConfig,
124
+ entity_service: EntityService,
125
+ entity_parser: EntityParser,
126
+ entity_repository: EntityRepository,
127
+ relation_repository: RelationRepository,
128
+ project_repository: ProjectRepository,
129
+ search_service: SearchService,
130
+ file_service: FileService,
131
+ ):
132
+ self.app_config = app_config
133
+ self.entity_service = entity_service
134
+ self.entity_parser = entity_parser
135
+ self.entity_repository = entity_repository
136
+ self.relation_repository = relation_repository
137
+ self.project_repository = project_repository
138
+ self.search_service = search_service
139
+ self.file_service = file_service
140
+ # Load ignore patterns once at initialization for performance
141
+ self._ignore_patterns = load_bmignore_patterns()
142
+ # Circuit breaker: track file failures to prevent infinite retry loops
143
+ # Use OrderedDict for LRU behavior with bounded size to prevent unbounded memory growth
144
+ self._file_failures: OrderedDict[str, FileFailureInfo] = OrderedDict()
145
+ self._max_tracked_failures = 100 # Limit failure cache size
146
+
147
+ async def _should_skip_file(self, path: str) -> bool:
148
+ """Check if file should be skipped due to repeated failures.
149
+
150
+ Computes current file checksum and compares with last failed checksum.
151
+ If checksums differ, file has changed and we should retry.
152
+
153
+ Args:
154
+ path: File path to check
155
+
156
+ Returns:
157
+ True if file should be skipped, False otherwise
158
+ """
159
+ if path not in self._file_failures:
160
+ return False
161
+
162
+ failure_info = self._file_failures[path]
163
+
164
+ # Check if failure count exceeds threshold
165
+ if failure_info.count < MAX_CONSECUTIVE_FAILURES:
166
+ return False
167
+
168
+ # Compute current checksum to see if file changed
169
+ try:
170
+ current_checksum = await self.file_service.compute_checksum(path)
171
+
172
+ # If checksum changed, file was modified - reset and retry
173
+ if current_checksum != failure_info.last_checksum:
174
+ logger.info(
175
+ f"File {path} changed since last failure (checksum differs), "
176
+ f"resetting failure count and retrying"
177
+ )
178
+ del self._file_failures[path]
179
+ return False
180
+ except Exception as e:
181
+ # If we can't compute checksum, log but still skip to avoid infinite loops
182
+ logger.warning(f"Failed to compute checksum for {path}: {e}")
183
+
184
+ # File unchanged and exceeded threshold - skip it
185
+ return True
186
+
187
+ async def _record_failure(self, path: str, error: str) -> None:
188
+ """Record a file sync failure for circuit breaker tracking.
189
+
190
+ Uses LRU cache with bounded size to prevent unbounded memory growth.
191
+
192
+ Args:
193
+ path: File path that failed
194
+ error: Error message from the failure
195
+ """
196
+ now = datetime.now()
197
+
198
+ # Compute checksum for failure tracking
199
+ try:
200
+ checksum = await self.file_service.compute_checksum(path)
201
+ except Exception:
202
+ # If checksum fails, use empty string (better than crashing)
203
+ checksum = ""
204
+
205
+ if path in self._file_failures:
206
+ # Update existing failure record and move to end (most recently used)
207
+ failure_info = self._file_failures.pop(path)
208
+ failure_info.count += 1
209
+ failure_info.last_failure = now
210
+ failure_info.last_error = error
211
+ failure_info.last_checksum = checksum
212
+ self._file_failures[path] = failure_info
213
+
214
+ logger.warning(
215
+ f"File sync failed (attempt {failure_info.count}/{MAX_CONSECUTIVE_FAILURES}): "
216
+ f"path={path}, error={error}"
217
+ )
218
+
219
+ # Log when threshold is reached
220
+ if failure_info.count >= MAX_CONSECUTIVE_FAILURES:
221
+ logger.error(
222
+ f"File {path} has failed {MAX_CONSECUTIVE_FAILURES} times and will be skipped. "
223
+ f"First failure: {failure_info.first_failure}, Last error: {error}"
224
+ )
225
+ else:
226
+ # Create new failure record
227
+ self._file_failures[path] = FileFailureInfo(
228
+ count=1,
229
+ first_failure=now,
230
+ last_failure=now,
231
+ last_error=error,
232
+ last_checksum=checksum,
233
+ )
234
+ logger.debug(f"Recording first failure for {path}: {error}")
235
+
236
+ # Enforce cache size limit - remove oldest entry if over limit
237
+ if len(self._file_failures) > self._max_tracked_failures:
238
+ removed_path, removed_info = self._file_failures.popitem(last=False)
239
+ logger.debug(
240
+ f"Evicting oldest failure record from cache: path={removed_path}, "
241
+ f"failures={removed_info.count}"
242
+ )
243
+
244
+ def _clear_failure(self, path: str) -> None:
245
+ """Clear failure tracking for a file after successful sync.
246
+
247
+ Args:
248
+ path: File path that successfully synced
249
+ """
250
+ if path in self._file_failures:
251
+ logger.info(f"Clearing failure history for {path} after successful sync")
252
+ del self._file_failures[path]
253
+
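Taken together, _record_failure, _should_skip_file, and _clear_failure implement the circuit breaker: a path is skipped once it has failed MAX_CONSECUTIVE_FAILURES times without its content changing, and either a successful sync or a modified file resets the count. A minimal sketch of that flow, assuming an already constructed SyncService and a file whose checksum stays the same between attempts (the helpers are private; they are called directly here only for illustration):

async def demo_circuit_breaker(sync_service: SyncService) -> None:
    # Hypothetical path that keeps failing for the same reason.
    path = "notes/broken.md"

    for _ in range(MAX_CONSECUTIVE_FAILURES):
        await sync_service._record_failure(path, "parse error")

    # Threshold reached with an unchanged checksum -> the file is skipped.
    assert await sync_service._should_skip_file(path)

    # After a successful sync the history is cleared and the file is retried.
    sync_service._clear_failure(path)
    assert not await sync_service._should_skip_file(path)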
254
+ async def sync(
255
+ self, directory: Path, project_name: Optional[str] = None, force_full: bool = False
256
+ ) -> SyncReport:
257
+ """Sync all files with database and update scan watermark.
258
+
259
+ Args:
260
+ directory: Directory to sync
261
+ project_name: Optional project name
262
+ force_full: If True, force a full scan bypassing watermark optimization
263
+ """
264
+
265
+ start_time = time.time()
266
+ sync_start_timestamp = time.time() # Capture at start for watermark
267
+ logger.info(f"Sync operation started for directory: {directory} (force_full={force_full})")
268
+
269
+ # scan the filesystem and compare with db state to build the change report
270
+ # (report.checksums maps path -> checksum for changed files)
271
+ report = await self.scan(directory, force_full=force_full)
272
+
273
+ # order of sync matters to resolve relations effectively
274
+ logger.info(
275
+ f"Sync changes detected: new_files={len(report.new)}, modified_files={len(report.modified)}, "
276
+ + f"deleted_files={len(report.deleted)}, moved_files={len(report.moves)}"
277
+ )
278
+
279
+ # sync moves first
280
+ for old_path, new_path in report.moves.items():
281
+ # in the case where a file has been deleted and replaced by another file
282
+ # it will show up in the move and modified lists, so handle it in modified
283
+ if new_path in report.modified:
284
+ report.modified.remove(new_path)
285
+ logger.debug(
286
+ f"File marked as moved and modified: old_path={old_path}, new_path={new_path}"
287
+ )
288
+ else:
289
+ await self.handle_move(old_path, new_path)
290
+
291
+ # deleted next
292
+ for path in report.deleted:
293
+ await self.handle_delete(path)
294
+
295
+ # then new and modified
296
+ for path in report.new:
297
+ entity, _ = await self.sync_file(path, new=True)
298
+
299
+ # Track if file was skipped
300
+ if entity is None and await self._should_skip_file(path):
301
+ failure_info = self._file_failures[path]
302
+ report.skipped_files.append(
303
+ SkippedFile(
304
+ path=path,
305
+ reason=failure_info.last_error,
306
+ failure_count=failure_info.count,
307
+ first_failed=failure_info.first_failure,
308
+ )
309
+ )
310
+
311
+ for path in report.modified:
312
+ entity, _ = await self.sync_file(path, new=False)
313
+
314
+ # Track if file was skipped
315
+ if entity is None and await self._should_skip_file(path):
316
+ failure_info = self._file_failures[path]
317
+ report.skipped_files.append(
318
+ SkippedFile(
319
+ path=path,
320
+ reason=failure_info.last_error,
321
+ failure_count=failure_info.count,
322
+ first_failed=failure_info.first_failure,
323
+ )
324
+ )
325
+
326
+ # Only resolve relations if there were actual changes
327
+ # If no files changed, no new unresolved relations could have been created
328
+ if report.total > 0:
329
+ await self.resolve_relations()
330
+ else:
331
+ logger.info("Skipping relation resolution - no file changes detected")
332
+
333
+ # Update scan watermark after successful sync
334
+ # Use the timestamp from sync start (not end) to ensure we catch files
335
+ # created during the sync on the next iteration
336
+ current_file_count = await self._quick_count_files(directory)
337
+ if self.entity_repository.project_id is not None:
338
+ project = await self.project_repository.find_by_id(self.entity_repository.project_id)
339
+ if project:
340
+ await self.project_repository.update(
341
+ project.id,
342
+ {
343
+ "last_scan_timestamp": sync_start_timestamp,
344
+ "last_file_count": current_file_count,
345
+ },
346
+ )
347
+ logger.debug(
348
+ f"Updated scan watermark: timestamp={sync_start_timestamp}, "
349
+ f"file_count={current_file_count}"
350
+ )
351
+
352
+ duration_ms = int((time.time() - start_time) * 1000)
353
+
354
+ # Log summary with skipped files if any
355
+ if report.skipped_files:
356
+ logger.warning(
357
+ f"Sync completed with {len(report.skipped_files)} skipped files: "
358
+ f"directory={directory}, total_changes={report.total}, "
359
+ f"skipped={len(report.skipped_files)}, duration_ms={duration_ms}"
360
+ )
361
+ for skipped in report.skipped_files:
362
+ logger.warning(
363
+ f"Skipped file: path={skipped.path}, "
364
+ f"failures={skipped.failure_count}, reason={skipped.reason}"
365
+ )
366
+ else:
367
+ logger.info(
368
+ f"Sync operation completed: directory={directory}, "
369
+ f"total_changes={report.total}, duration_ms={duration_ms}"
370
+ )
371
+
372
+ return report
373
+
374
+ async def scan(self, directory, force_full: bool = False):
375
+ """Smart scan using watermark and file count for large project optimization.
376
+
377
+ Uses scan watermark tracking to dramatically reduce scan time for large projects:
378
+ - Tracks last_scan_timestamp and last_file_count in Project model
379
+ - Uses `find -newermt` for incremental scanning (only changed files)
380
+ - Falls back to full scan when deletions detected (file count decreased)
381
+
382
+ Expected performance:
383
+ - No changes: 225x faster (2s vs 450s for 1,460 files on TigrisFS)
384
+ - Few changes: 84x faster (5s vs 420s)
385
+ - Deletions: Full scan (rare, acceptable)
386
+
387
+ Architecture:
388
+ - Get current file count quickly (find | wc -l: 1.4s)
389
+ - Compare with last_file_count to detect deletions
390
+ - If no deletions: incremental scan with find -newermt (0.2s)
391
+ - Process changed files with mtime-based comparison
392
+
393
+ Args:
394
+ directory: Directory to scan
395
+ force_full: If True, bypass watermark optimization and force full scan
396
+ """
397
+ scan_start_time = time.time()
398
+
399
+ report = SyncReport()
400
+
401
+ # Get current project to check watermark
402
+ if self.entity_repository.project_id is None:
403
+ raise ValueError("Entity repository has no project_id set")
404
+
405
+ project = await self.project_repository.find_by_id(self.entity_repository.project_id)
406
+ if project is None:
407
+ raise ValueError(f"Project not found: {self.entity_repository.project_id}")
408
+
409
+ # Step 1: Quick file count
410
+ logger.debug("Counting files in directory")
411
+ current_count = await self._quick_count_files(directory)
412
+ logger.debug(f"Found {current_count} files in directory")
413
+
414
+ # Step 2: Determine scan strategy based on watermark and file count
415
+ if force_full:
416
+ # User explicitly requested full scan → bypass watermark optimization
417
+ scan_type = "full_forced"
418
+ logger.info("Force full scan requested, bypassing watermark optimization")
419
+ file_paths_to_scan = await self._scan_directory_full(directory)
420
+
421
+ elif project.last_file_count is None:
422
+ # First sync ever → full scan
423
+ scan_type = "full_initial"
424
+ logger.info("First sync for this project, performing full scan")
425
+ file_paths_to_scan = await self._scan_directory_full(directory)
426
+
427
+ elif current_count < project.last_file_count:
428
+ # Files deleted → need full scan to detect which ones
429
+ scan_type = "full_deletions"
430
+ logger.info(
431
+ f"File count decreased ({project.last_file_count} → {current_count}), "
432
+ f"running full scan to detect deletions"
433
+ )
434
+ file_paths_to_scan = await self._scan_directory_full(directory)
435
+
436
+ elif project.last_scan_timestamp is not None:
437
+ # Incremental scan: only files modified since last scan
438
+ scan_type = "incremental"
439
+ logger.info(
440
+ f"Running incremental scan for files modified since {project.last_scan_timestamp}"
441
+ )
442
+ file_paths_to_scan = await self._scan_directory_modified_since(
443
+ directory, project.last_scan_timestamp
444
+ )
445
+ logger.info(
446
+ f"Incremental scan found {len(file_paths_to_scan)} potentially changed files"
447
+ )
448
+
449
+ else:
450
+ # Fallback to full scan (no watermark available)
451
+ scan_type = "full_fallback"
452
+ logger.warning("No scan watermark available, falling back to full scan")
453
+ file_paths_to_scan = await self._scan_directory_full(directory)
454
+
455
+ # Step 3: Process each file with mtime-based comparison
456
+ scanned_paths: Set[str] = set()
457
+ changed_checksums: Dict[str, str] = {}
458
+
459
+ logger.debug(f"Processing {len(file_paths_to_scan)} files with mtime-based comparison")
460
+
461
+ for rel_path in file_paths_to_scan:
462
+ scanned_paths.add(rel_path)
463
+
464
+ # Get file stats
465
+ abs_path = directory / rel_path
466
+ if not abs_path.exists():
467
+ # File was deleted between scan and now (race condition)
468
+ continue
469
+
470
+ stat_info = abs_path.stat()
471
+
472
+ # Indexed lookup - single file query (not full table scan)
473
+ db_entity = await self.entity_repository.get_by_file_path(rel_path)
474
+
475
+ if db_entity is None:
476
+ # New file - need checksum for move detection
477
+ checksum = await self.file_service.compute_checksum(rel_path)
478
+ report.new.add(rel_path)
479
+ changed_checksums[rel_path] = checksum
480
+ logger.trace(f"New file detected: {rel_path}")
481
+ continue
482
+
483
+ # File exists in DB - check if mtime/size changed
484
+ db_mtime = db_entity.mtime
485
+ db_size = db_entity.size
486
+ fs_mtime = stat_info.st_mtime
487
+ fs_size = stat_info.st_size
488
+
489
+ # Compare mtime and size (like rsync/rclone)
490
+ # Allow small epsilon for float comparison (0.01s = 10ms)
491
+ mtime_changed = db_mtime is None or abs(fs_mtime - db_mtime) > 0.01
492
+ size_changed = db_size is None or fs_size != db_size
493
+
494
+ if mtime_changed or size_changed:
495
+ # File modified - compute checksum
496
+ checksum = await self.file_service.compute_checksum(rel_path)
497
+ db_checksum = db_entity.checksum
498
+
499
+ # Only mark as modified if checksum actually differs
500
+ # (handles cases where mtime changed but content didn't, e.g., git operations)
501
+ if checksum != db_checksum:
502
+ report.modified.add(rel_path)
503
+ changed_checksums[rel_path] = checksum
504
+ logger.trace(
505
+ f"Modified file detected: {rel_path}, "
506
+ f"mtime_changed={mtime_changed}, size_changed={size_changed}"
507
+ )
508
+ else:
509
+ # File unchanged - no checksum needed
510
+ logger.trace(f"File unchanged (mtime/size match): {rel_path}")
511
+
512
+ # Step 4: Detect moves (for both full and incremental scans)
513
+ # Check if any "new" files are actually moves by matching checksums
514
+ for new_path in list(report.new): # Use list() to allow modification during iteration
515
+ new_checksum = changed_checksums.get(new_path)
516
+ if not new_checksum:
517
+ continue
518
+
519
+ # Look for existing entity with same checksum but different path
520
+ # This could be a move or a copy
521
+ existing_entities = await self.entity_repository.find_by_checksum(new_checksum)
522
+
523
+ for candidate in existing_entities:
524
+ if candidate.file_path == new_path:
525
+ # Same path, skip (shouldn't happen for "new" files but be safe)
526
+ continue
527
+
528
+ # Check if the old path still exists on disk
529
+ old_path_abs = directory / candidate.file_path
530
+ if old_path_abs.exists():
531
+ # Original still exists → this is a copy, not a move
532
+ logger.trace(
533
+ f"File copy detected (not move): {candidate.file_path} copied to {new_path}"
534
+ )
535
+ continue
536
+
537
+ # Original doesn't exist → this is a move!
538
+ report.moves[candidate.file_path] = new_path
539
+ report.new.remove(new_path)
540
+ logger.trace(f"Move detected: {candidate.file_path} -> {new_path}")
541
+ break # Only match first candidate
542
+
543
+ # Step 5: Detect deletions (only for full scans)
544
+ # Incremental scans can't reliably detect deletions since they only see modified files
545
+ if scan_type in ("full_initial", "full_deletions", "full_fallback", "full_forced"):
546
+ # Use optimized query for just file paths (not full entities)
547
+ db_file_paths = await self.entity_repository.get_all_file_paths()
548
+ logger.debug(f"Found {len(db_file_paths)} db paths for deletion detection")
549
+
550
+ for db_path in db_file_paths:
551
+ if db_path not in scanned_paths:
552
+ # File in DB but not on filesystem
553
+ # Check if it was already detected as a move
554
+ if db_path in report.moves:
555
+ # Already handled as a move, skip
556
+ continue
557
+
558
+ # File was deleted
559
+ report.deleted.add(db_path)
560
+ logger.trace(f"Deleted file detected: {db_path}")
561
+
562
+ # Store checksums for files that need syncing
563
+ report.checksums = changed_checksums
564
+
565
+ scan_duration_ms = int((time.time() - scan_start_time) * 1000)
566
+
567
+ logger.info(
568
+ f"Completed {scan_type} scan for directory {directory} in {scan_duration_ms}ms, "
569
+ f"found {report.total} changes (new={len(report.new)}, "
570
+ f"modified={len(report.modified)}, deleted={len(report.deleted)}, "
571
+ f"moves={len(report.moves)})"
572
+ )
573
+ return report
574
+
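The heart of the incremental path above is the rsync-style change test: compare mtime (with a 10 ms tolerance) and size first, and only pay for a checksum when one of them differs. Restated as a standalone helper for clarity (the parameter names mirror the locals in scan(); this is an illustrative sketch, not part of the module):

import os

MTIME_EPSILON = 0.01  # seconds; same 10 ms tolerance used in scan()


def needs_checksum(
    stat_info: os.stat_result, db_mtime: float | None, db_size: int | None
) -> bool:
    """Return True when scan() would recompute the checksum for this file."""
    mtime_changed = db_mtime is None or abs(stat_info.st_mtime - db_mtime) > MTIME_EPSILON
    size_changed = db_size is None or stat_info.st_size != db_size
    return mtime_changed or size_changed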
575
+ async def sync_file(
576
+ self, path: str, new: bool = True
577
+ ) -> Tuple[Optional[Entity], Optional[str]]:
578
+ """Sync a single file with circuit breaker protection.
579
+
580
+ Args:
581
+ path: Path to file to sync
582
+ new: Whether this is a new file
583
+
584
+ Returns:
585
+ Tuple of (entity, checksum) or (None, None) if sync fails or file is skipped
586
+ """
587
+ # Check if file should be skipped due to repeated failures
588
+ if await self._should_skip_file(path):
589
+ logger.warning(f"Skipping file due to repeated failures: {path}")
590
+ return None, None
591
+
592
+ try:
593
+ logger.debug(
594
+ f"Syncing file path={path} is_new={new} is_markdown={self.file_service.is_markdown(path)}"
595
+ )
596
+
597
+ if self.file_service.is_markdown(path):
598
+ entity, checksum = await self.sync_markdown_file(path, new)
599
+ else:
600
+ entity, checksum = await self.sync_regular_file(path, new)
601
+
602
+ if entity is not None:
603
+ await self.search_service.index_entity(entity)
604
+
605
+ # Clear failure tracking on successful sync
606
+ self._clear_failure(path)
607
+
608
+ logger.debug(
609
+ f"File sync completed, path={path}, entity_id={entity.id}, checksum={checksum[:8]}"
610
+ )
611
+ return entity, checksum
612
+
613
+ except FileNotFoundError:
614
+ # File exists in database but not on filesystem
615
+ # This indicates a database/filesystem inconsistency - treat as deletion
616
+ logger.warning(
617
+ f"File not found during sync, treating as deletion: path={path}. "
618
+ "This may indicate a race condition or manual file deletion."
619
+ )
620
+ await self.handle_delete(path)
621
+ return None, None
622
+
623
+ except Exception as e:
624
+ # Check if this is a fatal error (or caused by one)
625
+ # Fatal errors like project deletion should terminate sync immediately
626
+ if isinstance(e, SyncFatalError) or isinstance(e.__cause__, SyncFatalError):
627
+ logger.error(f"Fatal sync error encountered, terminating sync: path={path}")
628
+ raise
629
+
630
+ # Otherwise treat as recoverable file-level error
631
+ error_msg = str(e)
632
+ logger.error(f"Failed to sync file: path={path}, error={error_msg}")
633
+
634
+ # Record failure for circuit breaker
635
+ await self._record_failure(path, error_msg)
636
+
637
+ return None, None
638
+
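In caller terms, the error handling above means: SyncFatalError (or an exception caused by one) aborts the whole sync, while anything else is recorded against the file and surfaces as a (None, None) return. A small wrapper sketch, assuming a constructed SyncService; sync_one is a hypothetical helper, not part of the module:

from basic_memory.services.exceptions import SyncFatalError


async def sync_one(sync_service: SyncService, path: str) -> bool:
    """Return True if the file synced, False if it was skipped or failed
    recoverably. Fatal errors are re-raised to stop the surrounding sync."""
    try:
        entity, _checksum = await sync_service.sync_file(path, new=True)
    except SyncFatalError:
        # e.g. the project was deleted mid-sync; nothing file-level to retry.
        raise
    return entity is not None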
639
+ async def sync_markdown_file(self, path: str, new: bool = True) -> Tuple[Optional[Entity], str]:
640
+ """Sync a markdown file with full processing.
641
+
642
+ Args:
643
+ path: Path to markdown file
644
+ new: Whether this is a new file
645
+
646
+ Returns:
647
+ Tuple of (entity, checksum)
648
+ """
649
+ # Parse markdown first to get any existing permalink
650
+ logger.debug(f"Parsing markdown file, path: {path}, new: {new}")
651
+
652
+ file_content = await self.file_service.read_file_content(path)
653
+ file_contains_frontmatter = has_frontmatter(file_content)
654
+
655
+ # Get file timestamps for tracking modification times
656
+ file_metadata = await self.file_service.get_file_metadata(path)
657
+ created = file_metadata.created_at
658
+ modified = file_metadata.modified_at
659
+
660
+ # Parse markdown content with file metadata (avoids redundant file read/stat)
661
+ # This enables cloud implementations (S3FileService) to provide metadata from head_object
662
+ abs_path = self.file_service.base_path / path
663
+ entity_markdown = await self.entity_parser.parse_markdown_content(
664
+ file_path=abs_path,
665
+ content=file_content,
666
+ mtime=file_metadata.modified_at.timestamp(),
667
+ ctime=file_metadata.created_at.timestamp(),
668
+ )
669
+
670
+ # if the file contains frontmatter, resolve a permalink (unless disabled)
671
+ if file_contains_frontmatter and not self.app_config.disable_permalinks:
672
+ # Resolve permalink - skip conflict checks during bulk sync for performance
673
+ permalink = await self.entity_service.resolve_permalink(
674
+ path, markdown=entity_markdown, skip_conflict_check=True
675
+ )
676
+
677
+ # If permalink changed, update the file
678
+ if permalink != entity_markdown.frontmatter.permalink:
679
+ logger.info(
680
+ f"Updating permalink for path: {path}, old_permalink: {entity_markdown.frontmatter.permalink}, new_permalink: {permalink}"
681
+ )
682
+
683
+ entity_markdown.frontmatter.metadata["permalink"] = permalink
684
+ await self.file_service.update_frontmatter(path, {"permalink": permalink})
685
+
686
+ # if the file is new, create an entity
687
+ if new:
688
+ # Create entity with final permalink
689
+ logger.debug(f"Creating new entity from markdown, path={path}")
690
+ await self.entity_service.create_entity_from_markdown(Path(path), entity_markdown)
691
+
692
+ # otherwise we need to update the entity and observations
693
+ else:
694
+ logger.debug(f"Updating entity from markdown, path={path}")
695
+ await self.entity_service.update_entity_and_observations(Path(path), entity_markdown)
696
+
697
+ # Update relations and search index
698
+ entity = await self.entity_service.update_entity_relations(path, entity_markdown)
699
+
700
+ # After updating relations, we need to compute the checksum again
701
+ # This is necessary for files with wikilinks to ensure consistent checksums
702
+ # after relation processing is complete
703
+ final_checksum = await self.file_service.compute_checksum(path)
704
+
705
+ # Update checksum, timestamps, and file metadata from file system
706
+ # Store mtime/size for efficient change detection in future scans
707
+ # This ensures temporal ordering in search and recent activity uses actual file modification times
708
+ await self.entity_repository.update(
709
+ entity.id,
710
+ {
711
+ "checksum": final_checksum,
712
+ "created_at": created,
713
+ "updated_at": modified,
714
+ "mtime": file_metadata.modified_at.timestamp(),
715
+ "size": file_metadata.size,
716
+ },
717
+ )
718
+
719
+ logger.debug(
720
+ f"Markdown sync completed: path={path}, entity_id={entity.id}, "
721
+ f"observation_count={len(entity.observations)}, relation_count={len(entity.relations)}, "
722
+ f"checksum={final_checksum[:8]}"
723
+ )
724
+
725
+ # Return the final checksum to ensure everything is consistent
726
+ return entity, final_checksum
727
+
728
+ async def sync_regular_file(self, path: str, new: bool = True) -> Tuple[Optional[Entity], str]:
729
+ """Sync a non-markdown file with basic tracking.
730
+
731
+ Args:
732
+ path: Path to file
733
+ new: Whether this is a new file
734
+
735
+ Returns:
736
+ Tuple of (entity, checksum)
737
+ """
738
+ checksum = await self.file_service.compute_checksum(path)
739
+ if new:
740
+ # Generate permalink from path - skip conflict checks during bulk sync
741
+ await self.entity_service.resolve_permalink(path, skip_conflict_check=True)
742
+
743
+ # get file timestamps
744
+ file_metadata = await self.file_service.get_file_metadata(path)
745
+ created = file_metadata.created_at
746
+ modified = file_metadata.modified_at
747
+
748
+ # get mime type
749
+ content_type = self.file_service.content_type(path)
750
+
751
+ file_path = Path(path)
752
+ try:
753
+ entity = await self.entity_repository.add(
754
+ Entity(
755
+ entity_type="file",
756
+ file_path=path,
757
+ checksum=checksum,
758
+ title=file_path.name,
759
+ created_at=created,
760
+ updated_at=modified,
761
+ content_type=content_type,
762
+ mtime=file_metadata.modified_at.timestamp(),
763
+ size=file_metadata.size,
764
+ )
765
+ )
766
+ return entity, checksum
767
+ except IntegrityError as e:
768
+ # Handle race condition where entity was created by another process
769
+ if "UNIQUE constraint failed: entity.file_path" in str(e):
770
+ logger.info(
771
+ f"Entity already exists for file_path={path}, updating instead of creating"
772
+ )
773
+ # Treat as update instead of create
774
+ entity = await self.entity_repository.get_by_file_path(path)
775
+ if entity is None: # pragma: no cover
776
+ logger.error(f"Entity not found after constraint violation, path={path}")
777
+ raise ValueError(f"Entity not found after constraint violation: {path}")
778
+
779
+ # Re-get file metadata since we're in update path
780
+ file_metadata_for_update = await self.file_service.get_file_metadata(path)
781
+ updated = await self.entity_repository.update(
782
+ entity.id,
783
+ {
784
+ "file_path": path,
785
+ "checksum": checksum,
786
+ "mtime": file_metadata_for_update.modified_at.timestamp(),
787
+ "size": file_metadata_for_update.size,
788
+ },
789
+ )
790
+
791
+ if updated is None: # pragma: no cover
792
+ logger.error(f"Failed to update entity, entity_id={entity.id}, path={path}")
793
+ raise ValueError(f"Failed to update entity with ID {entity.id}")
794
+
795
+ return updated, checksum
796
+ else:
797
+ # Re-raise if it's a different integrity error
798
+ raise
799
+ else:
800
+ # Get file timestamps for updating modification time
801
+ file_metadata = await self.file_service.get_file_metadata(path)
802
+ modified = file_metadata.modified_at
803
+
804
+ entity = await self.entity_repository.get_by_file_path(path)
805
+ if entity is None: # pragma: no cover
806
+ logger.error(f"Entity not found for existing file, path={path}")
807
+ raise ValueError(f"Entity not found for existing file: {path}")
808
+
809
+ # Update checksum, modification time, and file metadata from file system
810
+ # Store mtime/size for efficient change detection in future scans
811
+ updated = await self.entity_repository.update(
812
+ entity.id,
813
+ {
814
+ "file_path": path,
815
+ "checksum": checksum,
816
+ "updated_at": modified,
817
+ "mtime": file_metadata.modified_at.timestamp(),
818
+ "size": file_metadata.size,
819
+ },
820
+ )
821
+
822
+ if updated is None: # pragma: no cover
823
+ logger.error(f"Failed to update entity, entity_id={entity.id}, path={path}")
824
+ raise ValueError(f"Failed to update entity with ID {entity.id}")
825
+
826
+ return updated, checksum
827
+
828
+ async def handle_delete(self, file_path: str):
829
+ """Handle complete entity deletion including search index cleanup."""
830
+
831
+ # First get entity to get permalink before deletion
832
+ entity = await self.entity_repository.get_by_file_path(file_path)
833
+ if entity:
834
+ logger.info(
835
+ f"Deleting entity with file_path={file_path}, entity_id={entity.id}, permalink={entity.permalink}"
836
+ )
837
+
838
+ # Delete from db (this cascades to observations/relations)
839
+ await self.entity_service.delete_entity_by_file_path(file_path)
840
+
841
+ # Clean up search index
842
+ permalinks = (
843
+ [entity.permalink]
844
+ + [o.permalink for o in entity.observations]
845
+ + [r.permalink for r in entity.relations]
846
+ )
847
+
848
+ logger.debug(
849
+ f"Cleaning up search index for entity_id={entity.id}, file_path={file_path}, "
850
+ f"index_entries={len(permalinks)}"
851
+ )
852
+
853
+ for permalink in permalinks:
854
+ if permalink:
855
+ await self.search_service.delete_by_permalink(permalink)
856
+ else:
857
+ await self.search_service.delete_by_entity_id(entity.id)
858
+
859
+ async def handle_move(self, old_path, new_path):
860
+ logger.debug("Moving entity", old_path=old_path, new_path=new_path)
861
+
862
+ entity = await self.entity_repository.get_by_file_path(old_path)
863
+ if entity:
864
+ # Check if destination path is already occupied by another entity
865
+ existing_at_destination = await self.entity_repository.get_by_file_path(new_path)
866
+ if existing_at_destination and existing_at_destination.id != entity.id:
867
+ # Handle the conflict - this could be a file swap or replacement scenario
868
+ logger.warning(
869
+ f"File path conflict detected during move: "
870
+ f"entity_id={entity.id} trying to move from '{old_path}' to '{new_path}', "
871
+ f"but entity_id={existing_at_destination.id} already occupies '{new_path}'"
872
+ )
873
+
874
+ # Check if this is a file swap (the destination entity is being moved to our old path)
875
+ # This would indicate a simultaneous move operation
876
+ old_path_after_swap = await self.entity_repository.get_by_file_path(old_path)
877
+ if old_path_after_swap and old_path_after_swap.id == existing_at_destination.id:
878
+ logger.info(f"Detected file swap between '{old_path}' and '{new_path}'")
879
+ # This is a swap scenario - both moves should succeed
880
+ # We'll allow this to proceed since the other file has moved out
881
+ else:
882
+ # This is a conflict where the destination is occupied
883
+ raise ValueError(
884
+ f"Cannot move entity from '{old_path}' to '{new_path}': "
885
+ f"destination path is already occupied by another file. "
886
+ f"This may be caused by: "
887
+ f"1. Conflicting file names with different character encodings, "
888
+ f"2. Case sensitivity differences (e.g., 'Finance/' vs 'finance/'), "
889
+ f"3. Character conflicts between hyphens in filenames and generated permalinks, "
890
+ f"4. Files with similar names containing special characters. "
891
+ f"Try renaming one of the conflicting files to resolve this issue."
892
+ )
893
+
894
+ # Update file_path in all cases
895
+ updates = {"file_path": new_path}
896
+
897
+ # If configured, also update permalink to match new path
898
+ if (
899
+ self.app_config.update_permalinks_on_move
900
+ and not self.app_config.disable_permalinks
901
+ and self.file_service.is_markdown(new_path)
902
+ ):
903
+ # generate new permalink value - skip conflict checks during bulk sync
904
+ new_permalink = await self.entity_service.resolve_permalink(
905
+ new_path, skip_conflict_check=True
906
+ )
907
+
908
+ # write to file and get new checksum
909
+ new_checksum = await self.file_service.update_frontmatter(
910
+ new_path, {"permalink": new_permalink}
911
+ )
912
+
913
+ updates["permalink"] = new_permalink
914
+ updates["checksum"] = new_checksum
915
+
916
+ logger.info(
917
+ f"Updating permalink on move,old_permalink={entity.permalink}"
918
+ f"new_permalink={new_permalink}"
919
+ f"new_checksum={new_checksum}"
920
+ )
921
+
922
+ try:
923
+ updated = await self.entity_repository.update(entity.id, updates)
924
+ except Exception as e:
925
+ # Catch any database integrity errors and provide helpful context
926
+ if "UNIQUE constraint failed" in str(e):
927
+ logger.error(
928
+ f"Database constraint violation during move: "
929
+ f"entity_id={entity.id}, old_path='{old_path}', new_path='{new_path}'"
930
+ )
931
+ raise ValueError(
932
+ f"Cannot complete move from '{old_path}' to '{new_path}': "
933
+ f"a database constraint was violated. This usually indicates "
934
+ f"a file path or permalink conflict. Please check for: "
935
+ f"1. Duplicate file names, "
936
+ f"2. Case sensitivity issues (e.g., 'File.md' vs 'file.md'), "
937
+ f"3. Character encoding conflicts in file names."
938
+ ) from e
939
+ else:
940
+ # Re-raise other exceptions as-is
941
+ raise
942
+
943
+ if updated is None: # pragma: no cover
944
+ logger.error(
945
+ "Failed to update entity path"
946
+ f"entity_id={entity.id}"
947
+ f"old_path={old_path}"
948
+ f"new_path={new_path}"
949
+ )
950
+ raise ValueError(f"Failed to update entity path for ID {entity.id}")
951
+
952
+ logger.debug(
953
+ "Entity path updated"
954
+ f"entity_id={entity.id} "
955
+ f"permalink={entity.permalink} "
956
+ f"old_path={old_path} "
957
+ f"new_path={new_path} "
958
+ )
959
+
960
+ # update search index
961
+ await self.search_service.index_entity(updated)
962
+
963
+ async def resolve_relations(self, entity_id: int | None = None):
964
+ """Try to resolve unresolved relations.
965
+
966
+ Args:
967
+ entity_id: If provided, only resolve relations for this specific entity.
968
+ Otherwise, resolve all unresolved relations in the database.
969
+ """
970
+
971
+ if entity_id:
972
+ # Only get unresolved relations for the specific entity
973
+ unresolved_relations = (
974
+ await self.relation_repository.find_unresolved_relations_for_entity(entity_id)
975
+ )
976
+ logger.info(
977
+ f"Resolving forward references for entity {entity_id}",
978
+ count=len(unresolved_relations),
979
+ )
980
+ else:
981
+ # Get all unresolved relations (original behavior)
982
+ unresolved_relations = await self.relation_repository.find_unresolved_relations()
983
+ logger.info("Resolving all forward references", count=len(unresolved_relations))
984
+
985
+ for relation in unresolved_relations:
986
+ logger.trace(
987
+ "Attempting to resolve relation "
988
+ f"relation_id={relation.id} "
989
+ f"from_id={relation.from_id} "
990
+ f"to_name={relation.to_name}"
991
+ )
992
+
993
+ resolved_entity = await self.entity_service.link_resolver.resolve_link(relation.to_name)
994
+
995
+ # ignore reference to self
996
+ if resolved_entity and resolved_entity.id != relation.from_id:
997
+ logger.debug(
998
+ "Resolved forward reference "
999
+ f"relation_id={relation.id} "
1000
+ f"from_id={relation.from_id} "
1001
+ f"to_name={relation.to_name} "
1002
+ f"resolved_id={resolved_entity.id} "
1003
+ f"resolved_title={resolved_entity.title}",
1004
+ )
1005
+ try:
1006
+ await self.relation_repository.update(
1007
+ relation.id,
1008
+ {
1009
+ "to_id": resolved_entity.id,
1010
+ "to_name": resolved_entity.title,
1011
+ },
1012
+ )
1013
+ # update search index only on successful resolution
1014
+ await self.search_service.index_entity(resolved_entity)
1015
+ except IntegrityError:
1016
+ # IntegrityError means a relation with this (from_id, to_id, relation_type)
1017
+ # already exists. The UPDATE was rolled back, so our unresolved relation
1018
+ # (to_id=NULL) still exists in the database. We delete it because:
1019
+ # 1. It's redundant - a resolved relation already captures this relationship
1020
+ # 2. If we don't delete it, future syncs will try to resolve it again
1021
+ # and get the same IntegrityError
1022
+ logger.debug(
1023
+ "Deleting duplicate unresolved relation "
1024
+ f"relation_id={relation.id} "
1025
+ f"from_id={relation.from_id} "
1026
+ f"to_name={relation.to_name} "
1027
+ f"resolved_to_id={resolved_entity.id}"
1028
+ )
1029
+ try:
1030
+ await self.relation_repository.delete(relation.id)
1031
+ except Exception as e:
1032
+ # Log but don't fail - the relation may have been deleted already
1033
+ logger.debug(f"Could not delete duplicate relation {relation.id}: {e}")
1034
+
1035
+ async def _quick_count_files(self, directory: Path) -> int:
1036
+ """Fast file count using find command.
1037
+
1038
+ Uses subprocess to leverage OS-level file counting which is much faster
1039
+ than Python iteration, especially on network filesystems like TigrisFS.
1040
+
1041
+ On Windows, subprocess is not supported with SelectorEventLoop (which we use
1042
+ to avoid aiosqlite cleanup issues), so we fall back to Python-based counting.
1043
+
1044
+ Args:
1045
+ directory: Directory to count files in
1046
+
1047
+ Returns:
1048
+ Number of files in directory (recursive)
1049
+ """
1050
+ # Windows with SelectorEventLoop doesn't support subprocess
1051
+ if sys.platform == "win32":
1052
+ count = 0
1053
+ async for _ in self.scan_directory(directory):
1054
+ count += 1
1055
+ return count
1056
+
1057
+ process = await asyncio.create_subprocess_shell(
1058
+ f'find "{directory}" -type f | wc -l',
1059
+ stdout=asyncio.subprocess.PIPE,
1060
+ stderr=asyncio.subprocess.PIPE,
1061
+ )
1062
+ stdout, stderr = await process.communicate()
1063
+
1064
+ if process.returncode != 0:
1065
+ error_msg = stderr.decode().strip()
1066
+ logger.error(
1067
+ f"FILE COUNT OPTIMIZATION FAILED: find command failed with exit code {process.returncode}, "
1068
+ f"error: {error_msg}. Falling back to manual count. "
1069
+ f"This will slow down watermark detection!"
1070
+ )
1071
+ # Fallback: count using scan_directory
1072
+ count = 0
1073
+ async for _ in self.scan_directory(directory):
1074
+ count += 1
1075
+ return count
1076
+
1077
+ return int(stdout.strip())
1078
+
1079
+ async def _scan_directory_modified_since(
1080
+ self, directory: Path, since_timestamp: float
1081
+ ) -> List[str]:
1082
+ """Use find -newermt for filesystem-level filtering of modified files.
1083
+
1084
+ This is dramatically faster than scanning all files and comparing mtimes,
1085
+ especially on network filesystems like TigrisFS where stat operations are expensive.
1086
+
1087
+ On Windows, subprocess is not supported with SelectorEventLoop (which we use
1088
+ to avoid aiosqlite cleanup issues), so we implement mtime filtering in Python.
1089
+
1090
+ Args:
1091
+ directory: Directory to scan
1092
+ since_timestamp: Unix timestamp to find files newer than
1093
+
1094
+ Returns:
1095
+ List of relative file paths modified since the timestamp (respects .bmignore)
1096
+ """
1097
+ # Windows with SelectorEventLoop doesn't support subprocess
1098
+ # Implement mtime filtering in Python to preserve watermark optimization
1099
+ if sys.platform == "win32":
1100
+ file_paths = []
1101
+ async for file_path_str, stat_info in self.scan_directory(directory):
1102
+ if stat_info.st_mtime > since_timestamp:
1103
+ rel_path = Path(file_path_str).relative_to(directory).as_posix()
1104
+ file_paths.append(rel_path)
1105
+ return file_paths
1106
+
1107
+ # Convert timestamp to find-compatible format
1108
+ since_date = datetime.fromtimestamp(since_timestamp).strftime("%Y-%m-%d %H:%M:%S")
1109
+
1110
+ process = await asyncio.create_subprocess_shell(
1111
+ f'find "{directory}" -type f -newermt "{since_date}"',
1112
+ stdout=asyncio.subprocess.PIPE,
1113
+ stderr=asyncio.subprocess.PIPE,
1114
+ )
1115
+ stdout, stderr = await process.communicate()
1116
+
1117
+ if process.returncode != 0:
1118
+ error_msg = stderr.decode().strip()
1119
+ logger.error(
1120
+ f"SCAN OPTIMIZATION FAILED: find -newermt command failed with exit code {process.returncode}, "
1121
+ f"error: {error_msg}. Falling back to full scan. "
1122
+ f"This will cause slow syncs on large projects!"
1123
+ )
1124
+ # Fallback to full scan
1125
+ return await self._scan_directory_full(directory)
1126
+
1127
+ # Convert absolute paths to relative and filter through ignore patterns
1128
+ file_paths = []
1129
+ for line in stdout.decode().splitlines():
1130
+ if line:
1131
+ try:
1132
+ abs_path = Path(line)
1133
+ rel_path = abs_path.relative_to(directory).as_posix()
1134
+
1135
+ # Apply ignore patterns (same as scan_directory)
1136
+ if should_ignore_path(abs_path, directory, self._ignore_patterns):
1137
+ logger.trace(f"Ignoring path per .bmignore: {rel_path}")
1138
+ continue
1139
+
1140
+ file_paths.append(rel_path)
1141
+ except ValueError:
1142
+ # Path is not relative to directory, skip it
1143
+ logger.warning(f"Skipping file not under directory: {line}")
1144
+ continue
1145
+
1146
+ return file_paths
1147
+
1148
+ async def _scan_directory_full(self, directory: Path) -> List[str]:
1149
+ """Full directory scan returning all file paths.
1150
+
1151
+ Uses scan_directory() which respects .bmignore patterns.
1152
+
1153
+ Args:
1154
+ directory: Directory to scan
1155
+
1156
+ Returns:
1157
+ List of relative file paths (respects .bmignore)
1158
+ """
1159
+ file_paths = []
1160
+ async for file_path_str, _ in self.scan_directory(directory):
1161
+ rel_path = Path(file_path_str).relative_to(directory).as_posix()
1162
+ file_paths.append(rel_path)
1163
+ return file_paths
1164
+
1165
+ async def scan_directory(self, directory: Path) -> AsyncIterator[Tuple[str, os.stat_result]]:
1166
+ """Stream files from directory using aiofiles.os.scandir() with cached stat info.
1167
+
1168
+ This method uses aiofiles.os.scandir() to leverage async I/O and cached stat
1169
+ information from directory entries. This reduces network I/O by 50% on network
1170
+ filesystems like TigrisFS by avoiding redundant stat() calls.
1171
+
1172
+ Args:
1173
+ directory: Directory to scan
1174
+
1175
+ Yields:
1176
+ Tuples of (absolute_file_path, stat_info) for each file
1177
+ """
1178
+ try:
1179
+ entries = await aiofiles.os.scandir(directory)
1180
+ except PermissionError:
1181
+ logger.warning(f"Permission denied scanning directory: {directory}")
1182
+ return
1183
+
1184
+ results = []
1185
+ subdirs = []
1186
+
1187
+ for entry in entries:
1188
+ entry_path = Path(entry.path)
1189
+
1190
+ # Check ignore patterns
1191
+ if should_ignore_path(entry_path, directory, self._ignore_patterns):
1192
+ logger.trace(f"Ignoring path per .bmignore: {entry_path.relative_to(directory)}")
1193
+ continue
1194
+
1195
+ if entry.is_dir(follow_symlinks=False):
1196
+ # Collect subdirectories to recurse into
1197
+ subdirs.append(entry_path)
1198
+ elif entry.is_file(follow_symlinks=False):
1199
+ # Get cached stat info (no extra syscall!)
1200
+ stat_info = entry.stat(follow_symlinks=False)
1201
+ results.append((entry.path, stat_info))
1202
+
1203
+ # Yield files from current directory
1204
+ for file_path, stat_info in results:
1205
+ yield (file_path, stat_info)
1206
+
1207
+ # Recurse into subdirectories
1208
+ for subdir in subdirs:
1209
+ async for result in self.scan_directory(subdir):
1210
+ yield result
1211
+
1212
+
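Because scan_directory yields (absolute_path, stat_result) tuples and already applies .bmignore filtering, callers can stream over it without re-statting files. A hypothetical helper illustrating the shape of that API:

from pathlib import Path


async def count_markdown_files(sync_service: SyncService, directory: Path) -> int:
    """Hypothetical helper: count non-empty .md files via the streaming scanner."""
    count = 0
    async for file_path, stat_info in sync_service.scan_directory(directory):
        if file_path.endswith(".md") and stat_info.st_size > 0:
            count += 1
    return count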
1213
+ async def get_sync_service(project: Project) -> SyncService: # pragma: no cover
1214
+ """Get sync service instance with all dependencies."""
1215
+
1216
+ app_config = ConfigManager().config
1217
+ _, session_maker = await db.get_or_create_db(
1218
+ db_path=app_config.database_path, db_type=db.DatabaseType.FILESYSTEM
1219
+ )
1220
+
1221
+ project_path = Path(project.path)
1222
+ entity_parser = EntityParser(project_path)
1223
+ markdown_processor = MarkdownProcessor(entity_parser, app_config=app_config)
1224
+ file_service = FileService(project_path, markdown_processor, app_config=app_config)
1225
+
1226
+ # Initialize repositories
1227
+ entity_repository = EntityRepository(session_maker, project_id=project.id)
1228
+ observation_repository = ObservationRepository(session_maker, project_id=project.id)
1229
+ relation_repository = RelationRepository(session_maker, project_id=project.id)
1230
+ search_repository = create_search_repository(session_maker, project_id=project.id)
1231
+ project_repository = ProjectRepository(session_maker)
1232
+
1233
+ # Initialize services
1234
+ search_service = SearchService(search_repository, entity_repository, file_service)
1235
+ link_resolver = LinkResolver(entity_repository, search_service)
1236
+
1237
+ # Initialize services
1238
+ entity_service = EntityService(
1239
+ entity_parser,
1240
+ entity_repository,
1241
+ observation_repository,
1242
+ relation_repository,
1243
+ file_service,
1244
+ link_resolver,
1245
+ )
1246
+
1247
+ # Create sync service
1248
+ sync_service = SyncService(
1249
+ app_config=app_config,
1250
+ entity_service=entity_service,
1251
+ entity_parser=entity_parser,
1252
+ entity_repository=entity_repository,
1253
+ relation_repository=relation_repository,
1254
+ project_repository=project_repository,
1255
+ search_service=search_service,
1256
+ file_service=file_service,
1257
+ )
1258
+
1259
+ return sync_service
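Finally, a minimal end-to-end sketch of driving this module: look up a Project, build the service with get_sync_service, and run sync. The get_by_name lookup is an assumption about ProjectRepository's API; everything else follows the factory above.

import asyncio
from pathlib import Path

from basic_memory import db
from basic_memory.config import ConfigManager
from basic_memory.repository import ProjectRepository
from basic_memory.sync.sync_service import get_sync_service


async def sync_project(project_name: str) -> None:
    app_config = ConfigManager().config
    _, session_maker = await db.get_or_create_db(
        db_path=app_config.database_path, db_type=db.DatabaseType.FILESYSTEM
    )
    project_repository = ProjectRepository(session_maker)

    # Assumed lookup method; substitute whatever accessor ProjectRepository provides.
    project = await project_repository.get_by_name(project_name)
    if project is None:
        raise ValueError(f"Unknown project: {project_name}")

    sync_service = await get_sync_service(project)
    report = await sync_service.sync(Path(project.path), project_name=project.name)
    print(f"{report.total} changes, {len(report.skipped_files)} files skipped")


if __name__ == "__main__":
    asyncio.run(sync_project("main"))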