basic-memory 0.2.12__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of basic-memory has been flagged as potentially problematic.

Files changed (149)
  1. basic_memory/__init__.py +5 -1
  2. basic_memory/alembic/alembic.ini +119 -0
  3. basic_memory/alembic/env.py +27 -3
  4. basic_memory/alembic/migrations.py +4 -9
  5. basic_memory/alembic/versions/502b60eaa905_remove_required_from_entity_permalink.py +51 -0
  6. basic_memory/alembic/versions/5fe1ab1ccebe_add_projects_table.py +108 -0
  7. basic_memory/alembic/versions/647e7a75e2cd_project_constraint_fix.py +104 -0
  8. basic_memory/alembic/versions/9d9c1cb7d8f5_add_mtime_and_size_columns_to_entity_.py +49 -0
  9. basic_memory/alembic/versions/a1b2c3d4e5f6_fix_project_foreign_keys.py +49 -0
  10. basic_memory/alembic/versions/b3c3938bacdb_relation_to_name_unique_index.py +44 -0
  11. basic_memory/alembic/versions/cc7172b46608_update_search_index_schema.py +100 -0
  12. basic_memory/alembic/versions/e7e1f4367280_add_scan_watermark_tracking_to_project.py +37 -0
  13. basic_memory/api/app.py +63 -31
  14. basic_memory/api/routers/__init__.py +4 -1
  15. basic_memory/api/routers/directory_router.py +84 -0
  16. basic_memory/api/routers/importer_router.py +152 -0
  17. basic_memory/api/routers/knowledge_router.py +165 -28
  18. basic_memory/api/routers/management_router.py +80 -0
  19. basic_memory/api/routers/memory_router.py +28 -67
  20. basic_memory/api/routers/project_router.py +406 -0
  21. basic_memory/api/routers/prompt_router.py +260 -0
  22. basic_memory/api/routers/resource_router.py +219 -14
  23. basic_memory/api/routers/search_router.py +21 -13
  24. basic_memory/api/routers/utils.py +130 -0
  25. basic_memory/api/template_loader.py +292 -0
  26. basic_memory/cli/app.py +52 -1
  27. basic_memory/cli/auth.py +277 -0
  28. basic_memory/cli/commands/__init__.py +13 -2
  29. basic_memory/cli/commands/cloud/__init__.py +6 -0
  30. basic_memory/cli/commands/cloud/api_client.py +112 -0
  31. basic_memory/cli/commands/cloud/bisync_commands.py +110 -0
  32. basic_memory/cli/commands/cloud/cloud_utils.py +101 -0
  33. basic_memory/cli/commands/cloud/core_commands.py +195 -0
  34. basic_memory/cli/commands/cloud/rclone_commands.py +301 -0
  35. basic_memory/cli/commands/cloud/rclone_config.py +110 -0
  36. basic_memory/cli/commands/cloud/rclone_installer.py +249 -0
  37. basic_memory/cli/commands/cloud/upload.py +233 -0
  38. basic_memory/cli/commands/cloud/upload_command.py +124 -0
  39. basic_memory/cli/commands/command_utils.py +51 -0
  40. basic_memory/cli/commands/db.py +26 -7
  41. basic_memory/cli/commands/import_chatgpt.py +83 -0
  42. basic_memory/cli/commands/import_claude_conversations.py +86 -0
  43. basic_memory/cli/commands/import_claude_projects.py +85 -0
  44. basic_memory/cli/commands/import_memory_json.py +35 -92
  45. basic_memory/cli/commands/mcp.py +84 -10
  46. basic_memory/cli/commands/project.py +876 -0
  47. basic_memory/cli/commands/status.py +47 -30
  48. basic_memory/cli/commands/tool.py +341 -0
  49. basic_memory/cli/main.py +13 -6
  50. basic_memory/config.py +481 -22
  51. basic_memory/db.py +192 -32
  52. basic_memory/deps.py +252 -22
  53. basic_memory/file_utils.py +113 -58
  54. basic_memory/ignore_utils.py +297 -0
  55. basic_memory/importers/__init__.py +27 -0
  56. basic_memory/importers/base.py +79 -0
  57. basic_memory/importers/chatgpt_importer.py +232 -0
  58. basic_memory/importers/claude_conversations_importer.py +177 -0
  59. basic_memory/importers/claude_projects_importer.py +148 -0
  60. basic_memory/importers/memory_json_importer.py +108 -0
  61. basic_memory/importers/utils.py +58 -0
  62. basic_memory/markdown/entity_parser.py +143 -23
  63. basic_memory/markdown/markdown_processor.py +3 -3
  64. basic_memory/markdown/plugins.py +39 -21
  65. basic_memory/markdown/schemas.py +1 -1
  66. basic_memory/markdown/utils.py +28 -13
  67. basic_memory/mcp/async_client.py +134 -4
  68. basic_memory/mcp/project_context.py +141 -0
  69. basic_memory/mcp/prompts/__init__.py +19 -0
  70. basic_memory/mcp/prompts/ai_assistant_guide.py +70 -0
  71. basic_memory/mcp/prompts/continue_conversation.py +62 -0
  72. basic_memory/mcp/prompts/recent_activity.py +188 -0
  73. basic_memory/mcp/prompts/search.py +57 -0
  74. basic_memory/mcp/prompts/utils.py +162 -0
  75. basic_memory/mcp/resources/ai_assistant_guide.md +283 -0
  76. basic_memory/mcp/resources/project_info.py +71 -0
  77. basic_memory/mcp/server.py +7 -13
  78. basic_memory/mcp/tools/__init__.py +33 -21
  79. basic_memory/mcp/tools/build_context.py +120 -0
  80. basic_memory/mcp/tools/canvas.py +130 -0
  81. basic_memory/mcp/tools/chatgpt_tools.py +187 -0
  82. basic_memory/mcp/tools/delete_note.py +225 -0
  83. basic_memory/mcp/tools/edit_note.py +320 -0
  84. basic_memory/mcp/tools/list_directory.py +167 -0
  85. basic_memory/mcp/tools/move_note.py +545 -0
  86. basic_memory/mcp/tools/project_management.py +200 -0
  87. basic_memory/mcp/tools/read_content.py +271 -0
  88. basic_memory/mcp/tools/read_note.py +255 -0
  89. basic_memory/mcp/tools/recent_activity.py +534 -0
  90. basic_memory/mcp/tools/search.py +369 -14
  91. basic_memory/mcp/tools/utils.py +374 -16
  92. basic_memory/mcp/tools/view_note.py +77 -0
  93. basic_memory/mcp/tools/write_note.py +207 -0
  94. basic_memory/models/__init__.py +3 -2
  95. basic_memory/models/knowledge.py +67 -15
  96. basic_memory/models/project.py +87 -0
  97. basic_memory/models/search.py +10 -6
  98. basic_memory/repository/__init__.py +2 -0
  99. basic_memory/repository/entity_repository.py +229 -7
  100. basic_memory/repository/observation_repository.py +35 -3
  101. basic_memory/repository/project_info_repository.py +10 -0
  102. basic_memory/repository/project_repository.py +103 -0
  103. basic_memory/repository/relation_repository.py +21 -2
  104. basic_memory/repository/repository.py +147 -29
  105. basic_memory/repository/search_repository.py +437 -59
  106. basic_memory/schemas/__init__.py +22 -9
  107. basic_memory/schemas/base.py +97 -8
  108. basic_memory/schemas/cloud.py +50 -0
  109. basic_memory/schemas/directory.py +30 -0
  110. basic_memory/schemas/importer.py +35 -0
  111. basic_memory/schemas/memory.py +188 -23
  112. basic_memory/schemas/project_info.py +211 -0
  113. basic_memory/schemas/prompt.py +90 -0
  114. basic_memory/schemas/request.py +57 -3
  115. basic_memory/schemas/response.py +9 -1
  116. basic_memory/schemas/search.py +33 -35
  117. basic_memory/schemas/sync_report.py +72 -0
  118. basic_memory/services/__init__.py +2 -1
  119. basic_memory/services/context_service.py +251 -106
  120. basic_memory/services/directory_service.py +295 -0
  121. basic_memory/services/entity_service.py +595 -60
  122. basic_memory/services/exceptions.py +21 -0
  123. basic_memory/services/file_service.py +284 -30
  124. basic_memory/services/initialization.py +191 -0
  125. basic_memory/services/link_resolver.py +50 -56
  126. basic_memory/services/project_service.py +863 -0
  127. basic_memory/services/search_service.py +172 -34
  128. basic_memory/sync/__init__.py +3 -2
  129. basic_memory/sync/background_sync.py +26 -0
  130. basic_memory/sync/sync_service.py +1176 -96
  131. basic_memory/sync/watch_service.py +412 -135
  132. basic_memory/templates/prompts/continue_conversation.hbs +110 -0
  133. basic_memory/templates/prompts/search.hbs +101 -0
  134. basic_memory/utils.py +388 -28
  135. basic_memory-0.16.1.dist-info/METADATA +493 -0
  136. basic_memory-0.16.1.dist-info/RECORD +148 -0
  137. {basic_memory-0.2.12.dist-info → basic_memory-0.16.1.dist-info}/entry_points.txt +1 -0
  138. basic_memory/alembic/README +0 -1
  139. basic_memory/cli/commands/sync.py +0 -203
  140. basic_memory/mcp/tools/knowledge.py +0 -56
  141. basic_memory/mcp/tools/memory.py +0 -151
  142. basic_memory/mcp/tools/notes.py +0 -122
  143. basic_memory/schemas/discovery.py +0 -28
  144. basic_memory/sync/file_change_scanner.py +0 -158
  145. basic_memory/sync/utils.py +0 -34
  146. basic_memory-0.2.12.dist-info/METADATA +0 -291
  147. basic_memory-0.2.12.dist-info/RECORD +0 -78
  148. {basic_memory-0.2.12.dist-info → basic_memory-0.16.1.dist-info}/WHEEL +0 -0
  149. {basic_memory-0.2.12.dist-info → basic_memory-0.16.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,50 +1,853 @@
 """Service for syncing files between filesystem and database."""

+import asyncio
+import os
+import time
+from collections import OrderedDict
+from dataclasses import dataclass, field
+from datetime import datetime
 from pathlib import Path
-from typing import Dict
+from typing import AsyncIterator, Dict, List, Optional, Set, Tuple

+import aiofiles.os
+import logfire
 from loguru import logger
 from sqlalchemy.exc import IntegrityError

-from basic_memory import file_utils
-from basic_memory.markdown import EntityParser, EntityMarkdown
-from basic_memory.repository import EntityRepository, RelationRepository
-from basic_memory.services import EntityService
+from basic_memory import db
+from basic_memory.config import BasicMemoryConfig, ConfigManager
+from basic_memory.file_utils import has_frontmatter
+from basic_memory.ignore_utils import load_bmignore_patterns, should_ignore_path
+from basic_memory.markdown import EntityParser, MarkdownProcessor
+from basic_memory.models import Entity, Project
+from basic_memory.repository import (
+    EntityRepository,
+    RelationRepository,
+    ObservationRepository,
+    ProjectRepository,
+)
+from basic_memory.repository.search_repository import SearchRepository
+from basic_memory.services import EntityService, FileService
+from basic_memory.services.exceptions import SyncFatalError
+from basic_memory.services.link_resolver import LinkResolver
 from basic_memory.services.search_service import SearchService
-from basic_memory.sync import FileChangeScanner
-from basic_memory.sync.utils import SyncReport

+# Circuit breaker configuration
+MAX_CONSECUTIVE_FAILURES = 3

-class SyncService:
-    """Syncs documents and knowledge files with database.

-    Implements two-pass sync strategy for knowledge files to handle relations:
-    1. First pass creates/updates entities without relations
-    2. Second pass processes relations after all entities exist
+@dataclass
+class FileFailureInfo:
+    """Track failure information for a file that repeatedly fails to sync.
+
+    Attributes:
+        count: Number of consecutive failures
+        first_failure: Timestamp of first failure in current sequence
+        last_failure: Timestamp of most recent failure
+        last_error: Error message from most recent failure
+        last_checksum: Checksum of file when it last failed (for detecting file changes)
+    """
+
+    count: int
+    first_failure: datetime
+    last_failure: datetime
+    last_error: str
+    last_checksum: str
+
+
+@dataclass
+class SkippedFile:
+    """Information about a file that was skipped due to repeated failures.
+
+    Attributes:
+        path: File path relative to project root
+        reason: Error message from last failure
+        failure_count: Number of consecutive failures
+        first_failed: Timestamp of first failure
+    """
+
+    path: str
+    reason: str
+    failure_count: int
+    first_failed: datetime
+
+
+@dataclass
+class SyncReport:
+    """Report of file changes found compared to database state.
+
+    Attributes:
+        total: Total number of files in directory being synced
+        new: Files that exist on disk but not in database
+        modified: Files that exist in both but have different checksums
+        deleted: Files that exist in database but not on disk
+        moves: Files that have been moved from one location to another
+        checksums: Current checksums for files on disk
+        skipped_files: Files that were skipped due to repeated failures
     """

+    # We keep paths as strings in sets/dicts for easier serialization
+    new: Set[str] = field(default_factory=set)
+    modified: Set[str] = field(default_factory=set)
+    deleted: Set[str] = field(default_factory=set)
+    moves: Dict[str, str] = field(default_factory=dict)  # old_path -> new_path
+    checksums: Dict[str, str] = field(default_factory=dict)  # path -> checksum
+    skipped_files: List[SkippedFile] = field(default_factory=list)
+
+    @property
+    def total(self) -> int:
+        """Total number of changes."""
+        return len(self.new) + len(self.modified) + len(self.deleted) + len(self.moves)
+
+
+@dataclass
+class ScanResult:
+    """Result of scanning a directory."""
+
+    # file_path -> checksum
+    files: Dict[str, str] = field(default_factory=dict)
+
+    # checksum -> file_path
+    checksums: Dict[str, str] = field(default_factory=dict)
+
+    # file_path -> error message
+    errors: Dict[str, str] = field(default_factory=dict)
+
+
+class SyncService:
+    """Syncs documents and knowledge files with database."""
+
     def __init__(
         self,
-        scanner: FileChangeScanner,
+        app_config: BasicMemoryConfig,
         entity_service: EntityService,
         entity_parser: EntityParser,
         entity_repository: EntityRepository,
         relation_repository: RelationRepository,
+        project_repository: ProjectRepository,
         search_service: SearchService,
+        file_service: FileService,
     ):
-        self.scanner = scanner
+        self.app_config = app_config
         self.entity_service = entity_service
         self.entity_parser = entity_parser
         self.entity_repository = entity_repository
         self.relation_repository = relation_repository
+        self.project_repository = project_repository
         self.search_service = search_service
+        self.file_service = file_service
+        # Load ignore patterns once at initialization for performance
+        self._ignore_patterns = load_bmignore_patterns()
+        # Circuit breaker: track file failures to prevent infinite retry loops
+        # Use OrderedDict for LRU behavior with bounded size to prevent unbounded memory growth
+        self._file_failures: OrderedDict[str, FileFailureInfo] = OrderedDict()
+        self._max_tracked_failures = 100  # Limit failure cache size
+
+    async def _should_skip_file(self, path: str) -> bool:
+        """Check if file should be skipped due to repeated failures.
+
+        Computes current file checksum and compares with last failed checksum.
+        If checksums differ, file has changed and we should retry.
+
+        Args:
+            path: File path to check
+
+        Returns:
+            True if file should be skipped, False otherwise
+        """
+        if path not in self._file_failures:
+            return False
+
+        failure_info = self._file_failures[path]
+
+        # Check if failure count exceeds threshold
+        if failure_info.count < MAX_CONSECUTIVE_FAILURES:
+            return False
+
+        # Compute current checksum to see if file changed
+        try:
+            current_checksum = await self.file_service.compute_checksum(path)
+
+            # If checksum changed, file was modified - reset and retry
+            if current_checksum != failure_info.last_checksum:
+                logger.info(
+                    f"File {path} changed since last failure (checksum differs), "
+                    f"resetting failure count and retrying"
+                )
+                del self._file_failures[path]
+                return False
+        except Exception as e:
+            # If we can't compute checksum, log but still skip to avoid infinite loops
+            logger.warning(f"Failed to compute checksum for {path}: {e}")
+
+        # File unchanged and exceeded threshold - skip it
+        return True
+
+    async def _record_failure(self, path: str, error: str) -> None:
+        """Record a file sync failure for circuit breaker tracking.
+
+        Uses LRU cache with bounded size to prevent unbounded memory growth.
+
+        Args:
+            path: File path that failed
+            error: Error message from the failure
+        """
+        now = datetime.now()
+
+        # Compute checksum for failure tracking
+        try:
+            checksum = await self.file_service.compute_checksum(path)
+        except Exception:
+            # If checksum fails, use empty string (better than crashing)
+            checksum = ""
+
+        if path in self._file_failures:
+            # Update existing failure record and move to end (most recently used)
+            failure_info = self._file_failures.pop(path)
+            failure_info.count += 1
+            failure_info.last_failure = now
+            failure_info.last_error = error
+            failure_info.last_checksum = checksum
+            self._file_failures[path] = failure_info
+
+            logger.warning(
+                f"File sync failed (attempt {failure_info.count}/{MAX_CONSECUTIVE_FAILURES}): "
+                f"path={path}, error={error}"
+            )
+
+            # Record metric for file failure
+            logfire.metric_counter("sync.circuit_breaker.failures").add(1)
+
+            # Log when threshold is reached
+            if failure_info.count >= MAX_CONSECUTIVE_FAILURES:
+                logger.error(
+                    f"File {path} has failed {MAX_CONSECUTIVE_FAILURES} times and will be skipped. "
+                    f"First failure: {failure_info.first_failure}, Last error: {error}"
+                )
+                # Record metric for file being blocked by circuit breaker
+                logfire.metric_counter("sync.circuit_breaker.blocked_files").add(1)
+        else:
+            # Create new failure record
+            self._file_failures[path] = FileFailureInfo(
+                count=1,
+                first_failure=now,
+                last_failure=now,
+                last_error=error,
+                last_checksum=checksum,
+            )
+            logger.debug(f"Recording first failure for {path}: {error}")
+
+        # Enforce cache size limit - remove oldest entry if over limit
+        if len(self._file_failures) > self._max_tracked_failures:
+            removed_path, removed_info = self._file_failures.popitem(last=False)
+            logger.debug(
+                f"Evicting oldest failure record from cache: path={removed_path}, "
+                f"failures={removed_info.count}"
+            )
+
+    def _clear_failure(self, path: str) -> None:
+        """Clear failure tracking for a file after successful sync.
+
+        Args:
+            path: File path that successfully synced
+        """
+        if path in self._file_failures:
+            logger.info(f"Clearing failure history for {path} after successful sync")
+            del self._file_failures[path]
+
+    @logfire.instrument()
+    async def sync(
+        self, directory: Path, project_name: Optional[str] = None, force_full: bool = False
+    ) -> SyncReport:
+        """Sync all files with database and update scan watermark.
+
+        Args:
+            directory: Directory to sync
+            project_name: Optional project name
+            force_full: If True, force a full scan bypassing watermark optimization
+        """
+
+        start_time = time.time()
+        sync_start_timestamp = time.time()  # Capture at start for watermark
+        logger.info(f"Sync operation started for directory: {directory} (force_full={force_full})")
+
+        # initial paths from db to sync
+        # path -> checksum
+        report = await self.scan(directory, force_full=force_full)
+
+        # order of sync matters to resolve relations effectively
+        logger.info(
+            f"Sync changes detected: new_files={len(report.new)}, modified_files={len(report.modified)}, "
+            + f"deleted_files={len(report.deleted)}, moved_files={len(report.moves)}"
+        )
+
+        # sync moves first
+        with logfire.span("process_moves", move_count=len(report.moves)):
+            for old_path, new_path in report.moves.items():
+                # in the case where a file has been deleted and replaced by another file
+                # it will show up in the move and modified lists, so handle it in modified
+                if new_path in report.modified:
+                    report.modified.remove(new_path)
+                    logger.debug(
+                        f"File marked as moved and modified: old_path={old_path}, new_path={new_path}"
+                    )
+                else:
+                    await self.handle_move(old_path, new_path)
+
+        # deleted next
+        with logfire.span("process_deletes", delete_count=len(report.deleted)):
+            for path in report.deleted:
+                await self.handle_delete(path)
+
+        # then new and modified
+        with logfire.span("process_new_files", new_count=len(report.new)):
+            for path in report.new:
+                entity, _ = await self.sync_file(path, new=True)
+
+                # Track if file was skipped
+                if entity is None and await self._should_skip_file(path):
+                    failure_info = self._file_failures[path]
+                    report.skipped_files.append(
+                        SkippedFile(
+                            path=path,
+                            reason=failure_info.last_error,
+                            failure_count=failure_info.count,
+                            first_failed=failure_info.first_failure,
+                        )
+                    )
+
+        with logfire.span("process_modified_files", modified_count=len(report.modified)):
+            for path in report.modified:
+                entity, _ = await self.sync_file(path, new=False)
+
+                # Track if file was skipped
+                if entity is None and await self._should_skip_file(path):
+                    failure_info = self._file_failures[path]
+                    report.skipped_files.append(
+                        SkippedFile(
+                            path=path,
+                            reason=failure_info.last_error,
+                            failure_count=failure_info.count,
+                            first_failed=failure_info.first_failure,
+                        )
+                    )

-    async def handle_entity_deletion(self, file_path: str):
+        # Only resolve relations if there were actual changes
+        # If no files changed, no new unresolved relations could have been created
+        with logfire.span("resolve_relations"):
+            if report.total > 0:
+                await self.resolve_relations()
+            else:
+                logger.info("Skipping relation resolution - no file changes detected")
+
+        # Update scan watermark after successful sync
+        # Use the timestamp from sync start (not end) to ensure we catch files
+        # created during the sync on the next iteration
+        current_file_count = await self._quick_count_files(directory)
+        if self.entity_repository.project_id is not None:
+            project = await self.project_repository.find_by_id(self.entity_repository.project_id)
+            if project:
+                await self.project_repository.update(
+                    project.id,
+                    {
+                        "last_scan_timestamp": sync_start_timestamp,
+                        "last_file_count": current_file_count,
+                    },
+                )
+                logger.debug(
+                    f"Updated scan watermark: timestamp={sync_start_timestamp}, "
+                    f"file_count={current_file_count}"
+                )
+
+        duration_ms = int((time.time() - start_time) * 1000)
+
+        # Record metrics for sync operation
+        logfire.metric_histogram("sync.duration", unit="ms").record(duration_ms)
+        logfire.metric_counter("sync.files.new").add(len(report.new))
+        logfire.metric_counter("sync.files.modified").add(len(report.modified))
+        logfire.metric_counter("sync.files.deleted").add(len(report.deleted))
+        logfire.metric_counter("sync.files.moved").add(len(report.moves))
+        if report.skipped_files:
+            logfire.metric_counter("sync.files.skipped").add(len(report.skipped_files))
+
+        # Log summary with skipped files if any
+        if report.skipped_files:
+            logger.warning(
+                f"Sync completed with {len(report.skipped_files)} skipped files: "
+                f"directory={directory}, total_changes={report.total}, "
+                f"skipped={len(report.skipped_files)}, duration_ms={duration_ms}"
+            )
+            for skipped in report.skipped_files:
+                logger.warning(
+                    f"Skipped file: path={skipped.path}, "
+                    f"failures={skipped.failure_count}, reason={skipped.reason}"
+                )
+        else:
+            logger.info(
+                f"Sync operation completed: directory={directory}, "
+                f"total_changes={report.total}, duration_ms={duration_ms}"
+            )
+
+        return report
+
+    @logfire.instrument()
+    async def scan(self, directory, force_full: bool = False):
+        """Smart scan using watermark and file count for large project optimization.
+
+        Uses scan watermark tracking to dramatically reduce scan time for large projects:
+        - Tracks last_scan_timestamp and last_file_count in Project model
+        - Uses `find -newermt` for incremental scanning (only changed files)
+        - Falls back to full scan when deletions detected (file count decreased)
+
+        Expected performance:
+        - No changes: 225x faster (2s vs 450s for 1,460 files on TigrisFS)
+        - Few changes: 84x faster (5s vs 420s)
+        - Deletions: Full scan (rare, acceptable)
+
+        Architecture:
+        - Get current file count quickly (find | wc -l: 1.4s)
+        - Compare with last_file_count to detect deletions
+        - If no deletions: incremental scan with find -newermt (0.2s)
+        - Process changed files with mtime-based comparison
+
+        Args:
+            directory: Directory to scan
+            force_full: If True, bypass watermark optimization and force full scan
+        """
+        scan_start_time = time.time()
+
+        report = SyncReport()
+
+        # Get current project to check watermark
+        if self.entity_repository.project_id is None:
+            raise ValueError("Entity repository has no project_id set")
+
+        project = await self.project_repository.find_by_id(self.entity_repository.project_id)
+        if project is None:
+            raise ValueError(f"Project not found: {self.entity_repository.project_id}")
+
+        # Step 1: Quick file count
+        logger.debug("Counting files in directory")
+        current_count = await self._quick_count_files(directory)
+        logger.debug(f"Found {current_count} files in directory")
+
+        # Step 2: Determine scan strategy based on watermark and file count
+        if force_full:
+            # User explicitly requested full scan → bypass watermark optimization
+            scan_type = "full_forced"
+            logger.info("Force full scan requested, bypassing watermark optimization")
+            file_paths_to_scan = await self._scan_directory_full(directory)
+
+        elif project.last_file_count is None:
+            # First sync ever → full scan
+            scan_type = "full_initial"
+            logger.info("First sync for this project, performing full scan")
+            file_paths_to_scan = await self._scan_directory_full(directory)
+
+        elif current_count < project.last_file_count:
+            # Files deleted → need full scan to detect which ones
+            scan_type = "full_deletions"
+            logger.info(
+                f"File count decreased ({project.last_file_count} → {current_count}), "
+                f"running full scan to detect deletions"
+            )
+            file_paths_to_scan = await self._scan_directory_full(directory)
+
+        elif project.last_scan_timestamp is not None:
+            # Incremental scan: only files modified since last scan
+            scan_type = "incremental"
+            logger.info(
+                f"Running incremental scan for files modified since {project.last_scan_timestamp}"
+            )
+            file_paths_to_scan = await self._scan_directory_modified_since(
+                directory, project.last_scan_timestamp
+            )
+            logger.info(
+                f"Incremental scan found {len(file_paths_to_scan)} potentially changed files"
+            )
+
+        else:
+            # Fallback to full scan (no watermark available)
+            scan_type = "full_fallback"
+            logger.warning("No scan watermark available, falling back to full scan")
+            file_paths_to_scan = await self._scan_directory_full(directory)
+
+        # Record scan type metric
+        logfire.metric_counter(f"sync.scan.{scan_type}").add(1)
+        logfire.metric_histogram("sync.scan.files_scanned", unit="files").record(
+            len(file_paths_to_scan)
+        )
+
+        # Step 3: Process each file with mtime-based comparison
+        scanned_paths: Set[str] = set()
+        changed_checksums: Dict[str, str] = {}
+
+        logger.debug(f"Processing {len(file_paths_to_scan)} files with mtime-based comparison")
+
+        for rel_path in file_paths_to_scan:
+            scanned_paths.add(rel_path)
+
+            # Get file stats
+            abs_path = directory / rel_path
+            if not abs_path.exists():
+                # File was deleted between scan and now (race condition)
+                continue
+
+            stat_info = abs_path.stat()
+
+            # Indexed lookup - single file query (not full table scan)
+            db_entity = await self.entity_repository.get_by_file_path(rel_path)
+
+            if db_entity is None:
+                # New file - need checksum for move detection
+                checksum = await self.file_service.compute_checksum(rel_path)
+                report.new.add(rel_path)
+                changed_checksums[rel_path] = checksum
+                logger.trace(f"New file detected: {rel_path}")
+                continue
+
+            # File exists in DB - check if mtime/size changed
+            db_mtime = db_entity.mtime
+            db_size = db_entity.size
+            fs_mtime = stat_info.st_mtime
+            fs_size = stat_info.st_size
+
+            # Compare mtime and size (like rsync/rclone)
+            # Allow small epsilon for float comparison (0.01s = 10ms)
+            mtime_changed = db_mtime is None or abs(fs_mtime - db_mtime) > 0.01
+            size_changed = db_size is None or fs_size != db_size
+
+            if mtime_changed or size_changed:
+                # File modified - compute checksum
+                checksum = await self.file_service.compute_checksum(rel_path)
+                db_checksum = db_entity.checksum
+
+                # Only mark as modified if checksum actually differs
+                # (handles cases where mtime changed but content didn't, e.g., git operations)
+                if checksum != db_checksum:
+                    report.modified.add(rel_path)
+                    changed_checksums[rel_path] = checksum
+                    logger.trace(
+                        f"Modified file detected: {rel_path}, "
+                        f"mtime_changed={mtime_changed}, size_changed={size_changed}"
+                    )
+            else:
+                # File unchanged - no checksum needed
+                logger.trace(f"File unchanged (mtime/size match): {rel_path}")
+
+        # Step 4: Detect moves (for both full and incremental scans)
+        # Check if any "new" files are actually moves by matching checksums
+        for new_path in list(report.new):  # Use list() to allow modification during iteration
+            new_checksum = changed_checksums.get(new_path)
+            if not new_checksum:
+                continue
+
+            # Look for existing entity with same checksum but different path
+            # This could be a move or a copy
+            existing_entities = await self.entity_repository.find_by_checksum(new_checksum)
+
+            for candidate in existing_entities:
+                if candidate.file_path == new_path:
+                    # Same path, skip (shouldn't happen for "new" files but be safe)
+                    continue
+
+                # Check if the old path still exists on disk
+                old_path_abs = directory / candidate.file_path
+                if old_path_abs.exists():
+                    # Original still exists → this is a copy, not a move
+                    logger.trace(
+                        f"File copy detected (not move): {candidate.file_path} copied to {new_path}"
+                    )
+                    continue
+
+                # Original doesn't exist → this is a move!
+                report.moves[candidate.file_path] = new_path
+                report.new.remove(new_path)
+                logger.trace(f"Move detected: {candidate.file_path} -> {new_path}")
+                break  # Only match first candidate
+
+        # Step 5: Detect deletions (only for full scans)
+        # Incremental scans can't reliably detect deletions since they only see modified files
+        if scan_type in ("full_initial", "full_deletions", "full_fallback", "full_forced"):
+            # Use optimized query for just file paths (not full entities)
+            db_file_paths = await self.entity_repository.get_all_file_paths()
+            logger.debug(f"Found {len(db_file_paths)} db paths for deletion detection")
+
+            for db_path in db_file_paths:
+                if db_path not in scanned_paths:
+                    # File in DB but not on filesystem
+                    # Check if it was already detected as a move
+                    if db_path in report.moves:
+                        # Already handled as a move, skip
+                        continue
+
+                    # File was deleted
+                    report.deleted.add(db_path)
+                    logger.trace(f"Deleted file detected: {db_path}")
+
+        # Store checksums for files that need syncing
+        report.checksums = changed_checksums
+
+        scan_duration_ms = int((time.time() - scan_start_time) * 1000)
+        logfire.metric_histogram("sync.scan.duration", unit="ms").record(scan_duration_ms)
+
+        logger.info(
+            f"Completed {scan_type} scan for directory {directory} in {scan_duration_ms}ms, "
+            f"found {report.total} changes (new={len(report.new)}, "
+            f"modified={len(report.modified)}, deleted={len(report.deleted)}, "
+            f"moves={len(report.moves)})"
+        )
+        return report
+
+    @logfire.instrument()
+    async def sync_file(
+        self, path: str, new: bool = True
+    ) -> Tuple[Optional[Entity], Optional[str]]:
+        """Sync a single file with circuit breaker protection.
+
+        Args:
+            path: Path to file to sync
+            new: Whether this is a new file
+
+        Returns:
+            Tuple of (entity, checksum) or (None, None) if sync fails or file is skipped
+        """
+        # Check if file should be skipped due to repeated failures
+        if await self._should_skip_file(path):
+            logger.warning(f"Skipping file due to repeated failures: {path}")
+            return None, None
+
+        try:
+            logger.debug(
+                f"Syncing file path={path} is_new={new} is_markdown={self.file_service.is_markdown(path)}"
+            )
+
+            if self.file_service.is_markdown(path):
+                entity, checksum = await self.sync_markdown_file(path, new)
+            else:
+                entity, checksum = await self.sync_regular_file(path, new)
+
+            if entity is not None:
+                await self.search_service.index_entity(entity)
+
+            # Clear failure tracking on successful sync
+            self._clear_failure(path)
+
+            logger.debug(
+                f"File sync completed, path={path}, entity_id={entity.id}, checksum={checksum[:8]}"
+            )
+            return entity, checksum
+
+        except Exception as e:
+            # Check if this is a fatal error (or caused by one)
+            # Fatal errors like project deletion should terminate sync immediately
+            if isinstance(e, SyncFatalError) or isinstance(e.__cause__, SyncFatalError):
+                logger.error(f"Fatal sync error encountered, terminating sync: path={path}")
+                raise
+
+            # Otherwise treat as recoverable file-level error
+            error_msg = str(e)
+            logger.error(f"Failed to sync file: path={path}, error={error_msg}")
+
+            # Record failure for circuit breaker
+            await self._record_failure(path, error_msg)
+
+            return None, None
+
+    @logfire.instrument()
+    async def sync_markdown_file(self, path: str, new: bool = True) -> Tuple[Optional[Entity], str]:
+        """Sync a markdown file with full processing.
+
+        Args:
+            path: Path to markdown file
+            new: Whether this is a new file
+
+        Returns:
+            Tuple of (entity, checksum)
+        """
+        # Parse markdown first to get any existing permalink
+        logger.debug(f"Parsing markdown file, path: {path}, new: {new}")
+
+        file_content = await self.file_service.read_file_content(path)
+        file_contains_frontmatter = has_frontmatter(file_content)
+
+        # Get file timestamps for tracking modification times
+        file_stats = self.file_service.file_stats(path)
+        created = datetime.fromtimestamp(file_stats.st_ctime).astimezone()
+        modified = datetime.fromtimestamp(file_stats.st_mtime).astimezone()
+
+        # entity markdown will always contain front matter, so it can be used up create/update the entity
+        entity_markdown = await self.entity_parser.parse_file(path)
+
+        # if the file contains frontmatter, resolve a permalink (unless disabled)
+        if file_contains_frontmatter and not self.app_config.disable_permalinks:
+            # Resolve permalink - skip conflict checks during bulk sync for performance
+            permalink = await self.entity_service.resolve_permalink(
+                path, markdown=entity_markdown, skip_conflict_check=True
+            )
+
+            # If permalink changed, update the file
+            if permalink != entity_markdown.frontmatter.permalink:
+                logger.info(
+                    f"Updating permalink for path: {path}, old_permalink: {entity_markdown.frontmatter.permalink}, new_permalink: {permalink}"
+                )
+
+                entity_markdown.frontmatter.metadata["permalink"] = permalink
+                await self.file_service.update_frontmatter(path, {"permalink": permalink})
+
+        # if the file is new, create an entity
+        if new:
+            # Create entity with final permalink
+            logger.debug(f"Creating new entity from markdown, path={path}")
+            await self.entity_service.create_entity_from_markdown(Path(path), entity_markdown)
+
+        # otherwise we need to update the entity and observations
+        else:
+            logger.debug(f"Updating entity from markdown, path={path}")
+            await self.entity_service.update_entity_and_observations(Path(path), entity_markdown)
+
+        # Update relations and search index
+        entity = await self.entity_service.update_entity_relations(path, entity_markdown)
+
+        # After updating relations, we need to compute the checksum again
+        # This is necessary for files with wikilinks to ensure consistent checksums
+        # after relation processing is complete
+        final_checksum = await self.file_service.compute_checksum(path)
+
+        # Update checksum, timestamps, and file metadata from file system
+        # Store mtime/size for efficient change detection in future scans
+        # This ensures temporal ordering in search and recent activity uses actual file modification times
+        await self.entity_repository.update(
+            entity.id,
+            {
+                "checksum": final_checksum,
+                "created_at": created,
+                "updated_at": modified,
+                "mtime": file_stats.st_mtime,
+                "size": file_stats.st_size,
+            },
+        )
+
+        logger.debug(
+            f"Markdown sync completed: path={path}, entity_id={entity.id}, "
+            f"observation_count={len(entity.observations)}, relation_count={len(entity.relations)}, "
+            f"checksum={final_checksum[:8]}"
+        )
+
+        # Return the final checksum to ensure everything is consistent
+        return entity, final_checksum
+
+    @logfire.instrument()
+    async def sync_regular_file(self, path: str, new: bool = True) -> Tuple[Optional[Entity], str]:
+        """Sync a non-markdown file with basic tracking.
+
+        Args:
+            path: Path to file
+            new: Whether this is a new file
+
+        Returns:
+            Tuple of (entity, checksum)
+        """
+        checksum = await self.file_service.compute_checksum(path)
+        if new:
+            # Generate permalink from path - skip conflict checks during bulk sync
+            await self.entity_service.resolve_permalink(path, skip_conflict_check=True)
+
+            # get file timestamps
+            file_stats = self.file_service.file_stats(path)
+            created = datetime.fromtimestamp(file_stats.st_ctime).astimezone()
+            modified = datetime.fromtimestamp(file_stats.st_mtime).astimezone()
+
+            # get mime type
+            content_type = self.file_service.content_type(path)
+
+            file_path = Path(path)
+            try:
+                entity = await self.entity_repository.add(
+                    Entity(
+                        entity_type="file",
+                        file_path=path,
+                        checksum=checksum,
+                        title=file_path.name,
+                        created_at=created,
+                        updated_at=modified,
+                        content_type=content_type,
+                        mtime=file_stats.st_mtime,
+                        size=file_stats.st_size,
+                    )
+                )
+                return entity, checksum
+            except IntegrityError as e:
+                # Handle race condition where entity was created by another process
+                if "UNIQUE constraint failed: entity.file_path" in str(e):
+                    logger.info(
+                        f"Entity already exists for file_path={path}, updating instead of creating"
+                    )
+                    # Treat as update instead of create
+                    entity = await self.entity_repository.get_by_file_path(path)
+                    if entity is None:  # pragma: no cover
+                        logger.error(f"Entity not found after constraint violation, path={path}")
+                        raise ValueError(f"Entity not found after constraint violation: {path}")
+
+                    # Re-get file stats since we're in update path
+                    file_stats_for_update = self.file_service.file_stats(path)
+                    updated = await self.entity_repository.update(
+                        entity.id,
+                        {
+                            "file_path": path,
+                            "checksum": checksum,
+                            "mtime": file_stats_for_update.st_mtime,
+                            "size": file_stats_for_update.st_size,
+                        },
+                    )
+
+                    if updated is None:  # pragma: no cover
+                        logger.error(f"Failed to update entity, entity_id={entity.id}, path={path}")
+                        raise ValueError(f"Failed to update entity with ID {entity.id}")
+
+                    return updated, checksum
+                else:
+                    # Re-raise if it's a different integrity error
+                    raise
+        else:
+            # Get file timestamps for updating modification time
+            file_stats = self.file_service.file_stats(path)
+            modified = datetime.fromtimestamp(file_stats.st_mtime).astimezone()
+
+            entity = await self.entity_repository.get_by_file_path(path)
+            if entity is None:  # pragma: no cover
+                logger.error(f"Entity not found for existing file, path={path}")
+                raise ValueError(f"Entity not found for existing file: {path}")
+
+            # Update checksum, modification time, and file metadata from file system
+            # Store mtime/size for efficient change detection in future scans
+            updated = await self.entity_repository.update(
+                entity.id,
+                {
+                    "file_path": path,
+                    "checksum": checksum,
+                    "updated_at": modified,
+                    "mtime": file_stats.st_mtime,
+                    "size": file_stats.st_size,
+                },
+            )
+
+            if updated is None:  # pragma: no cover
+                logger.error(f"Failed to update entity, entity_id={entity.id}, path={path}")
+                raise ValueError(f"Failed to update entity with ID {entity.id}")
+
+            return updated, checksum
+
+    @logfire.instrument()
+    async def handle_delete(self, file_path: str):
         """Handle complete entity deletion including search index cleanup."""
+
         # First get entity to get permalink before deletion
         entity = await self.entity_repository.get_by_file_path(file_path)
         if entity:
-            logger.debug(f"Deleting entity and cleaning up search index: {file_path}")
+            logger.info(
+                f"Deleting entity with file_path={file_path}, entity_id={entity.id}, permalink={entity.permalink}"
+            )

             # Delete from db (this cascades to observations/relations)
             await self.entity_service.delete_entity_by_file_path(file_path)
@@ -55,111 +858,388 @@ class SyncService:
                 + [o.permalink for o in entity.observations]
                 + [r.permalink for r in entity.relations]
             )
-            logger.debug(f"Deleting from search index: {permalinks}")
+
+            logger.debug(
+                f"Cleaning up search index for entity_id={entity.id}, file_path={file_path}, "
+                f"index_entries={len(permalinks)}"
+            )
+
             for permalink in permalinks:
-                await self.search_service.delete_by_permalink(permalink)
-
-    async def sync(self, directory: Path) -> SyncReport:
-        """Sync knowledge files with database."""
-        changes = await self.scanner.find_knowledge_changes(directory)
-        logger.info(f"Found {changes.total_changes} knowledge changes")
-
-        # Handle moves first
-        for old_path, new_path in changes.moves.items():
-            logger.debug(f"Moving entity: {old_path} -> {new_path}")
-            entity = await self.entity_repository.get_by_file_path(old_path)
-            if entity:
-                # Update file_path but keep the same permalink for link stability
-                updated = await self.entity_repository.update(
-                    entity.id, {"file_path": new_path, "checksum": changes.checksums[new_path]}
-                )
-                # update search index
-                if updated:
-                    await self.search_service.index_entity(updated)
+                if permalink:
+                    await self.search_service.delete_by_permalink(permalink)
+                else:
+                    await self.search_service.delete_by_entity_id(entity.id)
+
+    @logfire.instrument()
+    async def handle_move(self, old_path, new_path):
+        logger.debug("Moving entity", old_path=old_path, new_path=new_path)

-        # Handle deletions next
-        # remove rows from db for files no longer present
-        for path in changes.deleted:
-            await self.handle_entity_deletion(path)
+        entity = await self.entity_repository.get_by_file_path(old_path)
+        if entity:
+            # Check if destination path is already occupied by another entity
+            existing_at_destination = await self.entity_repository.get_by_file_path(new_path)
+            if existing_at_destination and existing_at_destination.id != entity.id:
+                # Handle the conflict - this could be a file swap or replacement scenario
+                logger.warning(
+                    f"File path conflict detected during move: "
+                    f"entity_id={entity.id} trying to move from '{old_path}' to '{new_path}', "
+                    f"but entity_id={existing_at_destination.id} already occupies '{new_path}'"
+                )

-        # Parse files that need updating
-        parsed_entities: Dict[str, EntityMarkdown] = {}
+                # Check if this is a file swap (the destination entity is being moved to our old path)
+                # This would indicate a simultaneous move operation
+                old_path_after_swap = await self.entity_repository.get_by_file_path(old_path)
+                if old_path_after_swap and old_path_after_swap.id == existing_at_destination.id:
+                    logger.info(f"Detected file swap between '{old_path}' and '{new_path}'")
+                    # This is a swap scenario - both moves should succeed
+                    # We'll allow this to proceed since the other file has moved out
+                else:
+                    # This is a conflict where the destination is occupied
+                    raise ValueError(
+                        f"Cannot move entity from '{old_path}' to '{new_path}': "
+                        f"destination path is already occupied by another file. "
+                        f"This may be caused by: "
+                        f"1. Conflicting file names with different character encodings, "
+                        f"2. Case sensitivity differences (e.g., 'Finance/' vs 'finance/'), "
+                        f"3. Character conflicts between hyphens in filenames and generated permalinks, "
+                        f"4. Files with similar names containing special characters. "
+                        f"Try renaming one of the conflicting files to resolve this issue."
+                    )

-        for path in [*changes.new, *changes.modified]:
-            entity_markdown = await self.entity_parser.parse_file(directory / path)
-            parsed_entities[path] = entity_markdown
+            # Update file_path in all cases
+            updates = {"file_path": new_path}

-        # First pass: Create/update entities
-        # entities will have a null checksum to indicate they are not complete
-        for path, entity_markdown in parsed_entities.items():
-            # Get unique permalink and update markdown if needed
-            permalink = await self.entity_service.resolve_permalink(
-                Path(path), markdown=entity_markdown
-            )
+            # If configured, also update permalink to match new path
+            if (
+                self.app_config.update_permalinks_on_move
+                and not self.app_config.disable_permalinks
+                and self.file_service.is_markdown(new_path)
+            ):
+                # generate new permalink value - skip conflict checks during bulk sync
+                new_permalink = await self.entity_service.resolve_permalink(
+                    new_path, skip_conflict_check=True
+                )

-            if permalink != entity_markdown.frontmatter.permalink:
-                # Add/update permalink in frontmatter
-                logger.info(f"Adding permalink '{permalink}' to file: {path}")
+                # write to file and get new checksum
+                new_checksum = await self.file_service.update_frontmatter(
+                    new_path, {"permalink": new_permalink}
+                )

-                # update markdown
-                entity_markdown.frontmatter.metadata["permalink"] = permalink
+                updates["permalink"] = new_permalink
+                updates["checksum"] = new_checksum

-                # update file frontmatter
-                updated_checksum = await file_utils.update_frontmatter(
-                    directory / path, {"permalink": permalink}
+                logger.info(
+                    f"Updating permalink on move,old_permalink={entity.permalink}"
+                    f"new_permalink={new_permalink}"
+                    f"new_checksum={new_checksum}"
                 )

-                # Update checksum in changes report since file was modified
-                changes.checksums[path] = updated_checksum
+            try:
+                updated = await self.entity_repository.update(entity.id, updates)
+            except Exception as e:
+                # Catch any database integrity errors and provide helpful context
+                if "UNIQUE constraint failed" in str(e):
+                    logger.error(
+                        f"Database constraint violation during move: "
+                        f"entity_id={entity.id}, old_path='{old_path}', new_path='{new_path}'"
+                    )
+                    raise ValueError(
+                        f"Cannot complete move from '{old_path}' to '{new_path}': "
+                        f"a database constraint was violated. This usually indicates "
+                        f"a file path or permalink conflict. Please check for: "
+                        f"1. Duplicate file names, "
+                        f"2. Case sensitivity issues (e.g., 'File.md' vs 'file.md'), "
+                        f"3. Character encoding conflicts in file names."
+                    ) from e
+                else:
+                    # Re-raise other exceptions as-is
+                    raise

-            # if the file is new, create an entity
-            if path in changes.new:
-                # Create entity with final permalink
-                logger.debug(f"Creating new entity_markdown: {path}")
-                await self.entity_service.create_entity_from_markdown(Path(path), entity_markdown)
-            # otherwise we need to update the entity and observations
-            else:
-                logger.debug(f"Updating entity_markdown: {path}")
-                await self.entity_service.update_entity_and_observations(
-                    Path(path), entity_markdown
+            if updated is None:  # pragma: no cover
+                logger.error(
+                    "Failed to update entity path"
+                    f"entity_id={entity.id}"
+                    f"old_path={old_path}"
+                    f"new_path={new_path}"
                 )
+                raise ValueError(f"Failed to update entity path for ID {entity.id}")
+
+            logger.debug(
+                "Entity path updated"
+                f"entity_id={entity.id} "
+                f"permalink={entity.permalink} "
+                f"old_path={old_path} "
+                f"new_path={new_path} "
+            )

-        # Second pass
-        for path, entity_markdown in parsed_entities.items():
-            logger.debug(f"Updating relations for: {path}")
+            # update search index
+            await self.search_service.index_entity(updated)

-            # Process relations
-            checksum = changes.checksums[path]
-            entity = await self.entity_service.update_entity_relations(Path(path), entity_markdown)
+    @logfire.instrument()
+    async def resolve_relations(self, entity_id: int | None = None):
+        """Try to resolve unresolved relations.

-            # add to search index
-            await self.search_service.index_entity(entity)
+        Args:
+            entity_id: If provided, only resolve relations for this specific entity.
+                Otherwise, resolve all unresolved relations in the database.
+        """

-            # Set final checksum to mark sync complete
-            await self.entity_repository.update(entity.id, {"checksum": checksum})
+        if entity_id:
+            # Only get unresolved relations for the specific entity
+            unresolved_relations = (
+                await self.relation_repository.find_unresolved_relations_for_entity(entity_id)
+            )
+            logger.info(
+                f"Resolving forward references for entity {entity_id}",
+                count=len(unresolved_relations),
+            )
+        else:
+            # Get all unresolved relations (original behavior)
+            unresolved_relations = await self.relation_repository.find_unresolved_relations()
+            logger.info("Resolving all forward references", count=len(unresolved_relations))

-        # Third pass: Try to resolve any forward references
-        logger.debug("Attempting to resolve forward references")
-        for relation in await self.relation_repository.find_unresolved_relations():
-            target_entity = await self.entity_service.link_resolver.resolve_link(relation.to_name)
-            # check we found a link that is not the source
-            if target_entity and target_entity.id != relation.from_id:
+        for relation in unresolved_relations:
+            logger.trace(
+                "Attempting to resolve relation "
+                f"relation_id={relation.id} "
+                f"from_id={relation.from_id} "
+                f"to_name={relation.to_name}"
+            )
+
+            resolved_entity = await self.entity_service.link_resolver.resolve_link(relation.to_name)
+
+            # ignore reference to self
+            if resolved_entity and resolved_entity.id != relation.from_id:
                 logger.debug(
-                    f"Resolved forward reference: {relation.to_name} -> {target_entity.permalink}"
+                    "Resolved forward reference "
+                    f"relation_id={relation.id} "
+                    f"from_id={relation.from_id} "
+                    f"to_name={relation.to_name} "
+                    f"resolved_id={resolved_entity.id} "
+                    f"resolved_title={resolved_entity.title}",
                 )
-
                 try:
                     await self.relation_repository.update(
                         relation.id,
                         {
-                            "to_id": target_entity.id,
-                            "to_name": target_entity.title,  # Update to actual title
+                            "to_id": resolved_entity.id,
+                            "to_name": resolved_entity.title,
                         },
                     )
-                except IntegrityError:
-                    logger.debug(f"Ignoring duplicate relation {relation}")
+                except IntegrityError:  # pragma: no cover
+                    logger.debug(
+                        "Ignoring duplicate relation "
+                        f"relation_id={relation.id} "
+                        f"from_id={relation.from_id} "
+                        f"to_name={relation.to_name}"
+                    )

                 # update search index
-                await self.search_service.index_entity(target_entity)
+                await self.search_service.index_entity(resolved_entity)
+
+    async def _quick_count_files(self, directory: Path) -> int:
+        """Fast file count using find command.
+
+        Uses subprocess to leverage OS-level file counting which is much faster
+        than Python iteration, especially on network filesystems like TigrisFS.
+
+        Args:
+            directory: Directory to count files in
+
+        Returns:
+            Number of files in directory (recursive)
+        """
+        process = await asyncio.create_subprocess_shell(
+            f'find "{directory}" -type f | wc -l',
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await process.communicate()
+
+        if process.returncode != 0:
+            error_msg = stderr.decode().strip()
+            logger.error(
+                f"FILE COUNT OPTIMIZATION FAILED: find command failed with exit code {process.returncode}, "
+                f"error: {error_msg}. Falling back to manual count. "
+                f"This will slow down watermark detection!"
+            )
+            # Track optimization failures for visibility
+            logfire.metric_counter("sync.scan.file_count_failure").add(1)
+            # Fallback: count using scan_directory
+            count = 0
+            async for _ in self.scan_directory(directory):
+                count += 1
+            return count
+
+        return int(stdout.strip())
+
+    async def _scan_directory_modified_since(
+        self, directory: Path, since_timestamp: float
+    ) -> List[str]:
+        """Use find -newermt for filesystem-level filtering of modified files.
+
+        This is dramatically faster than scanning all files and comparing mtimes,
+        especially on network filesystems like TigrisFS where stat operations are expensive.
+
+        Args:
+            directory: Directory to scan
+            since_timestamp: Unix timestamp to find files newer than
+
+        Returns:
+            List of relative file paths modified since the timestamp (respects .bmignore)
+        """
+        # Convert timestamp to find-compatible format
+        since_date = datetime.fromtimestamp(since_timestamp).strftime("%Y-%m-%d %H:%M:%S")
+
+        process = await asyncio.create_subprocess_shell(
+            f'find "{directory}" -type f -newermt "{since_date}"',
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await process.communicate()
+
+        if process.returncode != 0:
+            error_msg = stderr.decode().strip()
+            logger.error(
+                f"SCAN OPTIMIZATION FAILED: find -newermt command failed with exit code {process.returncode}, "
+                f"error: {error_msg}. Falling back to full scan. "
+                f"This will cause slow syncs on large projects!"
+            )
+            # Track optimization failures for visibility
+            logfire.metric_counter("sync.scan.optimization_failure").add(1)
+            # Fallback to full scan
+            return await self._scan_directory_full(directory)
+
+        # Convert absolute paths to relative and filter through ignore patterns
+        file_paths = []
+        for line in stdout.decode().splitlines():
+            if line:
+                try:
+                    abs_path = Path(line)
+                    rel_path = abs_path.relative_to(directory).as_posix()
+
+                    # Apply ignore patterns (same as scan_directory)
+                    if should_ignore_path(abs_path, directory, self._ignore_patterns):
+                        logger.trace(f"Ignoring path per .bmignore: {rel_path}")
+                        continue
+
+                    file_paths.append(rel_path)
+                except ValueError:
+                    # Path is not relative to directory, skip it
+                    logger.warning(f"Skipping file not under directory: {line}")
+                    continue
+
+        return file_paths
+
+    async def _scan_directory_full(self, directory: Path) -> List[str]:
+        """Full directory scan returning all file paths.
+
+        Uses scan_directory() which respects .bmignore patterns.
+
+        Args:
+            directory: Directory to scan
+
+        Returns:
+            List of relative file paths (respects .bmignore)
+        """
+        file_paths = []
+        async for file_path_str, _ in self.scan_directory(directory):
+            rel_path = Path(file_path_str).relative_to(directory).as_posix()
+            file_paths.append(rel_path)
+        return file_paths
+
+    async def scan_directory(self, directory: Path) -> AsyncIterator[Tuple[str, os.stat_result]]:
+        """Stream files from directory using aiofiles.os.scandir() with cached stat info.
+
+        This method uses aiofiles.os.scandir() to leverage async I/O and cached stat
+        information from directory entries. This reduces network I/O by 50% on network
+        filesystems like TigrisFS by avoiding redundant stat() calls.
+
+        Args:
+            directory: Directory to scan
+
+        Yields:
+            Tuples of (absolute_file_path, stat_info) for each file
+        """
+        try:
+            entries = await aiofiles.os.scandir(directory)
+        except PermissionError:
+            logger.warning(f"Permission denied scanning directory: {directory}")
+            return
+
+        results = []
+        subdirs = []
+
+        for entry in entries:
+            entry_path = Path(entry.path)
+
+            # Check ignore patterns
+            if should_ignore_path(entry_path, directory, self._ignore_patterns):
+                logger.trace(f"Ignoring path per .bmignore: {entry_path.relative_to(directory)}")
+                continue
+
+            if entry.is_dir(follow_symlinks=False):
+                # Collect subdirectories to recurse into
+                subdirs.append(entry_path)
+            elif entry.is_file(follow_symlinks=False):
+                # Get cached stat info (no extra syscall!)
+                stat_info = entry.stat(follow_symlinks=False)
+                results.append((entry.path, stat_info))
+
+        # Yield files from current directory
+        for file_path, stat_info in results:
+            yield (file_path, stat_info)
+
+        # Recurse into subdirectories
+        for subdir in subdirs:
+            async for result in self.scan_directory(subdir):
+                yield result
+
+
+async def get_sync_service(project: Project) -> SyncService:  # pragma: no cover
+    """Get sync service instance with all dependencies."""
+
+    app_config = ConfigManager().config
+    _, session_maker = await db.get_or_create_db(
+        db_path=app_config.database_path, db_type=db.DatabaseType.FILESYSTEM
+    )
+
+    project_path = Path(project.path)
+    entity_parser = EntityParser(project_path)
+    markdown_processor = MarkdownProcessor(entity_parser)
+    file_service = FileService(project_path, markdown_processor)
+
+    # Initialize repositories
+    entity_repository = EntityRepository(session_maker, project_id=project.id)
+    observation_repository = ObservationRepository(session_maker, project_id=project.id)
+    relation_repository = RelationRepository(session_maker, project_id=project.id)
+    search_repository = SearchRepository(session_maker, project_id=project.id)
+    project_repository = ProjectRepository(session_maker)
+
+    # Initialize services
+    search_service = SearchService(search_repository, entity_repository, file_service)
+    link_resolver = LinkResolver(entity_repository, search_service)
+
+    # Initialize services
+    entity_service = EntityService(
+        entity_parser,
+        entity_repository,
+        observation_repository,
+        relation_repository,
+        file_service,
+        link_resolver,
+    )
+
+    # Create sync service
+    sync_service = SyncService(
+        app_config=app_config,
+        entity_service=entity_service,
+        entity_parser=entity_parser,
+        entity_repository=entity_repository,
+        relation_repository=relation_repository,
+        project_repository=project_repository,
+        search_service=search_service,
+        file_service=file_service,
+    )

-        return changes
+    return sync_service
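
For orientation only, a minimal sketch (not part of the published diff) of how the reworked SyncService in 0.16.1 might be driven end to end. It assumes get_sync_service is importable from basic_memory.sync.sync_service (the module shown above) and that a Project row has already been loaded elsewhere, for example via ProjectRepository; the run_sync wrapper and its print statement are hypothetical.

# Illustrative sketch only - assumes an asyncio context and a loaded Project row.
import asyncio
from pathlib import Path

from basic_memory.models import Project
from basic_memory.sync.sync_service import get_sync_service


async def run_sync(project: Project, force_full: bool = False) -> None:
    # get_sync_service() wires up repositories and services for the project (see diff above)
    sync_service = await get_sync_service(project)

    # sync() scans the project directory, applies moves/deletes/new/modified changes,
    # resolves forward references, and updates the scan watermark
    report = await sync_service.sync(Path(project.path), force_full=force_full)

    print(
        f"changes={report.total} new={len(report.new)} modified={len(report.modified)} "
        f"deleted={len(report.deleted)} moves={len(report.moves)} "
        f"skipped={len(report.skipped_files)}"
    )


# asyncio.run(run_sync(project))  # 'project' obtained elsewhere, e.g. via ProjectRepository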