okb-1.0.0-py3-none-any.whl

okb/rescan.py ADDED
@@ -0,0 +1,227 @@
+ """
+ Rescan indexed documents for freshness and re-ingest changed files.
+
+ Checks stored file_modified_at metadata against the actual file mtime,
+ re-ingesting changed files and reporting (or, with --delete, removing) missing ones.
+ """
+
+ from __future__ import annotations
+
+ import sys
+ from dataclasses import dataclass, field
+ from datetime import UTC, datetime
+ from pathlib import Path
+
+ import psycopg
+ from psycopg.rows import dict_row
+
+
+ @dataclass
+ class RescanResult:
+     """Result of a rescan operation."""
+
+     updated: list[str] = field(default_factory=list)  # Re-ingested files
+     deleted: list[str] = field(default_factory=list)  # Removed (--delete flag)
+     missing: list[str] = field(default_factory=list)  # No longer exist (without --delete)
+     unchanged: int = 0
+     errors: list[tuple[str, str]] = field(default_factory=list)  # (path, error_message)
+
+
+ # Virtual path prefixes that should be skipped (handled by sync or manual re-fetch)
+ VIRTUAL_PREFIXES = ("claude://", "todoist://", "dropbox://", "http://", "https://")
+
+ # Source types that are file-based
+ FILE_SOURCE_TYPES = ("markdown", "code", "text", "org", "org-todo", "pdf", "docx")
+
+
+ def _escape_like(text: str) -> str:
+     """Escape special LIKE pattern characters (%, _, \\) so they match literally."""
+     return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
+
+
+ class Rescanner:
+     """Checks indexed documents for freshness and re-ingests changed files."""
+
+     def __init__(self, db_url: str, use_modal: bool = True):
+         self.db_url = db_url
+         self.use_modal = use_modal
+
+     def get_indexed_files(self) -> list[dict]:
+         """
+         Query file-based documents with stored mtime.
+
+         Returns list of dicts with:
+         - base_path: The file path (without :: anchor)
+         - source_type: Document source type
+         - stored_mtime: ISO timestamp from metadata
+         - derived_count: Number of derived documents (e.g., org-todo items)
+         """
+         with psycopg.connect(self.db_url, row_factory=dict_row) as conn:
+             # Get unique file paths with their mtimes
+             # Use SPLIT_PART to get base path (before ::)
+             # Group by base path to count derived documents
+             results = conn.execute(
+                 """
+                 SELECT
+                     SPLIT_PART(source_path, '::', 1) as base_path,
+                     MAX(source_type) as source_type,
+                     MAX(metadata->>'file_modified_at') as stored_mtime,
+                     COUNT(*) - 1 as derived_count
+                 FROM documents
+                 WHERE source_type = ANY(%s)
+                 GROUP BY SPLIT_PART(source_path, '::', 1)
+                 ORDER BY base_path
+                 """,
+                 (list(FILE_SOURCE_TYPES),),
+             ).fetchall()
+
+             # Filter out virtual paths
+             return [
+                 dict(r)
+                 for r in results
+                 if not any(r["base_path"].startswith(p) for p in VIRTUAL_PREFIXES)
+             ]
+
+     def check_file_freshness(self, doc: dict) -> tuple[str, str | None]:
+         """
+         Check if a file is fresh, stale, or missing.
+
+         Args:
+             doc: Dict with base_path and stored_mtime
+
+         Returns:
+             Tuple of (status, current_mtime_iso) where status is:
+             - 'fresh': File unchanged
+             - 'stale': File modified since indexing
+             - 'missing': File no longer exists
+         """
+         path = Path(doc["base_path"])
+
+         if not path.exists():
+             return ("missing", None)
+
+         # Get current mtime
+         current_mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=UTC)
+         current_mtime_iso = current_mtime.isoformat()
+
+         stored_mtime = doc.get("stored_mtime")
+         if not stored_mtime:
+             # No stored mtime - treat as stale
+             return ("stale", current_mtime_iso)
+
+         # Parse stored mtime for comparison
+         # Handle both timezone-aware and naive ISO formats
+         try:
+             stored_dt = datetime.fromisoformat(stored_mtime.replace("Z", "+00:00"))
+             if stored_dt.tzinfo is None:
+                 stored_dt = stored_dt.replace(tzinfo=UTC)
+         except (ValueError, AttributeError):
+             return ("stale", current_mtime_iso)
+
+         # Compare timestamps (allow 1 second tolerance for filesystem precision)
+         if abs((current_mtime - stored_dt).total_seconds()) <= 1:
+             return ("fresh", current_mtime_iso)
+
+         return ("stale", current_mtime_iso)
+
+     def delete_document_and_derived(self, source_path: str) -> int:
+         """
+         Delete a document and any derived documents (e.g., org-todo items).
+
+         Args:
+             source_path: The base file path
+
+         Returns:
+             Number of documents deleted
+         """
+         with psycopg.connect(self.db_url) as conn:
+             # Delete exact match and any derived docs (path::*)
+             # Escape LIKE metacharacters (%, _) so they match literally
+             escaped_path = _escape_like(source_path)
+             result = conn.execute(
+                 """
+                 DELETE FROM documents
+                 WHERE source_path = %s OR source_path LIKE %s ESCAPE '\\'
+                 RETURNING id
+                 """,
+                 (source_path, escaped_path + "::%"),
+             ).fetchall()
+             conn.commit()
+             return len(result)
+
+     def rescan(
+         self,
+         dry_run: bool = False,
+         delete_missing: bool = False,
+         verbose: bool = True,
+     ) -> RescanResult:
+         """
+         Scan indexed files for freshness and re-ingest changed ones.
+
+         Args:
+             dry_run: If True, only report what would be done
+             delete_missing: If True, remove documents for missing files
+             verbose: If True, print progress to stderr
+
+         Returns:
+             RescanResult with lists of updated/deleted/missing files
+         """
+         from .ingest import Ingester, parse_document
+
+         result = RescanResult()
+         indexed_files = self.get_indexed_files()
+
+         if verbose:
+             print(f"Found {len(indexed_files)} indexed files", file=sys.stderr)
+
+         ingester = None if dry_run else Ingester(self.db_url, use_modal=self.use_modal)
+
+         for doc in indexed_files:
+             base_path = doc["base_path"]
+             derived_count = doc.get("derived_count", 0)
+             status, current_mtime = self.check_file_freshness(doc)
+
+             if status == "fresh":
+                 result.unchanged += 1
+                 continue
+
+             elif status == "missing":
+                 derived_note = f" (+ {derived_count} derived)" if derived_count else ""
+                 if delete_missing:
+                     if verbose:
+                         print(f" [DELETE] {base_path}{derived_note}", file=sys.stderr)
+                     if not dry_run:
+                         self.delete_document_and_derived(base_path)
+                     result.deleted.append(base_path)
+                 else:
+                     if verbose:
+                         print(f" [MISSING] {base_path}{derived_note}", file=sys.stderr)
+                     result.missing.append(base_path)
+
+             elif status == "stale":
+                 derived_note = f" (+ {derived_count} derived)" if derived_count else ""
+                 if verbose:
+                     print(f" [STALE] {base_path}{derived_note}", file=sys.stderr)
+
+                 if not dry_run:
+                     try:
+                         # Delete old document(s) first
+                         self.delete_document_and_derived(base_path)
+
+                         # Re-ingest
+                         path = Path(base_path)
+                         documents = parse_document(path)
+                         if documents:
+                             # Set file_modified_at on all documents
+                             for d in documents:
+                                 d.metadata.extra["file_modified_at"] = current_mtime
+                             ingester.ingest_documents(documents)
+                         result.updated.append(base_path)
+                     except Exception as e:
+                         result.errors.append((base_path, str(e)))
+                         if verbose:
+                             print(f" ERROR: {e}", file=sys.stderr)
+                 else:
+                     result.updated.append(base_path)
+
+         return result
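
For orientation, here is a minimal driver for the Rescanner API above. This sketch is not part of the package diff; it assumes okb.config exposes the same config object (with a db_url attribute) that okb/scripts/watch.py below imports.

    from okb.config import config
    from okb.rescan import Rescanner

    scanner = Rescanner(config.db_url, use_modal=False)

    # Dry run: classify every indexed file but change nothing.
    report = scanner.rescan(dry_run=True, verbose=False)
    print(f"{len(report.updated)} stale, {len(report.missing)} missing, "
          f"{report.unchanged} unchanged")

    # Real pass: re-ingest stale files, drop rows whose files are gone.
    result = scanner.rescan(delete_missing=True)
    for path, err in result.errors:
        print(f"failed: {path}: {err}")

A dry run records would-be updates in result.updated without constructing an Ingester or touching the database, so it is a safe first step.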
okb/scripts/__init__.py ADDED
@@ -0,0 +1 @@
+ """Utility scripts for knowledge base management."""
okb/scripts/watch.py ADDED
@@ -0,0 +1,206 @@
+ #!/usr/bin/env python3
+ """
+ Watch directories and auto-ingest changed files.
+
+ Usage (run as a module, since this file uses relative imports):
+     python -m okb.scripts.watch ~/notes ~/docs
+     python -m okb.scripts.watch ~/notes --local  # Use CPU embedding
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import sys
+ import time
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ from watchdog.observers import Observer
+ from watchdog.events import FileSystemEventHandler, FileSystemEvent
+
+ from ..config import config
+ from ..ingest import Ingester, parse_document, content_hash, check_file_skip, read_text_with_fallback
+
+
+ class KnowledgeHandler(FileSystemEventHandler):
+     """Handle file system events for knowledge base updates."""
+
+     def __init__(self, ingester: Ingester, debounce_seconds: float = 2.0):
+         self.ingester = ingester
+         self.debounce_seconds = debounce_seconds
+         self._pending: dict[str, float] = {}  # path -> last_event_time
+         self._processed_hashes: dict[str, str] = {}  # path -> content_hash
+
+     def _should_process(self, path: Path) -> bool:
+         """Check if file should be processed."""
+         if not path.is_file():
+             return False
+
+         if config.should_skip_path(path):
+             return False
+
+         if path.suffix not in config.all_extensions:
+             return False
+
+         # Check block/skip patterns (e.g., .env, *.min.js, temp files)
+         skip_check = check_file_skip(path)
+         if skip_check.should_skip:
+             return False
+
+         return True
+
+     def _debounced_update(self, path: Path):
+         """Update document with debouncing."""
+         path_str = str(path)
+         now = time.time()
+
+         # Leading-edge debounce: the first event is processed immediately;
+         # events arriving within debounce_seconds of the previous one are
+         # dropped, and the window slides forward on each one.
+         if path_str in self._pending:
+             if now - self._pending[path_str] < self.debounce_seconds:
+                 self._pending[path_str] = now
+                 return
+
+         self._pending[path_str] = now
+
+         # Check if content actually changed
+         try:
+             content = read_text_with_fallback(path)
+             new_hash = content_hash(content)
+
+             if self._processed_hashes.get(path_str) == new_hash:
+                 return  # No actual change
+
+             self._processed_hashes[path_str] = new_hash
+
+             # Content-based checks (secrets, minified)
+             if config.scan_content:
+                 skip_check = check_file_skip(path, content)
+                 if skip_check.should_skip:
+                     prefix = "BLOCKED" if skip_check.is_security else "Skipping"
+                     print(f"[watch] {prefix}: {path} ({skip_check.reason})")
+                     return
+
+         except Exception:
+             return
+
+         # Process the file
+         print(f"[watch] Updating: {path}")
+
+         try:
+             documents = parse_document(path)
+             # Capture file mtime for staleness tracking. parse_document
+             # returns a list (one file can yield derived documents, as in
+             # rescan.py), so stamp every document before ingesting.
+             mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc)
+             for doc in documents:
+                 doc.metadata.extra["file_modified_at"] = mtime.isoformat()
+             self.ingester.ingest_documents(documents)
+
+         except Exception as e:
+             print(f"[watch] Error processing {path}: {e}", file=sys.stderr)
+
+     def on_modified(self, event: FileSystemEvent):
+         """Handle file modification."""
+         if event.is_directory:
+             return
+
+         path = Path(event.src_path)
+         if self._should_process(path):
+             self._debounced_update(path)
+
+     def on_created(self, event: FileSystemEvent):
+         """Handle new file creation."""
+         if event.is_directory:
+             return
+
+         path = Path(event.src_path)
+         if self._should_process(path):
+             self._debounced_update(path)
+
+     def on_deleted(self, event: FileSystemEvent):
+         """Handle file deletion."""
+         if event.is_directory:
+             return
+
+         path = Path(event.src_path)
+         path_str = str(path.resolve())
+
+         # Clean up tracking
+         self._pending.pop(str(path), None)
+         self._processed_hashes.pop(str(path), None)
+
+         # Remove from database
+         print(f"[watch] Removing: {path}")
+         try:
+             self.ingester.delete_document(path_str)
+         except Exception as e:
+             print(f"[watch] Error removing {path}: {e}", file=sys.stderr)
+
+     def on_moved(self, event: FileSystemEvent):
+         """Handle file rename/move."""
+         if event.is_directory:
+             return
+
+         # Treat as delete + create; move events carry both src_path and dest_path
+         src = Path(event.src_path)
+         self._pending.pop(str(src), None)
+         self._processed_hashes.pop(str(src), None)
+
+         print(f"[watch] Removing (moved): {src}")
+         try:
+             self.ingester.delete_document(str(src.resolve()))
+         except Exception:
+             pass
+
+         dest = Path(event.dest_path)
+         if self._should_process(dest):
+             self._debounced_update(dest)
+
+
+ def watch(directories: list[Path], db_url: str, use_modal: bool = True):
+     """Watch directories for changes."""
+     ingester = Ingester(db_url, use_modal=use_modal)
+     handler = KnowledgeHandler(ingester)
+     observer = Observer()
+
+     for directory in directories:
+         if not directory.exists():
+             print(f"Warning: Directory does not exist: {directory}", file=sys.stderr)
+             continue
+
+         observer.schedule(handler, str(directory), recursive=True)
+         print(f"[watch] Watching: {directory}")
+
+     observer.start()
+     print("[watch] Press Ctrl+C to stop")
+
+     try:
+         while True:
+             time.sleep(1)
+     except KeyboardInterrupt:
+         print("\n[watch] Stopping...")
+         observer.stop()
+
+     observer.join()
+     print("[watch] Done.")
+
+
+ def main():
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(description="Watch directories and auto-ingest changes")
+     parser.add_argument(
+         "directories",
+         nargs="*",
+         type=Path,
+         default=[Path.home() / "notes"],
+         help="Directories to watch (default: ~/notes)",
+     )
+     parser.add_argument("--db-url", default=config.db_url, help="Database URL")
+     parser.add_argument(
+         "--local",
+         action="store_true",
+         help="Use local CPU embedding instead of Modal",
+     )
+
+     args = parser.parse_args()
+     watch(args.directories, args.db_url, use_modal=not args.local)
+
+
+ if __name__ == "__main__":
+     main()
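
The watcher can also be driven programmatically; the CLI equivalent is python -m okb.scripts.watch ~/notes ~/docs --local. A minimal sketch, again assuming okb.config provides the config object with db_url:

    from pathlib import Path

    from okb.config import config
    from okb.scripts.watch import watch

    # Blocks until Ctrl+C; each change is debounced, content-hashed,
    # and re-ingested, so repeated editor writes are coalesced.
    watch(
        [Path.home() / "notes", Path.home() / "docs"],
        config.db_url,
        use_modal=False,  # local CPU embedding, same as the --local flag
    )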