okb 1.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- okb/__init__.py +3 -0
- okb/cli.py +1272 -0
- okb/config.py +661 -0
- okb/data/init.sql +92 -0
- okb/http_server.py +463 -0
- okb/ingest.py +1589 -0
- okb/llm/__init__.py +86 -0
- okb/llm/base.py +83 -0
- okb/llm/cache.py +217 -0
- okb/llm/filter.py +187 -0
- okb/llm/providers.py +322 -0
- okb/local_embedder.py +87 -0
- okb/mcp_server.py +1393 -0
- okb/migrate.py +53 -0
- okb/migrations/0001.initial-schema.sql +91 -0
- okb/migrations/0002.sync-state.sql +22 -0
- okb/migrations/0003.structured-fields.sql +22 -0
- okb/migrations/0004.tokens.sql +13 -0
- okb/migrations/0005.database-metadata.sql +19 -0
- okb/migrations/0006.llm-cache.sql +13 -0
- okb/modal_embedder.py +120 -0
- okb/modal_llm.py +178 -0
- okb/plugins/__init__.py +8 -0
- okb/plugins/base.py +110 -0
- okb/plugins/registry.py +123 -0
- okb/plugins/sources/__init__.py +5 -0
- okb/plugins/sources/dropbox_paper.py +188 -0
- okb/plugins/sources/github.py +484 -0
- okb/rescan.py +227 -0
- okb/scripts/__init__.py +1 -0
- okb/scripts/watch.py +206 -0
- okb/tokens.py +277 -0
- okb-1.0.0.dist-info/METADATA +397 -0
- okb-1.0.0.dist-info/RECORD +36 -0
- okb-1.0.0.dist-info/WHEEL +4 -0
- okb-1.0.0.dist-info/entry_points.txt +9 -0
okb/rescan.py
ADDED
@@ -0,0 +1,227 @@
"""
Rescan indexed documents for freshness and re-ingest changed files.

Checks stored file_modified_at metadata against actual file mtime,
and re-ingests documents that have changed or been deleted.
"""

from __future__ import annotations

import sys
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path

import psycopg
from psycopg.rows import dict_row


@dataclass
class RescanResult:
    """Result of a rescan operation."""

    updated: list[str] = field(default_factory=list)  # Re-ingested files
    deleted: list[str] = field(default_factory=list)  # Removed (--delete flag)
    missing: list[str] = field(default_factory=list)  # No longer exist (without --delete)
    unchanged: int = 0
    errors: list[tuple[str, str]] = field(default_factory=list)  # (path, error_message)


# Virtual path prefixes that should be skipped (handled by sync or manual re-fetch)
VIRTUAL_PREFIXES = ("claude://", "todoist://", "dropbox://", "http://", "https://")

# Source types that are file-based
FILE_SOURCE_TYPES = ("markdown", "code", "text", "org", "org-todo", "pdf", "docx")


def _escape_like(text: str) -> str:
    """Escape special LIKE pattern characters (%, _, \\) so they match literally."""
    return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


class Rescanner:
    """Checks indexed documents for freshness and re-ingests changed files."""

    def __init__(self, db_url: str, use_modal: bool = True):
        self.db_url = db_url
        self.use_modal = use_modal

    def get_indexed_files(self) -> list[dict]:
        """
        Query file-based documents with stored mtime.

        Returns list of dicts with:
        - base_path: The file path (without :: anchor)
        - source_type: Document source type
        - stored_mtime: ISO timestamp from metadata
        - derived_count: Number of derived documents (e.g., org-todo items)
        """
        with psycopg.connect(self.db_url, row_factory=dict_row) as conn:
            # Get unique file paths with their mtimes.
            # Use SPLIT_PART to get the base path (before ::) and
            # group by it to count derived documents.
            results = conn.execute(
                """
                SELECT
                    SPLIT_PART(source_path, '::', 1) as base_path,
                    MAX(source_type) as source_type,
                    MAX(metadata->>'file_modified_at') as stored_mtime,
                    COUNT(*) - 1 as derived_count
                FROM documents
                WHERE source_type = ANY(%s)
                GROUP BY SPLIT_PART(source_path, '::', 1)
                ORDER BY base_path
                """,
                (list(FILE_SOURCE_TYPES),),
            ).fetchall()

        # Filter out virtual paths
        return [
            dict(r)
            for r in results
            if not any(r["base_path"].startswith(p) for p in VIRTUAL_PREFIXES)
        ]

    def check_file_freshness(self, doc: dict) -> tuple[str, str | None]:
        """
        Check if a file is fresh, stale, or missing.

        Args:
            doc: Dict with base_path and stored_mtime

        Returns:
            Tuple of (status, current_mtime_iso) where status is:
            - 'fresh': File unchanged
            - 'stale': File modified since indexing
            - 'missing': File no longer exists
        """
        path = Path(doc["base_path"])

        if not path.exists():
            return ("missing", None)

        # Get current mtime
        current_mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=UTC)
        current_mtime_iso = current_mtime.isoformat()

        stored_mtime = doc.get("stored_mtime")
        if not stored_mtime:
            # No stored mtime - treat as stale
            return ("stale", current_mtime_iso)

        # Parse stored mtime for comparison.
        # Handle both timezone-aware and naive ISO formats.
        try:
            stored_dt = datetime.fromisoformat(stored_mtime.replace("Z", "+00:00"))
            if stored_dt.tzinfo is None:
                stored_dt = stored_dt.replace(tzinfo=UTC)
        except (ValueError, AttributeError):
            return ("stale", current_mtime_iso)

        # Compare timestamps (allow 1 second tolerance for filesystem precision)
        if abs((current_mtime - stored_dt).total_seconds()) <= 1:
            return ("fresh", current_mtime_iso)

        return ("stale", current_mtime_iso)

    def delete_document_and_derived(self, source_path: str) -> int:
        """
        Delete a document and any derived documents (e.g., org-todo items).

        Args:
            source_path: The base file path

        Returns:
            Number of documents deleted
        """
        with psycopg.connect(self.db_url) as conn:
            # Delete exact match and any derived docs (path::*).
            # Escape LIKE metacharacters (%, _) so they match literally.
            escaped_path = _escape_like(source_path)
            result = conn.execute(
                """
                DELETE FROM documents
                WHERE source_path = %s OR source_path LIKE %s ESCAPE '\\'
                RETURNING id
                """,
                (source_path, escaped_path + "::%"),
            ).fetchall()
            conn.commit()
            return len(result)

    def rescan(
        self,
        dry_run: bool = False,
        delete_missing: bool = False,
        verbose: bool = True,
    ) -> RescanResult:
        """
        Scan indexed files for freshness and re-ingest changed ones.

        Args:
            dry_run: If True, only report what would be done
            delete_missing: If True, remove documents for missing files
            verbose: If True, print progress to stderr

        Returns:
            RescanResult with lists of updated/deleted/missing files
        """
        from .ingest import Ingester, parse_document

        result = RescanResult()
        indexed_files = self.get_indexed_files()

        if verbose:
            print(f"Found {len(indexed_files)} indexed files", file=sys.stderr)

        ingester = None if dry_run else Ingester(self.db_url, use_modal=self.use_modal)

        for doc in indexed_files:
            base_path = doc["base_path"]
            derived_count = doc.get("derived_count", 0)
            status, current_mtime = self.check_file_freshness(doc)

            if status == "fresh":
                result.unchanged += 1
                continue

            elif status == "missing":
                derived_note = f" (+ {derived_count} derived)" if derived_count else ""
                if delete_missing:
                    if verbose:
                        print(f"  [DELETE] {base_path}{derived_note}", file=sys.stderr)
                    if not dry_run:
                        self.delete_document_and_derived(base_path)
                    result.deleted.append(base_path)
                else:
                    if verbose:
                        print(f"  [MISSING] {base_path}{derived_note}", file=sys.stderr)
                    result.missing.append(base_path)

            elif status == "stale":
                derived_note = f" (+ {derived_count} derived)" if derived_count else ""
                if verbose:
                    print(f"  [STALE] {base_path}{derived_note}", file=sys.stderr)

                if not dry_run:
                    try:
                        # Delete old document(s) first
                        self.delete_document_and_derived(base_path)

                        # Re-ingest
                        path = Path(base_path)
                        documents = parse_document(path)
                        if documents:
                            # Set file_modified_at on all documents
                            for d in documents:
                                d.metadata.extra["file_modified_at"] = current_mtime
                            ingester.ingest_documents(documents)
                        result.updated.append(base_path)
                    except Exception as e:
                        result.errors.append((base_path, str(e)))
                        if verbose:
                            print(f"    ERROR: {e}", file=sys.stderr)
                else:
                    result.updated.append(base_path)

        return result
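A minimal sketch of how Rescanner might be driven from Python, assuming a local Postgres DSN (the connection string below is a placeholder, not part of the package): a dry run first reports what would change, then a second call applies updates and prunes rows for vanished files.

# Sketch only: drive a rescan programmatically. The DSN is illustrative.
from okb.rescan import Rescanner

scanner = Rescanner("postgresql://localhost:5432/okb", use_modal=False)

# Pass 1: dry run -- nothing is deleted or re-ingested, only counted.
report = scanner.rescan(dry_run=True, verbose=False)
print(f"stale: {len(report.updated)}, missing: {len(report.missing)}, "
      f"fresh: {report.unchanged}, errors: {len(report.errors)}")

# Pass 2: apply changes, removing documents whose files no longer exist.
if report.updated or report.missing:
    scanner.rescan(delete_missing=True)

Note that in a dry run, stale paths are reported in `updated` and missing paths in `missing`; `deleted` is only populated when `delete_missing=True` actually removes rows.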
okb/scripts/__init__.py
ADDED
@@ -0,0 +1 @@
"""Utility scripts for knowledge base management."""
okb/scripts/watch.py
ADDED
@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""
Watch directories and auto-ingest changed files.

Usage:
    python -m okb.scripts.watch ~/notes ~/docs
    python -m okb.scripts.watch ~/notes --local  # Use CPU embedding
"""

from __future__ import annotations

import argparse
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler, FileSystemEvent

from ..config import config
from ..ingest import Ingester, parse_document, content_hash, check_file_skip, read_text_with_fallback


class KnowledgeHandler(FileSystemEventHandler):
    """Handle file system events for knowledge base updates."""

    def __init__(self, ingester: Ingester, debounce_seconds: float = 2.0):
        self.ingester = ingester
        self.debounce_seconds = debounce_seconds
        self._pending: dict[str, float] = {}  # path -> last_event_time
        self._processed_hashes: dict[str, str] = {}  # path -> content_hash

    def _should_process(self, path: Path) -> bool:
        """Check if file should be processed."""
        if not path.is_file():
            return False

        if config.should_skip_path(path):
            return False

        if path.suffix not in config.all_extensions:
            return False

        # Check block/skip patterns (e.g., .env, *.min.js, temp files)
        skip_check = check_file_skip(path)
        if skip_check.should_skip:
            return False

        return True

    def _debounced_update(self, path: Path):
        """Update document with debouncing."""
        path_str = str(path)
        now = time.time()

        # Check debounce
        if path_str in self._pending:
            if now - self._pending[path_str] < self.debounce_seconds:
                self._pending[path_str] = now
                return

        self._pending[path_str] = now

        # Check if content actually changed
        try:
            content = read_text_with_fallback(path)
            new_hash = content_hash(content)

            if self._processed_hashes.get(path_str) == new_hash:
                return  # No actual change

            self._processed_hashes[path_str] = new_hash

            # Content-based checks (secrets, minified)
            if config.scan_content:
                skip_check = check_file_skip(path, content)
                if skip_check.should_skip:
                    prefix = "BLOCKED" if skip_check.is_security else "Skipping"
                    print(f"[watch] {prefix}: {path} ({skip_check.reason})")
                    return

        except Exception:
            return

        # Process the file
        print(f"[watch] Updating: {path}")

        try:
            # parse_document may return multiple documents (e.g., org-todo
            # items), as rescan.py relies on; stamp the file mtime on each
            # for staleness tracking before ingesting.
            documents = parse_document(path)
            mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc)
            for doc in documents:
                doc.metadata.extra["file_modified_at"] = mtime.isoformat()
            self.ingester.ingest_documents(documents)

        except Exception as e:
            print(f"[watch] Error processing {path}: {e}", file=sys.stderr)

    def on_modified(self, event: FileSystemEvent):
        """Handle file modification."""
        if event.is_directory:
            return

        path = Path(event.src_path)
        if self._should_process(path):
            self._debounced_update(path)

    def on_created(self, event: FileSystemEvent):
        """Handle new file creation."""
        if event.is_directory:
            return

        path = Path(event.src_path)
        if self._should_process(path):
            self._debounced_update(path)

    def on_deleted(self, event: FileSystemEvent):
        """Handle file deletion."""
        if event.is_directory:
            return

        path = Path(event.src_path)
        path_str = str(path.resolve())

        # Clean up tracking
        self._pending.pop(str(path), None)
        self._processed_hashes.pop(str(path), None)

        # Remove from database
        print(f"[watch] Removing: {path}")
        try:
            self.ingester.delete_document(path_str)
        except Exception as e:
            print(f"[watch] Error removing {path}: {e}", file=sys.stderr)

    def on_moved(self, event: FileSystemEvent):
        """Handle file rename/move."""
        # Treat as delete + create
        if hasattr(event, "src_path"):
            src = Path(event.src_path)
            self._pending.pop(str(src), None)
            self._processed_hashes.pop(str(src), None)

            print(f"[watch] Removing (moved): {src}")
            try:
                self.ingester.delete_document(str(src.resolve()))
            except Exception:
                pass

        if hasattr(event, "dest_path"):
            dest = Path(event.dest_path)
            if self._should_process(dest):
                self._debounced_update(dest)


def watch(directories: list[Path], db_url: str, use_modal: bool = True):
    """Watch directories for changes."""
    ingester = Ingester(db_url, use_modal=use_modal)
    handler = KnowledgeHandler(ingester)
    observer = Observer()

    for directory in directories:
        if not directory.exists():
            print(f"Warning: Directory does not exist: {directory}", file=sys.stderr)
            continue

        observer.schedule(handler, str(directory), recursive=True)
        print(f"[watch] Watching: {directory}")

    observer.start()
    print("[watch] Press Ctrl+C to stop")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\n[watch] Stopping...")
        observer.stop()

    observer.join()
    print("[watch] Done.")


def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(description="Watch directories and auto-ingest changes")
    parser.add_argument(
        "directories",
        nargs="*",
        type=Path,
        default=[Path.home() / "notes"],
        help="Directories to watch (default: ~/notes)",
    )
    parser.add_argument("--db-url", default=config.db_url, help="Database URL")
    parser.add_argument(
        "--local",
        action="store_true",
        help="Use local CPU embedding instead of Modal",
    )

    args = parser.parse_args()
    watch(args.directories, args.db_url, use_modal=not args.local)


if __name__ == "__main__":
    main()
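For completeness, a sketch of embedding the watcher in another process rather than using the CLI entry point. It assumes `config.db_url` resolves to a reachable database; the call blocks until interrupted with Ctrl+C.

# Sketch only: run the watcher programmatically instead of via the CLI.
from pathlib import Path

from okb.config import config
from okb.scripts.watch import watch

# Watch two trees with local CPU embeddings; blocks until Ctrl+C.
watch([Path.home() / "notes", Path.home() / "docs"], config.db_url, use_modal=False)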