okb-1.0.0-py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- okb/__init__.py +3 -0
- okb/cli.py +1272 -0
- okb/config.py +661 -0
- okb/data/init.sql +92 -0
- okb/http_server.py +463 -0
- okb/ingest.py +1589 -0
- okb/llm/__init__.py +86 -0
- okb/llm/base.py +83 -0
- okb/llm/cache.py +217 -0
- okb/llm/filter.py +187 -0
- okb/llm/providers.py +322 -0
- okb/local_embedder.py +87 -0
- okb/mcp_server.py +1393 -0
- okb/migrate.py +53 -0
- okb/migrations/0001.initial-schema.sql +91 -0
- okb/migrations/0002.sync-state.sql +22 -0
- okb/migrations/0003.structured-fields.sql +22 -0
- okb/migrations/0004.tokens.sql +13 -0
- okb/migrations/0005.database-metadata.sql +19 -0
- okb/migrations/0006.llm-cache.sql +13 -0
- okb/modal_embedder.py +120 -0
- okb/modal_llm.py +178 -0
- okb/plugins/__init__.py +8 -0
- okb/plugins/base.py +110 -0
- okb/plugins/registry.py +123 -0
- okb/plugins/sources/__init__.py +5 -0
- okb/plugins/sources/dropbox_paper.py +188 -0
- okb/plugins/sources/github.py +484 -0
- okb/rescan.py +227 -0
- okb/scripts/__init__.py +1 -0
- okb/scripts/watch.py +206 -0
- okb/tokens.py +277 -0
- okb-1.0.0.dist-info/METADATA +397 -0
- okb-1.0.0.dist-info/RECORD +36 -0
- okb-1.0.0.dist-info/WHEEL +4 -0
- okb-1.0.0.dist-info/entry_points.txt +9 -0
okb/plugins/registry.py
ADDED
@@ -0,0 +1,123 @@
"""Plugin discovery and registration via entry_points."""

from __future__ import annotations

from importlib.metadata import entry_points
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .base import APISource, FileParser


class PluginRegistry:
    """Registry for file parsers and API sources discovered via entry_points.

    Plugins are discovered from two entry_point groups:
    - okb.parsers: FileParser implementations
    - okb.sources: APISource implementations

    Example pyproject.toml for a plugin:
        [project.entry-points."okb.parsers"]
        epub = "okb_epub:EpubParser"

        [project.entry-points."okb.sources"]
        github = "okb_github:GitHubSource"
    """

    _parsers: dict[str, list[FileParser]] = {}  # ext -> list of parsers
    _sources: dict[str, APISource] = {}  # name -> source
    _loaded = False

    @classmethod
    def load_plugins(cls) -> None:
        """Load all plugins from entry_points. Called automatically on first use."""
        if cls._loaded:
            return

        # Load file parsers
        parser_eps = entry_points(group="okb.parsers")
        for ep in parser_eps:
            try:
                parser_cls = ep.load()
                parser = parser_cls()
                for ext in parser.extensions:
                    ext_lower = ext.lower()
                    if ext_lower not in cls._parsers:
                        cls._parsers[ext_lower] = []
                    cls._parsers[ext_lower].append(parser)
            except Exception as e:
                print(f"Warning: Failed to load parser plugin '{ep.name}': {e}")

        # Load API sources
        source_eps = entry_points(group="okb.sources")
        for ep in source_eps:
            try:
                source_cls = ep.load()
                source = source_cls()
                cls._sources[source.name] = source
            except Exception as e:
                print(f"Warning: Failed to load source plugin '{ep.name}': {e}")

        cls._loaded = True

    @classmethod
    def get_parser_for_file(cls, path: Path) -> FileParser | None:
        """Find a parser that can handle this file.

        First filters by extension, then calls can_parse() on each candidate.

        Args:
            path: Path to the file to parse

        Returns:
            FileParser instance that can handle the file, or None
        """
        cls.load_plugins()
        ext = path.suffix.lower()
        for parser in cls._parsers.get(ext, []):
            if parser.can_parse(path):
                return parser
        return None

    @classmethod
    def get_source(cls, name: str) -> APISource | None:
        """Get an API source by name.

        Args:
            name: Source name (e.g., 'github', 'todoist')

        Returns:
            APISource instance, or None if not found
        """
        cls.load_plugins()
        return cls._sources.get(name)

    @classmethod
    def list_sources(cls) -> list[str]:
        """List all available API source names.

        Returns:
            List of source names
        """
        cls.load_plugins()
        return list(cls._sources.keys())

    @classmethod
    def list_parsers(cls) -> dict[str, list[str]]:
        """List all registered parsers by extension.

        Returns:
            Dict mapping extension to list of parser source_type names
        """
        cls.load_plugins()
        return {
            ext: [p.source_type for p in parsers] for ext, parsers in cls._parsers.items()
        }

    @classmethod
    def reset(cls) -> None:
        """Reset the registry. Mainly useful for testing."""
        cls._parsers = {}
        cls._sources = {}
        cls._loaded = False
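To make the discovery flow concrete, here is a minimal sketch of what a third-party parser plugin could look like. The package name `okb_epub` and class `EpubParser` come from the docstring's pyproject.toml example; the attribute names (`extensions`, `source_type`, `can_parse`) are simply the ones `PluginRegistry` touches above. The full `FileParser` protocol is defined in `okb/plugins/base.py`, which is not shown in this diff, so treat this as an assumption rather than the canonical interface.

```python
# Hypothetical plugin package "okb-epub" (illustrative sketch, not part of this wheel).
# okb_epub/__init__.py
from pathlib import Path


class EpubParser:
    """Minimal parser exposing the attributes PluginRegistry uses:
    extensions, source_type, and can_parse()."""

    extensions = [".epub"]
    source_type = "epub"

    def can_parse(self, path: Path) -> bool:
        # Cheap check by suffix; a real parser might also sniff the ZIP container.
        return path.suffix.lower() == ".epub"
```

With the plugin installed and registered under the `[project.entry-points."okb.parsers"]` table shown in the docstring above, a call like `PluginRegistry.get_parser_for_file(Path("book.epub"))` would lazily load the entry points on first use and return an `EpubParser` instance.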
okb/plugins/sources/dropbox_paper.py
ADDED
@@ -0,0 +1,188 @@
"""Dropbox Paper API source for syncing Paper documents as markdown."""

from __future__ import annotations

import sys
from datetime import UTC, datetime
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from okb.ingest import Document
    from okb.plugins.base import SyncState


class DropboxPaperSource:
    """API source for Dropbox Paper documents.

    Syncs Paper documents as markdown for searchable knowledge base entries.

    Config example:
        plugins:
          sources:
            dropbox-paper:
              enabled: true
              token: ${DROPBOX_TOKEN}
              folders: [/]  # Optional: filter to specific folder paths

    Usage:
        lkb sync run dropbox-paper
        lkb sync run dropbox-paper --full  # Ignore incremental state
    """

    name = "dropbox-paper"
    source_type = "dropbox-paper"

    def __init__(self) -> None:
        self._client = None
        self._folders: list[str] | None = None
        self._doc_ids: list[str] | None = None

    def configure(self, config: dict) -> None:
        """Initialize Dropbox client with OAuth token.

        Args:
            config: Source configuration containing 'token' and optional 'folders' or 'doc_ids'
        """
        import dropbox

        token = config.get("token")
        if not token:
            raise ValueError("dropbox-paper source requires 'token' in config")

        self._client = dropbox.Dropbox(token)
        self._folders = config.get("folders")
        self._doc_ids = config.get("doc_ids")  # Specific doc IDs from CLI

    def fetch(self, state: SyncState | None = None) -> tuple[list[Document], SyncState]:
        """Fetch Paper documents from Dropbox.

        Uses the legacy Paper API to list and download documents as markdown.
        Supports incremental sync via cursor-based pagination.

        Args:
            state: Previous sync state for incremental updates, or None for full sync

        Returns:
            Tuple of (list of documents, new sync state)
        """
        from okb.plugins.base import SyncState as SyncStateClass

        if self._client is None:
            raise RuntimeError("Source not configured. Call configure() first.")

        documents: list[Document] = []
        cursor = state.cursor if state else None

        print("Fetching Dropbox Paper documents...", file=sys.stderr)

        # Use specific doc IDs from CLI, or list all Paper docs
        if self._doc_ids:
            doc_ids = self._doc_ids
            print(f"Syncing {len(doc_ids)} specific document(s)", file=sys.stderr)
        else:
            doc_ids = self._list_paper_docs(cursor)
            print(f"Found {len(doc_ids)} Paper documents", file=sys.stderr)

        for doc_id in doc_ids:
            try:
                doc = self._fetch_paper_doc(doc_id)
                if doc:
                    # Apply folder filter if configured
                    if self._folders:
                        folder_path = doc.metadata.extra.get("folder_path", "/")
                        if not any(folder_path.startswith(f) for f in self._folders):
                            continue
                    documents.append(doc)
                    print(f" Synced: {doc.title}", file=sys.stderr)
            except Exception as e:
                print(f" Error fetching doc {doc_id}: {e}", file=sys.stderr)

        # Build new sync state
        new_state = SyncStateClass(
            last_sync=datetime.now(UTC),
            cursor=cursor,  # Paper API doesn't provide incremental cursors
        )

        return documents, new_state

    def _list_paper_docs(self, cursor: str | None = None) -> list[str]:
        """List all Paper document IDs.

        Args:
            cursor: Pagination cursor (not used by Paper API list)

        Returns:
            List of Paper document IDs
        """

        doc_ids = []

        # Initial request
        result = self._client.paper_docs_list()
        doc_ids.extend(result.doc_ids)

        # Paginate through all results
        while result.has_more:
            result = self._client.paper_docs_list_continue(result.cursor.value)
            doc_ids.extend(result.doc_ids)

        return doc_ids

    def _fetch_paper_doc(self, doc_id: str) -> Document | None:
        """Fetch a single Paper document and convert to Document.

        Args:
            doc_id: Dropbox Paper document ID

        Returns:
            Document instance or None if fetch failed
        """
        from dropbox.paper import ExportFormat

        from okb.ingest import Document, DocumentMetadata

        # Get document metadata
        try:
            folder_result = self._client.paper_docs_get_folder_info(doc_id)
            folder_path = folder_result.folder_sharing_policy_type.name if folder_result else "/"
            # Try to get actual folder path from folders list
            if folder_result and hasattr(folder_result, "folders") and folder_result.folders:
                folder_path = "/" + "/".join(f.name for f in folder_result.folders)
            else:
                folder_path = "/"
        except Exception:
            folder_path = "/"

        # Download as markdown
        result, response = self._client.paper_docs_download(
            doc_id, ExportFormat.markdown
        )

        content = response.content.decode("utf-8")
        if not content.strip():
            return None

        # Extract title from first heading or filename
        title = result.title or f"Paper Doc {doc_id}"

        # Parse modification time
        doc_date = None
        if hasattr(result, "server_modified"):
            doc_date = result.server_modified.isoformat()

        metadata = DocumentMetadata(
            extra={
                "folder_path": folder_path,
                "doc_id": doc_id,
            }
        )
        if doc_date:
            metadata.extra["document_date"] = doc_date

        return Document(
            source_path=f"dropbox://paper/{doc_id}",
            source_type=self.source_type,
            title=title,
            content=content,
            metadata=metadata,
        )
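For context on how a source like this is exercised end to end, here is a minimal wiring sketch. It assumes the wheel registers `DropboxPaperSource` under the `okb.sources` entry-point group (the contents of `entry_points.txt` are not shown in this diff), and the token placeholder and `/Notes` folder filter are purely illustrative. In practice this orchestration is handled by okb's sync CLI (`lkb sync run dropbox-paper`), not by user code.

```python
# Assumed wiring; the real orchestration lives in okb's sync command.
from okb.plugins.registry import PluginRegistry

source = PluginRegistry.get_source("dropbox-paper")  # discovered via the okb.sources group
if source is not None:
    # The token would normally come from config resolution of ${DROPBOX_TOKEN}.
    source.configure({"token": "<dropbox-oauth-token>", "folders": ["/Notes"]})
    documents, new_state = source.fetch(state=None)  # None = full sync; pass a prior SyncState for incremental
    print(f"Fetched {len(documents)} Paper document(s)")
    # The caller is expected to persist new_state for the next incremental run.
```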