okb-1.0.0-py3-none-any.whl
- okb/__init__.py +3 -0
- okb/cli.py +1272 -0
- okb/config.py +661 -0
- okb/data/init.sql +92 -0
- okb/http_server.py +463 -0
- okb/ingest.py +1589 -0
- okb/llm/__init__.py +86 -0
- okb/llm/base.py +83 -0
- okb/llm/cache.py +217 -0
- okb/llm/filter.py +187 -0
- okb/llm/providers.py +322 -0
- okb/local_embedder.py +87 -0
- okb/mcp_server.py +1393 -0
- okb/migrate.py +53 -0
- okb/migrations/0001.initial-schema.sql +91 -0
- okb/migrations/0002.sync-state.sql +22 -0
- okb/migrations/0003.structured-fields.sql +22 -0
- okb/migrations/0004.tokens.sql +13 -0
- okb/migrations/0005.database-metadata.sql +19 -0
- okb/migrations/0006.llm-cache.sql +13 -0
- okb/modal_embedder.py +120 -0
- okb/modal_llm.py +178 -0
- okb/plugins/__init__.py +8 -0
- okb/plugins/base.py +110 -0
- okb/plugins/registry.py +123 -0
- okb/plugins/sources/__init__.py +5 -0
- okb/plugins/sources/dropbox_paper.py +188 -0
- okb/plugins/sources/github.py +484 -0
- okb/rescan.py +227 -0
- okb/scripts/__init__.py +1 -0
- okb/scripts/watch.py +206 -0
- okb/tokens.py +277 -0
- okb-1.0.0.dist-info/METADATA +397 -0
- okb-1.0.0.dist-info/RECORD +36 -0
- okb-1.0.0.dist-info/WHEEL +4 -0
- okb-1.0.0.dist-info/entry_points.txt +9 -0
okb/migrate.py
ADDED
@@ -0,0 +1,53 @@
"""Migration runner for okb database schema."""

from __future__ import annotations

from pathlib import Path

from yoyo import get_backend, read_migrations


def get_migrations_path() -> str:
    """Get path to migrations directory."""
    return str(Path(__file__).parent / "migrations")


def _convert_db_url(db_url: str) -> str:
    """Convert psycopg3 URL to yoyo-compatible format.

    yoyo uses psycopg2 by default. We convert:
        postgresql://... -> postgresql+psycopg://...
    to use psycopg v3.
    """
    if db_url.startswith("postgresql://") and "+psycopg" not in db_url:
        return db_url.replace("postgresql://", "postgresql+psycopg://", 1)
    return db_url


def run_migrations(db_url: str) -> list[str]:
    """Apply pending migrations, return list of applied migration IDs."""
    backend = get_backend(_convert_db_url(db_url))
    migrations = read_migrations(get_migrations_path())

    with backend.lock():
        to_apply = backend.to_apply(migrations)
        if to_apply:
            backend.apply_migrations(to_apply)

    return [m.id for m in to_apply]


def get_pending(db_url: str) -> list[str]:
    """Get list of pending migration IDs."""
    backend = get_backend(_convert_db_url(db_url))
    migrations = read_migrations(get_migrations_path())
    return [m.id for m in backend.to_apply(migrations)]


def get_applied(db_url: str) -> list[str]:
    """Get list of applied migration IDs."""
    backend = get_backend(_convert_db_url(db_url))
    migrations = read_migrations(get_migrations_path())
    to_apply = backend.to_apply(migrations)
    to_apply_ids = {m.id for m in to_apply}
    return [m.id for m in migrations if m.id not in to_apply_ids]
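
A caller-side sketch of these helpers (the connection URL is a placeholder, not a real credential):

    from okb.migrate import get_applied, get_pending, run_migrations

    db_url = "postgresql://user:pass@localhost:5432/okb"  # placeholder

    print("applied:", get_applied(db_url))
    print("pending:", get_pending(db_url))

    newly_applied = run_migrations(db_url)  # no-op when nothing is pending
    print("just applied:", newly_applied)
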
okb/migrations/0001.initial-schema.sql
ADDED
@@ -0,0 +1,91 @@
-- Initial schema - documents, chunks, indexes
-- depends:

CREATE EXTENSION IF NOT EXISTS vector;

-- Main documents table
CREATE TABLE IF NOT EXISTS documents (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_path TEXT NOT NULL,
    source_type TEXT NOT NULL,
    title TEXT,
    content TEXT NOT NULL,
    metadata JSONB DEFAULT '{}',
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),
    content_hash TEXT NOT NULL,
    UNIQUE(content_hash)
);

-- Chunks for semantic search
CREATE TABLE IF NOT EXISTS chunks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
    chunk_index INTEGER NOT NULL,
    content TEXT NOT NULL,
    embedding_text TEXT NOT NULL,
    embedding vector(768),
    token_count INTEGER,
    metadata JSONB DEFAULT '{}',
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- HNSW index for similarity search
CREATE INDEX IF NOT EXISTS chunks_embedding_idx ON chunks
    USING hnsw (embedding vector_cosine_ops)
    WITH (m = 16, ef_construction = 64);

-- Full-text search indexes
CREATE INDEX IF NOT EXISTS documents_content_fts ON documents
    USING gin(to_tsvector('english', content));

CREATE INDEX IF NOT EXISTS chunks_content_fts ON chunks
    USING gin(to_tsvector('english', content));

-- Lookup indexes
CREATE INDEX IF NOT EXISTS documents_source_path_idx ON documents(source_path);
CREATE INDEX IF NOT EXISTS documents_source_type_idx ON documents(source_type);
CREATE INDEX IF NOT EXISTS documents_metadata_idx ON documents USING gin(metadata);

-- Function to update timestamp
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Trigger (drop first to make idempotent)
DROP TRIGGER IF EXISTS documents_updated_at ON documents;
CREATE TRIGGER documents_updated_at
    BEFORE UPDATE ON documents
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

-- Helper view for search results
CREATE OR REPLACE VIEW search_results AS
SELECT
    c.id as chunk_id,
    c.content,
    c.chunk_index,
    c.token_count,
    c.embedding,
    d.id as document_id,
    d.source_path,
    d.source_type,
    d.title,
    d.metadata
FROM chunks c
JOIN documents d ON c.document_id = d.id;

-- Stats view
CREATE OR REPLACE VIEW index_stats AS
SELECT
    source_type,
    COUNT(DISTINCT d.id) as document_count,
    COUNT(c.id) as chunk_count,
    SUM(c.token_count) as total_tokens
FROM documents d
LEFT JOIN chunks c ON c.document_id = d.id
GROUP BY source_type;
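
The HNSW index and search_results view above exist to serve nearest-neighbour lookups. A hedged client-side sketch, assuming a psycopg (v3) connection and a 768-dim query embedding produced elsewhere; pgvector accepts a bracketed text literal cast to vector, and <=> is its cosine-distance operator:

    import psycopg

    def top_chunks(conn: psycopg.Connection, query_embedding: list[float], k: int = 10):
        # Serialize the embedding as a '[x1,x2,...]' literal and cast to ::vector
        vec = "[" + ",".join(str(x) for x in query_embedding) + "]"
        return conn.execute(
            """
            SELECT chunk_id, source_path, title, content,
                   embedding <=> %s::vector AS cosine_distance
            FROM search_results
            ORDER BY embedding <=> %s::vector
            LIMIT %s
            """,
            (vec, vec, k),
        ).fetchall()
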
okb/migrations/0002.sync-state.sql
ADDED
@@ -0,0 +1,22 @@
-- Sync state for API sources (plugin system)
-- depends: 0001.initial-schema

CREATE TABLE IF NOT EXISTS sync_state (
    source_name TEXT NOT NULL,
    database_name TEXT NOT NULL DEFAULT 'default',
    last_sync TIMESTAMPTZ,
    cursor TEXT,
    extra JSONB DEFAULT '{}',
    updated_at TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (source_name, database_name)
);

-- Index for listing sync states
CREATE INDEX IF NOT EXISTS sync_state_updated_idx ON sync_state(updated_at DESC);

-- Trigger for updated_at
DROP TRIGGER IF EXISTS sync_state_updated_at ON sync_state;
CREATE TRIGGER sync_state_updated_at
    BEFORE UPDATE ON sync_state
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();
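
The composite primary key suggests an upsert per (source, database). A sketch of that pattern, assuming psycopg (v3); the real okb sync runner may differ:

    import psycopg
    from psycopg.types.json import Jsonb

    def save_sync_state(
        conn: psycopg.Connection,
        source_name: str,
        cursor: str | None,
        database_name: str = "default",
        extra: dict | None = None,
    ) -> None:
        # One row per (source, database); the trigger above maintains updated_at.
        conn.execute(
            """
            INSERT INTO sync_state (source_name, database_name, last_sync, cursor, extra)
            VALUES (%s, %s, NOW(), %s, %s)
            ON CONFLICT (source_name, database_name) DO UPDATE
                SET last_sync = EXCLUDED.last_sync,
                    cursor = EXCLUDED.cursor,
                    extra = EXCLUDED.extra
            """,
            (source_name, database_name, cursor, Jsonb(extra or {})),
        )
        conn.commit()
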
okb/migrations/0003.structured-fields.sql
ADDED
@@ -0,0 +1,22 @@
-- Structured fields for actionable items (tasks, events, emails)
-- depends: 0002.sync-state

-- Add structured fields to documents table for temporal/status queries
ALTER TABLE documents ADD COLUMN IF NOT EXISTS due_date TIMESTAMPTZ;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS event_start TIMESTAMPTZ;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS event_end TIMESTAMPTZ;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS status TEXT;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS priority INTEGER;

-- Indexes for common query patterns
CREATE INDEX IF NOT EXISTS documents_due_date_idx ON documents(due_date) WHERE due_date IS NOT NULL;
CREATE INDEX IF NOT EXISTS documents_event_start_idx ON documents(event_start) WHERE event_start IS NOT NULL;
CREATE INDEX IF NOT EXISTS documents_status_idx ON documents(status) WHERE status IS NOT NULL;

-- Composite index for "incomplete tasks due soon" queries
CREATE INDEX IF NOT EXISTS documents_actionable_idx ON documents(due_date, status)
    WHERE due_date IS NOT NULL AND status IS NOT NULL;

-- Composite index for "events in date range" queries
CREATE INDEX IF NOT EXISTS documents_event_range_idx ON documents(event_start, event_end)
    WHERE event_start IS NOT NULL;
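
The query shape documents_actionable_idx is built for, sketched below; the 'done' status value is hypothetical, since the schema does not enumerate statuses:

    import psycopg

    def tasks_due_soon(conn: psycopg.Connection, days: int = 7) -> list[tuple]:
        # Repeats the partial-index predicate (both columns NOT NULL) so the
        # planner can use documents_actionable_idx.
        return conn.execute(
            """
            SELECT id, title, due_date, status, priority
            FROM documents
            WHERE due_date IS NOT NULL
              AND status IS NOT NULL
              AND status <> 'done'
              AND due_date <= NOW() + make_interval(days => %s)
            ORDER BY due_date, priority
            """,
            (days,),
        ).fetchall()
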
okb/migrations/0004.tokens.sql
ADDED
@@ -0,0 +1,13 @@
-- API tokens for HTTP authentication
-- depends: 0003.structured-fields

CREATE TABLE IF NOT EXISTS tokens (
    token_hash TEXT PRIMARY KEY,  -- SHA256(full_token), hex encoded
    permissions TEXT NOT NULL CHECK (permissions IN ('ro', 'rw')),
    description TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    last_used_at TIMESTAMPTZ
);

-- Index for listing tokens
CREATE INDEX IF NOT EXISTS tokens_created_at_idx ON tokens(created_at DESC);
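
Per the token_hash comment, only SHA-256 hex digests are stored. A sketch of the verification step this implies (the actual okb.tokens module may differ):

    import hashlib

    import psycopg

    def check_token(conn: psycopg.Connection, presented_token: str) -> str | None:
        """Return 'ro'/'rw' for a valid token, None otherwise; stamps last_used_at."""
        token_hash = hashlib.sha256(presented_token.encode()).hexdigest()
        row = conn.execute(
            "UPDATE tokens SET last_used_at = NOW() "
            "WHERE token_hash = %s RETURNING permissions",
            (token_hash,),
        ).fetchone()
        conn.commit()
        return row[0] if row else None
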
okb/migrations/0005.database-metadata.sql
ADDED
@@ -0,0 +1,19 @@
-- Database metadata table for storing LLM-enhanced descriptions
-- depends: 0004.tokens

CREATE TABLE IF NOT EXISTS database_metadata (
    id SERIAL PRIMARY KEY,
    key TEXT NOT NULL UNIQUE,  -- 'description', 'topics', etc.
    value JSONB NOT NULL,
    source TEXT NOT NULL DEFAULT 'llm',  -- 'config' or 'llm'
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Index for quick key lookups
CREATE INDEX IF NOT EXISTS idx_database_metadata_key ON database_metadata(key);

-- Insert default entries that can be updated by LLM
INSERT INTO database_metadata (key, value, source) VALUES
    ('llm_description', 'null'::jsonb, 'llm'),
    ('llm_topics', '[]'::jsonb, 'llm')
ON CONFLICT (key) DO NOTHING;
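
A sketch of the update path the seeded rows anticipate, upserting an LLM-generated description (illustrative only, not necessarily how okb writes it):

    import psycopg
    from psycopg.types.json import Jsonb

    def set_llm_description(conn: psycopg.Connection, description: str) -> None:
        conn.execute(
            """
            INSERT INTO database_metadata (key, value, source)
            VALUES ('llm_description', %s, 'llm')
            ON CONFLICT (key)
            DO UPDATE SET value = EXCLUDED.value, updated_at = NOW()
            """,
            (Jsonb(description),),
        )
        conn.commit()
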
okb/migrations/0006.llm-cache.sql
ADDED
@@ -0,0 +1,13 @@
-- LLM response cache for avoiding redundant API calls
-- depends: 0005.database-metadata

CREATE TABLE IF NOT EXISTS llm_cache (
    content_hash TEXT PRIMARY KEY,  -- SHA256 of (prompt + system + model)
    provider TEXT NOT NULL,
    model TEXT NOT NULL,
    response TEXT NOT NULL,  -- JSON: {content, input_tokens, output_tokens}
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Index for cache cleanup queries
CREATE INDEX IF NOT EXISTS llm_cache_created_at_idx ON llm_cache(created_at);
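
The content_hash comment pins down the cache key. A lookup-and-prune sketch consistent with those comments (okb.llm.cache may compose the key differently, e.g. around a missing system prompt):

    import hashlib
    import json

    import psycopg

    def cached_response(conn: psycopg.Connection, prompt: str, system: str, model: str) -> dict | None:
        # Key = SHA256(prompt + system + model), per the column comment above
        content_hash = hashlib.sha256((prompt + system + model).encode()).hexdigest()
        row = conn.execute(
            "SELECT response FROM llm_cache WHERE content_hash = %s",
            (content_hash,),
        ).fetchone()
        return json.loads(row[0]) if row else None

    def prune_cache(conn: psycopg.Connection, days: int = 30) -> None:
        # The created_at index above exists to keep this cleanup cheap
        conn.execute(
            "DELETE FROM llm_cache WHERE created_at < NOW() - make_interval(days => %s)",
            (days,),
        )
        conn.commit()
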
okb/modal_embedder.py
ADDED
@@ -0,0 +1,120 @@
"""
Modal-based GPU embedding service.

Provides on-demand GPU access for batch embedding generation.
Costs approximately $0.02 per 1000 chunks on T4 GPU.

Usage:
    modal deploy modal_embedder.py

Then call from Python:
    embedder = modal.Cls.from_name("knowledge-embedder", "Embedder")()
    embeddings = embedder.embed_batch.remote(texts)
"""

import modal

app = modal.App("knowledge-embedder")

# Container image with all dependencies
embedder_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "sentence-transformers>=2.2.0",
    "torch>=2.0.0",
    "numpy>=1.24.0",
    "einops>=0.7.0",  # Required by nomic
)


@app.cls(
    image=embedder_image,
    gpu="T4",  # Cheapest option, sufficient for embedding
    timeout=600,
    scaledown_window=300,  # Keep warm for 5 min
    retries=2,
)
class Embedder:
    """GPU-accelerated embedding generator using nomic-embed-text."""

    @modal.enter()
    def load_model(self):
        """Load model once when container starts."""
        from sentence_transformers import SentenceTransformer
        import torch

        self.model = SentenceTransformer(
            "nomic-ai/nomic-embed-text-v1.5",
            trust_remote_code=True,
        )

        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(device)
        print(f"Model loaded on {device}")

    @modal.method()
    def embed_batch(
        self,
        texts: list[str],
        is_query: bool = False,
        batch_size: int = 16,
    ) -> list[list[float]]:
        """
        Generate embeddings for a batch of texts.

        Args:
            texts: List of text chunks to embed
            is_query: If True, use query prefix; otherwise document prefix
            batch_size: Processing batch size

        Returns:
            List of embedding vectors (768 dimensions)
        """
        # Nomic model requires task-specific prefixes
        prefix = "search_query: " if is_query else "search_document: "
        prefixed = [f"{prefix}{t}" for t in texts]

        embeddings = self.model.encode(
            prefixed,
            batch_size=batch_size,
            show_progress_bar=len(texts) > 100,
            convert_to_numpy=True,
            normalize_embeddings=True,  # For cosine similarity
        )

        return embeddings.tolist()

    @modal.method()
    def embed_single(self, text: str, is_query: bool = False) -> list[float]:
        """Embed a single text (convenience method)."""
        prefix = "search_query: " if is_query else "search_document: "
        embedding = self.model.encode(
            f"{prefix}{text}",
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        return embedding.tolist()


# For testing
@app.local_entrypoint()
def test():
    """Test the embedder."""
    embedder = Embedder()

    test_texts = [
        "Django ORM query optimization using select_related and prefetch_related",
        "PostgreSQL VACUUM and ANALYZE for table maintenance",
        "Kubernetes pod scheduling with node affinity rules",
    ]

    print(f"Embedding {len(test_texts)} texts...")
    embeddings = embedder.embed_batch.remote(test_texts)

    print(f"Generated {len(embeddings)} embeddings")
    print(f"Embedding dimension: {len(embeddings[0])}")

    # Test similarity
    import numpy as np

    emb = np.array(embeddings)
    similarity = np.dot(emb, emb.T)
    print(f"\nSimilarity matrix:\n{similarity}")
okb/modal_llm.py
ADDED
@@ -0,0 +1,178 @@
"""
Modal-based GPU LLM service for document classification.

Provides on-demand GPU access for LLM inference using open models.
Uses Llama 3.2 3B by default - fast and efficient for classification tasks.

Usage:
    modal deploy modal_llm.py

Then call from Python:
    llm = modal.Cls.from_name("knowledge-llm", "LLM")()
    response = llm.complete.remote("Classify this document", system="You are a classifier")
"""

import modal

app = modal.App("knowledge-llm")

# Container image with transformers and torch
llm_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "transformers>=4.40.0",
        "torch>=2.0.0",
        "accelerate>=0.27.0",
        "bitsandbytes>=0.42.0",  # For quantization
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)

# Default model - Llama 3.2 3B is fast and good for classification
DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"


@app.cls(
    image=llm_image,
    gpu="T4",  # T4 is sufficient for 3B model with quantization
    timeout=300,
    scaledown_window=300,  # Keep warm for 5 min
    retries=1,
)
class LLM:
    """GPU-accelerated LLM for document classification."""

    model_id: str = DEFAULT_MODEL

    @modal.enter()
    def load_model(self):
        """Load model once when container starts."""
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

        print(f"Loading model: {self.model_id}")

        # Use 4-bit quantization for memory efficiency
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
        )

        # Set pad token if not set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print(f"Model loaded on {self.model.device}")

    @modal.method()
    def complete(
        self,
        prompt: str,
        system: str | None = None,
        max_tokens: int = 256,
        temperature: float = 0.1,
    ) -> dict:
        """Generate a completion for the given prompt.

        Args:
            prompt: User prompt
            system: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (lower = more deterministic)

        Returns:
            Dict with 'content', 'model', 'input_tokens', 'output_tokens'
        """
        import torch

        # Build messages in chat format
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        # Apply chat template
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        input_tokens = inputs.input_ids.shape[1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=temperature > 0,
                pad_token_id=self.tokenizer.pad_token_id,
            )

        # Decode only the new tokens
        output_tokens = outputs.shape[1] - input_tokens
        response_text = self.tokenizer.decode(
            outputs[0][input_tokens:],
            skip_special_tokens=True,
        )

        return {
            "content": response_text.strip(),
            "model": self.model_id,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
        }

    @modal.method()
    def complete_batch(
        self,
        prompts: list[str],
        system: str | None = None,
        max_tokens: int = 256,
    ) -> list[dict]:
        """Generate completions for multiple prompts.

        Args:
            prompts: List of user prompts
            system: Optional system prompt (same for all)
            max_tokens: Maximum tokens per response

        Returns:
            List of response dicts
        """
        # For now, process sequentially (batched inference is more complex)
        return [self.complete(prompt, system=system, max_tokens=max_tokens) for prompt in prompts]


# For testing
@app.local_entrypoint()
def test():
    """Test the LLM."""
    llm = LLM()

    system = "You are a document classifier. Respond with JSON: {action, reason}"
    prompt = """Classify this email:

Subject: 50% OFF - Limited Time Offer!
From: deals@marketing-spam.com

Don't miss out on our biggest sale of the year!
Click here to claim your discount before it expires!"""

    print("Testing LLM classification...")
    response = llm.complete.remote(prompt, system=system, max_tokens=100)

    print(f"Model: {response['model']}")
    print(f"Tokens: {response['input_tokens']} in, {response['output_tokens']} out")
    print(f"Response:\n{response['content']}")
okb/plugins/__init__.py
ADDED
@@ -0,0 +1,8 @@
"""Plugin system for LKB - extensible file parsers and API sources."""

# Re-export Document from ingest for plugin authors
from ..ingest import Document
from .base import APISource, FileParser, SyncState
from .registry import PluginRegistry

__all__ = ["FileParser", "APISource", "SyncState", "Document", "PluginRegistry"]
okb/plugins/base.py
ADDED
@@ -0,0 +1,110 @@
"""Protocol definitions for LKB plugins."""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Protocol, runtime_checkable

if TYPE_CHECKING:
    from ..ingest import Document


@dataclass
class SyncState:
    """Tracks sync progress for incremental updates."""

    last_sync: datetime | None = None
    cursor: str | None = None
    extra: dict = field(default_factory=dict)


@runtime_checkable
class FileParser(Protocol):
    """Protocol for file format parsers.

    Plugins implement this to add support for new file types.

    Example:
        class MyParser:
            extensions = ['.xyz']
            source_type = 'xyz'

            def can_parse(self, path: Path) -> bool:
                return path.suffix.lower() == '.xyz'

            def parse(self, path: Path, extra_metadata: dict | None = None) -> Document:
                ...
    """

    extensions: list[str]  # e.g., ['.pdf', '.PDF'] - for fast pre-filtering
    source_type: str  # e.g., 'pdf'

    def can_parse(self, path: Path) -> bool:
        """Check if this parser can handle the file (beyond just extension).

        Called after extension match. Can inspect file content, magic bytes, etc.
        Return False to let other parsers try.
        """
        ...

    def parse(self, path: Path, extra_metadata: dict | None = None) -> Document:
        """Parse the file and return a Document.

        Args:
            path: Path to the file to parse
            extra_metadata: Optional metadata to merge into the document

        Returns:
            Document instance ready for ingestion
        """
        ...


@runtime_checkable
class APISource(Protocol):
    """Protocol for API-based data sources.

    Plugins implement this to sync data from external services.

    Example:
        class GitHubSource:
            name = 'github'
            source_type = 'github-source'

            def configure(self, config: dict) -> None:
                self._token = config['token']
                self._repos = config.get('repos', [])  # From CLI --repo flags

            def fetch(self, state: SyncState | None = None) -> tuple[list[Document], SyncState]:
                # Use state.last_sync for incremental fetching
                # Return (documents, new_state)
                ...
    """

    name: str  # e.g., 'github'
    source_type: str  # e.g., 'github-issue'

    def configure(self, config: dict) -> None:
        """Configure the source with settings from config file.

        Config values may include resolved environment variables.

        Args:
            config: Source-specific configuration dict
        """
        ...

    def fetch(self, state: SyncState | None = None) -> tuple[list[Document], SyncState]:
        """Fetch documents from the external source.

        Should support incremental fetching using the state object.

        Args:
            state: Previous sync state for incremental updates, or None for full sync

        Returns:
            Tuple of (list of documents, new sync state)
        """
        ...
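
For illustration, a minimal parser satisfying FileParser. The Document keyword arguments below are assumptions inferred from the documents table columns; check okb.ingest.Document for the actual constructor:

    from pathlib import Path

    from okb.plugins import Document

    class TextParser:
        """Toy parser for plain .txt files."""

        extensions = [".txt"]
        source_type = "text"

        def can_parse(self, path: Path) -> bool:
            return path.suffix.lower() == ".txt"

        def parse(self, path: Path, extra_metadata: dict | None = None) -> Document:
            # Field names are assumptions mirroring the documents table
            return Document(
                source_path=str(path),
                source_type=self.source_type,
                title=path.stem,
                content=path.read_text(encoding="utf-8"),
                metadata=extra_metadata or {},
            )

Because the protocols are @runtime_checkable, a registry can sanity-check conformance with isinstance(TextParser(), FileParser).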