okb-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
okb/migrate.py ADDED
@@ -0,0 +1,53 @@
+"""Migration runner for okb database schema."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from yoyo import get_backend, read_migrations
+
+
+def get_migrations_path() -> str:
+    """Get path to migrations directory."""
+    return str(Path(__file__).parent / "migrations")
+
+
+def _convert_db_url(db_url: str) -> str:
+    """Convert psycopg3 URL to yoyo-compatible format.
+
+    yoyo uses psycopg2 by default. We convert:
+        postgresql://... -> postgresql+psycopg://...
+    to use psycopg v3.
+    """
+    if db_url.startswith("postgresql://") and "+psycopg" not in db_url:
+        return db_url.replace("postgresql://", "postgresql+psycopg://", 1)
+    return db_url
+
+
+def run_migrations(db_url: str) -> list[str]:
+    """Apply pending migrations, return list of applied migration IDs."""
+    backend = get_backend(_convert_db_url(db_url))
+    migrations = read_migrations(get_migrations_path())
+
+    with backend.lock():
+        to_apply = backend.to_apply(migrations)
+        if to_apply:
+            backend.apply_migrations(to_apply)
+
+    return [m.id for m in to_apply]
+
+
+def get_pending(db_url: str) -> list[str]:
+    """Get list of pending migration IDs."""
+    backend = get_backend(_convert_db_url(db_url))
+    migrations = read_migrations(get_migrations_path())
+    return [m.id for m in backend.to_apply(migrations)]
+
+
+def get_applied(db_url: str) -> list[str]:
+    """Get list of applied migration IDs."""
+    backend = get_backend(_convert_db_url(db_url))
+    migrations = read_migrations(get_migrations_path())
+    to_apply = backend.to_apply(migrations)
+    to_apply_ids = {m.id for m in to_apply}
+    return [m.id for m in migrations if m.id not in to_apply_ids]
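A rough usage sketch for the helpers above (the connection URL is a hypothetical local database, not something the package configures for you):

    from okb.migrate import get_applied, get_pending, run_migrations

    db_url = "postgresql://localhost/okb"  # hypothetical

    print("applied:", get_applied(db_url))
    if get_pending(db_url):
        print("newly applied:", run_migrations(db_url))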
@@ -0,0 +1,91 @@
+-- Initial schema - documents, chunks, indexes
+-- depends:
+
+CREATE EXTENSION IF NOT EXISTS vector;
+
+-- Main documents table
+CREATE TABLE IF NOT EXISTS documents (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    source_path TEXT NOT NULL,
+    source_type TEXT NOT NULL,
+    title TEXT,
+    content TEXT NOT NULL,
+    metadata JSONB DEFAULT '{}',
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW(),
+    content_hash TEXT NOT NULL,
+    UNIQUE(content_hash)
+);
+
+-- Chunks for semantic search
+CREATE TABLE IF NOT EXISTS chunks (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
+    chunk_index INTEGER NOT NULL,
+    content TEXT NOT NULL,
+    embedding_text TEXT NOT NULL,
+    embedding vector(768),
+    token_count INTEGER,
+    metadata JSONB DEFAULT '{}',
+    created_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+-- HNSW index for similarity search
+CREATE INDEX IF NOT EXISTS chunks_embedding_idx ON chunks
+    USING hnsw (embedding vector_cosine_ops)
+    WITH (m = 16, ef_construction = 64);
+
+-- Full-text search indexes
+CREATE INDEX IF NOT EXISTS documents_content_fts ON documents
+    USING gin(to_tsvector('english', content));
+
+CREATE INDEX IF NOT EXISTS chunks_content_fts ON chunks
+    USING gin(to_tsvector('english', content));
+
+-- Lookup indexes
+CREATE INDEX IF NOT EXISTS documents_source_path_idx ON documents(source_path);
+CREATE INDEX IF NOT EXISTS documents_source_type_idx ON documents(source_type);
+CREATE INDEX IF NOT EXISTS documents_metadata_idx ON documents USING gin(metadata);
+
+-- Function to update timestamp
+CREATE OR REPLACE FUNCTION update_updated_at()
+RETURNS TRIGGER AS $$
+BEGIN
+    NEW.updated_at = NOW();
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Trigger (drop first to make idempotent)
+DROP TRIGGER IF EXISTS documents_updated_at ON documents;
+CREATE TRIGGER documents_updated_at
+    BEFORE UPDATE ON documents
+    FOR EACH ROW
+    EXECUTE FUNCTION update_updated_at();
+
+-- Helper view for search results
+CREATE OR REPLACE VIEW search_results AS
+SELECT
+    c.id as chunk_id,
+    c.content,
+    c.chunk_index,
+    c.token_count,
+    c.embedding,
+    d.id as document_id,
+    d.source_path,
+    d.source_type,
+    d.title,
+    d.metadata
+FROM chunks c
+JOIN documents d ON c.document_id = d.id;
+
+-- Stats view
+CREATE OR REPLACE VIEW index_stats AS
+SELECT
+    source_type,
+    COUNT(DISTINCT d.id) as document_count,
+    COUNT(c.id) as chunk_count,
+    SUM(c.token_count) as total_tokens
+FROM documents d
+LEFT JOIN chunks c ON c.document_id = d.id
+GROUP BY source_type;
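A minimal retrieval sketch against this schema, assuming psycopg 3 and a query embedding computed elsewhere; the connection string and the text-literal vector formatting are illustrative, not prescribed by the package:

    import psycopg

    def top_chunks(db_url: str, query_embedding: list[float], limit: int = 5):
        # pgvector accepts a bracketed text literal cast to ::vector,
        # which keeps the sketch free of extra type adapters.
        vec = "[" + ",".join(str(x) for x in query_embedding) + "]"
        with psycopg.connect(db_url) as conn:
            return conn.execute(
                """
                SELECT chunk_id, source_path, title,
                       embedding <=> %s::vector AS cosine_distance
                FROM search_results
                ORDER BY embedding <=> %s::vector
                LIMIT %s
                """,
                (vec, vec, limit),
            ).fetchall()

The ORDER BY uses the same cosine distance operator the HNSW index above is built for, so the planner can typically satisfy the scan via chunks_embedding_idx.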
@@ -0,0 +1,22 @@
+-- Sync state for API sources (plugin system)
+-- depends: 0001.initial-schema
+
+CREATE TABLE IF NOT EXISTS sync_state (
+    source_name TEXT NOT NULL,
+    database_name TEXT NOT NULL DEFAULT 'default',
+    last_sync TIMESTAMPTZ,
+    cursor TEXT,
+    extra JSONB DEFAULT '{}',
+    updated_at TIMESTAMPTZ DEFAULT NOW(),
+    PRIMARY KEY (source_name, database_name)
+);
+
+-- Index for listing sync states
+CREATE INDEX IF NOT EXISTS sync_state_updated_idx ON sync_state(updated_at DESC);
+
+-- Trigger for updated_at
+DROP TRIGGER IF EXISTS sync_state_updated_at ON sync_state;
+CREATE TRIGGER sync_state_updated_at
+    BEFORE UPDATE ON sync_state
+    FOR EACH ROW
+    EXECUTE FUNCTION update_updated_at();
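The composite primary key is what makes per-source, per-database upserts straightforward; a sketch of how a sync runner might persist state (psycopg 3 assumed, the helper name is illustrative):

    import psycopg
    from psycopg.types.json import Json

    def save_sync_state(conn: psycopg.Connection, source: str,
                        cursor: str | None, extra: dict,
                        database: str = "default") -> None:
        conn.execute(
            """
            INSERT INTO sync_state (source_name, database_name, last_sync, cursor, extra)
            VALUES (%s, %s, NOW(), %s, %s)
            ON CONFLICT (source_name, database_name) DO UPDATE
            SET last_sync = EXCLUDED.last_sync,
                cursor = EXCLUDED.cursor,
                extra = EXCLUDED.extra
            """,
            (source, database, cursor, Json(extra)),
        )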
@@ -0,0 +1,22 @@
+-- Structured fields for actionable items (tasks, events, emails)
+-- depends: 0002.sync-state
+
+-- Add structured fields to documents table for temporal/status queries
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS due_date TIMESTAMPTZ;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS event_start TIMESTAMPTZ;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS event_end TIMESTAMPTZ;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS status TEXT;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS priority INTEGER;
+
+-- Indexes for common query patterns
+CREATE INDEX IF NOT EXISTS documents_due_date_idx ON documents(due_date) WHERE due_date IS NOT NULL;
+CREATE INDEX IF NOT EXISTS documents_event_start_idx ON documents(event_start) WHERE event_start IS NOT NULL;
+CREATE INDEX IF NOT EXISTS documents_status_idx ON documents(status) WHERE status IS NOT NULL;
+
+-- Composite index for "incomplete tasks due soon" queries
+CREATE INDEX IF NOT EXISTS documents_actionable_idx ON documents(due_date, status)
+    WHERE due_date IS NOT NULL AND status IS NOT NULL;
+
+-- Composite index for "events in date range" queries
+CREATE INDEX IF NOT EXISTS documents_event_range_idx ON documents(event_start, event_end)
+    WHERE event_start IS NOT NULL;
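The partial indexes above target queries like this sketch of "incomplete tasks due soon"; the specific status values are illustrative, since the schema does not constrain them:

    import psycopg

    db_url = "postgresql://localhost/okb"  # hypothetical

    with psycopg.connect(db_url) as conn:
        due_soon = conn.execute(
            """
            SELECT id, title, due_date, status, priority
            FROM documents
            WHERE due_date IS NOT NULL
              AND status IS NOT NULL
              AND status NOT IN ('done', 'cancelled')
              AND due_date < NOW() + INTERVAL '7 days'
            ORDER BY due_date
            """
        ).fetchall()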
@@ -0,0 +1,13 @@
+-- API tokens for HTTP authentication
+-- depends: 0003.structured-fields
+
+CREATE TABLE IF NOT EXISTS tokens (
+    token_hash TEXT PRIMARY KEY,  -- SHA256(full_token), hex encoded
+    permissions TEXT NOT NULL CHECK (permissions IN ('ro', 'rw')),
+    description TEXT,
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    last_used_at TIMESTAMPTZ
+);
+
+-- Index for listing tokens
+CREATE INDEX IF NOT EXISTS tokens_created_at_idx ON tokens(created_at DESC);
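Only a hex SHA-256 digest of the full token is stored, so verification is a hash-and-lookup; a sketch of what the HTTP layer might do (the function is illustrative, not part of this package):

    import hashlib
    import psycopg

    def check_token(conn: psycopg.Connection, presented_token: str) -> str | None:
        """Return 'ro' or 'rw' if the token is recognized, else None."""
        digest = hashlib.sha256(presented_token.encode()).hexdigest()
        row = conn.execute(
            """
            UPDATE tokens SET last_used_at = NOW()
            WHERE token_hash = %s
            RETURNING permissions
            """,
            (digest,),
        ).fetchone()
        return row[0] if row else None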
@@ -0,0 +1,19 @@
+-- Database metadata table for storing LLM-enhanced descriptions
+-- depends: 0004.tokens
+
+CREATE TABLE IF NOT EXISTS database_metadata (
+    id SERIAL PRIMARY KEY,
+    key TEXT NOT NULL UNIQUE,  -- 'description', 'topics', etc.
+    value JSONB NOT NULL,
+    source TEXT NOT NULL DEFAULT 'llm',  -- 'config' or 'llm'
+    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+-- Index for quick key lookups
+CREATE INDEX IF NOT EXISTS idx_database_metadata_key ON database_metadata(key);
+
+-- Insert default entries that can be updated by LLM
+INSERT INTO database_metadata (key, value, source) VALUES
+    ('llm_description', 'null'::jsonb, 'llm'),
+    ('llm_topics', '[]'::jsonb, 'llm')
+ON CONFLICT (key) DO NOTHING;
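The seeded rows are placeholders for values written later; a sketch of how an LLM-generated description might be upserted (the helper is illustrative):

    import psycopg
    from psycopg.types.json import Json

    def set_llm_description(conn: psycopg.Connection, description: str) -> None:
        conn.execute(
            """
            INSERT INTO database_metadata (key, value, source)
            VALUES ('llm_description', %s, 'llm')
            ON CONFLICT (key) DO UPDATE
            SET value = EXCLUDED.value, source = EXCLUDED.source, updated_at = NOW()
            """,
            (Json(description),),
        )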
@@ -0,0 +1,13 @@
+-- LLM response cache for avoiding redundant API calls
+-- depends: 0005.database-metadata
+
+CREATE TABLE IF NOT EXISTS llm_cache (
+    content_hash TEXT PRIMARY KEY,  -- SHA256 of (prompt + system + model)
+    provider TEXT NOT NULL,
+    model TEXT NOT NULL,
+    response TEXT NOT NULL,  -- JSON: {content, input_tokens, output_tokens}
+    created_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+-- Index for cache cleanup queries
+CREATE INDEX IF NOT EXISTS llm_cache_created_at_idx ON llm_cache(created_at);
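The primary key is documented as SHA-256 of (prompt + system + model); the exact concatenation scheme is not shown here, so the key construction below is an assumption for illustration:

    import hashlib
    import psycopg

    def cached_response(conn: psycopg.Connection, prompt: str,
                        system: str, model: str) -> str | None:
        content_hash = hashlib.sha256((prompt + system + model).encode()).hexdigest()
        row = conn.execute(
            "SELECT response FROM llm_cache WHERE content_hash = %s",
            (content_hash,),
        ).fetchone()
        return row[0] if row else None  # caller parses the JSON payload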
okb/modal_embedder.py ADDED
@@ -0,0 +1,120 @@
+"""
+Modal-based GPU embedding service.
+
+Provides on-demand GPU access for batch embedding generation.
+Costs approximately $0.02 per 1000 chunks on T4 GPU.
+
+Usage:
+    modal deploy modal_embedder.py
+
+Then call from Python:
+    embedder = modal.Cls.from_name("knowledge-embedder", "Embedder")()
+    embeddings = embedder.embed_batch.remote(texts)
+"""
+
+import modal
+
+app = modal.App("knowledge-embedder")
+
+# Container image with all dependencies
+embedder_image = modal.Image.debian_slim(python_version="3.11").pip_install(
+    "sentence-transformers>=2.2.0",
+    "torch>=2.0.0",
+    "numpy>=1.24.0",
+    "einops>=0.7.0",  # Required by nomic
+)
+
+
+@app.cls(
+    image=embedder_image,
+    gpu="T4",  # Cheapest option, sufficient for embedding
+    timeout=600,
+    scaledown_window=300,  # Keep warm for 5 min
+    retries=2,
+)
+class Embedder:
+    """GPU-accelerated embedding generator using nomic-embed-text."""
+
+    @modal.enter()
+    def load_model(self):
+        """Load model once when container starts."""
+        from sentence_transformers import SentenceTransformer
+        import torch
+
+        self.model = SentenceTransformer(
+            "nomic-ai/nomic-embed-text-v1.5",
+            trust_remote_code=True,
+        )
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model.to(device)
+        print(f"Model loaded on {device}")
+
+    @modal.method()
+    def embed_batch(
+        self,
+        texts: list[str],
+        is_query: bool = False,
+        batch_size: int = 16,
+    ) -> list[list[float]]:
+        """
+        Generate embeddings for a batch of texts.
+
+        Args:
+            texts: List of text chunks to embed
+            is_query: If True, use query prefix; otherwise document prefix
+            batch_size: Processing batch size
+
+        Returns:
+            List of embedding vectors (768 dimensions)
+        """
+        # Nomic model requires task-specific prefixes
+        prefix = "search_query: " if is_query else "search_document: "
+        prefixed = [f"{prefix}{t}" for t in texts]
+
+        embeddings = self.model.encode(
+            prefixed,
+            batch_size=batch_size,
+            show_progress_bar=len(texts) > 100,
+            convert_to_numpy=True,
+            normalize_embeddings=True,  # For cosine similarity
+        )
+
+        return embeddings.tolist()
+
+    @modal.method()
+    def embed_single(self, text: str, is_query: bool = False) -> list[float]:
+        """Embed a single text (convenience method)."""
+        prefix = "search_query: " if is_query else "search_document: "
+        embedding = self.model.encode(
+            f"{prefix}{text}",
+            convert_to_numpy=True,
+            normalize_embeddings=True,
+        )
+        return embedding.tolist()
+
+
+# For testing
+@app.local_entrypoint()
+def test():
+    """Test the embedder."""
+    embedder = Embedder()
+
+    test_texts = [
+        "Django ORM query optimization using select_related and prefetch_related",
+        "PostgreSQL VACUUM and ANALYZE for table maintenance",
+        "Kubernetes pod scheduling with node affinity rules",
+    ]
+
+    print(f"Embedding {len(test_texts)} texts...")
+    embeddings = embedder.embed_batch.remote(test_texts)
+
+    print(f"Generated {len(embeddings)} embeddings")
+    print(f"Embedding dimension: {len(embeddings[0])}")
+
+    # Test similarity
+    import numpy as np
+
+    emb = np.array(embeddings)
+    similarity = np.dot(emb, emb.T)
+    print(f"\nSimilarity matrix:\n{similarity}")
okb/modal_llm.py ADDED
@@ -0,0 +1,178 @@
+"""
+Modal-based GPU LLM service for document classification.
+
+Provides on-demand GPU access for LLM inference using open models.
+Uses Llama 3.2 3B by default - fast and efficient for classification tasks.
+
+Usage:
+    modal deploy modal_llm.py
+
+Then call from Python:
+    llm = modal.Cls.from_name("knowledge-llm", "LLM")()
+    response = llm.complete.remote("Classify this document", system="You are a classifier")
+"""
+
+import modal
+
+app = modal.App("knowledge-llm")
+
+# Container image with transformers and torch
+llm_image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "transformers>=4.40.0",
+        "torch>=2.0.0",
+        "accelerate>=0.27.0",
+        "bitsandbytes>=0.42.0",  # For quantization
+    )
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
+)
+
+# Default model - Llama 3.2 3B is fast and good for classification
+DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
+
+
+@app.cls(
+    image=llm_image,
+    gpu="T4",  # T4 is sufficient for 3B model with quantization
+    timeout=300,
+    scaledown_window=300,  # Keep warm for 5 min
+    retries=1,
+)
+class LLM:
+    """GPU-accelerated LLM for document classification."""
+
+    model_id: str = DEFAULT_MODEL
+
+    @modal.enter()
+    def load_model(self):
+        """Load model once when container starts."""
+        import torch
+        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+        print(f"Loading model: {self.model_id}")
+
+        # Use 4-bit quantization for memory efficiency
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            quantization_config=quantization_config,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+
+        # Set pad token if not set
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        print(f"Model loaded on {self.model.device}")
+
+    @modal.method()
+    def complete(
+        self,
+        prompt: str,
+        system: str | None = None,
+        max_tokens: int = 256,
+        temperature: float = 0.1,
+    ) -> dict:
+        """Generate a completion for the given prompt.
+
+        Args:
+            prompt: User prompt
+            system: Optional system prompt
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature (lower = more deterministic)
+
+        Returns:
+            Dict with 'content', 'model', 'input_tokens', 'output_tokens'
+        """
+        import torch
+
+        # Build messages in chat format
+        messages = []
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.append({"role": "user", "content": prompt})
+
+        # Apply chat template
+        text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
+        input_tokens = inputs.input_ids.shape[1]
+
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                do_sample=temperature > 0,
+                pad_token_id=self.tokenizer.pad_token_id,
+            )
+
+        # Decode only the new tokens
+        output_tokens = outputs.shape[1] - input_tokens
+        response_text = self.tokenizer.decode(
+            outputs[0][input_tokens:],
+            skip_special_tokens=True,
+        )
+
+        return {
+            "content": response_text.strip(),
+            "model": self.model_id,
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+        }
+
+    @modal.method()
+    def complete_batch(
+        self,
+        prompts: list[str],
+        system: str | None = None,
+        max_tokens: int = 256,
+    ) -> list[dict]:
+        """Generate completions for multiple prompts.
+
+        Args:
+            prompts: List of user prompts
+            system: Optional system prompt (same for all)
+            max_tokens: Maximum tokens per response
+
+        Returns:
+            List of response dicts
+        """
+        # For now, process sequentially (batched inference is more complex)
+        return [self.complete(prompt, system=system, max_tokens=max_tokens) for prompt in prompts]
+
+
+# For testing
+@app.local_entrypoint()
+def test():
+    """Test the LLM."""
+    llm = LLM()
+
+    system = "You are a document classifier. Respond with JSON: {action, reason}"
+    prompt = """Classify this email:
+
+Subject: 50% OFF - Limited Time Offer!
+From: deals@marketing-spam.com
+
+Don't miss out on our biggest sale of the year!
+Click here to claim your discount before it expires!"""
+
+    print("Testing LLM classification...")
+    response = llm.complete.remote(prompt, system=system, max_tokens=100)
+
+    print(f"Model: {response['model']}")
+    print(f"Tokens: {response['input_tokens']} in, {response['output_tokens']} out")
+    print(f"Response:\n{response['content']}")
@@ -0,0 +1,8 @@
+"""Plugin system for LKB - extensible file parsers and API sources."""
+
+# Re-export Document from ingest for plugin authors
+from ..ingest import Document
+from .base import APISource, FileParser, SyncState
+from .registry import PluginRegistry
+
+__all__ = ["FileParser", "APISource", "SyncState", "Document", "PluginRegistry"]
okb/plugins/base.py ADDED
@@ -0,0 +1,110 @@
+"""Protocol definitions for LKB plugins."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    from ..ingest import Document
+
+
+@dataclass
+class SyncState:
+    """Tracks sync progress for incremental updates."""
+
+    last_sync: datetime | None = None
+    cursor: str | None = None
+    extra: dict = field(default_factory=dict)
+
+
+@runtime_checkable
+class FileParser(Protocol):
+    """Protocol for file format parsers.
+
+    Plugins implement this to add support for new file types.
+
+    Example:
+        class MyParser:
+            extensions = ['.xyz']
+            source_type = 'xyz'
+
+            def can_parse(self, path: Path) -> bool:
+                return path.suffix.lower() == '.xyz'
+
+            def parse(self, path: Path, extra_metadata: dict | None = None) -> Document:
+                ...
+    """
+
+    extensions: list[str]  # e.g., ['.pdf', '.PDF'] - for fast pre-filtering
+    source_type: str  # e.g., 'pdf'
+
+    def can_parse(self, path: Path) -> bool:
+        """Check if this parser can handle the file (beyond just extension).
+
+        Called after extension match. Can inspect file content, magic bytes, etc.
+        Return False to let other parsers try.
+        """
+        ...
+
+    def parse(self, path: Path, extra_metadata: dict | None = None) -> Document:
+        """Parse the file and return a Document.
+
+        Args:
+            path: Path to the file to parse
+            extra_metadata: Optional metadata to merge into the document
+
+        Returns:
+            Document instance ready for ingestion
+        """
+        ...
+
+
+@runtime_checkable
+class APISource(Protocol):
+    """Protocol for API-based data sources.
+
+    Plugins implement this to sync data from external services.
+
+    Example:
+        class GitHubSource:
+            name = 'github'
+            source_type = 'github-source'
+
+            def configure(self, config: dict) -> None:
+                self._token = config['token']
+                self._repos = config.get('repos', [])  # From CLI --repo flags
+
+            def fetch(self, state: SyncState | None = None) -> tuple[list[Document], SyncState]:
+                # Use state.last_sync for incremental fetching
+                # Return (documents, new_state)
+                ...
+    """
+
+    name: str  # e.g., 'github'
+    source_type: str  # e.g., 'github-issue'
+
+    def configure(self, config: dict) -> None:
+        """Configure the source with settings from config file.
+
+        Config values may include resolved environment variables.
+
+        Args:
+            config: Source-specific configuration dict
+        """
+        ...
+
+    def fetch(self, state: SyncState | None = None) -> tuple[list[Document], SyncState]:
+        """Fetch documents from the external source.
+
+        Should support incremental fetching using the state object.
+
+        Args:
+            state: Previous sync state for incremental updates, or None for full sync
+
+        Returns:
+            Tuple of (list of documents, new sync state)
+        """
+        ...
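A minimal concrete parser satisfying the FileParser protocol, as a sketch; the Document field names below mirror the documents table columns and are an assumption, not the confirmed okb.ingest.Document signature:

    from pathlib import Path

    from okb.plugins import Document  # re-exported from okb.ingest

    class PlainLogParser:
        """Hypothetical parser for .log files."""

        extensions = [".log"]
        source_type = "log"

        def can_parse(self, path: Path) -> bool:
            return path.suffix.lower() == ".log"

        def parse(self, path: Path, extra_metadata: dict | None = None) -> Document:
            text = path.read_text(errors="replace")
            # Field names are assumed from the documents table, not verified here.
            return Document(
                source_path=str(path),
                source_type=self.source_type,
                title=path.name,
                content=text,
                metadata=extra_metadata or {},
            )

Because the protocol is runtime_checkable, isinstance(PlainLogParser(), FileParser) holds, though such checks only verify attribute presence, not method signatures.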