okb-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okb/__init__.py +3 -0
- okb/cli.py +1272 -0
- okb/config.py +661 -0
- okb/data/init.sql +92 -0
- okb/http_server.py +463 -0
- okb/ingest.py +1589 -0
- okb/llm/__init__.py +86 -0
- okb/llm/base.py +83 -0
- okb/llm/cache.py +217 -0
- okb/llm/filter.py +187 -0
- okb/llm/providers.py +322 -0
- okb/local_embedder.py +87 -0
- okb/mcp_server.py +1393 -0
- okb/migrate.py +53 -0
- okb/migrations/0001.initial-schema.sql +91 -0
- okb/migrations/0002.sync-state.sql +22 -0
- okb/migrations/0003.structured-fields.sql +22 -0
- okb/migrations/0004.tokens.sql +13 -0
- okb/migrations/0005.database-metadata.sql +19 -0
- okb/migrations/0006.llm-cache.sql +13 -0
- okb/modal_embedder.py +120 -0
- okb/modal_llm.py +178 -0
- okb/plugins/__init__.py +8 -0
- okb/plugins/base.py +110 -0
- okb/plugins/registry.py +123 -0
- okb/plugins/sources/__init__.py +5 -0
- okb/plugins/sources/dropbox_paper.py +188 -0
- okb/plugins/sources/github.py +484 -0
- okb/rescan.py +227 -0
- okb/scripts/__init__.py +1 -0
- okb/scripts/watch.py +206 -0
- okb/tokens.py +277 -0
- okb-1.0.0.dist-info/METADATA +397 -0
- okb-1.0.0.dist-info/RECORD +36 -0
- okb-1.0.0.dist-info/WHEEL +4 -0
- okb-1.0.0.dist-info/entry_points.txt +9 -0
okb/llm/__init__.py
ADDED
@@ -0,0 +1,86 @@
"""LLM integration for document classification and enrichment.

This package provides a provider-agnostic interface for LLM operations,
with support for Claude API, AWS Bedrock, and response caching.

Usage:
    from okb.llm import get_llm, complete

    # Get configured provider (returns None if disabled)
    llm = get_llm()
    if llm:
        response = llm.complete("Summarize this document", system="Be concise")

    # Or use convenience function with caching
    response = complete("Classify this email")
"""

from .base import LLMProvider, LLMResponse
from .filter import FilterAction, FilterResult, filter_document, filter_documents

__all__ = [
    "LLMProvider",
    "LLMResponse",
    "FilterAction",
    "FilterResult",
    "filter_document",
    "filter_documents",
    "get_llm",
    "complete",
]


def get_llm() -> LLMProvider | None:
    """Get the configured LLM provider, or None if disabled.

    Reads configuration from the global config object.
    Lazily initializes the provider on first call.

    Returns:
        Configured LLMProvider instance, or None if llm.provider is not set
    """
    from .providers import get_provider

    return get_provider()


def complete(
    prompt: str,
    system: str | None = None,
    max_tokens: int = 1024,
    use_cache: bool = True,
) -> LLMResponse | None:
    """Generate a completion using the configured LLM provider.

    Convenience function that handles caching and provider initialization.

    Args:
        prompt: The user prompt to complete
        system: Optional system prompt for context/instructions
        max_tokens: Maximum tokens in the response
        use_cache: Whether to use cached responses (default True)

    Returns:
        LLMResponse with the generated content, or None if LLM is disabled
    """
    from .cache import cache_response, get_cached
    from .providers import get_provider

    provider = get_provider()
    if provider is None:
        return None

    # Check cache first
    if use_cache:
        cached = get_cached(prompt, system, provider.name)
        if cached is not None:
            return cached

    # Generate new response
    response = provider.complete(prompt, system=system, max_tokens=max_tokens)

    # Cache the response
    if use_cache:
        cache_response(prompt, system, provider.name, response)

    return response

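Taken together, get_llm() and complete() give callers a single entry point that degrades gracefully when no provider is configured. A minimal caller-side sketch (hypothetical, not part of the package; it assumes okb is installed and an llm provider is set in its config, otherwise both calls return None):

# Hypothetical caller-side sketch; assumes an LLM provider is configured,
# otherwise get_llm() and complete() both return None.
from okb.llm import complete, get_llm

llm = get_llm()
if llm is None:
    print("LLM disabled; skipping enrichment")
else:
    # First call goes to the provider and is written to llm_cache.
    first = complete("Summarize this document", system="Be concise")
    # Identical prompt/system/model: served from the cache, no API call.
    second = complete("Summarize this document", system="Be concise")
    if first and second:
        print(first.model, first.total_tokens)
        assert first.content == second.content

Passing use_cache=False on either call would bypass llm_cache and force a fresh completion.
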
okb/llm/base.py
ADDED
@@ -0,0 +1,83 @@
"""Protocol definitions and types for LLM providers."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Protocol, runtime_checkable


@dataclass
class LLMResponse:
    """Standard response from LLM providers."""

    content: str  # Raw text response
    model: str  # Model that generated it
    input_tokens: int | None = None
    output_tokens: int | None = None

    @property
    def total_tokens(self) -> int | None:
        """Total tokens used (input + output)."""
        if self.input_tokens is None or self.output_tokens is None:
            return None
        return self.input_tokens + self.output_tokens


@runtime_checkable
class LLMProvider(Protocol):
    """Protocol for LLM backend providers.

    Plugins implement this to add support for different LLM services.

    Example:
        class MyProvider:
            name = 'my-llm'

            def configure(self, config: dict) -> None:
                self._api_key = config.get('api_key')

            def complete(self, prompt: str, system: str | None = None) -> LLMResponse:
                # Call the LLM API
                ...

            def is_available(self) -> bool:
                return self._api_key is not None
    """

    name: str  # Provider identifier, e.g., "claude", "bedrock", "ollama"

    def configure(self, config: dict) -> None:
        """Configure the provider with settings from config.

        Config values may include resolved environment variables.

        Args:
            config: Provider-specific configuration dict
        """
        ...

    def complete(
        self,
        prompt: str,
        system: str | None = None,
        max_tokens: int = 1024,
    ) -> LLMResponse:
        """Generate a completion for the given prompt.

        Args:
            prompt: The user prompt to complete
            system: Optional system prompt for context/instructions
            max_tokens: Maximum tokens in the response

        Returns:
            LLMResponse with the generated content and metadata
        """
        ...

    def is_available(self) -> bool:
        """Check if the provider is configured and reachable.

        Returns:
            True if the provider can accept requests
        """
        ...

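Because LLMProvider is a runtime-checkable Protocol, any object with a matching shape qualifies; nothing has to inherit from it. A hypothetical stub provider, for example for tests (EchoProvider and the "echo-1" model name are invented for illustration):

# Hypothetical stub provider for tests; EchoProvider and "echo-1" are invented.
# It satisfies LLMProvider structurally: name, configure, complete, is_available.
from __future__ import annotations

from okb.llm.base import LLMProvider, LLMResponse


class EchoProvider:
    name = "echo"

    def __init__(self) -> None:
        self._configured = False

    def configure(self, config: dict) -> None:
        # A real provider would read API keys or endpoints from `config`.
        self._configured = True

    def complete(
        self,
        prompt: str,
        system: str | None = None,
        max_tokens: int = 1024,
    ) -> LLMResponse:
        # Echo the prompt back instead of calling an API.
        words = prompt.split()
        return LLMResponse(
            content=prompt,
            model="echo-1",
            input_tokens=len(words),
            output_tokens=min(len(words), max_tokens),
        )

    def is_available(self) -> bool:
        return self._configured


provider = EchoProvider()
provider.configure({})
assert isinstance(provider, LLMProvider)  # runtime_checkable structural check
assert provider.complete("hello world").total_tokens == 4
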
okb/llm/cache.py
ADDED
@@ -0,0 +1,217 @@
"""LLM response caching to avoid redundant API calls."""

from __future__ import annotations

import hashlib
import json
from datetime import datetime
from typing import TYPE_CHECKING

import psycopg

from .base import LLMResponse

if TYPE_CHECKING:
    pass


def _compute_cache_key(prompt: str, system: str | None, model: str) -> str:
    """Compute cache key from prompt, system, and model.

    Args:
        prompt: User prompt
        system: System prompt (may be None)
        model: Model name

    Returns:
        SHA256 hash of the combined inputs
    """
    content = f"{prompt}\n---\n{system or ''}\n---\n{model}"
    return hashlib.sha256(content.encode()).hexdigest()


def get_cached(
    prompt: str,
    system: str | None,
    provider: str,
    db_url: str | None = None,
) -> LLMResponse | None:
    """Retrieve a cached LLM response if available.

    Args:
        prompt: User prompt
        system: System prompt
        provider: Provider name (e.g., "claude")
        db_url: Database URL (default: from config)

    Returns:
        Cached LLMResponse or None if not found
    """
    from ..config import config

    if db_url is None:
        db_url = config.db_url

    # Get model from config for cache key
    model = config.llm_model or "default"
    content_hash = _compute_cache_key(prompt, system, model)

    try:
        with psycopg.connect(db_url) as conn:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    SELECT response FROM llm_cache
                    WHERE content_hash = %s AND provider = %s AND model = %s
                    """,
                    (content_hash, provider, model),
                )
                row = cur.fetchone()
                if row is None:
                    return None

                # Parse cached response
                data = json.loads(row[0])
                return LLMResponse(
                    content=data["content"],
                    model=data["model"],
                    input_tokens=data.get("input_tokens"),
                    output_tokens=data.get("output_tokens"),
                )
    except psycopg.Error:
        # Cache miss on error - don't block on cache failures
        return None


def cache_response(
    prompt: str,
    system: str | None,
    provider: str,
    response: LLMResponse,
    db_url: str | None = None,
) -> None:
    """Store an LLM response in the cache.

    Args:
        prompt: User prompt
        system: System prompt
        provider: Provider name
        response: LLMResponse to cache
        db_url: Database URL (default: from config)
    """
    from ..config import config

    if db_url is None:
        db_url = config.db_url

    model = config.llm_model or "default"
    content_hash = _compute_cache_key(prompt, system, model)

    # Serialize response
    data = {
        "content": response.content,
        "model": response.model,
        "input_tokens": response.input_tokens,
        "output_tokens": response.output_tokens,
    }

    try:
        with psycopg.connect(db_url) as conn:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    INSERT INTO llm_cache (content_hash, provider, model, response)
                    VALUES (%s, %s, %s, %s)
                    ON CONFLICT (content_hash) DO UPDATE SET
                        response = EXCLUDED.response,
                        created_at = NOW()
                    """,
                    (content_hash, provider, model, json.dumps(data)),
                )
            conn.commit()
    except psycopg.Error:
        # Don't fail on cache write errors
        pass


def clear_cache(
    older_than: datetime | None = None,
    db_url: str | None = None,
) -> int:
    """Clear cached LLM responses.

    Args:
        older_than: Only clear entries older than this datetime.
            If None, clears all entries.
        db_url: Database URL (default: from config)

    Returns:
        Number of entries deleted
    """
    from ..config import config

    if db_url is None:
        db_url = config.db_url

    try:
        with psycopg.connect(db_url) as conn:
            with conn.cursor() as cur:
                if older_than:
                    cur.execute(
                        "DELETE FROM llm_cache WHERE created_at < %s",
                        (older_than,),
                    )
                else:
                    cur.execute("DELETE FROM llm_cache")
                deleted = cur.rowcount
            conn.commit()
            return deleted
    except psycopg.Error:
        return 0


def get_cache_stats(db_url: str | None = None) -> dict:
    """Get statistics about the LLM cache.

    Args:
        db_url: Database URL (default: from config)

    Returns:
        Dict with cache statistics
    """
    from ..config import config

    if db_url is None:
        db_url = config.db_url

    try:
        with psycopg.connect(db_url) as conn:
            with conn.cursor() as cur:
                # Total entries
                cur.execute("SELECT COUNT(*) FROM llm_cache")
                total = cur.fetchone()[0]

                # Entries by provider/model
                cur.execute(
                    """
                    SELECT provider, model, COUNT(*) as count
                    FROM llm_cache
                    GROUP BY provider, model
                    ORDER BY count DESC
                    """
                )
                by_provider = [
                    {"provider": r[0], "model": r[1], "count": r[2]} for r in cur.fetchall()
                ]

                # Oldest entry
                cur.execute("SELECT MIN(created_at) FROM llm_cache")
                oldest = cur.fetchone()[0]

                return {
                    "total_entries": total,
                    "by_provider": by_provider,
                    "oldest_entry": oldest.isoformat() if oldest else None,
                }
    except psycopg.Error:
        return {"total_entries": 0, "by_provider": [], "oldest_entry": None}

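The cache is keyed on a SHA-256 of prompt, system prompt, and model, and every database failure degrades to a cache miss rather than an error. A maintenance sketch (hypothetical: the DSN is a placeholder, the model names are invented, and it assumes the llm_cache table from migration 0006.llm-cache.sql exists):

# Hypothetical maintenance sketch; the DSN below is a placeholder and the
# model names are invented. Requires psycopg and a reachable Postgres.
from datetime import datetime, timedelta

from okb.llm.cache import _compute_cache_key, clear_cache, get_cache_stats

# The key is a SHA-256 over prompt, system prompt, and model, so changing
# any of the three produces a distinct cache entry.
key_a = _compute_cache_key("Classify this email", "Be concise", "model-a")
key_b = _compute_cache_key("Classify this email", "Be concise", "model-b")
assert key_a != key_b

db = "postgresql://localhost/okb"  # placeholder DSN
stats = get_cache_stats(db_url=db)
print(stats["total_entries"], stats["by_provider"])

# Evict entries older than 30 days; returns the number of rows deleted
# (0 if the database or llm_cache table is unavailable).
removed = clear_cache(older_than=datetime.now() - timedelta(days=30), db_url=db)
print(f"evicted {removed} cached responses")
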
okb/llm/filter.py
ADDED
@@ -0,0 +1,187 @@
"""LLM-based document filtering for pre-ingest classification."""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from ..ingest import Document


class FilterAction(Enum):
    """Actions the filter can take on a document."""

    INGEST = "ingest"  # Process normally
    SKIP = "skip"  # Don't ingest
    REVIEW = "review"  # Flag for manual review (still ingest)


@dataclass
class FilterResult:
    """Result of LLM filtering on a document."""

    action: FilterAction
    reason: str
    confidence: float | None = None  # Optional confidence score 0-1

    @property
    def should_ingest(self) -> bool:
        """Whether the document should be ingested."""
        return self.action in (FilterAction.INGEST, FilterAction.REVIEW)


DEFAULT_SYSTEM_PROMPT = """\
You are a document classifier. Analyze the document and decide whether it should
be ingested into a knowledge base.

Respond with a JSON object containing:
- "action": one of "ingest", "skip", or "review"
- "reason": brief explanation (1 sentence)

Use these guidelines:
- "ingest": valuable content worth indexing (notes, docs, important emails, etc.)
- "skip": low-value content (spam, marketing, automated notifications, duplicates)
- "review": uncertain cases that need human review

Respond ONLY with the JSON object, no other text."""


def _build_filter_prompt(document: Document, custom_prompt: str | None = None) -> str:
    """Build the prompt for document filtering.

    Args:
        document: Document to classify
        custom_prompt: Optional custom instructions to append

    Returns:
        Formatted prompt string
    """
    parts = [
        f"Title: {document.title}",
        f"Source: {document.source_type}",
    ]

    if document.metadata and document.metadata.tags:
        parts.append(f"Tags: {', '.join(document.metadata.tags)}")

    # Include content preview (truncate if too long)
    content_preview = document.content[:2000]
    if len(document.content) > 2000:
        content_preview += "\n[... truncated ...]"

    parts.append(f"\nContent:\n{content_preview}")

    if custom_prompt:
        parts.append(f"\nAdditional instructions: {custom_prompt}")

    return "\n".join(parts)


def _parse_filter_response(response: str) -> FilterResult:
    """Parse the LLM response into a FilterResult.

    Args:
        response: Raw LLM response text

    Returns:
        Parsed FilterResult

    Raises:
        ValueError: If response cannot be parsed
    """
    # Try to extract JSON from response
    # Handle cases where LLM wraps in markdown code blocks
    json_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
    if json_match:
        json_str = json_match.group(1)
    else:
        # Try to find raw JSON object
        json_match = re.search(r"\{[^{}]*\}", response, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
        else:
            # Fallback: assume entire response is JSON
            json_str = response.strip()

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse filter response as JSON: {e}")

    action_str = data.get("action", "").lower()
    try:
        action = FilterAction(action_str)
    except ValueError:
        # Default to ingest if action is invalid
        action = FilterAction.INGEST

    reason = data.get("reason", "No reason provided")
    confidence = data.get("confidence")

    return FilterResult(action=action, reason=reason, confidence=confidence)


def filter_document(
    document: Document,
    custom_prompt: str | None = None,
    use_cache: bool = True,
) -> FilterResult:
    """Filter a single document using the configured LLM.

    Args:
        document: Document to filter
        custom_prompt: Optional custom classification instructions
        use_cache: Whether to use cached responses

    Returns:
        FilterResult with action and reason

    Raises:
        RuntimeError: If LLM is not configured
    """
    from . import complete

    prompt = _build_filter_prompt(document, custom_prompt)
    response = complete(prompt, system=DEFAULT_SYSTEM_PROMPT, use_cache=use_cache)

    if response is None:
        # LLM not configured - default to ingest
        return FilterResult(
            action=FilterAction.INGEST,
            reason="LLM not configured, defaulting to ingest",
        )

    try:
        return _parse_filter_response(response.content)
    except ValueError as e:
        # Parse error - default to ingest with warning
        return FilterResult(
            action=FilterAction.INGEST,
            reason=f"Failed to parse LLM response: {e}",
        )


def filter_documents(
    documents: list[Document],
    custom_prompt: str | None = None,
    use_cache: bool = True,
) -> list[tuple[Document, FilterResult]]:
    """Filter multiple documents.

    Args:
        documents: List of documents to filter
        custom_prompt: Optional custom classification instructions
        use_cache: Whether to use cached responses

    Returns:
        List of (document, filter_result) tuples
    """
    results = []
    for doc in documents:
        result = filter_document(doc, custom_prompt=custom_prompt, use_cache=use_cache)
        results.append((doc, result))
    return results

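The response parser is deliberately forgiving: fenced or bare JSON is accepted, and an unknown action falls back to ingesting the document. A standalone sketch of that behaviour (the response strings are invented examples of provider output; no LLM call or database is needed):

# Parser-behaviour sketch; the response strings are invented examples of
# what a provider might return. No LLM calls or database needed.
from okb.llm.filter import FilterAction, _parse_filter_response

# Bare JSON response.
r1 = _parse_filter_response('{"action": "skip", "reason": "marketing newsletter"}')
assert r1.action is FilterAction.SKIP and not r1.should_ingest

# JSON wrapped in a markdown code fence is unwrapped first.
r2 = _parse_filter_response(
    '```json\n{"action": "review", "reason": "ambiguous sender", "confidence": 0.4}\n```'
)
assert r2.action is FilterAction.REVIEW and r2.should_ingest
assert r2.confidence == 0.4

# An unrecognised action falls back to ingesting rather than dropping the document.
r3 = _parse_filter_response('{"action": "archive", "reason": "unknown label"}')
assert r3.action is FilterAction.INGEST

filter_document() wraps this parser and likewise defaults to FilterAction.INGEST when the LLM is disabled or its output cannot be parsed at all.
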