okb-1.0.0-py3-none-any.whl → okb-1.1.0-py3-none-any.whl
- okb/cli.py +1209 -16
- okb/config.py +122 -4
- okb/http_server.py +208 -2
- okb/llm/analyze.py +524 -0
- okb/llm/consolidate.py +685 -0
- okb/llm/enrich.py +723 -0
- okb/llm/extractors/__init__.py +13 -0
- okb/llm/extractors/base.py +44 -0
- okb/llm/extractors/cross_doc.py +478 -0
- okb/llm/extractors/dedup.py +499 -0
- okb/llm/extractors/entity.py +369 -0
- okb/llm/extractors/todo.py +149 -0
- okb/llm/providers.py +9 -6
- okb/mcp_server.py +1279 -12
- okb/migrations/0008.enrichment.sql +46 -0
- okb/migrations/0009.entity-consolidation.sql +120 -0
- okb/migrations/0010.token-id.sql +7 -0
- okb/modal_llm.py +26 -8
- okb/plugins/sources/__init__.py +2 -1
- okb/plugins/sources/dropbox_paper.py +44 -9
- okb/plugins/sources/github.py +5 -5
- okb/plugins/sources/todoist.py +254 -0
- okb/tokens.py +25 -3
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/METADATA +119 -68
- okb-1.1.0.dist-info/RECORD +49 -0
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/entry_points.txt +1 -0
- okb-1.0.0.dist-info/RECORD +0 -36
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/WHEEL +0 -0
okb/migrations/0008.enrichment.sql
ADDED
@@ -0,0 +1,46 @@
+-- LLM enrichment for document annotation (TODOs and entities)
+-- depends: 0006.llm-cache
+
+-- Track enrichment state on documents
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS enriched_at TIMESTAMPTZ;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS enrichment_version INTEGER;
+
+-- Index for "needs enrichment" queries
+CREATE INDEX IF NOT EXISTS idx_documents_needs_enrichment
+    ON documents(enriched_at) WHERE enriched_at IS NULL;
+
+-- Pending entity suggestions (before approval)
+CREATE TABLE IF NOT EXISTS pending_entities (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    source_document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    entity_name TEXT NOT NULL,
+    entity_type TEXT NOT NULL, -- person, project, technology, concept, organization
+    aliases JSONB DEFAULT '[]',
+    description TEXT,
+    mentions JSONB DEFAULT '[]', -- Context snippets from source document
+    confidence REAL,
+    status TEXT DEFAULT 'pending', -- pending, approved, rejected
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    reviewed_at TIMESTAMPTZ
+);
+
+CREATE INDEX IF NOT EXISTS idx_pending_entities_status ON pending_entities(status);
+CREATE INDEX IF NOT EXISTS idx_pending_entities_source ON pending_entities(source_document_id);
+CREATE INDEX IF NOT EXISTS idx_pending_entities_type ON pending_entities(entity_type);
+
+-- Entity references (links entity documents to source documents)
+-- When an entity is approved, it becomes a document with source_path like okb://entity/person/john-smith
+-- This table tracks which documents mention each entity
+CREATE TABLE IF NOT EXISTS entity_refs (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    entity_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    mention_text TEXT NOT NULL,
+    context TEXT, -- Surrounding text for context
+    confidence REAL,
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    UNIQUE(entity_id, document_id, mention_text)
+);
+
+CREATE INDEX IF NOT EXISTS idx_entity_refs_entity ON entity_refs(entity_id);
+CREATE INDEX IF NOT EXISTS idx_entity_refs_document ON entity_refs(document_id);
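Taken together, this migration defines a review queue: documents are stamped with enriched_at once processed, and extracted entities sit in pending_entities until approved. A minimal sketch of a worker loop over this schema, assuming psycopg 3; the DSN, the extractor step, and the sample entity values are illustrative, not okb's actual code:

    import psycopg

    ENRICHMENT_VERSION = 1  # hypothetical version constant

    with psycopg.connect("dbname=okb") as conn:
        # The partial index idx_documents_needs_enrichment keeps this scan cheap.
        rows = conn.execute(
            "SELECT id FROM documents WHERE enriched_at IS NULL LIMIT 100"
        ).fetchall()
        for (doc_id,) in rows:
            # ... run the LLM entity extractor on the document here ...
            conn.execute(
                "INSERT INTO pending_entities"
                " (source_document_id, entity_name, entity_type, confidence)"
                " VALUES (%s, %s, %s, %s)",
                (doc_id, "John Smith", "person", 0.9),
            )
            conn.execute(
                "UPDATE documents SET enriched_at = NOW(), enrichment_version = %s"
                " WHERE id = %s",
                (ENRICHMENT_VERSION, doc_id),
            )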
okb/migrations/0009.entity-consolidation.sql
ADDED
@@ -0,0 +1,120 @@
+-- Entity consolidation: deduplication, cross-doc detection, clustering, relationships
+-- depends: 0008.enrichment
+
+-- Canonical mappings: alias text -> entity document
+-- Used for deduplication and alias resolution
+CREATE TABLE IF NOT EXISTS entity_aliases (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    alias_text TEXT NOT NULL,
+    entity_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    confidence REAL, -- How confident we are this alias belongs to entity
+    source TEXT DEFAULT 'manual', -- 'manual', 'merge', 'extraction'
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    UNIQUE(alias_text, entity_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_entity_aliases_text ON entity_aliases(LOWER(alias_text));
+CREATE INDEX IF NOT EXISTS idx_entity_aliases_entity ON entity_aliases(entity_id);
+
+-- Proposed entity merges awaiting user confirmation
+CREATE TABLE IF NOT EXISTS pending_entity_merges (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    canonical_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    duplicate_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    confidence REAL NOT NULL, -- How confident we are these are the same
+    reason TEXT, -- Why we think they're the same ("embedding_similarity", "alias_match", "llm")
+    detected_at TIMESTAMPTZ DEFAULT NOW(),
+    status TEXT DEFAULT 'pending', -- 'pending', 'approved', 'rejected'
+    reviewed_at TIMESTAMPTZ,
+    UNIQUE(canonical_id, duplicate_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_pending_merges_status ON pending_entity_merges(status);
+CREATE INDEX IF NOT EXISTS idx_pending_merges_confidence ON pending_entity_merges(confidence DESC);
+
+-- Entity-to-entity relationships
+CREATE TABLE IF NOT EXISTS entity_relationships (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    source_entity_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    target_entity_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    relationship_type TEXT NOT NULL, -- 'works_for', 'uses', 'belongs_to', 'related_to'
+    confidence REAL,
+    source TEXT DEFAULT 'extraction', -- 'extraction', 'manual'
+    context TEXT, -- Supporting context for the relationship
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    UNIQUE(source_entity_id, target_entity_id, relationship_type)
+);
+
+CREATE INDEX IF NOT EXISTS idx_entity_rel_source ON entity_relationships(source_entity_id);
+CREATE INDEX IF NOT EXISTS idx_entity_rel_target ON entity_relationships(target_entity_id);
+CREATE INDEX IF NOT EXISTS idx_entity_rel_type ON entity_relationships(relationship_type);
+
+-- Topic clusters group related entities and documents
+CREATE TABLE IF NOT EXISTS topic_clusters (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    name TEXT NOT NULL,
+    description TEXT,
+    centroid vector(768), -- Cluster centroid embedding
+    member_count INTEGER DEFAULT 0,
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_topic_clusters_centroid ON topic_clusters
+    USING hnsw (centroid vector_cosine_ops);
+
+-- Cluster membership: entities and documents can belong to clusters
+CREATE TABLE IF NOT EXISTS topic_cluster_members (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    cluster_id UUID NOT NULL REFERENCES topic_clusters(id) ON DELETE CASCADE,
+    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    distance REAL, -- Distance from cluster centroid
+    is_entity BOOLEAN DEFAULT FALSE, -- True if document is an entity
+    added_at TIMESTAMPTZ DEFAULT NOW(),
+    UNIQUE(cluster_id, document_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_cluster_members_cluster ON topic_cluster_members(cluster_id);
+CREATE INDEX IF NOT EXISTS idx_cluster_members_document ON topic_cluster_members(document_id);
+
+-- Proposed cluster merges awaiting confirmation
+CREATE TABLE IF NOT EXISTS pending_cluster_merges (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    primary_cluster_id UUID NOT NULL REFERENCES topic_clusters(id) ON DELETE CASCADE,
+    secondary_cluster_id UUID NOT NULL REFERENCES topic_clusters(id) ON DELETE CASCADE,
+    similarity REAL NOT NULL, -- How similar the clusters are
+    status TEXT DEFAULT 'pending', -- 'pending', 'approved', 'rejected'
+    detected_at TIMESTAMPTZ DEFAULT NOW(),
+    reviewed_at TIMESTAMPTZ,
+    UNIQUE(primary_cluster_id, secondary_cluster_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_pending_cluster_merges_status ON pending_cluster_merges(status);
+
+-- Cross-document entity candidates: detected mentions not yet extracted as entities
+CREATE TABLE IF NOT EXISTS cross_doc_entity_candidates (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    text TEXT NOT NULL, -- The mention text (normalized)
+    document_ids UUID[] NOT NULL, -- Array of document IDs containing this mention
+    document_count INTEGER NOT NULL, -- Number of documents (for quick filtering)
+    sample_contexts JSONB DEFAULT '[]', -- Sample text contexts where it appears
+    suggested_type TEXT, -- Suggested entity type
+    confidence REAL,
+    status TEXT DEFAULT 'pending', -- 'pending', 'approved', 'rejected', 'exists'
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    reviewed_at TIMESTAMPTZ,
+    UNIQUE(text)
+);
+
+CREATE INDEX IF NOT EXISTS idx_cross_doc_status ON cross_doc_entity_candidates(status);
+CREATE INDEX IF NOT EXISTS idx_cross_doc_count ON cross_doc_entity_candidates(document_count DESC);
+
+-- Track consolidation runs
+CREATE TABLE IF NOT EXISTS consolidation_runs (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    run_type TEXT NOT NULL, -- 'dedup', 'cross_doc', 'cluster', 'relationship', 'full'
+    started_at TIMESTAMPTZ DEFAULT NOW(),
+    completed_at TIMESTAMPTZ,
+    stats JSONB DEFAULT '{}', -- Run statistics
+    error TEXT -- Error message if failed
+);
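The entity_aliases table is what keeps repeated extraction idempotent: before proposing a new entity, a consolidation pass can resolve a mention against known aliases and only fall through to pending_entity_merges or cross_doc_entity_candidates when nothing matches. A sketch of that lookup, again assuming psycopg 3 (the function name is illustrative):

    import psycopg

    def resolve_alias(conn: psycopg.Connection, mention: str):
        """Return the canonical entity document id for a mention, or None."""
        row = conn.execute(
            # idx_entity_aliases_text indexes LOWER(alias_text), so this hits the index
            "SELECT entity_id FROM entity_aliases"
            " WHERE LOWER(alias_text) = LOWER(%s)"
            " ORDER BY confidence DESC NULLS LAST LIMIT 1",
            (mention,),
        ).fetchone()
        return row[0] if row else None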
okb/modal_llm.py
CHANGED
@@ -2,20 +2,34 @@
 Modal-based GPU LLM service for document classification.

 Provides on-demand GPU access for LLM inference using open models.
-Uses Llama 3.2 3B by default - fast and efficient for classification tasks.

 Usage:
+    # Deploy with default model (Phi-3)
     modal deploy modal_llm.py

+    # Deploy with specific model
+    OKB_LLM_MODEL=meta-llama/Llama-3.2-3B-Instruct modal deploy modal_llm.py
+
 Then call from Python:
     llm = modal.Cls.from_name("knowledge-llm", "LLM")()
     response = llm.complete.remote("Classify this document", system="You are a classifier")
 """

+import os
+
 import modal

 app = modal.App("knowledge-llm")

+# Model is set via environment variable at deploy time
+# Default to Phi-3 which doesn't require HuggingFace approval
+DEFAULT_MODEL = "microsoft/Phi-3-mini-4k-instruct"
+MODEL_ID = os.environ.get("OKB_LLM_MODEL", DEFAULT_MODEL)
+
+# GPU type - L4 recommended for speed/cost balance
+DEFAULT_GPU = "L4"
+GPU_TYPE = os.environ.get("OKB_MODAL_GPU", DEFAULT_GPU)
+
 # Container image with transformers and torch
 llm_image = (
     modal.Image.debian_slim(python_version="3.11")
@@ -24,17 +38,19 @@ llm_image = (
         "torch>=2.0.0",
         "accelerate>=0.27.0",
         "bitsandbytes>=0.42.0", # For quantization
+        "hf_transfer", # Fast downloads
     )
-    .env({
+    .env({
+        "HF_HUB_ENABLE_HF_TRANSFER": "1",
+        "OKB_LLM_MODEL": MODEL_ID,
+        "OKB_MODAL_GPU": GPU_TYPE,
+    })
 )

-# Default model - Llama 3.2 3B is fast and good for classification
-DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
-

 @app.cls(
     image=llm_image,
-    gpu=
+    gpu=GPU_TYPE,
     timeout=300,
     scaledown_window=300, # Keep warm for 5 min
     retries=1,
@@ -42,14 +58,16 @@ DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
 class LLM:
     """GPU-accelerated LLM for document classification."""

-    model_id: str = DEFAULT_MODEL
-
     @modal.enter()
     def load_model(self):
         """Load model once when container starts."""
+        import os
+
         import torch
         from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

+        # Read model from environment (set at deploy time)
+        self.model_id = os.environ.get("OKB_LLM_MODEL", "microsoft/Phi-3-mini-4k-instruct")
         print(f"Loading model: {self.model_id}")

         # Use 4-bit quantization for memory efficiency
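The design choice here: MODEL_ID and GPU_TYPE are resolved once at deploy time and baked into the image's env, so switching models means redeploying rather than passing a parameter on each call. Client code stays exactly as in the docstring above; only the prompt text below is invented:

    import modal

    # Deployed earlier with, e.g.:
    #   OKB_LLM_MODEL=meta-llama/Llama-3.2-3B-Instruct modal deploy modal_llm.py
    llm = modal.Cls.from_name("knowledge-llm", "LLM")()
    response = llm.complete.remote(
        "Classify this document: 'Q3 revenue grew 12% quarter over quarter.'",
        system="You are a classifier",
    )
    print(response)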
okb/plugins/sources/dropbox_paper.py
CHANGED
@@ -16,17 +16,27 @@ class DropboxPaperSource:

     Syncs Paper documents as markdown for searchable knowledge base entries.

-    Config example:
+    Config example (refresh token - recommended):
         plugins:
           sources:
             dropbox-paper:
               enabled: true
-
+              app_key: ${DROPBOX_APP_KEY}
+              app_secret: ${DROPBOX_APP_SECRET}
+              refresh_token: ${DROPBOX_REFRESH_TOKEN}
               folders: [/] # Optional: filter to specific folder paths

+    Config example (access token - short-lived):
+        plugins:
+          sources:
+            dropbox-paper:
+              enabled: true
+              token: ${DROPBOX_TOKEN} # Expires after ~4 hours
+
     Usage:
-
-
+        okb sync run dropbox-paper
+        okb sync run dropbox-paper --full          # Ignore incremental state
+        okb sync run dropbox-paper --doc <doc_id>  # Sync specific document
     """

     name = "dropbox-paper"
@@ -38,18 +48,43 @@ class DropboxPaperSource:
         self._doc_ids: list[str] | None = None

     def configure(self, config: dict) -> None:
-        """Initialize Dropbox client with OAuth token.
+        """Initialize Dropbox client with OAuth token or refresh token.
+
+        Supports two authentication modes:
+        1. Access token only (short-lived, will expire):
+           token: <access_token>
+
+        2. Refresh token (recommended, auto-refreshes):
+           app_key: <app_key>
+           app_secret: <app_secret>
+           refresh_token: <refresh_token>

         Args:
-            config: Source configuration containing
+            config: Source configuration containing auth credentials and optional 'folders'/'doc_ids'
         """
         import dropbox

+        app_key = config.get("app_key")
+        app_secret = config.get("app_secret")
+        refresh_token = config.get("refresh_token")
         token = config.get("token")
-        if not token:
-            raise ValueError("dropbox-paper source requires 'token' in config")

-
+        if app_key and app_secret and refresh_token:
+            # Use refresh token - will auto-refresh access tokens
+            self._client = dropbox.Dropbox(
+                app_key=app_key,
+                app_secret=app_secret,
+                oauth2_refresh_token=refresh_token,
+            )
+        elif token:
+            # Legacy: direct access token (will expire)
+            self._client = dropbox.Dropbox(token)
+        else:
+            raise ValueError(
+                "dropbox-paper source requires either 'token' or "
+                "'app_key'/'app_secret'/'refresh_token' in config"
+            )
+
         self._folders = config.get("folders")
         self._doc_ids = config.get("doc_ids") # Specific doc IDs from CLI

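Minting the refresh token in the first place is a one-time OAuth exchange. A sketch using the Dropbox SDK's no-redirect console flow; requesting token_access_type="offline" is what makes Dropbox return a refresh token (the key and secret values are placeholders):

    from dropbox import DropboxOAuth2FlowNoRedirect

    APP_KEY = "your-app-key"        # placeholder
    APP_SECRET = "your-app-secret"  # placeholder

    flow = DropboxOAuth2FlowNoRedirect(
        APP_KEY, consumer_secret=APP_SECRET, token_access_type="offline"
    )
    print("1. Visit:", flow.start())
    code = input("2. Paste the authorization code here: ").strip()
    result = flow.finish(code)
    print("DROPBOX_REFRESH_TOKEN =", result.refresh_token)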
okb/plugins/sources/github.py
CHANGED
@@ -95,11 +95,11 @@ class GitHubSource:
               token: ${GITHUB_TOKEN}

     Usage:
-
-
-
-
-
+        okb sync run github --repo owner/repo           # README + docs/ (default)
+        okb sync run github --repo owner/repo --source  # All source files
+        okb sync run github --repo owner/repo --issues  # Include issues
+        okb sync run github --repo owner/repo --prs     # Include PRs
+        okb sync run github --repo owner/repo --wiki    # Include wiki
     """

     name = "github"
okb/plugins/sources/todoist.py
ADDED
@@ -0,0 +1,254 @@
+"""Todoist API source for syncing tasks into OKB."""
+
+from __future__ import annotations
+
+import sys
+from datetime import UTC, datetime, timedelta
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from okb.ingest import Document
+    from okb.plugins.base import SyncState
+
+
+class TodoistSource:
+    """API source for Todoist tasks.
+
+    Syncs active and optionally completed tasks for semantic search and actionable item queries.
+
+    Config example:
+        plugins:
+          sources:
+            todoist:
+              enabled: true
+              token: ${TODOIST_TOKEN}
+              include_completed: false # Include recently completed tasks
+              completed_days: 30 # Days of completed tasks to sync
+              include_comments: false # Include task comments (expensive)
+              project_filter: [] # Optional: list of project IDs to sync
+
+    Usage:
+        okb sync run todoist
+        okb sync run todoist --full  # Full resync
+    """
+
+    name = "todoist"
+    source_type = "todoist-task"
+
+    def __init__(self) -> None:
+        self._client = None
+        self._include_completed = False
+        self._completed_days = 30
+        self._include_comments = False
+        self._project_filter: list[str] | None = None
+        self._projects: dict[str, str] = {} # id -> name
+
+    def configure(self, config: dict) -> None:
+        """Initialize Todoist client with API token.
+
+        Args:
+            config: Source configuration containing 'token' and optional settings
+        """
+        from todoist_api_python.api import TodoistAPI
+
+        token = config.get("token")
+        if not token:
+            raise ValueError("todoist source requires 'token' in config")
+
+        self._client = TodoistAPI(token)
+        self._include_completed = config.get("include_completed", False)
+        self._completed_days = config.get("completed_days", 30)
+        self._include_comments = config.get("include_comments", False)
+        self._project_filter = config.get("project_filter")
+
+    def fetch(self, state: SyncState | None = None) -> tuple[list[Document], SyncState]:
+        """Fetch tasks from Todoist.
+
+        Active tasks are always fully synced (API has no "modified since" filter).
+        Completed tasks use state.last_sync for incremental fetching.
+
+        Args:
+            state: Previous sync state for incremental updates, or None for full sync
+
+        Returns:
+            Tuple of (list of documents, new sync state)
+        """
+        from okb.plugins.base import SyncState as SyncStateClass
+
+        if self._client is None:
+            raise RuntimeError("Source not configured. Call configure() first.")
+
+        documents: list[Document] = []
+
+        print("Fetching Todoist tasks...", file=sys.stderr)
+
+        # Load projects for name lookup
+        self._load_projects()
+
+        # Fetch active tasks
+        active_docs = self._fetch_active_tasks()
+        documents.extend(active_docs)
+        print(f" Synced {len(active_docs)} active tasks", file=sys.stderr)
+
+        # Fetch completed tasks if enabled
+        if self._include_completed:
+            since = state.last_sync if state and state.last_sync else None
+            completed_docs = self._fetch_completed_tasks(since)
+            documents.extend(completed_docs)
+            print(f" Synced {len(completed_docs)} completed tasks", file=sys.stderr)
+
+        new_state = SyncStateClass(last_sync=datetime.now(UTC))
+        return documents, new_state
+
+    def _load_projects(self) -> None:
+        """Load projects for name lookup."""
+        try:
+            self._projects = {}
+            for project_batch in self._client.get_projects():
+                for p in project_batch:
+                    self._projects[p.id] = p.name
+        except Exception as e:
+            print(f" Warning: Could not load projects: {e}", file=sys.stderr)
+            self._projects = {}
+
+    def _fetch_active_tasks(self) -> list[Document]:
+        """Fetch all active tasks."""
+        documents = []
+
+        for task_batch in self._client.get_tasks():
+            for task in task_batch:
+                # Apply project filter if configured
+                if self._project_filter and task.project_id not in self._project_filter:
+                    continue
+
+                doc = self._task_to_document(task, is_completed=False)
+                if doc:
+                    documents.append(doc)
+
+        return documents
+
+    def _fetch_completed_tasks(self, since: datetime | None) -> list[Document]:
+        """Fetch completed tasks within the configured window."""
+        documents = []
+
+        # Determine date range
+        until = datetime.now(UTC)
+        if since:
+            start = since
+        else:
+            start = until - timedelta(days=self._completed_days)
+
+        try:
+            for task_batch in self._client.get_completed_tasks_by_completion_date(
+                since=start,
+                until=until,
+            ):
+                for task in task_batch:
+                    # Apply project filter if configured
+                    if self._project_filter and task.project_id not in self._project_filter:
+                        continue
+
+                    doc = self._task_to_document(task, is_completed=True)
+                    if doc:
+                        documents.append(doc)
+        except Exception as e:
+            print(f" Warning: Could not fetch completed tasks: {e}", file=sys.stderr)
+
+        return documents
+
+    def _task_to_document(self, task, is_completed: bool) -> Document | None:
+        """Convert a Todoist task to a Document."""
+        from okb.ingest import Document, DocumentMetadata
+
+        # Build content from task content + description + optional comments
+        content_parts = [task.content]
+        if task.description:
+            content_parts.append(task.description)
+
+        if self._include_comments:
+            comments = self._fetch_task_comments(task.id)
+            if comments:
+                content_parts.append("\n## Comments\n" + "\n".join(comments))
+
+        content = "\n\n".join(content_parts)
+
+        # Parse due date
+        due_date = None
+        if task.due:
+            due_date = self._parse_due(task.due)
+
+        # Map priority: Todoist uses 1-4 (4=urgent), OKB uses 1-5 (1=highest)
+        priority = 5 - task.priority if task.priority else None
+
+        # Get project name
+        project_name = self._projects.get(task.project_id)
+
+        # Build metadata
+        metadata = DocumentMetadata(
+            tags=task.labels or [],
+            project=project_name,
+            extra={
+                "todoist_id": task.id,
+                "project_id": task.project_id,
+            },
+        )
+
+        # Determine status
+        status = "completed" if is_completed or task.is_completed else "pending"
+
+        return Document(
+            source_path=f"todoist://task/{task.id}",
+            source_type=self.source_type,
+            title=task.content,
+            content=content,
+            metadata=metadata,
+            due_date=due_date,
+            status=status,
+            priority=priority,
+        )
+
+    def _parse_due(self, due) -> datetime | None:
+        """Parse Todoist Due object to datetime."""
+        if due is None:
+            return None
+
+        try:
+            # Due has 'datetime' (full datetime) or 'date' (date only)
+            if hasattr(due, "datetime") and due.datetime:
+                return datetime.fromisoformat(due.datetime.replace("Z", "+00:00"))
+            elif hasattr(due, "date") and due.date:
+                # Date-only: treat as end of day in UTC
+                if isinstance(due.date, str):
+                    d = datetime.strptime(due.date, "%Y-%m-%d").date()
+                else:
+                    d = due.date
+                return datetime(d.year, d.month, d.day, 23, 59, 59, tzinfo=UTC)
+        except Exception:
+            pass
+        return None
+
+    def _fetch_task_comments(self, task_id: str) -> list[str]:
+        """Fetch comments for a task."""
+        comments = []
+        try:
+            for comment_batch in self._client.get_comments(task_id=task_id):
+                for comment in comment_batch:
+                    comments.append(f"- {comment.content}")
+        except Exception:
+            pass
+        return comments
+
+    def list_projects(self) -> list[tuple[str, str]]:
+        """List all projects with their IDs.
+
+        Returns:
+            List of (project_id, project_name) tuples
+        """
+        if self._client is None:
+            raise RuntimeError("Source not configured. Call configure() first.")
+
+        projects = []
+        for project_batch in self._client.get_projects():
+            for p in project_batch:
+                projects.append((p.id, p.name))
+        return projects