okb 1.1.0a0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages' contents as they appear in their respective public registries.
@@ -0,0 +1,369 @@
+"""Entity extraction from document content using LLM."""
+
+from __future__ import annotations
+
+import json
+import re
+
+from .base import ExtractedEntity
+
+ENTITY_SYSTEM_PROMPT = """\
+You are an expert at identifying named entities in text for a PERSONAL knowledge base.
+Extract only entities that are specific to the author's context - things an LLM wouldn't know about.
+
+Entity types to extract:
+- person: People the author knows, works with, or references (colleagues, contacts, clients)
+- project: Specific named projects/products/codebases (e.g., "Acme Dashboard", "customer-portal")
+  NOT git branches, environments, or workflow stages
+- technology: ONLY obscure/niche tools or internal systems - NOT well-known technologies
+- organization: Specific companies, teams, clients the author works with
+
+DO NOT extract:
+- Well-known technologies: JSON, HTTP, SQL, Python, JavaScript, Docker, AWS, PostgreSQL, React, etc.
+  (The LLM already knows these - they add no value to a personal knowledge base)
+- Code symbols: function names, method calls, variables, class names
+- Generic terms: "user", "data", "system", "database", "API", "server", "client"
+- Git branches/workflow terms: main, master, develop, release, staging, production, feature, hotfix
+- Generic process terms: deploy, build, test, migration, setup, config
+- Environment names: dev, prod, qa, uat, local
+- Issue or bug descriptions - those are documents, not entities
+- Famous people, major companies (Google, Microsoft, etc.) unless contextually relevant to author
+
+ONLY extract entities that would help answer "Who/what is X?" where X is specific to this person.
+
+For each entity found, extract:
+- name: The canonical name (proper noun)
+- entity_type: One of: person, project, technology, organization
+- aliases: Other names/abbreviations (optional)
+- description: Brief description based on context (optional)
+- mentions: Text snippets where entity appears (max 3)
+- confidence: How confident you are (0.0-1.0)
+
+Return JSON array. Return empty array [] if no context-specific entities found.
+"""
+
+ENTITY_USER_PROMPT = """\
+Document title: {title}
+Source type: {source_type}
+
+Content:
+{content}
+
+Extract named entities as JSON array.
+"""
+
+
+def extract_entities(
+    content: str,
+    title: str,
+    source_type: str | None = None,
+    min_confidence: float = 0.8,
+) -> list[ExtractedEntity]:
+    """Extract entities from document content using LLM.
+
+    Args:
+        content: Document content to analyze
+        title: Document title for context
+        source_type: Type of document (optional)
+        min_confidence: Minimum confidence threshold (0-1)
+
+    Returns:
+        List of extracted entities
+    """
+    from .. import complete
+
+    # Truncate content if too long
+    if len(content) > 20000:
+        content = content[:20000] + "\n\n[... content truncated ...]"
+
+    prompt = ENTITY_USER_PROMPT.format(
+        title=title,
+        source_type=source_type or "unknown",
+        content=content,
+    )
+
+    response = complete(
+        prompt=prompt,
+        system=ENTITY_SYSTEM_PROMPT,
+        max_tokens=2048,
+        use_cache=True,
+    )
+
+    if response is None:
+        return []
+
+    return _parse_entity_response(response.content, min_confidence, title)
+
+
+def _looks_like_code(name: str) -> bool:
+    """Check if entity name looks like code."""
+    # Contains parentheses (function calls)
+    if "(" in name or ")" in name:
+        return True
+    # snake_case (underscores, all lowercase - likely variable/function)
+    if "_" in name and name.islower():
+        return True
+    # Starts with lowercase and contains dots (method chain)
+    if name and name[0].islower() and "." in name:
+        return True
+    # camelCase (starts lowercase, contains uppercase - likely variable/method)
+    if name and name[0].islower() and any(c.isupper() for c in name):
+        return True
+    return False
+
+
+# Well-known technologies that add no value to a personal knowledge base
+COMMON_TECHNOLOGIES = frozenset(
+    s.lower()
+    for s in [
+        # Data formats
+        "JSON",
+        "XML",
+        "YAML",
+        "CSV",
+        "HTML",
+        "CSS",
+        "Markdown",
+        # Protocols
+        "HTTP",
+        "HTTPS",
+        "REST",
+        "GraphQL",
+        "WebSocket",
+        "TCP",
+        "UDP",
+        "SSH",
+        "FTP",
+        "SMTP",
+        # Languages
+        "Python",
+        "JavaScript",
+        "TypeScript",
+        "Java",
+        "Go",
+        "Rust",
+        "C",
+        "C++",
+        "Ruby",
+        "PHP",
+        "Swift",
+        "Kotlin",
+        "Scala",
+        "Bash",
+        "Shell",
+        "SQL",
+        "Lua",
+        # Major frameworks/tools
+        "React",
+        "Vue",
+        "Angular",
+        "Node.js",
+        "Django",
+        "Flask",
+        "FastAPI",
+        "Rails",
+        "Spring",
+        "Express",
+        "Next.js",
+        # Databases
+        "PostgreSQL",
+        "MySQL",
+        "MongoDB",
+        "Redis",
+        "SQLite",
+        "Elasticsearch",
+        "DynamoDB",
+        # Cloud/infra
+        "AWS",
+        "Azure",
+        "GCP",
+        "Docker",
+        "Kubernetes",
+        "Linux",
+        "Windows",
+        "macOS",
+        "Nginx",
+        "Apache",
+        # Tools
+        "Git",
+        "GitHub",
+        "GitLab",
+        "npm",
+        "pip",
+        "Webpack",
+        "VS Code",
+        "Vim",
+        "Emacs",
+    ]
+)
+
+# Generic git/workflow/environment terms that are not context-specific
+GENERIC_TERMS = frozenset(
+    s.lower()
+    for s in [
+        # Git branches
+        "main",
+        "master",
+        "develop",
+        "development",
+        "release",
+        "staging",
+        "production",
+        "feature",
+        "hotfix",
+        "bugfix",
+        # Environments
+        "dev",
+        "prod",
+        "test",
+        "qa",
+        "uat",
+        "local",
+        "sandbox",
+        # Workflow/process terms
+        "deploy",
+        "build",
+        "migration",
+        "setup",
+        "config",
+        "configuration",
+        "rollback",
+        "rollout",
+        # Generic architectural terms
+        "frontend",
+        "backend",
+        "api",
+        "service",
+        "server",
+        "client",
+        "app",
+        "application",
+        "module",
+        "component",
+        "library",
+        "package",
+        "plugin",
+        "extension",
+        # Generic data terms
+        "database",
+        "cache",
+        "queue",
+        "worker",
+        "scheduler",
+        "cron",
+    ]
+)
+
+
+def _parse_entity_response(
+    response_text: str, min_confidence: float, title: str = ""
+) -> list[ExtractedEntity]:
+    """Parse LLM response into ExtractedEntity objects."""
+    # Try to extract JSON from response
+    json_match = re.search(r"\[.*\]", response_text, re.DOTALL)
+    if not json_match:
+        return []
+
+    try:
+        entities_data = json.loads(json_match.group())
+    except json.JSONDecodeError:
+        return []
+
+    if not isinstance(entities_data, list):
+        return []
+
+    valid_types = {"person", "project", "technology", "organization"}
+    entities = []
+
+    for item in entities_data:
+        if not isinstance(item, dict):
+            continue
+
+        name = item.get("name")
+        entity_type = item.get("entity_type")
+
+        # Validate type before calling str methods (LLM output may be non-string)
+        if not isinstance(name, str) or not name.strip():
+            continue
+        name = name.strip()
+        if not entity_type or entity_type not in valid_types:
+            continue
+
+        # Filter: too short or too long
+        if len(name) < 3 or len(name) > 80:
+            continue
+
+        # Filter: looks like code
+        if _looks_like_code(name):
+            continue
+
+        # Filter: well-known technologies (LLM already knows these)
+        if name.lower() in COMMON_TECHNOLOGIES:
+            continue
+
+        # Filter: generic git/workflow/environment terms
+        if name.lower() in GENERIC_TERMS:
+            continue
+
+        # Filter: matches document title (source shouldn't be extracted as entity)
+        if title and name.lower() == title.lower():
+            continue
+
+        # Get confidence (default to 0.85 if not specified)
+        confidence = item.get("confidence", 0.85)
+        if not isinstance(confidence, int | float):
+            confidence = 0.85
+
+        if confidence < min_confidence:
+            continue
+
+        # Parse aliases
+        aliases = item.get("aliases", [])
+        if not isinstance(aliases, list):
+            aliases = []
+        aliases = [a for a in aliases if isinstance(a, str)]
+
+        # Parse mentions
+        mentions = item.get("mentions", [])
+        if not isinstance(mentions, list):
+            mentions = []
+        mentions = [m for m in mentions if isinstance(m, str)][:3]
+
+        entities.append(
+            ExtractedEntity(
+                name=name,
+                entity_type=entity_type,
+                aliases=aliases,
+                description=item.get("description"),
+                mentions=mentions,
+                confidence=float(confidence),
+            )
+        )
+
+    return entities
+
+
+def normalize_entity_name(name: str) -> str:
+    """Normalize entity name for deduplication and URL generation.
+
+    Examples:
+        "John Smith" -> "john-smith"
+        "AWS (Amazon Web Services)" -> "aws-amazon-web-services"
+        "React.js" -> "react-js"
+    """
+    # Lowercase
+    normalized = name.lower()
+    # Replace non-alphanumeric with spaces
+    normalized = re.sub(r"[^a-z0-9\s]", " ", normalized)
+    # Collapse whitespace and replace with hyphens
+    normalized = re.sub(r"\s+", "-", normalized.strip())
+    # Remove leading/trailing hyphens
+    normalized = normalized.strip("-")
+    return normalized
+
+
+def entity_source_path(entity_type: str, name: str) -> str:
+    """Generate source_path for an entity document.
+
+    Format: okb://entity/{type}/{normalized-name}
+    """
+    normalized = normalize_entity_name(name)
+    return f"okb://entity/{entity_type}/{normalized}"
@@ -0,0 +1,149 @@
+"""TODO extraction from document content using LLM."""
+
+from __future__ import annotations
+
+import json
+import re
+from datetime import UTC, datetime
+
+from .base import ExtractedTodo
+
+TODO_SYSTEM_PROMPT = """\
+You are an expert at identifying action items and tasks in text.
+Extract TODO items from the given document content.
+
+Look for:
+- Explicit markers: TODO, FIXME, HACK, XXX, ACTION
+- Action phrases: "need to", "should", "must", "have to", "action item"
+- Deadlines and commitments: "by Friday", "before the meeting", "this week"
+- Questions implying needed work: "What about X?", "How do we handle Y?"
+- Incomplete items marked for follow-up
+
+For each TODO found, extract:
+- title: A concise description of the task (imperative form: "Fix the bug", not "The bug needs fixing")
+- content: Additional context or details (optional)
+- due_date: If a deadline is mentioned, in ISO format YYYY-MM-DD (optional)
+- priority: 1=urgent, 2=high, 3=normal, 4=low, 5=someday (optional)
+- assignee: Person responsible if mentioned (optional)
+- source_context: The exact text snippet where this TODO was found
+
+Return JSON array of extracted TODOs. Return empty array [] if none found.
+Be conservative - only extract clear action items, not vague mentions.
+"""
+
+TODO_USER_PROMPT = """\
+Document title: {title}
+Source type: {source_type}
+
+Content:
+{content}
+
+Extract all TODO items from this content as JSON array.
+"""
+
+
+def extract_todos(
+    content: str,
+    title: str,
+    source_type: str,
+    min_confidence: float = 0.7,
+) -> list[ExtractedTodo]:
+    """Extract TODO items from document content using LLM.
+
+    Args:
+        content: Document content to analyze
+        title: Document title for context
+        source_type: Type of document (markdown, code, org, etc.)
+        min_confidence: Minimum confidence threshold (0-1)
+
+    Returns:
+        List of extracted TODO items
+    """
+    from .. import complete
+
+    # Truncate content if too long (keep first ~20k chars for context)
+    if len(content) > 20000:
+        content = content[:20000] + "\n\n[... content truncated ...]"
+
+    prompt = TODO_USER_PROMPT.format(
+        title=title,
+        source_type=source_type,
+        content=content,
+    )
+
+    response = complete(
+        prompt=prompt,
+        system=TODO_SYSTEM_PROMPT,
+        max_tokens=2048,
+        use_cache=True,
+    )
+
+    if response is None:
+        return []
+
+    return _parse_todo_response(response.content, min_confidence)
+
+
+def _parse_todo_response(response_text: str, min_confidence: float) -> list[ExtractedTodo]:
+    """Parse LLM response into ExtractedTodo objects."""
+    # Try to extract JSON from response
+    json_match = re.search(r"\[.*\]", response_text, re.DOTALL)
+    if not json_match:
+        return []
+
+    try:
+        todos_data = json.loads(json_match.group())
+    except json.JSONDecodeError:
+        return []
+
+    if not isinstance(todos_data, list):
+        return []
+
+    todos = []
+    for item in todos_data:
+        if not isinstance(item, dict):
+            continue
+
+        title = item.get("title")
+        if not title or not isinstance(title, str):
+            continue
+
+        # Parse due_date if present
+        due_date = None
+        if due_str := item.get("due_date"):
+            try:
+                due_date = datetime.fromisoformat(due_str).replace(tzinfo=UTC)
+            except (ValueError, TypeError):
+                pass
+
+        # Parse priority
+        priority = None
+        if p := item.get("priority"):
+            try:
+                priority = int(p)
+                if priority < 1 or priority > 5:
+                    priority = None
+            except (ValueError, TypeError):
+                pass
+
+        # Get confidence (default to 0.8 if not specified)
+        confidence = item.get("confidence", 0.8)
+        if not isinstance(confidence, (int, float)):
+            confidence = 0.8
+
+        if confidence < min_confidence:
+            continue
+
+        todos.append(
+            ExtractedTodo(
+                title=title.strip(),
+                content=item.get("content"),
+                due_date=due_date,
+                priority=priority,
+                assignee=item.get("assignee"),
+                confidence=float(confidence),
+                source_context=item.get("source_context"),
+            )
+        )
+
+    return todos
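
The TODO parser is deliberately lenient about framing: re.search(r"\[.*\]", ..., re.DOTALL) pulls the JSON array out of any surrounding prose, and malformed fields (bad dates, out-of-range priorities, non-numeric confidence) degrade to None or defaults instead of raising. A sketch of that behavior against a handcrafted reply (no LLM call involved; _parse_todo_response is private, so this is illustration rather than supported API):

    # Hypothetical raw LLM reply - prose around the JSON array is tolerated.
    raw = """Here is what I found:
    [{"title": "Send Q3 report to Dana", "due_date": "2025-07-04",
      "priority": 2, "confidence": 0.9,
      "source_context": "we need to send the Q3 report by Friday"}]"""

    todos = _parse_todo_response(raw, min_confidence=0.7)
    assert todos[0].title == "Send Q3 report to Dana"
    assert todos[0].priority == 2
    assert todos[0].due_date.tzinfo is not None  # parsed and pinned to UTC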
okb/llm/providers.py CHANGED
@@ -165,13 +165,13 @@ class ClaudeProvider:
 
 
 class ModalProvider:
-    """Modal-based LLM provider using open models (Llama, Mistral, etc.).
+    """Modal-based LLM provider using open models (Phi-3, Llama, Mistral, etc.).
 
     Runs on Modal GPU infrastructure - no API key needed, pay per compute.
-    Requires deploying the Modal app first: `modal deploy lkb/modal_llm.py`
+    Requires deploying the Modal app first: `okb llm deploy`
 
     Config:
-        model: Model name (default: meta-llama/Llama-3.2-3B-Instruct)
+        model: Model name (default: microsoft/Phi-3-mini-4k-instruct)
         timeout: Request timeout in seconds (default: 60)
     """
 
@@ -179,7 +179,7 @@ class ModalProvider:
 
     def __init__(self) -> None:
         self._llm = None
-        self._model: str = "meta-llama/Llama-3.2-3B-Instruct"
+        self._model: str = "microsoft/Phi-3-mini-4k-instruct"
         self._timeout: int = 60
 
     def configure(self, config: dict) -> None:
@@ -202,7 +202,7 @@ class ModalProvider:
             self._llm = modal.Cls.from_name("knowledge-llm", "LLM")()
         except modal.exception.NotFoundError:
             raise RuntimeError(
-                "Modal LLM app not deployed. Deploy with: modal deploy lkb/modal_llm.py"
+                "Modal LLM app not deployed. Deploy with: okb llm deploy"
             )
 
     def complete(
@@ -244,9 +244,12 @@ class ModalProvider:
     def list_models(self) -> list[str]:
         """List recommended models for Modal."""
        return [
+            # Non-gated (work immediately)
+            "microsoft/Phi-3-mini-4k-instruct",
+            "Qwen/Qwen2-1.5B-Instruct",
+            # Gated (require HuggingFace approval + HF_TOKEN)
             "meta-llama/Llama-3.2-3B-Instruct",
             "meta-llama/Llama-3.2-1B-Instruct",
-            "mistralai/Mistral-7B-Instruct-v0.3",
         ]
 
 
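
The default-model change matters operationally: the previous Llama 3.2 default is gated on Hugging Face, so a fresh `okb llm deploy` would fail to fetch weights until access is approved and HF_TOKEN is available, whereas Phi-3-mini works immediately, per the comments in list_models() above. A configuration sketch based on the docstring (how okb loads and passes the config dict is outside this diff):

    # Sketch only - config plumbing is not shown in this diff.
    provider = ModalProvider()
    provider.configure({
        "model": "microsoft/Phi-3-mini-4k-instruct",  # non-gated default
        "timeout": 120,  # seconds; the documented default is 60
    })
    # For a gated model, request access on Hugging Face first and expose
    # HF_TOKEN to the deployed Modal app:
    # provider.configure({"model": "meta-llama/Llama-3.2-3B-Instruct"})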