okb 1.1.0__py3-none-any.whl → 1.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
okb/llm/analyze.py DELETED
@@ -1,524 +0,0 @@
- """Database/project-level analysis - aggregate entities and extract themes."""
-
- from __future__ import annotations
-
- import json
- from dataclasses import dataclass
- from datetime import UTC, datetime
- from typing import Any
-
- import psycopg
- from psycopg.rows import dict_row
-
-
- @dataclass
- class AnalysisResult:
-     """Result from database/project analysis."""
-
-     description: str
-     topics: list[str]
-     key_entities: list[dict]  # [{name, type, ref_count, doc_count}]
-     stats: dict
-     sample_count: int
-     analyzed_at: datetime
-     project: str | None = None
-
-
- ANALYSIS_SYSTEM = """You are analyzing a personal knowledge base to understand its contents.
- Based on the provided statistics, entity list, and document samples,
- generate a concise description and topic keywords.
-
- Focus on PURPOSE and THEMES, not statistics.
- Good: "Technical notes on Python web development with Django and FastAPI"
- Bad: "Contains 2500 code files and 63 markdown documents"
-
- Return ONLY valid JSON with this structure:
- {"description": "1-3 sentence description", "topics": ["topic1", "topic2", ...]}"""
-
- ANALYSIS_USER = """## Content Statistics
- {stats}
-
- ## Most Referenced Entities ({entity_count} shown)
- {entity_list}
-
- ## Sample Document Titles and Excerpts ({sample_count} documents)
- {document_samples}
-
- ---
- Analyze this knowledge base and return JSON with "description" and "topics" fields.
- The description should capture the overall purpose and main themes (1-3 sentences).
- Topics should be 5-15 keywords that characterize the content."""
-
-
- def get_entity_summary(
-     db_url: str,
-     project: str | None = None,
-     limit: int = 20,
- ) -> list[dict]:
-     """Get most-mentioned entities with reference counts.
-
-     Returns list of dicts with: name, type, ref_count, doc_count
-     """
-     with psycopg.connect(db_url, row_factory=dict_row) as conn:
-         sql = """
-             SELECT
-                 e.title as name,
-                 e.metadata->>'entity_type' as type,
-                 COUNT(DISTINCT r.id) as ref_count,
-                 COUNT(DISTINCT r.document_id) as doc_count
-             FROM documents e
-             JOIN entity_refs r ON r.entity_id = e.id
-             WHERE e.source_type = 'entity'
-         """
-         params: list[Any] = []
-
-         if project:
-             sql += """
-                 AND r.document_id IN (
-                     SELECT id FROM documents WHERE metadata->>'project' = %s
-                 )
-             """
-             params.append(project)
-
-         sql += """
-             GROUP BY e.id, e.title, e.metadata->>'entity_type'
-             ORDER BY ref_count DESC
-             LIMIT %s
-         """
-         params.append(limit)
-
-         results = conn.execute(sql, params).fetchall()
-         return [dict(r) for r in results]
-
-
- def get_content_stats(db_url: str, project: str | None = None) -> dict:
-     """Get content composition statistics.
-
-     Returns dict with: source_types, projects, total_documents, total_chunks,
-     total_tokens, date_range
-     """
-     with psycopg.connect(db_url, row_factory=dict_row) as conn:
-         # Base filter for project
-         project_filter = ""
-         params: list[Any] = []
-         if project:
-             project_filter = " WHERE metadata->>'project' = %s"
-             params.append(project)
-
-         # Source type breakdown
-         sql = f"""
-             SELECT
-                 source_type,
-                 COUNT(*) as doc_count,
-                 COALESCE(SUM(
-                     (SELECT COALESCE(SUM(token_count), 0) FROM chunks WHERE document_id = d.id)
-                 ), 0) as token_count
-             FROM documents d
-             {project_filter}
-             GROUP BY source_type
-             ORDER BY doc_count DESC
-         """
-         source_types = conn.execute(sql, params).fetchall()
-
-         # Total counts
-         sql = f"""
-             SELECT
-                 COUNT(*) as total_documents,
-                 (SELECT COUNT(*) FROM chunks c
-                  JOIN documents d ON c.document_id = d.id
-                  {project_filter}) as total_chunks
-             FROM documents d
-             {project_filter}
-         """
-         # project_filter appears twice (subquery and outer query), so the
-         # parameter list is doubled when a project filter is active
-         totals = conn.execute(sql, (params + params) if project else params).fetchone()
-
-         # Project list (only if not filtering by project)
-         projects = []
-         if not project:
-             sql = """
-                 SELECT DISTINCT metadata->>'project' as project
-                 FROM documents
-                 WHERE metadata->>'project' IS NOT NULL
-                 ORDER BY project
-             """
-             projects = [r["project"] for r in conn.execute(sql).fetchall()]
-
-         # Date range
-         sql = f"""
-             SELECT
-                 MIN(COALESCE(
-                     (metadata->>'document_date')::date,
-                     (metadata->>'file_modified_at')::date,
-                     created_at::date
-                 )) as earliest,
-                 MAX(COALESCE(
-                     (metadata->>'document_date')::date,
-                     (metadata->>'file_modified_at')::date,
-                     created_at::date
-                 )) as latest
-             FROM documents d
-             {project_filter}
-         """
-         dates = conn.execute(sql, params).fetchone()
-
-         total_tokens = sum(s["token_count"] for s in source_types)
-
-         return {
-             "source_types": {s["source_type"]: s["doc_count"] for s in source_types},
-             "projects": projects,
-             "total_documents": totals["total_documents"],
-             "total_chunks": totals["total_chunks"],
-             "total_tokens": total_tokens,
-             "date_range": {
-                 "earliest": str(dates["earliest"]) if dates["earliest"] else None,
-                 "latest": str(dates["latest"]) if dates["latest"] else None,
-             },
-         }
-
-
- def get_document_samples(
-     db_url: str,
-     project: str | None = None,
-     sample_size: int = 15,
-     strategy: str = "diverse",
- ) -> list[dict]:
-     """Get representative document samples for topic extraction.
-
-     Args:
-         db_url: Database URL
-         project: Filter by project
-         sample_size: Number of documents to sample
-         strategy: Sampling strategy - "diverse" uses embedding distance, "recent", or "random"
-
-     Returns:
-         List of dicts with: title, source_type, excerpt (first ~500 chars)
-     """
-     with psycopg.connect(db_url, row_factory=dict_row) as conn:
-         from pgvector.psycopg import register_vector
-
-         register_vector(conn)
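-         # (register_vector registers pgvector's vector type on this connection,
-         # so embeddings fetched below round-trip into the %s::vector casts)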
-
-         project_filter = ""
-         params: list[Any] = []
-         if project:
-             project_filter = " AND d.metadata->>'project' = %s"
-             params.append(project)
-
-         # Exclude derived documents
-         base_filter = """
-             d.source_path NOT LIKE '%%::todo/%%'
-             AND d.source_path NOT LIKE 'okb://entity/%%'
-             AND d.source_path NOT LIKE 'claude://%%'
-         """
-
-         if strategy == "diverse":
-             # Use embedding-based diverse sampling:
-             # Start with a random document, then iteratively pick documents
-             # that are farthest from the already-selected set
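-             # (i.e. greedy farthest-point sampling; each candidate's distance to
-             # the selected set is computed in SQL, one round-trip per pair)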
-             sql = f"""
-                 WITH first_doc AS (
-                     SELECT d.id, d.title, d.source_type, d.content,
-                         (SELECT embedding FROM chunks WHERE document_id = d.id LIMIT 1) as emb
-                     FROM documents d
-                     WHERE {base_filter} {project_filter}
-                     ORDER BY RANDOM()
-                     LIMIT 1
-                 )
-                 SELECT id, title, source_type,
-                     LEFT(content, 500) as excerpt,
-                     emb
-                 FROM first_doc
-             """
-             results = list(conn.execute(sql, params).fetchall())
-
-             if results:
-                 # Get remaining documents with embeddings
-                 sql = f"""
-                     SELECT d.id, d.title, d.source_type,
-                         LEFT(d.content, 500) as excerpt,
-                         (SELECT embedding FROM chunks WHERE document_id = d.id LIMIT 1) as emb
-                     FROM documents d
-                     WHERE {base_filter} {project_filter}
-                     AND d.id != %s
-                 """
-                 remaining = list(conn.execute(sql, params + [results[0]["id"]]).fetchall())
-
-                 # Iteratively select documents farthest from the selected set,
-                 # seeding with the first doc's embedding (projected by the CTE
-                 # above; it comes back as an array, so compare against None
-                 # rather than relying on truthiness)
-                 selected_embs = []
-                 if results[0].get("emb") is not None:
-                     selected_embs = [results[0]["emb"]]
-
-                 while len(results) < sample_size and remaining:
-                     if not selected_embs:
-                         # No embeddings available, fall back to random
-                         import random
-
-                         next_doc = random.choice(remaining)
-                         remaining.remove(next_doc)
-                     else:
-                         # Find doc with max min-distance from selected
-                         best_doc = None
-                         best_dist = -1
-                         for doc in remaining:
-                             if doc.get("emb") is None:
-                                 continue
-                             # Min cosine distance to any selected doc (pgvector's
-                             # <=> operator already returns cosine distance, so use
-                             # it directly; subtracting from 1 would yield similarity
-                             # and invert the selection)
-                             min_dist = min(
-                                 float(
-                                     conn.execute(
-                                         "SELECT %s::vector <=> %s::vector as dist",
-                                         (doc["emb"], emb),
-                                     ).fetchone()["dist"]
-                                 )
-                                 for emb in selected_embs
-                             )
-                             if min_dist > best_dist:
-                                 best_dist = min_dist
-                                 best_doc = doc
-
-                         if best_doc is None:
-                             break
-                         next_doc = best_doc
-                         remaining.remove(next_doc)
-                     if next_doc.get("emb") is not None:
-                         selected_embs.append(next_doc["emb"])
-
-                     results.append(
-                         {
-                             "id": next_doc["id"],
-                             "title": next_doc["title"],
-                             "source_type": next_doc["source_type"],
-                             "excerpt": next_doc["excerpt"],
-                         }
-                     )
-
-         elif strategy == "recent":
-             sql = f"""
-                 SELECT d.title, d.source_type,
-                     LEFT(d.content, 500) as excerpt
-                 FROM documents d
-                 WHERE {base_filter} {project_filter}
-                 ORDER BY d.updated_at DESC
-                 LIMIT %s
-             """
-             results = conn.execute(sql, params + [sample_size]).fetchall()
-
-         else:  # random
-             sql = f"""
-                 SELECT d.title, d.source_type,
-                     LEFT(d.content, 500) as excerpt
-                 FROM documents d
-                 WHERE {base_filter} {project_filter}
-                 ORDER BY RANDOM()
-                 LIMIT %s
-             """
-             results = conn.execute(sql, params + [sample_size]).fetchall()
-
-         return [
-             {"title": r["title"], "source_type": r["source_type"], "excerpt": r["excerpt"]}
-             for r in results
-         ]
-
-
- def analyze_database(
-     db_url: str,
-     project: str | None = None,
-     sample_size: int = 15,
-     auto_update: bool = True,
- ) -> AnalysisResult:
-     """Run full database/project analysis.
-
-     Args:
-         db_url: Database URL
-         project: Analyze specific project only
-         sample_size: Number of documents to sample
-         auto_update: Update database_metadata with results
-
-     Returns:
-         AnalysisResult with description, topics, entities, stats
-     """
-     from . import get_llm
-
-     # Gather data
-     stats = get_content_stats(db_url, project)
-     entities = get_entity_summary(db_url, project, limit=20)
-     samples = get_document_samples(db_url, project, sample_size, strategy="diverse")
-
-     # Format stats for prompt
-     stats_text = []
-     stats_text.append(f"Total documents: {stats['total_documents']}")
-     stats_text.append(f"Total tokens: ~{stats['total_tokens']:,}")
-     if stats["source_types"]:
-         types_str = ", ".join(
-             f"{t}: {c}" for t, c in sorted(stats["source_types"].items(), key=lambda x: -x[1])
-         )
-         stats_text.append(f"Source types: {types_str}")
-     if stats["projects"]:
-         stats_text.append(f"Projects: {', '.join(stats['projects'])}")
-     if stats["date_range"]["earliest"]:
-         stats_text.append(
-             f"Date range: {stats['date_range']['earliest']} to {stats['date_range']['latest']}"
-         )
-
-     # Format entities for prompt
-     entity_text = []
-     for e in entities:
-         entity_text.append(
-             f"- {e['name']} ({e['type']}): {e['ref_count']} mentions in {e['doc_count']} docs"
-         )
-     if not entity_text:
-         entity_text.append("(No entities extracted yet)")
-
-     # Format samples for prompt
-     sample_text = []
-     for s in samples:
-         excerpt = s["excerpt"].replace("\n", " ")[:200] if s["excerpt"] else ""
-         sample_text.append(f"### {s['title']} ({s['source_type']})\n{excerpt}...")
-
-     # Build prompt
-     prompt = ANALYSIS_USER.format(
-         stats="\n".join(stats_text),
-         entity_count=len(entities),
-         entity_list="\n".join(entity_text),
-         sample_count=len(samples),
-         document_samples="\n\n".join(sample_text),
-     )
-
-     # Call LLM
-     llm = get_llm()
-     if llm is None:
-         raise RuntimeError(
-             "No LLM provider configured. Analysis requires an LLM. "
-             "Set ANTHROPIC_API_KEY or configure llm.provider in config."
-         )
-
-     response = llm.complete(prompt, system=ANALYSIS_SYSTEM, max_tokens=1024)
-
-     # Parse response
-     try:
-         # Try to extract JSON from response
-         content = response.content.strip()
-         # Handle markdown code blocks
-         if content.startswith("```"):
-             lines = content.split("\n")
-             content = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
-         data = json.loads(content)
-         description = data.get("description", "")
-         topics = data.get("topics", [])
-     except (json.JSONDecodeError, KeyError):
-         # Fallback: use raw response as description
-         description = response.content.strip()[:500]
-         topics = []
-
-     result = AnalysisResult(
-         description=description,
-         topics=topics,
-         key_entities=entities,
-         stats=stats,
-         sample_count=len(samples),
-         analyzed_at=datetime.now(UTC),
-         project=project,
-     )
-
-     # Update database metadata if requested
-     if auto_update:
-         _update_metadata(db_url, result)
-
-     return result
-
-
- def _update_metadata(db_url: str, result: AnalysisResult) -> None:
-     """Update database_metadata with analysis results."""
-     with psycopg.connect(db_url) as conn:
-         # Use project-specific keys if analyzing a project
-         suffix = f"_{result.project}" if result.project else ""
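-         # e.g. keys become "llm_description_myproj" for a (hypothetical) project
-         # named "myproj", vs. plain "llm_description" for a whole-database analysis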
-
-         upsert = """
-             INSERT INTO database_metadata (key, value, source, updated_at)
-             VALUES (%s, %s, 'llm', NOW())
-             ON CONFLICT (key) DO UPDATE SET
-                 value = EXCLUDED.value,
-                 source = 'llm',
-                 updated_at = NOW()
-         """
-
-         # Store description, topics, and analysis metadata via the same upsert
-         entries = {
-             f"llm_description{suffix}": result.description,
-             f"llm_topics{suffix}": result.topics,
-             f"llm_analysis{suffix}": {
-                 "analyzed_at": result.analyzed_at.isoformat(),
-                 "sample_count": result.sample_count,
-                 "entity_count": len(result.key_entities),
-                 "doc_count": result.stats["total_documents"],
-             },
-         }
-         for key, value in entries.items():
-             conn.execute(upsert, (key, psycopg.types.json.Json(value)))
-
-         conn.commit()
-
-
- def format_analysis_result(result: AnalysisResult) -> str:
-     """Format analysis result for display."""
-     lines = ["## Knowledge Base Analysis\n"]
-
-     if result.project:
-         lines.append(f"**Project:** {result.project}\n")
-
-     lines.append(f"**Description:** {result.description}\n")
-
-     if result.topics:
-         lines.append(f"**Topics:** {', '.join(result.topics)}\n")
-
-     lines.append("\n### Content Statistics")
-     lines.append(f"- Documents: {result.stats['total_documents']:,}")
-     lines.append(f"- Tokens: ~{result.stats['total_tokens']:,}")
-     if result.stats["source_types"]:
-         sorted_types = sorted(result.stats["source_types"].items(), key=lambda x: -x[1])
-         types_str = ", ".join(f"{t}: {c}" for t, c in sorted_types)
-         lines.append(f"- Source types: {types_str}")
-     if result.stats["projects"]:
-         lines.append(f"- Projects: {', '.join(result.stats['projects'])}")
-
-     if result.key_entities:
-         lines.append("\n### Key Entities (by mentions)")
-         for i, e in enumerate(result.key_entities[:10], 1):
-             lines.append(
-                 f"{i}. {e['name']} ({e['type']}) - "
-                 f"{e['ref_count']} mentions in {e['doc_count']} docs"
-             )
-
-     timestamp = result.analyzed_at.strftime("%Y-%m-%d %H:%M")
-     lines.append(f"\nAnalyzed {result.sample_count} document samples at {timestamp}")
-
-     return "\n".join(lines)