PyPI - openkb - Versions diffs - 0.1.0.dev1__tar.gz → 0.1.2__tar.gz - Mend

openkb 0.1.0.dev1.tar.gz → 0.1.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{openkb-0.1.0.dev1 → openkb-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.3
 Name: openkb
-Version: 0.1.0.dev1
-Summary: OpenKB — Open LLM Knowledge Base, powered by PageIndex
+Version: 0.1.2
+Summary: OpenKB: Open LLM Knowledge Base, powered by PageIndex
 License: Apache-2.0
 Keywords: ai,rag,retrieval,knowledge-base,llm,pageindex,agents,document
 Author: Ray
@@ -22,7 +22,8 @@ Requires-Dist: json-repair
 Requires-Dist: litellm
 Requires-Dist: markitdown[all]
 Requires-Dist: openai-agents
-Requires-Dist: pageindex (==0.3.0.dev0)
+Requires-Dist: pageindex (==0.3.0.dev1)
+Requires-Dist: prompt_toolkit (>=3.0)
 Requires-Dist: python-dotenv
 Requires-Dist: pyyaml
 Requires-Dist: watchdog (>=3.0)
@@ -57,11 +58,12 @@ Traditional RAG rediscovers knowledge from scratch on every query. Nothing accum
 ### Features
-- **Any format** — PDF, Word, PowerPoint, Excel, HTML, Markdown, text, CSV, and more via markitdown
+- **Broad format support** — PDF, Word, Markdown, PowerPoint, HTML, Excel, CSV, text, and more via markitdown
 - **Scale to long documents** — Long and complex documents are handled via [PageIndex](https://github.com/VectifyAI/PageIndex) tree indexing, enabling accurate, vectorless long-context retrieval
 - **Native multi-modality** — Retrieves and understands figures, tables, and images, not just text
-- **Auto wiki** — LLM generates summaries, concept pages, and cross-links. You curate sources; the LLM does the rest
-- **Query** — Ask questions against your wiki. The LLM navigates your compiled knowledge to answer
+- **Compiled Wiki** — LLM manages and compiles your documents into summaries, concept pages, and cross-links, all kept in sync
+- **Query** — Ask questions (one-off) against your wiki. The LLM navigates your compiled knowledge to answer
+- **Interactive Chat** — Multi-turn conversations with persisted sessions you can resume across runs
 - **Lint** — Health checks find contradictions, gaps, orphans, and stale content
 - **Watch mode** — Drop files into `raw/`, wiki updates automatically
 - **Obsidian compatible** — Wiki is plain `.md` files with `[[wikilinks]]`. Open in Obsidian for graph view and browsing
@@ -88,11 +90,11 @@ openkb add paper.pdf
 openkb add ~/papers/                   # Add a whole directory
 openkb add article.html
-# 4. Ask questions
+# 4. Ask a question
 openkb query "What are the main findings?"
-# 5. Check wiki health
-openkb lint
+# 5. Or start an interactive chat session
+openkb chat
 ```
 ### Set up your LLM
@@ -165,6 +167,7 @@ A single source might touch 10-15 wiki pages. Knowledge accumulates: each docume
 | `openkb add <file_or_dir>` | Add documents and compile to wiki |
 | `openkb query "question"` | Ask a question against the knowledge base |
 | `openkb query "question" --save` | Ask and save the answer to `wiki/explorations/` |
+| `openkb chat` | Start an interactive multi-turn chat (use `--resume`, `--list`, `--delete` to manage sessions) |
 | `openkb watch` | Watch `raw/` and auto-compile new files |
 | `openkb lint` | Run structural + knowledge health checks |
 | `openkb list` | List indexed documents and concepts |
@@ -172,6 +175,20 @@ A single source might touch 10-15 wiki pages. Knowledge accumulates: each docume
 <!-- | `openkb lint --fix` | Auto-fix what it can | -->
+### Interactive chat
+`openkb chat` opens an interactive chat session over your wiki knowledge base. Unlike the one-shot `openkb query`, each turn carries the conversation history, so you can dig into a topic without re-typing context.
+```bash
+openkb chat                       # start a new session
+openkb chat --resume              # resume the most recent session
+openkb chat --resume 20260411     # resume by id (unique prefix works)
+openkb chat --list                # list all sessions
+openkb chat --delete <id>         # delete a session
+```
+`/help` lists all slash commands: e.g., `/save` exports the transcript, `/clear` starts a fresh session.
 ### Configuration
 Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:

{openkb-0.1.0.dev1 → openkb-0.1.2}/README.md RENAMED Viewed

@@ -24,11 +24,12 @@ Traditional RAG rediscovers knowledge from scratch on every query. Nothing accum
 ### Features
-- **Any format** — PDF, Word, PowerPoint, Excel, HTML, Markdown, text, CSV, and more via markitdown
+- **Broad format support** — PDF, Word, Markdown, PowerPoint, HTML, Excel, CSV, text, and more via markitdown
 - **Scale to long documents** — Long and complex documents are handled via [PageIndex](https://github.com/VectifyAI/PageIndex) tree indexing, enabling accurate, vectorless long-context retrieval
 - **Native multi-modality** — Retrieves and understands figures, tables, and images, not just text
-- **Auto wiki** — LLM generates summaries, concept pages, and cross-links. You curate sources; the LLM does the rest
-- **Query** — Ask questions against your wiki. The LLM navigates your compiled knowledge to answer
+- **Compiled Wiki** — LLM manages and compiles your documents into summaries, concept pages, and cross-links, all kept in sync
+- **Query** — Ask questions (one-off) against your wiki. The LLM navigates your compiled knowledge to answer
+- **Interactive Chat** — Multi-turn conversations with persisted sessions you can resume across runs
 - **Lint** — Health checks find contradictions, gaps, orphans, and stale content
 - **Watch mode** — Drop files into `raw/`, wiki updates automatically
 - **Obsidian compatible** — Wiki is plain `.md` files with `[[wikilinks]]`. Open in Obsidian for graph view and browsing
@@ -55,11 +56,11 @@ openkb add paper.pdf
 openkb add ~/papers/                   # Add a whole directory
 openkb add article.html
-# 4. Ask questions
+# 4. Ask a question
 openkb query "What are the main findings?"
-# 5. Check wiki health
-openkb lint
+# 5. Or start an interactive chat session
+openkb chat
 ```
 ### Set up your LLM
@@ -132,6 +133,7 @@ A single source might touch 10-15 wiki pages. Knowledge accumulates: each docume
 | `openkb add <file_or_dir>` | Add documents and compile to wiki |
 | `openkb query "question"` | Ask a question against the knowledge base |
 | `openkb query "question" --save` | Ask and save the answer to `wiki/explorations/` |
+| `openkb chat` | Start an interactive multi-turn chat (use `--resume`, `--list`, `--delete` to manage sessions) |
 | `openkb watch` | Watch `raw/` and auto-compile new files |
 | `openkb lint` | Run structural + knowledge health checks |
 | `openkb list` | List indexed documents and concepts |
@@ -139,6 +141,20 @@ A single source might touch 10-15 wiki pages. Knowledge accumulates: each docume
 <!-- | `openkb lint --fix` | Auto-fix what it can | -->
+### Interactive chat
+`openkb chat` opens an interactive chat session over your wiki knowledge base. Unlike the one-shot `openkb query`, each turn carries the conversation history, so you can dig into a topic without re-typing context.
+```bash
+openkb chat                       # start a new session
+openkb chat --resume              # resume the most recent session
+openkb chat --resume 20260411     # resume by id (unique prefix works)
+openkb chat --list                # list all sessions
+openkb chat --delete <id>         # delete a session
+```
+`/help` lists all slash commands: e.g., `/save` exports the transcript, `/clear` starts a fresh session.
 ### Configuration
 Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:

openkb-0.1.2/openkb/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""OpenKB package."""
+from importlib.metadata import PackageNotFoundError, version as _version
+# Look up the package version from the installed distribution metadata.
+try:
+    __version__ = _version("openkb")
+except PackageNotFoundError:
+    # No distribution metadata for "openkb" was found; use a placeholder.
+    __version__ = "0.0.0+unknown"

openkb-0.1.2/openkb/agent/chat.py ADDED Viewed

@@ -0,0 +1,378 @@
+"""Interactive multi-turn chat REPL for the OpenKB knowledge base.
+Builds on the single-shot Q&A agent in ``openkb.agent.query`` and keeps
+conversation state in ``ChatSession``. Uses prompt_toolkit for the input
+line (history, editing, bottom toolbar) and streams responses directly to
+stdout to preserve the existing ``query`` visual.
+"""
+from __future__ import annotations
+import os
+import re
+import sys
+import time
+from pathlib import Path
+from typing import Any
+from prompt_toolkit import PromptSession
+from prompt_toolkit.formatted_text import FormattedText
+from prompt_toolkit.shortcuts import print_formatted_text
+from prompt_toolkit.styles import Style
+from openkb.agent.chat_session import ChatSession
+from openkb.agent.query import MAX_TURNS, build_query_agent
+from openkb.log import append_log
+# Style-class name -> style string. Handed to Style.from_dict when color
+# output is enabled (see _build_style).
+_STYLE_DICT: dict[str, str] = {
+    "prompt":           "bold #5fa0e0",
+    "bottom-toolbar":   "noreverse nobold #8a8a8a bg:default",
+    "toolbar":          "noreverse nobold #8a8a8a bg:default",
+    "toolbar.session":  "noreverse #8a8a8a bg:default bold",
+    "header":           "#8a8a8a",
+    "header.title":     "bold #5fa0e0",
+    "tool":             "#a8a8a8",
+    "tool.name":        "#a8a8a8 bold",
+    "slash.ok":         "ansigreen",
+    "slash.help":       "#8a8a8a",
+    "error":            "ansired bold",
+    "resume.turn":      "#5fa0e0",
+    "resume.user":      "bold",
+    "resume.assistant": "#8a8a8a",
+}
+# Text printed by the /help slash command.
+_HELP_TEXT = (
+    "Commands:\n"
+    "  /exit          Exit (Ctrl-D also works)\n"
+    "  /clear         Start a fresh session (current one is kept on disk)\n"
+    "  /save [name]   Export transcript to wiki/explorations/\n"
+    "  /help          Show this"
+)
+# Two Ctrl-C presses at the prompt within this many seconds exit the REPL.
+_SIGINT_EXIT_WINDOW = 2.0
+def _use_color(force_off: bool) -> bool:
+    if force_off:
+        return False
+    if os.environ.get("NO_COLOR", ""):
+        return False
+    if not sys.stdout.isatty():
+        return False
+    return True
+def _build_style(use_color: bool) -> Style:
+    return Style.from_dict(_STYLE_DICT if use_color else {})
+def _fmt(style: Style, *fragments: tuple[str, str]) -> None:
+    print_formatted_text(FormattedText(list(fragments)), style=style, end="")
+def _format_tool_line(name: str, args: str, width: int = 78) -> str:
+    args = args or ""
+    args = args.replace("\n", " ")
+    base = f"  \u00b7 {name}({args})"
+    if len(base) > width:
+        base = base[: width - 1] + "\u2026"
+    return base
+def _extract_preview(text: str, limit: int = 150) -> str:
+    text = " ".join((text or "").strip().split())
+    if len(text) <= limit:
+        return text
+    return text[: limit - 1] + "\u2026"
+def _openkb_version() -> str:
+    from openkb import __version__
+    return __version__
+def _display_kb_dir(kb_dir: Path) -> str:
+    home = str(Path.home())
+    s = str(kb_dir)
+    if s == home:
+        return "~"
+    if s.startswith(home + "/"):
+        return "~" + s[len(home):]
+    return s
+def _print_header(session: ChatSession, kb_dir: Path, style: Style) -> None:
+    disp_dir = _display_kb_dir(kb_dir)
+    version = _openkb_version()
+    version_suffix = f" v{version}\n" if version else "\n"
+    print()
+    _fmt(
+        style,
+        ("class:header.title", "OpenKB Chat"),
+        ("class:header", version_suffix),
+    )
+    _fmt(
+        style,
+        (
+            "class:header",
+            f"{disp_dir} \u00b7 {session.model} \u00b7 session {session.id}\n",
+        ),
+    )
+    _fmt(
+        style,
+        (
+            "class:header",
+            "Type /help for commands, Ctrl-D to exit, "
+            "Ctrl-C to abort current response.\n",
+        ),
+    )
+    print()
+def _print_resume_view(session: ChatSession, style: Style) -> None:
+    turns = list(zip(session.user_turns, session.assistant_texts))
+    if not turns:
+        return
+    total = len(turns)
+    if total > 5:
+        omitted = total - 5
+        _fmt(
+            style,
+            ("class:header", f"... {omitted} earlier turn(s) omitted\n"),
+        )
+        turns = turns[-5:]
+        start = omitted + 1
+    else:
+        start = 1
+    _fmt(
+        style,
+        ("class:header", f"Resumed session  {total} turn(s)\n"),
+    )
+    for i, (u, a) in enumerate(turns, start):
+        _fmt(
+            style,
+            ("class:resume.turn", f"[{i}] "),
+            ("class:resume.user", f">>> {u}\n"),
+        )
+        if a:
+            preview = _extract_preview(a, 180)
+            extra = ""
+            if len(a) > len(preview):
+                extra = f"  ({len(a)} chars)"
+            _fmt(
+                style,
+                ("class:resume.turn", f"[{i}] "),
+                ("class:resume.assistant", f"    {preview}{extra}\n"),
+            )
+    print()
+def _bottom_toolbar(session: ChatSession) -> FormattedText:
+    return FormattedText(
+        [
+            ("class:toolbar", " session "),
+            ("class:toolbar.session", session.id),
+            (
+                "class:toolbar",
+                f"  {session.turn_count} turn(s)  {session.model} ",
+            ),
+        ]
+    )
+def _make_prompt_session(session: ChatSession, style: Style, use_color: bool) -> PromptSession:
+    return PromptSession(
+        message=FormattedText([("class:prompt", ">>> ")]),
+        style=style,
+        bottom_toolbar=(lambda: _bottom_toolbar(session)) if use_color else None,
+    )
+async def _run_turn(agent: Any, session: ChatSession, user_input: str, style: Style) -> None:
+    """Run one agent turn with streaming output and persist the new history.
+    The agent receives ``session.history`` plus the new user message. Text
+    deltas are written to stdout as they arrive; each tool call is printed on
+    its own line. The turn is recorded on the session only if streaming
+    finishes without raising, because ``record_turn`` runs after the
+    ``try``/``finally`` block.
+    """
+    from agents import (
+        RawResponsesStreamEvent,
+        RunItemStreamEvent,
+        Runner,
+    )
+    from openai.types.responses import ResponseTextDeltaEvent
+    new_input = session.history + [{"role": "user", "content": user_input}]
+    result = Runner.run_streamed(agent, new_input, max_turns=MAX_TURNS)
+    sys.stdout.write("\n")
+    sys.stdout.flush()
+    collected: list[str] = []
+    # last_was_text: the previous output was streamed text, so a tool line
+    # must first end that text line.
+    last_was_text = False
+    # need_blank_before_text: a tool line was just printed, so the next text
+    # delta is preceded by a newline.
+    need_blank_before_text = False
+    try:
+        async for event in result.stream_events():
+            if isinstance(event, RawResponsesStreamEvent):
+                if isinstance(event.data, ResponseTextDeltaEvent):
+                    text = event.data.delta
+                    if text:
+                        if need_blank_before_text:
+                            sys.stdout.write("\n")
+                            need_blank_before_text = False
+                        sys.stdout.write(text)
+                        sys.stdout.flush()
+                        collected.append(text)
+                        last_was_text = True
+            elif isinstance(event, RunItemStreamEvent):
+                item = event.item
+                if item.type == "tool_call_item":
+                    if last_was_text:
+                        sys.stdout.write("\n")
+                        sys.stdout.flush()
+                        last_was_text = False
+                    raw = item.raw_item
+                    name = getattr(raw, "name", "?")
+                    args = getattr(raw, "arguments", "") or ""
+                    _fmt(style, ("class:tool", _format_tool_line(name, args) + "\n"))
+                    need_blank_before_text = True
+    finally:
+        # Always terminate the output block, even when the stream is interrupted.
+        sys.stdout.write("\n\n")
+        sys.stdout.flush()
+    answer = "".join(collected).strip()
+    if not answer:
+        # Nothing was streamed as text; fall back to the run's final output.
+        answer = (result.final_output or "").strip()
+    session.record_turn(user_input, answer, result.to_input_list())
+def _save_transcript(kb_dir: Path, session: ChatSession, name: str | None) -> Path:
+    explore_dir = kb_dir / "wiki" / "explorations"
+    explore_dir.mkdir(parents=True, exist_ok=True)
+    base = name or session.title or (session.user_turns[0] if session.user_turns else session.id)
+    slug = re.sub(r"[^a-z0-9]+", "-", base.lower()).strip("-")[:60] or session.id
+    date = session.created_at[:10].replace("-", "")
+    path = explore_dir / f"{slug}-{date}.md"
+    lines: list[str] = [
+        "---",
+        f'session: "{session.id}"',
+        f'model: "{session.model}"',
+        f'created: "{session.created_at}"',
+        "---",
+        "",
+        f"# Chat transcript  {session.title or session.id}",
+        "",
+    ]
+    for i, (u, a) in enumerate(zip(session.user_turns, session.assistant_texts), 1):
+        lines.append(f"## [{i}] {u}")
+        lines.append("")
+        lines.append(a or "_(no response recorded)_")
+        lines.append("")
+    path.write_text("\n".join(lines), encoding="utf-8")
+    return path
+async def _handle_slash(
+    cmd: str,
+    kb_dir: Path,
+    session: ChatSession,
+    style: Style,
+) -> str | None:
+    """Return ``"exit"`` to end the REPL, ``"new_session"`` to swap sessions,
+    or ``None`` to continue with the current session."""
+    parts = cmd.split(maxsplit=1)
+    head = parts[0].lower()
+    arg = parts[1].strip() if len(parts) > 1 else ""
+    if head in ("/exit", "/quit"):
+        _fmt(style, ("class:header", "Bye. Thanks for using OpenKB.\n\n"))
+        return "exit"
+    if head == "/help":
+        _fmt(style, ("class:slash.help", _HELP_TEXT + "\n"))
+        return None
+    if head == "/clear":
+        old_id = session.id
+        _fmt(
+            style,
+            ("class:slash.ok", f"Started new session (previous: {old_id})\n"),
+        )
+        return "new_session"
+    if head == "/save":
+        if not session.user_turns:
+            _fmt(style, ("class:error", "Nothing to save yet.\n"))
+            return None
+        path = _save_transcript(kb_dir, session, arg or None)
+        _fmt(style, ("class:slash.ok", f"Saved to {path}\n"))
+        return None
+    _fmt(
+        style,
+        ("class:error", f"Unknown command: {head}. Try /help.\n"),
+    )
+    return None
+async def run_chat(
+    kb_dir: Path,
+    session: ChatSession,
+    *,
+    no_color: bool = False,
+) -> None:
+    """Run the chat REPL against ``session`` until the user exits.
+    Input starting with ``/`` is handled as a slash command; any other
+    non-empty input is logged and sent to the agent as one turn. The loop
+    ends on ``/exit``/``/quit``, on EOF (Ctrl-D), or on two Ctrl-C presses
+    at the prompt within ``_SIGINT_EXIT_WINDOW`` seconds.
+    """
+    from openkb.config import load_config
+    use_color = _use_color(force_off=no_color)
+    style = _build_style(use_color)
+    config = load_config(kb_dir / ".openkb" / "config.yaml")
+    # The session's own language wins; otherwise use the config value, default "en".
+    language = session.language or config.get("language", "en")
+    wiki_root = str(kb_dir / "wiki")
+    agent = build_query_agent(wiki_root, session.model, language=language)
+    _print_header(session, kb_dir, style)
+    if session.turn_count > 0:
+        _print_resume_view(session, style)
+    prompt_session = _make_prompt_session(session, style, use_color)
+    # Monotonic time of the last Ctrl-C at the prompt; 0.0 means none pending.
+    last_sigint = 0.0
+    while True:
+        try:
+            user_input = await prompt_session.prompt_async()
+            # A successful read resets the double-Ctrl-C exit window.
+            last_sigint = 0.0
+        except KeyboardInterrupt:
+            now = time.monotonic()
+            if last_sigint and (now - last_sigint) < _SIGINT_EXIT_WINDOW:
+                _fmt(style, ("class:header", "\nBye. Thanks for using OpenKB.\n\n"))
+                return
+            last_sigint = now
+            _fmt(style, ("class:header", "\n(Press Ctrl-C again to exit)\n"))
+            continue
+        except EOFError:
+            _fmt(style, ("class:header", "Bye. Thanks for using OpenKB.\n\n"))
+            return
+        user_input = (user_input or "").strip()
+        if not user_input:
+            continue
+        if user_input.startswith("/"):
+            action = await _handle_slash(user_input, kb_dir, session, style)
+            if action == "exit":
+                return
+            if action == "new_session":
+                # Swap in a fresh session with the same model and language, and
+                # rebuild the agent and prompt so they refer to the new session.
+                session = ChatSession.new(kb_dir, session.model, session.language)
+                agent = build_query_agent(wiki_root, session.model, language=language)
+                prompt_session = _make_prompt_session(session, style, use_color)
+            continue
+        append_log(kb_dir / "wiki", "query", user_input)
+        try:
+            await _run_turn(agent, session, user_input, style)
+        except KeyboardInterrupt:
+            # Ctrl-C during a response aborts that turn only; the REPL keeps running.
+            _fmt(style, ("class:error", "\n[aborted]\n"))
+        except Exception as exc:
+            # Top-level REPL boundary: report the failure and keep the loop alive.
+            _fmt(style, ("class:error", f"[ERROR] {exc}\n"))

openkb-0.1.2/openkb/agent/chat_session.py ADDED Viewed

@@ -0,0 +1,280 @@
+"""Chat session persistence for `openkb chat`.
+Each session lives in ``<kb>/.openkb/chats/<id>.json`` and stores a sanitized
+agent-SDK history (from ``RunResult.to_input_list()``) alongside the user
+messages and full assistant replies kept as plain strings for display and
+export. Large tool-returned image payloads are replaced with lightweight
+references before the history is reused or persisted.
+"""
+from __future__ import annotations
+import json
+import os
+import random
+import string
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+# Leading sentence of the text placeholder that replaces inline image data
+# in chat history (see _image_history_placeholder).
+_IMAGE_HISTORY_NOTE = (
+    "Image output omitted from chat history to avoid persisting raw data URLs."
+)
+def _utcnow_iso() -> str:
+    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+def _gen_id() -> str:
+    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
+    rand = "".join(random.choices(string.ascii_lowercase + string.digits, k=3))
+    return f"{ts}-{rand}"
+def chats_dir(kb_dir: Path) -> Path:
+    return kb_dir / ".openkb" / "chats"
+def _title_from(msg: str, limit: int = 60) -> str:
+    msg = " ".join(msg.strip().split())
+    if len(msg) <= limit:
+        return msg
+    return msg[: limit - 1] + "\u2026"
+def _image_history_placeholder(image_path: str | None) -> dict[str, str]:
+    text = _IMAGE_HISTORY_NOTE
+    if image_path:
+        text += f" Source path: {image_path}."
+    text += " Call get_image again if you need to inspect it."
+    return {"type": "input_text", "text": text}
+def _extract_get_image_path(item: dict[str, Any]) -> str | None:
+    if item.get("type") != "function_call" or item.get("name") != "get_image":
+        return None
+    arguments = item.get("arguments")
+    if not isinstance(arguments, str):
+        return None
+    try:
+        payload = json.loads(arguments)
+    except json.JSONDecodeError:
+        return None
+    image_path = payload.get("image_path")
+    if isinstance(image_path, str) and image_path:
+        return image_path
+    return None
+def _sanitize_history_value(value: Any, image_path: str | None = None) -> Any:
+    if isinstance(value, list):
+        return [_sanitize_history_value(item, image_path) for item in value]
+    if not isinstance(value, dict):
+        return value
+    if value.get("type") == "input_image":
+        image_url = value.get("image_url")
+        if isinstance(image_url, str) and image_url.startswith("data:"):
+            return _image_history_placeholder(image_path)
+    return {
+        key: _sanitize_history_value(item, image_path)
+        for key, item in value.items()
+    }
+def sanitize_history(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Strip large image payloads from model history while keeping a re-fetch hint."""
+    image_paths_by_call_id: dict[str, str] = {}
+    sanitized: list[dict[str, Any]] = []
+    for item in history:
+        if not isinstance(item, dict):
+            sanitized.append(item)
+            continue
+        image_path = _extract_get_image_path(item)
+        call_id = item.get("call_id")
+        if image_path and isinstance(call_id, str):
+            image_paths_by_call_id[call_id] = image_path
+        history_image_path = None
+        if item.get("type") == "function_call_output" and isinstance(call_id, str):
+            history_image_path = image_paths_by_call_id.get(call_id)
+        sanitized.append(_sanitize_history_value(item, history_image_path))
+    return sanitized
+@dataclass
+class ChatSession:
+    id: str
+    created_at: str
+    updated_at: str
+    model: str
+    language: str
+    title: str
+    turn_count: int
+    history: list[dict[str, Any]]
+    user_turns: list[str]
+    assistant_texts: list[str]
+    path: Path
+    @classmethod
+    def new(cls, kb_dir: Path, model: str, language: str) -> "ChatSession":
+        now = _utcnow_iso()
+        sid = _gen_id()
+        return cls(
+            id=sid,
+            created_at=now,
+            updated_at=now,
+            model=model,
+            language=language,
+            title="",
+            turn_count=0,
+            history=[],
+            user_turns=[],
+            assistant_texts=[],
+            path=chats_dir(kb_dir) / f"{sid}.json",
+        )
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "id": self.id,
+            "created_at": self.created_at,
+            "updated_at": self.updated_at,
+            "model": self.model,
+            "language": self.language,
+            "title": self.title,
+            "turn_count": self.turn_count,
+            "history": self.history,
+            "user_turns": self.user_turns,
+            "assistant_texts": self.assistant_texts,
+        }
+    def save(self) -> None:
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+        tmp = self.path.with_suffix(".json.tmp")
+        tmp.write_text(
+            json.dumps(self.to_dict(), ensure_ascii=False, indent=2, default=str),
+            encoding="utf-8",
+        )
+        os.replace(tmp, self.path)
+    def record_turn(
+        self,
+        user_message: str,
+        assistant_text: str,
+        new_history: list[dict[str, Any]],
+    ) -> None:
+        self.history = sanitize_history(new_history)
+        self.user_turns.append(user_message)
+        self.assistant_texts.append(assistant_text)
+        self.turn_count = len(self.user_turns)
+        if not self.title:
+            self.title = _title_from(user_message)
+        self.updated_at = _utcnow_iso()
+        self.save()
+def load_session(kb_dir: Path, session_id: str) -> ChatSession:
+    path = chats_dir(kb_dir) / f"{session_id}.json"
+    data = json.loads(path.read_text(encoding="utf-8"))
+    return ChatSession(
+        id=data["id"],
+        created_at=data["created_at"],
+        updated_at=data["updated_at"],
+        model=data["model"],
+        language=data.get("language", "en"),
+        title=data.get("title", ""),
+        turn_count=data.get("turn_count", 0),
+        history=sanitize_history(data.get("history", [])),
+        user_turns=data.get("user_turns", []),
+        assistant_texts=data.get("assistant_texts", []),
+        path=path,
+    )
+def list_sessions(kb_dir: Path) -> list[dict[str, Any]]:
+    """Return session metadata dicts, most recently updated first."""
+    d = chats_dir(kb_dir)
+    if not d.exists():
+        return []
+    out: list[dict[str, Any]] = []
+    for p in d.glob("*.json"):
+        try:
+            data = json.loads(p.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, OSError):
+            continue
+        out.append(
+            {
+                "id": data.get("id", p.stem),
+                "title": data.get("title", ""),
+                "turn_count": data.get("turn_count", 0),
+                "updated_at": data.get("updated_at", ""),
+                "model": data.get("model", ""),
+            }
+        )
+    out.sort(key=lambda s: (s["updated_at"], s["id"]), reverse=True)
+    return out
+def resolve_session_id(kb_dir: Path, query: str) -> str | None:
+    """Resolve a query to a full session id.
+    ``query`` may be:
+    - ``"__latest__"`` — returns the most recently updated session id.
+    - A full session id — returned as-is if it exists.
+    - A unique prefix of a session id — expanded to the full id.
+    Returns ``None`` if no session matches. Raises ``ValueError`` when a
+    prefix is ambiguous.
+    """
+    sessions = list_sessions(kb_dir)
+    if not sessions:
+        return None
+    if query == "__latest__":
+        return sessions[0]["id"]
+    for s in sessions:
+        if s["id"] == query:
+            return s["id"]
+    matches = [s["id"] for s in sessions if s["id"].startswith(query)]
+    if len(matches) == 1:
+        return matches[0]
+    if len(matches) > 1:
+        raise ValueError(
+            f"Ambiguous session prefix '{query}' matches: {', '.join(matches)}"
+        )
+    return None
+def delete_session(kb_dir: Path, session_id: str) -> bool:
+    path = chats_dir(kb_dir) / f"{session_id}.json"
+    if path.exists():
+        path.unlink()
+        return True
+    return False
+def relative_time(iso_str: str) -> str:
+    """Render an ISO-8601 timestamp as a short relative string."""
+    try:
+        t = datetime.strptime(iso_str, "%Y-%m-%dT%H:%M:%SZ").replace(
+            tzinfo=timezone.utc
+        )
+    except (ValueError, TypeError):
+        return iso_str or ""
+    now = datetime.now(timezone.utc)
+    seconds = int((now - t).total_seconds())
+    if seconds < 60:
+        return "just now"
+    if seconds < 3600:
+        return f"{seconds // 60}m ago"
+    if seconds < 86400:
+        return f"{seconds // 3600}h ago"
+    if seconds < 86400 * 7:
+        return f"{seconds // 86400}d ago"
+    return t.strftime("%Y-%m-%d")

{openkb-0.1.0.dev1 → openkb-0.1.2}/openkb/agent/compiler.py RENAMED Viewed

@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 _SYSTEM_TEMPLATE = """\
-You are a wiki compilation agent for a personal knowledge base.
+You are OpenKB's wiki compilation agent for a personal knowledge base.
 {schema_md}
@@ -284,6 +284,57 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
     return "\n".join(lines) or "(none yet)"
+def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None:
+    """Return the [start, end) bounds for a Markdown H2 section."""
+    for i, line in enumerate(lines):
+        if line == heading:
+            start = i + 1
+            end = len(lines)
+            for j in range(start, len(lines)):
+                if lines[j].startswith("## "):
+                    end = j
+                    break
+            return start, end
+    return None
+def _section_contains_link(lines: list[str], heading: str, link: str) -> bool:
+    """Check whether an index entry already exists inside the named section."""
+    bounds = _get_section_bounds(lines, heading)
+    if bounds is None:
+        return False
+    start, end = bounds
+    entry_prefix = f"- {link}"
+    return any(line.startswith(entry_prefix) for line in lines[start:end])
+def _replace_section_entry(lines: list[str], heading: str, link: str, entry: str) -> bool:
+    """Replace the first matching entry within a specific section."""
+    bounds = _get_section_bounds(lines, heading)
+    if bounds is None:
+        return False
+    start, end = bounds
+    entry_prefix = f"- {link}"
+    for i in range(start, end):
+        if lines[i].startswith(entry_prefix):
+            lines[i] = entry
+            return True
+    return False
+def _insert_section_entry(lines: list[str], heading: str, entry: str) -> bool:
+    """Insert a new entry at the top of a specific section."""
+    bounds = _get_section_bounds(lines, heading)
+    if bounds is None:
+        return False
+    start, _ = bounds
+    lines.insert(start, entry)
+    return True
 def _write_summary(wiki_dir: Path, doc_name: str, summary: str,
                     doc_type: str = "short") -> None:
@@ -460,7 +511,6 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str])
             text += f"\n\n## Related Documents\n- {link}\n"
         path.write_text(text, encoding="utf-8")
 def _update_index(
     wiki_dir: Path, doc_name: str, concept_names: list[str],
     doc_brief: str = "", concept_briefs: dict[str, str] | None = None,
@@ -469,8 +519,9 @@ def _update_index(
     """Append document and concept entries to index.md.
     When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries
-    are written as ``- [[link]] (type) — brief text``.  Existing entries are
-    detected by the link part only and skipped to avoid duplicates.
+    are written as ``- [[link]] (type) — brief text``. Existing entries are
+    detected within their own section by exact entry prefix and skipped to
+    avoid duplicates.
     ``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the
     query agent knows how to access detailed content.
     """
@@ -484,34 +535,27 @@ def _update_index(
             encoding="utf-8",
         )
-    text = index_path.read_text(encoding="utf-8")
+    lines = index_path.read_text(encoding="utf-8").split("\n")
     doc_link = f"[[summaries/{doc_name}]]"
-    if doc_link not in text:
+    if not _section_contains_link(lines, "## Documents", doc_link):
         doc_entry = f"- {doc_link} ({doc_type})"
         if doc_brief:
             doc_entry += f" — {doc_brief}"
-        if "## Documents" in text:
-            text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1)
+        _insert_section_entry(lines, "## Documents", doc_entry)
     for name in concept_names:
         concept_link = f"[[concepts/{name}]]"
         concept_entry = f"- {concept_link}"
         if name in concept_briefs:
             concept_entry += f" — {concept_briefs[name]}"
-        if concept_link in text:
+        if _section_contains_link(lines, "## Concepts", concept_link):
             if name in concept_briefs:
-                lines = text.split("\n")
-                for i, line in enumerate(lines):
-                    if concept_link in line:
-                        lines[i] = concept_entry
-                        break
-                text = "\n".join(lines)
+                _replace_section_entry(lines, "## Concepts", concept_link, concept_entry)
         else:
-            if "## Concepts" in text:
-                text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1)
+            _insert_section_entry(lines, "## Concepts", concept_entry)
-    index_path.write_text(text, encoding="utf-8")
+    index_path.write_text("\n".join(lines), encoding="utf-8")
 # ---------------------------------------------------------------------------

{openkb-0.1.0.dev1 → openkb-0.1.2}/openkb/agent/linter.py RENAMED Viewed

@@ -11,7 +11,7 @@ MAX_TURNS = 50
 from openkb.schema import SCHEMA_MD, get_agents_md
 _LINTER_INSTRUCTIONS_TEMPLATE = """\
-You are a knowledge-base semantic lint agent. Your job is to audit the wiki
+You are OpenKB's semantic lint agent. Your job is to audit the wiki
 for quality issues that structural tools cannot detect.
 {schema_md}
@@ -50,7 +50,7 @@ def build_lint_agent(wiki_root: str, model: str, language: str = "en") -> Agent:
     """
     schema_md = get_agents_md(Path(wiki_root))
     instructions = _LINTER_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
-    instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
+    instructions += f"\n\nIMPORTANT: Write the lint report in {language} language."
     @function_tool
     def list_files(directory: str) -> str:

{openkb-0.1.0.dev1 → openkb-0.1.2}/openkb/agent/query.py RENAMED Viewed

@@ -6,13 +6,13 @@ from pathlib import Path
 from agents import Agent, Runner, function_tool
 from agents import ToolOutputImage, ToolOutputText
-from openkb.agent.tools import read_wiki_file, read_wiki_image
+from openkb.agent.tools import get_wiki_page_content, read_wiki_file, read_wiki_image
 MAX_TURNS = 50
 from openkb.schema import get_agents_md
 _QUERY_INSTRUCTIONS_TEMPLATE = """\
-You are a knowledge-base Q&A agent. You answer questions by searching the wiki.
+You are OpenKB, a knowledge-base Q&A agent. You answer questions by searching the wiki.
 {schema_md}
@@ -20,7 +20,8 @@ You are a knowledge-base Q&A agent. You answer questions by searching the wiki.
 1. Read index.md to see all documents and concepts with brief summaries.
    Each document is marked (short) or (pageindex) to indicate its type.
 2. Read relevant summary pages (summaries/) for document overviews.
-   Note: summaries may omit details.
+   Summaries may omit details — if you need more, follow the summary's
+   `full_text` frontmatter field to the source (see step 4).
 3. Read concept pages (concepts/) for cross-document synthesis.
 4. When you need detailed source document content, each summary page has a
    `full_text` frontmatter field with the path to the original document content:
@@ -28,9 +29,8 @@ You are a knowledge-base Q&A agent. You answer questions by searching the wiki.
    - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages)
      with tight page ranges. The summary shows document tree structure with page
      ranges to help you target. Never fetch the whole document.
-5. When source content references images (e.g. ![image](sources/images/doc/file.png)),
-   use get_image to view them. Always view images when the question asks about
-   a figure, chart, diagram, or visual content.
+5. Source content may reference images (e.g. ![image](sources/images/doc/file.png)).
+   Use the get_image tool to view them when needed.
 6. Synthesize a clear, concise, well-cited answer grounded in wiki content.
 Answer based only on wiki content. Be concise.
@@ -44,7 +44,7 @@ def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent
     """Build and return the Q&A agent."""
     schema_md = get_agents_md(Path(wiki_root))
     instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
-    instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
+    instructions += f"\n\nIMPORTANT: Answer in {language} language."
     @function_tool
     def read_file(path: str) -> str:
@@ -55,7 +55,7 @@ def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent
         return read_wiki_file(path, wiki_root)
     @function_tool
-    def get_page_content_tool(doc_name: str, pages: str) -> str:
+    def get_page_content(doc_name: str, pages: str) -> str:
         """Get text content of specific pages from a PageIndex (long) document.
         Only use for documents with doc_type: pageindex. For short documents,
         use read_file instead.
@@ -63,13 +63,15 @@ def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent
             doc_name: Document name (e.g. 'attention-is-all-you-need').
             pages: Page specification (e.g. '3-5,7,10-12').
         """
-        from openkb.agent.tools import get_page_content
-        return get_page_content(doc_name, pages, wiki_root)
+        return get_wiki_page_content(doc_name, pages, wiki_root)
     @function_tool
     def get_image(image_path: str) -> ToolOutputImage | ToolOutputText:
         """View an image from the wiki.
-        Use when source content references images you need to see.
+        Use when a question asks about a specific figure, chart, or diagram
+        you'd need to see to answer accurately.
         Args:
             image_path: Image path relative to wiki root (e.g. 'sources/images/doc/p1_img1.png').
         """
@@ -83,7 +85,7 @@ def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent
     return Agent(
         name="wiki-query",
         instructions=instructions,
-        tools=[read_file, get_page_content_tool, get_image],
+        tools=[read_file, get_page_content, get_image],
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )

{openkb-0.1.0.dev1 → openkb-0.1.2}/openkb/agent/tools.py RENAMED Viewed

@@ -89,7 +89,7 @@ def parse_pages(pages: str) -> list[int]:
     return sorted(n for n in result if n > 0)
-def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
+def get_wiki_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
     """Return formatted content for specified pages of a document.
     Reads ``{wiki_root}/sources/{doc_name}.json`` which must be a JSON array of

{openkb-0.1.0.dev1 → openkb-0.1.2}/openkb/cli.py RENAMED Viewed

@@ -1,6 +1,12 @@
 """OpenKB CLI — command-line interface for the knowledge base workflow."""
 from __future__ import annotations
+# Silence import-time warnings (e.g. pydub's missing-ffmpeg warning emitted
+# when markitdown pulls it in). markitdown later clobbers the filters during
+# its own import, so we re-apply after all imports below.
+import warnings
+warnings.filterwarnings("ignore")
 import asyncio
 import json
 import logging
@@ -256,22 +262,23 @@ def init():
         return
     # Interactive prompts
+    click.echo("Pick an LLM in `provider/model` LiteLLM format:")
+    click.echo("  OpenAI:    gpt-5.4-mini, gpt-5.4")
+    click.echo("  Anthropic: anthropic/claude-sonnet-4-6, anthropic/claude-opus-4-6")
+    click.echo("  Gemini:    gemini/gemini-3.1-pro-preview, gemini/gemini-3-flash-preview")
+    click.echo("  Others:    see https://docs.litellm.ai/docs/providers")
+    click.echo()
     model = click.prompt(
-        f"Model (e.g. gpt-5.4-mini, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]",
+        f"Model (enter for default {DEFAULT_CONFIG['model']})",
         default=DEFAULT_CONFIG["model"],
         show_default=False,
     )
-    language = click.prompt(
-        f"Language [default: {DEFAULT_CONFIG['language']}]",
-        default=DEFAULT_CONFIG["language"],
+    api_key = click.prompt(
+        "LLM API Key (saved to .env, enter to skip)",
+        default="",
+        hide_input=True,
         show_default=False,
-    )
-    pageindex_threshold = click.prompt(
-        f"PageIndex threshold (pages) [default: {DEFAULT_CONFIG['pageindex_threshold']}]",
-        default=DEFAULT_CONFIG["pageindex_threshold"],
-        type=int,
-        show_default=False,
-    )
+    ).strip()
     # Create directory structure
     Path("raw").mkdir(exist_ok=True)
     Path("wiki/sources/images").mkdir(parents=True, exist_ok=True)
@@ -290,12 +297,22 @@ def init():
     openkb_dir.mkdir()
     config = {
         "model": model,
-        "language": language,
-        "pageindex_threshold": pageindex_threshold,
+        "language": DEFAULT_CONFIG["language"],
+        "pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
     }
     save_config(openkb_dir / "config.yaml", config)
     (openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
+    # Write API key to KB-local .env (0600) if the user provided one
+    if api_key:
+        env_path = Path(".env")
+        if env_path.exists():
+            click.echo(".env already exists, skipping write. Add LLM_API_KEY manually if needed.")
+        else:
+            env_path.write_text(f"LLM_API_KEY={api_key}\n", encoding="utf-8")
+            os.chmod(env_path, 0o600)
+            click.echo("Saved LLM API key to .env.")
     # Register this KB in the global config
     register_kb(Path.cwd())
@@ -378,6 +395,107 @@ def query(ctx, question, save):
         click.echo(f"\nSaved to {explore_path}")
+@cli.command()
+# --resume takes an optional value: with is_flag=False and a flag_value,
+# a bare `--resume` yields "__latest__", while `--resume ID` yields ID.
+@click.option(
+    "--resume", "-r", "resume",
+    is_flag=False, flag_value="__latest__", default=None, metavar="[ID]",
+    help="Resume the latest chat session, or a specific one by id or prefix.",
+)
+@click.option(
+    "--list", "list_sessions_flag",
+    is_flag=True, default=False,
+    help="List chat sessions.",
+)
+@click.option(
+    "--delete", "delete_id",
+    default=None, metavar="ID",
+    help="Delete a chat session by id or prefix.",
+)
+@click.option(
+    "--no-color", "no_color",
+    is_flag=True, default=False,
+    help="Disable colored output.",
+)
+@click.pass_context
+def chat(ctx, resume, list_sessions_flag, delete_id, no_color):
+    """Start an interactive chat with the knowledge base."""
+    # NOTE: the docstring above doubles as the command's --help text.
+    # Locate the knowledge base; ctx.obj may carry a directory override.
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
+    if kb_dir is None:
+        click.echo("No knowledge base found. Run `openkb init` first.")
+        return
+    # Imported inside the command rather than at module top
+    # (presumably to keep CLI start-up light — TODO confirm).
+    from openkb.agent.chat_session import (
+        ChatSession,
+        delete_session,
+        list_sessions,
+        load_session,
+        relative_time,
+        resolve_session_id,
+    )
+    # --list: print a table of sessions and exit without starting a chat.
+    if list_sessions_flag:
+        sessions = list_sessions(kb_dir)
+        if not sessions:
+            click.echo("No chat sessions yet.")
+            return
+        click.echo(f"  {'ID':<22} {'TURNS':<6} {'UPDATED':<12} TITLE")
+        click.echo(f"  {'-'*22} {'-'*6} {'-'*12} {'-'*30}")
+        for s in sessions:
+            rel = relative_time(s.get("updated_at", ""))
+            title = s.get("title") or "(empty)"
+            click.echo(
+                f"  {s['id']:<22} {s['turn_count']:<6} {rel:<12} {title}"
+            )
+        click.echo(
+            f"\n{len(sessions)} session(s) in {kb_dir / '.openkb' / 'chats'}"
+        )
+        return
+    # --delete: resolve the id or prefix, delete that session, and exit.
+    if delete_id is not None:
+        try:
+            resolved = resolve_session_id(kb_dir, delete_id)
+        except ValueError as exc:
+            # NOTE(review): presumably raised for an ambiguous prefix — confirm
+            # against resolve_session_id.
+            click.echo(f"[ERROR] {exc}")
+            return
+        if not resolved:
+            click.echo(f"No matching session: {delete_id}")
+            return
+        if delete_session(kb_dir, resolved):
+            click.echo(f"Deleted session {resolved}")
+        else:
+            click.echo(f"Could not delete session: {resolved}")
+        return
+    # Config and the LLM key are loaded only on the path that actually chats;
+    # --list and --delete return before reaching this point.
+    openkb_dir = kb_dir / ".openkb"
+    config = load_config(openkb_dir / "config.yaml")
+    _setup_llm_key(kb_dir)
+    if resume is not None:
+        # Resume an existing session ("__latest__" or an explicit id/prefix).
+        try:
+            resolved = resolve_session_id(kb_dir, resume)
+        except ValueError as exc:
+            click.echo(f"[ERROR] {exc}")
+            return
+        if not resolved:
+            # Word the message according to whether the user named a session.
+            if resume == "__latest__":
+                click.echo("No previous chat sessions to resume.")
+            else:
+                click.echo(f"No matching session: {resume}")
+            return
+        session = load_session(kb_dir, resolved)
+    else:
+        # Fresh session: take model and language from config, with fallbacks.
+        model: str = config.get("model", DEFAULT_CONFIG["model"])
+        language: str = config.get("language", "en")
+        session = ChatSession.new(kb_dir, model, language)
+    from openkb.agent.chat import run_chat
+    # Top-level boundary: any failure from the chat loop is reported as a
+    # single message rather than a traceback.
+    try:
+        asyncio.run(run_chat(kb_dir, session, no_color=no_color))
+    except Exception as exc:
+        click.echo(f"[ERROR] Chat failed: {exc}")
 @cli.command()
 @click.pass_context
 def watch(ctx):

{openkb-0.1.0.dev1 → openkb-0.1.2}/openkb/indexer.py RENAMED Viewed

@@ -77,13 +77,28 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
         "structure": structure,
     }
-    # Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
+    # Write wiki/sources/ — per-page content
     sources_dir = kb_dir / "wiki" / "sources"
     sources_dir.mkdir(parents=True, exist_ok=True)
     images_dir = sources_dir / "images" / pdf_path.stem
     from openkb.images import convert_pdf_to_pages
-    all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
+    all_pages: list = []
+    if pageindex_api_key:
+        # Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
+        # requires a page range, so pass "1-N".
+        from openkb.converter import get_pdf_page_count
+        page_count = get_pdf_page_count(pdf_path)
+        try:
+            all_pages = col.get_page_content(doc_id, f"1-{page_count}")
+        except Exception as exc:
+            logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc)
+    if not all_pages:
+        if pageindex_api_key:
+            logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
+        all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
     (sources_dir / f"{pdf_path.stem}.json").write_text(
         json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",

{openkb-0.1.0.dev1 → openkb-0.1.2}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "openkb"
-version = "0.1.0.dev1"
-description = "OpenKB — Open LLM Knowledge Base, powered by PageIndex"
+version = "0.1.2"
+description = "OpenKB: Open LLM Knowledge Base, powered by PageIndex"
 readme = "README.md"
 license = "Apache-2.0"
 authors = [
@@ -37,14 +37,22 @@ json-repair = "*"
 litellm = "*"
 markitdown = {version = "*", extras = ["all"]}
 openai-agents = "*"
-pageindex = "0.3.0.dev0"
+pageindex = "0.3.0.dev1"
+prompt_toolkit = ">=3.0"
 python-dotenv = "*"
 pyyaml = "*"
 watchdog = ">=3.0"
+[tool.poetry.group.dev.dependencies]
+pytest = "*"
+pytest-asyncio = "*"
 [tool.poetry.scripts]
 openkb = "openkb.cli:cli"
+[tool.pytest.ini_options]
+testpaths = ["tests"]
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"