pythonclaw 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. pythonclaw/__init__.py +17 -0
  2. pythonclaw/__main__.py +6 -0
  3. pythonclaw/channels/discord_bot.py +231 -0
  4. pythonclaw/channels/telegram_bot.py +236 -0
  5. pythonclaw/config.py +190 -0
  6. pythonclaw/core/__init__.py +25 -0
  7. pythonclaw/core/agent.py +773 -0
  8. pythonclaw/core/compaction.py +220 -0
  9. pythonclaw/core/knowledge/rag.py +93 -0
  10. pythonclaw/core/llm/anthropic_client.py +107 -0
  11. pythonclaw/core/llm/base.py +26 -0
  12. pythonclaw/core/llm/gemini_client.py +139 -0
  13. pythonclaw/core/llm/openai_compatible.py +39 -0
  14. pythonclaw/core/llm/response.py +57 -0
  15. pythonclaw/core/memory/manager.py +120 -0
  16. pythonclaw/core/memory/storage.py +164 -0
  17. pythonclaw/core/persistent_agent.py +103 -0
  18. pythonclaw/core/retrieval/__init__.py +6 -0
  19. pythonclaw/core/retrieval/chunker.py +78 -0
  20. pythonclaw/core/retrieval/dense.py +152 -0
  21. pythonclaw/core/retrieval/fusion.py +51 -0
  22. pythonclaw/core/retrieval/reranker.py +112 -0
  23. pythonclaw/core/retrieval/retriever.py +166 -0
  24. pythonclaw/core/retrieval/sparse.py +69 -0
  25. pythonclaw/core/session_store.py +269 -0
  26. pythonclaw/core/skill_loader.py +322 -0
  27. pythonclaw/core/skillhub.py +290 -0
  28. pythonclaw/core/tools.py +622 -0
  29. pythonclaw/core/utils.py +64 -0
  30. pythonclaw/daemon.py +221 -0
  31. pythonclaw/init.py +61 -0
  32. pythonclaw/main.py +489 -0
  33. pythonclaw/onboard.py +290 -0
  34. pythonclaw/scheduler/cron.py +310 -0
  35. pythonclaw/scheduler/heartbeat.py +178 -0
  36. pythonclaw/server.py +145 -0
  37. pythonclaw/session_manager.py +104 -0
  38. pythonclaw/templates/persona/demo_persona.md +2 -0
  39. pythonclaw/templates/skills/communication/CATEGORY.md +4 -0
  40. pythonclaw/templates/skills/communication/email/SKILL.md +54 -0
  41. pythonclaw/templates/skills/communication/email/__pycache__/send_email.cpython-311.pyc +0 -0
  42. pythonclaw/templates/skills/communication/email/send_email.py +88 -0
  43. pythonclaw/templates/skills/data/CATEGORY.md +4 -0
  44. pythonclaw/templates/skills/data/csv_analyzer/SKILL.md +51 -0
  45. pythonclaw/templates/skills/data/csv_analyzer/__pycache__/analyze.cpython-311.pyc +0 -0
  46. pythonclaw/templates/skills/data/csv_analyzer/analyze.py +138 -0
  47. pythonclaw/templates/skills/data/finance/SKILL.md +41 -0
  48. pythonclaw/templates/skills/data/finance/__pycache__/fetch_quote.cpython-311.pyc +0 -0
  49. pythonclaw/templates/skills/data/finance/fetch_quote.py +118 -0
  50. pythonclaw/templates/skills/data/news/SKILL.md +39 -0
  51. pythonclaw/templates/skills/data/news/__pycache__/search_news.cpython-311.pyc +0 -0
  52. pythonclaw/templates/skills/data/news/search_news.py +57 -0
  53. pythonclaw/templates/skills/data/pdf_reader/SKILL.md +40 -0
  54. pythonclaw/templates/skills/data/pdf_reader/__pycache__/read_pdf.cpython-311.pyc +0 -0
  55. pythonclaw/templates/skills/data/pdf_reader/read_pdf.py +113 -0
  56. pythonclaw/templates/skills/data/scraper/SKILL.md +39 -0
  57. pythonclaw/templates/skills/data/scraper/__pycache__/scrape.cpython-311.pyc +0 -0
  58. pythonclaw/templates/skills/data/scraper/scrape.py +92 -0
  59. pythonclaw/templates/skills/data/weather/SKILL.md +42 -0
  60. pythonclaw/templates/skills/data/weather/__pycache__/weather.cpython-311.pyc +0 -0
  61. pythonclaw/templates/skills/data/weather/weather.py +142 -0
  62. pythonclaw/templates/skills/data/youtube/SKILL.md +43 -0
  63. pythonclaw/templates/skills/data/youtube/__pycache__/youtube_info.cpython-311.pyc +0 -0
  64. pythonclaw/templates/skills/data/youtube/youtube_info.py +167 -0
  65. pythonclaw/templates/skills/dev/CATEGORY.md +4 -0
  66. pythonclaw/templates/skills/dev/code_runner/SKILL.md +46 -0
  67. pythonclaw/templates/skills/dev/code_runner/__pycache__/run_code.cpython-311.pyc +0 -0
  68. pythonclaw/templates/skills/dev/code_runner/run_code.py +117 -0
  69. pythonclaw/templates/skills/dev/github/SKILL.md +52 -0
  70. pythonclaw/templates/skills/dev/github/__pycache__/gh.cpython-311.pyc +0 -0
  71. pythonclaw/templates/skills/dev/github/gh.py +165 -0
  72. pythonclaw/templates/skills/dev/http_request/SKILL.md +40 -0
  73. pythonclaw/templates/skills/dev/http_request/__pycache__/request.cpython-311.pyc +0 -0
  74. pythonclaw/templates/skills/dev/http_request/request.py +90 -0
  75. pythonclaw/templates/skills/google/CATEGORY.md +4 -0
  76. pythonclaw/templates/skills/google/workspace/SKILL.md +98 -0
  77. pythonclaw/templates/skills/google/workspace/check_setup.sh +52 -0
  78. pythonclaw/templates/skills/meta/CATEGORY.md +4 -0
  79. pythonclaw/templates/skills/meta/skill_creator/SKILL.md +151 -0
  80. pythonclaw/templates/skills/system/CATEGORY.md +4 -0
  81. pythonclaw/templates/skills/system/change_persona/SKILL.md +41 -0
  82. pythonclaw/templates/skills/system/change_setting/SKILL.md +65 -0
  83. pythonclaw/templates/skills/system/change_setting/__pycache__/update_config.cpython-311.pyc +0 -0
  84. pythonclaw/templates/skills/system/change_setting/update_config.py +129 -0
  85. pythonclaw/templates/skills/system/change_soul/SKILL.md +41 -0
  86. pythonclaw/templates/skills/system/onboarding/SKILL.md +63 -0
  87. pythonclaw/templates/skills/system/onboarding/__pycache__/write_identity.cpython-311.pyc +0 -0
  88. pythonclaw/templates/skills/system/onboarding/write_identity.py +218 -0
  89. pythonclaw/templates/skills/system/random/SKILL.md +33 -0
  90. pythonclaw/templates/skills/system/random/__pycache__/random_util.cpython-311.pyc +0 -0
  91. pythonclaw/templates/skills/system/random/random_util.py +45 -0
  92. pythonclaw/templates/skills/system/time/SKILL.md +33 -0
  93. pythonclaw/templates/skills/system/time/__pycache__/time_util.cpython-311.pyc +0 -0
  94. pythonclaw/templates/skills/system/time/time_util.py +81 -0
  95. pythonclaw/templates/skills/text/CATEGORY.md +4 -0
  96. pythonclaw/templates/skills/text/translator/SKILL.md +47 -0
  97. pythonclaw/templates/skills/text/translator/__pycache__/translate.cpython-311.pyc +0 -0
  98. pythonclaw/templates/skills/text/translator/translate.py +66 -0
  99. pythonclaw/templates/skills/web/CATEGORY.md +4 -0
  100. pythonclaw/templates/skills/web/tavily/SKILL.md +61 -0
  101. pythonclaw/templates/soul/SOUL.md +54 -0
  102. pythonclaw/web/__init__.py +1 -0
  103. pythonclaw/web/app.py +585 -0
  104. pythonclaw/web/static/favicon.png +0 -0
  105. pythonclaw/web/static/index.html +1318 -0
  106. pythonclaw/web/static/logo.png +0 -0
  107. pythonclaw-0.2.0.dist-info/METADATA +410 -0
  108. pythonclaw-0.2.0.dist-info/RECORD +112 -0
  109. pythonclaw-0.2.0.dist-info/WHEEL +5 -0
  110. pythonclaw-0.2.0.dist-info/entry_points.txt +2 -0
  111. pythonclaw-0.2.0.dist-info/licenses/LICENSE +21 -0
  112. pythonclaw-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,120 @@
1
+ """
2
+ MemoryManager — long-term key-value memory with hybrid RAG recall.
3
+
4
+ Storage
5
+ -------
6
+ Memories are stored as Markdown files:
7
+ - MEMORY.md — curated long-term memory (latest value per key)
8
+ - YYYY-MM-DD.md — daily append-only log
9
+
10
+ When writing, both MEMORY.md and today's daily log are updated.
11
+ When reading, MEMORY.md is the source of truth (holds latest per key).
12
+ Conflict resolution: if the same key is written multiple times, the most
13
+ recent write wins (MEMORY.md is always overwritten with the latest value).
14
+
15
+ Recall
16
+ ------
17
+ When a specific query is given, the manager converts every memory entry into a
18
+ short "chunk" ("{key}: {value}") and runs hybrid sparse + dense retrieval to
19
+ return the most relevant ones. When the query is empty or "*", ALL memories
20
+ are returned (full-dump mode, used by compaction and legacy callers).
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+
27
+ from .storage import MemoryStorage
28
+ from ..retrieval.retriever import HybridRetriever
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ _DUMP_TRIGGERS = {"", "*", "all", "everything"}
33
+
34
+
35
class MemoryManager:
    """
    Long-term key-value memory with hybrid RAG recall.

    Parameters
    ----------
    memory_dir : directory containing MEMORY.md and the daily log files
        (defaults to ~/.ada/memory).
    use_dense : add embedding retrieval to recall (off by default — BM25
        alone is fast and sufficient for small memory corpora).
    """

    def __init__(
        self,
        memory_dir: str | None = None,
        use_dense: bool = False,
    ) -> None:
        import os

        if memory_dir is None:
            memory_dir = os.path.join(os.path.expanduser("~"), ".ada", "memory")

        self.storage = MemoryStorage(memory_dir)
        self._use_dense = use_dense

    # ── Core operations ──────────────────────────────────────────────────────

    def remember(self, content: str, key: str | None = None) -> str:
        """Persist *content* under *key*; a key is mandatory."""
        if not key:
            raise ValueError("Key is required for memory storage.")
        self.storage.set(key, content)
        return f"Memory stored: [{key}] = {content}"

    def recall(self, query: str, top_k: int = 10) -> str:
        """
        Return memories relevant to *query* as a bullet list.

        An empty query (or "*" / "all" / "everything") dumps every memory;
        anything else goes through hybrid BM25 (+ optional dense) retrieval
        and yields at most *top_k* entries.
        """
        memories = self.storage.list_all()
        if not memories:
            return "No memories found."

        # Full-dump mode for wildcard-style queries.
        if query.strip().lower() in _DUMP_TRIGGERS:
            return "\n".join(f"- {key}: {value}" for key, value in memories.items())

        # Smart retrieval: one short "key: value" chunk per memory entry.
        documents = [
            {"source": key, "content": f"{key}: {value}"}
            for key, value in memories.items()
        ]

        engine = HybridRetriever(
            provider=None,  # memory recall never uses an LLM re-ranker
            use_sparse=True,
            use_dense=self._use_dense,
            use_reranker=False,
        )
        engine.fit(documents)
        hits = engine.retrieve(query, top_k=top_k)

        if not hits:
            logger.debug("[MemoryManager] No RAG hits for '%s', returning all.", query)
            dump = "\n".join(f"- {key}: {value}" for key, value in memories.items())
            return "(No close match found; showing all memories)\n" + dump

        # Strip the "key: " prefix we prepended when building the corpus.
        return "\n".join(
            f"- {hit['source']}: {hit['content'].split(': ', 1)[-1]}" for hit in hits
        )

    def forget(self, key: str) -> str:
        """Delete the memory stored under *key*, if any."""
        if self.storage.get(key) is None:
            return f"Nothing found for: {key}"
        self.storage.delete(key)
        return f"Forgot: {key}"

    # ── Helpers used by compaction ───────────────────────────────────────────

    def list_all(self) -> dict:
        """Return the raw {key: value} dict (used by compaction.memory_flush)."""
        return self.storage.list_all()
@@ -0,0 +1,164 @@
1
+ """
2
+ Markdown-backed memory storage (inspired by OpenClaw).
3
+
4
+ Layout
5
+ ------
6
+ context/memory/MEMORY.md — curated long-term memory (latest value per key)
7
+ context/memory/YYYY-MM-DD.md — daily append-only log
8
+
9
+ Write flow
10
+ ----------
11
+ set(key, value) → append to today's daily log + upsert into MEMORY.md
12
+
13
+ Read flow
14
+ ---------
15
+ get(key) → read from MEMORY.md (always holds the latest)
16
+ list_all() → parse MEMORY.md and return {key: value}
17
+
18
+ Conflict resolution
19
+ -------------------
20
+ MEMORY.md always holds the latest value for each key. When set() is called,
21
+ it updates MEMORY.md with the new timestamp, so the most recent write wins.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import os
27
+ import re
28
+ from datetime import datetime
29
+ from typing import Any, Dict
30
+
31
+
32
_KEY_HEADER = re.compile(r"^## (.+)$", re.MULTILINE)
_UPDATED_LINE = re.compile(r"^> Updated: (.+)$", re.MULTILINE)


class MemoryStorage:
    """Markdown-backed key-value memory with append-only daily logs."""

    def __init__(self, memory_dir: str = "context/memory") -> None:
        self.memory_dir = memory_dir
        os.makedirs(memory_dir, exist_ok=True)
        self._memory_file = os.path.join(memory_dir, "MEMORY.md")
        # key → {"value": str, "updated": str}
        self.data: Dict[str, dict] = {}
        self._load()

    # ── Persistence ───────────────────────────────────────────────────────────

    def _load(self) -> None:
        """Populate self.data from MEMORY.md; missing/unreadable file → empty."""
        self.data = {}
        if not os.path.exists(self._memory_file):
            return
        try:
            with open(self._memory_file, "r", encoding="utf-8") as fh:
                raw = fh.read()
        except OSError:
            return
        self.data = self._parse_memory_md(raw)

    @staticmethod
    def _parse_memory_md(text: str) -> Dict[str, dict]:
        """
        Parse MEMORY.md text into {key: {"value": str, "updated": str}}.

        Each entry has the shape::

            ## key_name
            > Updated: 2026-02-23 15:30:00

            The actual value content here.
        """
        parsed: Dict[str, dict] = {}

        # Split just before every "## " heading; the file header section
        # ("# Long-Term Memory") has no key match and is skipped below.
        for block in re.split(r"(?=^## )", text, flags=re.MULTILINE):
            block = block.strip()
            if not block:
                continue
            header = _KEY_HEADER.match(block)
            if header is None:
                continue
            key = header.group(1).strip()

            stamp = _UPDATED_LINE.search(block)
            updated = stamp.group(1).strip() if stamp else ""

            # Value = everything after the heading, once the metadata line
            # and the leading blank lines have been consumed.
            body: list = []
            in_body = False
            for line in block.split("\n")[1:]:
                if not in_body and (line.startswith("> Updated:") or not line.strip()):
                    continue
                in_body = True
                body.append(line)

            parsed[key] = {"value": "\n".join(body).strip(), "updated": updated}

        return parsed

    def _save_memory_md(self) -> None:
        """Serialise self.data back out to MEMORY.md."""
        os.makedirs(os.path.dirname(self._memory_file) or ".", exist_ok=True)

        out = ["# Long-Term Memory\n"]
        for key, entry in self.data.items():
            out.extend(
                [
                    f"## {key}",
                    f"> Updated: {entry.get('updated', '')}",
                    "",
                    entry.get("value", ""),
                    "",
                ]
            )

        try:
            with open(self._memory_file, "w", encoding="utf-8") as fh:
                fh.write("\n".join(out))
        except OSError as e:
            print(f"Error saving MEMORY.md: {e}")

    def _append_daily_log(self, key: str, value: str) -> None:
        """Append one timestamped entry to today's YYYY-MM-DD.md log file."""
        today = datetime.now().strftime("%Y-%m-%d")
        path = os.path.join(self.memory_dir, f"{today}.md")
        clock = datetime.now().strftime("%H:%M:%S")

        fresh = not os.path.exists(path)
        try:
            with open(path, "a", encoding="utf-8") as fh:
                if fresh:
                    fh.write(f"# Daily Memory — {today}\n\n")
                fh.write(f"### {clock} — {key}\n\n{value}\n\n")
        except OSError as e:
            print(f"Error writing daily memory log: {e}")

    # ── Public API ────────────────────────────────────────────────────────────

    def get(self, key: str) -> Any:
        """Return the stored value for *key*, or None when absent."""
        entry = self.data.get(key)
        return None if entry is None else entry["value"]

    def set(self, key: str, value: Any) -> None:
        """Upsert *key* in MEMORY.md and mirror the write into today's log."""
        text = str(value)
        self.data[key] = {
            "value": text,
            "updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        self._save_memory_md()
        self._append_daily_log(key, text)

    def delete(self, key: str) -> None:
        """Remove *key* (no-op when absent) and persist the change."""
        if self.data.pop(key, None) is not None:
            self._save_memory_md()

    def list_all(self) -> Dict[str, Any]:
        """Return a plain {key: value} snapshot of the latest entries."""
        return {key: entry["value"] for key, entry in self.data.items()}
@@ -0,0 +1,103 @@
1
+ """
2
+ PersistentAgent — an Agent subclass that automatically saves its message
3
+ history to a SessionStore after every chat() or compact() call.
4
+
5
+ On construction it restores the previous conversation from the store so that
6
+ sessions survive server restarts.
7
+
8
+ Restoration strategy
9
+ --------------------
10
+ messages[0] — always rebuilt fresh by Agent.__init__ (soul + persona + skills)
11
+ messages[1:] — restored from the Markdown session store
12
+
13
+ This means soul/persona/skill changes take effect on the next restart while
14
+ the full conversation history (including compaction summaries and skill
15
+ injection messages) is preserved.
16
+
17
+ Timestamps
18
+ ----------
19
+ Each message carries a ``_ts`` field (ISO 8601) that records when it was
20
+ created. This enables time-based truncation in the SessionStore.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+ import re
27
+ from datetime import datetime
28
+ from typing import TYPE_CHECKING
29
+
30
+ from .agent import Agent
31
+
32
+ if TYPE_CHECKING:
33
+ from .session_store import SessionStore
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
class PersistentAgent(Agent):
    """Agent that auto-saves to and restores from a Markdown SessionStore."""

    def __init__(
        self,
        *args,
        store: "SessionStore",
        session_id: str,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self._store = store
        self._session_id = session_id
        self._restore()

    # ── Restore ──────────────────────────────────────────────────────────────

    def _restore(self) -> None:
        """Splice previously saved messages after the freshly built system prompt."""
        history = self._store.load(self._session_id)
        if not history:
            return

        # messages[0] was just rebuilt by Agent.__init__ (soul/persona/skills);
        # only the conversation tail comes from the store.
        self.messages = [self.messages[0], *history]

        # Re-derive the set of activated skills so _use_skill won't re-inject
        # them; both the old "Skill Enabled:" and new "SKILL ACTIVATED:"
        # marker formats are recognised.
        for message in history:
            if message.get("role") != "system":
                continue
            match = re.search(
                r"(?:Skill Enabled|SKILL ACTIVATED):\s*(.+)",
                message.get("content", ""),
            )
            if match:
                self.loaded_skill_names.add(match.group(1).strip().rstrip("]"))

        logger.info(
            "[PersistentAgent] Restored session '%s': %d messages, %d skills",
            self._session_id, len(history), len(self.loaded_skill_names),
        )

    # ── Timestamp injection ──────────────────────────────────────────────────

    @staticmethod
    def _ensure_ts(msg: dict) -> dict:
        """Stamp *msg* with an ISO-8601 ``_ts`` field if it lacks one."""
        msg.setdefault("_ts", datetime.now().isoformat(timespec="seconds"))
        return msg

    # ── Auto-save ────────────────────────────────────────────────────────────

    def _save(self) -> None:
        """Persist everything after the system prompt, timestamping as needed."""
        for message in self.messages[1:]:
            self._ensure_ts(message)
        self._store.save(self._session_id, self.messages)

    def chat(self, user_input: str) -> str:
        reply = super().chat(user_input)
        self._save()
        return reply

    def compact(self, instruction: str | None = None) -> str:
        summary = super().compact(instruction)
        self._save()
        return summary
@@ -0,0 +1,6 @@
1
+ """Hybrid retrieval pipeline: BM25 + dense embeddings + RRF fusion."""
2
+
3
+ from .retriever import HybridRetriever
4
+ from .chunker import chunk_text, load_corpus_from_directory
5
+
6
+ __all__ = ["HybridRetriever", "chunk_text", "load_corpus_from_directory"]
@@ -0,0 +1,78 @@
1
+ """
2
+ Text chunking utilities.
3
+
4
+ Strategy
5
+ --------
6
+ 1. Split document by paragraphs (double newline).
7
+ 2. Any paragraph longer than `max_chars` is further split with a sliding
8
+ window of size `chunk_size` and overlap `overlap`.
9
+ 3. Each chunk carries metadata: source filename, chunk index, character offset.
10
+
11
+ Supported file extensions: .txt .md
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import re
18
+
19
+
20
def chunk_text(
    text: str,
    source: str = "",
    chunk_size: int = 400,
    overlap: int = 80,
) -> list[dict]:
    """
    Split *text* into retrieval-sized chunks.

    Paragraphs (separated by blank lines) are the primary unit; any paragraph
    longer than *chunk_size* characters is further split with a sliding window
    of *chunk_size* characters advancing by ``chunk_size - overlap``.

    Parameters
    ----------
    text : document to split.
    source : identifier (e.g. filename) copied into each chunk's metadata.
    chunk_size : maximum chunk length in characters.
    overlap : characters shared between consecutive windows; must be smaller
        than *chunk_size*.

    Returns
    -------
    List of ``{"source": str, "content": str, "chunk_idx": int}`` dicts.

    Raises
    ------
    ValueError : if *overlap* >= *chunk_size* — previously this made the
        window step non-positive and the loop below never terminated.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap

    # Split by blank lines (paragraph boundaries); drop empty fragments.
    paragraphs = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]

    chunks: list[dict] = []
    idx = 0

    for para in paragraphs:
        if len(para) <= chunk_size:
            chunks.append({"source": source, "content": para, "chunk_idx": idx})
            idx += 1
            continue

        # Sliding-window split for long paragraphs.
        start = 0
        while start < len(para):
            end = min(start + chunk_size, len(para))
            window = para[start:end].strip()
            if window:
                chunks.append({"source": source, "content": window, "chunk_idx": idx})
                idx += 1
            if end == len(para):
                break
            start += step

    return chunks
57
+
58
+
59
def load_corpus_from_directory(directory: str) -> list[dict]:
    """
    Chunk every ``.txt`` / ``.md`` file in *directory* into one flat corpus.

    Files are processed in sorted name order; unreadable files are reported
    and skipped. A missing directory yields an empty list.
    """
    if not os.path.isdir(directory):
        return []

    corpus: list[dict] = []
    for name in sorted(os.listdir(directory)):
        if not name.lower().endswith((".txt", ".md")):
            continue
        filepath = os.path.join(directory, name)
        try:
            with open(filepath, "r", encoding="utf-8") as fh:
                contents = fh.read()
            corpus.extend(chunk_text(contents, source=name))
        except OSError as exc:
            print(f"[Chunker] Could not read '{filepath}': {exc}")

    return corpus
@@ -0,0 +1,152 @@
1
+ """
2
+ Dense retriever — semantic embedding similarity.
3
+
4
+ Priority order
5
+ --------------
6
+ 1. sentence-transformers (neural embeddings, best semantic quality)
7
+ pip install sentence-transformers
8
+
9
+ 2. scikit-learn TF-IDF + cosine similarity (no GPU, still beats pure BM25 for
10
+ paraphrase/synonym queries)
11
+ pip install scikit-learn numpy
12
+
13
+ 3. Pure-Python fallback: character bigram Jaccard similarity (always works,
14
+ poor quality — install one of the above for production use).
15
+
16
+ All three expose the same interface:
17
+ fit(corpus) → retrieve(query, top_k) → [(score, chunk), ...]
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ # ── Availability probes ──────────────────────────────────────────────────────
23
+
24
+ try:
25
+ from sentence_transformers import SentenceTransformer # type: ignore
26
+ import numpy as np
27
+ _HAS_ST = True
28
+ except ImportError:
29
+ _HAS_ST = False
30
+
31
+ try:
32
+ from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore
33
+ from sklearn.metrics.pairwise import cosine_similarity as _sklearn_cos # type: ignore
34
+ import numpy as np # type: ignore
35
+ _HAS_SKLEARN = True
36
+ except ImportError:
37
+ _HAS_SKLEARN = False
38
+
39
+
40
+ # ── Helpers ──────────────────────────────────────────────────────────────────
41
+
42
def _char_bigrams(text: str) -> set[str]:
    """Return the set of lowercase character bigrams of *text*."""
    lowered = text.lower()
    return {lowered[i:i + 2] for i in range(len(lowered) - 1)}
45
+
46
+
47
def _jaccard(a: set, b: set) -> float:
    """Jaccard similarity |a∩b| / |a∪b|; 0.0 when either set is empty."""
    if a and b:
        return len(a & b) / len(a | b)
    return 0.0
51
+
52
+
53
+ # ── Backend implementations ─────────────────────────────────────────────────
54
+
55
class _SentenceTransformersBackend:
    """Dense backend using sentence-transformers neural embeddings."""

    def __init__(self, model_name: str) -> None:
        self._model = SentenceTransformer(model_name)
        self._embeddings: "np.ndarray | None" = None
        self._corpus: list[dict] = []

    def fit(self, corpus: list[dict]) -> None:
        """Encode *corpus* contents into an (n_chunks, dim) embedding matrix."""
        self._corpus = corpus
        texts = [c["content"] for c in corpus]
        # Force unit-length vectors so the dot products in retrieve() are true
        # cosine similarities. encode() does NOT L2-normalise by default —
        # normalisation only happens when the chosen model ships a Normalize
        # module — so the previous raw dot product was not guaranteed cosine.
        self._embeddings = self._model.encode(
            texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

    def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
        """Return up to *top_k* (cosine_score, chunk) pairs with score > 0."""
        if self._embeddings is None or not self._corpus:
            return []
        q_emb = self._model.encode(
            [query], convert_to_numpy=True, normalize_embeddings=True
        )
        sims = (self._embeddings @ q_emb.T).flatten()
        ranked = sorted(
            zip(sims.tolist(), self._corpus), key=lambda x: x[0], reverse=True
        )
        return [(float(s), c) for s, c in ranked[:top_k] if s > 0]
76
+
77
+
78
class _TfidfBackend:
    """Fallback backend: TF-IDF vectors + cosine similarity (scikit-learn)."""

    def __init__(self) -> None:
        self._vec: "TfidfVectorizer | None" = None
        self._matrix = None
        self._corpus: list[dict] = []

    def fit(self, corpus: list[dict]) -> None:
        """Vectorise *corpus*; an empty corpus resets the backend."""
        self._corpus = corpus
        if not corpus:
            self._vec = None
            self._matrix = None
            return
        self._vec = TfidfVectorizer(analyzer="word", min_df=1, stop_words=None)
        self._matrix = self._vec.fit_transform([c["content"] for c in corpus])

    def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
        """Return up to *top_k* (cosine_score, chunk) pairs with score > 0."""
        if self._vec is None or not self._corpus:
            return []
        sims = _sklearn_cos(self._vec.transform([query]), self._matrix).flatten()
        ordered = sorted(
            zip(sims.tolist(), self._corpus), key=lambda pair: pair[0], reverse=True
        )
        return [(float(score), chunk) for score, chunk in ordered[:top_k] if score > 0]
103
+
104
+
105
class _BigramBackend:
    """Last-resort pure-Python backend: character-bigram Jaccard similarity."""

    def __init__(self) -> None:
        self._bigrams: list[set] = []
        self._corpus: list[dict] = []

    def fit(self, corpus: list[dict]) -> None:
        """Precompute the bigram set of every chunk in *corpus*."""
        self._corpus = corpus
        self._bigrams = [_char_bigrams(chunk["content"]) for chunk in corpus]

    def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
        """Return up to *top_k* (jaccard_score, chunk) pairs with score > 0."""
        query_bigrams = _char_bigrams(query)
        scored = sorted(
            (
                (_jaccard(query_bigrams, bigrams), chunk)
                for bigrams, chunk in zip(self._bigrams, self._corpus)
            ),
            key=lambda pair: pair[0],
            reverse=True,
        )
        return [(score, chunk) for score, chunk in scored[:top_k] if score > 0]
122
+
123
+
124
+ # ── Public class ─────────────────────────────────────────────────────────────
125
+
126
class EmbeddingRetriever:
    """
    Unified dense retriever that auto-selects the best installed backend:
    sentence-transformers → scikit-learn TF-IDF → pure-Python bigram Jaccard.

    Parameters
    ----------
    model_name : sentence-transformers model to load (ignored unless the
        sentence-transformers package is installed).
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        if _HAS_ST:
            backend = _SentenceTransformersBackend(model_name)
            label = f"sentence-transformers({model_name})"
        elif _HAS_SKLEARN:
            backend = _TfidfBackend()
            label = "sklearn-tfidf"
        else:
            backend = _BigramBackend()
            label = "bigram-jaccard"
        self._backend = backend
        self.backend_name = label

    def fit(self, corpus: list[dict]) -> None:
        """Index *corpus* (list of chunk dicts with a "content" key)."""
        self._backend.fit(corpus)

    def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
        """Return up to *top_k* (score, chunk) pairs, best first."""
        return self._backend.retrieve(query, top_k)
@@ -0,0 +1,51 @@
1
+ """
2
+ Reciprocal Rank Fusion (RRF).
3
+
4
+ RRF is a simple, parameter-free method for combining ranked lists from multiple
5
+ retrievers. For each document, its RRF score is:
6
+
7
+ rrf(d) = Σ 1 / (k + rank_i(d))
8
+ i
9
+
10
+ where rank_i(d) is d's 1-based rank in list i and k=60 is the standard constant.
11
+
12
+ Documents that appear in multiple lists get a boost; documents missing from a
13
+ list contribute 0 for that list.
14
+
15
+ Reference: Cormack et al. (2009) "Reciprocal rank fusion outperforms condorcet
16
+ and individual rank learning methods."
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from collections import defaultdict
22
+
23
+
24
def reciprocal_rank_fusion(
    ranked_lists: list[list[tuple[float, dict]]],
    k: int = 60,
) -> list[tuple[float, dict]]:
    """
    Fuse several ranked result lists with Reciprocal Rank Fusion.

    Each chunk's fused score is the sum over lists of ``1 / (k + rank)`` using
    1-based ranks; a chunk absent from a list contributes nothing for it, so
    chunks appearing in multiple lists get a boost.

    Parameters
    ----------
    ranked_lists : sub-lists of (score, chunk) pairs, each sorted best-first.
        Chunks are identified by their '_idx' field (set by
        HybridRetriever.fit()); ``id(chunk)`` is the fallback identity.
    k : smoothing constant (60, per Cormack et al. 2009).

    Returns
    -------
    Fused ``[(rrf_score, chunk), ...]`` sorted by descending rrf_score.
    """
    scores: dict[int, float] = defaultdict(float)
    chunk_for: dict[int, dict] = {}

    for result_list in ranked_lists:
        for position, (_, chunk) in enumerate(result_list, start=1):
            key = chunk.get("_idx", id(chunk))
            scores[key] += 1.0 / (k + position)
            chunk_for[key] = chunk

    return [
        (score, chunk_for[key])
        for key, score in sorted(scores.items(), key=lambda item: item[1], reverse=True)
    ]