pythonclaw 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. pythonclaw/__init__.py +17 -0
  2. pythonclaw/__main__.py +6 -0
  3. pythonclaw/channels/discord_bot.py +231 -0
  4. pythonclaw/channels/telegram_bot.py +236 -0
  5. pythonclaw/config.py +190 -0
  6. pythonclaw/core/__init__.py +25 -0
  7. pythonclaw/core/agent.py +773 -0
  8. pythonclaw/core/compaction.py +220 -0
  9. pythonclaw/core/knowledge/rag.py +93 -0
  10. pythonclaw/core/llm/anthropic_client.py +107 -0
  11. pythonclaw/core/llm/base.py +26 -0
  12. pythonclaw/core/llm/gemini_client.py +139 -0
  13. pythonclaw/core/llm/openai_compatible.py +39 -0
  14. pythonclaw/core/llm/response.py +57 -0
  15. pythonclaw/core/memory/manager.py +120 -0
  16. pythonclaw/core/memory/storage.py +164 -0
  17. pythonclaw/core/persistent_agent.py +103 -0
  18. pythonclaw/core/retrieval/__init__.py +6 -0
  19. pythonclaw/core/retrieval/chunker.py +78 -0
  20. pythonclaw/core/retrieval/dense.py +152 -0
  21. pythonclaw/core/retrieval/fusion.py +51 -0
  22. pythonclaw/core/retrieval/reranker.py +112 -0
  23. pythonclaw/core/retrieval/retriever.py +166 -0
  24. pythonclaw/core/retrieval/sparse.py +69 -0
  25. pythonclaw/core/session_store.py +269 -0
  26. pythonclaw/core/skill_loader.py +322 -0
  27. pythonclaw/core/skillhub.py +290 -0
  28. pythonclaw/core/tools.py +622 -0
  29. pythonclaw/core/utils.py +64 -0
  30. pythonclaw/daemon.py +221 -0
  31. pythonclaw/init.py +61 -0
  32. pythonclaw/main.py +489 -0
  33. pythonclaw/onboard.py +290 -0
  34. pythonclaw/scheduler/cron.py +310 -0
  35. pythonclaw/scheduler/heartbeat.py +178 -0
  36. pythonclaw/server.py +145 -0
  37. pythonclaw/session_manager.py +104 -0
  38. pythonclaw/templates/persona/demo_persona.md +2 -0
  39. pythonclaw/templates/skills/communication/CATEGORY.md +4 -0
  40. pythonclaw/templates/skills/communication/email/SKILL.md +54 -0
  41. pythonclaw/templates/skills/communication/email/__pycache__/send_email.cpython-311.pyc +0 -0
  42. pythonclaw/templates/skills/communication/email/send_email.py +88 -0
  43. pythonclaw/templates/skills/data/CATEGORY.md +4 -0
  44. pythonclaw/templates/skills/data/csv_analyzer/SKILL.md +51 -0
  45. pythonclaw/templates/skills/data/csv_analyzer/__pycache__/analyze.cpython-311.pyc +0 -0
  46. pythonclaw/templates/skills/data/csv_analyzer/analyze.py +138 -0
  47. pythonclaw/templates/skills/data/finance/SKILL.md +41 -0
  48. pythonclaw/templates/skills/data/finance/__pycache__/fetch_quote.cpython-311.pyc +0 -0
  49. pythonclaw/templates/skills/data/finance/fetch_quote.py +118 -0
  50. pythonclaw/templates/skills/data/news/SKILL.md +39 -0
  51. pythonclaw/templates/skills/data/news/__pycache__/search_news.cpython-311.pyc +0 -0
  52. pythonclaw/templates/skills/data/news/search_news.py +57 -0
  53. pythonclaw/templates/skills/data/pdf_reader/SKILL.md +40 -0
  54. pythonclaw/templates/skills/data/pdf_reader/__pycache__/read_pdf.cpython-311.pyc +0 -0
  55. pythonclaw/templates/skills/data/pdf_reader/read_pdf.py +113 -0
  56. pythonclaw/templates/skills/data/scraper/SKILL.md +39 -0
  57. pythonclaw/templates/skills/data/scraper/__pycache__/scrape.cpython-311.pyc +0 -0
  58. pythonclaw/templates/skills/data/scraper/scrape.py +92 -0
  59. pythonclaw/templates/skills/data/weather/SKILL.md +42 -0
  60. pythonclaw/templates/skills/data/weather/__pycache__/weather.cpython-311.pyc +0 -0
  61. pythonclaw/templates/skills/data/weather/weather.py +142 -0
  62. pythonclaw/templates/skills/data/youtube/SKILL.md +43 -0
  63. pythonclaw/templates/skills/data/youtube/__pycache__/youtube_info.cpython-311.pyc +0 -0
  64. pythonclaw/templates/skills/data/youtube/youtube_info.py +167 -0
  65. pythonclaw/templates/skills/dev/CATEGORY.md +4 -0
  66. pythonclaw/templates/skills/dev/code_runner/SKILL.md +46 -0
  67. pythonclaw/templates/skills/dev/code_runner/__pycache__/run_code.cpython-311.pyc +0 -0
  68. pythonclaw/templates/skills/dev/code_runner/run_code.py +117 -0
  69. pythonclaw/templates/skills/dev/github/SKILL.md +52 -0
  70. pythonclaw/templates/skills/dev/github/__pycache__/gh.cpython-311.pyc +0 -0
  71. pythonclaw/templates/skills/dev/github/gh.py +165 -0
  72. pythonclaw/templates/skills/dev/http_request/SKILL.md +40 -0
  73. pythonclaw/templates/skills/dev/http_request/__pycache__/request.cpython-311.pyc +0 -0
  74. pythonclaw/templates/skills/dev/http_request/request.py +90 -0
  75. pythonclaw/templates/skills/google/CATEGORY.md +4 -0
  76. pythonclaw/templates/skills/google/workspace/SKILL.md +98 -0
  77. pythonclaw/templates/skills/google/workspace/check_setup.sh +52 -0
  78. pythonclaw/templates/skills/meta/CATEGORY.md +4 -0
  79. pythonclaw/templates/skills/meta/skill_creator/SKILL.md +151 -0
  80. pythonclaw/templates/skills/system/CATEGORY.md +4 -0
  81. pythonclaw/templates/skills/system/change_persona/SKILL.md +41 -0
  82. pythonclaw/templates/skills/system/change_setting/SKILL.md +65 -0
  83. pythonclaw/templates/skills/system/change_setting/__pycache__/update_config.cpython-311.pyc +0 -0
  84. pythonclaw/templates/skills/system/change_setting/update_config.py +129 -0
  85. pythonclaw/templates/skills/system/change_soul/SKILL.md +41 -0
  86. pythonclaw/templates/skills/system/onboarding/SKILL.md +63 -0
  87. pythonclaw/templates/skills/system/onboarding/__pycache__/write_identity.cpython-311.pyc +0 -0
  88. pythonclaw/templates/skills/system/onboarding/write_identity.py +218 -0
  89. pythonclaw/templates/skills/system/random/SKILL.md +33 -0
  90. pythonclaw/templates/skills/system/random/__pycache__/random_util.cpython-311.pyc +0 -0
  91. pythonclaw/templates/skills/system/random/random_util.py +45 -0
  92. pythonclaw/templates/skills/system/time/SKILL.md +33 -0
  93. pythonclaw/templates/skills/system/time/__pycache__/time_util.cpython-311.pyc +0 -0
  94. pythonclaw/templates/skills/system/time/time_util.py +81 -0
  95. pythonclaw/templates/skills/text/CATEGORY.md +4 -0
  96. pythonclaw/templates/skills/text/translator/SKILL.md +47 -0
  97. pythonclaw/templates/skills/text/translator/__pycache__/translate.cpython-311.pyc +0 -0
  98. pythonclaw/templates/skills/text/translator/translate.py +66 -0
  99. pythonclaw/templates/skills/web/CATEGORY.md +4 -0
  100. pythonclaw/templates/skills/web/tavily/SKILL.md +61 -0
  101. pythonclaw/templates/soul/SOUL.md +54 -0
  102. pythonclaw/web/__init__.py +1 -0
  103. pythonclaw/web/app.py +585 -0
  104. pythonclaw/web/static/favicon.png +0 -0
  105. pythonclaw/web/static/index.html +1318 -0
  106. pythonclaw/web/static/logo.png +0 -0
  107. pythonclaw-0.2.0.dist-info/METADATA +410 -0
  108. pythonclaw-0.2.0.dist-info/RECORD +112 -0
  109. pythonclaw-0.2.0.dist-info/WHEEL +5 -0
  110. pythonclaw-0.2.0.dist-info/entry_points.txt +2 -0
  111. pythonclaw-0.2.0.dist-info/licenses/LICENSE +21 -0
  112. pythonclaw-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,120 @@
1
+ """
2
+ MemoryManager — long-term key-value memory with hybrid RAG recall.
3
+
4
+ Storage
5
+ -------
6
+ Memories are stored as Markdown files:
7
+ - MEMORY.md — curated long-term memory (latest value per key)
8
+ - YYYY-MM-DD.md — daily append-only log
9
+
10
+ When writing, both MEMORY.md and today's daily log are updated.
11
+ When reading, MEMORY.md is the source of truth (holds latest per key).
12
+ Conflict resolution: if the same key is written multiple times, the most
13
+ recent write wins (MEMORY.md is always overwritten with the latest value).
14
+
15
+ Recall
16
+ ------
17
+ When a specific query is given, the manager converts every memory entry into a
18
+ short "chunk" ("{key}: {value}") and runs hybrid sparse + dense retrieval to
19
+ return the most relevant ones. When the query is empty or "*", ALL memories
20
+ are returned (full-dump mode, used by compaction and legacy callers).
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+
27
+ from .storage import MemoryStorage
28
+ from ..retrieval.retriever import HybridRetriever
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ _DUMP_TRIGGERS = {"", "*", "all", "everything"}
33
+
34
+
35
class MemoryManager:
    """
    Long-term key-value memory with hybrid RAG recall.

    Parameters
    ----------
    memory_dir : directory containing MEMORY.md and the daily log files
        (defaults to ~/.ada/memory).
    use_dense : add embedding retrieval to recall (off by default — BM25
        alone is fast and sufficient for small memory corpora).
    """

    def __init__(
        self,
        memory_dir: str | None = None,
        use_dense: bool = False,
    ) -> None:
        import os

        if memory_dir is None:
            memory_dir = os.path.join(os.path.expanduser("~"), ".ada", "memory")

        self.storage = MemoryStorage(memory_dir)
        self._use_dense = use_dense

    # ── Core operations ──────────────────────────────────────────────────────

    def remember(self, content: str, key: str | None = None) -> str:
        """Persist *content* under *key*; a key is mandatory."""
        if not key:
            raise ValueError("Key is required for memory storage.")
        self.storage.set(key, content)
        return f"Memory stored: [{key}] = {content}"

    def recall(self, query: str, top_k: int = 10) -> str:
        """
        Return memories relevant to *query* as a bullet list.

        An empty query (or "*" / "all" / "everything") dumps every memory;
        anything else goes through hybrid BM25 (+ optional dense) retrieval
        and yields at most *top_k* entries.
        """
        memories = self.storage.list_all()
        if not memories:
            return "No memories found."

        # Full-dump mode for wildcard-style queries.
        if query.strip().lower() in _DUMP_TRIGGERS:
            return "\n".join(f"- {key}: {value}" for key, value in memories.items())

        # Smart retrieval: one short "key: value" chunk per memory entry.
        documents = [
            {"source": key, "content": f"{key}: {value}"}
            for key, value in memories.items()
        ]

        engine = HybridRetriever(
            provider=None,  # memory recall never uses an LLM re-ranker
            use_sparse=True,
            use_dense=self._use_dense,
            use_reranker=False,
        )
        engine.fit(documents)
        hits = engine.retrieve(query, top_k=top_k)

        if not hits:
            logger.debug("[MemoryManager] No RAG hits for '%s', returning all.", query)
            dump = "\n".join(f"- {key}: {value}" for key, value in memories.items())
            return "(No close match found; showing all memories)\n" + dump

        # Strip the "key: " prefix we prepended when building the corpus.
        return "\n".join(
            f"- {hit['source']}: {hit['content'].split(': ', 1)[-1]}" for hit in hits
        )

    def forget(self, key: str) -> str:
        """Delete the memory stored under *key*, if any."""
        if self.storage.get(key) is None:
            return f"Nothing found for: {key}"
        self.storage.delete(key)
        return f"Forgot: {key}"

    # ── Helpers used by compaction ───────────────────────────────────────────

    def list_all(self) -> dict:
        """Return the raw {key: value} dict (used by compaction.memory_flush)."""
        return self.storage.list_all()
@@ -0,0 +1,164 @@
1
+ """
2
+ Markdown-backed memory storage (inspired by OpenClaw).
3
+
4
+ Layout
5
+ ------
6
+ context/memory/MEMORY.md — curated long-term memory (latest value per key)
7
+ context/memory/YYYY-MM-DD.md — daily append-only log
8
+
9
+ Write flow
10
+ ----------
11
+ set(key, value) → append to today's daily log + upsert into MEMORY.md
12
+
13
+ Read flow
14
+ ---------
15
+ get(key) → read from MEMORY.md (always holds the latest)
16
+ list_all() → parse MEMORY.md and return {key: value}
17
+
18
+ Conflict resolution
19
+ -------------------
20
+ MEMORY.md always holds the latest value for each key. When set() is called,
21
+ it updates MEMORY.md with the new timestamp, so the most recent write wins.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import os
27
+ import re
28
+ from datetime import datetime
29
+ from typing import Any, Dict
30
+
31
+
32
_KEY_HEADER = re.compile(r"^## (.+)$", re.MULTILINE)
_UPDATED_LINE = re.compile(r"^> Updated: (.+)$", re.MULTILINE)


class MemoryStorage:
    """Markdown-backed key-value memory with append-only daily logs."""

    def __init__(self, memory_dir: str = "context/memory") -> None:
        self.memory_dir = memory_dir
        os.makedirs(memory_dir, exist_ok=True)
        self._memory_file = os.path.join(memory_dir, "MEMORY.md")
        # key → {"value": str, "updated": str}
        self.data: Dict[str, dict] = {}
        self._load()

    # ── Persistence ───────────────────────────────────────────────────────────

    def _load(self) -> None:
        """Populate self.data from MEMORY.md; missing/unreadable file → empty."""
        self.data = {}
        if not os.path.exists(self._memory_file):
            return
        try:
            with open(self._memory_file, "r", encoding="utf-8") as fh:
                raw = fh.read()
        except OSError:
            return
        self.data = self._parse_memory_md(raw)

    @staticmethod
    def _parse_memory_md(text: str) -> Dict[str, dict]:
        """
        Parse MEMORY.md text into {key: {"value": str, "updated": str}}.

        Each entry has the shape::

            ## key_name
            > Updated: 2026-02-23 15:30:00

            The actual value content here.
        """
        parsed: Dict[str, dict] = {}

        # Split just before every "## " heading; the file header section
        # ("# Long-Term Memory") has no key match and is skipped below.
        for block in re.split(r"(?=^## )", text, flags=re.MULTILINE):
            block = block.strip()
            if not block:
                continue
            header = _KEY_HEADER.match(block)
            if header is None:
                continue
            key = header.group(1).strip()

            stamp = _UPDATED_LINE.search(block)
            updated = stamp.group(1).strip() if stamp else ""

            # Value = everything after the heading, once the metadata line
            # and the leading blank lines have been consumed.
            body: list = []
            in_body = False
            for line in block.split("\n")[1:]:
                if not in_body and (line.startswith("> Updated:") or not line.strip()):
                    continue
                in_body = True
                body.append(line)

            parsed[key] = {"value": "\n".join(body).strip(), "updated": updated}

        return parsed

    def _save_memory_md(self) -> None:
        """Serialise self.data back out to MEMORY.md."""
        os.makedirs(os.path.dirname(self._memory_file) or ".", exist_ok=True)

        out = ["# Long-Term Memory\n"]
        for key, entry in self.data.items():
            out.extend(
                [
                    f"## {key}",
                    f"> Updated: {entry.get('updated', '')}",
                    "",
                    entry.get("value", ""),
                    "",
                ]
            )

        try:
            with open(self._memory_file, "w", encoding="utf-8") as fh:
                fh.write("\n".join(out))
        except OSError as e:
            print(f"Error saving MEMORY.md: {e}")

    def _append_daily_log(self, key: str, value: str) -> None:
        """Append one timestamped entry to today's YYYY-MM-DD.md log file."""
        today = datetime.now().strftime("%Y-%m-%d")
        path = os.path.join(self.memory_dir, f"{today}.md")
        clock = datetime.now().strftime("%H:%M:%S")

        fresh = not os.path.exists(path)
        try:
            with open(path, "a", encoding="utf-8") as fh:
                if fresh:
                    fh.write(f"# Daily Memory — {today}\n\n")
                fh.write(f"### {clock} — {key}\n\n{value}\n\n")
        except OSError as e:
            print(f"Error writing daily memory log: {e}")

    # ── Public API ────────────────────────────────────────────────────────────

    def get(self, key: str) -> Any:
        """Return the stored value for *key*, or None when absent."""
        entry = self.data.get(key)
        return None if entry is None else entry["value"]

    def set(self, key: str, value: Any) -> None:
        """Upsert *key* in MEMORY.md and mirror the write into today's log."""
        text = str(value)
        self.data[key] = {
            "value": text,
            "updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        self._save_memory_md()
        self._append_daily_log(key, text)

    def delete(self, key: str) -> None:
        """Remove *key* (no-op when absent) and persist the change."""
        if self.data.pop(key, None) is not None:
            self._save_memory_md()

    def list_all(self) -> Dict[str, Any]:
        """Return a plain {key: value} snapshot of the latest entries."""
        return {key: entry["value"] for key, entry in self.data.items()}
@@ -0,0 +1,103 @@
1
+ """
2
+ PersistentAgent — an Agent subclass that automatically saves its message
3
+ history to a SessionStore after every chat() or compact() call.
4
+
5
+ On construction it restores the previous conversation from the store so that
6
+ sessions survive server restarts.
7
+
8
+ Restoration strategy
9
+ --------------------
10
+ messages[0] — always rebuilt fresh by Agent.__init__ (soul + persona + skills)
11
+ messages[1:] — restored from the Markdown session store
12
+
13
+ This means soul/persona/skill changes take effect on the next restart while
14
+ the full conversation history (including compaction summaries and skill
15
+ injection messages) is preserved.
16
+
17
+ Timestamps
18
+ ----------
19
+ Each message carries a ``_ts`` field (ISO 8601) that records when it was
20
+ created. This enables time-based truncation in the SessionStore.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+ import re
27
+ from datetime import datetime
28
+ from typing import TYPE_CHECKING
29
+
30
+ from .agent import Agent
31
+
32
+ if TYPE_CHECKING:
33
+ from .session_store import SessionStore
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
class PersistentAgent(Agent):
    """Agent that auto-saves to and restores from a Markdown SessionStore."""

    def __init__(
        self,
        *args,
        store: "SessionStore",
        session_id: str,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self._store = store
        self._session_id = session_id
        self._restore()

    # ── Restore ──────────────────────────────────────────────────────────────

    def _restore(self) -> None:
        """Splice previously saved messages after the freshly built system prompt."""
        history = self._store.load(self._session_id)
        if not history:
            return

        # messages[0] was just rebuilt by Agent.__init__ (soul/persona/skills);
        # only the conversation tail comes from the store.
        self.messages = [self.messages[0], *history]

        # Re-derive the set of activated skills so _use_skill won't re-inject
        # them; both the old "Skill Enabled:" and new "SKILL ACTIVATED:"
        # marker formats are recognised.
        for message in history:
            if message.get("role") != "system":
                continue
            match = re.search(
                r"(?:Skill Enabled|SKILL ACTIVATED):\s*(.+)",
                message.get("content", ""),
            )
            if match:
                self.loaded_skill_names.add(match.group(1).strip().rstrip("]"))

        logger.info(
            "[PersistentAgent] Restored session '%s': %d messages, %d skills",
            self._session_id, len(history), len(self.loaded_skill_names),
        )

    # ── Timestamp injection ──────────────────────────────────────────────────

    @staticmethod
    def _ensure_ts(msg: dict) -> dict:
        """Stamp *msg* with an ISO-8601 ``_ts`` field if it lacks one."""
        msg.setdefault("_ts", datetime.now().isoformat(timespec="seconds"))
        return msg

    # ── Auto-save ────────────────────────────────────────────────────────────

    def _save(self) -> None:
        """Persist everything after the system prompt, timestamping as needed."""
        for message in self.messages[1:]:
            self._ensure_ts(message)
        self._store.save(self._session_id, self.messages)

    def chat(self, user_input: str) -> str:
        reply = super().chat(user_input)
        self._save()
        return reply

    def compact(self, instruction: str | None = None) -> str:
        summary = super().compact(instruction)
        self._save()
        return summary
@@ -0,0 +1,6 @@
1
+ """Hybrid retrieval pipeline: BM25 + dense embeddings + RRF fusion."""
2
+
3
+ from .retriever import HybridRetriever
4
+ from .chunker import chunk_text, load_corpus_from_directory
5
+
6
+ __all__ = ["HybridRetriever", "chunk_text", "load_corpus_from_directory"]
@@ -0,0 +1,78 @@
1
+ """
2
+ Text chunking utilities.
3
+
4
+ Strategy
5
+ --------
6
+ 1. Split document by paragraphs (double newline).
7
+ 2. Any paragraph longer than `max_chars` is further split with a sliding
8
+ window of size `chunk_size` and overlap `overlap`.
9
+ 3. Each chunk carries metadata: source filename, chunk index, character offset.
10
+
11
+ Supported file extensions: .txt .md
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import re
18
+
19
+
20
def chunk_text(
    text: str,
    source: str = "",
    chunk_size: int = 400,
    overlap: int = 80,
) -> list[dict]:
    """
    Split *text* into retrieval-sized chunks.

    Paragraphs (separated by blank lines) are the primary unit; any paragraph
    longer than *chunk_size* characters is further split with a sliding window
    of *chunk_size* characters advancing by ``chunk_size - overlap``.

    Parameters
    ----------
    text : document to split.
    source : identifier (e.g. filename) copied into each chunk's metadata.
    chunk_size : maximum chunk length in characters.
    overlap : characters shared between consecutive windows; must be smaller
        than *chunk_size*.

    Returns
    -------
    List of ``{"source": str, "content": str, "chunk_idx": int}`` dicts.

    Raises
    ------
    ValueError : if *overlap* >= *chunk_size* — previously this made the
        window step non-positive and the loop below never terminated.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap

    # Split by blank lines (paragraph boundaries); drop empty fragments.
    paragraphs = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]

    chunks: list[dict] = []
    idx = 0

    for para in paragraphs:
        if len(para) <= chunk_size:
            chunks.append({"source": source, "content": para, "chunk_idx": idx})
            idx += 1
            continue

        # Sliding-window split for long paragraphs.
        start = 0
        while start < len(para):
            end = min(start + chunk_size, len(para))
            window = para[start:end].strip()
            if window:
                chunks.append({"source": source, "content": window, "chunk_idx": idx})
                idx += 1
            if end == len(para):
                break
            start += step

    return chunks
57
+
58
+
59
def load_corpus_from_directory(directory: str) -> list[dict]:
    """
    Chunk every ``.txt`` / ``.md`` file in *directory* into one flat corpus.

    Files are processed in sorted name order; unreadable files are reported
    and skipped. A missing directory yields an empty list.
    """
    if not os.path.isdir(directory):
        return []

    corpus: list[dict] = []
    for name in sorted(os.listdir(directory)):
        if not name.lower().endswith((".txt", ".md")):
            continue
        filepath = os.path.join(directory, name)
        try:
            with open(filepath, "r", encoding="utf-8") as fh:
                contents = fh.read()
            corpus.extend(chunk_text(contents, source=name))
        except OSError as exc:
            print(f"[Chunker] Could not read '{filepath}': {exc}")

    return corpus
@@ -0,0 +1,152 @@
1
+ """
2
+ Dense retriever — semantic embedding similarity.
3
+
4
+ Priority order
5
+ --------------
6
+ 1. sentence-transformers (neural embeddings, best semantic quality)
7
+ pip install sentence-transformers
8
+
9
+ 2. scikit-learn TF-IDF + cosine similarity (no GPU, still beats pure BM25 for
10
+ paraphrase/synonym queries)
11
+ pip install scikit-learn numpy
12
+
13
+ 3. Pure-Python fallback: character bigram Jaccard similarity (always works,
14
+ poor quality — install one of the above for production use).
15
+
16
+ All three expose the same interface:
17
+ fit(corpus) → retrieve(query, top_k) → [(score, chunk), ...]
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ # ── Availability probes ──────────────────────────────────────────────────────
23
+
24
+ try:
25
+ from sentence_transformers import SentenceTransformer # type: ignore
26
+ import numpy as np
27
+ _HAS_ST = True
28
+ except ImportError:
29
+ _HAS_ST = False
30
+
31
+ try:
32
+ from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore
33
+ from sklearn.metrics.pairwise import cosine_similarity as _sklearn_cos # type: ignore
34
+ import numpy as np # type: ignore
35
+ _HAS_SKLEARN = True
36
+ except ImportError:
37
+ _HAS_SKLEARN = False
38
+
39
+
40
+ # ── Helpers ──────────────────────────────────────────────────────────────────
41
+
42
def _char_bigrams(text: str) -> set[str]:
    """Return the set of lowercase character bigrams of *text*."""
    lowered = text.lower()
    return {lowered[i:i + 2] for i in range(len(lowered) - 1)}
45
+
46
+
47
def _jaccard(a: set, b: set) -> float:
    """Jaccard similarity |a∩b| / |a∪b|; 0.0 when either set is empty."""
    if a and b:
        return len(a & b) / len(a | b)
    return 0.0
51
+
52
+
53
+ # ── Backend implementations ─────────────────────────────────────────────────
54
+
55
class _SentenceTransformersBackend:
    """Dense backend using sentence-transformers neural embeddings."""

    def __init__(self, model_name: str) -> None:
        self._model = SentenceTransformer(model_name)
        self._embeddings: "np.ndarray | None" = None
        self._corpus: list[dict] = []

    def fit(self, corpus: list[dict]) -> None:
        """Encode *corpus* contents into an (n_chunks, dim) embedding matrix."""
        self._corpus = corpus
        texts = [c["content"] for c in corpus]
        # Force unit-length vectors so the dot products in retrieve() are true
        # cosine similarities. encode() does NOT L2-normalise by default —
        # normalisation only happens when the chosen model ships a Normalize
        # module — so the previous raw dot product was not guaranteed cosine.
        self._embeddings = self._model.encode(
            texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

    def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
        """Return up to *top_k* (cosine_score, chunk) pairs with score > 0."""
        if self._embeddings is None or not self._corpus:
            return []
        q_emb = self._model.encode(
            [query], convert_to_numpy=True, normalize_embeddings=True
        )
        sims = (self._embeddings @ q_emb.T).flatten()
        ranked = sorted(
            zip(sims.tolist(), self._corpus), key=lambda x: x[0], reverse=True
        )
        return [(float(s), c) for s, c in ranked[:top_k] if s > 0]
76
+
77
+
78
class _TfidfBackend:
    """Fallback backend: TF-IDF vectors + cosine similarity (scikit-learn)."""

    def __init__(self) -> None:
        self._vec: "TfidfVectorizer | None" = None
        self._matrix = None
        self._corpus: list[dict] = []

    def fit(self, corpus: list[dict]) -> None:
        """Vectorise *corpus*; an empty corpus resets the backend."""
        self._corpus = corpus
        if not corpus:
            self._vec = None
            self._matrix = None
            return
        self._vec = TfidfVectorizer(analyzer="word", min_df=1, stop_words=None)
        self._matrix = self._vec.fit_transform([c["content"] for c in corpus])

    def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
        """Return up to *top_k* (cosine_score, chunk) pairs with score > 0."""
        if self._vec is None or not self._corpus:
            return []
        sims = _sklearn_cos(self._vec.transform([query]), self._matrix).flatten()
        ordered = sorted(
            zip(sims.tolist(), self._corpus), key=lambda pair: pair[0], reverse=True
        )
        return [(float(score), chunk) for score, chunk in ordered[:top_k] if score > 0]
103
+
104
+
105
class _BigramBackend:
    """Last-resort pure-Python backend: character-bigram Jaccard similarity."""

    def __init__(self) -> None:
        self._bigrams: list[set] = []
        self._corpus: list[dict] = []

    def fit(self, corpus: list[dict]) -> None:
        """Precompute the bigram set of every chunk in *corpus*."""
        self._corpus = corpus
        self._bigrams = [_char_bigrams(chunk["content"]) for chunk in corpus]

    def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
        """Return up to *top_k* (jaccard_score, chunk) pairs with score > 0."""
        query_bigrams = _char_bigrams(query)
        scored = sorted(
            (
                (_jaccard(query_bigrams, bigrams), chunk)
                for bigrams, chunk in zip(self._bigrams, self._corpus)
            ),
            key=lambda pair: pair[0],
            reverse=True,
        )
        return [(score, chunk) for score, chunk in scored[:top_k] if score > 0]
122
+
123
+
124
+ # ── Public class ─────────────────────────────────────────────────────────────
125
+
126
class EmbeddingRetriever:
    """
    Unified dense retriever that auto-selects the best installed backend:
    sentence-transformers → scikit-learn TF-IDF → pure-Python bigram Jaccard.

    Parameters
    ----------
    model_name : sentence-transformers model to load (ignored unless the
        sentence-transformers package is installed).
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        if _HAS_ST:
            backend = _SentenceTransformersBackend(model_name)
            label = f"sentence-transformers({model_name})"
        elif _HAS_SKLEARN:
            backend = _TfidfBackend()
            label = "sklearn-tfidf"
        else:
            backend = _BigramBackend()
            label = "bigram-jaccard"
        self._backend = backend
        self.backend_name = label

    def fit(self, corpus: list[dict]) -> None:
        """Index *corpus* (list of chunk dicts with a "content" key)."""
        self._backend.fit(corpus)

    def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
        """Return up to *top_k* (score, chunk) pairs, best first."""
        return self._backend.retrieve(query, top_k)
@@ -0,0 +1,51 @@
1
+ """
2
+ Reciprocal Rank Fusion (RRF).
3
+
4
+ RRF is a simple, parameter-free method for combining ranked lists from multiple
5
+ retrievers. For each document, its RRF score is:
6
+
7
+ rrf(d) = Σ 1 / (k + rank_i(d))
8
+ i
9
+
10
+ where rank_i(d) is d's 1-based rank in list i and k=60 is the standard constant.
11
+
12
+ Documents that appear in multiple lists get a boost; documents missing from a
13
+ list contribute 0 for that list.
14
+
15
+ Reference: Cormack et al. (2009) "Reciprocal rank fusion outperforms condorcet
16
+ and individual rank learning methods."
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from collections import defaultdict
22
+
23
+
24
def reciprocal_rank_fusion(
    ranked_lists: list[list[tuple[float, dict]]],
    k: int = 60,
) -> list[tuple[float, dict]]:
    """
    Fuse several ranked result lists with Reciprocal Rank Fusion.

    Each chunk's fused score is the sum over lists of ``1 / (k + rank)`` using
    1-based ranks; a chunk absent from a list contributes nothing for it, so
    chunks appearing in multiple lists get a boost.

    Parameters
    ----------
    ranked_lists : sub-lists of (score, chunk) pairs, each sorted best-first.
        Chunks are identified by their '_idx' field (set by
        HybridRetriever.fit()); ``id(chunk)`` is the fallback identity.
    k : smoothing constant (60, per Cormack et al. 2009).

    Returns
    -------
    Fused ``[(rrf_score, chunk), ...]`` sorted by descending rrf_score.
    """
    scores: dict[int, float] = defaultdict(float)
    chunk_for: dict[int, dict] = {}

    for result_list in ranked_lists:
        for position, (_, chunk) in enumerate(result_list, start=1):
            key = chunk.get("_idx", id(chunk))
            scores[key] += 1.0 / (k + position)
            chunk_for[key] = chunk

    return [
        (score, chunk_for[key])
        for key, score in sorted(scores.items(), key=lambda item: item[1], reverse=True)
    ]