pythonclaw 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pythonclaw/__init__.py +17 -0
- pythonclaw/__main__.py +6 -0
- pythonclaw/channels/discord_bot.py +231 -0
- pythonclaw/channels/telegram_bot.py +236 -0
- pythonclaw/config.py +190 -0
- pythonclaw/core/__init__.py +25 -0
- pythonclaw/core/agent.py +773 -0
- pythonclaw/core/compaction.py +220 -0
- pythonclaw/core/knowledge/rag.py +93 -0
- pythonclaw/core/llm/anthropic_client.py +107 -0
- pythonclaw/core/llm/base.py +26 -0
- pythonclaw/core/llm/gemini_client.py +139 -0
- pythonclaw/core/llm/openai_compatible.py +39 -0
- pythonclaw/core/llm/response.py +57 -0
- pythonclaw/core/memory/manager.py +120 -0
- pythonclaw/core/memory/storage.py +164 -0
- pythonclaw/core/persistent_agent.py +103 -0
- pythonclaw/core/retrieval/__init__.py +6 -0
- pythonclaw/core/retrieval/chunker.py +78 -0
- pythonclaw/core/retrieval/dense.py +152 -0
- pythonclaw/core/retrieval/fusion.py +51 -0
- pythonclaw/core/retrieval/reranker.py +112 -0
- pythonclaw/core/retrieval/retriever.py +166 -0
- pythonclaw/core/retrieval/sparse.py +69 -0
- pythonclaw/core/session_store.py +269 -0
- pythonclaw/core/skill_loader.py +322 -0
- pythonclaw/core/skillhub.py +290 -0
- pythonclaw/core/tools.py +622 -0
- pythonclaw/core/utils.py +64 -0
- pythonclaw/daemon.py +221 -0
- pythonclaw/init.py +61 -0
- pythonclaw/main.py +489 -0
- pythonclaw/onboard.py +290 -0
- pythonclaw/scheduler/cron.py +310 -0
- pythonclaw/scheduler/heartbeat.py +178 -0
- pythonclaw/server.py +145 -0
- pythonclaw/session_manager.py +104 -0
- pythonclaw/templates/persona/demo_persona.md +2 -0
- pythonclaw/templates/skills/communication/CATEGORY.md +4 -0
- pythonclaw/templates/skills/communication/email/SKILL.md +54 -0
- pythonclaw/templates/skills/communication/email/__pycache__/send_email.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/communication/email/send_email.py +88 -0
- pythonclaw/templates/skills/data/CATEGORY.md +4 -0
- pythonclaw/templates/skills/data/csv_analyzer/SKILL.md +51 -0
- pythonclaw/templates/skills/data/csv_analyzer/__pycache__/analyze.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/data/csv_analyzer/analyze.py +138 -0
- pythonclaw/templates/skills/data/finance/SKILL.md +41 -0
- pythonclaw/templates/skills/data/finance/__pycache__/fetch_quote.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/data/finance/fetch_quote.py +118 -0
- pythonclaw/templates/skills/data/news/SKILL.md +39 -0
- pythonclaw/templates/skills/data/news/__pycache__/search_news.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/data/news/search_news.py +57 -0
- pythonclaw/templates/skills/data/pdf_reader/SKILL.md +40 -0
- pythonclaw/templates/skills/data/pdf_reader/__pycache__/read_pdf.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/data/pdf_reader/read_pdf.py +113 -0
- pythonclaw/templates/skills/data/scraper/SKILL.md +39 -0
- pythonclaw/templates/skills/data/scraper/__pycache__/scrape.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/data/scraper/scrape.py +92 -0
- pythonclaw/templates/skills/data/weather/SKILL.md +42 -0
- pythonclaw/templates/skills/data/weather/__pycache__/weather.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/data/weather/weather.py +142 -0
- pythonclaw/templates/skills/data/youtube/SKILL.md +43 -0
- pythonclaw/templates/skills/data/youtube/__pycache__/youtube_info.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/data/youtube/youtube_info.py +167 -0
- pythonclaw/templates/skills/dev/CATEGORY.md +4 -0
- pythonclaw/templates/skills/dev/code_runner/SKILL.md +46 -0
- pythonclaw/templates/skills/dev/code_runner/__pycache__/run_code.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/dev/code_runner/run_code.py +117 -0
- pythonclaw/templates/skills/dev/github/SKILL.md +52 -0
- pythonclaw/templates/skills/dev/github/__pycache__/gh.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/dev/github/gh.py +165 -0
- pythonclaw/templates/skills/dev/http_request/SKILL.md +40 -0
- pythonclaw/templates/skills/dev/http_request/__pycache__/request.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/dev/http_request/request.py +90 -0
- pythonclaw/templates/skills/google/CATEGORY.md +4 -0
- pythonclaw/templates/skills/google/workspace/SKILL.md +98 -0
- pythonclaw/templates/skills/google/workspace/check_setup.sh +52 -0
- pythonclaw/templates/skills/meta/CATEGORY.md +4 -0
- pythonclaw/templates/skills/meta/skill_creator/SKILL.md +151 -0
- pythonclaw/templates/skills/system/CATEGORY.md +4 -0
- pythonclaw/templates/skills/system/change_persona/SKILL.md +41 -0
- pythonclaw/templates/skills/system/change_setting/SKILL.md +65 -0
- pythonclaw/templates/skills/system/change_setting/__pycache__/update_config.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/system/change_setting/update_config.py +129 -0
- pythonclaw/templates/skills/system/change_soul/SKILL.md +41 -0
- pythonclaw/templates/skills/system/onboarding/SKILL.md +63 -0
- pythonclaw/templates/skills/system/onboarding/__pycache__/write_identity.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/system/onboarding/write_identity.py +218 -0
- pythonclaw/templates/skills/system/random/SKILL.md +33 -0
- pythonclaw/templates/skills/system/random/__pycache__/random_util.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/system/random/random_util.py +45 -0
- pythonclaw/templates/skills/system/time/SKILL.md +33 -0
- pythonclaw/templates/skills/system/time/__pycache__/time_util.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/system/time/time_util.py +81 -0
- pythonclaw/templates/skills/text/CATEGORY.md +4 -0
- pythonclaw/templates/skills/text/translator/SKILL.md +47 -0
- pythonclaw/templates/skills/text/translator/__pycache__/translate.cpython-311.pyc +0 -0
- pythonclaw/templates/skills/text/translator/translate.py +66 -0
- pythonclaw/templates/skills/web/CATEGORY.md +4 -0
- pythonclaw/templates/skills/web/tavily/SKILL.md +61 -0
- pythonclaw/templates/soul/SOUL.md +54 -0
- pythonclaw/web/__init__.py +1 -0
- pythonclaw/web/app.py +585 -0
- pythonclaw/web/static/favicon.png +0 -0
- pythonclaw/web/static/index.html +1318 -0
- pythonclaw/web/static/logo.png +0 -0
- pythonclaw-0.2.0.dist-info/METADATA +410 -0
- pythonclaw-0.2.0.dist-info/RECORD +112 -0
- pythonclaw-0.2.0.dist-info/WHEEL +5 -0
- pythonclaw-0.2.0.dist-info/entry_points.txt +2 -0
- pythonclaw-0.2.0.dist-info/licenses/LICENSE +21 -0
- pythonclaw-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM-based re-ranker.
|
|
3
|
+
|
|
4
|
+
Given a query and a list of candidate chunks (retrieved by sparse + dense),
|
|
5
|
+
asks the LLM to sort them by relevance and returns the top-k.
|
|
6
|
+
|
|
7
|
+
Prompt strategy
|
|
8
|
+
---------------
|
|
9
|
+
We ask the LLM to return a JSON array of 0-based indices sorted from most
|
|
10
|
+
to least relevant. This is compact, deterministic to parse, and works well
|
|
11
|
+
with instruction-tuned models.
|
|
12
|
+
|
|
13
|
+
The re-ranker is *optional*. It adds one extra LLM call per retrieval but
|
|
14
|
+
significantly improves precision, especially for ambiguous queries.
|
|
15
|
+
|
|
16
|
+
Usage
|
|
17
|
+
-----
|
|
18
|
+
from pythonclaw.core.retrieval.reranker import LLMReranker
|
|
19
|
+
reranker = LLMReranker(provider)
|
|
20
|
+
best = reranker.rerank(query="...", candidates=[...], top_k=3)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
import logging
|
|
27
|
+
import re
|
|
28
|
+
from typing import TYPE_CHECKING
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from ..llm.base import LLMProvider
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)

_RERANK_PROMPT = """\
You are a relevance scoring assistant. Given a search query and a list of text passages, rank the passages by their relevance to the query.

Query: {query}

Passages:
{passages}

Return ONLY a valid JSON array of passage indices (0-based), ordered from most relevant to least relevant.
Example: [2, 0, 3, 1]

Your response (JSON array only):"""


class LLMReranker:
    """
    Re-ranks retrieval candidates using a single LLM call.

    Parameters
    ----------
    provider : any LLMProvider instance.
    max_chars : truncate each candidate to this many characters in the prompt.
    """

    def __init__(self, provider: "LLMProvider", max_chars: int = 300) -> None:
        self._provider = provider
        self._max_chars = max_chars

    def rerank(
        self,
        query: str,
        candidates: list[dict],
        top_k: int,
    ) -> list[dict]:
        """
        Re-rank *candidates* for *query* and return the best *top_k*.

        Falls back to the original order if the LLM response cannot be parsed.
        Each candidate dict must have a 'content' key.
        """
        if not candidates:
            return []
        if len(candidates) == 1:
            # Nothing to rank — skip the LLM call entirely.
            return candidates[:top_k]

        # Number each passage so the model can reference them by index.
        passages_text = "\n\n".join(
            f"[{i}] {c['content'][: self._max_chars]}"
            for i, c in enumerate(candidates)
        )
        prompt = _RERANK_PROMPT.format(query=query, passages=passages_text)

        try:
            response = self._provider.chat(
                messages=[{"role": "user", "content": prompt}],
                tools=None,
                tool_choice=None,
            )
            raw = response.choices[0].message.content.strip()

            # Extract first JSON array from the response
            match = re.search(r"\[[\d,\s]+\]", raw)
            if not match:
                raise ValueError(f"No JSON array found in: {raw!r}")
            indices: list[int] = json.loads(match.group())

            # BUG FIX: de-duplicate indices while preserving order. An LLM
            # that repeats an index (e.g. "[1, 1, 0]") must not cause the
            # same candidate to appear twice in the result.
            seen: set[int] = set()
            reranked: list[dict] = []
            for i in indices:
                if 0 <= i < len(candidates) and i not in seen:
                    seen.add(i)
                    reranked.append(candidates[i])

            # Append any candidates the LLM missed (shouldn't happen, but be safe)
            for i, c in enumerate(candidates):
                if i not in seen:
                    reranked.append(c)

            return reranked[:top_k]

        except Exception as exc:
            # Best-effort: any provider/parse failure degrades to original order.
            logger.warning("[LLMReranker] Reranking failed (%s), using original order.", exc)
            return candidates[:top_k]
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HybridRetriever — the main retrieval class.
|
|
3
|
+
|
|
4
|
+
Pipeline
|
|
5
|
+
--------
|
|
6
|
+
corpus (list of chunk dicts)
|
|
7
|
+
|
|
|
8
|
+
+-----------+-----------+
|
|
9
|
+
| |
|
|
10
|
+
BM25Retriever EmbeddingRetriever
|
|
11
|
+
(sparse) (dense)
|
|
12
|
+
| |
|
|
13
|
+
+-----------+-----------+
|
|
14
|
+
|
|
|
15
|
+
Reciprocal Rank Fusion
|
|
16
|
+
|
|
|
17
|
+
| (top fetch_k candidates)
|
|
18
|
+
LLMReranker (optional)
|
|
19
|
+
|
|
|
20
|
+
| top_k final results
|
|
21
|
+
|
|
22
|
+
Usage
|
|
23
|
+
-----
|
|
24
|
+
from pythonclaw.core.retrieval import HybridRetriever, load_corpus_from_directory
|
|
25
|
+
|
|
26
|
+
retriever = HybridRetriever(provider=llm_provider)
|
|
27
|
+
retriever.fit(load_corpus_from_directory("context/knowledge"))
|
|
28
|
+
hits = retriever.retrieve("What is the refund policy?", top_k=5)
|
|
29
|
+
# hits = [{"source": "...", "content": "...", ...}, ...]
|
|
30
|
+
|
|
31
|
+
Configuration
|
|
32
|
+
-------------
|
|
33
|
+
use_sparse : enable BM25 (default True)
|
|
34
|
+
use_dense : enable embedding retriever (default True)
|
|
35
|
+
use_reranker : enable LLM re-ranking (default True, requires provider)
|
|
36
|
+
dense_model : sentence-transformers model name
|
|
37
|
+
top_k : number of results returned
|
|
38
|
+
fetch_k : candidates fetched before fusion/reranking (>= top_k)
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from __future__ import annotations
|
|
42
|
+
|
|
43
|
+
import logging
|
|
44
|
+
from typing import TYPE_CHECKING
|
|
45
|
+
|
|
46
|
+
from .sparse import BM25Retriever
|
|
47
|
+
from .dense import EmbeddingRetriever
|
|
48
|
+
from .fusion import reciprocal_rank_fusion
|
|
49
|
+
from .reranker import LLMReranker
|
|
50
|
+
|
|
51
|
+
if TYPE_CHECKING:
|
|
52
|
+
from ..llm.base import LLMProvider
|
|
53
|
+
|
|
54
|
+
logger = logging.getLogger(__name__)


class HybridRetriever:
    """
    Combines sparse (BM25) + dense (embedding) retrieval with RRF fusion and
    an optional LLM re-ranker.

    Parameters
    ----------
    provider : LLMProvider instance (required for use_reranker=True).
    use_sparse : include BM25 retrieval.
    use_dense : include embedding retrieval.
    use_reranker : re-rank fused candidates with the LLM.
    dense_model : sentence-transformers model name.
    """

    def __init__(
        self,
        provider: "LLMProvider | None" = None,
        use_sparse: bool = True,
        use_dense: bool = True,
        use_reranker: bool = True,
        dense_model: str = "all-MiniLM-L6-v2",
    ) -> None:
        self._provider = provider
        self.use_sparse = use_sparse
        self.use_dense = use_dense
        # Re-ranking requires an LLM call, so it is silently disabled
        # when no provider is supplied.
        self.use_reranker = use_reranker and provider is not None

        self._sparse = BM25Retriever() if use_sparse else None
        self._dense = EmbeddingRetriever(dense_model) if use_dense else None
        self._reranker = LLMReranker(provider) if self.use_reranker else None
        self._corpus: list[dict] = []

        if use_dense and self._dense:
            logger.info("[HybridRetriever] Dense backend: %s", self._dense.backend_name)

    # ── Indexing ──────────────────────────────────────────────────────────────

    def fit(self, corpus: list[dict]) -> "HybridRetriever":
        """
        Index the corpus. Each item must have a 'content' key.
        Mutates corpus in-place by adding '_idx' for RRF deduplication.
        """
        for position, chunk in enumerate(corpus):
            chunk["_idx"] = position
        self._corpus = corpus

        # Index the same corpus in every enabled backend.
        for backend in (self._sparse, self._dense):
            if backend:
                backend.fit(corpus)

        logger.info(
            "[HybridRetriever] Indexed %d chunks (sparse=%s dense=%s reranker=%s)",
            len(corpus), self.use_sparse, self.use_dense, self.use_reranker,
        )
        return self

    # ── Retrieval ─────────────────────────────────────────────────────────────

    def retrieve(self, query: str, top_k: int = 5) -> list[dict]:
        """
        Retrieve the *top_k* most relevant chunks for *query*.

        Returns a list of chunk dicts (internal '_idx' field stripped).
        """
        if not self._corpus or not query.strip():
            return []

        # How many candidates to fetch before reranking
        fetch_k = max(top_k * 3, top_k + 5)

        result_lists: list[list[tuple[float, dict]]] = []
        for backend in (self._sparse, self._dense):
            if backend is None:
                continue
            hits = backend.retrieve(query, top_k=fetch_k)
            if hits:
                result_lists.append(hits)

        if not result_lists:
            return []

        # Fuse the ranked lists; a single list needs no fusion.
        if len(result_lists) > 1:
            fused = reciprocal_rank_fusion(result_lists)
        else:
            fused = list(result_lists[0])

        # Keep a larger shortlist when a reranker will refine it.
        cutoff = fetch_k if self._reranker else top_k
        shortlist = [chunk for _, chunk in fused[:cutoff]]

        # Re-rank
        if self._reranker and shortlist:
            shortlist = self._reranker.rerank(query, shortlist, top_k)
        else:
            shortlist = shortlist[:top_k]

        # Strip internal index field before returning
        return [
            {key: val for key, val in chunk.items() if key != "_idx"}
            for chunk in shortlist
        ]

    # ── Convenience ──────────────────────────────────────────────────────────

    def __len__(self) -> int:
        """Number of indexed chunks."""
        return len(self._corpus)

    def __bool__(self) -> bool:
        """True once a non-empty corpus has been indexed."""
        return bool(self._corpus)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sparse retriever — BM25Okapi.
|
|
3
|
+
|
|
4
|
+
Falls back to a simple TF-weighted word-overlap scorer when `rank_bm25`
|
|
5
|
+
is not installed. Install rank-bm25 for best quality:
|
|
6
|
+
|
|
7
|
+
pip install rank-bm25
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from rank_bm25 import BM25Okapi
|
|
16
|
+
_HAS_BM25 = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
_HAS_BM25 = False
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _tokenize(text: str) -> list[str]:
|
|
22
|
+
return re.findall(r"\w+", text.lower())
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BM25Retriever:
|
|
26
|
+
"""
|
|
27
|
+
Wraps BM25Okapi (or a simple fallback) for sparse retrieval.
|
|
28
|
+
|
|
29
|
+
Usage
|
|
30
|
+
-----
|
|
31
|
+
r = BM25Retriever()
|
|
32
|
+
r.fit(corpus) # corpus = list of {"content": ..., ...}
|
|
33
|
+
results = r.retrieve("my query", 10) # -> [(score, chunk_dict), ...]
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(self) -> None:
|
|
37
|
+
self._corpus: list[dict] = []
|
|
38
|
+
self._bm25: object | None = None
|
|
39
|
+
self._tokenized: list[list[str]] = []
|
|
40
|
+
|
|
41
|
+
def fit(self, corpus: list[dict]) -> None:
|
|
42
|
+
self._corpus = corpus
|
|
43
|
+
self._tokenized = [_tokenize(c["content"]) for c in corpus]
|
|
44
|
+
if _HAS_BM25 and corpus:
|
|
45
|
+
self._bm25 = BM25Okapi(self._tokenized)
|
|
46
|
+
|
|
47
|
+
def retrieve(self, query: str, top_k: int) -> list[tuple[float, dict]]:
|
|
48
|
+
if not self._corpus:
|
|
49
|
+
return []
|
|
50
|
+
|
|
51
|
+
tokens = _tokenize(query)
|
|
52
|
+
|
|
53
|
+
if _HAS_BM25 and self._bm25:
|
|
54
|
+
raw_scores = self._bm25.get_scores(tokens)
|
|
55
|
+
pairs = [(float(s), c) for s, c in zip(raw_scores, self._corpus) if s > 0]
|
|
56
|
+
else:
|
|
57
|
+
# Fallback: term-frequency word-overlap
|
|
58
|
+
pairs = []
|
|
59
|
+
query_set = set(tokens)
|
|
60
|
+
for chunk in self._corpus:
|
|
61
|
+
chunk_tokens = _tokenize(chunk["content"])
|
|
62
|
+
if not chunk_tokens:
|
|
63
|
+
continue
|
|
64
|
+
tf = sum(1 for t in chunk_tokens if t in query_set)
|
|
65
|
+
if tf > 0:
|
|
66
|
+
pairs.append((float(tf) / len(chunk_tokens), chunk))
|
|
67
|
+
|
|
68
|
+
pairs.sort(key=lambda x: x[0], reverse=True)
|
|
69
|
+
return pairs[:top_k]
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Markdown-backed session store for pythonclaw.
|
|
3
|
+
|
|
4
|
+
Each session gets its own Markdown file with timestamped messages::
|
|
5
|
+
|
|
6
|
+
context/sessions/telegram_1285451567.md
|
|
7
|
+
|
|
8
|
+
File format
|
|
9
|
+
-----------
|
|
10
|
+
Human-readable Markdown with embedded metadata in HTML comments for reliable
|
|
11
|
+
round-trip parsing. Each message block::
|
|
12
|
+
|
|
13
|
+
<!-- msg:{"role":"user","ts":"2026-02-23T15:18:58"} -->
|
|
14
|
+
### 2026-02-23 15:18:58 — User
|
|
15
|
+
|
|
16
|
+
Hello, how are you?
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
Tool calls are stored as JSON code blocks inside the message. System
|
|
21
|
+
injection messages (skill loads, compaction summaries) are also recorded.
|
|
22
|
+
|
|
23
|
+
Truncation
|
|
24
|
+
----------
|
|
25
|
+
When a session file grows beyond *max_messages*, older messages are dropped
|
|
26
|
+
(keeping only the most recent ones by timestamp). The system prompt
|
|
27
|
+
(messages[0]) is never saved — it is always rebuilt fresh on restore.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import json
|
|
33
|
+
import logging
|
|
34
|
+
import os
|
|
35
|
+
import re
|
|
36
|
+
from datetime import datetime
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger(__name__)

# Default location for session Markdown files (relative to the working dir).
DEFAULT_STORE_DIR = os.path.join("context", "sessions")
# Default cap on how many messages are persisted / restored per session.
DEFAULT_MAX_MESSAGES = 200

# Matches the per-message metadata marker: <!-- msg:{...json...} -->
# The single capture group holds the JSON payload, so _META_PATTERN.split()
# returns [preamble, meta1, body1, meta2, body2, ...].
_META_PATTERN = re.compile(r"<!-- msg:(.*?) -->")
# Display labels for the "### <timestamp> — <label>" header line.
_ROLE_LABELS = {
    "user": "User",
    "assistant": "Assistant",
    "system": "System",
    "tool": "Tool",
}


class SessionStore:
    """Reads and writes per-session message history as Markdown files."""

    def __init__(
        self,
        base_dir: str = DEFAULT_STORE_DIR,
        max_messages: int = DEFAULT_MAX_MESSAGES,
    ) -> None:
        """
        Parameters
        ----------
        base_dir : directory holding the per-session .md files (created if missing).
        max_messages : keep only the most recent N messages on save and on load.
        """
        self.base_dir = base_dir
        self.max_messages = max_messages
        os.makedirs(base_dir, exist_ok=True)

    # ── File path ─────────────────────────────────────────────────────────────

    def _path(self, session_id: str) -> str:
        """Convert a session_id like 'telegram:123' to a safe filename."""
        # Everything outside [A-Za-z0-9_-] becomes '_'. NOTE(review): two
        # distinct ids can collide after sanitising (e.g. 'a:b' and 'a.b'
        # both map to 'a_b.md') — confirm ids are unique post-sanitising.
        safe = re.sub(r"[^\w\-]", "_", session_id)
        return os.path.join(self.base_dir, f"{safe}.md")

    # ── Serialisation ─────────────────────────────────────────────────────────

    @staticmethod
    def _msg_to_markdown(msg: dict) -> str:
        """Convert a single message dict to a Markdown block."""
        role = msg.get("role", "unknown")
        content = msg.get("content", "") or ""
        # '_ts' is this store's private timestamp field; stamp "now" if absent.
        ts = msg.get("_ts") or datetime.now().isoformat(timespec="seconds")

        # Build metadata for round-trip parsing
        meta: dict = {"role": role, "ts": ts}
        if msg.get("tool_call_id"):
            meta["tool_call_id"] = msg["tool_call_id"]

        meta_json = json.dumps(meta, ensure_ascii=False)
        label = _ROLE_LABELS.get(role, role.title())

        # Format timestamp for display
        try:
            dt = datetime.fromisoformat(ts)
            display_ts = dt.strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            # Not ISO-formatted — show the raw value rather than fail.
            display_ts = ts

        lines = [f"<!-- msg:{meta_json} -->"]
        lines.append(f"### {display_ts} — {label}")
        lines.append("")

        if content:
            lines.append(content)
            lines.append("")

        # Embed tool_calls as JSON
        tool_calls = msg.get("tool_calls")
        if tool_calls:
            # <details> keeps the JSON collapsed in Markdown viewers while
            # remaining machine-parseable on load.
            lines.append("<details><summary>Tool Calls</summary>")
            lines.append("")
            lines.append("```json")
            lines.append(json.dumps(tool_calls, ensure_ascii=False, indent=2))
            lines.append("```")
            lines.append("")
            lines.append("</details>")
            lines.append("")

        # '---' visually terminates the message block.
        lines.append("---")
        lines.append("")
        return "\n".join(lines)

    @staticmethod
    def _parse_markdown(text: str) -> list[dict]:
        """Parse a session Markdown file back into message dicts."""
        messages: list[dict] = []

        # Split into blocks by the HTML comment markers
        blocks = _META_PATTERN.split(text)
        # blocks = [preamble, meta1, content1, meta2, content2, ...]

        i = 1  # skip preamble (title / header)
        while i < len(blocks) - 1:
            meta_str = blocks[i].strip()
            body = blocks[i + 1].strip()
            i += 2

            try:
                meta = json.loads(meta_str)
            except json.JSONDecodeError:
                # Corrupt marker — skip this message, keep parsing the rest.
                continue

            role = meta.get("role", "unknown")
            msg: dict = {"role": role}

            if meta.get("tool_call_id"):
                msg["tool_call_id"] = meta["tool_call_id"]
            if meta.get("ts"):
                msg["_ts"] = meta["ts"]

            # Extract content: everything between the header line and
            # optional <details> / --- markers
            content_lines = []
            tool_calls_json = None
            in_details = False     # inside <details>...</details>
            in_json_block = False  # inside the ```json fence within <details>
            json_lines: list[str] = []

            for line in body.split("\n"):
                stripped = line.strip()

                # Skip the ### header line and trailing ---
                if stripped.startswith("### ") or stripped == "---":
                    continue

                if stripped == "<details><summary>Tool Calls</summary>":
                    in_details = True
                    continue
                if stripped == "</details>":
                    in_details = False
                    continue

                if in_details:
                    if stripped == "```json":
                        in_json_block = True
                        continue
                    if stripped == "```" and in_json_block:
                        # Fence closed — decode the accumulated JSON payload.
                        in_json_block = False
                        try:
                            tool_calls_json = json.loads("\n".join(json_lines))
                        except json.JSONDecodeError:
                            # Malformed tool-call JSON: drop it, keep the message.
                            pass
                        json_lines = []
                        continue
                    if in_json_block:
                        json_lines.append(line)
                        continue
                    # Other lines inside <details> (blank spacing) are ignored.
                    continue

                content_lines.append(line)

            content = "\n".join(content_lines).strip()
            if content:
                msg["content"] = content
            else:
                msg["content"] = ""

            if tool_calls_json:
                msg["tool_calls"] = tool_calls_json

            messages.append(msg)

        return messages

    # ── Core API ──────────────────────────────────────────────────────────────

    def save(self, session_id: str, messages: list[dict]) -> None:
        """
        Persist messages[1:] to Markdown.
        messages[0] is the initial system prompt — always rebuilt fresh.

        NOTE(review): mutates the passed-in message dicts by stamping a
        '_ts' field on any that lack one — confirm callers tolerate this.
        """
        to_save = messages[1:] if len(messages) > 1 else []

        # Add timestamps to messages that don't have one
        for msg in to_save:
            if "_ts" not in msg:
                msg["_ts"] = datetime.now().isoformat(timespec="seconds")

        # Truncate by time — keep only the most recent max_messages
        if len(to_save) > self.max_messages:
            to_save = to_save[-self.max_messages:]

        path = self._path(session_id)
        try:
            lines = [f"# Session: {session_id}\n\n"]
            for msg in to_save:
                lines.append(self._msg_to_markdown(msg))
            with open(path, "w", encoding="utf-8") as f:
                f.write("\n".join(lines))
        except OSError as exc:
            # Persistence is best-effort: log and carry on rather than crash.
            logger.error("[SessionStore] Failed to save session '%s': %s", session_id, exc)

    def load(self, session_id: str) -> list[dict]:
        """
        Return saved messages (messages[1:] from a previous run).
        Applies time-based truncation: only the most recent max_messages
        are returned. Returns [] when the file is missing or unreadable.
        """
        path = self._path(session_id)
        if not os.path.exists(path):
            return []
        try:
            with open(path, "r", encoding="utf-8") as f:
                text = f.read()
        except OSError as exc:
            logger.error("[SessionStore] Failed to load session '%s': %s", session_id, exc)
            return []

        messages = self._parse_markdown(text)

        # Time-based truncation: keep only the most recent messages
        if len(messages) > self.max_messages:
            messages = messages[-self.max_messages:]

        return messages

    def delete(self, session_id: str) -> None:
        """Remove the Markdown file for session_id."""
        path = self._path(session_id)
        if os.path.exists(path):
            try:
                os.remove(path)
                logger.info("[SessionStore] Deleted session '%s'", session_id)
            except OSError as exc:
                logger.error("[SessionStore] Failed to delete '%s': %s", session_id, exc)

    def list_session_ids(self) -> list[str]:
        """Return all session IDs that have a persisted Markdown file."""
        # NOTE(review): these are the sanitised filenames from _path(), not
        # necessarily the original session ids (':' became '_') — verify
        # callers account for that.
        ids = []
        for fname in os.listdir(self.base_dir):
            if fname.endswith(".md"):
                ids.append(fname[: -len(".md")])
        return ids
|