passiveworkers 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- council/__init__.py +1 -0
- council/artifacts.py +161 -0
- council/batch.py +84 -0
- council/cli.py +54 -0
- council/coordinator.py +133 -0
- council/crypto.py +133 -0
- council/fidelity.py +197 -0
- council/judge.py +393 -0
- council/ledger.py +230 -0
- council/library.py +431 -0
- council/local.py +228 -0
- council/mcp_server.py +87 -0
- council/net/__init__.py +1 -0
- council/net/agent.py +231 -0
- council/net/app.py +390 -0
- council/net/baseline.py +86 -0
- council/net/config.py +79 -0
- council/net/coordinator_app.py +370 -0
- council/net/dashboard.py +111 -0
- council/net/store.py +964 -0
- council/net/submit.py +102 -0
- council/operator.py +412 -0
- council/research.py +520 -0
- council/researcher.py +300 -0
- council/retrieval.py +80 -0
- council/run_demo.py +175 -0
- council/sanitize.py +78 -0
- council/serve.py +183 -0
- council/trust.py +168 -0
- council/worker.py +123 -0
- passiveworkers-0.1.0.dist-info/METADATA +269 -0
- passiveworkers-0.1.0.dist-info/RECORD +36 -0
- passiveworkers-0.1.0.dist-info/WHEEL +5 -0
- passiveworkers-0.1.0.dist-info/entry_points.txt +2 -0
- passiveworkers-0.1.0.dist-info/licenses/LICENSE +21 -0
- passiveworkers-0.1.0.dist-info/top_level.txt +1 -0
council/ledger.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
council/ledger.py — Non-transferable mutual-aid credit ledger
|
|
4
|
+
=============================================================
|
|
5
|
+
Implements the give/take economy at the heart of Passive Workers:
|
|
6
|
+
|
|
7
|
+
• Credit is NON-TRANSFERABLE — there is no account-to-account `transfer()`.
|
|
8
|
+
Credit only moves through `settle_job()`, i.e. as payment for real work.
|
|
9
|
+
• Every account opens with a STARTER_ALLOWANCE so a newcomer can ask for help
|
|
10
|
+
before they have contributed anything (bootstrap).
|
|
11
|
+
• To keep asking once the allowance is spent, you must EARN by helping — a pure
|
|
12
|
+
free-rider depletes to zero and is blocked. This is the "no one only-takes,
|
|
13
|
+
no one only-helps" rule, enforced by balance, not by trust.
|
|
14
|
+
• IDEAS COMPETE: a job's worker pool is split by the judge's quality score, so
|
|
15
|
+
a better answer earns more credit than a worse one for the same question.
|
|
16
|
+
• Every job is CONSERVED: the asker's debit exactly equals the sum credited to
|
|
17
|
+
helpers + judge. No credit is minted or destroyed inside a transaction.
|
|
18
|
+
|
|
19
|
+
This is the open-source, no-token core: money (later) only ever enters or leaves
|
|
20
|
+
at the platform boundary, never as a tradeable instrument between users.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import os
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
|
|
28
|
+
# Reserved account id of the internal holding account used by assisted offers.
# It is never meant to receive the starter grant.
ESCROW_ID = "__escrow__"

# Bootstrap grant credited when an account is created. Operator-tunable through
# the PW_STARTER_CREDITS environment variable:
#   100 ≈ 2–3 council asks before you must contribute — right for production
#   give/take, too tight for trials/demos (the first 10-question trial died at
#   question 3 on this).
STARTER_ALLOWANCE = float(os.getenv("PW_STARTER_CREDITS", "100"))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class InsufficientCredit(Exception):
    """Signal that an account's balance cannot cover a requested debit.

    The asker has to earn credit by helping on other jobs before asking again.
    """
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class Account:
    """Credit position and contribution statistics of a single participant."""

    user_id: str
    balance: float = STARTER_ALLOWANCE  # a fresh account starts with the bootstrap grant
    lifetime_earned: float = 0.0  # credits earned by HELPING (as worker or judge)
    lifetime_spent: float = 0.0  # credits spent by ASKING
    jobs_helped: int = 0
    jobs_asked: int = 0
    quality_sum: float = 0.0  # sum of judge scores this owner's answers received
    quality_n: int = 0  # number of judged answers

    @property
    def net_contribution(self) -> float:
        """Lifetime earned minus lifetime spent: > 0 is a net giver, < 0 a net taker.

        The starter grant is part of neither figure, so it is excluded.
        """
        earned, spent = self.lifetime_earned, self.lifetime_spent
        return earned - spent

    @property
    def avg_quality(self) -> float:
        """Mean judge score, rounded to 2 decimals; 0.0 while nothing was judged."""
        if not self.quality_n:
            return 0.0
        return round(self.quality_sum / self.quality_n, 2)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class Receipt:
    """Immutable-by-convention record of one settled job."""

    job_id: str
    asker_id: str
    total_cost: float  # amount debited from the asker
    payouts: dict[str, float]  # helper_id -> credit earned
    judge_id: str
    judge_fee: float  # amount credited to the judge
    asker_balance_after: float  # asker's balance once the debit was applied
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class Ledger:
    """In-memory mutual-aid credit ledger.

    Holds every `Account`, settles jobs (the asker pays, helpers and the judge
    earn), runs the escrow used by assisted offers, and exposes integrity
    checks. Amounts are handled at 4 decimal places, and every operation either
    applies completely or raises before any balance is touched.
    """

    accounts: dict[str, Account] = field(default_factory=dict)
    _granted_total: float = 0.0  # sum of all starter grants (for accounting)
    _job_count: int = 0

    # ------------------------------------------------------------------ accounts
    def open_account(self, user_id: str) -> Account:
        """Return the account of `user_id`, creating it with the starter grant if new.

        The reserved escrow id is routed to the zero-balance holding account, so
        it can never be opened with a grant.
        """
        if user_id == ESCROW_ID:
            return self._escrow()
        if user_id not in self.accounts:
            self.accounts[user_id] = Account(user_id=user_id)
            self._granted_total += STARTER_ALLOWANCE
        return self.accounts[user_id]

    def get(self, user_id: str) -> Account:
        """Return the existing account of `user_id` (KeyError if unknown)."""
        return self.accounts[user_id]

    def balance(self, user_id: str) -> float:
        """Return the current balance of `user_id` (KeyError if unknown)."""
        return self.accounts[user_id].balance

    def can_afford(self, user_id: str, amount: float) -> bool:
        """Tell whether `user_id` holds at least `amount` (KeyError if unknown)."""
        return self.accounts[user_id].balance >= amount

    @staticmethod
    def _amount(value: float, what: str) -> float:
        """Validate a credit amount and normalise it to 4 decimals.

        `not (value >= 0)` rejects negatives AND NaN, which a plain `< 0` test
        would let through.
        """
        if not (value >= 0):
            raise ValueError(f"{what} must be a non-negative number, got {value!r}")
        return round(value, 4)

    # ------------------------------------------------------------------ settlement
    def quote(self, worker_pool: float, judge_fee: float) -> float:
        """Total a job will cost the asker (worker pool + judge fee)."""
        return round(worker_pool + judge_fee, 4)

    def settle_job(
        self,
        job_id: str,
        asker_id: str,
        score_by_worker: dict[str, float],
        worker_pool: float,
        judge_id: str,
        judge_fee: float,
    ) -> Receipt:
        """Settle one job: debit the asker, pay the helpers and the judge.

        `worker_pool` is split among the helpers in proportion to their judge
        score (negative scores count as 0; if no score is positive the pool is
        split evenly). The judge receives `judge_fee`. When `score_by_worker`
        is empty there is nobody to pay the pool to, so only the judge fee is
        charged. Total debit always equals total credit.

        Raises:
            ValueError: `worker_pool` or `judge_fee` is negative or NaN.
            KeyError: the asker, the judge or a helper has no account.
            InsufficientCredit: the asker cannot cover the cost.
        No balance is modified when any of these is raised.
        """
        pool = self._amount(worker_pool, "worker_pool")
        fee = self._amount(judge_fee, "judge_fee")

        items = list(score_by_worker.items())
        if not items:
            pool = 0.0  # nobody can receive it; charging it would destroy credit
        # pool and fee are already at 4 decimals, so the debit below is exactly
        # the sum of the credits.
        total_cost = self.quote(pool, fee)

        # Resolve every party BEFORE mutating: an unknown id must not leave the
        # asker debited with the credit never arriving anywhere.
        asker = self.accounts[asker_id]
        jacct = self.accounts[judge_id]
        helpers = [self.accounts[w] for w, _ in items]

        if asker.balance < total_cost:
            raise InsufficientCredit(
                f"{asker_id} has {asker.balance:.1f} credits but the job costs "
                f"{total_cost:.1f}. They must HELP on some jobs before asking again."
            )

        # Score-weighted split of the worker pool (better answers earn more).
        clipped = [max(0.0, s) for _, s in items]
        score_sum = sum(clipped)
        if score_sum > 0:
            weights = [s / score_sum for s in clipped]
        else:
            weights = [1.0 / max(1, len(items))] * len(items)  # degenerate: split evenly

        # Every share but the last is rounded and capped at what is still left;
        # the last payee takes the exact remainder. The shares therefore sum to
        # `pool` exactly and none of them can go negative.
        payouts: dict[str, float] = {}
        remaining = pool
        last = len(items) - 1
        for i, (w, _) in enumerate(items):
            share = remaining if i == last else min(round(pool * weights[i], 4), remaining)
            payouts[w] = share
            remaining = round(remaining - share, 4)

        # Apply debit then credits (conserved).
        asker.balance = round(asker.balance - total_cost, 4)
        asker.lifetime_spent = round(asker.lifetime_spent + total_cost, 4)
        asker.jobs_asked += 1

        for acct, (w, _) in zip(helpers, items):
            credit = payouts[w]
            acct.balance = round(acct.balance + credit, 4)
            acct.lifetime_earned = round(acct.lifetime_earned + credit, 4)
            acct.jobs_helped += 1

        jacct.balance = round(jacct.balance + fee, 4)
        jacct.lifetime_earned = round(jacct.lifetime_earned + fee, 4)
        if judge_id not in payouts:  # don't double-count help if judge is also a payee
            jacct.jobs_helped += 1

        self._job_count += 1
        return Receipt(
            job_id=job_id,
            asker_id=asker_id,
            total_cost=total_cost,
            payouts=payouts,
            judge_id=judge_id,
            judge_fee=fee,
            asker_balance_after=asker.balance,
        )

    # ------------------------------------------------------------------ escrow (assisted, D21)
    # Internal holding account for assisted offers: the asker's reward is HELD at offer
    # creation so it can't be spent elsewhere before the operator delivers, and is released
    # to the operator on delivery (or refunded on expiry). Opened WITHOUT a starter grant —
    # it only ever holds credit moved from real accounts, so conservation is preserved.
    def _escrow(self) -> "Account":
        """Return the escrow account, creating it with a zero balance on first use."""
        if ESCROW_ID not in self.accounts:
            # balance 0 (NOT the starter grant) and not counted in _granted_total
            self.accounts[ESCROW_ID] = Account(user_id=ESCROW_ID, balance=0.0)
        return self.accounts[ESCROW_ID]

    def hold(self, asker_id: str, amount: float) -> None:
        """Move `amount` from the asker into escrow.

        Raises ValueError for a negative/NaN amount, KeyError for an unknown
        asker and InsufficientCredit when the asker cannot afford it.
        """
        amount = self._amount(amount, "amount")
        a = self.accounts[asker_id]
        if a.balance < amount:
            raise InsufficientCredit(
                f"{asker_id} has {a.balance:.1f} but the offer holds {amount:.1f}. "
                "Help on a job first.")
        escrow = self._escrow()
        a.balance = round(a.balance - amount, 4)
        a.lifetime_spent = round(a.lifetime_spent + amount, 4)
        escrow.balance = round(escrow.balance + amount, 4)

    def _take_from_escrow(self, amount: float) -> None:
        """Debit escrow by `amount`, refusing to pay out more than is held."""
        escrow = self._escrow()
        if escrow.balance < amount:
            raise ValueError(
                f"escrow holds {escrow.balance:.4f} but {amount:.4f} was requested")
        escrow.balance = round(escrow.balance - amount, 4)

    def release(self, to_id: str, amount: float) -> None:
        """Pay an escrow hold out to the operator (on delivery).

        Raises ValueError for a negative/NaN amount or when escrow holds less
        than `amount`, and KeyError for an unknown payee; escrow is untouched
        in all of these cases.
        """
        amount = self._amount(amount, "amount")
        o = self.accounts[to_id]  # resolve first: an unknown payee must not drain escrow
        self._take_from_escrow(amount)
        o.balance = round(o.balance + amount, 4)
        o.lifetime_earned = round(o.lifetime_earned + amount, 4)

    def refund(self, asker_id: str, amount: float) -> None:
        """Return an escrow hold to the asker (on offer expiry / failure).

        Reverses `hold()`: the amount is also taken back out of the asker's
        lifetime spend. Raises like `release()`.
        """
        amount = self._amount(amount, "amount")
        a = self.accounts[asker_id]  # resolve first: an unknown asker must not drain escrow
        self._take_from_escrow(amount)
        a.balance = round(a.balance + amount, 4)
        a.lifetime_spent = round(a.lifetime_spent - amount, 4)

    # ------------------------------------------------------------------ integrity
    def total_credit(self) -> float:
        """Sum of all balances (escrow included), rounded to 4 decimals."""
        return round(sum(a.balance for a in self.accounts.values()), 4)

    def conservation_ok(self) -> bool:
        """Total credit in circulation must equal the sum of starter grants."""
        return abs(self.total_credit() - self._granted_total) < 1e-6

    def summary(self) -> str:
        """Render a fixed-width table of all accounts plus a conservation footer."""
        lines = [f"{'user':<16}{'balance':>10}{'earned':>10}{'spent':>10}{'net':>8}{'help':>6}{'ask':>5}"]
        for a in self.accounts.values():
            lines.append(
                f"{a.user_id:<16}{a.balance:>10.1f}{a.lifetime_earned:>10.1f}"
                f"{a.lifetime_spent:>10.1f}{a.net_contribution:>8.1f}{a.jobs_helped:>6}{a.jobs_asked:>5}"
            )
        lines.append(
            f"\n total credit = {self.total_credit():.1f} | granted = {self._granted_total:.1f} | "
            f"conserved = {self.conservation_ok()} | jobs = {self._job_count}"
        )
        return "\n".join(lines)
|
council/library.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
council/library.py — your private document library (local RAG, D19)
|
|
4
|
+
====================================================================
|
|
5
|
+
Index your own files once; the research engine then draws on them ALONGSIDE the live web.
|
|
6
|
+
Everything is local and keyless: text is chunked, embedded with Ollama `nomic-embed-text`,
|
|
7
|
+
and stored in SQLite at ~/.passiveworkers/library.db. Nothing ever leaves your machine —
|
|
8
|
+
no cloud, no account, no telemetry. Retrieval fuses numpy cosine over the stored matrix with BM25
|
|
9
|
+
(no heavy vector DB), in keeping with the project's lean, auditable ethos.
|
|
10
|
+
|
|
11
|
+
CLI:
|
|
12
|
+
pw library add <path|dir> index a file or a whole directory
|
|
13
|
+
pw library list what's indexed
|
|
14
|
+
pw library remove <path> drop a document
|
|
15
|
+
pw library clear wipe the library
|
|
16
|
+
|
|
17
|
+
Used by council/researcher.py (the `[L#]` local citations) and the MCP server.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import hashlib
|
|
23
|
+
import json
|
|
24
|
+
import os
|
|
25
|
+
import pathlib
|
|
26
|
+
import re
|
|
27
|
+
import sqlite3
|
|
28
|
+
import sys
|
|
29
|
+
|
|
30
|
+
import requests
|
|
31
|
+
|
|
32
|
+
OLLAMA = os.environ.get("PW_OLLAMA_BASE", "http://localhost:11434")
|
|
33
|
+
EMBED_MODEL = os.environ.get("PW_EMBED_MODEL", "nomic-embed-text")
|
|
34
|
+
LIB_DIR = pathlib.Path(os.environ.get("PW_LIBRARY_DIR",
|
|
35
|
+
str(pathlib.Path.home() / ".passiveworkers")))
|
|
36
|
+
LIB_DB = LIB_DIR / "library.db"
|
|
37
|
+
_CHUNK_CHARS = 2000 # ~512 tokens — the recall sweet spot (R8 research)
|
|
38
|
+
_OVERLAP = 256 # light boundary insurance; parent-window supplies real context
|
|
39
|
+
_CONTEXTUAL = os.environ.get("PW_CONTEXTUAL_CHUNKS", "") not in ("", "0", "false")
|
|
40
|
+
_CTX_MODEL = os.environ.get("PW_CONTEXT_MODEL", "") # small chat model for situating blurbs
|
|
41
|
+
_RERANK = os.environ.get("PW_RERANK", "") not in ("", "0", "false") # opt-in listwise rerank
|
|
42
|
+
_TEXT_EXT = {".txt", ".md", ".markdown", ".rst", ".csv", ".log"}
|
|
43
|
+
# Ingest guards (resource exhaustion): per-file size, file count, total bytes per add().
|
|
44
|
+
_MAX_FILE_BYTES = int(os.environ.get("PW_MAX_FILE_MB", "30")) * 1_000_000
|
|
45
|
+
_MAX_FILES = int(os.environ.get("PW_MAX_FILES", "500"))
|
|
46
|
+
_MAX_TOTAL_BYTES = int(os.environ.get("PW_MAX_TOTAL_MB", "300")) * 1_000_000
|
|
47
|
+
# Path confinement: indexing is restricted to these roots (default: your home dir). This
|
|
48
|
+
# stops an MCP-connected agent (or a stray path) from indexing system / other-user files.
|
|
49
|
+
# Narrow it to e.g. ~/Documents by setting PW_LIBRARY_ROOTS.
|
|
50
|
+
_ROOTS = [pathlib.Path(p).expanduser().resolve()
|
|
51
|
+
for p in os.environ.get("PW_LIBRARY_ROOTS", str(pathlib.Path.home())).split(os.pathsep)
|
|
52
|
+
if p.strip()]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _within_roots(p: pathlib.Path) -> bool:
    """Tell whether `p`, once resolved, is an allowed root or lies beneath one.

    A path that cannot be resolved is treated as outside (fail closed).
    """
    try:
        resolved = p.resolve()
    except Exception:
        return False
    for root in _ROOTS:
        if resolved == root or resolved.is_relative_to(root):
            return True
    return False
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ------------------------------------------------------------------ text extraction
|
|
64
|
+
def _read_text_file(path: pathlib.Path) -> str:
|
|
65
|
+
return path.read_text(encoding="utf-8", errors="replace")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _read_pdf(path: pathlib.Path) -> str:
    """Extract the text of every PDF page with pypdf, one page per line group.

    A page for which pypdf returns nothing contributes an empty string.
    """
    from pypdf import PdfReader

    page_texts = []
    for page in PdfReader(str(path)).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _read_docx(path: pathlib.Path) -> str:
    """Return the paragraph texts of a .docx document joined by newlines (python-docx)."""
    import docx

    document = docx.Document(str(path))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def extract_text(path: pathlib.Path) -> str:
    """Extract the text of a supported document, chosen by its lower-cased suffix.

    Raises:
        ValueError: the suffix is neither .pdf, .docx nor one of the plain-text types.
    """
    suffix = path.suffix.lower()
    reader = {".pdf": _read_pdf, ".docx": _read_docx}.get(suffix)
    if reader is None and suffix in _TEXT_EXT:
        reader = _read_text_file
    if reader is None:
        raise ValueError(f"unsupported file type: {suffix} (supported: pdf, docx, txt/md/csv/…)")
    return reader(path)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _split_recursive(text: str, sep_idx: int = 0, budget: int | None = None) -> list[str]:
|
|
91
|
+
"""Recursively split on a separator hierarchy, keeping semantic units intact and
|
|
92
|
+
never straddling a section boundary. Falls through coarser→finer separators until
|
|
93
|
+
pieces fit `budget` chars. Re-appends the (stripped) separator at flush boundaries so
|
|
94
|
+
sentence punctuation isn't lost from citation quotes."""
|
|
95
|
+
budget = budget or _CHUNK_CHARS
|
|
96
|
+
seps = ["\n\n", "\n", ". ", " ", ""]
|
|
97
|
+
if len(text) <= budget:
|
|
98
|
+
return [text] if text.strip() else []
|
|
99
|
+
if sep_idx >= len(seps):
|
|
100
|
+
return [text[i:i + budget] for i in range(0, len(text), budget)]
|
|
101
|
+
sep = seps[sep_idx]
|
|
102
|
+
parts = text.split(sep) if sep else list(text)
|
|
103
|
+
out, buf = [], ""
|
|
104
|
+
for part in parts:
|
|
105
|
+
piece = (buf + sep + part) if buf else part
|
|
106
|
+
if len(piece) <= budget:
|
|
107
|
+
buf = piece
|
|
108
|
+
else:
|
|
109
|
+
if buf.strip():
|
|
110
|
+
out.append((buf + sep.rstrip()) if sep.strip() else buf)
|
|
111
|
+
buf = part if len(part) <= budget else ""
|
|
112
|
+
if len(part) > budget:
|
|
113
|
+
out.extend(_split_recursive(part, sep_idx + 1, budget))
|
|
114
|
+
if buf.strip():
|
|
115
|
+
out.append(buf)
|
|
116
|
+
return out
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _chunk(text: str) -> list[str]:
|
|
120
|
+
"""Structure-aware chunking (R8/D20): split on markdown/section headers first, then
|
|
121
|
+
recurse paragraph→line→sentence→word so a chunk never straddles a section. Light
|
|
122
|
+
overlap as boundary insurance (parent-window expansion supplies the rest of context)."""
|
|
123
|
+
if not text or not text.strip():
|
|
124
|
+
return []
|
|
125
|
+
# 1) split on headers (markdown #, ##… and underline/blank-separated sections)
|
|
126
|
+
sections, cur = [], []
|
|
127
|
+
for line in text.splitlines():
|
|
128
|
+
if re.match(r"^\s{0,3}#{1,6}\s", line) and cur:
|
|
129
|
+
sections.append("\n".join(cur)); cur = [line]
|
|
130
|
+
else:
|
|
131
|
+
cur.append(line)
|
|
132
|
+
if cur:
|
|
133
|
+
sections.append("\n".join(cur))
|
|
134
|
+
# 2) recurse within each section to a size that leaves room for the overlap tail,
|
|
135
|
+
# so a stored chunk (raw + prepended overlap) stays within _CHUNK_CHARS.
|
|
136
|
+
budget = max(400, _CHUNK_CHARS - _OVERLAP)
|
|
137
|
+
raw = []
|
|
138
|
+
for sec in sections:
|
|
139
|
+
raw.extend(_split_recursive(sec.strip(), budget=budget))
|
|
140
|
+
raw = [c.strip() for c in raw if c.strip()]
|
|
141
|
+
# 3) light char overlap between adjacent chunks (boundary insurance)
|
|
142
|
+
if _OVERLAP <= 0 or len(raw) < 2:
|
|
143
|
+
return raw
|
|
144
|
+
out = [raw[0]]
|
|
145
|
+
for prev, c in zip(raw, raw[1:]):
|
|
146
|
+
tail = prev[-_OVERLAP:]
|
|
147
|
+
out.append((tail + " " + c) if tail else c)
|
|
148
|
+
return out
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ------------------------------------------------------------------ embeddings
|
|
152
|
+
def embed(texts: list[str]) -> list[list[float]]:
    """Embed each text with the local Ollama embeddings endpoint.

    One HTTP request is made per text. An HTTP error status raises through
    `raise_for_status()`. Returns the vectors in input order.
    """
    keep_alive = os.environ.get("PW_OLLAMA_KEEP_ALIVE", "30m")  # warm embedder (R17)
    vectors = []
    for text in texts:
        payload = {"model": EMBED_MODEL, "prompt": text, "keep_alive": keep_alive}
        response = requests.post(f"{OLLAMA}/api/embeddings", json=payload, timeout=120)
        response.raise_for_status()
        vectors.append(response.json()["embedding"])
    return vectors
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# --------------------------------------------- contextual retrieval (Anthropic technique)
|
|
166
|
+
def _smallest_chat_model() -> str:
    """Name the chat model to use for cheap helper prompts.

    `PW_CONTEXT_MODEL` wins when set. Otherwise the installed Ollama models are
    listed and the one with the smallest reported size whose name does not
    contain "embed" is chosen. Returns '' when nothing suitable is found or the
    lookup fails.
    """
    if _CTX_MODEL:
        return _CTX_MODEL
    try:
        listing = requests.get(f"{OLLAMA}/api/tags", timeout=10).json()
        chat_models = [
            model for model in listing.get("models", [])
            if "embed" not in model["name"].lower()
        ]
        smallest = min(chat_models, key=lambda model: model.get("size", 0))
        return smallest["name"]
    except Exception:
        return ""
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _situate(doc_text: str, chunk: str, model: str) -> str:
|
|
179
|
+
"""Anthropic Contextual Retrieval: a 1-2 sentence blurb situating the chunk in its
|
|
180
|
+
document, to be prepended before embedding + BM25 indexing. Best-effort ('' on failure)."""
|
|
181
|
+
if not model:
|
|
182
|
+
return ""
|
|
183
|
+
try:
|
|
184
|
+
from council.sanitize import spotlight
|
|
185
|
+
# the document is untrusted content → spotlight it so a planted instruction inside
|
|
186
|
+
# the file can't hijack the blurb-writer (data, not instructions)
|
|
187
|
+
body = spotlight(f"<document>\n{doc_text[:8000]}\n</document>\n<chunk>\n{chunk[:1500]}\n</chunk>")
|
|
188
|
+
prompt = (f"{body}\n"
|
|
189
|
+
"Give a short succinct context (1-2 sentences) to situate this chunk within "
|
|
190
|
+
"the overall document for improving search retrieval. Answer ONLY with the "
|
|
191
|
+
"context, nothing else.")
|
|
192
|
+
r = requests.post(f"{OLLAMA}/api/generate",
|
|
193
|
+
json={"model": model, "prompt": prompt, "stream": False,
|
|
194
|
+
"options": {"temperature": 0.0, "num_predict": 120},
|
|
195
|
+
"keep_alive": os.environ.get("PW_OLLAMA_KEEP_ALIVE", "30m")},
|
|
196
|
+
timeout=120)
|
|
197
|
+
r.raise_for_status()
|
|
198
|
+
return (r.json().get("response") or "").strip()[:400]
|
|
199
|
+
except Exception:
|
|
200
|
+
return ""
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# ------------------------------------------------------------------ store
|
|
204
|
+
class Library:
    """SQLite-backed store of chunked, embedded documents with hybrid search.

    Table `chunks` holds one row per chunk (source path, title, ordinal, display
    text, context prefix, JSON-encoded embedding); table `filemeta` holds the
    SHA-256 of each indexed file so unchanged files can be skipped.
    """

    def __init__(self, db_path: pathlib.Path = LIB_DB):
        """Open the library database (creating it and its schema when missing)."""
        LIB_DIR.mkdir(parents=True, exist_ok=True)
        # A caller-supplied db_path may live outside LIB_DIR: its folder must exist
        # too, or sqlite3.connect() fails.
        pathlib.Path(db_path).parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(str(db_path))
        self.conn.row_factory = sqlite3.Row
        self._corpus_key = None
        self._corpus_cache = None
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS chunks("
            "id INTEGER PRIMARY KEY, source TEXT, title TEXT, ord INT, text TEXT, vec TEXT)")
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS filemeta(source TEXT PRIMARY KEY, fhash TEXT)")
        cols = {r["name"] for r in self.conn.execute("PRAGMA table_info(chunks)")}
        if "context" not in cols:  # contextual-retrieval blurb (R8); display still uses `text`
            self.conn.execute("ALTER TABLE chunks ADD COLUMN context TEXT")
        self.conn.commit()

    def add(self, path: str) -> int:
        """Index a file or directory (confined to PW_LIBRARY_ROOTS). Returns chunks added.

        For a directory, supported files are indexed in sorted order until the
        file-count or total-byte cap is reached; symlinks, oversized files and
        files that fail to index are skipped with a printed note.

        Raises:
            ValueError: `path` resolves outside the allowed roots (or, for a single
                file, is too large or of an unsupported type).
        """
        p = pathlib.Path(path).expanduser().resolve()
        if not _within_roots(p):
            raise ValueError(f"path outside allowed roots ({', '.join(map(str, _ROOTS))}); "
                             "set PW_LIBRARY_ROOTS to widen")
        if not p.is_dir():
            return self._add_file(p)

        supported = _TEXT_EXT | {".pdf", ".docx"}
        total = files = tbytes = 0
        for f in sorted(p.rglob("*")):
            if files >= _MAX_FILES or tbytes >= _MAX_TOTAL_BYTES:
                print(f" stop: ingest cap reached ({_MAX_FILES} files / "
                      f"{_MAX_TOTAL_BYTES//1_000_000} MB)", flush=True)
                break
            if f.is_symlink():  # don't follow symlinks out of the tree
                continue
            if not (f.is_file() and f.suffix.lower() in supported and _within_roots(f)):
                continue
            try:
                sz = f.stat().st_size
                if sz > _MAX_FILE_BYTES:
                    print(f" skip {f.name}: too large ({sz//1_000_000} MB)", flush=True)
                    continue
                added = self._add_file(f)
                if added:  # unchanged / empty files do not count towards the caps
                    files += 1
                    tbytes += sz
                    total += added
            except Exception as e:
                print(f" skip {f.name}: {e}", flush=True)
        return total

    def _has_chunks(self, src: str) -> bool:
        """Tell whether at least one chunk is stored for source `src`."""
        row = self.conn.execute(
            "SELECT 1 FROM chunks WHERE source=? LIMIT 1", (src,)).fetchone()
        return row is not None

    def _add_file(self, p: pathlib.Path) -> int:
        """Index one file, replacing whatever was stored for it. Returns chunks written.

        A file whose SHA-256 matches the recorded one and which still has chunks
        in the index is skipped (returns 0). Embedding happens BEFORE any row is
        touched and the replace runs in one transaction, so a failed embedding
        call leaves the previous index of the file intact.
        """
        if not _within_roots(p):
            raise ValueError("path outside allowed roots")
        if p.is_file() and p.stat().st_size > _MAX_FILE_BYTES:
            raise ValueError(f"file too large (> {_MAX_FILE_BYTES//1_000_000} MB)")
        src = str(p)
        fhash = hashlib.sha256(p.read_bytes()).hexdigest()
        prev = self.conn.execute("SELECT fhash FROM filemeta WHERE source=?", (src,)).fetchone()
        # Check THIS source's chunks, not the whole table: a file whose rows went
        # missing must be re-indexed even if other documents are present.
        if prev and prev["fhash"] == fhash and self._has_chunks(src):
            print(f" unchanged {p.name} — skipped (incremental)", flush=True)
            return 0
        doc_text = extract_text(p)
        chunks = _chunk(doc_text)
        if not chunks:
            # The file no longer yields text: drop what an earlier version left behind.
            with self.conn:
                self.conn.execute("DELETE FROM chunks WHERE source=?", (src,))
                self.conn.execute("DELETE FROM filemeta WHERE source=?", (src,))
            return 0
        title = p.name
        # Contextual Retrieval: always prepend the title (free baseline); optionally an
        # LLM situating blurb (flag-gated for speed). Embed/BM25 use context+text; the
        # displayed [L#] quote uses `text` only — the blurb never leaks into citations.
        ctx_model = _smallest_chat_model() if _CONTEXTUAL else ""
        contexts, augmented = [], []
        for c in chunks:
            blurb = _situate(doc_text, c, ctx_model) if ctx_model else ""
            ctx = (f"{title}. {blurb}".strip()) if blurb else title
            contexts.append(ctx)
            augmented.append(f"{ctx}\n{c}")
        vecs = embed(augmented)  # may raise; nothing has been modified yet
        rows = [(src, title, i, c, ctx, json.dumps(v))
                for i, (c, ctx, v) in enumerate(zip(chunks, contexts, vecs))]
        with self.conn:  # re-index = replace, atomically (commit or roll back as a whole)
            self.conn.execute("DELETE FROM chunks WHERE source=?", (src,))
            self.conn.executemany(
                "INSERT INTO chunks(source,title,ord,text,context,vec) VALUES(?,?,?,?,?,?)",
                rows)
            self.conn.execute(
                "INSERT OR REPLACE INTO filemeta(source,fhash) VALUES(?,?)", (src, fhash))
        tag = " (contextual)" if ctx_model else ""
        print(f" indexed {title}: {len(chunks)} chunks{tag}", flush=True)
        return len(chunks)

    def remove(self, path: str) -> int:
        """Drop a document (by resolved path) from the index. Returns chunks removed."""
        src = str(pathlib.Path(path).expanduser().resolve())
        with self.conn:
            cur = self.conn.execute("DELETE FROM chunks WHERE source=?", (src,))
            self.conn.execute("DELETE FROM filemeta WHERE source=?", (src,))
        return cur.rowcount

    def clear(self) -> None:
        """Remove every chunk and every file record."""
        with self.conn:
            self.conn.execute("DELETE FROM chunks")
            self.conn.execute("DELETE FROM filemeta")

    def sources(self) -> list[dict]:
        """List indexed documents as dicts with `source`, `title` and chunk count `n`."""
        return [dict(r) for r in self.conn.execute(
            "SELECT source, title, COUNT(*) n FROM chunks GROUP BY source ORDER BY title")]

    def is_empty(self) -> bool:
        """Tell whether the index holds no chunk at all."""
        return self.conn.execute("SELECT 1 FROM chunks LIMIT 1").fetchone() is None

    def _corpus(self):
        """Load rows + build the normalized embedding matrix and BM25 index ONCE.

        The result is cached on a cheap fingerprint so repeated searches don't
        rebuild the whole index each call. Chunk count and max id alone are not
        enough: re-indexing a file with the same number of chunks can reuse the
        same rowids. The fingerprint therefore also carries this connection's
        `total_changes` (writes made here) and `PRAGMA data_version` (commits
        made by other connections).
        """
        import numpy as np
        from council.retrieval import BM25Okapi, tokenize
        fp = self.conn.execute("SELECT COUNT(*), COALESCE(MAX(id),0) FROM chunks").fetchone()
        data_version = self.conn.execute("PRAGMA data_version").fetchone()[0]
        key = (fp[0], fp[1], data_version, self.conn.total_changes)
        if getattr(self, "_corpus_key", None) == key and self._corpus_cache is not None:
            return self._corpus_cache
        rows = list(self.conn.execute("SELECT source,title,ord,text,context,vec FROM chunks"))
        if rows and any(r["context"] is None for r in rows) and any(r["context"] for r in rows):
            print(" note: library mixes pre/post-contextual rows — `pw library clear` then "
                  "re-add for a consistent index", flush=True)
        mat = np.asarray([json.loads(r["vec"]) for r in rows], dtype="float32")
        norm = mat / (np.linalg.norm(mat, axis=1, keepdims=True) + 1e-9)  # epsilon avoids /0
        bm25 = BM25Okapi([tokenize((r["context"] or "") + " " + r["text"]) for r in rows])
        self._corpus_key, self._corpus_cache = key, (rows, norm, bm25)
        return self._corpus_cache

    def search(self, query: str, k: int = 5, window: bool = True) -> list[dict]:
        """Hybrid retrieval (R8/D20): dense cosine ⊕ BM25 lexical, fused by Reciprocal Rank
        Fusion, then small-to-big parent-window expansion. Catches both semantic matches and
        exact terms (names/codes/numbers) dense embeddings miss.

        Returns up to `k` hits (dicts with source, title, ord, text, score, where
        score is the cosine similarity). Best effort by design: [] on an empty
        library, a non-positive `k`, or any failure.
        """
        if k <= 0 or self.is_empty():
            return []
        try:
            import numpy as np
            from council.retrieval import reciprocal_rank_fusion
            rows, norm, bm25 = self._corpus()
            pool = min(50, len(rows))  # candidates per retriever before fusion
            qv = np.asarray(embed([query])[0], dtype="float32")
            qn = qv / (np.linalg.norm(qv) + 1e-9)
            sims = norm @ qn
            dense_rank = list(np.argsort(sims)[::-1][:pool])
            sparse_rank = bm25.top(query, k=pool)
            fused = reciprocal_rank_fusion([list(map(int, dense_rank)), list(map(int, sparse_rank))],
                                           top_k=max(k, 15) if _RERANK else k)
            if _RERANK and len(fused) > k:
                fused = self._rerank(query, rows, fused, k)
            return [self._hit(rows, i, float(sims[i]), window) for i in fused[:k]]
        except Exception:
            return []

    def _rerank(self, query: str, rows, ids: list[int], k: int) -> list[int]:
        """Opt-in (PW_RERANK=1) listwise rerank: one local-model call scores the fused
        candidates by actual relevance. Zero new deps; falls back to RRF order on failure.

        Indices returned by the model are clamped to the candidate list and
        de-duplicated, so a repeated index cannot produce duplicate hits.
        """
        model = _smallest_chat_model()
        if not model:
            return ids[:k]
        try:
            from council.judge import _extract_json
            from council.sanitize import spotlight
            # candidate passages are untrusted document text → spotlight (a planted
            # instruction can at worst reorder, never escape; index-clamp below bounds it)
            cand = spotlight("\n".join(f"[{j}] {rows[i]['text'][:300]}" for j, i in enumerate(ids)))
            prompt = (f"Rank the passages by relevance to the QUERY. Return STRICT JSON: "
                      f'{{"order":[indices best-first]}}.\n\nQUERY: {query}\n\nPASSAGES:\n{cand}\n\nJSON:')
            r = requests.post(f"{OLLAMA}/api/generate",
                              json={"model": model, "prompt": prompt, "stream": False,
                                    "options": {"temperature": 0.0, "num_predict": 120},
                                    "keep_alive": os.environ.get("PW_OLLAMA_KEEP_ALIVE", "30m")},
                              timeout=120)
            r.raise_for_status()
            parsed = _extract_json((r.json().get("response") or "").strip())
            order = parsed.get("order") if isinstance(parsed, dict) else None
            if isinstance(order, list):
                picked: list[int] = []
                seen: set[int] = set()
                for j in order:
                    # bool is a subclass of int; True/False are not indices
                    if isinstance(j, bool) or not isinstance(j, int) or not 0 <= j < len(ids):
                        continue
                    if ids[j] not in seen:
                        seen.add(ids[j])
                        picked.append(ids[j])
                # append any not mentioned, preserving fusion order
                picked += [i for i in ids if i not in seen]
                return picked[:k]
        except Exception:
            pass
        return ids[:k]

    def _hit(self, rows, i, score, window: bool) -> dict:
        """Build a result, optionally expanding to a parent window (neighbor chunks of the
        same source) for richer grounding. Display text never includes the context blurb.

        With `window` the text is the previous, current and next chunk of the
        same source joined by " […] " and cut to 2500 characters.
        """
        r = rows[i]
        text = r["text"]
        if window:
            same = {row["ord"]: row["text"] for row in rows if row["source"] == r["source"]}
            parts = [same[o] for o in (r["ord"] - 1, r["ord"], r["ord"] + 1) if o in same]
            text = " […] ".join(parts)[:2500]
        return {"source": r["source"], "title": r["title"], "ord": r["ord"],
                "text": text, "score": score}
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
# ------------------------------------------------------------------ CLI
|
|
399
|
+
def main() -> int:
    """Entry point of `pw library`: list (default) | add | remove | clear.

    Returns the process exit code: 0 on success, 1 when `add` fails for an
    expected reason (path outside the allowed roots, unsupported or oversized
    file, unreadable file, embedding service unreachable), 2 on a usage error.
    """
    args = sys.argv[1:]
    lib = Library()
    if not args or args[0] == "list":
        srcs = lib.sources()
        if not srcs:
            print("library empty — add files with: pw library add <path>")
        else:
            for s in srcs:
                print(f" {s['title']:40s} {s['n']:4d} chunks {s['source']}")
            print(f"\n{len(srcs)} document(s) indexed.")
        return 0
    cmd = args[0]
    if cmd == "add":
        if len(args) < 2:
            print("usage: pw library add <path|dir>")
            return 2
        try:
            n = lib.add(args[1])
        except (ValueError, OSError) as err:
            # Expected user-facing failures: report them instead of a traceback.
            # Network errors from `requests` derive from OSError and land here too.
            print(f"error: {err}", file=sys.stderr)
            return 1
        print(f"✓ {n} chunks indexed.")
        return 0
    if cmd == "remove":
        if len(args) < 2:
            print("usage: pw library remove <path>")
            return 2
        print(f"✓ removed {lib.remove(args[1])} chunks.")
        return 0
    if cmd == "clear":
        lib.clear()
        print("✓ library cleared.")
        return 0
    print(f"unknown: pw library {cmd} (add | list | remove | clear)")
    return 2
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
# Script entry: the CLI's return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|