passiveworkers 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
council/ledger.py ADDED
@@ -0,0 +1,230 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ council/ledger.py — Non-transferable mutual-aid credit ledger
4
+ =============================================================
5
+ Implements the give/take economy at the heart of Passive Workers:
6
+
7
+ • Credit is NON-TRANSFERABLE — there is no account-to-account `transfer()`.
8
+ Credit only moves through `settle_job()` or an escrowed offer (`hold()` → `release()`/`refund()`), i.e. as payment for real work.
9
+ • Every account opens with a STARTER_ALLOWANCE so a newcomer can ask for help
10
+ before they have contributed anything (bootstrap).
11
+ • To keep asking once the allowance is spent, you must EARN by helping — a pure
12
+ free-rider depletes to zero and is blocked. This is the "no one only-takes,
13
+ no one only-helps" rule, enforced by balance, not by trust.
14
+ • IDEAS COMPETE: a job's worker pool is split by the judge's quality score, so
15
+ a better answer earns more credit than a worse one for the same question.
16
+ • Every job is CONSERVED: the asker's debit exactly equals the sum credited to
17
+ helpers + judge. No credit is minted or destroyed inside a transaction.
18
+
19
+ This is the open-source, no-token core: money (later) only ever enters or leaves
20
+ at the platform boundary, never as a tradeable instrument between users.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import os
26
+ from dataclasses import dataclass, field
27
+
28
# Identifier of the internal holding account used for assisted offers. It is
# opened with a zero balance, not the starter grant.
ESCROW_ID = "__escrow__"

# Bootstrap grant credited when an account is created. Operators tune it via
# the PW_STARTER_CREDITS environment variable: the default of 100 covers about
# 2–3 council asks before a contribution is required — right for production
# give/take, too tight for trials/demos (the first 10-question trial died at
# question 3 on this).
STARTER_ALLOWANCE = float(os.getenv("PW_STARTER_CREDITS", "100"))
34
+
35
+
36
class InsufficientCredit(Exception):
    """Signal that an asker's balance cannot cover what they are trying to spend.

    The remedy is to earn credit by contributing before asking again.
    """
38
+
39
+
40
@dataclass
class Account:
    """One user's ledger entry: current balance plus lifetime give/take counters."""

    user_id: str
    balance: float = STARTER_ALLOWANCE  # a new account starts with the bootstrap grant
    lifetime_earned: float = 0.0  # credits earned by HELPING (as worker or judge)
    lifetime_spent: float = 0.0  # credits spent by ASKING
    jobs_helped: int = 0
    jobs_asked: int = 0
    quality_sum: float = 0.0  # running total of judge scores this owner's answers received
    quality_n: int = 0  # how many judged answers make up quality_sum

    @property
    def net_contribution(self) -> float:
        """Earned minus spent: positive for a net giver, negative for a net taker.

        The starter grant is in neither total, so it is excluded.
        """
        return self.lifetime_earned - self.lifetime_spent

    @property
    def avg_quality(self) -> float:
        """Mean judge score, rounded to two decimals; 0.0 when nothing was judged."""
        if not self.quality_n:
            return 0.0
        return round(self.quality_sum / self.quality_n, 2)
60
+
61
+
62
@dataclass
class Receipt:
    """Immutable-by-convention record describing one settled job."""

    job_id: str
    asker_id: str
    total_cost: float  # amount debited from the asker
    payouts: dict[str, float]  # helper_id -> credit earned
    judge_id: str
    judge_fee: float
    asker_balance_after: float  # asker's balance once the debit was applied
71
+
72
+
73
@dataclass
class Ledger:
    """In-memory, non-transferable credit ledger.

    Credit enters only as starter grants (`open_account`) and moves only through
    `settle_job` or the escrow trio `hold` / `release` / `refund`.

    Every mutating method validates its amounts and resolves every account it
    will touch BEFORE changing any balance, so a call that raises leaves the
    ledger exactly as it was. Amounts are kept at 4 decimal places.
    """

    accounts: "dict[str, Account]" = field(default_factory=dict)
    _granted_total: float = 0.0  # sum of all starter grants (for accounting)
    _job_count: int = 0

    @staticmethod
    def _checked_amount(label: str, value: float) -> float:
        """Round `value` to 4 decimals and reject anything that is not >= 0.

        Raises:
            ValueError: for negative amounts and for NaN. A negative amount would
                silently reverse the direction of a transfer.
        """
        amount = round(value, 4)
        if not (amount >= 0):  # NaN fails every comparison, so it is rejected too
            raise ValueError(f"{label} must be a non-negative number, got {value!r}")
        return amount

    # ------------------------------------------------------------------ accounts
    def open_account(self, user_id: str) -> "Account":
        """Return the account for `user_id`, creating it with the starter grant if new.

        Re-opening an existing account is a no-op and grants nothing further.
        """
        if user_id not in self.accounts:
            self.accounts[user_id] = Account(user_id=user_id)
            self._granted_total += STARTER_ALLOWANCE
        return self.accounts[user_id]

    def get(self, user_id: str) -> "Account":
        """Return the account for `user_id` (KeyError if it was never opened)."""
        return self.accounts[user_id]

    def balance(self, user_id: str) -> float:
        """Return the current balance of `user_id` (KeyError if unknown)."""
        return self.accounts[user_id].balance

    def can_afford(self, user_id: str, amount: float) -> bool:
        """Return True when `user_id` holds at least `amount` (KeyError if unknown)."""
        return self.accounts[user_id].balance >= amount

    # ------------------------------------------------------------------ settlement
    def quote(self, worker_pool: float, judge_fee: float) -> float:
        """Total a job will cost the asker (worker pool + judge fee)."""
        return round(worker_pool + judge_fee, 4)

    def settle_job(
        self,
        job_id: str,
        asker_id: str,
        score_by_worker: dict[str, float],
        worker_pool: float,
        judge_id: str,
        judge_fee: float,
    ) -> "Receipt":
        """Debit the asker and pay the helpers and the judge for one job.

        The asker pays `worker_pool + judge_fee`. `worker_pool` is split among the
        helpers in proportion to their (non-negative) judge score; if every score
        is <= 0 it is split evenly. The judge receives `judge_fee`.
        Conserved: total debit == total credit.

        If `score_by_worker` is empty there is nobody to pay, so the worker pool
        is not charged at all (only the judge fee is); charging it would destroy
        credit.

        Raises:
            ValueError: if `worker_pool` or `judge_fee` is negative or NaN.
            KeyError: if the asker, the judge or any helper has no account.
            InsufficientCredit: if the asker's balance is below the total cost.
        In every failure case no balance has been modified.
        """
        # Normalise to ledger precision first so the debit (a rounded sum) and the
        # credits (rounded parts) can never differ in the fifth decimal.
        worker_pool = self._checked_amount("worker_pool", worker_pool)
        judge_fee = self._checked_amount("judge_fee", judge_fee)

        # Resolve every participant BEFORE mutating anything: a KeyError after the
        # asker was debited would burn credit.
        asker = self.accounts[asker_id]
        judge = self.accounts[judge_id]
        items = list(score_by_worker.items())
        helper_accounts = {w: self.accounts[w] for w, _ in items}

        if not items:
            worker_pool = 0.0  # nobody to pay → nothing to charge for the pool

        total_cost = self.quote(worker_pool, judge_fee)
        if asker.balance < total_cost:
            raise InsufficientCredit(
                f"{asker_id} has {asker.balance:.1f} credits but the job costs "
                f"{total_cost:.1f}. They must HELP on some jobs before asking again."
            )

        # Score-weighted split of the worker pool (better answers earn more).
        # CRITICAL: the split must sum EXACTLY to `worker_pool`, or rounding
        # mints/burns credit. Every share but the last is rounded; the last payee
        # receives the exact remainder.
        score_sum = sum(max(0.0, s) for _, s in items)
        if score_sum > 0:
            weights = [max(0.0, s) / score_sum for _, s in items]
        else:  # degenerate: split evenly
            weights = [1.0 / max(1, len(items))] * len(items)

        payouts: dict[str, float] = {}
        allocated = 0.0
        last = len(items) - 1
        for i, (w, _) in enumerate(items):
            if i < last:
                share = round(worker_pool * weights[i], 4)
                payouts[w] = share
                allocated = round(allocated + share, 4)
            else:  # last payee absorbs the remainder → exact sum
                payouts[w] = round(worker_pool - allocated, 4)

        # Apply debit then credits (conserved). Nothing below can raise.
        asker.balance = round(asker.balance - total_cost, 4)
        asker.lifetime_spent = round(asker.lifetime_spent + total_cost, 4)
        asker.jobs_asked += 1

        for w, credit in payouts.items():
            acct = helper_accounts[w]
            acct.balance = round(acct.balance + credit, 4)
            acct.lifetime_earned = round(acct.lifetime_earned + credit, 4)
            acct.jobs_helped += 1

        judge.balance = round(judge.balance + judge_fee, 4)
        judge.lifetime_earned = round(judge.lifetime_earned + judge_fee, 4)
        if judge_id not in payouts:  # don't double-count help if judge is also a payee
            judge.jobs_helped += 1

        self._job_count += 1
        return Receipt(
            job_id=job_id,
            asker_id=asker_id,
            total_cost=total_cost,
            payouts=payouts,
            judge_id=judge_id,
            judge_fee=judge_fee,
            asker_balance_after=asker.balance,
        )

    # ------------------------------------------------------------------ escrow (assisted, D21)
    # Internal holding account for assisted offers: the asker's reward is HELD at
    # offer creation so it can't be spent elsewhere before the operator delivers,
    # and is released to the operator on delivery (or refunded on expiry). Opened
    # WITHOUT a starter grant — it only ever holds credit moved from real accounts,
    # so conservation is preserved.
    def _escrow(self) -> "Account":
        """Return the escrow account, creating it with a zero balance on first use."""
        if ESCROW_ID not in self.accounts:
            # balance 0 (NOT the starter grant) and not counted in _granted_total
            self.accounts[ESCROW_ID] = Account(user_id=ESCROW_ID, balance=0.0)
        return self.accounts[ESCROW_ID]

    def hold(self, asker_id: str, amount: float) -> None:
        """Move `amount` from the asker into escrow.

        Raises:
            ValueError: if `amount` is negative or NaN.
            KeyError: if the asker has no account.
            InsufficientCredit: if the asker cannot afford the hold.
        """
        amount = self._checked_amount("amount", amount)
        a = self.accounts[asker_id]
        if a.balance < amount:
            raise InsufficientCredit(
                f"{asker_id} has {a.balance:.1f} but the offer holds {amount:.1f}. "
                "Help on a job first.")
        escrow = self._escrow()
        a.balance = round(a.balance - amount, 4)
        a.lifetime_spent = round(a.lifetime_spent + amount, 4)
        escrow.balance = round(escrow.balance + amount, 4)

    def _take_from_escrow(self, amount: float) -> None:
        """Debit `amount` from escrow, refusing to overdraw it.

        An overdrawn escrow would hand out credit nobody deposited while the
        ledger total still looked conserved (the negative escrow balance hides it).
        """
        escrow = self._escrow()
        if escrow.balance < amount:
            raise ValueError(
                f"escrow holds {escrow.balance:.4f} but {amount:.4f} was requested")
        escrow.balance = round(escrow.balance - amount, 4)

    def release(self, to_id: str, amount: float) -> None:
        """Pay an escrow hold out to the operator (on delivery).

        Raises:
            ValueError: if `amount` is negative/NaN or exceeds the escrow balance.
            KeyError: if `to_id` has no account (escrow is left untouched).
        """
        amount = self._checked_amount("amount", amount)
        o = self.accounts[to_id]  # resolve the payee before touching escrow
        self._take_from_escrow(amount)
        o.balance = round(o.balance + amount, 4)
        o.lifetime_earned = round(o.lifetime_earned + amount, 4)

    def refund(self, asker_id: str, amount: float) -> None:
        """Return an escrow hold to the asker (on offer expiry / failure).

        Also reverses the `lifetime_spent` increase that `hold` recorded.

        Raises:
            ValueError: if `amount` is negative/NaN or exceeds the escrow balance.
            KeyError: if `asker_id` has no account (escrow is left untouched).
        """
        amount = self._checked_amount("amount", amount)
        a = self.accounts[asker_id]  # resolve the payee before touching escrow
        self._take_from_escrow(amount)
        a.balance = round(a.balance + amount, 4)
        a.lifetime_spent = round(a.lifetime_spent - amount, 4)

    # ------------------------------------------------------------------ integrity
    def total_credit(self) -> float:
        """Sum of every account balance (escrow included), rounded to 4 decimals."""
        return round(sum(a.balance for a in self.accounts.values()), 4)

    def conservation_ok(self) -> bool:
        """Total credit in circulation must equal the sum of starter grants."""
        return abs(self.total_credit() - self._granted_total) < 1e-6

    def summary(self) -> str:
        """Return a plain-text table of all accounts followed by the ledger totals."""
        lines = [f"{'user':<16}{'balance':>10}{'earned':>10}{'spent':>10}{'net':>8}{'help':>6}{'ask':>5}"]
        for a in self.accounts.values():
            lines.append(
                f"{a.user_id:<16}{a.balance:>10.1f}{a.lifetime_earned:>10.1f}"
                f"{a.lifetime_spent:>10.1f}{a.net_contribution:>8.1f}{a.jobs_helped:>6}{a.jobs_asked:>5}"
            )
        lines.append(
            f"\n total credit = {self.total_credit():.1f} | granted = {self._granted_total:.1f} | "
            f"conserved = {self.conservation_ok()} | jobs = {self._job_count}"
        )
        return "\n".join(lines)
council/library.py ADDED
@@ -0,0 +1,431 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ council/library.py — your private document library (local RAG, D19)
4
+ ====================================================================
5
+ Index your own files once; the research engine then draws on them ALONGSIDE the live web.
6
+ Everything is local and keyless: text is chunked, embedded with Ollama `nomic-embed-text`,
7
+ and stored in SQLite at ~/.passiveworkers/library.db. Nothing ever leaves your machine —
8
+ no cloud, no account, no telemetry. Retrieval is hybrid — numpy cosine over the stored matrix plus BM25
9
+ (no heavy vector DB), in keeping with the project's lean, auditable ethos.
10
+
11
+ CLI:
12
+ pw library add <path|dir> index a file or a whole directory
13
+ pw library list what's indexed
14
+ pw library remove <path> drop a document
15
+ pw library clear wipe the library
16
+
17
+ Used by council/researcher.py (the `[L#]` local citations) and the MCP server.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import hashlib
23
+ import json
24
+ import os
25
+ import pathlib
26
+ import re
27
+ import sqlite3
28
+ import sys
29
+
30
+ import requests
31
+
32
# Endpoint and model of the local Ollama server used for embeddings.
OLLAMA = os.getenv("PW_OLLAMA_BASE", "http://localhost:11434")
EMBED_MODEL = os.getenv("PW_EMBED_MODEL", "nomic-embed-text")

# Where the SQLite library lives (overridable with PW_LIBRARY_DIR).
LIB_DIR = pathlib.Path(
    os.getenv("PW_LIBRARY_DIR", str(pathlib.Path.home() / ".passiveworkers"))
)
LIB_DB = LIB_DIR / "library.db"

_CHUNK_CHARS = 2000  # ~512 tokens — the recall sweet spot (R8 research)
_OVERLAP = 256  # light boundary insurance; parent-window supplies real context

# Opt-in feature flags: any value other than "", "0" or "false" switches them on.
_CONTEXTUAL = os.getenv("PW_CONTEXTUAL_CHUNKS", "") not in ("", "0", "false")
_CTX_MODEL = os.getenv("PW_CONTEXT_MODEL", "")  # small chat model for situating blurbs
_RERANK = os.getenv("PW_RERANK", "") not in ("", "0", "false")  # opt-in listwise rerank

_TEXT_EXT = {".txt", ".md", ".markdown", ".rst", ".csv", ".log"}

# Ingest guards (resource exhaustion): per-file size, file count, total bytes per add().
_MAX_FILE_BYTES = int(os.getenv("PW_MAX_FILE_MB", "30")) * 1_000_000
_MAX_FILES = int(os.getenv("PW_MAX_FILES", "500"))
_MAX_TOTAL_BYTES = int(os.getenv("PW_MAX_TOTAL_MB", "300")) * 1_000_000

# Path confinement: indexing is restricted to these roots (default: your home
# dir). This stops an MCP-connected agent (or a stray path) from indexing
# system / other-user files. Narrow it to e.g. ~/Documents by setting
# PW_LIBRARY_ROOTS (entries separated by os.pathsep; blank entries are ignored).
_ROOTS = [
    pathlib.Path(p).expanduser().resolve()
    for p in os.getenv("PW_LIBRARY_ROOTS", str(pathlib.Path.home())).split(os.pathsep)
    if p.strip()
]
53
+
54
+
55
def _within_roots(p: pathlib.Path) -> bool:
    """Report whether `p`, once resolved, equals or lies beneath one of `_ROOTS`.

    A path that cannot be resolved is treated as outside every root.
    """
    try:
        resolved = p.resolve()
    except Exception:
        return False
    for root in _ROOTS:
        if resolved == root or resolved.is_relative_to(root):
            return True
    return False
61
+
62
+
63
+ # ------------------------------------------------------------------ text extraction
64
def _read_text_file(path: pathlib.Path) -> str:
    """Return the file's contents decoded as UTF-8; undecodable bytes become U+FFFD."""
    with path.open(encoding="utf-8", errors="replace") as handle:
        return handle.read()
66
+
67
+
68
def _read_pdf(path: pathlib.Path) -> str:
    """Extract the text of every page of a PDF and join the pages with newlines.

    A page whose `extract_text()` returns a falsy value contributes an empty string.
    """
    from pypdf import PdfReader  # lazy: only needed when a PDF is actually read

    page_texts = []
    for page in PdfReader(str(path)).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
72
+
73
+
74
def _read_docx(path: pathlib.Path) -> str:
    """Return the text of a .docx file: its paragraphs joined with newlines."""
    import docx  # lazy: only needed when a .docx is actually read

    document = docx.Document(str(path))
    return "\n".join([paragraph.text for paragraph in document.paragraphs])
77
+
78
+
79
def extract_text(path: pathlib.Path) -> str:
    """Return the plain text of `path`, choosing a reader from its (case-folded) suffix.

    `.pdf` and `.docx` have dedicated readers; any suffix in `_TEXT_EXT` is read
    as UTF-8 text.

    Raises:
        ValueError: if the suffix is none of the supported types.
    """
    ext = path.suffix.lower()
    reader = {".pdf": _read_pdf, ".docx": _read_docx}.get(ext)
    if reader is None and ext in _TEXT_EXT:
        reader = _read_text_file
    if reader is None:
        raise ValueError(f"unsupported file type: {ext} (supported: pdf, docx, txt/md/csv/…)")
    return reader(path)
88
+
89
+
90
+ def _split_recursive(text: str, sep_idx: int = 0, budget: int | None = None) -> list[str]:
91
+ """Recursively split on a separator hierarchy, keeping semantic units intact and
92
+ never straddling a section boundary. Falls through coarser→finer separators until
93
+ pieces fit `budget` chars. Re-appends the (stripped) separator at flush boundaries so
94
+ sentence punctuation isn't lost from citation quotes."""
95
+ budget = budget or _CHUNK_CHARS
96
+ seps = ["\n\n", "\n", ". ", " ", ""]
97
+ if len(text) <= budget:
98
+ return [text] if text.strip() else []
99
+ if sep_idx >= len(seps):
100
+ return [text[i:i + budget] for i in range(0, len(text), budget)]
101
+ sep = seps[sep_idx]
102
+ parts = text.split(sep) if sep else list(text)
103
+ out, buf = [], ""
104
+ for part in parts:
105
+ piece = (buf + sep + part) if buf else part
106
+ if len(piece) <= budget:
107
+ buf = piece
108
+ else:
109
+ if buf.strip():
110
+ out.append((buf + sep.rstrip()) if sep.strip() else buf)
111
+ buf = part if len(part) <= budget else ""
112
+ if len(part) > budget:
113
+ out.extend(_split_recursive(part, sep_idx + 1, budget))
114
+ if buf.strip():
115
+ out.append(buf)
116
+ return out
117
+
118
+
119
def _chunk(text: str) -> list[str]:
    """Cut a document into overlapping, structure-aware chunks (R8/D20).

    Steps:
      1. Start a new section at every markdown header line (`#` … `######`).
      2. Split each section with `_split_recursive`, using a budget reduced by
         `_OVERLAP` (but never below 400) to leave room for the overlap tail.
      3. Prefix every chunk after the first with the last `_OVERLAP` characters
         of the chunk before it, as light boundary insurance.

    Returns [] for empty or whitespace-only input.
    """
    if not text or not text.strip():
        return []

    # 1) group lines into sections, breaking before each header line
    header = re.compile(r"^\s{0,3}#{1,6}\s")
    sections: list[str] = []
    pending: list[str] = []
    for line in text.splitlines():
        if pending and header.match(line):
            sections.append("\n".join(pending))
            pending = [line]
        else:
            pending.append(line)
    if pending:
        sections.append("\n".join(pending))

    # 2) size-bounded split inside each section; drop blank pieces
    budget = max(400, _CHUNK_CHARS - _OVERLAP)
    raw: list[str] = []
    for section in sections:
        for piece in _split_recursive(section.strip(), budget=budget):
            piece = piece.strip()
            if piece:
                raw.append(piece)

    # 3) prepend the previous chunk's tail to each following chunk
    if _OVERLAP <= 0 or len(raw) < 2:
        return raw
    overlapped = raw[:1]
    for before, current in zip(raw, raw[1:]):
        tail = before[-_OVERLAP:]
        overlapped.append(f"{tail} {current}" if tail else current)
    return overlapped
149
+
150
+
151
+ # ------------------------------------------------------------------ embeddings
152
def embed(texts: list[str]) -> list[list[float]]:
    """Embed each text through the local Ollama server, one request per text.

    Returns the embeddings in input order. A transport error or a non-success
    HTTP status propagates as the exception `requests` raises.
    """
    vectors = []
    for text in texts:
        payload = {
            "model": EMBED_MODEL,
            "prompt": text,
            # keep the embedder warm between calls (R17)
            "keep_alive": os.environ.get("PW_OLLAMA_KEEP_ALIVE", "30m"),
        }
        response = requests.post(f"{OLLAMA}/api/embeddings", json=payload, timeout=120)
        response.raise_for_status()
        vectors.append(response.json()["embedding"])
    return vectors
163
+
164
+
165
+ # --------------------------------------------- contextual retrieval (Anthropic technique)
166
def _smallest_chat_model() -> str:
    """Name the model to use for cheap contextual blurbs.

    `_CTX_MODEL` wins when set. Otherwise the installed models are listed via
    Ollama's `/api/tags`, those with "embed" in their name are discarded, and
    the one with the smallest reported `size` is returned. Any failure (server
    unreachable, no candidate, unexpected payload) yields "".
    """
    if _CTX_MODEL:
        return _CTX_MODEL
    try:
        listing = requests.get(f"{OLLAMA}/api/tags", timeout=10).json()
        candidates = [
            entry for entry in listing.get("models", [])
            if "embed" not in entry["name"].lower()
        ]
        return min(candidates, key=lambda entry: entry.get("size", 0))["name"]
    except Exception:
        return ""
176
+
177
+
178
def _situate(doc_text: str, chunk: str, model: str) -> str:
    """Ask a local model for a 1-2 sentence blurb placing `chunk` within its document.

    This is the Contextual Retrieval technique: the blurb is meant to be
    prepended to the chunk before embedding and BM25 indexing. Only the first
    8000 characters of the document and 1500 of the chunk are sent, and the
    answer is capped at 400 characters. Best-effort: returns "" when no model is
    given or when anything goes wrong.
    """
    if not model:
        return ""
    try:
        from council.sanitize import spotlight

        # The document is untrusted content, so it is spotlighted: an instruction
        # planted in the file must be treated as data, not as a command.
        excerpt = f"<document>\n{doc_text[:8000]}\n</document>\n<chunk>\n{chunk[:1500]}\n</chunk>"
        body = spotlight(excerpt)
        prompt = (f"{body}\n"
                  "Give a short succinct context (1-2 sentences) to situate this chunk within "
                  "the overall document for improving search retrieval. Answer ONLY with the "
                  "context, nothing else.")
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {"temperature": 0.0, "num_predict": 120},
            "keep_alive": os.environ.get("PW_OLLAMA_KEEP_ALIVE", "30m"),
        }
        response = requests.post(f"{OLLAMA}/api/generate", json=payload, timeout=120)
        response.raise_for_status()
        answer = response.json().get("response") or ""
        return answer.strip()[:400]
    except Exception:
        return ""
201
+
202
+
203
+ # ------------------------------------------------------------------ store
204
class Library:
    """SQLite-backed store of document chunks with hybrid (dense + BM25) retrieval.

    Table `chunks` holds one row per chunk (source path, title, ordinal, text,
    optional context blurb, JSON-encoded embedding). Table `filemeta` holds the
    SHA-256 of each indexed file so unchanged files can be skipped.
    """

    def __init__(self, db_path: pathlib.Path = LIB_DB):
        """Open the database at `db_path` and create/migrate the schema if needed."""
        LIB_DIR.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(str(db_path))
        self.conn.row_factory = sqlite3.Row
        self._corpus_key = None
        self._corpus_cache = None
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS chunks("
            "id INTEGER PRIMARY KEY, source TEXT, title TEXT, ord INT, text TEXT, vec TEXT)")
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS filemeta(source TEXT PRIMARY KEY, fhash TEXT)")
        cols = {r["name"] for r in self.conn.execute("PRAGMA table_info(chunks)")}
        if "context" not in cols:  # contextual-retrieval blurb (R8); display still uses `text`
            self.conn.execute("ALTER TABLE chunks ADD COLUMN context TEXT")
        self.conn.commit()

    def add(self, path: str) -> int:
        """Index a file or directory (confined to PW_LIBRARY_ROOTS). Returns chunks added.

        For a directory, every supported file beneath it is indexed in sorted
        order until the file-count or total-byte cap is reached. Symlinks and
        oversized files are skipped, and a file that fails to index is reported
        and skipped rather than aborting the run.

        Raises:
            ValueError: if `path` resolves outside the allowed roots. For a
                single file, any error from indexing it also propagates.
        """
        p = pathlib.Path(path).expanduser().resolve()
        if not _within_roots(p):
            raise ValueError(f"path outside allowed roots ({', '.join(map(str, _ROOTS))}); "
                             "set PW_LIBRARY_ROOTS to widen")
        if not p.is_dir():
            return self._add_file(p)

        indexable = _TEXT_EXT | {".pdf", ".docx"}
        total = files = tbytes = 0
        for f in sorted(p.rglob("*")):
            if files >= _MAX_FILES or tbytes >= _MAX_TOTAL_BYTES:
                print(f" stop: ingest cap reached ({_MAX_FILES} files / "
                      f"{_MAX_TOTAL_BYTES//1_000_000} MB)", flush=True)
                break
            if f.is_symlink():  # don't follow symlinks out of the tree
                continue
            if not (f.is_file() and f.suffix.lower() in indexable and _within_roots(f)):
                continue
            try:
                sz = f.stat().st_size
                if sz > _MAX_FILE_BYTES:
                    print(f" skip {f.name}: too large ({sz//1_000_000} MB)", flush=True)
                    continue
                added = self._add_file(f)
                if added:  # only files that produced chunks count towards the caps
                    files += 1
                    tbytes += sz
                    total += added
            except Exception as e:
                print(f" skip {f.name}: {e}", flush=True)
        return total

    def _add_file(self, p: pathlib.Path) -> int:
        """Index one file, replacing any chunks previously stored for it.

        A file whose SHA-256 matches the recorded one is skipped (unless the
        library is empty). All slow, failure-prone work — text extraction,
        optional blurbs, embedding — happens BEFORE the database is touched, and
        the replace itself runs in a single transaction, so a failed re-index
        leaves the previously indexed version intact.

        Returns:
            The number of chunks stored (0 when skipped or when the file has no
            indexable text).

        Raises:
            ValueError: path outside the allowed roots, file too large, or
                unsupported file type. Errors from reading or embedding propagate.
        """
        if not _within_roots(p):
            raise ValueError("path outside allowed roots")
        if p.is_file() and p.stat().st_size > _MAX_FILE_BYTES:
            raise ValueError(f"file too large (> {_MAX_FILE_BYTES//1_000_000} MB)")
        src = str(p)
        fhash = hashlib.sha256(p.read_bytes()).hexdigest()
        prev = self.conn.execute("SELECT fhash FROM filemeta WHERE source=?", (src,)).fetchone()
        if prev and prev["fhash"] == fhash and not self.is_empty():
            print(f" unchanged {p.name} — skipped (incremental)", flush=True)
            return 0
        doc_text = extract_text(p)
        chunks = _chunk(doc_text)
        if not chunks:
            # The file no longer has indexable text: drop what an earlier version
            # left behind so searches stop citing content that is gone. The hash
            # record goes too, so a later revert to the old content is re-indexed.
            with self.conn:
                self.conn.execute("DELETE FROM chunks WHERE source=?", (src,))
                self.conn.execute("DELETE FROM filemeta WHERE source=?", (src,))
            return 0
        title = p.name
        # Contextual Retrieval: always prepend the title (free baseline); optionally an
        # LLM situating blurb (flag-gated for speed). Embed/BM25 use context+text; the
        # displayed [L#] quote uses `text` only — the blurb never leaks into citations.
        ctx_model = _smallest_chat_model() if _CONTEXTUAL else ""
        contexts, augmented = [], []
        for c in chunks:
            blurb = _situate(doc_text, c, ctx_model) if ctx_model else ""
            ctx = (f"{title}. {blurb}".strip()) if blurb else title
            contexts.append(ctx)
            augmented.append(f"{ctx}\n{c}")
        # Embed first: if the embedder is down this raises before anything was deleted.
        vecs = embed(augmented)
        with self.conn:  # one transaction: commits on success, rolls back on error
            self.conn.execute("DELETE FROM chunks WHERE source=?", (src,))  # re-index = replace
            self.conn.executemany(
                "INSERT INTO chunks(source,title,ord,text,context,vec) VALUES(?,?,?,?,?,?)",
                [(src, title, i, c, ctx, json.dumps(v))
                 for i, (c, ctx, v) in enumerate(zip(chunks, contexts, vecs))])
            self.conn.execute(
                "INSERT OR REPLACE INTO filemeta(source,fhash) VALUES(?,?)", (src, fhash))
        tag = " (contextual)" if ctx_model else ""
        print(f" indexed {title}: {len(chunks)} chunks{tag}", flush=True)
        return len(chunks)

    def remove(self, path: str) -> int:
        """Drop every chunk (and the hash record) of `path`; return the chunk count removed."""
        src = str(pathlib.Path(path).expanduser().resolve())
        cur = self.conn.execute("DELETE FROM chunks WHERE source=?", (src,))
        self.conn.execute("DELETE FROM filemeta WHERE source=?", (src,))
        self.conn.commit()
        return cur.rowcount

    def clear(self) -> None:
        """Delete every chunk and every hash record."""
        self.conn.execute("DELETE FROM chunks")
        self.conn.execute("DELETE FROM filemeta")
        self.conn.commit()

    def sources(self) -> list[dict]:
        """Return one dict per indexed source (`source`, `title`, chunk count `n`), by title."""
        return [dict(r) for r in self.conn.execute(
            "SELECT source, title, COUNT(*) n FROM chunks GROUP BY source ORDER BY title")]

    def is_empty(self) -> bool:
        """Return True when no chunk is stored at all."""
        return self.conn.execute("SELECT 1 FROM chunks LIMIT 1").fetchone() is None

    def _corpus(self):
        """Load rows + build the normalized embedding matrix and BM25 index ONCE.

        The result is cached on a cheap fingerprint so repeated searches (e.g.
        3 analysts/run, or the serve/MCP loop) don't rebuild the index each call.
        The fingerprint is (chunk count, max id, this connection's change
        counter). Count and max id alone are not enough: SQLite reuses rowids of
        deleted rows, so re-indexing a document into the same number of chunks
        can leave both unchanged while every row's content is different.

        Returns:
            `(rows, norm, bm25)`: the chunk rows, the row-normalized float32
            embedding matrix, and the BM25 index over context + text.
        """
        import numpy as np
        from council.retrieval import BM25Okapi, tokenize
        fp = self.conn.execute("SELECT COUNT(*), COALESCE(MAX(id),0) FROM chunks").fetchone()
        key = (fp[0], fp[1], self.conn.total_changes)
        if getattr(self, "_corpus_key", None) == key and self._corpus_cache is not None:
            return self._corpus_cache
        rows = list(self.conn.execute("SELECT source,title,ord,text,context,vec FROM chunks"))
        if rows and any(r["context"] is None for r in rows) and any(r["context"] for r in rows):
            print(" note: library mixes pre/post-contextual rows — `pw library clear` then "
                  "re-add for a consistent index", flush=True)
        mat = np.asarray([json.loads(r["vec"]) for r in rows], dtype="float32")
        norm = mat / (np.linalg.norm(mat, axis=1, keepdims=True) + 1e-9)  # epsilon avoids /0
        bm25 = BM25Okapi([tokenize((r["context"] or "") + " " + r["text"]) for r in rows])
        self._corpus_key, self._corpus_cache = key, (rows, norm, bm25)
        return self._corpus_cache

    def search(self, query: str, k: int = 5, window: bool = True) -> list[dict]:
        """Hybrid retrieval (R8/D20): dense cosine ⊕ BM25 lexical, fused by Reciprocal Rank
        Fusion, then small-to-big parent-window expansion.

        Catches both semantic matches and exact terms (names/codes/numbers) dense
        embeddings miss.

        Args:
            query: the search text.
            k: maximum number of hits; a value <= 0 yields [].
            window: when True, each hit's text is expanded with its neighbor chunks.

        Returns:
            Up to `k` hit dicts (`source`, `title`, `ord`, `text`, `score`), best
            first; `score` is the dense cosine similarity. [] on an empty library
            or on any failure (deliberately best-effort).
        """
        if k <= 0 or self.is_empty():
            return []
        try:
            import numpy as np
            from council.retrieval import reciprocal_rank_fusion
            rows, norm, bm25 = self._corpus()
            pool = min(50, len(rows))  # candidates per retriever before fusion
            qv = np.asarray(embed([query])[0], dtype="float32")
            qn = qv / (np.linalg.norm(qv) + 1e-9)
            sims = norm @ qn
            dense_rank = list(np.argsort(sims)[::-1][:pool])
            sparse_rank = bm25.top(query, k=pool)
            fused = reciprocal_rank_fusion([list(map(int, dense_rank)), list(map(int, sparse_rank))],
                                           top_k=max(k, 15) if _RERANK else k)
            if _RERANK and len(fused) > k:
                fused = self._rerank(query, rows, fused, k)
            return [self._hit(rows, i, float(sims[i]), window) for i in fused[:k]]
        except Exception:
            return []

    def _rerank(self, query: str, rows, ids: list[int], k: int) -> list[int]:
        """Opt-in (PW_RERANK=1) listwise rerank of the fused candidates.

        One local-model call is asked to order the candidates by relevance.
        Indices the model omits keep their fusion order after the ranked ones;
        out-of-range, non-integer and repeated indices are ignored, so the
        result never contains the same chunk twice. Falls back to the fusion
        order on any failure.

        Returns:
            At most `k` row indices, best first.
        """
        model = _smallest_chat_model()
        if not model:
            return ids[:k]
        try:
            from council.judge import _extract_json
            from council.sanitize import spotlight
            # candidate passages are untrusted document text → spotlight (a planted
            # instruction can at worst reorder, never escape; index-clamp below bounds it)
            cand = spotlight("\n".join(f"[{j}] {rows[i]['text'][:300]}" for j, i in enumerate(ids)))
            prompt = (f"Rank the passages by relevance to the QUERY. Return STRICT JSON: "
                      f'{{"order":[indices best-first]}}.\n\nQUERY: {query}\n\nPASSAGES:\n{cand}\n\nJSON:')
            r = requests.post(f"{OLLAMA}/api/generate",
                              json={"model": model, "prompt": prompt, "stream": False,
                                    "options": {"temperature": 0.0, "num_predict": 120},
                                    "keep_alive": os.environ.get("PW_OLLAMA_KEEP_ALIVE", "30m")},
                              timeout=120)
            r.raise_for_status()
            parsed = _extract_json((r.json().get("response") or "").strip())
            order = parsed.get("order") if isinstance(parsed, dict) else None
            if isinstance(order, list):
                picked: list[int] = []
                seen: set[int] = set()
                for j in order:
                    # bool is a subclass of int: a JSON `true` must not be read as index 1
                    if isinstance(j, bool) or not isinstance(j, int):
                        continue
                    if 0 <= j < len(ids) and ids[j] not in seen:
                        seen.add(ids[j])
                        picked.append(ids[j])
                # append any not mentioned, preserving fusion order
                for i in ids:
                    if i not in seen:
                        seen.add(i)
                        picked.append(i)
                return picked[:k]
        except Exception:
            pass
        return ids[:k]

    def _hit(self, rows, i, score, window: bool) -> dict:
        """Build the result dict for row `i`.

        With `window` set, the text is expanded to the parent window: the
        previous, current and next chunk of the same source (those that exist),
        joined by " […] " and capped at 2500 characters. Display text never
        includes the context blurb.
        """
        r = rows[i]
        text = r["text"]
        if window:
            same = {row["ord"]: row["text"] for row in rows if row["source"] == r["source"]}
            parts = [same[o] for o in (r["ord"] - 1, r["ord"], r["ord"] + 1) if o in same]
            text = " […] ".join(parts)[:2500]
        return {"source": r["source"], "title": r["title"], "ord": r["ord"],
                "text": text, "score": score}
396
+
397
+
398
+ # ------------------------------------------------------------------ CLI
399
def main() -> int:
    """Run the `pw library` command line and return the process exit status.

    Sub-commands: `list` (also the default when no argument is given), `add`,
    `remove` and `clear`. Returns 0 on success and 2 for a usage error or an
    unknown sub-command.
    """
    argv = sys.argv[1:]
    library = Library()
    command = argv[0] if argv else "list"

    if command == "list":
        indexed = library.sources()
        if indexed:
            for entry in indexed:
                print(f" {entry['title']:40s} {entry['n']:4d} chunks {entry['source']}")
            print(f"\n{len(indexed)} document(s) indexed.")
        else:
            print("library empty — add files with: pw library add <path>")
        return 0

    if command == "add":
        if len(argv) < 2:
            print("usage: pw library add <path|dir>")
            return 2
        count = library.add(argv[1])
        print(f"✓ {count} chunks indexed.")
        return 0

    if command == "remove":
        if len(argv) < 2:
            print("usage: pw library remove <path>")
            return 2
        print(f"✓ removed {library.remove(argv[1])} chunks.")
        return 0

    if command == "clear":
        library.clear()
        print("✓ library cleared.")
        return 0

    print(f"unknown: pw library {command} (add | list | remove | clear)")
    return 2
428
+
429
+
430
# Script entry point: the CLI's return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())