passiveworkers 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- council/__init__.py +1 -0
- council/artifacts.py +161 -0
- council/batch.py +84 -0
- council/cli.py +54 -0
- council/coordinator.py +133 -0
- council/crypto.py +133 -0
- council/fidelity.py +197 -0
- council/judge.py +393 -0
- council/ledger.py +230 -0
- council/library.py +431 -0
- council/local.py +228 -0
- council/mcp_server.py +87 -0
- council/net/__init__.py +1 -0
- council/net/agent.py +231 -0
- council/net/app.py +390 -0
- council/net/baseline.py +86 -0
- council/net/config.py +79 -0
- council/net/coordinator_app.py +370 -0
- council/net/dashboard.py +111 -0
- council/net/store.py +964 -0
- council/net/submit.py +102 -0
- council/operator.py +412 -0
- council/research.py +520 -0
- council/researcher.py +300 -0
- council/retrieval.py +80 -0
- council/run_demo.py +175 -0
- council/sanitize.py +78 -0
- council/serve.py +183 -0
- council/trust.py +168 -0
- council/worker.py +123 -0
- passiveworkers-0.1.0.dist-info/METADATA +269 -0
- passiveworkers-0.1.0.dist-info/RECORD +36 -0
- passiveworkers-0.1.0.dist-info/WHEEL +5 -0
- passiveworkers-0.1.0.dist-info/entry_points.txt +2 -0
- passiveworkers-0.1.0.dist-info/licenses/LICENSE +21 -0
- passiveworkers-0.1.0.dist-info/top_level.txt +1 -0
council/research.py
ADDED
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
council/research.py — per-node, egress-localized web research (M4)
|
|
4
|
+
=================================================================
|
|
5
|
+
A worker's OWN agent researches the live web FROM ITS OWN egress and returns its OWN
|
|
6
|
+
findings (titles + snippets + sources) as a context string. It NEVER proxies someone
|
|
7
|
+
else's traffic — the worker model reads these findings and writes its own answer
|
|
8
|
+
(the legal bright line; see council/worker.py and docs/DECISIONS D4).
|
|
9
|
+
|
|
10
|
+
Why this is the moat: DuckDuckGo/metasearch localize on the *egress IP* and then discard
|
|
11
|
+
it. So the Helsinki VPS and the Gulf Mac get genuinely different result sets from their own
|
|
12
|
+
egress — diversity no central API can replicate. The lever: leave region at world
|
|
13
|
+
(`wt-wt`) and let egress drive locale; never force a region.
|
|
14
|
+
|
|
15
|
+
Config (per node, via env):
|
|
16
|
+
PW_WEB_BACKEND off (default) | ddgs | searxng
|
|
17
|
+
PW_SEARXNG_URL e.g. http://127.0.0.1:8080 (only used for searxng)
|
|
18
|
+
PW_WEB_RESULTS max results (default 5) · PW_WEB_TIMEOUT seconds (default 8)
|
|
19
|
+
|
|
20
|
+
Wire-in: council/net/agent.py passes search() as PerspectiveWorker(web_search=…) when
|
|
21
|
+
PW_WEB_BACKEND != off. Best-effort: returns "" on any failure (never blocks the answer).
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import ipaddress
|
|
27
|
+
import os
|
|
28
|
+
import re
|
|
29
|
+
import socket
|
|
30
|
+
import time
|
|
31
|
+
from functools import lru_cache
|
|
32
|
+
from urllib.parse import urlparse
|
|
33
|
+
|
|
34
|
+
def _env_number(name: str, default: str, cast):
    """Read a positive number from env var `name`, falling back to `default`.

    `cast` is `int` or `float`. A malformed or non-positive value falls back to
    `default` instead of raising, so a typo in the node's environment cannot
    break importing this module (the module is documented as best-effort).
    """
    try:
        value = cast(os.environ.get(name, default))
    except (TypeError, ValueError):
        return cast(default)
    # A zero/negative timeout or result count is never meaningful here.
    return value if value > 0 else cast(default)


_TIMEOUT = _env_number("PW_WEB_TIMEOUT", "8", float)
_MAX_RESULTS = _env_number("PW_WEB_RESULTS", "5", int)
|
|
36
|
+
def _backend() -> str:
    """Return the effective web backend name for this call.

    PW_WEB_BACKEND is read at call time (not import time) because callers may
    enable the web after this module has been imported. When the configured
    backend is "ddgs" and the SearXNG probe succeeds, "searxng" is returned
    instead; any other configured value is returned unchanged.
    """
    configured = os.environ.get("PW_WEB_BACKEND", "off")
    if configured != "ddgs":
        return configured
    return "searxng" if _searxng_alive() else configured
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@lru_cache(maxsize=1)
def _searxng_alive() -> bool:
    """Probe the configured (or default loopback) SearXNG instance once.

    The result is memoized by lru_cache, so the probe runs at most once per
    process. On success PW_SEARXNG_URL is set (only if not already set) to the
    probed base URL. Any exception during the probe counts as "not alive".
    """
    base = os.environ.get("PW_SEARXNG_URL") or "http://127.0.0.1:8080"
    try:
        import requests
        probe = requests.get(
            f"{base.rstrip('/')}/search",
            params={"q": "ping", "format": "json"},
            headers={"User-Agent": _UA},
            timeout=2,
        )
        if not probe.ok:
            return False
        os.environ.setdefault("PW_SEARXNG_URL", base)
        return True
    except Exception:
        return False
|
|
61
|
+
_SEARX = os.environ.get("PW_SEARXNG_URL", "")
|
|
62
|
+
_UA = "PassiveWorkers-Research/0.1 (mutual-aid council; egress-localized)"
|
|
63
|
+
|
|
64
|
+
# Source curation: hosts that are video/social/link-farm — fine for browsing, weak as
|
|
65
|
+
# research citations. Suffix-matched so subdomains are covered.
|
|
66
|
+
_LOW_QUALITY_HOSTS = (
|
|
67
|
+
"youtube.com", "youtu.be", "tiktok.com", "facebook.com", "instagram.com",
|
|
68
|
+
"pinterest.com", "pinterest.co.uk", "x.com", "twitter.com", "threads.net",
|
|
69
|
+
"quora.com", "slideshare.net", "scribd.com",
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _is_quality_host(host: str) -> bool:
    """Return False when `host` equals, or is a subdomain of, a low-quality host."""
    for blocked in _LOW_QUALITY_HOSTS:
        if host == blocked or host.endswith("." + blocked):
            return False
    return True
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---- SSRF / abuse guard: only public hosts (block loopback/private/link-local/CGNAT/metadata).
|
|
78
|
+
def _host_is_public(host: str) -> bool:
    """Return True only if every address `host` resolves to is publicly routable.

    Rejects loopback, private, link-local, reserved, multicast, unspecified and
    CGNAT (100.64.0.0/10) addresses. Returns False for an empty host, for a
    resolution failure, and when resolution yields no addresses at all.

    NOTE: this validates the addresses seen at check time only; the HTTP client
    resolves the name again when it connects.
    """
    if not host:
        return False
    cgnat = ipaddress.ip_network("100.64.0.0/10")  # built once, not per address
    try:
        resolved = socket.getaddrinfo(host, None)
    except (OSError, ValueError):
        return False
    if not resolved:
        # Nothing to vouch for: fail closed rather than vacuously returning True.
        return False
    for _fam, _t, _p, _c, sockaddr in resolved:
        try:
            ip = ipaddress.ip_address(sockaddr[0])
        except ValueError:
            return False
        # An IPv4-mapped IPv6 address (::ffff:a.b.c.d) can never be "in" an IPv4
        # network, so unwrap it before classifying.
        mapped = getattr(ip, "ipv4_mapped", None)
        if mapped is not None:
            ip = mapped
        if (ip.is_loopback or ip.is_private or ip.is_link_local
                or ip.is_reserved or ip.is_multicast or ip.is_unspecified):
            return False
        if ip in cgnat:
            return False
    return True
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _clean(results: list[dict]) -> str:
    """Format raw search rows into a findings string, one entry per host.

    Rows are skipped when they have no parseable host, repeat a host already
    emitted, resolve to a non-public host, or sit on a low-quality host. Output
    stops after _MAX_RESULTS entries; snippets are cut to 400 characters.
    """
    entries: list[str] = []
    hosts_used: set[str] = set()
    for row in results:
        link = (row.get("href") or row.get("url") or "").strip()
        host = (urlparse(link).hostname or "").lower()
        keep = (bool(host) and host not in hosts_used
                and _host_is_public(host) and _is_quality_host(host))
        if not keep:
            continue
        hosts_used.add(host)
        title = (row.get("title") or "").strip()
        body = (row.get("body") or row.get("content") or "").strip()[:400]
        entries.append(f"- {title} ({host})\n {body}\n source: {link}")
        if len(entries) >= _MAX_RESULTS:
            break
    return "\n".join(entries)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _ddgs(question: str) -> list[dict]:
    """Query DuckDuckGo through the ddgs client, retrying on failure.

    Makes up to 3 attempts with exponential backoff plus random jitter between
    attempts. Region stays at world ("wt-wt"), per the module design, so results
    localize on this node's egress IP.

    Raises the last exception when all attempts fail (callers treat search as
    best-effort), or ImportError when neither client package is installed.
    """
    import random

    try:
        from ddgs import DDGS  # current package name
    except ImportError:
        from duckduckgo_search import DDGS  # older name, same API

    attempts = 3
    last: Exception | None = None
    for attempt in range(attempts):
        try:
            with DDGS(timeout=int(_TIMEOUT)) as ddg:
                return list(ddg.text(question, region="wt-wt", safesearch="moderate",
                                     max_results=_MAX_RESULTS))
        except Exception as e:
            last = e
            # Back off only when another attempt follows; sleeping after the
            # final failure just delays the error. Jitter is random so that
            # retries for the same question do not stay in lockstep.
            if attempt < attempts - 1:
                time.sleep((2 ** attempt) + random.uniform(0.0, 0.6))
    raise last  # type: ignore[misc]
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _searxng(question: str) -> list[dict]:
    """Query a SearXNG instance's JSON API and return its raw result rows.

    The base URL comes from PW_SEARXNG_URL, then the import-time _SEARX value,
    then the loopback default. The instance must be on a public host or on
    127.0.0.1/localhost (a node-local SearXNG is explicitly allowed); otherwise
    [] is returned. Raises on HTTP errors.
    """
    import requests

    base = os.environ.get("PW_SEARXNG_URL") or _SEARX or "http://127.0.0.1:8080"
    hostname = urlparse(base).hostname or ""
    allowed = _host_is_public(hostname) or hostname in ("127.0.0.1", "localhost")
    if not allowed:
        return []
    endpoint = f"{base.rstrip('/')}/search"
    resp = requests.get(
        endpoint,
        params={"q": question, "format": "json", "safesearch": 1},
        headers={"User-Agent": _UA},
        timeout=_TIMEOUT,
    )
    resp.raise_for_status()
    payload = resp.json()
    return payload.get("results", [])
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---- keyless routable engines (engine routing v1: web | academic | encyclopedic) ----
|
|
145
|
+
def _arxiv(question: str, max_results: int = 5) -> list[dict]:
    """Search arXiv's Atom API and return rows shaped {title, href, body}.

    Entries missing a title or an id are dropped. Raises on HTTP or XML errors.
    """
    import requests
    import xml.etree.ElementTree as ET

    resp = requests.get(
        "https://export.arxiv.org/api/query",
        params={"search_query": f"all:{question}", "max_results": max_results,
                "sortBy": "relevance"},
        headers={"User-Agent": _UA},
        timeout=_TIMEOUT + 4,
    )
    resp.raise_for_status()
    atom = {"a": "http://www.w3.org/2005/Atom"}

    def _field(entry, tag: str) -> str:
        # Text of a child element, stripped; "" when the element is absent.
        return (entry.findtext(tag, "", atom) or "").strip()

    found = []
    for entry in ET.fromstring(resp.text).findall("a:entry", atom):
        title = _field(entry, "a:title")
        link = _field(entry, "a:id")
        summary = _field(entry, "a:summary")
        if not (title and link):
            continue
        found.append({"title": title, "href": link, "body": summary})
    return found
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _wikipedia(question: str, max_results: int = 4) -> list[dict]:
    """Full-text search on English Wikipedia; rows shaped {title, href, body}.

    The article URL is percent-encoded so titles containing characters such as
    '?', '#' or '%' still yield a valid link. Snippets have their highlight
    markup stripped and HTML entities decoded. Raises on HTTP errors.
    """
    import html
    from urllib.parse import quote

    import requests

    r = requests.get("https://en.wikipedia.org/w/api.php",
                     params={"action": "query", "list": "search", "srsearch": question,
                             "srlimit": max_results, "format": "json"},
                     headers={"User-Agent": _UA}, timeout=_TIMEOUT)
    r.raise_for_status()
    rows = []
    for hit in r.json().get("query", {}).get("search", []):
        title = hit.get("title", "")
        if not title:
            continue
        # Strip tags first, then decode entities, so literal "&lt;b&gt;" text in
        # a snippet is not mistaken for markup.
        snippet = html.unescape(re.sub(r"<[^>]+>", "", hit.get("snippet", "")))
        rows.append({
            "title": title,
            "href": "https://en.wikipedia.org/wiki/" + quote(title.replace(" ", "_"), safe="/:"),
            "body": snippet,
        })
    return rows
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _wikipedia_fallback(question: str) -> str:
    """Last-resort findings from Wikipedia's opensearch API; "" on any failure.

    Up to 3 suggestions are requested and passed through _clean(), which applies
    the same host filtering and per-host dedupe as ordinary search results.
    """
    try:
        import requests

        resp = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "opensearch", "search": question,
                    "limit": 3, "format": "json"},
            headers={"User-Agent": _UA},
            timeout=_TIMEOUT,
        )
        resp.raise_for_status()
        # opensearch replies with a 4-item array: [query, titles, descriptions, urls]
        _query, titles, descs, urls = resp.json()
        suggestions = [
            {"title": title, "body": desc, "url": link}
            for title, desc, link in zip(titles, descs, urls)
        ]
        return _clean(suggestions)
    except Exception:
        return ""
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@lru_cache(maxsize=256)
def _cached(question: str, _bucket: int, backend: str) -> str:
    """Memoized search: findings text for (question, time bucket, backend).

    `_bucket` is unused in the body; it exists only to vary the cache key so
    entries from an earlier time window are not reused. Falls back to the
    Wikipedia opensearch helper when the backend yields no usable rows.
    """
    fetch = _searxng if backend == "searxng" else _ddgs
    findings = _clean(fetch(question))
    if findings:
        return findings
    return _wikipedia_fallback(question)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def search(question: str) -> str:
    """The web_search hook: (question) -> findings text. Best-effort; '' on any failure.

    Returns '' when the backend is "off" or the question is empty. The question
    is stripped and cut to 300 characters. Results are cached per
    (question, 15-minute bucket, backend).
    """
    # Resolve the backend once so the "off" check and the cache key cannot
    # disagree (search_structured does the same).
    backend = _backend()
    if backend == "off":
        return ""
    q = (question or "").strip()[:300]
    if not q:
        return ""
    try:
        bucket = int(time.time() // 900)  # 15-minute cache window via the lru_cache key
        return _cached(q, bucket, backend)
    except Exception:
        return ""
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# ---- dynamic source routing (R17/D29): pick which keyless engines a query should hit ----
|
|
224
|
+
# academic signals → also query arXiv; definitional/background signals → also query Wikipedia.
|
|
225
|
+
# 'web' (egress-localized meta-search — the moat) is ALWAYS included; the extras AUGMENT it,
|
|
226
|
+
# they never replace it. Pure + env-gated (PW_SOURCE_ROUTING=off → web only).
|
|
227
|
+
_ACADEMIC_RE = re.compile(
|
|
228
|
+
r"\b(arxiv|preprint|peer.?reviewed|research papers?|journal article|study|studies|"
|
|
229
|
+
r"meta.?analysis|clinical trial|systematic review|algorithm|theorem|equation|dataset|"
|
|
230
|
+
r"benchmark|state.of.the.art|sota|neural network|machine learning|deep learning|"
|
|
231
|
+
r"reinforcement learning|transformer|quantum|genomic|proteomic|astrophysics)\b", re.I)
|
|
232
|
+
# 'who/what is/are' only counts at the start (or after sentence punctuation) so a mid-sentence
|
|
233
|
+
# 'what is it like' in prose doesn't over-route; the explicit phrases match anywhere.
|
|
234
|
+
_ENCYCLOPEDIC_RE = re.compile(
|
|
235
|
+
r"(?:^|[.!?]\s+)(?:who|what)\s+(?:is|are|was|were)\b"
|
|
236
|
+
r"|\b(?:definition of|meaning of|history of|biography of|overview of|background on|"
|
|
237
|
+
r"capital of|where is)\b", re.I)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def route_engines(query: str) -> list[str]:
    """Return the ordered list of keyless engines a query should hit.

    'web' is always first. 'academic' (arXiv) and/or 'encyclopedic' (Wikipedia)
    are appended, in that order, when the query matches the corresponding
    routing pattern. PW_SOURCE_ROUTING=off|0|false (case-insensitive) pins the
    result to ['web'].
    """
    routing = os.environ.get("PW_SOURCE_ROUTING", "on").lower()
    if routing in ("off", "0", "false"):
        return ["web"]
    text = query or ""
    extras = (("academic", _ACADEMIC_RE), ("encyclopedic", _ENCYCLOPEDIC_RE))
    return ["web"] + [name for name, pattern in extras if pattern.search(text)]
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def search_structured(query: str, max_results: int = 5, engine: str = "web") -> list[dict]:
    """Structured variant for the researcher: [{title, url, host, snippet}], SSRF-guarded.

    Best-effort: [] when the backend is "off", the query is empty, or the
    engine call fails. `engine`: web (meta-search) | academic (arXiv) |
    encyclopedic (Wikipedia) — keyless.

    Web results are deduped by host. arXiv and Wikipedia rows all share a
    single host, so for those engines rows are deduped by URL instead;
    host-level dedupe would collapse them to one result regardless of
    `max_results`.
    """
    backend = _backend()
    if backend == "off":
        return []
    q = (query or "").strip()[:300]
    if not q:
        return []
    try:
        if engine == "academic":
            rows = _arxiv(q, max_results)
        elif engine == "encyclopedic":
            rows = _wikipedia(q, max_results)
        elif backend == "searxng":
            rows = _searxng(q)
        else:
            rows = _ddgs(q)
    except Exception:
        return []
    from council.sanitize import clean as _sanitize

    single_host_engine = engine in ("academic", "encyclopedic")
    out, seen = [], set()
    for r in rows:
        url = (r.get("href") or r.get("url") or "").strip()
        host = (urlparse(url).hostname or "").lower()
        dedupe_key = url if single_host_engine else host
        if (not host or dedupe_key in seen
                or not _host_is_public(host) or not _is_quality_host(host)):
            continue
        seen.add(dedupe_key)
        out.append({"title": _sanitize((r.get("title") or ""))[:160],
                    "url": url,
                    "host": host,
                    "snippet": _sanitize((r.get("body") or r.get("content") or ""))[:500]})
        if len(out) >= max_results:
            break
    return out
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
# ---- full-page evidence (R5/D17: the leaders draft from pages, not snippets) ----
|
|
296
|
+
_FETCH_CAP = 200_000 # bytes per page — extraction input, not archival
|
|
297
|
+
_HTML_JUNK = re.compile(r"(?is)<(script|style|noscript|svg|header|footer|nav)[^>]*>.*?</\1>")
|
|
298
|
+
_TAGS = re.compile(r"(?s)<[^>]+>")
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _strip_html(html: str) -> str:
    """Regex-based HTML-to-text: drop junk elements, then tags, then collapse whitespace."""
    without_junk = _HTML_JUNK.sub(" ", html)
    without_tags = _TAGS.sub(" ", without_junk)
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _extract_main(html: str) -> tuple[str, str]:
    """Return (main_text, date) extracted from raw HTML.

    Uses trafilatura when it is importable and yields non-empty text; the date
    is whatever its metadata reports ('' when unavailable). In every other case
    (trafilatura missing, raising, or returning nothing) falls back to the
    regex strip with an empty date.
    """
    try:
        import trafilatura

        body = trafilatura.extract(html, include_comments=False, include_tables=True,
                                   favor_precision=True) or ""
        try:
            meta = trafilatura.extract_metadata(html)
            published = (getattr(meta, "date", "") or "") if meta else ""
        except Exception:
            # Metadata is optional; keep the extracted text even if this fails.
            published = ""
        body = body.strip()
        if body:
            return body, published
    except Exception:
        pass
    return _strip_html(html), ""
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
# ---- freshness signals (R18/D30): the council's edge is currency, so lead with recent sources ----
|
|
328
|
+
_MONTHS = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
|
|
329
|
+
"jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}
|
|
330
|
+
_DAY = r"(0?[1-9]|[12]\d|3[01])" # 1-31 only (no 0/32+)
|
|
331
|
+
_ISO_RE = re.compile(r"(20\d{2})[-/](0[1-9]|1[0-2])[-/](0[1-9]|[12]\d|3[01])")
|
|
332
|
+
_URL_YM_RE = re.compile(r"/(20\d{2})/(0[1-9]|1[0-2])(?:/|\b)")
|
|
333
|
+
_URL_Y_RE = re.compile(r"/(20[1-3]\d)(?:/|\b)")
|
|
334
|
+
_MON = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)"
|
|
335
|
+
_MON_DMY = re.compile(rf"(?i)\b{_DAY}\s+{_MON}[a-z]*\.?,?\s+(20\d{{2}})\b")
|
|
336
|
+
_MON_MDY = re.compile(rf"(?i)\b{_MON}[a-z]*\.?\s+{_DAY},?\s+(20\d{{2}})\b")
|
|
337
|
+
_MON_MY = re.compile(rf"(?i)\b{_MON}[a-z]*\.?\s+(20\d{{2}})\b")
|
|
338
|
+
|
|
339
|
+
# does a brief/query actually care about recency? If not, recency reordering is noise (and could
|
|
340
|
+
# bury an authoritative older source under a recent repost), so we leave relevance order alone.
|
|
341
|
+
_TEMPORAL_RE = re.compile(
|
|
342
|
+
r"(?i)\b(current|currently|latest|newest|recent|recently|now|today|as of|up.?to.?date|"
|
|
343
|
+
r"most recent|breaking|so far|to date|right now|when|next|upcoming|deadline|date|"
|
|
344
|
+
r"this (year|month|week)|202\d)\b")
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _valid_ymd(y, mo, d) -> bool:
    """True when (y, mo, d) — ints or numeric strings — form a real calendar date."""
    import datetime

    try:
        datetime.date(int(y), int(mo), int(d))
    except (ValueError, TypeError):
        return False
    return True
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def extract_date_hint(url: str, text: str = "") -> str:
    """Best-effort publication date: 'YYYY-MM-DD', 'YYYY-MM', 'YYYY', or ''.

    Sources are tried in this order and the first hit wins:
      1. a full ISO date in the URL, then in the text;
      2. a month-name full date in the text (day-month-year, then month-day-year);
      3. a /YYYY/MM path segment in the URL;
      4. a month-name + year in the text;
      5. a bare /YYYY path segment in the URL.
    Full dates are accepted only when they are real calendar dates. A bare year
    in free text is never used.
    """
    url = url or ""
    text = text or ""

    def _ymd(year, month, day) -> str:
        # Formatted full date, or "" when the combination is not a real date.
        if not _valid_ymd(year, month, day):
            return ""
        return f"{int(year):04d}-{int(month):02d}-{int(day):02d}"

    def _month_number(name: str) -> int:
        return _MONTHS[name.lower()[:3]]

    # 1) full ISO date — URL first, then text
    for source in (url, text):
        hit = _ISO_RE.search(source)
        if hit:
            full = _ymd(hit.group(1), hit.group(2), hit.group(3))
            if full:
                return full

    # 2) month-name full dates — text only
    hit = _MON_DMY.search(text)
    if hit:
        full = _ymd(hit.group(3), _month_number(hit.group(2)), hit.group(1))
        if full:
            return full
    hit = _MON_MDY.search(text)
    if hit:
        full = _ymd(hit.group(3), _month_number(hit.group(1)), hit.group(2))
        if full:
            return full

    # 3) year/month from the URL path
    hit = _URL_YM_RE.search(url)
    if hit:
        return f"{hit.group(1)}-{hit.group(2)}"

    # 4) month + year in text
    hit = _MON_MY.search(text)
    if hit:
        return f"{hit.group(2)}-{_month_number(hit.group(1)):02d}"

    # 5) bare year, URL path only
    hit = _URL_Y_RE.search(url)
    return hit.group(1) if hit else ""
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def is_time_sensitive(text: str) -> bool:
    """True when `text` contains any recency-intent term matched by _TEMPORAL_RE."""
    return _TEMPORAL_RE.search(text or "") is not None
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
# ---- current-year query injection (R19/D31): the fix recency RANKING can't make ----
|
|
401
|
+
# R18 reorders evidence by date, but you cannot reorder a fresh source that search never
|
|
402
|
+
# returned. On a time-sensitive query a small planner model often OMITS the year (or hallucinates
|
|
403
|
+
# a STALE one), so a meta-search returns the SEO-dominant HISTORICAL page (e.g. the famous 2023
|
|
404
|
+
# FOMC meeting for "current federal funds rate"). Pinning the current year INTO the web query
|
|
405
|
+
# forces the engine to surface this year's results, which R18 then orders freshest-first. WEB
|
|
406
|
+
# ONLY — arXiv sorts by relevance and Wikipedia is full-text, where a bare year pollutes instead.
|
|
407
|
+
#
|
|
408
|
+
# Review R19 hardening:
|
|
409
|
+
# • "Already pinned" must mean a *standalone, plausible* year — never a price ($2000), a fused
|
|
410
|
+
# token (fy2025), or a bare large count (2048); else injection is silently suppressed on
|
|
411
|
+
# exactly the concrete current-fact queries it targets (finding 1).
|
|
412
|
+
# • A RECENTLY-stale year (hallucinated "2023") gets the current year APPENDED alongside it, not
|
|
413
|
+
# no-op'd — so the engine still sees the fresh signal and R18 picks it (finding 2). We APPEND,
|
|
414
|
+
# never string-replace, so a deliberately historical query is never corrupted into nonsense.
|
|
415
|
+
# • Historical / timeless queries are skipped per-query, so a historical sub-query of a
|
|
416
|
+
# time-sensitive brief isn't poisoned with the current year (findings 3/7/9).
|
|
417
|
+
# • Accepted trade-offs (documented, not fixed): a year fused to a word (fy2025) can still get a
|
|
418
|
+
# second year appended (rare; original intent preserved) [finding 5]; a literal year is a
|
|
419
|
+
# soft-AND term, so an evergreen page that omits the year string can rank lower — the page
|
|
420
|
+
# fetch + order_by_recency + the 12-16 evidence cap keep it in play and re-rank by real date
|
|
421
|
+
# [finding 6].
|
|
422
|
+
_QUERY_CAP = 300 # search_structured/search truncate to this; never let the year be the cut tail
|
|
423
|
+
# A *standalone* calendar year — not fused to a word (fy2025), not a currency amount ($2000).
|
|
424
|
+
_STANDALONE_YEAR_RE = re.compile(r"(?<![\w$£€¥])((?:19|20)\d{2})(?![\w%])")
|
|
425
|
+
_STALE_WINDOW = 4 # a past year within N years of today reads as "recently stale" (refresh it);
|
|
426
|
+
# an older standalone year reads as a deliberate historical reference (respect).
|
|
427
|
+
# Query-level historical / timeless intent — pinning the current year would mis-steer it. Keyword
|
|
428
|
+
# based, not exhaustive; deliberately EXCLUDES "what is/are" (too common in legitimate current Qs).
|
|
429
|
+
_HISTORICAL_RE = re.compile(
|
|
430
|
+
r"(?i)(\b(history|historical|historically|origins?|originally|founded|inception|evolution|"
|
|
431
|
+
r"timeline|etymology|biography|retrospective|definition|defined\s+as)\b"
|
|
432
|
+
r"|\bmeaning\s+of\b|\bover\s+the\s+years\b|\bback\s+in\b|\bsince\s+\d{4}\b|\b\d{2,4}0s\b)")
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _year_of(today: str) -> str:
    """First standalone 19xx/20xx year found anywhere in `today`, or ''.

    Uses search rather than an anchored match, so both '2026-06-13' and
    'June 13, 2026' yield '2026'.
    """
    found = _STANDALONE_YEAR_RE.search(today or "")
    if found is None:
        return ""
    return found.group(1)
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def inject_recency(query: str, today: str, time_sensitive: bool = True) -> str:
    """Append the current year to a time-sensitive web query when appropriate.

    Returns the stripped query unchanged when any of these hold:
      * `time_sensitive` is false, or the query is empty;
      * the query matches the historical/timeless pattern;
      * `today` contains no parseable year;
      * the query already names the current year or next year;
      * the query names a year more than _STALE_WINDOW years in the past
        (only years from 1990 up to next year are considered at all);
      * appending would push the query past _QUERY_CAP characters.
    Otherwise returns "<query> <current year>". An existing recent year is kept
    and the current year is appended after it, never substituted.
    """
    q = (query or "").strip()
    if not time_sensitive:
        return q
    if not q:
        return q
    if _HISTORICAL_RE.search(q):
        return q

    year = _year_of(today)
    if not year:
        return q
    cur = int(year)

    # Standalone year tokens in a plausible range; prices, fused tokens and
    # out-of-range numbers are ignored.
    mentioned = []
    for found in _STANDALONE_YEAR_RE.finditer(q):
        value = int(found.group(1))
        if 1990 <= value <= cur + 1:
            mentioned.append(value)

    for value in mentioned:
        if value >= cur:
            return q  # current (or near-future forecast) year already present
    for value in mentioned:
        if value < cur - _STALE_WINDOW:
            return q  # deliberate deep-historical year: leave the query alone

    pinned = f"{q} {year}"
    if len(pinned) > _QUERY_CAP:
        return q
    return pinned
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
# ---- breaking-news auto-deepen (R19/D31): heavier retrieval for fast-moving topics ----
|
|
471
|
+
# A STRICTER subset of recency intent: "happening now" signals where the answer is volatile and
|
|
472
|
+
# SEO favors stale pages, so more queries + more page fetches earn their keep. Deliberately
|
|
473
|
+
# excludes plain "latest"/"current"/"recent" (those are handled by year injection above, which
|
|
474
|
+
# is cheap) so we don't over-deepen — and double the local compute on — every dated query.
|
|
475
|
+
_BREAKING_RE = re.compile(
|
|
476
|
+
r"(?i)\b(breaking|just\s+(announced|released|happened|reported|now)|right\s+now|"
|
|
477
|
+
r"as\s+of\s+(today|now)|today'?s|developing\s+(story|news|situation)|live\s+updates?|"
|
|
478
|
+
r"happening\s+now|this\s+(morning|afternoon|evening))\b")
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def is_breaking(text: str) -> bool:
    """True when `text` carries a strong 'happening now' signal (per _BREAKING_RE).

    Meant as a stricter gate than is_time_sensitive(), used to justify deeper
    retrieval. NOTE(review): this is not a strict subset of is_time_sensitive()
    at the regex level — phrases such as 'this morning' or 'live updates' match
    here but not _TEMPORAL_RE — so callers should not assume one implies the
    other.
    """
    return _BREAKING_RE.search(text or "") is not None
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def _pad_date(d: str) -> str:
    """Pad a partial date with '00' fields so date strings compare chronologically.

    '2026' -> '2026-00-00', '2026-06' -> '2026-06-00'; a full date is returned
    as-is and anything past the third '-'-separated field is dropped.
    """
    pieces = (d or "").split("-")
    pieces += ["00"] * (3 - len(pieces))  # no-op when 3+ fields are present
    return "-".join(pieces[:3])
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def order_by_recency(evidence: list[dict]) -> list[dict]:
    """Return a new list with the most recently dated evidence first.

    Each item's date is its 'date', else its 'date_hint', else a hint sniffed
    from its 'url' and 'snippet'. Undated items sort last. The sort is stable,
    so items with equal keys keep their original relative order.
    """
    def _sort_key(entry: dict) -> str:
        stamp = entry.get("date") or entry.get("date_hint")
        if not stamp:
            stamp = extract_date_hint(entry.get("url", ""), entry.get("snippet", ""))
        # "" sorts below every padded date, which puts undated items last
        # under reverse=True.
        return _pad_date(stamp) if stamp else ""

    ranked = list(evidence)
    ranked.sort(key=_sort_key, reverse=True)
    return ranked
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def fetch_extract(url: str, max_chars: int = 6000, with_date: bool = False):
    """One polite, SSRF-guarded fetch of a PUBLIC http(s) page → sanitized main text.

    Redirects are followed manually (at most 5 hops) and every hop is
    re-checked against the public-host guard; letting the HTTP client follow
    redirects itself would allow a public URL to bounce to a private or
    metadata address unchecked. At most _FETCH_CAP bytes of the body are read
    and decoded as UTF-8 with replacement. The response is always closed.

    Returns the sanitized text cut to `max_chars`, or (text, date) when
    `with_date` is true.

    Raises ValueError for a non-http(s) or non-public URL (including a redirect
    target), requests.TooManyRedirects after too many hops, and the usual
    requests exceptions on network/HTTP failure — callers treat page evidence
    as best-effort.

    NOTE: the host check and the actual connection resolve DNS separately, so
    this does not defend against DNS rebinding.
    """
    import requests
    from urllib.parse import urljoin

    from council.sanitize import clean

    max_hops = 5
    target = url
    raw = None
    for _hop in range(max_hops + 1):
        host = (urlparse(target).hostname or "").lower()
        if not target.startswith(("http://", "https://")) or not _host_is_public(host):
            raise ValueError(f"not a public http(s) URL: {target[:80]}")
        r = requests.get(target, headers={"User-Agent": _UA}, timeout=15,
                         stream=True, allow_redirects=False)
        try:
            if r.is_redirect:
                # Location may be relative; resolve it against the current URL.
                target = urljoin(target, r.headers["location"])
                continue
            r.raise_for_status()
            raw = r.raw.read(_FETCH_CAP, decode_content=True).decode("utf-8", "replace")
            break
        finally:
            # stream=True keeps the connection checked out until closed.
            r.close()
    if raw is None:
        raise requests.TooManyRedirects(f"more than {max_hops} redirects: {url[:80]}")
    text, date = _extract_main(raw)
    text = clean(text)[:max_chars]
    return (text, date) if with_date else text
|