passiveworkers 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
council/research.py ADDED
@@ -0,0 +1,520 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ council/research.py — per-node, egress-localized web research (M4)
4
+ =================================================================
5
+ A worker's OWN agent researches the live web FROM ITS OWN egress and returns its OWN
6
+ findings (titles + snippets + sources) as a context string. It NEVER proxies someone
7
+ else's traffic — the worker model reads these findings and writes its own answer
8
+ (the legal bright line; see council/worker.py and docs/DECISIONS D4).
9
+
10
+ Why this is the moat: DuckDuckGo/metasearch localize on the *egress IP* and then discard
11
+ it. So the Helsinki VPS and the Gulf Mac get genuinely different result sets from their own
12
+ egress — diversity no central API can replicate. The lever: leave region at world
13
+ (`wt-wt`) and let egress drive locale; never force a region.
14
+
15
+ Config (per node, via env):
16
+ PW_WEB_BACKEND off (default) | ddgs | searxng
17
+ PW_SEARXNG_URL e.g. http://127.0.0.1:8080 (only used for searxng)
18
+ PW_WEB_RESULTS max results (default 5) · PW_WEB_TIMEOUT seconds (default 8)
19
+
20
+ Wire-in: council/net/agent.py passes search() as PerspectiveWorker(web_search=…) when
21
+ PW_WEB_BACKEND != off. Best-effort: returns "" on any failure (never blocks the answer).
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import ipaddress
27
+ import os
28
+ import re
29
+ import socket
30
+ import time
31
+ from functools import lru_cache
32
+ from urllib.parse import urlparse
33
+
34
def _env_number(name, default, cast):
    """Read a positive, finite numeric setting from the environment.

    Args:
        name: environment variable to read.
        default: value returned when the variable is unset, blank, unparseable,
            non-positive, or not finite.
        cast: ``int`` or ``float`` — the conversion applied to the raw string.

    Research is best-effort, so a typo in a tuning knob (e.g. ``PW_WEB_TIMEOUT=8s``) must not
    raise at import time and take the whole module down with it.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = cast(raw.strip())
    except (TypeError, ValueError):
        return default
    # NaN fails both comparisons; inf and values <= 0 are rejected explicitly.
    return value if 0 < value < float("inf") else default


# Per-request timeout in seconds (PW_WEB_TIMEOUT) and cap on results kept (PW_WEB_RESULTS).
_TIMEOUT = _env_number("PW_WEB_TIMEOUT", 8.0, float)
_MAX_RESULTS = _env_number("PW_WEB_RESULTS", 5, int)
36
def _backend() -> str:
    """Resolve the active search backend name at call time.

    The environment is consulted on every call rather than once at import, because callers
    (e.g. council.local) may switch the web on after this module has been imported. A
    configured ``ddgs`` backend is upgraded to ``searxng`` whenever the SearXNG liveness probe
    succeeds — a local SearXNG is the ecosystem's usual answer to DDG rate limiting and is
    better for privacy. Any other configured value is returned untouched.
    """
    configured = os.environ.get("PW_WEB_BACKEND", "off")
    if configured != "ddgs":
        return configured
    return "searxng" if _searxng_alive() else configured
45
+
46
+
47
@lru_cache(maxsize=1)
def _searxng_alive() -> bool:
    """Probe the SearXNG instance once and remember the answer.

    The base URL comes from PW_SEARXNG_URL, defaulting to http://127.0.0.1:8080. On an OK
    response the URL is recorded in the environment (only if the variable is not already set).
    Because of ``lru_cache`` the first outcome — alive or not — is reused for every later call.
    """
    base = os.environ.get("PW_SEARXNG_URL") or "http://127.0.0.1:8080"
    probe_url = base.rstrip("/") + "/search"
    try:
        import requests
        reply = requests.get(probe_url,
                             params={"q": "ping", "format": "json"},
                             headers={"User-Agent": _UA}, timeout=2)
        alive = bool(reply.ok)
        if alive:
            os.environ.setdefault("PW_SEARXNG_URL", base)
    except Exception:
        # Best-effort probe: missing `requests`, refused connection, timeout… all mean "no".
        return False
    return alive
61
# Import-time snapshot of PW_SEARXNG_URL ('' when unset). _searxng() prefers the live
# environment value and only falls back to this snapshot.
_SEARX = os.environ.get("PW_SEARXNG_URL", "")
# User-Agent header sent with every outbound HTTP request made by this module.
_UA = "PassiveWorkers-Research/0.1 (mutual-aid council; egress-localized)"
63
+
64
+ # Source curation: hosts that are video/social/link-farm — fine for browsing, weak as
65
+ # research citations. Suffix-matched so subdomains are covered.
66
_LOW_QUALITY_HOSTS = (
    "youtube.com", "youtu.be", "tiktok.com", "facebook.com", "instagram.com",
    "pinterest.com", "pinterest.co.uk", "x.com", "twitter.com", "threads.net",
    "quora.com", "slideshare.net", "scribd.com",
)


def _is_quality_host(host: str) -> bool:
    """Return False when `host` is a curated low-quality host or any subdomain of one.

    Matching is on whole DNS labels: "m.youtube.com" is rejected, "notyoutube.com" is not.
    """
    for blocked in _LOW_QUALITY_HOSTS:
        if host == blocked or host.endswith("." + blocked):
            return False
    return True
75
+
76
+
77
+ # ---- SSRF / abuse guard: only public hosts (block loopback/private/link-local/CGNAT/metadata).
78
# Carrier-grade NAT range (RFC 6598): not reported by `is_private`, yet not publicly routable.
_CGNAT_NET = ipaddress.ip_network("100.64.0.0/10")


def _host_is_public(host: str) -> bool:
    """Return True only if EVERY address `host` resolves to is publicly routable.

    Rejects loopback, private, link-local, reserved, multicast, unspecified and CGNAT
    addresses. An empty host, a resolution failure, or an unparseable address yields False
    (fail closed).

    NOTE: this validates the addresses seen at check time; the later HTTP request resolves
    the name again, so it does not by itself defend against DNS rebinding.
    """
    if not host:
        return False
    try:
        for _fam, _type, _proto, _canon, sockaddr in socket.getaddrinfo(host, None):
            # Drop an IPv6 zone id ("fe80::1%eth0") so the address parses uniformly.
            ip = ipaddress.ip_address(str(sockaddr[0]).split("%", 1)[0])
            # Judge an IPv4-mapped IPv6 address (::ffff:a.b.c.d) by the IPv4 address it
            # actually reaches; otherwise the IPv4-only CGNAT test below never sees it.
            mapped = getattr(ip, "ipv4_mapped", None)
            if mapped is not None:
                ip = mapped
            if (ip.is_loopback or ip.is_private or ip.is_link_local
                    or ip.is_reserved or ip.is_multicast or ip.is_unspecified):
                return False
            if ip in _CGNAT_NET:
                return False
        return True
    except (OSError, ValueError):
        return False
92
+
93
+
94
def _clean(results: list[dict]) -> str:
    """Turn raw search rows into the findings text handed to the worker model.

    Each kept row becomes a "- title (host) / body / source: url" entry. Rows are dropped when
    they have no usable URL, repeat an already-seen host, point at a curated low-quality host,
    or resolve to a non-public address. At most `_MAX_RESULTS` entries are returned, joined by
    newlines ('' when nothing survives).
    """
    entries, seen_hosts = [], set()
    for row in results:
        try:
            url = (row.get("href") or row.get("url") or "").strip()
            host = (urlparse(url).hostname or "").lower()
        except (AttributeError, ValueError):
            # A malformed row/URL (e.g. "http://[bad") must not discard the other results.
            continue
        # Cheap string checks first: duplicates and low-quality hosts should never cost a
        # blocking DNS resolution.
        if not host or host in seen_hosts or not _is_quality_host(host):
            continue
        if not _host_is_public(host):
            continue
        seen_hosts.add(host)
        title = (row.get("title") or "").strip()
        body = (row.get("body") or row.get("content") or "").strip()[:400]
        entries.append(f"- {title} ({host})\n {body}\n source: {url}")
        if len(entries) >= _MAX_RESULTS:
            break
    return "\n".join(entries)
108
+
109
+
110
def _ddgs(question: str) -> list[dict]:
    """Run a DuckDuckGo text search and return the raw result rows.

    Region is left at world ("wt-wt") so the engines localize on this node's egress IP.
    DDG rate-limits aggressively, so up to three attempts are made with exponential backoff
    plus random jitter between them.

    Raises:
        ImportError: neither ``ddgs`` nor ``duckduckgo_search`` is installed.
        Exception: the error from the last attempt once all attempts have failed.
    """
    import random

    try:
        from ddgs import DDGS  # current package name
    except ImportError:
        from duckduckgo_search import DDGS  # older name, same API
    attempts = 3
    last: Exception | None = None
    for attempt in range(attempts):
        try:
            # int() would turn a sub-second timeout into 0, so keep at least one second.
            with DDGS(timeout=max(1, int(_TIMEOUT))) as ddg:
                # region world → engines localize on THIS node's egress IP (the moat).
                return list(ddg.text(question, region="wt-wt", safesearch="moderate",
                                     max_results=_MAX_RESULTS))
        except Exception as e:
            last = e
            # Back off only when another attempt follows; sleeping after the final
            # failure just delays the error.
            if attempt < attempts - 1:
                time.sleep((2 ** attempt) + random.uniform(0, 0.7))
    raise last  # type: ignore[misc]
128
+
129
+
130
def _searxng(question: str) -> list[dict]:
    """Query a SearXNG instance's JSON API and return its "results" list.

    The base URL is the live PW_SEARXNG_URL, else the import-time snapshot, else
    http://127.0.0.1:8080. The instance host must be public, or be the explicit loopback
    names 127.0.0.1 / localhost (SearXNG may legitimately run on this node); otherwise []
    is returned without any request. HTTP errors propagate to the caller.
    """
    import requests
    base = os.environ.get("PW_SEARXNG_URL") or _SEARX or "http://127.0.0.1:8080"
    host = urlparse(base).hostname or ""
    if not _host_is_public(host) and host not in ("127.0.0.1", "localhost"):
        return []
    reply = requests.get(base.rstrip("/") + "/search",
                         params={"q": question, "format": "json", "safesearch": 1},
                         headers={"User-Agent": _UA}, timeout=_TIMEOUT)
    reply.raise_for_status()
    payload = reply.json()
    return payload.get("results", [])
142
+
143
+
144
+ # ---- keyless routable engines (engine routing v1: web | academic | encyclopedic) ----
145
def _arxiv(question: str, max_results: int = 5) -> list[dict]:
    """Query arXiv's free Atom API (academic engine) and return common-shape rows.

    Each row is {"title", "href", "body"} taken from the entry's title, id and summary;
    entries lacking a title or id are skipped. HTTP and XML-parse errors propagate.
    """
    import requests
    import xml.etree.ElementTree as ET

    atom = {"a": "http://www.w3.org/2005/Atom"}
    query = {"search_query": f"all:{question}", "max_results": max_results,
             "sortBy": "relevance"}
    reply = requests.get("https://export.arxiv.org/api/query", params=query,
                         headers={"User-Agent": _UA}, timeout=_TIMEOUT + 4)
    reply.raise_for_status()

    def field(entry, tag: str) -> str:
        # Text of the first matching child, stripped; '' when absent or empty.
        return (entry.findtext(tag, "", atom) or "").strip()

    found = []
    for entry in ET.fromstring(reply.text).findall("a:entry", atom):
        title, link = field(entry, "a:title"), field(entry, "a:id")
        if title and link:
            found.append({"title": title, "href": link, "body": field(entry, "a:summary")})
    return found
163
+
164
+
165
def _wikipedia(question: str, max_results: int = 4) -> list[dict]:
    """Query Wikipedia's full-text search API (encyclopedic engine); common row shape.

    Each row is {"title", "href", "body"}. The article URL is percent-encoded so titles
    containing "?", "#" or "%" still produce a working link, and the snippet has its highlight
    markup removed and its HTML entities decoded. HTTP errors propagate to the caller.
    """
    import html
    from urllib.parse import quote

    import requests
    r = requests.get("https://en.wikipedia.org/w/api.php",
                     params={"action": "query", "list": "search", "srsearch": question,
                             "srlimit": max_results, "format": "json"},
                     headers={"User-Agent": _UA}, timeout=_TIMEOUT)
    r.raise_for_status()
    rows = []
    for hit in r.json().get("query", {}).get("search", []):
        title = hit.get("title", "")
        if not title:
            continue
        # Strip tags BEFORE unescaping so literal "&lt;b&gt;" text is not mistaken for markup.
        snippet = html.unescape(re.sub(r"<[^>]+>", "", hit.get("snippet", "")))
        page = quote(title.replace(" ", "_"), safe=":/(),'!*")
        rows.append({"title": title,
                     "href": "https://en.wikipedia.org/wiki/" + page,
                     "body": snippet})
    return rows
181
+
182
+
183
def _wikipedia_fallback(question: str) -> str:
    """Last-resort findings from Wikipedia's opensearch API when search produced nothing.

    Carries no geo-signal but is reliable. Returns cleaned findings text, or '' on any
    failure (network error, unexpected payload shape, missing dependency).
    """
    try:
        import requests
        reply = requests.get("https://en.wikipedia.org/w/api.php",
                             params={"action": "opensearch", "search": question,
                                     "limit": 3, "format": "json"},
                             headers={"User-Agent": _UA}, timeout=_TIMEOUT)
        reply.raise_for_status()
        # opensearch answers with a 4-item array: [term, titles, descriptions, urls].
        _term, titles, descriptions, links = reply.json()
        candidates = []
        for title, description, link in zip(titles, descriptions, links):
            candidates.append({"title": title, "body": description, "url": link})
        return _clean(candidates)
    except Exception:
        return ""
197
+
198
+
199
@lru_cache(maxsize=256)
def _cached(question: str, _bucket: int, backend: str) -> str:
    """Memoised search: findings text for (question, time bucket, backend).

    `_bucket` is unused in the body; it exists only as part of the cache key so entries
    stop being reused once the caller moves to a new time window. Uses SearXNG when
    `backend` is "searxng", DuckDuckGo otherwise, and falls back to Wikipedia when the
    cleaned result is empty.
    """
    fetch = _searxng if backend == "searxng" else _ddgs
    return _clean(fetch(question)) or _wikipedia_fallback(question)
207
+
208
+
209
def search(question: str) -> str:
    """The web_search hook: (question) -> findings text. Best-effort; '' on any failure.

    Returns '' when the backend is off or the question is blank. The question is trimmed and
    capped at 300 characters; results are cached per 15-minute window.
    """
    # Resolve the backend ONCE: it is read from the environment at call time, so a second
    # lookup could disagree with the "off" check above it.
    backend = _backend()
    if backend == "off":
        return ""
    q = (question or "").strip()[:300]
    if not q:
        return ""
    try:
        bucket = int(time.time() // 900)  # 15-minute cache window via the lru_cache key
        return _cached(q, bucket, backend)
    except Exception:
        return ""
221
+
222
+
223
+ # ---- dynamic source routing (R17/D29): pick which keyless engines a query should hit ----
224
+ # academic signals → also query arXiv; definitional/background signals → also query Wikipedia.
225
+ # 'web' (egress-localized meta-search — the moat) is ALWAYS included; the extras AUGMENT it,
226
+ # they never replace it. Pure + env-gated (PW_SOURCE_ROUTING=off → web only).
227
_ACADEMIC_RE = re.compile(
    r"\b(arxiv|preprint|peer.?reviewed|research papers?|journal article|study|studies|"
    r"meta.?analysis|clinical trial|systematic review|algorithm|theorem|equation|dataset|"
    r"benchmark|state.of.the.art|sota|neural network|machine learning|deep learning|"
    r"reinforcement learning|transformer|quantum|genomic|proteomic|astrophysics)\b", re.I)
# A leading 'who/what is/are/was/were' counts only at the very start or right after sentence
# punctuation, so prose like 'tell me what is it like' does not over-route; the explicit
# phrases ('history of', 'capital of', …) match anywhere.
_ENCYCLOPEDIC_RE = re.compile(
    r"(?:^|[.!?]\s+)(?:who|what)\s+(?:is|are|was|were)\b"
    r"|\b(?:definition of|meaning of|history of|biography of|overview of|background on|"
    r"capital of|where is)\b", re.I)


def route_engines(query: str) -> list[str]:
    """Pick the ordered list of keyless engines a query should hit.

    'web' (the egress-localized meta-search) always comes first; 'academic' (arXiv) and/or
    'encyclopedic' (Wikipedia) are appended — in that order — when the query matches the
    corresponding signal pattern. They augment the web engine, never replace it.
    Setting PW_SOURCE_ROUTING to off, 0 or false (case-insensitive) pins the result to
    ['web'].
    """
    routing = os.environ.get("PW_SOURCE_ROUTING", "on").lower()
    if routing in ("off", "0", "false"):
        return ["web"]
    text = query or ""
    extras = (("academic", _ACADEMIC_RE), ("encyclopedic", _ENCYCLOPEDIC_RE))
    return ["web"] + [name for name, pattern in extras if pattern.search(text)]
255
+
256
+
257
def search_structured(query: str, max_results: int = 5, engine: str = "web") -> list[dict]:
    """Structured variant for the researcher: a list of {title, url, host, snippet} dicts.

    Args:
        query: search text; trimmed and capped at 300 characters.
        max_results: maximum number of rows returned.
        engine: "academic" (arXiv), "encyclopedic" (Wikipedia), anything else → the
            configured web meta-search backend. All are keyless.

    Rows are SSRF-guarded, restricted to non-low-quality hosts and deduplicated by host.
    Returns [] when the backend is off, the query is blank, or the engine call fails.
    """
    backend = _backend()
    if backend == "off":
        return []
    q = (query or "").strip()[:300]
    if not q:
        return []
    try:
        if engine == "academic":
            rows = _arxiv(q, max_results)
        elif engine == "encyclopedic":
            rows = _wikipedia(q, max_results)
        elif backend == "searxng":
            rows = _searxng(q)
        else:
            rows = _ddgs(q)
    except Exception:
        return []
    from council.sanitize import clean as _sanitize
    out, seen = [], set()
    for r in rows:
        try:
            url = (r.get("href") or r.get("url") or "").strip()
            host = (urlparse(url).hostname or "").lower()
        except (AttributeError, ValueError):
            # One malformed row/URL must not raise out of a best-effort call.
            continue
        # Cheap string checks first so duplicates and low-quality hosts never trigger a
        # blocking DNS resolution.
        if not host or host in seen or not _is_quality_host(host):
            continue
        if not _host_is_public(host):
            continue
        seen.add(host)
        out.append({"title": _sanitize((r.get("title") or ""))[:160],
                    "url": url,
                    "host": host,
                    "snippet": _sanitize((r.get("body") or r.get("content") or ""))[:500]})
        if len(out) >= max_results:
            break
    return out
293
+
294
+
295
+ # ---- full-page evidence (R5/D17: the leaders draft from pages, not snippets) ----
296
_FETCH_CAP = 200_000  # bytes per page — extraction input, not archival
# Whole elements whose content is never article text (removed together with their body).
_HTML_JUNK = re.compile(r"(?is)<(script|style|noscript|svg|header|footer|nav)[^>]*>.*?</\1>")
# Any remaining tag.
_TAGS = re.compile(r"(?s)<[^>]+>")


def _strip_html(html: str) -> str:
    """Regex fallback extractor: drop junk elements and tags, then collapse whitespace."""
    without_junk = _HTML_JUNK.sub(" ", html)
    without_tags = _TAGS.sub(" ", without_junk)
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
304
+
305
+
306
def _extract_main(html: str) -> tuple[str, str]:
    """Return (main_text, date) extracted from raw HTML.

    trafilatura is preferred (boilerplate removal plus metadata); when it is not installed,
    raises, or yields no text, the regex strip is used instead and the date is ''. The date
    is best-effort: a metadata failure alone does not discard trafilatura's text.
    """
    try:
        import trafilatura
        extracted = trafilatura.extract(html, include_comments=False, include_tables=True,
                                        favor_precision=True) or ""
        try:
            meta = trafilatura.extract_metadata(html)
            published = (getattr(meta, "date", "") or "") if meta else ""
        except Exception:
            published = ""
        main_text = extracted.strip()
        if main_text:
            return main_text, published
    except Exception:
        # Fall through to the regex extractor below.
        pass
    return _strip_html(html), ""
325
+
326
+
327
+ # ---- freshness signals (R18/D30): the council's edge is currency, so lead with recent sources ----
328
_MONTHS = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
           "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}
_DAY = r"(0?[1-9]|[12]\d|3[01])"  # 1-31 only (no 0/32+)
_ISO_RE = re.compile(r"(20\d{2})[-/](0[1-9]|1[0-2])[-/](0[1-9]|[12]\d|3[01])")
_URL_YM_RE = re.compile(r"/(20\d{2})/(0[1-9]|1[0-2])(?:/|\b)")
_URL_Y_RE = re.compile(r"/(20[1-3]\d)(?:/|\b)")
# Real month names and their usual abbreviations ONLY. A loose "prefix + any letters" form
# would read "Decision 2024" as December 2024 and "Marketing 2025" as March 2025. The inner
# groups are non-capturing so the month stays a single capture group; its first three
# letters index _MONTHS.
_MON = (r"(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|"
        r"sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)")
_MON_DMY = re.compile(rf"(?i)\b{_DAY}\s+{_MON}\.?,?\s+(20\d{{2}})\b")
_MON_MDY = re.compile(rf"(?i)\b{_MON}\.?\s+{_DAY},?\s+(20\d{{2}})\b")
_MON_MY = re.compile(rf"(?i)\b{_MON}\.?\s+(20\d{{2}})\b")

# does a brief/query actually care about recency? If not, recency reordering is noise (and could
# bury an authoritative older source under a recent repost), so we leave relevance order alone.
# The trailing alternatives mirror the "happening now" phrases used for breaking-news detection,
# so text flagged as breaking is also recognised as time-sensitive.
_TEMPORAL_RE = re.compile(
    r"(?i)\b(current|currently|latest|newest|recent|recently|now|today|as of|up.?to.?date|"
    r"most recent|breaking|so far|to date|right now|when|next|upcoming|deadline|date|"
    r"this (year|month|week)|202\d|"
    r"today'?s|just\s+(?:announced|released|happened|reported)|"
    r"developing\s+(?:story|news|situation)|live\s+updates?|"
    r"this\s+(?:morning|afternoon|evening))\b")


def _valid_ymd(y, mo, d) -> bool:
    """True when (y, mo, d) — ints or numeric strings — form a real calendar date."""
    import datetime
    try:
        datetime.date(int(y), int(mo), int(d))
    except (ValueError, TypeError):
        return False
    return True


def extract_date_hint(url: str, text: str = "") -> str:
    """Best-effort publication date as 'YYYY-MM-DD' / 'YYYY-MM' / 'YYYY', or '' if none.

    A freshness RANKING signal, not an authority claim. Candidates are tried most-precise
    first:

    1. a full ISO-style date in the URL, then in the text;
    2. a full month-name date in the text ('3 March 2024', 'Sept. 5, 2023');
    3. a year/month URL path segment;
    4. a month-name + year in the text ('Aug 2026');
    5. a bare year, from the URL path only — a bare year in prose ('the 2008 crisis') is
       usually a topic year, so it is ignored.

    Every full date is validated, so impossible days such as 2026-02-30 are skipped.
    """
    url, text = url or "", text or ""

    def _full(y, mo, d):
        return f"{int(y):04d}-{int(mo):02d}-{int(d):02d}" if _valid_ymd(y, mo, d) else ""

    # 1) full ISO date — URL first, then text (both are precise, low ambiguity)
    for blob in (url, text):
        m = _ISO_RE.search(blob)
        if m and (v := _full(m.group(1), m.group(2), m.group(3))):
            return v
    # 2) month-name full dates — text only (URLs don't carry these), validated
    m = _MON_DMY.search(text)
    if m and (v := _full(m.group(3), _MONTHS[m.group(2).lower()[:3]], m.group(1))):
        return v
    m = _MON_MDY.search(text)
    if m and (v := _full(m.group(3), _MONTHS[m.group(1).lower()[:3]], m.group(2))):
        return v
    # 3) URL year/month (intentional path date)
    m = _URL_YM_RE.search(url)
    if m:
        return f"{m.group(1)}-{m.group(2)}"
    # 4) month-year in text ('Aug 2026' — usually an as-of/publish signal, less so a topic date)
    m = _MON_MY.search(text)
    if m:
        return f"{m.group(2)}-{_MONTHS[m.group(1).lower()[:3]]:02d}"
    # 5) bare year ONLY from a URL path (a bare year in free text is too often a topic year)
    m = _URL_Y_RE.search(url)
    if m:
        return m.group(1)
    return ""


def is_time_sensitive(text: str) -> bool:
    """True when a brief/query shows recency intent — the gate for freshness reordering."""
    return bool(_TEMPORAL_RE.search(text or ""))
398
+
399
+
400
+ # ---- current-year query injection (R19/D31): the fix recency RANKING can't make ----
401
+ # R18 reorders evidence by date, but you cannot reorder a fresh source that search never
402
+ # returned. On a time-sensitive query a small planner model often OMITS the year (or hallucinates
403
+ # a STALE one), so a meta-search returns the SEO-dominant HISTORICAL page (e.g. the famous 2023
404
+ # FOMC meeting for "current federal funds rate"). Pinning the current year INTO the web query
405
+ # forces the engine to surface this year's results, which R18 then orders freshest-first. WEB
406
+ # ONLY — arXiv sorts by relevance and Wikipedia is full-text, where a bare year pollutes instead.
407
+ #
408
+ # Review R19 hardening:
409
+ # • "Already pinned" must mean a *standalone, plausible* year — never a price ($2000), a fused
410
+ # token (fy2025), or a bare large count (2048); else injection is silently suppressed on
411
+ # exactly the concrete current-fact queries it targets (finding 1).
412
+ # • A RECENTLY-stale year (hallucinated "2023") gets the current year APPENDED alongside it, not
413
+ # no-op'd — so the engine still sees the fresh signal and R18 picks it (finding 2). We APPEND,
414
+ # never string-replace, so a deliberately historical query is never corrupted into nonsense.
415
+ # • Historical / timeless queries are skipped per-query, so a historical sub-query of a
416
+ # time-sensitive brief isn't poisoned with the current year (findings 3/7/9).
417
+ # • Accepted trade-offs (documented, not fixed): a year fused to a word (fy2025) can still get a
418
+ # second year appended (rare; original intent preserved) [finding 5]; a literal year is a
419
+ # soft-AND term, so an evergreen page that omits the year string can rank lower — the page
420
+ # fetch + order_by_recency + the 12-16 evidence cap keep it in play and re-rank by real date
421
+ # [finding 6].
422
_QUERY_CAP = 300  # search_structured/search truncate to this; never let the year be the cut tail
# A *standalone* calendar year — not fused to a word (fy2025), not a currency amount ($2000).
_STANDALONE_YEAR_RE = re.compile(r"(?<![\w$£€¥])((?:19|20)\d{2})(?![\w%])")
_STALE_WINDOW = 4  # a past year within N years of today reads as "recently stale" (refresh it);
#                    an older standalone year reads as a deliberate historical reference (respect).
# Query-level historical / timeless intent — pinning the current year would mis-steer it. Keyword
# based, not exhaustive; deliberately EXCLUDES "what is/are" (too common in legitimate current Qs).
# The decade alternative accepts 2- to 4-digit decades ("90s", "800s", "1990s").
_HISTORICAL_RE = re.compile(
    r"(?i)(\b(history|historical|historically|origins?|originally|founded|inception|evolution|"
    r"timeline|etymology|biography|retrospective|definition|defined\s+as)\b"
    r"|\bmeaning\s+of\b|\bover\s+the\s+years\b|\bback\s+in\b|\bsince\s+\d{4}\b|\b\d{1,3}0s\b)")


def _year_of(today: str) -> str:
    """First standalone 19xx/20xx year found in `today`, or '' when there is none.

    Uses search rather than an anchored match, so '2026-06-13' and 'June 13, 2026' both
    yield '2026'.
    """
    m = _STANDALONE_YEAR_RE.search(today or "")
    return m.group(1) if m else ""


def inject_recency(query: str, today: str, time_sensitive: bool = True) -> str:
    """Pin the current year into a time-sensitive WEB query so search returns current results.

    Args:
        query: the search query; surrounding whitespace is stripped.
        today: any string containing the current year (ISO date or prose date).
        time_sensitive: brief-level recency flag; when False the query is returned as-is.

    Returns the stripped query unchanged when: it is empty or not time-sensitive; it shows
    historical/timeless intent; `today` has no parseable year; it already names the current
    or next year; it names a year more than `_STALE_WINDOW` years old (a deliberate
    historical reference); or appending would exceed the 300-character cap. Otherwise — no
    year, or only a recently-stale one — the current year is APPENDED (never substituted),
    so an existing year is preserved.
    """
    q = (query or "").strip()
    if not time_sensitive or not q or _HISTORICAL_RE.search(q):
        return q
    year = _year_of(today)
    if not year:
        return q
    cur = int(year)
    # plausible standalone year tokens already in the query (ignore prices / fused / out-of-range)
    years = [int(m.group(1)) for m in _STANDALONE_YEAR_RE.finditer(q)
             if 1990 <= int(m.group(1)) <= cur + 1]
    if any(y >= cur for y in years):
        return q  # current year already pinned, or a deliberate near-future (forecast) year
    if any(y < cur - _STALE_WINDOW for y in years):
        return q  # a deliberate, deep-historical year — respect it, don't append a second year
    # else: no year, or only a RECENTLY-stale year (e.g. a hallucinated "2023") → append current
    if len(q) + 1 + len(year) > _QUERY_CAP:
        return q
    return f"{q} {year}"
468
+
469
+
470
+ # ---- breaking-news auto-deepen (R19/D31): heavier retrieval for fast-moving topics ----
471
+ # A STRICTER subset of recency intent: "happening now" signals where the answer is volatile and
472
+ # SEO favors stale pages, so more queries + more page fetches earn their keep. Deliberately
473
+ # excludes plain "latest"/"current"/"recent" (those are handled by year injection above, which
474
+ # is cheap) so we don't over-deepen — and double the local compute on — every dated query.
475
_BREAKING_RE = re.compile(
    r"(?i)\b(breaking|just\s+(announced|released|happened|reported|now)|right\s+now|"
    r"as\s+of\s+(today|now)|today'?s|developing\s+(story|news|situation)|live\s+updates?|"
    r"happening\s+now|this\s+(morning|afternoon|evening))\b")


def is_breaking(text: str) -> bool:
    """True when `text` carries a strong 'happening now' phrase — warrants a depth bump.

    The keyword set is deliberately narrow: plain recency words such as 'latest', 'current'
    or 'recent' do not match. None/empty input yields False.
    """
    return _BREAKING_RE.search(text or "") is not None
485
+
486
+
487
def _pad_date(d: str) -> str:
    """Pad a partial date to three '-'-separated fields so strings compare chronologically.

    '2026' -> '2026-00-00', '2026-06' -> '2026-06-00'; fields beyond the third are dropped.
    """
    fields = (d or "").split("-")
    fields += ["00"] * (3 - len(fields))  # a non-positive count adds nothing
    return "-".join(fields[:3])
493
+
494
+
495
def order_by_recency(evidence: list[dict]) -> list[dict]:
    """Return a new list with the most recently dated evidence first.

    Each item's date is its 'date', else its 'date_hint', else a hint sniffed from its
    'url' and 'snippet'. Items with no date sort last. The sort is stable, so items with
    equal (or no) dates keep their original relative order.
    """
    def sort_key(entry: dict) -> str:
        stamp = entry.get("date") or entry.get("date_hint")
        if not stamp:
            stamp = extract_date_hint(entry.get("url", ""), entry.get("snippet", ""))
        # '' sorts below every padded date, which puts undated items at the end.
        return _pad_date(stamp) if stamp else ""

    ranked = list(evidence)
    ranked.sort(key=sort_key, reverse=True)
    return ranked
504
+
505
+
506
def fetch_extract(url: str, max_chars: int = 6000, with_date: bool = False):
    """One polite, SSRF-guarded fetch of a PUBLIC http(s) page → sanitized main text.

    Args:
        url: page to fetch; must be http(s) and resolve only to public addresses.
        max_chars: cap on the returned text length.
        with_date: when True, return ``(text, date)`` instead of just ``text``.

    Redirects are followed manually (at most 5) and EVERY hop is re-validated, so a public
    URL cannot bounce the fetch to a loopback/private/metadata address. At most `_FETCH_CAP`
    bytes of the body are read, and the connection is always released.

    Raises:
        ValueError: the URL, or a redirect target, is not a public http(s) URL.
        requests.exceptions.RequestException: HTTP error status, network failure, or too
            many redirects.
    Callers treat page evidence as best-effort.
    """
    from urllib.parse import urljoin

    def _require_public(candidate: str) -> None:
        host = (urlparse(candidate).hostname or "").lower()
        if not candidate.startswith(("http://", "https://")) or not _host_is_public(host):
            raise ValueError(f"not a public http(s) URL: {candidate[:80]}")

    url = url or ""
    _require_public(url)  # validate before importing anything or touching the network

    import requests
    from council.sanitize import clean

    max_redirects = 5
    current, raw = url, None
    for _hop in range(max_redirects + 1):
        # `with` closes the streamed response (and frees the connection) on every path.
        with requests.get(current, headers={"User-Agent": _UA}, timeout=15, stream=True,
                          allow_redirects=False) as r:
            if r.is_redirect:
                # Location may be relative; resolve it, then re-run the SSRF guard on it.
                current = urljoin(current, r.headers["location"])
                _require_public(current)
                continue
            r.raise_for_status()
            raw = r.raw.read(_FETCH_CAP, decode_content=True).decode("utf-8", "replace")
            break
    if raw is None:
        raise requests.exceptions.TooManyRedirects(
            f"more than {max_redirects} redirects: {url[:80]}")
    text, date = _extract_main(raw)
    text = clean(text)[:max_chars]
    return (text, date) if with_date else text