citesentry 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. citesentry/__init__.py +3 -0
  2. citesentry/cache.py +70 -0
  3. citesentry/checks/__init__.py +0 -0
  4. citesentry/checks/existence.py +219 -0
  5. citesentry/checks/relevance.py +176 -0
  6. citesentry/checks/url_liveness.py +143 -0
  7. citesentry/cli.py +193 -0
  8. citesentry/config.py +46 -0
  9. citesentry/core/__init__.py +4 -0
  10. citesentry/core/cascade.py +17 -0
  11. citesentry/core/engine.py +94 -0
  12. citesentry/core/verdict.py +56 -0
  13. citesentry/llm/__init__.py +3 -0
  14. citesentry/llm/base.py +13 -0
  15. citesentry/llm/deepseek.py +41 -0
  16. citesentry/llm/mcp_sampling.py +17 -0
  17. citesentry/mcp_server.py +156 -0
  18. citesentry/models.py +81 -0
  19. citesentry/parse/__init__.py +3 -0
  20. citesentry/parse/bibtex.py +87 -0
  21. citesentry/parse/csl_json.py +87 -0
  22. citesentry/parse/detect.py +100 -0
  23. citesentry/parse/doi_list.py +20 -0
  24. citesentry/parse/nbib.py +87 -0
  25. citesentry/parse/pdf_refs.py +47 -0
  26. citesentry/parse/plaintext.py +329 -0
  27. citesentry/parse/ris.py +75 -0
  28. citesentry/sources/__init__.py +3 -0
  29. citesentry/sources/arxiv.py +111 -0
  30. citesentry/sources/base.py +19 -0
  31. citesentry/sources/crossref.py +97 -0
  32. citesentry/sources/domain/__init__.py +0 -0
  33. citesentry/sources/domain/dblp.py +86 -0
  34. citesentry/sources/domain/pubmed.py +153 -0
  35. citesentry/sources/openalex.py +99 -0
  36. citesentry/sources/semantic_scholar.py +73 -0
  37. citesentry/sources/unpaywall.py +46 -0
  38. citesentry-0.1.1.dist-info/METADATA +201 -0
  39. citesentry-0.1.1.dist-info/RECORD +41 -0
  40. citesentry-0.1.1.dist-info/WHEEL +4 -0
  41. citesentry-0.1.1.dist-info/entry_points.txt +3 -0
citesentry/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """citesentry — citation verification tool."""
2
+
3
+ __version__ = "0.1.1"
citesentry/cache.py ADDED
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import sqlite3
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
class Cache:
    """Persistent JSON cache kept in a single-table SQLite database.

    Entries are addressed by a (namespace, identifier) pair, which is
    hashed into one opaque primary key. Values must be JSON-serialisable.
    """

    def __init__(self, path: Path) -> None:
        """Open the database at *path*, creating parent directories and the table."""
        self._path = path
        # SQLite will not create missing directories on its own.
        self._path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(str(path), check_same_thread=False)
        self._init()

    def _init(self) -> None:
        """Create the ``cache`` table if it does not exist yet and commit."""
        ddl = """CREATE TABLE IF NOT EXISTS cache (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                created_at REAL NOT NULL
            )"""
        self._conn.execute(ddl)
        self._conn.commit()

    @staticmethod
    def _key(namespace: str, identifier: str) -> str:
        """Return the SHA-256 hex digest of ``"<namespace>:<identifier>"``."""
        raw = f"{namespace}:{identifier}".encode()
        return hashlib.sha256(raw).hexdigest()

    def get(self, namespace: str, identifier: str) -> Any | None:
        """Return the decoded value stored for the pair, or ``None`` when absent."""
        lookup = (self._key(namespace, identifier),)
        hit = self._conn.execute(
            "SELECT value FROM cache WHERE key = ?", lookup
        ).fetchone()
        return None if hit is None else json.loads(hit[0])

    def set(self, namespace: str, identifier: str, value: Any) -> None:
        """Store *value* as JSON under the pair, replacing any existing entry."""
        record = (self._key(namespace, identifier), json.dumps(value), time.time())
        self._conn.execute(
            "INSERT OR REPLACE INTO cache (key, value, created_at) VALUES (?, ?, ?)",
            record,
        )
        self._conn.commit()

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()
52
+
53
+
54
+ _cache: Cache | None = None
55
+
56
+
57
def get_cache(path: Path | None = None) -> Cache:
    """Return the module-wide :class:`Cache`, creating it on the first call.

    *path* is only consulted when no cache exists yet; if it is falsy the
    location comes from ``get_settings().cache_path``. Later calls return
    the existing instance regardless of *path*.
    """
    global _cache
    if _cache is not None:
        return _cache
    # Imported lazily, at first use, rather than at module import time.
    from citesentry.config import get_settings
    location = path or get_settings().cache_path
    _cache = Cache(location)
    return _cache
64
+
65
+
66
def reset_cache() -> None:
    """Close and forget the module-wide cache, if one has been created."""
    global _cache
    if _cache is None:
        return
    _cache.close()
    _cache = None
File without changes
@@ -0,0 +1,219 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from typing import Any
5
+
6
+ from rapidfuzz import fuzz
7
+
8
+ from citesentry.models import Candidate, CheckCost, CheckResult, CheckStatus, Reference, Verdict
9
+ from citesentry.sources.base import SourceAdapter
10
+
11
# Thresholds on a 0-100 scale; check_existence compares them against the best
# candidate's composite score multiplied by 100.
# At or above this value the candidate is treated as a match.
_TITLE_PASS_THRESHOLD = 85.0
# At or above this value (but below the pass threshold) the result is a WARN.
_TITLE_WARN_THRESHOLD = 70.0
13
+
14
+
15
+ def _surname(name: str) -> str:
16
+ """Extract surname from 'Last, First' or 'First Last' format."""
17
+ name = name.strip()
18
+ if "," in name:
19
+ return name.split(",")[0].strip().lower()
20
+ parts = name.split()
21
+ return parts[-1].lower() if parts else ""
22
+
23
+
24
def _author_overlap(ref_authors: list[str], cand_authors: list[str]) -> float:
    """Score how well the cited author list matches a candidate's, in [0, 1].

    Authors are compared by lowercased surname. Returns the neutral value
    0.5 when either side offers no usable surname (empty list, or only
    blank names), because absence of data is not evidence of a mismatch.

    When the cited list is less than half as long as the candidate list
    the score is the fraction of cited surnames found in the candidate
    (recall); otherwise it is the Jaccard index of the two surname sets.
    """
    if not ref_authors or not cand_authors:
        return 0.5
    ref_surnames = {_surname(a) for a in ref_authors}
    cand_surnames = {_surname(a) for a in cand_authors}
    # Blank author strings reduce to "", which would otherwise match between
    # the two lists or inflate the union; drop it before comparing.
    ref_surnames.discard("")
    cand_surnames.discard("")
    if not ref_surnames or not cand_surnames:
        return 0.5
    shared = ref_surnames & cand_surnames
    # Use subset recall when cited list is much shorter than found list
    # (covers et al. truncation and papers with many authors).
    if len(ref_surnames) < len(cand_surnames) * 0.5:
        return len(shared) / len(ref_surnames)
    return len(shared) / len(ref_surnames | cand_surnames)
36
+
37
+
38
+ def _year_score(ref_year: int | None, cand_year: int | None) -> float:
39
+ if ref_year is None or cand_year is None:
40
+ return 0.5
41
+ diff = abs(ref_year - cand_year)
42
+ if diff == 0:
43
+ return 1.0
44
+ if diff == 1:
45
+ return 0.7
46
+ return 0.0
47
+
48
+
49
def _score_candidate(ref: Reference, cand: Candidate) -> tuple[float, dict[str, Any]]:
    """Score how well *cand* matches *ref*.

    Returns ``(composite, evidence)``. The composite weights title (0.55),
    authors (0.25), year (0.15) and venue (0.05); it is forced to 0.0 when
    the title score is 0.0. *evidence* holds each component and the
    composite, rounded to three decimals.
    """
    if ref.title and cand.title:
        title_score = fuzz.token_set_ratio(ref.title.lower(), cand.title.lower()) / 100.0
    else:
        title_score = 0.0

    author_score = _author_overlap(ref.authors, cand.authors)
    year_score = _year_score(ref.year, cand.year)

    if ref.venue and cand.venue:
        venue_score = fuzz.token_set_ratio(ref.venue.lower(), cand.venue.lower()) / 100.0
    else:
        # Venue unknown on one side: neutral score.
        venue_score = 0.5

    # Without any title similarity the other fields are not allowed to
    # carry the match.
    if title_score == 0.0:
        composite = 0.0
    else:
        composite = (
            title_score * 0.55
            + author_score * 0.25
            + year_score * 0.15
            + venue_score * 0.05
        )

    evidence: dict[str, Any] = {
        "title_score": round(title_score, 3),
        "author_score": round(author_score, 3),
        "year_score": round(year_score, 3),
        "venue_score": round(venue_score, 3),
        "composite": round(composite, 3),
    }
    return composite, evidence
79
+
80
+
81
+ def _check_metadata_consistency(ref: Reference, best: Candidate) -> list[str]:
82
+ mismatches = []
83
+ if ref.year and best.year and abs(ref.year - best.year) > 2:
84
+ mismatches.append(f"year: cited={ref.year}, found={best.year}")
85
+ if ref.doi and best.doi and ref.doi.lower().strip("/") != best.doi.lower().strip("/"):
86
+ mismatches.append(f"doi: cited={ref.doi}, found={best.doi}")
87
+ if ref.authors and best.authors:
88
+ overlap = _author_overlap(ref.authors, best.authors)
89
+ if overlap < 0.3:
90
+ mismatches.append(
91
+ f"authors: low overlap ({overlap:.2f}), "
92
+ f"cited={ref.authors[:2]}, found={best.authors[:2]}"
93
+ )
94
+ return mismatches
95
+
96
+
97
def _existence_cache_key(ref: Reference) -> str | None:
    """Return the cache identifier for *ref*: its DOI, else "title|year", else None."""
    if ref.doi:
        return ref.doi
    if ref.title:
        return f"{ref.title}|{ref.year}"
    return None


async def check_existence(
    ref: Reference,
    sources: list[SourceAdapter],
    domain_sources: list[SourceAdapter] | None = None,
    use_cache: bool = True,
) -> CheckResult:
    """Check whether *ref* can be found in bibliographic sources.

    Lookup order: DOI lookup on every general source (when the reference
    has a DOI), then a metadata search on every general source if that
    found nothing, then the domain sources if still nothing. Every
    candidate is scored and the best one decides the outcome:

    - PASS: title score >= 0.85 or composite >= the pass threshold, with
      no metadata mismatches;
    - WARN: same, but year/DOI/author mismatches were found, or the
      composite only reaches the warn threshold;
    - FAIL: no candidate, or the best composite is below the warn threshold.

    Args:
        ref: The reference to verify.
        sources: General-purpose adapters, tried first.
        domain_sources: Optional adapters consulted only as a fallback.
        use_cache: Read and write results through the shared cache.

    Returns:
        A ``CheckResult`` named ``"existence"``. Adapter exceptions are
        recorded in the evidence as ``"<source>_error"`` rather than raised.
    """
    start = time.monotonic()
    api_calls = 0
    candidates: list[tuple[float, dict, Candidate]] = []
    evidence: dict[str, Any] = {}

    cache = None
    cache_key = _existence_cache_key(ref) if use_cache else None
    if cache_key:
        from citesentry.cache import get_cache
        cache = get_cache()
        cached = cache.get("existence", cache_key)
        if cached is not None:
            elapsed = (time.monotonic() - start) * 1000
            return CheckResult(
                name="existence",
                status=CheckStatus(cached["status"]),
                confidence=cached["confidence"],
                evidence={**cached["evidence"], "from_cache": True},
                cost=CheckCost(api_calls=0, elapsed_ms=elapsed),
            )

    if ref.doi:
        for src in sources:
            try:
                cand = await src.lookup_doi(ref.doi)
                api_calls += 1
                if cand:
                    score, ev = _score_candidate(ref, cand)
                    candidates.append((score, {**ev, "source": src.name, "via": "doi_lookup"}, cand))
            except Exception as e:
                # Best effort: one failing adapter must not abort the check.
                evidence[f"{src.name}_error"] = str(e)

    if not candidates:
        for src in sources:
            try:
                results = await src.search(ref)
                api_calls += 1
                for cand in results:
                    score, ev = _score_candidate(ref, cand)
                    candidates.append((score, {**ev, "source": src.name, "via": "search"}, cand))
            except Exception as e:
                evidence[f"{src.name}_error"] = str(e)

    if not candidates and domain_sources:
        for src in domain_sources:
            # The DOI lookup and the search are guarded separately so that a
            # failing DOI lookup does not also skip the search on that source.
            if ref.doi:
                try:
                    cand = await src.lookup_doi(ref.doi)
                    api_calls += 1
                    if cand:
                        score, ev = _score_candidate(ref, cand)
                        candidates.append((score, {**ev, "source": src.name, "via": "doi_lookup"}, cand))
                except Exception as e:
                    evidence[f"{src.name}_error"] = str(e)
            try:
                results = await src.search(ref)
                api_calls += 1
                for cand in results:
                    score, ev = _score_candidate(ref, cand)
                    candidates.append((score, {**ev, "source": src.name, "via": "search"}, cand))
            except Exception as e:
                evidence[f"{src.name}_error"] = str(e)

    elapsed = (time.monotonic() - start) * 1000
    # At this point evidence holds nothing but "<source>_error" entries.
    lookup_failed = any(key.endswith("_error") for key in evidence)

    if not candidates:
        confidence = 0.9 if (ref.title or ref.doi) else 0.5
        if lookup_failed:
            # "Not found" is weak evidence when a source could not be queried.
            confidence = min(confidence, 0.5)
        result = CheckResult(
            name="existence",
            status=CheckStatus.FAIL,
            confidence=confidence,
            evidence={**evidence, "verdict": "not_found", "candidates_checked": 0},
            cost=CheckCost(api_calls=api_calls, elapsed_ms=elapsed),
        )
    else:
        candidates.sort(key=lambda x: x[0], reverse=True)
        best_score, best_ev, best_cand = candidates[0]
        evidence.update(best_ev)
        evidence["best_candidate_title"] = best_cand.title
        evidence["best_candidate_source"] = best_cand.source
        evidence["total_candidates"] = len(candidates)

        title_score = best_ev.get("title_score", 0.0)

        # High title match always warrants a look, even if other fields diverge
        if title_score >= 0.85 or best_score * 100 >= _TITLE_PASS_THRESHOLD:
            mismatches = _check_metadata_consistency(ref, best_cand)
            if mismatches:
                evidence["mismatches"] = mismatches
                status = CheckStatus.WARN
                confidence = min(max(title_score, best_score), 0.85)
            else:
                status = CheckStatus.PASS
                confidence = max(title_score, best_score)
        elif best_score * 100 >= _TITLE_WARN_THRESHOLD:
            status = CheckStatus.WARN
            confidence = best_score * 0.7
        else:
            status = CheckStatus.FAIL
            confidence = 0.7

        result = CheckResult(
            name="existence",
            status=status,
            confidence=confidence,
            evidence=evidence,
            cost=CheckCost(api_calls=api_calls, elapsed_ms=elapsed),
        )

    # Cache entries are never expired by the lookup path, so a "not found"
    # caused by adapter errors must not be stored: it would mask the
    # reference on every later run.
    if cache is not None and (candidates or not lookup_failed):
        cache.set("existence", cache_key, {
            "status": result.status.value,
            "confidence": result.confidence,
            "evidence": result.evidence,
        })

    return result
@@ -0,0 +1,176 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from typing import Any
6
+
7
+ import httpx
8
+
9
+ from citesentry.models import CheckCost, CheckResult, CheckStatus, RelevanceLabel, Reference
10
+
11
# Prompt template for the relevance judge. It is filled with str.format using
# the fields {title}, {context} and {content}; the doubled braces are escapes
# that render as the literal braces of the JSON example shown to the model.
_RELEVANCE_PROMPT = """\
You are a citation relevance judge. Given a reference title/topic and fetched content from the cited source, determine if the content matches the citation.

Reference title: {title}
Reference topic context: {context}

Fetched content (first 1500 chars):
{content}

Respond with ONLY valid JSON in this exact format:
{{"label": "<RELEVANT|PARTIAL|UNRELATED|CANNOT_DETERMINE>", "confidence": <0.0-1.0>, "rationale": "<one sentence>"}}

Labels:
- RELEVANT: content clearly matches the cited title/topic
- PARTIAL: content partially matches or is in the same general area
- UNRELATED: content does not match the citation at all
- CANNOT_DETERMINE: paywalled, JS-only page, no meaningful content to judge
"""
29
+
30
+
31
async def _fetch_content(url: str, doi: str | None = None) -> str:
    """Fetch a short text snippet from *url* for the relevance check.

    Takes the first 3000 characters of a 2xx response body, collapses
    whitespace runs to single spaces and returns at most 1500 characters.
    Returns an empty string for non-2xx responses, transport errors and
    malformed URLs; this helper never raises for those.

    Args:
        url: Address to fetch; redirects are followed.
        doi: Accepted for interface compatibility; currently unused.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; citesentry/0.1; citation verification bot)"
        )
    }
    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            r = await client.get(url, headers=headers)
            if 200 <= r.status_code < 300:
                text = r.text[:3000]
                text = " ".join(text.split())
                return text[:1500]
    except (httpx.HTTPError, httpx.InvalidURL):
        # httpx.InvalidURL is not an HTTPError subclass; without it a
        # malformed URL in a reference would escape this best-effort fetch.
        pass
    return ""
48
+
49
+
50
def _parse_relevance_response(response_text: str) -> tuple[RelevanceLabel, float, str]:
    """Decode the judge's JSON reply into ``(label, confidence, rationale)``.

    Markdown code fences around the JSON are removed first. Raises
    ``ValueError`` or ``TypeError`` when the reply is not a JSON object,
    names an unknown label, or carries a non-numeric confidence.
    """
    text = response_text.strip()
    if text.startswith("```"):
        text = "\n".join(
            line for line in text.split("\n") if not line.startswith("```")
        ).strip()

    parsed = json.loads(text)
    if not isinstance(parsed, dict):
        raise ValueError("relevance response is not a JSON object")

    label = RelevanceLabel(parsed.get("label", "CANNOT_DETERMINE"))
    confidence = float(parsed.get("confidence", 0.3))
    if confidence != confidence:
        # NaN is the only value unequal to itself; json.loads accepts it.
        raise ValueError("relevance confidence is NaN")
    # The prompt asks for 0.0-1.0, but the model's number is not trusted.
    confidence = min(max(confidence, 0.0), 1.0)
    return label, confidence, parsed.get("rationale", "")


async def check_relevance(
    ref: Reference,
    llm_client: object,
    abstract: str | None = None,
    use_cache: bool = True,
) -> CheckResult:
    """Ask an LLM whether the cited source's content matches the reference.

    Content is *abstract* when given, otherwise a snippet fetched from the
    reference's first URL. The label maps to a status as RELEVANT -> PASS,
    UNRELATED -> FAIL, PARTIAL and CANNOT_DETERMINE -> WARN.

    Args:
        ref: The reference being judged.
        llm_client: Object with an async ``complete(prompt)`` method
            returning text, or ``None`` to skip the check.
        abstract: Optional abstract to judge instead of fetching.
        use_cache: Read and write results through the shared cache.

    Returns:
        A ``CheckResult`` named ``"relevance"``. It is SKIPPED without an
        LLM client or without a title/DOI, and WARN at confidence 0.3 when
        no content is available. LLM failures are recorded as
        ``llm_error`` and unparseable replies as ``parse_error`` in the
        evidence; neither is raised and neither is cached.
    """
    start = time.monotonic()

    if llm_client is None:
        return CheckResult(
            name="relevance",
            status=CheckStatus.SKIPPED,
            confidence=0.0,
            evidence={"reason": "no_llm_client"},
            cost=CheckCost(elapsed_ms=0.0),
        )

    if not ref.title and not ref.doi:
        return CheckResult(
            name="relevance",
            status=CheckStatus.SKIPPED,
            confidence=0.0,
            evidence={"reason": "insufficient_reference_data"},
            cost=CheckCost(elapsed_ms=0.0),
        )

    cache = None
    cache_key = ref.doi or (f"{ref.title}|{','.join(ref.urls[:1])}" if ref.title else None)
    if use_cache and cache_key:
        from citesentry.cache import get_cache
        cache = get_cache()
        cached = cache.get("relevance", cache_key)
        if cached is not None:
            elapsed = (time.monotonic() - start) * 1000
            return CheckResult(
                name="relevance",
                status=CheckStatus(cached["status"]),
                confidence=cached["confidence"],
                evidence={**cached["evidence"], "from_cache": True},
                cost=CheckCost(elapsed_ms=elapsed),
            )

    content = abstract or ""
    if not content and ref.urls:
        content = await _fetch_content(ref.urls[0])

    if not content:
        return CheckResult(
            name="relevance",
            status=CheckStatus.WARN,
            confidence=0.3,
            evidence={"reason": "no_content_available", "label": "CANNOT_DETERMINE"},
            cost=CheckCost(elapsed_ms=(time.monotonic() - start) * 1000),
        )

    context = ""
    if ref.venue:
        context += f"Published in: {ref.venue}. "
    if ref.year:
        context += f"Year: {ref.year}."

    prompt = _RELEVANCE_PROMPT.format(
        title=ref.title or "Unknown title",
        context=context,
        content=content,
    )

    tokens_used = 0
    label = RelevanceLabel.CANNOT_DETERMINE
    confidence = 0.3
    evidence: dict[str, Any] = {"content_source": "abstract" if abstract else "fetched"}

    try:
        response_text = await llm_client.complete(prompt)  # type: ignore[union-attr]
        # Rough estimate: whitespace-separated words, not real tokens.
        tokens_used = len(prompt.split()) + len(response_text.split())
    except Exception as e:
        # Best effort: any client failure downgrades to CANNOT_DETERMINE.
        evidence["llm_error"] = str(e)
        confidence = 0.1
    else:
        try:
            label, confidence, rationale = _parse_relevance_response(response_text)
        except (ValueError, TypeError, KeyError):
            label = RelevanceLabel.CANNOT_DETERMINE
            confidence = 0.2
            evidence["parse_error"] = True
        else:
            evidence["label"] = label.value
            evidence["rationale"] = rationale

    if label == RelevanceLabel.RELEVANT:
        status = CheckStatus.PASS
    elif label == RelevanceLabel.UNRELATED:
        status = CheckStatus.FAIL
    else:
        # PARTIAL and CANNOT_DETERMINE both need a human look.
        status = CheckStatus.WARN

    result = CheckResult(
        name="relevance",
        status=status,
        confidence=confidence,
        evidence=evidence,
        cost=CheckCost(tokens_used=tokens_used, elapsed_ms=(time.monotonic() - start) * 1000),
    )

    # A failed call or an unparseable reply says nothing about the reference;
    # storing it would pin the failure for every later run.
    failed = "llm_error" in evidence or "parse_error" in evidence
    if cache is not None and not failed:
        cache.set("relevance", cache_key, {
            "status": result.status.value,
            "confidence": result.confidence,
            "evidence": evidence,
        })

    return result
@@ -0,0 +1,143 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import time
5
+ from typing import Any
6
+
7
+ import httpx
8
+
9
+ from citesentry.config import get_settings
10
+ from citesentry.models import CheckCost, CheckResult, CheckStatus
11
+
12
# Hostnames that _is_bot_protected reports as bot-protected regardless of the
# HTTP status or body.
_BOT_PROTECTION_HOSTS = {
    "linkedin.com", "www.linkedin.com",
    "twitter.com", "x.com", "www.twitter.com",
    "facebook.com", "www.facebook.com",
    "instagram.com", "www.instagram.com",
}

# Lowercase substrings searched for in the lowercased body of a 403 response
# to recognise an anti-bot interstitial page.
_BOT_PROTECTION_INDICATORS = [
    "cloudflare", "just a moment", "enable javascript", "checking your browser",
    "ddos protection", "attention required",
]

# Request headers sent with every liveness probe made by _check_single_url.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; citesentry/0.1; citation verification bot; "
        "+https://github.com/mkassaf/CiteSentry)"
    )
}
30
+
31
+
32
def _is_bot_protected(url: str, status: int, body: str) -> bool:
    """Tell whether a response most likely came from anti-bot protection.

    True when the URL's host is one of ``_BOT_PROTECTION_HOSTS`` or a
    subdomain of one (e.g. ``m.facebook.com``), or when the status is 403
    and the body contains one of ``_BOT_PROTECTION_INDICATORS``.

    Args:
        url: The final URL of the response.
        status: HTTP status code of the response.
        body: Response body text; may be empty.
    """
    from urllib.parse import urlparse
    # hostname is already lowercased by urlparse; drop a trailing root dot.
    host = (urlparse(url).hostname or "").rstrip(".")
    if any(
        host == known or host.endswith("." + known)
        for known in _BOT_PROTECTION_HOSTS
    ):
        return True
    if status != 403:
        return False
    lowered = body.lower()
    return any(ind in lowered for ind in _BOT_PROTECTION_INDICATORS)
40
+
41
+
42
async def _check_single_url(
    client: httpx.AsyncClient, url: str
) -> tuple[str, CheckStatus, dict[str, Any]]:
    """Probe one URL and classify the outcome.

    Sends HEAD first and repeats the request as GET when the server
    answers 405 or 403. The 403 retry is needed because a HEAD response
    has no body, so the bot-protection body check could never match on it.

    Returns:
        ``(url, status, evidence)`` where status is PASS for 2xx, SKIPPED
        when bot protection is detected, WARN on timeout, and FAIL for any
        other HTTP status, transport error or malformed URL. *evidence*
        always contains ``"url"``; it contains ``"status_code"`` only when
        an HTTP response was actually received.
    """
    evidence: dict[str, Any] = {"url": url}
    try:
        r = await client.head(url, headers=_HEADERS, follow_redirects=True)
        if r.status_code in (403, 405):
            r = await client.get(url, headers=_HEADERS, follow_redirects=True)
        final_url = str(r.url)
        evidence["status_code"] = r.status_code
        evidence["final_url"] = final_url
        evidence["redirected"] = final_url != url

        body = ""
        if r.status_code == 403:
            try:
                body = r.text[:2000]
            except Exception:
                # Best effort: an undecodable body just means no indicator match.
                pass

        if _is_bot_protected(final_url, r.status_code, body):
            evidence["bot_protection"] = True
            return url, CheckStatus.SKIPPED, evidence

        if 200 <= r.status_code < 300:
            return url, CheckStatus.PASS, evidence
        evidence["error"] = f"HTTP {r.status_code}"
        return url, CheckStatus.FAIL, evidence

    except httpx.TimeoutException:
        evidence["error"] = "timeout"
        return url, CheckStatus.WARN, evidence
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # httpx.InvalidURL is not an HTTPError subclass; a malformed URL
        # must fail this one URL rather than abort the whole check.
        evidence["error"] = str(e)
        return url, CheckStatus.FAIL, evidence
78
+
79
+
80
async def check_url_liveness(
    urls: list[str],
    use_cache: bool = True,
) -> CheckResult:
    """Check that every URL in *urls* is reachable.

    URLs are probed one after another, with ``settings.politeness_delay``
    seconds between consecutive network probes. The overall status is FAIL
    if any URL failed, else WARN if any timed out, else SKIPPED if all
    were skipped, else PASS.

    Args:
        urls: URLs to probe; an empty list yields a SKIPPED result.
        use_cache: Read and write per-URL results through the shared cache.

    Returns:
        A ``CheckResult`` named ``"url_liveness"`` whose evidence lists the
        per-URL results. ``cost.api_calls`` counts only URLs that were
        actually probed, not cache hits.
    """
    start = time.monotonic()

    if not urls:
        return CheckResult(
            name="url_liveness",
            status=CheckStatus.SKIPPED,
            confidence=1.0,
            evidence={"reason": "no urls"},
            cost=CheckCost(elapsed_ms=0.0),
        )

    results: list[dict[str, Any]] = []
    settings = get_settings()
    probes = 0

    cache = None
    if use_cache:
        from citesentry.cache import get_cache
        cache = get_cache()

    async with httpx.AsyncClient(
        timeout=settings.request_timeout, follow_redirects=True
    ) as client:
        last_index = len(urls) - 1
        for index, url in enumerate(urls):
            if cache is not None:
                cached = cache.get("url_liveness", url)
                if cached is not None:
                    results.append({**cached, "from_cache": True})
                    continue

            _, status, evidence = await _check_single_url(client, url)
            probes += 1
            results.append({"status": status.value, "evidence": evidence})

            # Only outcomes backed by a real HTTP response are stored.
            # Timeouts and connection errors carry no "status_code" and may
            # be transient, so caching them would pin a temporary outage.
            if cache is not None and "status_code" in evidence:
                cache.set("url_liveness", url, {"status": status.value, "evidence": evidence})

            # No need to wait once the last URL has been handled.
            if index != last_index:
                await asyncio.sleep(settings.politeness_delay)

    elapsed = (time.monotonic() - start) * 1000

    statuses = [CheckStatus(r["status"]) for r in results]

    if any(s == CheckStatus.FAIL for s in statuses):
        overall = CheckStatus.FAIL
        confidence = 0.9
    elif any(s == CheckStatus.WARN for s in statuses):
        overall = CheckStatus.WARN
        confidence = 0.7
    elif all(s == CheckStatus.SKIPPED for s in statuses):
        overall = CheckStatus.SKIPPED
        confidence = 0.5
    else:
        overall = CheckStatus.PASS
        confidence = 0.95

    return CheckResult(
        name="url_liveness",
        status=overall,
        confidence=confidence,
        evidence={"url_results": results, "total_urls": len(urls)},
        cost=CheckCost(api_calls=probes, elapsed_ms=elapsed),
    )