citesentry 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- citesentry/__init__.py +3 -0
- citesentry/cache.py +70 -0
- citesentry/checks/__init__.py +0 -0
- citesentry/checks/existence.py +219 -0
- citesentry/checks/relevance.py +176 -0
- citesentry/checks/url_liveness.py +143 -0
- citesentry/cli.py +193 -0
- citesentry/config.py +46 -0
- citesentry/core/__init__.py +4 -0
- citesentry/core/cascade.py +17 -0
- citesentry/core/engine.py +94 -0
- citesentry/core/verdict.py +56 -0
- citesentry/llm/__init__.py +3 -0
- citesentry/llm/base.py +13 -0
- citesentry/llm/deepseek.py +41 -0
- citesentry/llm/mcp_sampling.py +17 -0
- citesentry/mcp_server.py +156 -0
- citesentry/models.py +81 -0
- citesentry/parse/__init__.py +3 -0
- citesentry/parse/bibtex.py +87 -0
- citesentry/parse/csl_json.py +87 -0
- citesentry/parse/detect.py +100 -0
- citesentry/parse/doi_list.py +20 -0
- citesentry/parse/nbib.py +87 -0
- citesentry/parse/pdf_refs.py +47 -0
- citesentry/parse/plaintext.py +329 -0
- citesentry/parse/ris.py +75 -0
- citesentry/sources/__init__.py +3 -0
- citesentry/sources/arxiv.py +111 -0
- citesentry/sources/base.py +19 -0
- citesentry/sources/crossref.py +97 -0
- citesentry/sources/domain/__init__.py +0 -0
- citesentry/sources/domain/dblp.py +86 -0
- citesentry/sources/domain/pubmed.py +153 -0
- citesentry/sources/openalex.py +99 -0
- citesentry/sources/semantic_scholar.py +73 -0
- citesentry/sources/unpaywall.py +46 -0
- citesentry-0.1.1.dist-info/METADATA +201 -0
- citesentry-0.1.1.dist-info/RECORD +41 -0
- citesentry-0.1.1.dist-info/WHEEL +4 -0
- citesentry-0.1.1.dist-info/entry_points.txt +3 -0
citesentry/__init__.py
ADDED
citesentry/cache.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import sqlite3
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Cache:
    """Small persistent key/value cache stored in a single SQLite table.

    Each entry is addressed by the SHA-256 hex digest of
    ``"<namespace>:<identifier>"`` and holds the JSON-encoded value plus the
    wall-clock time (``time.time()``) at which it was written.

    Instances can be used as context managers; the connection is closed on
    exit.
    """

    def __init__(self, path: Path) -> None:
        """Open the cache database at *path*, creating it if necessary.

        Missing parent directories are created first.
        """
        self._path = path
        self._path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(str(path), check_same_thread=False)
        self._init()

    def __enter__(self) -> Cache:
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def _init(self) -> None:
        """Create the ``cache`` table when it does not exist yet."""
        # ``with connection`` commits on success and rolls back on error.
        with self._conn:
            self._conn.execute(
                """CREATE TABLE IF NOT EXISTS cache (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL,
                    created_at REAL NOT NULL
                )"""
            )

    @staticmethod
    def _key(namespace: str, identifier: str) -> str:
        """Return the SHA-256 hex digest of ``"<namespace>:<identifier>"``."""
        return hashlib.sha256(f"{namespace}:{identifier}".encode()).hexdigest()

    def get(
        self,
        namespace: str,
        identifier: str,
        max_age: float | None = None,
    ) -> Any | None:
        """Return the cached value, or ``None`` when there is no usable entry.

        Args:
            namespace: Logical group the entry belongs to.
            identifier: Entry identifier within *namespace*.
            max_age: Optional maximum entry age in seconds. Entries written
                longer ago than this are treated as missing. ``None`` (the
                default) disables the age check.
        """
        row = self._conn.execute(
            "SELECT value, created_at FROM cache WHERE key = ?",
            (self._key(namespace, identifier),),
        ).fetchone()
        if row is None:
            return None
        raw_value, created_at = row
        if max_age is not None and time.time() - created_at > max_age:
            return None
        return json.loads(raw_value)

    def set(self, namespace: str, identifier: str, value: Any) -> None:
        """Store *value* (JSON-encoded), replacing any existing entry."""
        payload = json.dumps(value)
        with self._conn:
            self._conn.execute(
                "INSERT OR REPLACE INTO cache (key, value, created_at) VALUES (?, ?, ?)",
                (self._key(namespace, identifier), payload, time.time()),
            )

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Module-level Cache singleton: created lazily by get_cache() and closed and
# cleared again by reset_cache().
_cache: Cache | None = None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_cache(path: Path | None = None) -> Cache:
    """Return the shared Cache instance, creating it on first use.

    *path* is consulted only when the instance is created; when it is falsy
    the location comes from ``get_settings().cache_path``. Later calls return
    the existing instance and ignore *path*.
    """
    global _cache
    if _cache is not None:
        return _cache

    from citesentry.config import get_settings

    _cache = Cache(path or get_settings().cache_path)
    return _cache
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def reset_cache() -> None:
    """Close the shared Cache, if one exists, and forget it.

    The next get_cache() call will build a fresh instance.
    """
    global _cache
    if _cache is None:
        return
    _cache.close()
    _cache = None
|
|
File without changes
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from rapidfuzz import fuzz
|
|
7
|
+
|
|
8
|
+
from citesentry.models import Candidate, CheckCost, CheckResult, CheckStatus, Reference, Verdict
|
|
9
|
+
from citesentry.sources.base import SourceAdapter
|
|
10
|
+
|
|
11
|
+
# Cut-offs on a 0-100 scale. Despite the names, check_existence compares them
# against the *composite* candidate score multiplied by 100, not the title score.
_TITLE_PASS_THRESHOLD = 85.0
_TITLE_WARN_THRESHOLD = 70.0
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _surname(name: str) -> str:
|
|
16
|
+
"""Extract surname from 'Last, First' or 'First Last' format."""
|
|
17
|
+
name = name.strip()
|
|
18
|
+
if "," in name:
|
|
19
|
+
return name.split(",")[0].strip().lower()
|
|
20
|
+
parts = name.split()
|
|
21
|
+
return parts[-1].lower() if parts else ""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _author_overlap(ref_authors: list[str], cand_authors: list[str]) -> float:
    """Score how well the cited author list matches a candidate's, in [0, 1].

    Authors are compared by surname (see ``_surname``). Names that yield no
    surname (blank strings) are ignored so they can neither "match" each other
    nor dilute the score. When either side has no usable surname the neutral
    value 0.5 is returned, because nothing can be concluded.

    When the cited list is less than half as long as the candidate's, the
    score is the fraction of cited surnames found in the candidate (this
    covers "et al." truncation and papers with many authors). Otherwise it is
    the Jaccard index of the two surname sets.
    """
    ref_surnames = {s for s in map(_surname, ref_authors) if s}
    cand_surnames = {s for s in map(_surname, cand_authors) if s}
    if not ref_surnames or not cand_surnames:
        return 0.5

    shared = ref_surnames & cand_surnames
    if len(ref_surnames) < len(cand_surnames) * 0.5:
        return len(shared) / len(ref_surnames)
    return len(shared) / len(ref_surnames | cand_surnames)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _year_score(ref_year: int | None, cand_year: int | None) -> float:
|
|
39
|
+
if ref_year is None or cand_year is None:
|
|
40
|
+
return 0.5
|
|
41
|
+
diff = abs(ref_year - cand_year)
|
|
42
|
+
if diff == 0:
|
|
43
|
+
return 1.0
|
|
44
|
+
if diff == 1:
|
|
45
|
+
return 0.7
|
|
46
|
+
return 0.0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _score_candidate(ref: Reference, cand: Candidate) -> tuple[float, dict[str, Any]]:
    """Score how well *cand* matches *ref*.

    Returns ``(composite, evidence)``. The composite is a weighted sum of the
    title (0.55), author (0.25), year (0.15) and venue (0.05) scores, forced
    to 0.0 whenever the title score is 0.0. ``evidence`` holds every
    component and the composite, each rounded to three decimals.
    """
    # Title: fuzzy token-set similarity; 0.0 when either title is missing.
    title_score = (
        fuzz.token_set_ratio(ref.title.lower(), cand.title.lower()) / 100.0
        if ref.title and cand.title
        else 0.0
    )
    author_score = _author_overlap(ref.authors, cand.authors)
    year_score = _year_score(ref.year, cand.year)
    # Venue: neutral 0.5 when either side does not state one.
    venue_score = (
        fuzz.token_set_ratio(ref.venue.lower(), cand.venue.lower()) / 100.0
        if ref.venue and cand.venue
        else 0.5
    )

    composite = (
        0.0
        if title_score == 0.0
        else (
            title_score * 0.55
            + author_score * 0.25
            + year_score * 0.15
            + venue_score * 0.05
        )
    )

    evidence: dict[str, Any] = {
        "title_score": round(title_score, 3),
        "author_score": round(author_score, 3),
        "year_score": round(year_score, 3),
        "venue_score": round(venue_score, 3),
        "composite": round(composite, 3),
    }
    return composite, evidence
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _check_metadata_consistency(ref: Reference, best: Candidate) -> list[str]:
|
|
82
|
+
mismatches = []
|
|
83
|
+
if ref.year and best.year and abs(ref.year - best.year) > 2:
|
|
84
|
+
mismatches.append(f"year: cited={ref.year}, found={best.year}")
|
|
85
|
+
if ref.doi and best.doi and ref.doi.lower().strip("/") != best.doi.lower().strip("/"):
|
|
86
|
+
mismatches.append(f"doi: cited={ref.doi}, found={best.doi}")
|
|
87
|
+
if ref.authors and best.authors:
|
|
88
|
+
overlap = _author_overlap(ref.authors, best.authors)
|
|
89
|
+
if overlap < 0.3:
|
|
90
|
+
mismatches.append(
|
|
91
|
+
f"authors: low overlap ({overlap:.2f}), "
|
|
92
|
+
f"cited={ref.authors[:2]}, found={best.authors[:2]}"
|
|
93
|
+
)
|
|
94
|
+
return mismatches
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _existence_cache_key(ref: Reference) -> str | None:
    """Return the cache identifier for *ref*: its DOI, else ``"<title>|<year>"``, else None."""
    if ref.doi:
        return ref.doi
    return f"{ref.title}|{ref.year}" if ref.title else None


async def check_existence(
    ref: Reference,
    sources: list[SourceAdapter],
    domain_sources: list[SourceAdapter] | None = None,
    use_cache: bool = True,
) -> CheckResult:
    """Check whether *ref* can be found in the given bibliographic sources.

    Lookup proceeds in stages, each run only if the previous one produced no
    candidate:

    1. DOI lookup against every entry of *sources* (when the reference has a DOI);
    2. metadata search against every entry of *sources*;
    3. DOI lookup and search against every entry of *domain_sources*.

    The best-scoring candidate decides the status:

    * title score >= 0.85 or composite >= the pass threshold: PASS, or WARN
      when year/DOI/author mismatches are found;
    * composite >= the warn threshold: WARN;
    * otherwise, or when there is no candidate at all: FAIL.

    Args:
        ref: The reference to verify.
        sources: General-purpose source adapters.
        domain_sources: Optional extra adapters used as a last resort.
        use_cache: Read and write the shared cache under the ``"existence"``
            namespace.

    Returns:
        A CheckResult named ``"existence"``. Errors raised by individual
        sources are recorded in the evidence under ``"<source>_error"`` and do
        not abort the check.
    """
    start = time.monotonic()
    api_calls = 0
    candidates: list[tuple[float, dict, Candidate]] = []
    evidence: dict[str, Any] = {}
    source_failed = False

    cache = None
    cache_key = None
    if use_cache:
        from citesentry.cache import get_cache

        cache = get_cache()
        cache_key = _existence_cache_key(ref)
        if cache_key:
            cached = cache.get("existence", cache_key)
            if cached is not None:
                elapsed = (time.monotonic() - start) * 1000
                return CheckResult(
                    name="existence",
                    status=CheckStatus(cached["status"]),
                    confidence=cached["confidence"],
                    evidence={**cached["evidence"], "from_cache": True},
                    cost=CheckCost(api_calls=0, elapsed_ms=elapsed),
                )

    # Stage 1: direct DOI lookup.
    if ref.doi:
        for src in sources:
            try:
                cand = await src.lookup_doi(ref.doi)
                api_calls += 1
                if cand:
                    score, ev = _score_candidate(ref, cand)
                    candidates.append((score, {**ev, "source": src.name, "via": "doi_lookup"}, cand))
            except Exception as e:
                # Best effort: one failing source must not abort the whole check.
                evidence[f"{src.name}_error"] = str(e)
                source_failed = True

    # Stage 2: metadata search.
    if not candidates:
        for src in sources:
            try:
                results = await src.search(ref)
                api_calls += 1
                for cand in results:
                    score, ev = _score_candidate(ref, cand)
                    candidates.append((score, {**ev, "source": src.name, "via": "search"}, cand))
            except Exception as e:
                evidence[f"{src.name}_error"] = str(e)
                source_failed = True

    # Stage 3: domain-specific sources as a last resort.
    if not candidates and domain_sources:
        for src in domain_sources:
            try:
                if ref.doi:
                    cand = await src.lookup_doi(ref.doi)
                    api_calls += 1
                    if cand:
                        score, ev = _score_candidate(ref, cand)
                        candidates.append((score, {**ev, "source": src.name, "via": "doi_lookup"}, cand))
                results = await src.search(ref)
                api_calls += 1
                for cand in results:
                    score, ev = _score_candidate(ref, cand)
                    candidates.append((score, {**ev, "source": src.name, "via": "search"}, cand))
            except Exception as e:
                evidence[f"{src.name}_error"] = str(e)
                source_failed = True

    elapsed = (time.monotonic() - start) * 1000

    if not candidates:
        result = CheckResult(
            name="existence",
            status=CheckStatus.FAIL,
            confidence=0.9 if (ref.title or ref.doi) else 0.5,
            evidence={**evidence, "verdict": "not_found", "candidates_checked": 0},
            cost=CheckCost(api_calls=api_calls, elapsed_ms=elapsed),
        )
    else:
        # Stable descending sort: among equal scores the earliest candidate wins.
        candidates.sort(key=lambda x: x[0], reverse=True)
        best_score, best_ev, best_cand = candidates[0]
        evidence.update(best_ev)
        evidence["best_candidate_title"] = best_cand.title
        evidence["best_candidate_source"] = best_cand.source
        evidence["total_candidates"] = len(candidates)

        title_score = best_ev.get("title_score", 0.0)

        # High title match always warrants a look, even if other fields diverge
        if title_score >= 0.85 or best_score * 100 >= _TITLE_PASS_THRESHOLD:
            mismatches = _check_metadata_consistency(ref, best_cand)
            if mismatches:
                evidence["mismatches"] = mismatches
                status = CheckStatus.WARN
                confidence = min(max(title_score, best_score), 0.85)
            else:
                status = CheckStatus.PASS
                confidence = max(title_score, best_score)
        elif best_score * 100 >= _TITLE_WARN_THRESHOLD:
            status = CheckStatus.WARN
            confidence = best_score * 0.7
        else:
            status = CheckStatus.FAIL
            confidence = 0.7

        result = CheckResult(
            name="existence",
            status=status,
            confidence=confidence,
            evidence=evidence,
            cost=CheckCost(api_calls=api_calls, elapsed_ms=elapsed),
        )

    # A "not found" obtained while a source was raising may only reflect an
    # outage. This function reads the cache without an age limit, so storing
    # it would turn a transient failure into a lasting FAIL.
    transient_miss = source_failed and not candidates
    if cache is not None and cache_key and not transient_miss:
        cache.set("existence", cache_key, {
            "status": result.status.value,
            "confidence": result.confidence,
            "evidence": result.evidence,
        })

    return result
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from citesentry.models import CheckCost, CheckResult, CheckStatus, RelevanceLabel, Reference
|
|
10
|
+
|
|
11
|
+
_RELEVANCE_PROMPT = """\
|
|
12
|
+
You are a citation relevance judge. Given a reference title/topic and fetched content from the cited source, determine if the content matches the citation.
|
|
13
|
+
|
|
14
|
+
Reference title: {title}
|
|
15
|
+
Reference topic context: {context}
|
|
16
|
+
|
|
17
|
+
Fetched content (first 1500 chars):
|
|
18
|
+
{content}
|
|
19
|
+
|
|
20
|
+
Respond with ONLY valid JSON in this exact format:
|
|
21
|
+
{{"label": "<RELEVANT|PARTIAL|UNRELATED|CANNOT_DETERMINE>", "confidence": <0.0-1.0>, "rationale": "<one sentence>"}}
|
|
22
|
+
|
|
23
|
+
Labels:
|
|
24
|
+
- RELEVANT: content clearly matches the cited title/topic
|
|
25
|
+
- PARTIAL: content partially matches or is in the same general area
|
|
26
|
+
- UNRELATED: content does not match the citation at all
|
|
27
|
+
- CANNOT_DETERMINE: paywalled, JS-only page, no meaningful content to judge
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
async def _fetch_content(url: str, doi: str | None = None) -> str:
    """Fetch content for relevance check. Returns text snippet.

    Issues a GET for *url* (following redirects, 15 s timeout). On a 2xx
    response it takes the first 3000 characters of the body, collapses runs of
    whitespace, and returns at most 1500 characters. Any other status, a
    transport error, or a malformed URL yields "".

    Args:
        url: Address to fetch.
        doi: Currently unused; accepted for interface compatibility.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; citesentry/0.1; citation verification bot)"
        )
    }
    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            r = await client.get(url, headers=headers)
            if 200 <= r.status_code < 300:
                text = r.text[:3000]
                text = " ".join(text.split())
                return text[:1500]
    except (httpx.HTTPError, httpx.InvalidURL):
        # InvalidURL is not an HTTPError subclass; without it a malformed
        # reference URL would propagate and abort the relevance check.
        pass
    return ""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
async def check_relevance(
    ref: Reference,
    llm_client: object,
    abstract: str | None = None,
    use_cache: bool = True,
) -> CheckResult:
    """Ask an LLM whether the cited source's content matches the reference.

    The content judged is *abstract* when given, otherwise the text fetched
    from the reference's first URL. The LLM is expected to answer with a JSON
    object holding ``label``, ``confidence`` and ``rationale``.

    Status mapping: RELEVANT gives PASS, PARTIAL gives WARN, UNRELATED gives
    FAIL, and CANNOT_DETERMINE (including LLM or parse failures) gives WARN.

    Args:
        ref: The reference being checked.
        llm_client: Object exposing ``async complete(prompt) -> str``; ``None``
            skips the check.
        abstract: Optional pre-fetched abstract to judge instead of fetching.
        use_cache: Read and write the shared cache under the ``"relevance"``
            namespace. Results produced by an LLM or parse failure are never
            written, so a transient failure is retried on the next run.

    Returns:
        A CheckResult named ``"relevance"``. It is SKIPPED when there is no
        LLM client or the reference has neither title nor DOI, and WARN with
        reason ``"no_content_available"`` when nothing could be fetched.
    """
    start = time.monotonic()

    if llm_client is None:
        return CheckResult(
            name="relevance",
            status=CheckStatus.SKIPPED,
            confidence=0.0,
            evidence={"reason": "no_llm_client"},
            cost=CheckCost(elapsed_ms=0.0),
        )

    if not ref.title and not ref.doi:
        return CheckResult(
            name="relevance",
            status=CheckStatus.SKIPPED,
            confidence=0.0,
            evidence={"reason": "insufficient_reference_data"},
            cost=CheckCost(elapsed_ms=0.0),
        )

    cache = None
    cache_key = ref.doi or (f"{ref.title}|{','.join(ref.urls[:1])}" if ref.title else None)
    if use_cache and cache_key:
        from citesentry.cache import get_cache

        cache = get_cache()
        cached = cache.get("relevance", cache_key)
        if cached is not None:
            elapsed = (time.monotonic() - start) * 1000
            return CheckResult(
                name="relevance",
                status=CheckStatus(cached["status"]),
                confidence=cached["confidence"],
                evidence={**cached["evidence"], "from_cache": True},
                cost=CheckCost(elapsed_ms=elapsed),
            )

    content = abstract or ""
    if not content and ref.urls:
        content = await _fetch_content(ref.urls[0])

    if not content:
        return CheckResult(
            name="relevance",
            status=CheckStatus.WARN,
            confidence=0.3,
            evidence={"reason": "no_content_available", "label": "CANNOT_DETERMINE"},
            cost=CheckCost(elapsed_ms=(time.monotonic() - start) * 1000),
        )

    context = ""
    if ref.venue:
        context += f"Published in: {ref.venue}. "
    if ref.year:
        context += f"Year: {ref.year}."

    prompt = _RELEVANCE_PROMPT.format(
        title=ref.title or "Unknown title",
        context=context,
        content=content,
    )

    tokens_used = 0
    label = RelevanceLabel.CANNOT_DETERMINE
    confidence = 0.3
    evidence: dict[str, Any] = {"content_source": "abstract" if abstract else "fetched"}

    response_text = None
    try:
        response_text = await llm_client.complete(prompt)  # type: ignore[union-attr]
        # Rough cost estimate: whitespace-separated words, not model tokens.
        tokens_used = len(prompt.split()) + len(response_text.split())
    except Exception as e:
        # Best effort: any failure of the LLM call degrades to CANNOT_DETERMINE.
        evidence["llm_error"] = str(e)
        response_text = None
        confidence = 0.1

    if response_text is not None:
        try:
            response_text = response_text.strip()
            if response_text.startswith("```"):
                # Drop Markdown code-fence lines wrapped around the JSON.
                response_text = "\n".join(
                    line for line in response_text.split("\n") if not line.startswith("```")
                ).strip()

            parsed = json.loads(response_text)
            if not isinstance(parsed, dict):
                raise ValueError("LLM response is not a JSON object")

            parsed_label = RelevanceLabel(parsed.get("label", "CANNOT_DETERMINE"))
            raw_confidence = float(parsed.get("confidence", 0.3))
            if raw_confidence != raw_confidence:  # NaN never equals itself
                raise ValueError("confidence is NaN")

            label = parsed_label
            # The prompt asks for 0.0-1.0; clamp in case the model ignores it.
            confidence = min(max(raw_confidence, 0.0), 1.0)
            evidence["label"] = label.value
            evidence["rationale"] = parsed.get("rationale", "")
        except (json.JSONDecodeError, ValueError, KeyError, TypeError):
            label = RelevanceLabel.CANNOT_DETERMINE
            confidence = 0.2
            evidence["parse_error"] = True

    # Failure paths do not set the label above; record it for every outcome.
    evidence.setdefault("label", label.value)

    if label == RelevanceLabel.RELEVANT:
        status = CheckStatus.PASS
    elif label == RelevanceLabel.PARTIAL:
        status = CheckStatus.WARN
    elif label == RelevanceLabel.UNRELATED:
        status = CheckStatus.FAIL
    else:
        status = CheckStatus.WARN

    result = CheckResult(
        name="relevance",
        status=status,
        confidence=confidence,
        evidence=evidence,
        cost=CheckCost(tokens_used=tokens_used, elapsed_ms=(time.monotonic() - start) * 1000),
    )

    # Only genuine verdicts are cached; a failed call or unparseable reply
    # would otherwise be replayed on later runs instead of being retried.
    failed = "llm_error" in evidence or "parse_error" in evidence
    if cache is not None and not failed:
        cache.set("relevance", cache_key, {
            "status": result.status.value,
            "confidence": result.confidence,
            "evidence": evidence,
        })

    return result
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from citesentry.config import get_settings
|
|
10
|
+
from citesentry.models import CheckCost, CheckResult, CheckStatus
|
|
11
|
+
|
|
12
|
+
# Hosts that _is_bot_protected reports as bot-protected regardless of the
# response. Matching is on the exact hostname, so subdomains not listed here
# are not covered.
_BOT_PROTECTION_HOSTS = {
    "linkedin.com", "www.linkedin.com",
    "twitter.com", "x.com", "www.twitter.com",
    "facebook.com", "www.facebook.com",
    "instagram.com", "www.instagram.com",
}

# Lower-case substrings searched for in the (lower-cased) body of a 403
# response to recognize an anti-bot interstitial page.
_BOT_PROTECTION_INDICATORS = [
    "cloudflare", "just a moment", "enable javascript", "checking your browser",
    "ddos protection", "attention required",
]

# Headers sent with every liveness probe; the User-Agent identifies the tool
# and links to its repository.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; citesentry/0.1; citation verification bot; "
        "+https://github.com/mkassaf/CiteSentry)"
    )
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _is_bot_protected(url: str, status: int, body: str) -> bool:
    """Tell whether a response looks like anti-bot blocking rather than a dead link.

    True when the URL's hostname is listed in ``_BOT_PROTECTION_HOSTS``, or
    when *status* is 403 and *body* contains (case-insensitively) one of the
    ``_BOT_PROTECTION_INDICATORS`` phrases.
    """
    from urllib.parse import urlparse

    hostname = urlparse(url).hostname or ""
    if hostname in _BOT_PROTECTION_HOSTS:
        return True
    if status != 403:
        return False

    lowered = body.lower()
    for marker in _BOT_PROTECTION_INDICATORS:
        if marker in lowered:
            return True
    return False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# HEAD statuses that warrant a GET retry: 405/501 mean HEAD itself is not
# supported, and a 403 to HEAD carries no body to inspect for anti-bot pages.
_HEAD_RETRY_STATUSES = frozenset({403, 405, 501})


async def _check_single_url(
    client: httpx.AsyncClient, url: str
) -> tuple[str, CheckStatus, dict[str, Any]]:
    """Probe one URL and classify it.

    A HEAD request is tried first; when the server answers 403, 405 or 501 the
    probe is repeated with GET, so that servers rejecting HEAD are not
    reported dead and a 403 page body is available for bot-protection
    detection.

    Returns:
        ``(url, status, evidence)`` where status is

        * SKIPPED when the response looks like bot protection,
        * PASS for a 2xx response,
        * FAIL for any other status, a transport error or a malformed URL,
        * WARN on timeout.

        ``evidence`` always holds ``"url"``. Once a response was received it
        also holds ``"status_code"``, ``"final_url"`` and ``"redirected"``.
        Failures add ``"error"``.
    """
    evidence: dict[str, Any] = {"url": url}
    try:
        r = await client.head(url, headers=_HEADERS, follow_redirects=True)
        if r.status_code in _HEAD_RETRY_STATUSES:
            r = await client.get(url, headers=_HEADERS, follow_redirects=True)
        final_url = str(r.url)
        evidence["status_code"] = r.status_code
        evidence["final_url"] = final_url
        evidence["redirected"] = final_url != url

        body = ""
        if r.status_code == 403:
            try:
                body = r.text[:2000]
            except Exception:
                # Best effort: an unreadable body just disables the phrase check.
                body = ""

        if _is_bot_protected(final_url, r.status_code, body):
            evidence["bot_protection"] = True
            return url, CheckStatus.SKIPPED, evidence

        if 200 <= r.status_code < 300:
            return url, CheckStatus.PASS, evidence

        evidence["error"] = f"HTTP {r.status_code}"
        return url, CheckStatus.FAIL, evidence

    except httpx.TimeoutException:
        evidence["error"] = "timeout"
        return url, CheckStatus.WARN, evidence
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # InvalidURL is not an HTTPError subclass; catching it keeps one
        # malformed URL from aborting the whole liveness check.
        evidence["error"] = str(e)
        return url, CheckStatus.FAIL, evidence
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def check_url_liveness(
    urls: list[str],
    use_cache: bool = True,
) -> CheckResult:
    """Check that every URL in *urls* is reachable.

    URLs are probed sequentially with one shared HTTP client, sleeping
    ``settings.politeness_delay`` seconds after each network probe.

    Overall status: FAIL if any URL failed, else WARN if any URL warned, else
    SKIPPED if every URL was skipped, else PASS.

    Args:
        urls: Addresses to probe; an empty list yields a SKIPPED result.
        use_cache: Read and write the shared cache under the
            ``"url_liveness"`` namespace. Only outcomes backed by an actual
            HTTP response are written; timeouts and transport errors are
            retried on the next run.

    Returns:
        A CheckResult named ``"url_liveness"``. Its evidence lists the per-URL
        results, and its cost counts only URLs probed over the network (cache
        hits are not counted).
    """
    start = time.monotonic()

    if not urls:
        return CheckResult(
            name="url_liveness",
            status=CheckStatus.SKIPPED,
            confidence=1.0,
            evidence={"reason": "no urls"},
            cost=CheckCost(elapsed_ms=0.0),
        )

    results: list[dict[str, Any]] = []
    settings = get_settings()

    cache = None
    if use_cache:
        from citesentry.cache import get_cache

        cache = get_cache()

    network_checks = 0

    async with httpx.AsyncClient(
        timeout=settings.request_timeout, follow_redirects=True
    ) as client:
        for url in urls:
            if cache is not None:
                cached = cache.get("url_liveness", url)
                if cached is not None:
                    results.append({**cached, "from_cache": True})
                    continue

            _, status, evidence = await _check_single_url(client, url)
            network_checks += 1
            results.append({"status": status.value, "evidence": evidence})

            # "status_code" is present only when a response was received.
            # Timeouts and connection errors are likely transient, so they
            # are not cached.
            if cache is not None and "status_code" in evidence:
                cache.set("url_liveness", url, {"status": status.value, "evidence": evidence})

            await asyncio.sleep(settings.politeness_delay)

    elapsed = (time.monotonic() - start) * 1000

    statuses = [CheckStatus(r["status"]) for r in results]

    if any(s == CheckStatus.FAIL for s in statuses):
        overall = CheckStatus.FAIL
        confidence = 0.9
    elif any(s == CheckStatus.WARN for s in statuses):
        overall = CheckStatus.WARN
        confidence = 0.7
    elif all(s == CheckStatus.SKIPPED for s in statuses):
        overall = CheckStatus.SKIPPED
        confidence = 0.5
    else:
        overall = CheckStatus.PASS
        confidence = 0.95

    return CheckResult(
        name="url_liveness",
        status=overall,
        confidence=confidence,
        evidence={"url_results": results, "total_urls": len(urls)},
        cost=CheckCost(api_calls=network_checks, elapsed_ms=elapsed),
    )
|