adversarial-workflow 0.6.6-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- adversarial_workflow/__init__.py +1 -1
- adversarial_workflow/cli.py +351 -5
- adversarial_workflow/evaluators/__init__.py +11 -2
- adversarial_workflow/evaluators/config.py +39 -2
- adversarial_workflow/evaluators/discovery.py +97 -9
- adversarial_workflow/evaluators/resolver.py +211 -0
- adversarial_workflow/evaluators/runner.py +36 -13
- adversarial_workflow/library/__init__.py +56 -0
- adversarial_workflow/library/cache.py +184 -0
- adversarial_workflow/library/client.py +224 -0
- adversarial_workflow/library/commands.py +849 -0
- adversarial_workflow/library/config.py +81 -0
- adversarial_workflow/library/models.py +129 -0
- adversarial_workflow/utils/citations.py +643 -0
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/METADATA +160 -3
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/RECORD +20 -12
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/WHEEL +0 -0
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/entry_points.txt +0 -0
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/top_level.txt +0 -0
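
The largest addition is the new citation-verification module adversarial_workflow/utils/citations.py, reproduced in full in the diff below. As a quick orientation, here is a minimal usage sketch of its synchronous entry point; it assumes the module is importable as adversarial_workflow.utils.citations (per the file list above), uses placeholder URLs, and needs aiohttp installed for live checks:

    from adversarial_workflow.utils.citations import check_urls, print_verification_summary

    # Placeholder URLs for illustration only
    urls = [
        "https://example.com/",
        "https://example.org/missing-page",
    ]

    # Synchronous wrapper around the async checker; results are cached on disk
    # under ./.adversarial/url_cache.json with a 24-hour TTL by default
    results = check_urls(urls, concurrency=5, timeout=10)

    for r in results:
        print(r.url, r.status.value, r.status_code, r.error)

    print_verification_summary(results)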
adversarial_workflow/utils/citations.py
@@ -0,0 +1,643 @@
+"""
+Citation verification utilities for checking URLs in documents.
+
+This module provides:
+- URL extraction from markdown documents
+- Async parallel URL checking with caching
+- Inline marking of URL status
+- Blocked URL task file generation
+
+Status categories:
+- available: 200 OK, content accessible
+- blocked: Paywall/auth/bot-blocked (401, 403, or bot detection)
+- broken: 404, 500, timeout, DNS failure
+- redirect: 301/302 with final destination noted
+"""
+
+import asyncio
+import hashlib
+import json
+import logging
+import os
+import re
+import time
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+# Module logger for debugging URL check failures
+logger = logging.getLogger(__name__)
+
+
+class URLStatus(Enum):
+    """URL verification status categories."""
+
+    AVAILABLE = "available"
+    BLOCKED = "blocked"
+    BROKEN = "broken"
+    REDIRECT = "redirect"
+
+
+@dataclass
+class URLResult:
+    """Result of checking a single URL."""
+
+    url: str
+    status: URLStatus
+    status_code: Optional[int] = None
+    final_url: Optional[str] = None
+    error: Optional[str] = None
+    checked_at: Optional[float] = None
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "url": self.url,
+            "status": self.status.value,
+            "status_code": self.status_code,
+            "final_url": self.final_url,
+            "error": self.error,
+            "checked_at": self.checked_at,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "URLResult":
+        """Create from dictionary."""
+        return cls(
+            url=data["url"],
+            status=URLStatus(data["status"]),
+            status_code=data.get("status_code"),
+            final_url=data.get("final_url"),
+            error=data.get("error"),
+            checked_at=data.get("checked_at"),
+        )
+
+
+@dataclass
+class ExtractedURL:
+    """A URL extracted from a document with context."""
+
+    url: str
+    position: int
+    context: str
+    line_number: int
+
+
+# URL extraction pattern - matches http/https URLs
+URL_PATTERN = re.compile(r"https?://[^\s\)\]\>\"\'\`]+")
+
+# Bot detection patterns in response
+BOT_DETECTION_PATTERNS = [
+    "captcha",
+    "cloudflare",
+    "access denied",
+    "forbidden",
+    "bot detected",
+    "please verify",
+    "human verification",
+]
+
+# Default configuration
+DEFAULT_CONFIG = {
+    "max_urls": 100,
+    "concurrency": 10,
+    "timeout_per_url": 10,
+    "cache_ttl": 86400,  # 24 hours
+}
+
+
+def extract_urls(document: str, max_urls: int = 100) -> list[ExtractedURL]:
+    """
+    Extract URLs from a document with surrounding context.
+
+    Args:
+        document: The document text to extract URLs from
+        max_urls: Maximum number of URLs to extract (default: 100)
+
+    Returns:
+        List of ExtractedURL objects with position and context
+    """
+    urls = []
+    lines = document.split("\n")
+    line_starts = [0]
+    for line in lines:
+        line_starts.append(line_starts[-1] + len(line) + 1)
+
+    for match in URL_PATTERN.finditer(document):
+        url = match.group().rstrip(".,;:!?")  # Clean trailing punctuation
+        position = match.start()
+
+        # Find line number
+        line_number = 1
+        for i, start in enumerate(line_starts):
+            if start > position:
+                line_number = i
+                break
+
+        # Get context (50 chars before and after)
+        start = max(0, position - 50)
+        end = min(len(document), match.end() + 50)
+        context = document[start:end]
+
+        urls.append(
+            ExtractedURL(
+                url=url,
+                position=position,
+                context=context,
+                line_number=line_number,
+            )
+        )
+
+        if len(urls) >= max_urls:
+            break
+
+    return urls
+
+
+def get_cache_path(cache_dir: Optional[Path] = None) -> Path:
+    """Get the path to the URL cache file."""
+    if cache_dir is None:
+        cache_dir = Path.cwd() / ".adversarial"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir / "url_cache.json"
+
+
+def load_cache(cache_path: Path) -> dict[str, dict]:
+    """Load URL cache from disk."""
+    if not cache_path.exists():
+        return {}
+    try:
+        with open(cache_path) as f:
+            return json.load(f)
+    except (json.JSONDecodeError, OSError):
+        return {}
+
+
+def save_cache(cache_path: Path, cache: dict[str, dict]) -> None:
+    """Save URL cache to disk."""
+    with open(cache_path, "w") as f:
+        json.dump(cache, f, indent=2)
+
+
+def get_cache_key(url: str) -> str:
+    """Generate a cache key for a URL."""
+    return hashlib.md5(url.encode()).hexdigest()
+
+
+def classify_response(status_code: int, _headers: dict, content: Optional[str] = None) -> URLStatus:
+    """
+    Classify HTTP response into a URL status.
+
+    Args:
+        status_code: HTTP status code
+        _headers: Response headers (reserved for future use)
+        content: Optional response body content (for bot detection)
+
+    Returns:
+        URLStatus enum value
+    """
+    if status_code == 200:
+        # Check for bot blocking in content
+        if content:
+            content_lower = content.lower()
+            for pattern in BOT_DETECTION_PATTERNS:
+                if pattern in content_lower:
+                    return URLStatus.BLOCKED
+        return URLStatus.AVAILABLE
+    elif status_code in (301, 302, 307, 308):
+        return URLStatus.REDIRECT
+    elif status_code in (401, 403):
+        return URLStatus.BLOCKED
+    elif status_code == 429:
+        return URLStatus.BLOCKED  # Rate limited
+    else:
+        return URLStatus.BROKEN
+
+
+async def check_url_async(
+    url: str,
+    timeout: int = 10,
+    session=None,
+) -> URLResult:
+    """
+    Check a single URL asynchronously.
+
+    Args:
+        url: URL to check
+        timeout: Request timeout in seconds
+        session: Optional aiohttp session to reuse
+
+    Returns:
+        URLResult with status information
+    """
+    try:
+        import aiohttp
+    except ImportError:
+        return URLResult(
+            url=url,
+            status=URLStatus.BROKEN,
+            error="aiohttp not installed - run: pip install aiohttp",
+            checked_at=time.time(),
+        )
+
+    close_session = False
+    if session is None:
+        session = aiohttp.ClientSession()
+        close_session = True
+
+    try:
+        async with session.head(
+            url,
+            timeout=aiohttp.ClientTimeout(total=timeout),
+            allow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 (compatible; CitationVerifier/1.0)"},
+        ) as response:
+            final_url = str(response.url) if str(response.url) != url else None
+            status = classify_response(response.status, dict(response.headers))
+
+            # If redirect to an available page, mark as redirect (informational)
+            # Keep broken/blocked status if redirect leads to error page
+            if final_url and response.history and status == URLStatus.AVAILABLE:
+                status = URLStatus.REDIRECT
+
+            return URLResult(
+                url=url,
+                status=status,
+                status_code=response.status,
+                final_url=final_url,
+                checked_at=time.time(),
+            )
+    except asyncio.TimeoutError:
+        return URLResult(
+            url=url,
+            status=URLStatus.BROKEN,
+            error="Timeout",
+            checked_at=time.time(),
+        )
+    except Exception as e:
+        error_name = type(e).__name__
+        # Log full exception for debugging while returning truncated message
+        logger.debug("URL check failed for %s: %s", url, e, exc_info=True)
+        return URLResult(
+            url=url,
+            status=URLStatus.BROKEN,
+            error=f"{error_name}: {str(e)[:50]}",
+            checked_at=time.time(),
+        )
+    finally:
+        if close_session:
+            await session.close()
+
+
+async def check_urls_parallel(
+    urls: list[str],
+    concurrency: int = 10,
+    timeout: int = 10,
+    cache: Optional[dict] = None,
+    cache_ttl: int = 86400,
+) -> list[URLResult]:
+    """
+    Check multiple URLs in parallel with optional caching.
+
+    Args:
+        urls: List of URLs to check
+        concurrency: Maximum concurrent requests (must be >= 1)
+        timeout: Timeout per request in seconds (must be >= 1)
+        cache: Optional cache dictionary
+        cache_ttl: Cache TTL in seconds (default: 24 hours)
+
+    Returns:
+        List of URLResult objects
+
+    Raises:
+        ValueError: If concurrency or timeout is less than 1
+    """
+    # Validate parameters to prevent deadlocks
+    if concurrency < 1:
+        raise ValueError(f"concurrency must be >= 1, got {concurrency}")
+    if timeout < 1:
+        raise ValueError(f"timeout must be >= 1, got {timeout}")
+
+    try:
+        import aiohttp
+    except ImportError:
+        return [
+            URLResult(
+                url=url,
+                status=URLStatus.BROKEN,
+                error="aiohttp not installed",
+                checked_at=time.time(),
+            )
+            for url in urls
+        ]
+
+    url_to_result: dict[str, URLResult] = {}
+    urls_to_check = []
+    current_time = time.time()
+
+    # Check cache first
+    if cache is not None:
+        for url in urls:
+            cache_key = get_cache_key(url)
+            if cache_key in cache:
+                cached = cache[cache_key]
+                if cached.get("expires", 0) > current_time:
+                    url_to_result[url] = URLResult.from_dict(cached["result"])
+                    continue
+            urls_to_check.append(url)
+    else:
+        urls_to_check = list(urls)
+
+    if urls_to_check:
+        # Create semaphore for concurrency limiting
+        semaphore = asyncio.Semaphore(concurrency)
+
+        async def check_with_semaphore(session, url):
+            async with semaphore:
+                return await check_url_async(url, timeout, session)
+
+        # Check remaining URLs
+        connector = aiohttp.TCPConnector(limit=concurrency, limit_per_host=5)
+        async with aiohttp.ClientSession(connector=connector) as session:
+            tasks = [check_with_semaphore(session, url) for url in urls_to_check]
+            checked_results = await asyncio.gather(*tasks)
+
+        # Update cache and store results
+        for result in checked_results:
+            if cache is not None:
+                cache_key = get_cache_key(result.url)
+                cache[cache_key] = {
+                    "result": result.to_dict(),
+                    "expires": current_time + cache_ttl,
+                }
+            url_to_result[result.url] = result
+
+    # Return results in original URL order
+    return [url_to_result[url] for url in urls]
+
+
+def check_urls(
+    urls: list[str],
+    concurrency: int = 10,
+    timeout: int = 10,
+    cache_dir: Optional[Path] = None,
+    cache_ttl: int = 86400,
+) -> list[URLResult]:
+    """
+    Check multiple URLs synchronously (wrapper around async version).
+
+    Args:
+        urls: List of URLs to check
+        concurrency: Maximum concurrent requests
+        timeout: Timeout per request in seconds
+        cache_dir: Optional cache directory
+        cache_ttl: Cache TTL in seconds
+
+    Returns:
+        List of URLResult objects
+
+    Raises:
+        RuntimeError: If called from within an async context (event loop running).
+            Use check_urls_parallel() directly from async code.
+    """
+    # Guard against calling from async context
+    try:
+        asyncio.get_running_loop()
+        raise RuntimeError(
+            "check_urls() cannot be called from within an async context. "
+            "Use check_urls_parallel() directly instead."
+        )
+    except RuntimeError as e:
+        # No running loop - this is expected, proceed
+        if "no running event loop" not in str(e).lower():
+            raise
+
+    # Load cache
+    cache_path = get_cache_path(cache_dir)
+    cache = load_cache(cache_path)
+
+    # Run async check
+    results = asyncio.run(
+        check_urls_parallel(
+            urls,
+            concurrency=concurrency,
+            timeout=timeout,
+            cache=cache,
+            cache_ttl=cache_ttl,
+        )
+    )
+
+    # Save cache
+    save_cache(cache_path, cache)
+
+    return results
+
+
+def get_status_badge(result: URLResult) -> str:
+    """
+    Generate an inline status badge for a URL result.
+
+    Args:
+        result: URLResult to generate badge for
+
+    Returns:
+        Markdown-formatted status badge
+    """
+    if result.status == URLStatus.AVAILABLE:
+        return f"[✅ Verified | {result.status_code} OK]"
+    elif result.status == URLStatus.BLOCKED:
+        if result.status_code:
+            return f"[⚠️ Blocked | {result.status_code}]"
+        return "[⚠️ Blocked | Access Denied]"
+    elif result.status == URLStatus.BROKEN:
+        if result.error:
+            return f"[❌ Broken | {result.error}]"
+        if result.status_code:
+            return f"[❌ Broken | {result.status_code}]"
+        return "[❌ Broken | Unreachable]"
+    elif result.status == URLStatus.REDIRECT:
+        dest = (
+            result.final_url[:30] + "..."
+            if result.final_url and len(result.final_url) > 30
+            else result.final_url
+        )
+        return f"[🔄 Redirect | → {dest}]"
+    return "[❓ Unknown]"
+
+
+def mark_urls_inline(document: str, results: list[URLResult]) -> str:
+    """
+    Mark URLs in a document with their status badges.
+
+    Args:
+        document: Original document text
+        results: List of URL check results
+
+    Returns:
+        Document with inline status badges added after URLs
+    """
+    # Create URL to result mapping
+    url_results = {r.url: r for r in results}
+
+    # Find all URLs and their positions
+    marked = document
+    offset = 0  # Track offset as we insert badges
+
+    for match in URL_PATTERN.finditer(document):
+        url = match.group().rstrip(".,;:!?")  # Same stripping as extract_urls
+        if url in url_results:
+            result = url_results[url]
+            badge = get_status_badge(result)
+
+            # Check if badge already exists after this URL
+            end_pos = match.end() + offset
+            remaining = marked[end_pos:]
+            if remaining.startswith((" [✅", " [⚠️", " [❌", " [🔄")):
+                continue  # Already marked
+
+            # Insert badge after URL
+            insert_pos = end_pos
+            marked = marked[:insert_pos] + " " + badge + marked[insert_pos:]
+            offset += len(badge) + 1
+
+    return marked
+
+
+def generate_blocked_tasks(
+    results: list[URLResult],
+    document_path: str,
+    output_path: Optional[Path] = None,
+) -> str:
+    """
+    Generate a task file for blocked URLs requiring manual verification.
+
+    Args:
+        results: List of URL check results
+        document_path: Path to the source document
+        output_path: Optional path to write task file
+
+    Returns:
+        Task file content as string
+    """
+    blocked = [r for r in results if r.status in (URLStatus.BLOCKED, URLStatus.BROKEN)]
+
+    if not blocked:
+        return ""
+
+    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+    content = f"""# Blocked Citation Verification Tasks
+
+**Source**: {document_path}
+**Generated**: {timestamp}
+**Total blocked URLs**: {len(blocked)}
+
+## URLs Requiring Manual Verification
+
+"""
+
+    for i, result in enumerate(blocked, 1):
+        status_label = "⚠️ Blocked" if result.status == URLStatus.BLOCKED else "❌ Broken"
+        reason = result.error or (f"HTTP {result.status_code}" if result.status_code else "Unknown")
+
+        content += f"""### {i}. {status_label}
+
+- **URL**: {result.url}
+- **Reason**: {reason}
+- [ ] Verify URL manually
+- [ ] Update document if URL is permanently unavailable
+
+"""
+
+    content += """---
+
+## Instructions
+
+1. Open each URL in a browser
+2. Verify if content is accessible
+3. If blocked by paywall/auth, note the access method needed
+4. If broken, find replacement URL or remove citation
+5. Update the source document accordingly
+"""
+
+    if output_path:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(content)
+
+    return content
+
+
+def verify_document(
+    document_path: Path,
+    output_tasks_path: Optional[Path] = None,
+    mark_inline: bool = True,
+    concurrency: int = 10,
+    timeout: int = 10,
+    cache_dir: Optional[Path] = None,
+) -> tuple[str, list[URLResult], str]:
+    """
+    Verify all citations in a document.
+
+    Args:
+        document_path: Path to the document to verify
+        output_tasks_path: Optional path for blocked URL task file
+        mark_inline: Whether to mark URLs inline in the document
+        concurrency: Maximum concurrent requests
+        timeout: Timeout per request
+        cache_dir: Optional cache directory
+
+    Returns:
+        Tuple of (marked_document, results, blocked_tasks)
+    """
+    with open(document_path, encoding="utf-8") as f:
+        document = f.read()
+
+    # Extract URLs
+    extracted = extract_urls(document)
+    urls = [e.url for e in extracted]
+
+    if not urls:
+        return document, [], ""
+
+    # Check URLs
+    results = check_urls(
+        urls,
+        concurrency=concurrency,
+        timeout=timeout,
+        cache_dir=cache_dir,
+    )
+
+    # Mark document if requested
+    marked_document = document
+    if mark_inline:
+        marked_document = mark_urls_inline(document, results)
+
+    # Generate blocked tasks
+    blocked_tasks = generate_blocked_tasks(
+        results,
+        str(document_path),
+        output_tasks_path,
+    )
+
+    return marked_document, results, blocked_tasks
+
+
+def print_verification_summary(results: list[URLResult]) -> None:
+    """Print a summary of verification results to stdout."""
+    available = sum(1 for r in results if r.status == URLStatus.AVAILABLE)
+    blocked = sum(1 for r in results if r.status == URLStatus.BLOCKED)
+    broken = sum(1 for r in results if r.status == URLStatus.BROKEN)
+    redirect = sum(1 for r in results if r.status == URLStatus.REDIRECT)
+
+    total = len(results)
+    print("\n📋 Citation Verification Summary")
+    print(f"   Total URLs checked: {total}")
+    print(f"   ✅ Available: {available}")
+    print(f"   🔄 Redirect: {redirect}")
+    print(f"   ⚠️ Blocked: {blocked}")
+    print(f"   ❌ Broken: {broken}")
+
+    if blocked + broken > 0:
+        print(f"\n   ⚠️ {blocked + broken} URLs need manual verification")