adversarial-workflow 0.6.6__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,643 @@
+ """
+ Citation verification utilities for checking URLs in documents.
+
+ This module provides:
+ - URL extraction from markdown documents
+ - Async parallel URL checking with caching
+ - Inline marking of URL status
+ - Blocked URL task file generation
+
+ Status categories:
+ - available: 200 OK, content accessible
+ - blocked: Paywall/auth/bot-blocked (401, 403, or bot detection)
+ - broken: 404, 500, timeout, DNS failure
+ - redirect: 301/302 with final destination noted
+ """
+
+ import asyncio
+ import hashlib
+ import json
+ import logging
+ import os
+ import re
+ import time
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from enum import Enum
+ from pathlib import Path
+ from typing import Optional
+
+ # Module logger for debugging URL check failures
+ logger = logging.getLogger(__name__)
+
+
+ class URLStatus(Enum):
+     """URL verification status categories."""
+
+     AVAILABLE = "available"
+     BLOCKED = "blocked"
+     BROKEN = "broken"
+     REDIRECT = "redirect"
+
+
+ @dataclass
+ class URLResult:
+     """Result of checking a single URL."""
+
+     url: str
+     status: URLStatus
+     status_code: Optional[int] = None
+     final_url: Optional[str] = None
+     error: Optional[str] = None
+     checked_at: Optional[float] = None
+
+     def to_dict(self) -> dict:
+         """Convert to dictionary for JSON serialization."""
+         return {
+             "url": self.url,
+             "status": self.status.value,
+             "status_code": self.status_code,
+             "final_url": self.final_url,
+             "error": self.error,
+             "checked_at": self.checked_at,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict) -> "URLResult":
+         """Create from dictionary."""
+         return cls(
+             url=data["url"],
+             status=URLStatus(data["status"]),
+             status_code=data.get("status_code"),
+             final_url=data.get("final_url"),
+             error=data.get("error"),
+             checked_at=data.get("checked_at"),
+         )
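A URLResult survives a plain dict round trip via to_dict()/from_dict(), which is what the on-disk cache further down relies on. A minimal sketch, assuming the classes above are in scope (the module's import path is not shown in this diff):

    # Round-trip a result through its dict form; dataclass equality compares all fields.
    r = URLResult(url="https://example.com", status=URLStatus.AVAILABLE, status_code=200)
    assert URLResult.from_dict(r.to_dict()) == r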
+
+
+ @dataclass
+ class ExtractedURL:
+     """A URL extracted from a document with context."""
+
+     url: str
+     position: int
+     context: str
+     line_number: int
+
+
+ # URL extraction pattern - matches http/https URLs
+ URL_PATTERN = re.compile(r"https?://[^\s\)\]\>\"\'\`]+")
+
+ # Bot detection patterns in response
+ BOT_DETECTION_PATTERNS = [
+     "captcha",
+     "cloudflare",
+     "access denied",
+     "forbidden",
+     "bot detected",
+     "please verify",
+     "human verification",
+ ]
+
+ # Default configuration
+ DEFAULT_CONFIG = {
+     "max_urls": 100,
+     "concurrency": 10,
+     "timeout_per_url": 10,
+     "cache_ttl": 86400,  # 24 hours
+ }
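The character class in URL_PATTERN stops a match at whitespace, closing brackets and parens, quotes, and backticks, so a markdown link such as [text](https://example.com/page) yields only the URL itself; trailing sentence punctuation is stripped later by extract_urls(). A rough illustration, assuming the names above are in scope:

    # The closing paren of a markdown link is excluded from the match.
    m = URL_PATTERN.search("See [the spec](https://example.com/page) for details.")
    print(m.group())  # https://example.com/page

    # A bare URL followed by a period keeps the period here (note the trailing '.');
    # extract_urls() removes it with rstrip(".,;:!?").
    print(URL_PATTERN.findall("Read https://example.org/x."))  # ['https://example.org/x.']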
+
+
+ def extract_urls(document: str, max_urls: int = 100) -> list[ExtractedURL]:
+     """
+     Extract URLs from a document with surrounding context.
+
+     Args:
+         document: The document text to extract URLs from
+         max_urls: Maximum number of URLs to extract (default: 100)
+
+     Returns:
+         List of ExtractedURL objects with position and context
+     """
+     urls = []
+     lines = document.split("\n")
+     line_starts = [0]
+     for line in lines:
+         line_starts.append(line_starts[-1] + len(line) + 1)
+
+     for match in URL_PATTERN.finditer(document):
+         url = match.group().rstrip(".,;:!?")  # Clean trailing punctuation
+         position = match.start()
+
+         # Find line number
+         line_number = 1
+         for i, start in enumerate(line_starts):
+             if start > position:
+                 line_number = i
+                 break
+
+         # Get context (50 chars before and after)
+         start = max(0, position - 50)
+         end = min(len(document), match.end() + 50)
+         context = document[start:end]
+
+         urls.append(
+             ExtractedURL(
+                 url=url,
+                 position=position,
+                 context=context,
+                 line_number=line_number,
+             )
+         )
+
+         if len(urls) >= max_urls:
+             break
+
+     return urls
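For instance, on a two-line markdown snippet, assuming the function above is in scope:

    doc = "Intro line.\nSee https://example.com/paper (accessed 2024)."
    found = extract_urls(doc)
    print(found[0].url)          # https://example.com/paper
    print(found[0].line_number)  # 2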
+
+
+ def get_cache_path(cache_dir: Optional[Path] = None) -> Path:
+     """Get the path to the URL cache file."""
+     if cache_dir is None:
+         cache_dir = Path.cwd() / ".adversarial"
+     cache_dir.mkdir(parents=True, exist_ok=True)
+     return cache_dir / "url_cache.json"
+
+
+ def load_cache(cache_path: Path) -> dict[str, dict]:
+     """Load URL cache from disk."""
+     if not cache_path.exists():
+         return {}
+     try:
+         with open(cache_path) as f:
+             return json.load(f)
+     except (json.JSONDecodeError, OSError):
+         return {}
+
+
+ def save_cache(cache_path: Path, cache: dict[str, dict]) -> None:
+     """Save URL cache to disk."""
+     with open(cache_path, "w") as f:
+         json.dump(cache, f, indent=2)
+
+
+ def get_cache_key(url: str) -> str:
+     """Generate a cache key for a URL."""
+     return hashlib.md5(url.encode()).hexdigest()
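Cache entries are keyed by the MD5 hex digest of the URL and stored as plain JSON, under .adversarial/url_cache.json in the working directory by default. A sketch of the round trip, assuming the helpers above are in scope (the /tmp path is illustrative):

    from pathlib import Path

    path = get_cache_path(Path("/tmp/demo-cache"))  # -> /tmp/demo-cache/url_cache.json
    cache = load_cache(path)                        # {} when no cache file exists yet
    key = get_cache_key("https://example.com")      # 32-char hex digest
    cache[key] = {"result": {"url": "https://example.com", "status": "available"}, "expires": 1700000000}
    save_cache(path, cache)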
+
+
+ def classify_response(status_code: int, _headers: dict, content: Optional[str] = None) -> URLStatus:
+     """
+     Classify HTTP response into a URL status.
+
+     Args:
+         status_code: HTTP status code
+         _headers: Response headers (reserved for future use)
+         content: Optional response body content (for bot detection)
+
+     Returns:
+         URLStatus enum value
+     """
+     if status_code == 200:
+         # Check for bot blocking in content
+         if content:
+             content_lower = content.lower()
+             for pattern in BOT_DETECTION_PATTERNS:
+                 if pattern in content_lower:
+                     return URLStatus.BLOCKED
+         return URLStatus.AVAILABLE
+     elif status_code in (301, 302, 307, 308):
+         return URLStatus.REDIRECT
+     elif status_code in (401, 403):
+         return URLStatus.BLOCKED
+     elif status_code == 429:
+         return URLStatus.BLOCKED  # Rate limited
+     else:
+         return URLStatus.BROKEN
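A few spot checks of the mapping, assuming classify_response() above is in scope (the headers argument is currently unused):

    print(classify_response(200, {}))                                         # URLStatus.AVAILABLE
    print(classify_response(200, {}, "Checking your browser - cloudflare"))   # URLStatus.BLOCKED
    print(classify_response(302, {}))                                         # URLStatus.REDIRECT
    print(classify_response(403, {}))                                         # URLStatus.BLOCKED
    print(classify_response(429, {}))                                         # URLStatus.BLOCKED (rate limited)
    print(classify_response(404, {}))                                         # URLStatus.BROKEN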
+
+
+ async def check_url_async(
+     url: str,
+     timeout: int = 10,
+     session=None,
+ ) -> URLResult:
+     """
+     Check a single URL asynchronously.
+
+     Args:
+         url: URL to check
+         timeout: Request timeout in seconds
+         session: Optional aiohttp session to reuse
+
+     Returns:
+         URLResult with status information
+     """
+     try:
+         import aiohttp
+     except ImportError:
+         return URLResult(
+             url=url,
+             status=URLStatus.BROKEN,
+             error="aiohttp not installed - run: pip install aiohttp",
+             checked_at=time.time(),
+         )
+
+     close_session = False
+     if session is None:
+         session = aiohttp.ClientSession()
+         close_session = True
+
+     try:
+         async with session.head(
+             url,
+             timeout=aiohttp.ClientTimeout(total=timeout),
+             allow_redirects=True,
+             headers={"User-Agent": "Mozilla/5.0 (compatible; CitationVerifier/1.0)"},
+         ) as response:
+             final_url = str(response.url) if str(response.url) != url else None
+             status = classify_response(response.status, dict(response.headers))
+
+             # If redirect to an available page, mark as redirect (informational)
+             # Keep broken/blocked status if redirect leads to error page
+             if final_url and response.history and status == URLStatus.AVAILABLE:
+                 status = URLStatus.REDIRECT
+
+             return URLResult(
+                 url=url,
+                 status=status,
+                 status_code=response.status,
+                 final_url=final_url,
+                 checked_at=time.time(),
+             )
+     except asyncio.TimeoutError:
+         return URLResult(
+             url=url,
+             status=URLStatus.BROKEN,
+             error="Timeout",
+             checked_at=time.time(),
+         )
+     except Exception as e:
+         error_name = type(e).__name__
+         # Log full exception for debugging while returning truncated message
+         logger.debug("URL check failed for %s: %s", url, e, exc_info=True)
+         return URLResult(
+             url=url,
+             status=URLStatus.BROKEN,
+             error=f"{error_name}: {str(e)[:50]}",
+             checked_at=time.time(),
+         )
+     finally:
+         if close_session:
+             await session.close()
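A single ad-hoc check from synchronous code, assuming the coroutine above is in scope and aiohttp is installed (when it is not, the checker degrades to a BROKEN result carrying an explanatory error rather than raising):

    import asyncio

    result = asyncio.run(check_url_async("https://example.com", timeout=5))
    print(result.status, result.status_code)  # e.g. URLStatus.AVAILABLE 200, network permitting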
+
+
+ async def check_urls_parallel(
+     urls: list[str],
+     concurrency: int = 10,
+     timeout: int = 10,
+     cache: Optional[dict] = None,
+     cache_ttl: int = 86400,
+ ) -> list[URLResult]:
+     """
+     Check multiple URLs in parallel with optional caching.
+
+     Args:
+         urls: List of URLs to check
+         concurrency: Maximum concurrent requests (must be >= 1)
+         timeout: Timeout per request in seconds (must be >= 1)
+         cache: Optional cache dictionary
+         cache_ttl: Cache TTL in seconds (default: 24 hours)
+
+     Returns:
+         List of URLResult objects
+
+     Raises:
+         ValueError: If concurrency or timeout is less than 1
+     """
+     # Validate parameters to prevent deadlocks
+     if concurrency < 1:
+         raise ValueError(f"concurrency must be >= 1, got {concurrency}")
+     if timeout < 1:
+         raise ValueError(f"timeout must be >= 1, got {timeout}")
+
+     try:
+         import aiohttp
+     except ImportError:
+         return [
+             URLResult(
+                 url=url,
+                 status=URLStatus.BROKEN,
+                 error="aiohttp not installed",
+                 checked_at=time.time(),
+             )
+             for url in urls
+         ]
+
+     url_to_result: dict[str, URLResult] = {}
+     urls_to_check = []
+     current_time = time.time()
+
+     # Check cache first
+     if cache is not None:
+         for url in urls:
+             cache_key = get_cache_key(url)
+             if cache_key in cache:
+                 cached = cache[cache_key]
+                 if cached.get("expires", 0) > current_time:
+                     url_to_result[url] = URLResult.from_dict(cached["result"])
+                     continue
+             urls_to_check.append(url)
+     else:
+         urls_to_check = list(urls)
+
+     if urls_to_check:
+         # Create semaphore for concurrency limiting
+         semaphore = asyncio.Semaphore(concurrency)
+
+         async def check_with_semaphore(session, url):
+             async with semaphore:
+                 return await check_url_async(url, timeout, session)
+
+         # Check remaining URLs
+         connector = aiohttp.TCPConnector(limit=concurrency, limit_per_host=5)
+         async with aiohttp.ClientSession(connector=connector) as session:
+             tasks = [check_with_semaphore(session, url) for url in urls_to_check]
+             checked_results = await asyncio.gather(*tasks)
+
+         # Update cache and store results
+         for result in checked_results:
+             if cache is not None:
+                 cache_key = get_cache_key(result.url)
+                 cache[cache_key] = {
+                     "result": result.to_dict(),
+                     "expires": current_time + cache_ttl,
+                 }
+             url_to_result[result.url] = result
+
+     # Return results in original URL order
+     return [url_to_result[url] for url in urls]
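From async code the parallel checker is awaited directly; passing a dict as cache lets repeated runs within the TTL skip the network. A sketch under those assumptions (URLs are illustrative):

    import asyncio

    async def main():
        cache: dict = {}
        urls = ["https://example.com", "https://example.org/missing"]
        results = await check_urls_parallel(urls, concurrency=5, timeout=5, cache=cache)
        for r in results:
            print(r.url, r.status.value, r.status_code)
        # A second call within the TTL is served from the in-memory cache (no new requests).
        await check_urls_parallel(urls, cache=cache)

    asyncio.run(main())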
+
+
+ def check_urls(
+     urls: list[str],
+     concurrency: int = 10,
+     timeout: int = 10,
+     cache_dir: Optional[Path] = None,
+     cache_ttl: int = 86400,
+ ) -> list[URLResult]:
+     """
+     Check multiple URLs synchronously (wrapper around async version).
+
+     Args:
+         urls: List of URLs to check
+         concurrency: Maximum concurrent requests
+         timeout: Timeout per request in seconds
+         cache_dir: Optional cache directory
+         cache_ttl: Cache TTL in seconds
+
+     Returns:
+         List of URLResult objects
+
+     Raises:
+         RuntimeError: If called from within an async context (event loop running).
+             Use check_urls_parallel() directly from async code.
+     """
+     # Guard against calling from async context
+     try:
+         asyncio.get_running_loop()
+         raise RuntimeError(
+             "check_urls() cannot be called from within an async context. "
+             "Use check_urls_parallel() directly instead."
+         )
+     except RuntimeError as e:
+         # No running loop - this is expected, proceed
+         if "no running event loop" not in str(e).lower():
+             raise
+
+     # Load cache
+     cache_path = get_cache_path(cache_dir)
+     cache = load_cache(cache_path)
+
+     # Run async check
+     results = asyncio.run(
+         check_urls_parallel(
+             urls,
+             concurrency=concurrency,
+             timeout=timeout,
+             cache=cache,
+             cache_ttl=cache_ttl,
+         )
+     )
+
+     # Save cache
+     save_cache(cache_path, cache)
+
+     return results
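From ordinary synchronous code the wrapper manages the event loop and the on-disk cache, and results come back in the same order as the input list. For example, assuming the functions above are in scope:

    urls = ["https://example.com", "https://example.org/404"]
    results = check_urls(urls, concurrency=5, timeout=5)
    for r in results:
        print(f"{r.url}: {r.status.value}")
    # Calling check_urls() from inside an async function raises RuntimeError;
    # await check_urls_parallel() there instead.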
+
+
+ def get_status_badge(result: URLResult) -> str:
+     """
+     Generate an inline status badge for a URL result.
+
+     Args:
+         result: URLResult to generate badge for
+
+     Returns:
+         Markdown-formatted status badge
+     """
+     if result.status == URLStatus.AVAILABLE:
+         return f"[✅ Verified | {result.status_code} OK]"
+     elif result.status == URLStatus.BLOCKED:
+         if result.status_code:
+             return f"[⚠️ Blocked | {result.status_code}]"
+         return "[⚠️ Blocked | Access Denied]"
+     elif result.status == URLStatus.BROKEN:
+         if result.error:
+             return f"[❌ Broken | {result.error}]"
+         if result.status_code:
+             return f"[❌ Broken | {result.status_code}]"
+         return "[❌ Broken | Unreachable]"
+     elif result.status == URLStatus.REDIRECT:
+         dest = (
+             result.final_url[:30] + "..."
+             if result.final_url and len(result.final_url) > 30
+             else result.final_url
+         )
+         return f"[🔄 Redirect | → {dest}]"
+     return "[❓ Unknown]"
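Sample badge strings for each status, assuming the types above are in scope (URLs are illustrative):

    print(get_status_badge(URLResult("https://a.example", URLStatus.AVAILABLE, status_code=200)))
    # [✅ Verified | 200 OK]
    print(get_status_badge(URLResult("https://b.example", URLStatus.BLOCKED, status_code=403)))
    # [⚠️ Blocked | 403]
    print(get_status_badge(URLResult("https://c.example", URLStatus.BROKEN, error="Timeout")))
    # [❌ Broken | Timeout]
    print(get_status_badge(URLResult("https://d.example", URLStatus.REDIRECT, final_url="https://d.example/new")))
    # [🔄 Redirect | → https://d.example/new]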
+
+
+ def mark_urls_inline(document: str, results: list[URLResult]) -> str:
+     """
+     Mark URLs in a document with their status badges.
+
+     Args:
+         document: Original document text
+         results: List of URL check results
+
+     Returns:
+         Document with inline status badges added after URLs
+     """
+     # Create URL to result mapping
+     url_results = {r.url: r for r in results}
+
+     # Find all URLs and their positions
+     marked = document
+     offset = 0  # Track offset as we insert badges
+
+     for match in URL_PATTERN.finditer(document):
+         url = match.group().rstrip(".,;:!?")  # Same stripping as extract_urls
+         if url in url_results:
+             result = url_results[url]
+             badge = get_status_badge(result)
+
+             # Check if badge already exists after this URL
+             end_pos = match.end() + offset
+             remaining = marked[end_pos:]
+             if remaining.startswith((" [✅", " [⚠️", " [❌", " [🔄")):
+                 continue  # Already marked
+
+             # Insert badge after URL
+             insert_pos = end_pos
+             marked = marked[:insert_pos] + " " + badge + marked[insert_pos:]
+             offset += len(badge) + 1
+
+     return marked
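Marking is purely textual: each checked URL gets a badge appended after it, and re-running on an already-marked document is a no-op for URLs whose badge is still present. A small sketch, assuming the functions above are in scope:

    doc = "Source: https://example.com/report"
    results = [URLResult(url="https://example.com/report", status=URLStatus.AVAILABLE, status_code=200)]
    print(mark_urls_inline(doc, results))
    # Source: https://example.com/report [✅ Verified | 200 OK]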
+
+
+ def generate_blocked_tasks(
+     results: list[URLResult],
+     document_path: str,
+     output_path: Optional[Path] = None,
+ ) -> str:
+     """
+     Generate a task file for blocked URLs requiring manual verification.
+
+     Args:
+         results: List of URL check results
+         document_path: Path to the source document
+         output_path: Optional path to write task file
+
+     Returns:
+         Task file content as string
+     """
+     blocked = [r for r in results if r.status in (URLStatus.BLOCKED, URLStatus.BROKEN)]
+
+     if not blocked:
+         return ""
+
+     timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+     content = f"""# Blocked Citation Verification Tasks
+
+ **Source**: {document_path}
+ **Generated**: {timestamp}
+ **Total blocked URLs**: {len(blocked)}
+
+ ## URLs Requiring Manual Verification
+
+ """
+
+     for i, result in enumerate(blocked, 1):
+         status_label = "⚠️ Blocked" if result.status == URLStatus.BLOCKED else "❌ Broken"
+         reason = result.error or (f"HTTP {result.status_code}" if result.status_code else "Unknown")
+
+         content += f"""### {i}. {status_label}
+
+ - **URL**: {result.url}
+ - **Reason**: {reason}
+ - [ ] Verify URL manually
+ - [ ] Update document if URL is permanently unavailable
+
+ """
+
+     content += """---
+
+ ## Instructions
+
+ 1. Open each URL in a browser
+ 2. Verify if content is accessible
+ 3. If blocked by paywall/auth, note the access method needed
+ 4. If broken, find replacement URL or remove citation
+ 5. Update the source document accordingly
+ """
+
+     if output_path:
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         with open(output_path, "w", encoding="utf-8") as f:
+             f.write(content)
+
+     return content
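Only BLOCKED and BROKEN results make it into the task file; passing output_path additionally writes it to disk. A sketch, assuming the functions above are in scope (file names are illustrative):

    from pathlib import Path

    results = [
        URLResult(url="https://example.com/ok", status=URLStatus.AVAILABLE, status_code=200),
        URLResult(url="https://example.com/paywalled", status=URLStatus.BLOCKED, status_code=403),
    ]
    tasks_md = generate_blocked_tasks(results, "report.md", Path("tasks/blocked_urls.md"))
    print(tasks_md.splitlines()[0])  # "# Blocked Citation Verification Tasks"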
+
+
+ def verify_document(
+     document_path: Path,
+     output_tasks_path: Optional[Path] = None,
+     mark_inline: bool = True,
+     concurrency: int = 10,
+     timeout: int = 10,
+     cache_dir: Optional[Path] = None,
+ ) -> tuple[str, list[URLResult], str]:
+     """
+     Verify all citations in a document.
+
+     Args:
+         document_path: Path to the document to verify
+         output_tasks_path: Optional path for blocked URL task file
+         mark_inline: Whether to mark URLs inline in the document
+         concurrency: Maximum concurrent requests
+         timeout: Timeout per request
+         cache_dir: Optional cache directory
+
+     Returns:
+         Tuple of (marked_document, results, blocked_tasks)
+     """
+     with open(document_path, encoding="utf-8") as f:
+         document = f.read()
+
+     # Extract URLs
+     extracted = extract_urls(document)
+     urls = [e.url for e in extracted]
+
+     if not urls:
+         return document, [], ""
+
+     # Check URLs
+     results = check_urls(
+         urls,
+         concurrency=concurrency,
+         timeout=timeout,
+         cache_dir=cache_dir,
+     )
+
+     # Mark document if requested
+     marked_document = document
+     if mark_inline:
+         marked_document = mark_urls_inline(document, results)
+
+     # Generate blocked tasks
+     blocked_tasks = generate_blocked_tasks(
+         results,
+         str(document_path),
+         output_tasks_path,
+     )
+
+     return marked_document, results, blocked_tasks
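End to end, the three return values are the annotated text, the raw results, and the blocked-URL task list; nothing is written back to the source document itself. A sketch, assuming the functions above are in scope (file names are illustrative):

    from pathlib import Path

    marked, results, blocked_tasks = verify_document(
        Path("docs/evaluation.md"),
        output_tasks_path=Path(".adversarial/blocked_urls.md"),
        concurrency=5,
        timeout=5,
    )
    # Persist the annotated copy alongside the original.
    Path("docs/evaluation.verified.md").write_text(marked, encoding="utf-8")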
+
+
+ def print_verification_summary(results: list[URLResult]) -> None:
+     """Print a summary of verification results to stdout."""
+     available = sum(1 for r in results if r.status == URLStatus.AVAILABLE)
+     blocked = sum(1 for r in results if r.status == URLStatus.BLOCKED)
+     broken = sum(1 for r in results if r.status == URLStatus.BROKEN)
+     redirect = sum(1 for r in results if r.status == URLStatus.REDIRECT)
+
+     total = len(results)
+     print("\n📋 Citation Verification Summary")
+     print(f" Total URLs checked: {total}")
+     print(f" ✅ Available: {available}")
+     print(f" 🔄 Redirect: {redirect}")
+     print(f" ⚠️ Blocked: {blocked}")
+     print(f" ❌ Broken: {broken}")
+
+     if blocked + broken > 0:
+         print(f"\n ⚠️ {blocked + broken} URLs need manual verification")
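Combined with the checker, the summary printer gives a quick per-status count at the end of a run; a minimal sketch, assuming the functions above are in scope (URLs are illustrative):

    results = check_urls(["https://example.com", "https://example.org/404"])
    print_verification_summary(results)  # prints per-status counts and flags URLs needing manual review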