adversarial-workflow 0.6.6-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff compares the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
@@ -12,7 +12,7 @@ Usage:
     adversarial validate "pytest"
 """
 
-__version__ = "0.6.6"
+__version__ = "0.7.0"
 __author__ = "Fredrik Matheson"
 __license__ = "MIT"
 
@@ -13,6 +13,7 @@ Commands:
     review - Run Phase 3: Code review
     validate - Run Phase 4: Test validation
     split - Split large task files into smaller evaluable chunks
+    check-citations - Verify URLs in documents before evaluation
 """
 
 import argparse
@@ -29,7 +30,7 @@ from typing import Dict, List, Optional, Tuple
 import yaml
 from dotenv import dotenv_values, load_dotenv
 
-__version__ = "0.6.6"
+__version__ = "0.7.0"
 
 # ANSI color codes for better output
 RESET = "\033[0m"
@@ -2819,6 +2820,106 @@ def list_evaluators() -> int:
     return 0
 
 
+def check_citations(
+    file_path: str,
+    output_tasks: Optional[str] = None,
+    mark_inline: bool = False,
+    concurrency: int = 10,
+    timeout: int = 10,
+) -> int:
+    """
+    Check citations (URLs) in a document.
+
+    Args:
+        file_path: Path to document to check
+        output_tasks: Optional path to write blocked URL tasks
+        mark_inline: Whether to mark URLs inline with status badges
+        concurrency: Maximum concurrent URL checks
+        timeout: Timeout per URL in seconds
+
+    Returns:
+        0 on success, 1 on error
+    """
+    from adversarial_workflow.utils.citations import (
+        URLStatus,
+        check_urls,
+        extract_urls,
+        generate_blocked_tasks,
+        mark_urls_inline,
+        print_verification_summary,
+    )
+
+    # Check file exists
+    if not os.path.exists(file_path):
+        print(f"{RED}Error: File not found: {file_path}{RESET}")
+        return 1
+
+    # Validate parameters
+    if concurrency < 1:
+        print(f"{RED}Error: Concurrency must be at least 1, got {concurrency}{RESET}")
+        return 1
+    if timeout < 1:
+        print(f"{RED}Error: Timeout must be at least 1 second, got {timeout}{RESET}")
+        return 1
+
+    print(f"🔗 Checking citations in: {file_path}")
+    print()
+
+    # Read document
+    with open(file_path, encoding="utf-8") as f:
+        document = f.read()
+
+    # Extract URLs
+    extracted = extract_urls(document)
+    urls = [e.url for e in extracted]
+
+    if not urls:
+        print(f"{YELLOW}No URLs found in document.{RESET}")
+        return 0
+
+    print(f" Found {len(urls)} URLs to check")
+    print(f" Checking with concurrency={concurrency}, timeout={timeout}s...")
+    print()
+
+    # Check URLs
+    results = check_urls(
+        urls,
+        concurrency=concurrency,
+        timeout=timeout,
+    )
+
+    # Print summary
+    print_verification_summary(results)
+
+    # Count blocked/broken
+    blocked_count = sum(1 for r in results if r.status in (URLStatus.BLOCKED, URLStatus.BROKEN))
+
+    # Mark document inline if requested
+    if mark_inline and results:
+        marked_document = mark_urls_inline(document, results)
+        if marked_document != document:
+            with open(file_path, "w", encoding="utf-8") as f:
+                f.write(marked_document)
+            print("\n ✅ Updated document with status badges")
+
+    # Generate blocked tasks if requested or if there are blocked URLs
+    if blocked_count > 0:
+        if output_tasks:
+            output_path = Path(output_tasks)
+        else:
+            # Default to .adversarial/blocked-citations/
+            output_dir = Path.cwd() / ".adversarial" / "blocked-citations"
+            output_dir.mkdir(parents=True, exist_ok=True)
+            base_name = Path(file_path).stem
+            output_path = output_dir / f"{base_name}-blocked-urls.md"
+
+        task_content = generate_blocked_tasks(results, file_path, output_path)
+        if task_content:
+            print(f" 📋 Blocked URL tasks: {output_path}")
+
+    return 0
+
+
 def main():
     """Main CLI entry point."""
     import logging
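
The helper above is what the new subcommand ultimately executes. A minimal sketch of driving it directly from Python rather than through the CLI (assuming adversarial-workflow 0.7.0 is installed; the document path is a placeholder):

    # Sketch: call the new CLI helper directly (normally reached via
    # `adversarial check-citations <file>`). "docs/report.md" is hypothetical.
    from adversarial_workflow.cli import check_citations

    exit_code = check_citations(
        "docs/report.md",
        output_tasks=None,    # default: .adversarial/blocked-citations/<stem>-blocked-urls.md
        mark_inline=False,    # leave the source document untouched
        concurrency=10,       # parallel URL checks
        timeout=10,           # seconds per URL
    )
    # 0 on success (even when some URLs are blocked), 1 for a missing file or bad parameters.
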
@@ -2862,6 +2963,7 @@ def main():
         "validate",
         "review",
         "list-evaluators",
+        "check-citations",
     }
 
     parser = argparse.ArgumentParser(
@@ -2879,6 +2981,7 @@ Examples:
   adversarial review                    # Review implementation
   adversarial validate "npm test"       # Validate with tests
   adversarial split large-task.md       # Split large files
+  adversarial check-citations doc.md    # Verify URLs in document
 
 For more information: https://github.com/movito/adversarial-workflow
 """,
@@ -2961,6 +3064,38 @@ For more information: https://github.com/movito/adversarial-workflow
         help="List all available evaluators (built-in and local)",
     )
 
+    # check-citations command
+    citations_parser = subparsers.add_parser(
+        "check-citations",
+        help="Verify URLs in a document before evaluation",
+    )
+    citations_parser.add_argument("file", help="Document to check citations in")
+    citations_parser.add_argument(
+        "--output-tasks",
+        "-o",
+        help="Output file for blocked URL tasks (markdown)",
+    )
+    citations_parser.add_argument(
+        "--mark-inline",
+        action="store_true",
+        default=False,
+        help="Mark URLs inline with status badges (modifies document)",
+    )
+    citations_parser.add_argument(
+        "--concurrency",
+        "-c",
+        type=int,
+        default=10,
+        help="Maximum concurrent URL checks (default: 10)",
+    )
+    citations_parser.add_argument(
+        "--timeout",
+        "-t",
+        type=int,
+        default=10,
+        help="Timeout per URL in seconds (default: 10)",
+    )
+
     # Dynamic evaluator registration
     try:
         evaluators = get_all_evaluators()
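
Taken together, the parser definitions above give the subcommand the following surface. Illustrative invocations (file names are placeholders; flags are exactly as registered above):

    adversarial check-citations ARCHITECTURE.md
    adversarial check-citations ARCHITECTURE.md --output-tasks blocked-urls.md
    adversarial check-citations ARCHITECTURE.md --mark-inline -c 5 -t 15
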
@@ -3009,6 +3144,11 @@ For more information: https://github.com/movito/adversarial-workflow
                 default=None,
                 help="Timeout in seconds (default: from evaluator config or 180, max: 600)",
             )
+            eval_parser.add_argument(
+                "--check-citations",
+                action="store_true",
+                help="Verify URLs in document before evaluation",
+            )
             # Store config for later execution
             eval_parser.set_defaults(evaluator_config=config)
 
@@ -3044,6 +3184,16 @@ For more information: https://github.com/movito/adversarial-workflow
         # Log actual timeout and source
         print(f"Using timeout: {timeout}s ({source})")
 
+        # Check citations first if requested (read-only, doesn't modify file)
+        if getattr(args, "check_citations", False):
+            print()
+            result = check_citations(args.file, mark_inline=False)
+            if result != 0:
+                print(
+                    f"{YELLOW}Warning: Citation check had issues, continuing with evaluation...{RESET}"
+                )
+            print()
+
         return run_evaluator(
             args.evaluator_config,
             args.file,
@@ -3083,6 +3233,14 @@ For more information: https://github.com/movito/adversarial-workflow
         )
     elif args.command == "list-evaluators":
         return list_evaluators()
+    elif args.command == "check-citations":
+        return check_citations(
+            args.file,
+            output_tasks=args.output_tasks,
+            mark_inline=args.mark_inline,
+            concurrency=args.concurrency,
+            timeout=args.timeout,
+        )
     else:
         parser.print_help()
         return 1
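
With this dispatch in place, citation checking runs either standalone or as a pre-flight step before an evaluator (evaluator names are registered dynamically, so the second line is schematic rather than a literal command):

    adversarial check-citations plan.md                 # standalone check
    adversarial <evaluator> plan.md --check-citations   # check URLs, then evaluate
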
@@ -0,0 +1,643 @@
+"""
+Citation verification utilities for checking URLs in documents.
+
+This module provides:
+- URL extraction from markdown documents
+- Async parallel URL checking with caching
+- Inline marking of URL status
+- Blocked URL task file generation
+
+Status categories:
+- available: 200 OK, content accessible
+- blocked: Paywall/auth/bot-blocked (401, 403, or bot detection)
+- broken: 404, 500, timeout, DNS failure
+- redirect: 301/302 with final destination noted
+"""
+
+import asyncio
+import hashlib
+import json
+import logging
+import os
+import re
+import time
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+# Module logger for debugging URL check failures
+logger = logging.getLogger(__name__)
+
+
+class URLStatus(Enum):
+    """URL verification status categories."""
+
+    AVAILABLE = "available"
+    BLOCKED = "blocked"
+    BROKEN = "broken"
+    REDIRECT = "redirect"
+
+
+@dataclass
+class URLResult:
+    """Result of checking a single URL."""
+
+    url: str
+    status: URLStatus
+    status_code: Optional[int] = None
+    final_url: Optional[str] = None
+    error: Optional[str] = None
+    checked_at: Optional[float] = None
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "url": self.url,
+            "status": self.status.value,
+            "status_code": self.status_code,
+            "final_url": self.final_url,
+            "error": self.error,
+            "checked_at": self.checked_at,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "URLResult":
+        """Create from dictionary."""
+        return cls(
+            url=data["url"],
+            status=URLStatus(data["status"]),
+            status_code=data.get("status_code"),
+            final_url=data.get("final_url"),
+            error=data.get("error"),
+            checked_at=data.get("checked_at"),
+        )
+
+
+@dataclass
+class ExtractedURL:
+    """A URL extracted from a document with context."""
+
+    url: str
+    position: int
+    context: str
+    line_number: int
+
+
+# URL extraction pattern - matches http/https URLs
+URL_PATTERN = re.compile(r"https?://[^\s\)\]\>\"\'\`]+")
+
+# Bot detection patterns in response
+BOT_DETECTION_PATTERNS = [
+    "captcha",
+    "cloudflare",
+    "access denied",
+    "forbidden",
+    "bot detected",
+    "please verify",
+    "human verification",
+]
+
+# Default configuration
+DEFAULT_CONFIG = {
+    "max_urls": 100,
+    "concurrency": 10,
+    "timeout_per_url": 10,
+    "cache_ttl": 86400,  # 24 hours
+}
+
+
+def extract_urls(document: str, max_urls: int = 100) -> list[ExtractedURL]:
+    """
+    Extract URLs from a document with surrounding context.
+
+    Args:
+        document: The document text to extract URLs from
+        max_urls: Maximum number of URLs to extract (default: 100)
+
+    Returns:
+        List of ExtractedURL objects with position and context
+    """
+    urls = []
+    lines = document.split("\n")
+    line_starts = [0]
+    for line in lines:
+        line_starts.append(line_starts[-1] + len(line) + 1)
+
+    for match in URL_PATTERN.finditer(document):
+        url = match.group().rstrip(".,;:!?")  # Clean trailing punctuation
+        position = match.start()
+
+        # Find line number
+        line_number = 1
+        for i, start in enumerate(line_starts):
+            if start > position:
+                line_number = i
+                break
+
+        # Get context (50 chars before and after)
+        start = max(0, position - 50)
+        end = min(len(document), match.end() + 50)
+        context = document[start:end]
+
+        urls.append(
+            ExtractedURL(
+                url=url,
+                position=position,
+                context=context,
+                line_number=line_number,
+            )
+        )
+
+        if len(urls) >= max_urls:
+            break
+
+    return urls
+
+
+def get_cache_path(cache_dir: Optional[Path] = None) -> Path:
+    """Get the path to the URL cache file."""
+    if cache_dir is None:
+        cache_dir = Path.cwd() / ".adversarial"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir / "url_cache.json"
+
+
+def load_cache(cache_path: Path) -> dict[str, dict]:
+    """Load URL cache from disk."""
+    if not cache_path.exists():
+        return {}
+    try:
+        with open(cache_path) as f:
+            return json.load(f)
+    except (json.JSONDecodeError, OSError):
+        return {}
+
+
+def save_cache(cache_path: Path, cache: dict[str, dict]) -> None:
+    """Save URL cache to disk."""
+    with open(cache_path, "w") as f:
+        json.dump(cache, f, indent=2)
+
+
+def get_cache_key(url: str) -> str:
+    """Generate a cache key for a URL."""
+    return hashlib.md5(url.encode()).hexdigest()
+
+
+def classify_response(status_code: int, _headers: dict, content: Optional[str] = None) -> URLStatus:
+    """
+    Classify HTTP response into a URL status.
+
+    Args:
+        status_code: HTTP status code
+        _headers: Response headers (reserved for future use)
+        content: Optional response body content (for bot detection)
+
+    Returns:
+        URLStatus enum value
+    """
+    if status_code == 200:
+        # Check for bot blocking in content
+        if content:
+            content_lower = content.lower()
+            for pattern in BOT_DETECTION_PATTERNS:
+                if pattern in content_lower:
+                    return URLStatus.BLOCKED
+        return URLStatus.AVAILABLE
+    elif status_code in (301, 302, 307, 308):
+        return URLStatus.REDIRECT
+    elif status_code in (401, 403):
+        return URLStatus.BLOCKED
+    elif status_code == 429:
+        return URLStatus.BLOCKED  # Rate limited
+    else:
+        return URLStatus.BROKEN
+
+
+async def check_url_async(
+    url: str,
+    timeout: int = 10,
+    session=None,
+) -> URLResult:
+    """
+    Check a single URL asynchronously.
+
+    Args:
+        url: URL to check
+        timeout: Request timeout in seconds
+        session: Optional aiohttp session to reuse
+
+    Returns:
+        URLResult with status information
+    """
+    try:
+        import aiohttp
+    except ImportError:
+        return URLResult(
+            url=url,
+            status=URLStatus.BROKEN,
+            error="aiohttp not installed - run: pip install aiohttp",
+            checked_at=time.time(),
+        )
+
+    close_session = False
+    if session is None:
+        session = aiohttp.ClientSession()
+        close_session = True
+
+    try:
+        async with session.head(
+            url,
+            timeout=aiohttp.ClientTimeout(total=timeout),
+            allow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 (compatible; CitationVerifier/1.0)"},
+        ) as response:
+            final_url = str(response.url) if str(response.url) != url else None
+            status = classify_response(response.status, dict(response.headers))
+
+            # If redirect to an available page, mark as redirect (informational)
+            # Keep broken/blocked status if redirect leads to error page
+            if final_url and response.history and status == URLStatus.AVAILABLE:
+                status = URLStatus.REDIRECT
+
+            return URLResult(
+                url=url,
+                status=status,
+                status_code=response.status,
+                final_url=final_url,
+                checked_at=time.time(),
+            )
+    except asyncio.TimeoutError:
+        return URLResult(
+            url=url,
+            status=URLStatus.BROKEN,
+            error="Timeout",
+            checked_at=time.time(),
+        )
+    except Exception as e:
+        error_name = type(e).__name__
+        # Log full exception for debugging while returning truncated message
+        logger.debug("URL check failed for %s: %s", url, e, exc_info=True)
+        return URLResult(
+            url=url,
+            status=URLStatus.BROKEN,
+            error=f"{error_name}: {str(e)[:50]}",
+            checked_at=time.time(),
+        )
+    finally:
+        if close_session:
+            await session.close()
+
+
+async def check_urls_parallel(
+    urls: list[str],
+    concurrency: int = 10,
+    timeout: int = 10,
+    cache: Optional[dict] = None,
+    cache_ttl: int = 86400,
+) -> list[URLResult]:
+    """
+    Check multiple URLs in parallel with optional caching.
+
+    Args:
+        urls: List of URLs to check
+        concurrency: Maximum concurrent requests (must be >= 1)
+        timeout: Timeout per request in seconds (must be >= 1)
+        cache: Optional cache dictionary
+        cache_ttl: Cache TTL in seconds (default: 24 hours)
+
+    Returns:
+        List of URLResult objects
+
+    Raises:
+        ValueError: If concurrency or timeout is less than 1
+    """
+    # Validate parameters to prevent deadlocks
+    if concurrency < 1:
+        raise ValueError(f"concurrency must be >= 1, got {concurrency}")
+    if timeout < 1:
+        raise ValueError(f"timeout must be >= 1, got {timeout}")
+
+    try:
+        import aiohttp
+    except ImportError:
+        return [
+            URLResult(
+                url=url,
+                status=URLStatus.BROKEN,
+                error="aiohttp not installed",
+                checked_at=time.time(),
+            )
+            for url in urls
+        ]
+
+    url_to_result: dict[str, URLResult] = {}
+    urls_to_check = []
+    current_time = time.time()
+
+    # Check cache first
+    if cache is not None:
+        for url in urls:
+            cache_key = get_cache_key(url)
+            if cache_key in cache:
+                cached = cache[cache_key]
+                if cached.get("expires", 0) > current_time:
+                    url_to_result[url] = URLResult.from_dict(cached["result"])
+                    continue
+            urls_to_check.append(url)
+    else:
+        urls_to_check = list(urls)
+
+    if urls_to_check:
+        # Create semaphore for concurrency limiting
+        semaphore = asyncio.Semaphore(concurrency)
+
+        async def check_with_semaphore(session, url):
+            async with semaphore:
+                return await check_url_async(url, timeout, session)
+
+        # Check remaining URLs
+        connector = aiohttp.TCPConnector(limit=concurrency, limit_per_host=5)
+        async with aiohttp.ClientSession(connector=connector) as session:
+            tasks = [check_with_semaphore(session, url) for url in urls_to_check]
+            checked_results = await asyncio.gather(*tasks)
+
+        # Update cache and store results
+        for result in checked_results:
+            if cache is not None:
+                cache_key = get_cache_key(result.url)
+                cache[cache_key] = {
+                    "result": result.to_dict(),
+                    "expires": current_time + cache_ttl,
+                }
+            url_to_result[result.url] = result
+
+    # Return results in original URL order
+    return [url_to_result[url] for url in urls]
+
+
+def check_urls(
+    urls: list[str],
+    concurrency: int = 10,
+    timeout: int = 10,
+    cache_dir: Optional[Path] = None,
+    cache_ttl: int = 86400,
+) -> list[URLResult]:
+    """
+    Check multiple URLs synchronously (wrapper around async version).
+
+    Args:
+        urls: List of URLs to check
+        concurrency: Maximum concurrent requests
+        timeout: Timeout per request in seconds
+        cache_dir: Optional cache directory
+        cache_ttl: Cache TTL in seconds
+
+    Returns:
+        List of URLResult objects
+
+    Raises:
+        RuntimeError: If called from within an async context (event loop running).
+            Use check_urls_parallel() directly from async code.
+    """
+    # Guard against calling from async context
+    try:
+        asyncio.get_running_loop()
+        raise RuntimeError(
+            "check_urls() cannot be called from within an async context. "
+            "Use check_urls_parallel() directly instead."
+        )
+    except RuntimeError as e:
+        # No running loop - this is expected, proceed
+        if "no running event loop" not in str(e).lower():
+            raise
+
+    # Load cache
+    cache_path = get_cache_path(cache_dir)
+    cache = load_cache(cache_path)
+
+    # Run async check
+    results = asyncio.run(
+        check_urls_parallel(
+            urls,
+            concurrency=concurrency,
+            timeout=timeout,
+            cache=cache,
+            cache_ttl=cache_ttl,
+        )
+    )
+
+    # Save cache
+    save_cache(cache_path, cache)
+
+    return results
+
+
+def get_status_badge(result: URLResult) -> str:
+    """
+    Generate an inline status badge for a URL result.
+
+    Args:
+        result: URLResult to generate badge for
+
+    Returns:
+        Markdown-formatted status badge
+    """
+    if result.status == URLStatus.AVAILABLE:
+        return f"[✅ Verified | {result.status_code} OK]"
+    elif result.status == URLStatus.BLOCKED:
+        if result.status_code:
+            return f"[⚠️ Blocked | {result.status_code}]"
+        return "[⚠️ Blocked | Access Denied]"
+    elif result.status == URLStatus.BROKEN:
+        if result.error:
+            return f"[❌ Broken | {result.error}]"
+        if result.status_code:
+            return f"[❌ Broken | {result.status_code}]"
+        return "[❌ Broken | Unreachable]"
+    elif result.status == URLStatus.REDIRECT:
+        dest = (
+            result.final_url[:30] + "..."
+            if result.final_url and len(result.final_url) > 30
+            else result.final_url
+        )
+        return f"[🔄 Redirect | → {dest}]"
+    return "[❓ Unknown]"
+
+
+def mark_urls_inline(document: str, results: list[URLResult]) -> str:
+    """
+    Mark URLs in a document with their status badges.
+
+    Args:
+        document: Original document text
+        results: List of URL check results
+
+    Returns:
+        Document with inline status badges added after URLs
+    """
+    # Create URL to result mapping
+    url_results = {r.url: r for r in results}
+
+    # Find all URLs and their positions
+    marked = document
+    offset = 0  # Track offset as we insert badges
+
+    for match in URL_PATTERN.finditer(document):
+        url = match.group().rstrip(".,;:!?")  # Same stripping as extract_urls
+        if url in url_results:
+            result = url_results[url]
+            badge = get_status_badge(result)
+
+            # Check if badge already exists after this URL
+            end_pos = match.end() + offset
+            remaining = marked[end_pos:]
+            if remaining.startswith((" [✅", " [⚠️", " [❌", " [🔄")):
+                continue  # Already marked
+
+            # Insert badge after URL
+            insert_pos = end_pos
+            marked = marked[:insert_pos] + " " + badge + marked[insert_pos:]
+            offset += len(badge) + 1
+
+    return marked
+
+
+def generate_blocked_tasks(
+    results: list[URLResult],
+    document_path: str,
+    output_path: Optional[Path] = None,
+) -> str:
+    """
+    Generate a task file for blocked URLs requiring manual verification.
+
+    Args:
+        results: List of URL check results
+        document_path: Path to the source document
+        output_path: Optional path to write task file
+
+    Returns:
+        Task file content as string
+    """
+    blocked = [r for r in results if r.status in (URLStatus.BLOCKED, URLStatus.BROKEN)]
+
+    if not blocked:
+        return ""
+
+    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+    content = f"""# Blocked Citation Verification Tasks
+
+**Source**: {document_path}
+**Generated**: {timestamp}
+**Total blocked URLs**: {len(blocked)}
+
+## URLs Requiring Manual Verification
+
+"""
+
+    for i, result in enumerate(blocked, 1):
+        status_label = "⚠️ Blocked" if result.status == URLStatus.BLOCKED else "❌ Broken"
+        reason = result.error or (f"HTTP {result.status_code}" if result.status_code else "Unknown")
+
+        content += f"""### {i}. {status_label}
+
+- **URL**: {result.url}
+- **Reason**: {reason}
+- [ ] Verify URL manually
+- [ ] Update document if URL is permanently unavailable
+
+"""
+
+    content += """---
+
+## Instructions
+
+1. Open each URL in a browser
+2. Verify if content is accessible
+3. If blocked by paywall/auth, note the access method needed
+4. If broken, find replacement URL or remove citation
+5. Update the source document accordingly
+"""
+
+    if output_path:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(content)
+
+    return content
+
+
+def verify_document(
+    document_path: Path,
+    output_tasks_path: Optional[Path] = None,
+    mark_inline: bool = True,
+    concurrency: int = 10,
+    timeout: int = 10,
+    cache_dir: Optional[Path] = None,
+) -> tuple[str, list[URLResult], str]:
+    """
+    Verify all citations in a document.
+
+    Args:
+        document_path: Path to the document to verify
+        output_tasks_path: Optional path for blocked URL task file
+        mark_inline: Whether to mark URLs inline in the document
+        concurrency: Maximum concurrent requests
+        timeout: Timeout per request
+        cache_dir: Optional cache directory
+
+    Returns:
+        Tuple of (marked_document, results, blocked_tasks)
+    """
+    with open(document_path, encoding="utf-8") as f:
+        document = f.read()
+
+    # Extract URLs
+    extracted = extract_urls(document)
+    urls = [e.url for e in extracted]
+
+    if not urls:
+        return document, [], ""
+
+    # Check URLs
+    results = check_urls(
+        urls,
+        concurrency=concurrency,
+        timeout=timeout,
+        cache_dir=cache_dir,
+    )
+
+    # Mark document if requested
+    marked_document = document
+    if mark_inline:
+        marked_document = mark_urls_inline(document, results)
+
+    # Generate blocked tasks
+    blocked_tasks = generate_blocked_tasks(
+        results,
+        str(document_path),
+        output_tasks_path,
+    )
+
+    return marked_document, results, blocked_tasks
+
+
+def print_verification_summary(results: list[URLResult]) -> None:
+    """Print a summary of verification results to stdout."""
+    available = sum(1 for r in results if r.status == URLStatus.AVAILABLE)
+    blocked = sum(1 for r in results if r.status == URLStatus.BLOCKED)
+    broken = sum(1 for r in results if r.status == URLStatus.BROKEN)
+    redirect = sum(1 for r in results if r.status == URLStatus.REDIRECT)
+
+    total = len(results)
+    print("\n📋 Citation Verification Summary")
+    print(f" Total URLs checked: {total}")
+    print(f" ✅ Available: {available}")
+    print(f" 🔄 Redirect: {redirect}")
+    print(f" ⚠️ Blocked: {blocked}")
+    print(f" ❌ Broken: {broken}")
+
+    if blocked + broken > 0:
+        print(f"\n ⚠️ {blocked + broken} URLs need manual verification")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adversarial-workflow
-Version: 0.6.6
+Version: 0.7.0
 Summary: Multi-stage AI evaluation system for task plans, code review, and test validation
 Author: Fredrik Matheson
 License: MIT
@@ -24,9 +24,11 @@ License-File: LICENSE
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: python-dotenv>=0.19.0
 Requires-Dist: aider-chat>=0.86.0
+Requires-Dist: aiohttp>=3.8.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0; extra == "dev"
 Requires-Dist: pytest-cov>=3.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
 Requires-Dist: black>=22.0; extra == "dev"
 Requires-Dist: isort>=5.0; extra == "dev"
 Requires-Dist: flake8>=4.0; extra == "dev"
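
The practical effect of the metadata change: upgrading to 0.7.0 pulls in aiohttp automatically, while pytest-asyncio is only needed for the dev extra (presumably to test the new async checker):

    pip install "adversarial-workflow==0.7.0"         # runtime, includes aiohttp>=3.8.0
    pip install "adversarial-workflow[dev]==0.7.0"    # dev tools, adds pytest-asyncio
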
@@ -1,6 +1,6 @@
-adversarial_workflow/__init__.py,sha256=P9demPy3XsLoPU7tsgsfoJnOaCZaCw9I0PGYek5oOh8,596
+adversarial_workflow/__init__.py,sha256=Aj_FdCEOJYpaeOe9SMO1vWqlpzoR9AONRz2SZ8RKgVg,596
 adversarial_workflow/__main__.py,sha256=iM2jmO5YCFpGxfWiEhIYi_SsxVa0hRIE-MB7J0EcN7Y,120
-adversarial_workflow/cli.py,sha256=fmr46xQ3PpW0mPimYmyczADd_EjilzoGttmv7dKw3DE,110577
+adversarial_workflow/cli.py,sha256=mKvOe3Q-afCfzXALNCQ47GBOSwccE17kqYpxOHbOC6k,115541
 adversarial_workflow/evaluators/__init__.py,sha256=A9ZKUmjSMfyvEu6jDzYAFLxfkt_OQ4RGA10Bv_eO2i4,1267
 adversarial_workflow/evaluators/builtins.py,sha256=u5LokYLe8ruEW2tunhOQaNSkpcZ9Ee2IeTkaC0dZDSY,1102
 adversarial_workflow/evaluators/config.py,sha256=H_4vkto07rAqnz0qEYdzN_DH6WbvRPMIEdkEOFE58UI,1651
@@ -21,13 +21,14 @@ adversarial_workflow/templates/agent-context/agent-handoffs-minimal.json.template
 adversarial_workflow/templates/agent-context/agent-handoffs.json.template,sha256=rdRX79xdNKyTGd8_g6pvSdQK5VC5sDiErq_OZFslSXI,3904
 adversarial_workflow/templates/agent-context/current-state.json.template,sha256=UH3SQGjXGNzScqHnQcPrhrI9ZvjQBC3oBp4S9TilzoY,2325
 adversarial_workflow/utils/__init__.py,sha256=Pnm-a_jqoMVOxHdvVWXeVrL0IKI-zkY7EAdbQmZAkSI,352
+adversarial_workflow/utils/citations.py,sha256=Xhla-M3az4aXtd9F0UHyBXhTBDyYOo9Mb1IUi8nPLPs,18619
 adversarial_workflow/utils/colors.py,sha256=uRrG6KfIDBLo0F5_vPwms9NCm9-x8YXBiyZ4naCr868,160
 adversarial_workflow/utils/config.py,sha256=3VmF65ItUbFzbyAZ1RUoOtpS_t6n1wqIhKft8eSNsdw,1303
 adversarial_workflow/utils/file_splitter.py,sha256=kvWh0xVjd08fsEXgysoHd5zFwJHqs-JRKottO8scYCA,12381
 adversarial_workflow/utils/validation.py,sha256=0QfuRd-kurcadUCd9XQvO-N8RsmLp6ONQnc0vaQTUBA,2188
-adversarial_workflow-0.6.6.dist-info/licenses/LICENSE,sha256=M-dOQlre-NmicyPa55hYOJUW8roGpCKEgtq-z0z1KCA,1073
-adversarial_workflow-0.6.6.dist-info/METADATA,sha256=q-JQ92-0bIsBzxyCl5HzXEUtZseRy8HcO8XqMeoiSro,30832
-adversarial_workflow-0.6.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-adversarial_workflow-0.6.6.dist-info/entry_points.txt,sha256=9H-iZ-yF1uKZ8P0G1suc6kWR0NvK7uPZJbhN7nvt1sE,62
-adversarial_workflow-0.6.6.dist-info/top_level.txt,sha256=8irutNxLRjUbTlzfAibIpz7_ovkkF2h8ES69NQpv24c,21
-adversarial_workflow-0.6.6.dist-info/RECORD,,
+adversarial_workflow-0.7.0.dist-info/licenses/LICENSE,sha256=M-dOQlre-NmicyPa55hYOJUW8roGpCKEgtq-z0z1KCA,1073
+adversarial_workflow-0.7.0.dist-info/METADATA,sha256=s0lrhtLRaXy6HE-QeCAcmdH8mswaeyxTvQ1H3khhG1k,30916
+adversarial_workflow-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+adversarial_workflow-0.7.0.dist-info/entry_points.txt,sha256=9H-iZ-yF1uKZ8P0G1suc6kWR0NvK7uPZJbhN7nvt1sE,62
+adversarial_workflow-0.7.0.dist-info/top_level.txt,sha256=8irutNxLRjUbTlzfAibIpz7_ovkkF2h8ES69NQpv24c,21
+adversarial_workflow-0.7.0.dist-info/RECORD,,