linksanity 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linksanity/__init__.py +3 -0
- linksanity/__main__.py +3 -0
- linksanity/checkers/__init__.py +0 -0
- linksanity/checkers/filesystem.py +136 -0
- linksanity/checkers/http.py +171 -0
- linksanity/checkers/playwright.py +228 -0
- linksanity/cli.py +254 -0
- linksanity/config.py +104 -0
- linksanity/crawler.py +125 -0
- linksanity/parsers/__init__.py +0 -0
- linksanity/parsers/html.py +42 -0
- linksanity/parsers/markdown.py +48 -0
- linksanity/parsers/rst.py +53 -0
- linksanity/py.typed +0 -0
- linksanity/queue.py +72 -0
- linksanity/reporters/__init__.py +26 -0
- linksanity/reporters/console.py +78 -0
- linksanity/reporters/csv_reporter.py +39 -0
- linksanity/reporters/github_reporter.py +108 -0
- linksanity/reporters/json_reporter.py +28 -0
- linksanity/reporters/markdown_reporter.py +68 -0
- linksanity/router.py +72 -0
- linksanity/scanner.py +77 -0
- linksanity-0.1.0.dist-info/METADATA +436 -0
- linksanity-0.1.0.dist-info/RECORD +27 -0
- linksanity-0.1.0.dist-info/WHEEL +4 -0
- linksanity-0.1.0.dist-info/entry_points.txt +2 -0
linksanity/__init__.py
ADDED
linksanity/__main__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Check internal links and anchor fragments against the local filesystem."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from linksanity.queue import LinkResult, LinkStatus, LinkType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def check(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    *,
    check_anchors: bool = False,
) -> LinkResult:
    """Resolve and validate an internal or anchor link.

    For ANCHOR links (#fragment), validates the fragment exists in the source
    file when check_anchors is True.
    For INTERNAL links (./path or ../path), validates the target file exists.

    Paths and fragments are tried verbatim first and then percent-decoded, so
    a link such as ``./my%20file.md`` resolves to ``my file.md`` on disk while
    a file whose name literally contains ``%20`` keeps working.

    Args:
        url: The link exactly as written in the source document.
        source_file: Path of the document containing the link.
        line: Line number of the link, copied into the result.
        link_type: Classification of the link, copied into the result.
        check_anchors: When True, also verify that ``#fragment`` exists.

    Returns:
        A LinkResult whose status is OK or BROKEN.
    """
    from urllib.parse import unquote

    def _result(status: LinkStatus, **extra: object) -> LinkResult:
        # Only pass optional fields when given so LinkResult defaults apply.
        return LinkResult(
            source_file=source_file,
            line=line,
            url=url,
            link_type=link_type,
            status=status,
            **extra,
        )

    source_path = Path(source_file)

    # Split path and fragment ("" fragment is treated as "no fragment").
    path_part, _, raw_fragment = url.partition("#")
    fragment: str | None = raw_fragment or None

    # Resolve the target file
    if link_type == LinkType.ANCHOR or not path_part:
        target_path = source_path
    else:
        target_path = (source_path.parent / path_part).resolve()
        if not target_path.exists():
            # Fall back to the percent-decoded spelling of the path.
            decoded = unquote(path_part)
            if decoded != path_part:
                try:
                    alternative = (source_path.parent / decoded).resolve()
                    if alternative.exists():
                        target_path = alternative
                except (OSError, ValueError):
                    # e.g. "%00" decodes to an embedded NUL; keep the raw path.
                    pass

    # Check file existence for non-pure-anchor links
    if path_part and not target_path.exists():
        return _result(LinkStatus.BROKEN, error=f"file not found: {target_path}")

    # Optionally validate anchor fragment (raw or percent-decoded form)
    if fragment and check_anchors:
        candidates = [fragment]
        decoded_fragment = unquote(fragment)
        if decoded_fragment != fragment:
            candidates.append(decoded_fragment)
        if not any(_anchor_exists(target_path, c) for c in candidates):
            return _result(
                LinkStatus.BROKEN,
                error=f"anchor '#{fragment}' not found in {target_path.name}",
            )

    return _result(LinkStatus.OK)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _anchor_exists(path: Path, fragment: str) -> bool:
|
|
71
|
+
"""Return True if fragment matches a heading/ID in the file."""
|
|
72
|
+
suffix = path.suffix.lower()
|
|
73
|
+
try:
|
|
74
|
+
content = path.read_text(encoding="utf-8")
|
|
75
|
+
except OSError:
|
|
76
|
+
return False
|
|
77
|
+
|
|
78
|
+
if suffix == ".md":
|
|
79
|
+
return fragment in _md_anchors(content)
|
|
80
|
+
if suffix == ".rst":
|
|
81
|
+
return fragment in _rst_anchors(content)
|
|
82
|
+
if suffix in (".html", ".htm"):
|
|
83
|
+
return fragment in _html_ids(content)
|
|
84
|
+
return False
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _md_anchors(content: str) -> set[str]:
    """Extract GitHub-style anchor slugs from Markdown headings.

    Only ATX headings (``# Heading`` … ``###### Heading``) starting in the
    first column are recognised. Lines inside fenced code blocks are ignored
    so that ``# comment`` lines in code samples do not create anchors.
    Repeated headings receive the ``-1``, ``-2``, … suffixes GitHub appends
    to keep anchors unique.
    """
    anchors: set[str] = set()
    occurrences: dict[str, int] = {}
    open_fence: str | None = None

    for line in content.splitlines():
        marker = line.lstrip()[:3]
        if marker in ("```", "~~~"):
            # A fence is closed only by the same marker that opened it.
            if open_fence is None:
                open_fence = marker
            elif marker == open_fence:
                open_fence = None
            continue
        if open_fence is not None:
            continue

        # ATX headings: # Heading, ## Heading, etc.
        m = re.match(r"^#{1,6}\s+(.+?)(?:\s+#+)?$", line)
        if not m:
            continue

        base = _gh_slug(m.group(1))
        count = occurrences.get(base, 0)
        slug = base if count == 0 else f"{base}-{count}"
        # Skip suffixes already taken by a literal heading such as "Usage 1".
        while slug in anchors:
            count += 1
            slug = f"{base}-{count}"
        occurrences[base] = count + 1
        anchors.add(slug)

    return anchors
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _gh_slug(text: str) -> str:
|
|
99
|
+
"""Convert heading text to a GitHub Markdown anchor slug."""
|
|
100
|
+
text = text.lower()
|
|
101
|
+
text = re.sub(r"[^\w\s-]", "", text) # remove special chars except - and _
|
|
102
|
+
text = re.sub(r"\s+", "-", text.strip())
|
|
103
|
+
return text
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _rst_anchors(content: str) -> set[str]:
    """Collect the string ids docutils assigns to nodes of an RST document.

    The document is parsed with docutils' ``publish_doctree`` with reporting
    and halting both set to the SEVERE level and warnings redirected to an
    in-memory stream. Any exception raised while parsing or walking the tree
    is swallowed; whatever ids were gathered up to that point are returned.
    """
    from io import StringIO

    from docutils.core import publish_doctree
    from docutils.utils import Reporter

    found: set[str] = set()
    try:
        quiet_settings = {
            "report_level": Reporter.SEVERE_LEVEL,
            "halt_level": Reporter.SEVERE_LEVEL,
            "warning_stream": StringIO(),
        }
        tree = publish_doctree(content, settings_overrides=quiet_settings)

        # Titles, sections and targets are all Element nodes carrying "ids".
        from docutils.nodes import Element

        for element in tree.findall(Element):
            found.update(
                candidate
                for candidate in element.get("ids", [])
                if isinstance(candidate, str)
            )
    except Exception:  # noqa: BLE001
        pass
    return found
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _html_ids(content: str) -> set[str]:
|
|
135
|
+
"""Extract all id= attribute values from HTML."""
|
|
136
|
+
return set(re.findall(r'\bid=["\']([^"\']+)["\']', content))
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Async HTTP link checker using httpx with retry and fallback."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import ipaddress
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
|
|
11
|
+
from linksanity.queue import LinkResult, LinkStatus, LinkType
|
|
12
|
+
|
|
13
|
+
# HTTP status codes after which _check_with_retry sleeps and tries again.
_RETRY_ON = {429, 503}
# HTTP status codes after which _try_head repeats the request as a streamed GET.
_FALLBACK_ON = {405}
# NOTE(review): not referenced by the code visible in this module; check()
# builds its own httpx.Timeout from its `timeout` argument — confirm before use.
_TIMEOUT = httpx.Timeout(10.0)
# Headers sent with every request made through the client created in check().
_HEADERS = {"User-Agent": "linksanity/0.1 link-checker (+https://github.com/linksanity)"}
|
|
17
|
+
|
|
18
|
+
# Hostnames that are always private regardless of DNS resolution
|
|
19
|
+
_PRIVATE_HOSTNAMES = frozenset({"localhost", "metadata.google.internal"})
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _is_private_host(hostname: str) -> bool:
|
|
23
|
+
"""Return True if hostname is a loopback, link-local, or private address."""
|
|
24
|
+
if hostname.lower() in _PRIVATE_HOSTNAMES:
|
|
25
|
+
return True
|
|
26
|
+
try:
|
|
27
|
+
addr = ipaddress.ip_address(hostname)
|
|
28
|
+
return addr.is_loopback or addr.is_link_local or addr.is_private
|
|
29
|
+
except ValueError:
|
|
30
|
+
return False
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def check(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    *,
    ignore_domains: set[str] | None = None,
    timeout: int = 10,
    retries: int = 2,
) -> LinkResult:
    """Check an external URL and return a LinkResult.

    Strategy:
      1. HEAD request first (fast, low bandwidth).
      2. On 405 Method Not Allowed, retry with GET + stream (no body download).
      3. On 429/503, retry up to `retries` times with exponential backoff.

    URLs that cannot be parsed produce an ERROR result instead of raising.
    Private/loopback hosts and hosts matched by ``ignore_domains`` are
    returned as SKIPPED without any network traffic. The ignore list is
    matched against both the full netloc and the bare hostname, so an entry
    ``example.com`` also covers ``example.com:8080``.

    Args:
        url: The URL to check.
        source_file: Document containing the link, copied into the result.
        line: Line number of the link, copied into the result.
        link_type: Classification of the link, copied into the result.
        ignore_domains: Domains (and their subdomains) that are skipped.
        timeout: Timeout in seconds handed to httpx.
        retries: Number of retries after the first attempt.
    """

    def _result(status: LinkStatus, **extra: object) -> LinkResult:
        # Only pass optional fields when given so LinkResult defaults apply.
        return LinkResult(
            source_file=source_file, line=line, url=url,
            link_type=link_type, status=status, **extra,
        )

    try:
        parsed = urlparse(url)
        hostname = parsed.hostname or ""
    except ValueError as exc:
        # e.g. "http://[broken" — unbalanced IPv6 brackets.
        return _result(LinkStatus.ERROR, error=f"invalid URL: {exc}")
    domain = parsed.netloc.lower()

    if _is_private_host(hostname):
        return _result(
            LinkStatus.SKIPPED, error="skipped: private/loopback address"
        )

    if ignore_domains and (
        _domain_match(domain, ignore_domains)
        # netloc may carry "user@" or ":port"; the hostname never does.
        or (hostname and _domain_match(hostname, ignore_domains))
    ):
        return _result(LinkStatus.SKIPPED)

    client_timeout = httpx.Timeout(float(timeout))
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=client_timeout,
            headers=_HEADERS,
        ) as client:
            return await _check_with_retry(
                client, url, source_file, line, link_type, retries
            )
    except Exception as exc:
        # Boundary handler: one bad URL must never abort the whole run.
        # Some exceptions stringify to "", so fall back to the class name.
        return _result(
            LinkStatus.ERROR, error=str(exc) or type(exc).__name__
        )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
async def _check_with_retry(
    client: httpx.AsyncClient,
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    retries: int,
) -> LinkResult:
    """Run _try_head up to ``retries + 1`` times with exponential backoff.

    An attempt is repeated when it raises ``httpx.HTTPError`` or when its
    status code is in ``_RETRY_ON``. Between attempts the coroutine sleeps
    1, 2, 4, … seconds. The final attempt's result is returned even if its
    status is retryable; if the final attempt raised, an ERROR result
    carrying the exception text is returned.

    A negative ``retries`` is treated as 0, so at least one attempt is made.
    """
    attempts = max(retries, 0) + 1
    last_exc: Exception | None = None

    for attempt in range(attempts):
        is_last = attempt == attempts - 1
        try:
            result = await _try_head(client, url, source_file, line, link_type)
        except httpx.HTTPError as exc:
            last_exc = exc
        else:
            if is_last or result.http_code not in _RETRY_ON:
                return result
        if not is_last:
            await asyncio.sleep(2 ** attempt)

    # Reached only when the last attempt raised. Some exceptions stringify
    # to an empty message, so fall back to the exception class name.
    if last_exc is None:
        message = "request failed"
    else:
        message = str(last_exc) or type(last_exc).__name__
    return LinkResult(
        source_file=source_file, line=line, url=url,
        link_type=link_type, status=LinkStatus.ERROR,
        error=message,
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
async def _try_head(
    client: httpx.AsyncClient,
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
) -> LinkResult:
    """Probe ``url`` with HEAD, repeating as a streamed GET on a fallback status.

    Exceptions raised by the client propagate to the caller unchanged. The
    status code and final URL of whichever response is used are passed to
    ``_make_result``.
    """
    head_response = await client.head(url)

    if head_response.status_code not in _FALLBACK_ON:
        return _make_result(
            url,
            source_file,
            line,
            link_type,
            head_response.status_code,
            str(head_response.url),
        )

    # Server doesn't support HEAD — try GET with streaming (no body)
    async with client.stream("GET", url) as get_response:
        status_code = get_response.status_code
        final_url = str(get_response.url)
    return _make_result(url, source_file, line, link_type, status_code, final_url)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _make_result(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    code: int,
    resolved_url: str,
) -> LinkResult:
    """Build the LinkResult for a completed HTTP exchange.

    Status is BROKEN for codes >= 400, REDIRECT when the final URL differs
    from the requested one, and OK otherwise. ``resolved_url`` is only
    recorded on the result when a redirect was detected.

    URLs are compared after removing differences that do not identify a
    different resource: scheme/host letter case, an explicit default port
    (``:80`` for http, ``:443`` for https), the ``#fragment`` and trailing
    slashes. This keeps a link written as ``https://Example.com/#intro``
    from being reported as a redirect merely because the final URL is
    spelled ``https://example.com/``.
    """

    def _comparable(raw: str) -> str:
        try:
            parts = urlparse(raw)
        except ValueError:
            # Unparseable: fall back to the plain textual comparison.
            return raw.rstrip("/")
        scheme = parts.scheme.lower()
        netloc = parts.netloc.lower()
        default_port = {"http": ":80", "https": ":443"}.get(scheme)
        if default_port and netloc.endswith(default_port):
            netloc = netloc[: -len(default_port)]
        normalised = parts._replace(scheme=scheme, netloc=netloc, fragment="")
        return normalised.geturl().rstrip("/")

    # With follow_redirects=True, httpx resolves the full chain.
    # A redirect is detected when the final URL differs from the original.
    was_redirected = _comparable(resolved_url) != _comparable(url)

    if code >= 400:
        status = LinkStatus.BROKEN
    elif was_redirected:
        status = LinkStatus.REDIRECT
    else:
        status = LinkStatus.OK

    return LinkResult(
        source_file=source_file,
        line=line,
        url=url,
        link_type=link_type,
        status=status,
        http_code=code,
        resolved_url=resolved_url if was_redirected else None,
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _domain_match(domain: str, ignore_set: set[str]) -> bool:
|
|
168
|
+
"""Return True if domain or any parent domain is in the ignore set."""
|
|
169
|
+
return domain in ignore_set or any(
|
|
170
|
+
domain.endswith("." + d) for d in ignore_set
|
|
171
|
+
)
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""Playwright-based link extractor and checker for JS-rendered pages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import contextlib
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from linksanity.queue import LinkResult, LinkStatus, LinkType
|
|
10
|
+
|
|
11
|
+
_SKIP_SCHEMES = ("mailto:", "javascript:", "data:", "blob:")
|
|
12
|
+
|
|
13
|
+
# Well-known analytics and tracking domains. Requests to these are aborted
|
|
14
|
+
# when --block-analytics is set, speeding up crawls and suppressing false hits.
|
|
15
|
+
ANALYTICS_DOMAINS: frozenset[str] = frozenset({
|
|
16
|
+
"google-analytics.com",
|
|
17
|
+
"analytics.google.com",
|
|
18
|
+
"googletagmanager.com",
|
|
19
|
+
"googletagservices.com",
|
|
20
|
+
"doubleclick.net",
|
|
21
|
+
"hotjar.com",
|
|
22
|
+
"segment.com",
|
|
23
|
+
"cdn.segment.com",
|
|
24
|
+
"api.segment.io",
|
|
25
|
+
"mixpanel.com",
|
|
26
|
+
"amplitude.com",
|
|
27
|
+
"heap.io",
|
|
28
|
+
"heapanalytics.com",
|
|
29
|
+
"fullstory.com",
|
|
30
|
+
"clarity.ms",
|
|
31
|
+
"plausible.io",
|
|
32
|
+
"intercom.io",
|
|
33
|
+
"intercomcdn.com",
|
|
34
|
+
"widget.intercom.io",
|
|
35
|
+
})
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _require_playwright() -> None:
|
|
39
|
+
try:
|
|
40
|
+
import playwright # noqa: F401
|
|
41
|
+
except ImportError:
|
|
42
|
+
raise ImportError(
|
|
43
|
+
"Playwright is not installed. "
|
|
44
|
+
"Run: pip install linksanity[browser] && playwright install chromium"
|
|
45
|
+
) from None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def extract_links(url: str, *, semaphore: asyncio.Semaphore | None = None) -> list[str]:
    """Render ``url`` in headless Chromium and return the hrefs of its anchors.

    The page is loaded until ``domcontentloaded`` (30 s limit), then the
    ``href`` property of every ``a[href]`` element is read. Empty values and
    values starting with one of ``_SKIP_SCHEMES`` (mailto:, javascript:,
    data:, blob:) are dropped.

    ``semaphore`` is held for the whole browser session; when omitted a new
    ``asyncio.Semaphore(2)`` is created for this call.
    """
    _require_playwright()
    from playwright.async_api import async_playwright

    limiter = semaphore or asyncio.Semaphore(2)
    async with limiter, async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=30_000)
            raw_hrefs: list[str] = await page.eval_on_selector_all(
                "a[href]",
                "els => els.map(e => e.href).filter(h => h)",
            )
            kept: list[str] = []
            for href in raw_hrefs:
                # str.startswith accepts the whole tuple of prefixes at once.
                if not href or href.startswith(_SKIP_SCHEMES):
                    continue
                kept.append(href)
            return kept
        finally:
            await browser.close()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
async def check(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    *,
    semaphore: asyncio.Semaphore | None = None,
    timeout: int = 10,
) -> LinkResult:
    """Check whether a URL is reachable using a headless browser.

    Navigates to ``url`` in headless Chromium (waiting for
    ``domcontentloaded``, ``timeout`` seconds at most) and classifies the
    navigation response: BROKEN for status >= 400, REDIRECT when the page
    URL differs from ``url`` apart from trailing slashes, OK otherwise.
    A missing response or a Playwright error during navigation yields an
    ERROR result. ``semaphore`` is held for the whole browser session; when
    omitted a new ``asyncio.Semaphore(2)`` is created for this call.
    """
    _require_playwright()
    from playwright.async_api import Error as PlaywrightError
    from playwright.async_api import async_playwright

    def _failure(reason: str) -> LinkResult:
        return LinkResult(
            source_file=source_file, line=line, url=url,
            link_type=link_type, status=LinkStatus.ERROR,
            error=reason,
        )

    limiter = semaphore or asyncio.Semaphore(2)
    async with limiter, async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            try:
                response = await page.goto(
                    url,
                    wait_until="domcontentloaded",
                    timeout=timeout * 1000,
                )
                if response is None:
                    return _failure("no response")

                http_status = response.status
                landed_on = page.url
                redirected = _strip(landed_on) != _strip(url)
                if http_status >= 400:
                    outcome = LinkStatus.BROKEN
                else:
                    outcome = LinkStatus.REDIRECT if redirected else LinkStatus.OK

                return LinkResult(
                    source_file=source_file, line=line, url=url,
                    link_type=link_type, status=outcome,
                    http_code=http_status,
                    resolved_url=landed_on if redirected else None,
                )
            except PlaywrightError as exc:
                return _failure(str(exc))
        finally:
            await browser.close()
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
async def crawl_page(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    *,
    semaphore: asyncio.Semaphore | None = None,
    timeout: int = 10,
    block_domains: frozenset[str] | None = None,
) -> tuple[LinkResult, list[str]]:
    """Visit a page, check its reachability, and return (result, hrefs).

    Combines check() and extract_links() into a single browser session.

    Args:
        url: Page to visit.
        source_file: Document containing the link, copied into the result.
        line: Line number of the link, copied into the result.
        link_type: Classification of the link, copied into the result.
        semaphore: Held for the whole browser session; a new
            ``asyncio.Semaphore(2)`` is created when omitted.
        timeout: Navigation timeout in seconds.
        block_domains: Requests whose host equals, or is a subdomain of,
            one of these domains are aborted. The host is compared without
            userinfo or port, so ``cdn.segment.com:8443`` is still blocked
            by ``segment.com``.

    Returns:
        The LinkResult for ``url`` and the hrefs found on the page. The href
        list is empty unless the status is OK or REDIRECT.
    """
    _require_playwright()
    from playwright.async_api import Error as PlaywrightError
    from playwright.async_api import async_playwright

    sem = semaphore or asyncio.Semaphore(2)
    async with sem, async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            if block_domains:
                from playwright.async_api import Route

                async def _block(route: Route) -> None:
                    target = urlparse(route.request.url)
                    # Compare the raw netloc (as before) and the bare
                    # hostname, which drops "user@" and ":port".
                    names = {target.netloc.lower(), (target.hostname or "").lower()}
                    names.discard("")
                    blocked = any(
                        name == d or name.endswith("." + d)
                        for name in names
                        for d in block_domains
                    )
                    if blocked:
                        await route.abort()
                    else:
                        await route.continue_()

                await page.route("**/*", _block)
            try:
                response = await page.goto(
                    url,
                    wait_until="domcontentloaded",
                    timeout=timeout * 1000,
                )
                if response is None:
                    return LinkResult(
                        source_file=source_file, line=line, url=url,
                        link_type=link_type, status=LinkStatus.ERROR,
                        error="no response",
                    ), []
                # Wait for JS to finish rendering navigation (SPAs build links client-side)
                with contextlib.suppress(PlaywrightError):
                    await page.wait_for_load_state("networkidle", timeout=5000)
                code = response.status
                resolved = page.url
                was_redirected = _strip(resolved) != _strip(url)
                if code >= 400:
                    status = LinkStatus.BROKEN
                elif was_redirected:
                    status = LinkStatus.REDIRECT
                else:
                    status = LinkStatus.OK
                result = LinkResult(
                    source_file=source_file, line=line, url=url,
                    link_type=link_type, status=status,
                    http_code=code,
                    resolved_url=resolved if was_redirected else None,
                )
                # Extract links from any reachable page, including redirects
                if status in (LinkStatus.OK, LinkStatus.REDIRECT):
                    hrefs: list[str] = await page.eval_on_selector_all(
                        "a[href]",
                        "els => els.map(e => e.href).filter(h => h)",
                    )
                    links = [
                        h for h in hrefs
                        if h and not any(h.startswith(s) for s in _SKIP_SCHEMES)
                    ]
                else:
                    links = []
                return result, links
            except PlaywrightError as exc:
                return LinkResult(
                    source_file=source_file, line=line, url=url,
                    link_type=link_type, status=LinkStatus.ERROR,
                    error=str(exc),
                ), []
        finally:
            await browser.close()
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _strip(url: str) -> str:
|
|
222
|
+
return url.rstrip("/")
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def scope_filter(urls: list[str], start_url: str) -> list[str]:
    """Keep only URLs on the same domain as start_url.

    Domains are compared as lower-cased netlocs, so a relative URL (empty
    netloc) is never in scope. Entries that cannot be parsed as URLs are
    dropped instead of aborting the whole filter. Input order is preserved.

    Raises:
        ValueError: If ``start_url`` itself cannot be parsed.
    """
    domain = urlparse(start_url).netloc.lower()

    in_scope: list[str] = []
    for candidate in urls:
        try:
            netloc = urlparse(candidate).netloc
        except ValueError:
            # e.g. unbalanced IPv6 brackets in a scraped href.
            continue
        if netloc.lower() == domain:
            in_scope.append(candidate)
    return in_scope
|