linksanity 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
linksanity/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """linksanity — detect broken links in Markdown, reStructuredText, and HTML documentation."""
2
+
3
+ __version__ = "0.1.0"
linksanity/__main__.py ADDED
@@ -0,0 +1,3 @@
1
+ from linksanity.cli import app
2
+
3
+ app()
File without changes
@@ -0,0 +1,136 @@
1
+ """Check internal links and anchor fragments against the local filesystem."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+
8
+ from linksanity.queue import LinkResult, LinkStatus, LinkType
9
+
10
+
11
def check(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    *,
    check_anchors: bool = False,
) -> LinkResult:
    """Resolve and validate an internal or anchor link.

    For ANCHOR links (#fragment), validates the fragment exists in the source
    file when check_anchors is True.
    For INTERNAL links (./path or ../path), validates the target file exists.

    Paths and fragments are first tried exactly as written; if that fails, the
    percent-decoded form is tried as well, so ``my%20file.md`` resolves to
    ``my file.md`` on disk.

    Args:
        url: The link target as written in the document.
        source_file: Path of the document that contains the link.
        line: Line number of the link, copied into the result.
        link_type: Classification of the link, copied into the result.
        check_anchors: When True, also verify that ``#fragment`` exists in the
            target file.

    Returns:
        A LinkResult with status OK, or BROKEN with an explanatory error.
    """
    # Function-scope import: the module header does not import urllib.parse.
    from urllib.parse import unquote

    def _broken(message: str) -> LinkResult:
        return LinkResult(
            source_file=source_file,
            line=line,
            url=url,
            link_type=link_type,
            status=LinkStatus.BROKEN,
            error=message,
        )

    source_path = Path(source_file)

    # Split path and fragment; both are "" when absent.
    path_part, _, fragment = url.partition("#")

    # Resolve the target file
    if link_type == LinkType.ANCHOR or not path_part:
        target_path = source_path
    else:
        target_path = (source_path.parent / path_part).resolve()
        if not target_path.exists():
            # The literal spelling wins; fall back to the URL-decoded one.
            decoded_path = (source_path.parent / unquote(path_part)).resolve()
            if decoded_path.exists():
                target_path = decoded_path

    # Check file existence for non-pure-anchor links
    if path_part and not target_path.exists():
        return _broken(f"file not found: {target_path}")

    # Optionally validate anchor fragment (raw spelling first, then decoded)
    if fragment and check_anchors:
        candidates = dict.fromkeys((fragment, unquote(fragment)))
        if not any(_anchor_exists(target_path, name) for name in candidates):
            return _broken(f"anchor '#{fragment}' not found in {target_path.name}")

    return LinkResult(
        source_file=source_file,
        line=line,
        url=url,
        link_type=link_type,
        status=LinkStatus.OK,
    )
68
+
69
+
70
+ def _anchor_exists(path: Path, fragment: str) -> bool:
71
+ """Return True if fragment matches a heading/ID in the file."""
72
+ suffix = path.suffix.lower()
73
+ try:
74
+ content = path.read_text(encoding="utf-8")
75
+ except OSError:
76
+ return False
77
+
78
+ if suffix == ".md":
79
+ return fragment in _md_anchors(content)
80
+ if suffix == ".rst":
81
+ return fragment in _rst_anchors(content)
82
+ if suffix in (".html", ".htm"):
83
+ return fragment in _html_ids(content)
84
+ return False
85
+
86
+
87
+ def _md_anchors(content: str) -> set[str]:
88
+ """Extract GitHub-style anchor slugs from Markdown headings."""
89
+ anchors: set[str] = set()
90
+ for line in content.splitlines():
91
+ # ATX headings: # Heading, ## Heading, etc.
92
+ m = re.match(r"^#{1,6}\s+(.+?)(?:\s+#+)?$", line)
93
+ if m:
94
+ anchors.add(_gh_slug(m.group(1)))
95
+ return anchors
96
+
97
+
98
+ def _gh_slug(text: str) -> str:
99
+ """Convert heading text to a GitHub Markdown anchor slug."""
100
+ text = text.lower()
101
+ text = re.sub(r"[^\w\s-]", "", text) # remove special chars except - and _
102
+ text = re.sub(r"\s+", "-", text.strip())
103
+ return text
104
+
105
+
106
def _rst_anchors(content: str) -> set[str]:
    """Collect the string ids carried by element nodes of a parsed RST document.

    The text is parsed with docutils; reporting and halting thresholds are
    raised to SEVERE and warnings are sent to an in-memory stream. Any
    exception raised while parsing or walking the tree is swallowed, and
    whatever ids were gathered up to that point are returned.
    """
    from io import StringIO

    from docutils.core import publish_doctree
    from docutils.utils import Reporter

    found: set[str] = set()
    try:
        overrides = {
            "report_level": Reporter.SEVERE_LEVEL,
            "halt_level": Reporter.SEVERE_LEVEL,
            "warning_stream": StringIO(),
        }
        tree = publish_doctree(content, settings_overrides=overrides)
        # Titles, sections and explicit targets are all Element nodes with ids.
        from docutils.nodes import Element

        for element in tree.findall(Element):
            found.update(
                value for value in element.get("ids", []) if isinstance(value, str)
            )
    except Exception:  # noqa: BLE001
        pass
    return found
132
+
133
+
134
+ def _html_ids(content: str) -> set[str]:
135
+ """Extract all id= attribute values from HTML."""
136
+ return set(re.findall(r'\bid=["\']([^"\']+)["\']', content))
@@ -0,0 +1,171 @@
1
+ """Async HTTP link checker using httpx with retry and fallback."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import ipaddress
7
+ from urllib.parse import urlparse
8
+
9
+ import httpx
10
+
11
+ from linksanity.queue import LinkResult, LinkStatus, LinkType
12
+
13
+ _RETRY_ON = {429, 503}
14
+ _FALLBACK_ON = {405}
15
+ _TIMEOUT = httpx.Timeout(10.0)
16
+ _HEADERS = {"User-Agent": "linksanity/0.1 link-checker (+https://github.com/linksanity)"}
17
+
18
+ # Hostnames that are always private regardless of DNS resolution
19
+ _PRIVATE_HOSTNAMES = frozenset({"localhost", "metadata.google.internal"})
20
+
21
+
22
+ def _is_private_host(hostname: str) -> bool:
23
+ """Return True if hostname is a loopback, link-local, or private address."""
24
+ if hostname.lower() in _PRIVATE_HOSTNAMES:
25
+ return True
26
+ try:
27
+ addr = ipaddress.ip_address(hostname)
28
+ return addr.is_loopback or addr.is_link_local or addr.is_private
29
+ except ValueError:
30
+ return False
31
+
32
+
33
async def check(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    *,
    ignore_domains: set[str] | None = None,
    timeout: int = 10,
    retries: int = 2,
) -> LinkResult:
    """Check an external URL and return a LinkResult.

    Strategy:
    1. HEAD request first (fast, low bandwidth).
    2. On 405 Method Not Allowed, retry with GET + stream (no body download).
    3. On 429/503, retry up to `retries` times with exponential backoff.

    URLs whose host is private/loopback, or whose host matches
    ``ignore_domains``, are returned as SKIPPED without any request. A URL
    that cannot be parsed, and any exception raised while checking, produces
    an ERROR result rather than propagating.

    Args:
        url: Absolute URL to check.
        source_file: Document containing the link, copied into the result.
        line: Line number of the link, copied into the result.
        link_type: Classification of the link, copied into the result.
        ignore_domains: Domains (parents included) to skip.
        timeout: Per-request timeout in seconds.
        retries: Number of additional attempts after the first.
    """
    try:
        parsed = urlparse(url)
    except ValueError as exc:
        # e.g. "http://[bad" -- malformed IPv6 literal.
        return LinkResult(
            source_file=source_file, line=line, url=url,
            link_type=link_type, status=LinkStatus.ERROR,
            error=str(exc) or type(exc).__name__,
        )
    domain = parsed.netloc.lower()
    hostname = parsed.hostname or ""

    # NOTE(review): only the initial URL is screened; with follow_redirects=True
    # a redirect could still lead to a private host -- confirm this is acceptable.
    if _is_private_host(hostname):
        return LinkResult(
            source_file=source_file, line=line, url=url,
            link_type=link_type, status=LinkStatus.SKIPPED,
            error="skipped: private/loopback address",
        )

    # netloc may carry "user:pw@" and ":port"; match the bare hostname as well
    # so "https://example.com:8443/" is covered by an "example.com" entry.
    if ignore_domains and (
        _domain_match(domain, ignore_domains)
        or _domain_match(hostname.lower(), ignore_domains)
    ):
        return LinkResult(
            source_file=source_file, line=line, url=url,
            link_type=link_type, status=LinkStatus.SKIPPED,
        )

    client_timeout = httpx.Timeout(float(timeout))
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=client_timeout,
            headers=_HEADERS,
        ) as client:
            return await _check_with_retry(
                client, url, source_file, line, link_type, retries
            )
    except Exception as exc:
        # Boundary handler: a single bad link must not abort the whole run.
        return LinkResult(
            source_file=source_file, line=line, url=url,
            link_type=link_type, status=LinkStatus.ERROR,
            # Some exceptions have an empty message; name the type instead.
            error=str(exc) or type(exc).__name__,
        )
83
+
84
+
85
async def _check_with_retry(
    client: httpx.AsyncClient,
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    retries: int,
) -> LinkResult:
    """Run the HEAD/GET probe, retrying on 429/503 and on httpx errors.

    At least one attempt is always made, even if ``retries`` is negative.
    Between attempts the coroutine sleeps 1, 2, 4, ... seconds. The last
    attempt's result is returned as-is even when it is still a retryable
    status; if every attempt raised, an ERROR result describing the last
    exception is returned.
    """
    attempts = max(retries, 0) + 1
    last_exc: Exception | None = None

    for attempt in range(attempts):
        is_last = attempt == attempts - 1
        try:
            result = await _try_head(client, url, source_file, line, link_type)
        except httpx.HTTPError as exc:
            last_exc = exc
        else:
            if is_last or result.http_code not in _RETRY_ON:
                return result
        if not is_last:
            await asyncio.sleep(2 ** attempt)

    # Reached only when the final attempt raised, so last_exc is set.
    return LinkResult(
        source_file=source_file, line=line, url=url,
        link_type=link_type, status=LinkStatus.ERROR,
        # Some exceptions have an empty message; name the type instead.
        error=str(last_exc) or type(last_exc).__name__,
    )
111
+
112
+
113
async def _try_head(
    client: httpx.AsyncClient,
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
) -> LinkResult:
    """Probe ``url`` with HEAD, falling back to a streamed GET on 405.

    httpx errors raised by either request propagate to the caller.
    """
    head_resp = await client.head(url)
    if head_resp.status_code not in _FALLBACK_ON:
        return _make_result(
            url, source_file, line, link_type,
            head_resp.status_code, str(head_resp.url),
        )

    # Server doesn't support HEAD -- open a streamed GET and read only the
    # status line and final URL; the body is never consumed.
    async with client.stream("GET", url) as get_resp:
        status_code = get_resp.status_code
        final_url = str(get_resp.url)
    return _make_result(url, source_file, line, link_type, status_code, final_url)
135
+
136
+
137
def _comparable_url(url: str) -> tuple:
    """Reduce a URL to a tuple that ignores purely cosmetic differences.

    Scheme and host case, an explicit default port (80 for http, 443 for
    https) and a trailing slash on the path do not affect the result. URLs
    that cannot be parsed fall back to the slash-stripped string.
    """
    default_ports = {"http": 80, "https": 443}
    try:
        parts = urlparse(url)
        port = parts.port
    except ValueError:
        return (url.rstrip("/"),)
    scheme = parts.scheme.lower()
    if port == default_ports.get(scheme):
        port = None
    return (
        scheme,
        parts.username,
        parts.password,
        parts.hostname,  # already lower-cased by urllib
        port,
        parts.path.rstrip("/"),
        parts.params,
        parts.query,
        parts.fragment,
    )


def _make_result(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    code: int,
    resolved_url: str,
) -> LinkResult:
    """Build the LinkResult for a completed HTTP probe.

    Status is BROKEN for codes >= 400, REDIRECT when the final URL differs
    from the requested one, and OK otherwise. ``resolved_url`` is stored on
    the result only when a redirect was detected.
    """
    # With follow_redirects=True, httpx resolves the full chain.
    # A redirect is detected when the final URL differs from the original.
    # The comparison is on normalised URLs so that "HTTPS://Example.com:443/x"
    # reaching "https://example.com/x" is not reported as a redirect.
    was_redirected = _comparable_url(resolved_url) != _comparable_url(url)

    if code >= 400:
        status = LinkStatus.BROKEN
    elif was_redirected:
        status = LinkStatus.REDIRECT
    else:
        status = LinkStatus.OK

    return LinkResult(
        source_file=source_file,
        line=line,
        url=url,
        link_type=link_type,
        status=status,
        http_code=code,
        resolved_url=resolved_url if was_redirected else None,
    )
165
+
166
+
167
+ def _domain_match(domain: str, ignore_set: set[str]) -> bool:
168
+ """Return True if domain or any parent domain is in the ignore set."""
169
+ return domain in ignore_set or any(
170
+ domain.endswith("." + d) for d in ignore_set
171
+ )
@@ -0,0 +1,228 @@
1
+ """Playwright-based link extractor and checker for JS-rendered pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import contextlib
7
+ from urllib.parse import urlparse
8
+
9
+ from linksanity.queue import LinkResult, LinkStatus, LinkType
10
+
11
+ _SKIP_SCHEMES = ("mailto:", "javascript:", "data:", "blob:")
12
+
13
+ # Well-known analytics and tracking domains. Requests to these are aborted
14
+ # when --block-analytics is set, speeding up crawls and suppressing false hits.
15
+ ANALYTICS_DOMAINS: frozenset[str] = frozenset({
16
+ "google-analytics.com",
17
+ "analytics.google.com",
18
+ "googletagmanager.com",
19
+ "googletagservices.com",
20
+ "doubleclick.net",
21
+ "hotjar.com",
22
+ "segment.com",
23
+ "cdn.segment.com",
24
+ "api.segment.io",
25
+ "mixpanel.com",
26
+ "amplitude.com",
27
+ "heap.io",
28
+ "heapanalytics.com",
29
+ "fullstory.com",
30
+ "clarity.ms",
31
+ "plausible.io",
32
+ "intercom.io",
33
+ "intercomcdn.com",
34
+ "widget.intercom.io",
35
+ })
36
+
37
+
38
+ def _require_playwright() -> None:
39
+ try:
40
+ import playwright # noqa: F401
41
+ except ImportError:
42
+ raise ImportError(
43
+ "Playwright is not installed. "
44
+ "Run: pip install linksanity[browser] && playwright install chromium"
45
+ ) from None
46
+
47
+
48
async def extract_links(
    url: str,
    *,
    semaphore: asyncio.Semaphore | None = None,
    timeout: int = 30,
) -> list[str]:
    """Launch a headless browser, render the page, and return all href values.

    Filters out mailto:, javascript:, data:, blob:, and empty hrefs.

    Args:
        url: Page to load.
        semaphore: Limits concurrent browser contexts. When omitted, a fresh
            semaphore is created for this call only, so it does not limit
            concurrency across calls.
        timeout: Navigation timeout in seconds (default 30).

    Playwright errors (navigation failure, timeout) propagate to the caller.
    """
    _require_playwright()
    from playwright.async_api import async_playwright

    sem = semaphore or asyncio.Semaphore(2)
    async with sem, async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=timeout * 1000)
            # e.href is read in the page, so the browser has already resolved it.
            hrefs: list[str] = await page.eval_on_selector_all(
                "a[href]",
                "els => els.map(e => e.href).filter(h => h)",
            )
            # str.startswith accepts the whole tuple of prefixes in one call.
            return [h for h in hrefs if h and not h.startswith(_SKIP_SCHEMES)]
        finally:
            await browser.close()
73
+
74
+
75
async def check(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    *,
    semaphore: asyncio.Semaphore | None = None,
    timeout: int = 10,
) -> LinkResult:
    """Load ``url`` in headless Chromium and classify the navigation outcome.

    The status code of the navigation response decides BROKEN (>= 400); a
    final page URL that differs from ``url`` (ignoring trailing slashes)
    yields REDIRECT; anything else is OK. A missing response or a Playwright
    error during navigation yields ERROR.
    """
    _require_playwright()
    from playwright.async_api import Error as PlaywrightError
    from playwright.async_api import async_playwright

    def _failure(message: str) -> LinkResult:
        return LinkResult(
            source_file=source_file, line=line, url=url,
            link_type=link_type, status=LinkStatus.ERROR,
            error=message,
        )

    limiter = semaphore or asyncio.Semaphore(2)
    async with limiter, async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            try:
                response = await page.goto(
                    url,
                    wait_until="domcontentloaded",
                    timeout=timeout * 1000,
                )
                if response is None:
                    return _failure("no response")

                http_status = response.status
                final_url = page.url
                redirected = final_url.rstrip("/") != url.rstrip("/")
                if http_status >= 400:
                    outcome = LinkStatus.BROKEN
                else:
                    outcome = LinkStatus.REDIRECT if redirected else LinkStatus.OK
                return LinkResult(
                    source_file=source_file, line=line, url=url,
                    link_type=link_type, status=outcome,
                    http_code=http_status,
                    resolved_url=final_url if redirected else None,
                )
            except PlaywrightError as exc:
                return _failure(str(exc))
        finally:
            # Runs on every path, including the early returns above.
            await browser.close()
132
+
133
+
134
async def crawl_page(
    url: str,
    source_file: str,
    line: int,
    link_type: LinkType,
    *,
    semaphore: asyncio.Semaphore | None = None,
    timeout: int = 10,
    block_domains: frozenset[str] | None = None,
) -> tuple[LinkResult, list[str]]:
    """Visit a page, check its reachability, and return (result, hrefs).

    Combines check() and extract_links() into a single browser session.

    Args:
        url: Page to visit.
        source_file: Document containing the link, copied into the result.
        line: Line number of the link, copied into the result.
        link_type: Classification of the link, copied into the result.
        semaphore: Limits concurrent browser sessions; a per-call semaphore is
            created when omitted.
        timeout: Navigation timeout in seconds.
        block_domains: Requests to these domains or their subdomains are
            aborted while the page loads.

    Returns:
        The LinkResult for ``url`` and the hrefs found on the page. The list
        is empty unless the status is OK or REDIRECT.
    """
    _require_playwright()
    from playwright.async_api import Error as PlaywrightError
    from playwright.async_api import async_playwright

    sem = semaphore or asyncio.Semaphore(2)
    async with sem, async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            if block_domains:
                from playwright.async_api import Route

                # Built once per page rather than once per intercepted request.
                blocked_suffixes = tuple("." + d for d in block_domains)

                def _is_blocked(host: str) -> bool:
                    return host in block_domains or host.endswith(blocked_suffixes)

                async def _block(route: Route) -> None:
                    parts = urlparse(route.request.url)
                    # netloc can include ":port" or "user@"; test the bare
                    # hostname too so an explicit port cannot bypass the list.
                    netloc = parts.netloc.lower()
                    host = parts.hostname or ""
                    if _is_blocked(netloc) or _is_blocked(host):
                        await route.abort()
                    else:
                        await route.continue_()

                await page.route("**/*", _block)
            try:
                response = await page.goto(
                    url,
                    wait_until="domcontentloaded",
                    timeout=timeout * 1000,
                )
                if response is None:
                    return LinkResult(
                        source_file=source_file, line=line, url=url,
                        link_type=link_type, status=LinkStatus.ERROR,
                        error="no response",
                    ), []
                # Wait for JS to finish rendering navigation (SPAs build links
                # client-side); a timeout here is not an error.
                with contextlib.suppress(PlaywrightError):
                    await page.wait_for_load_state("networkidle", timeout=5000)
                code = response.status
                resolved = page.url
                was_redirected = _strip(resolved) != _strip(url)
                if code >= 400:
                    status = LinkStatus.BROKEN
                elif was_redirected:
                    status = LinkStatus.REDIRECT
                else:
                    status = LinkStatus.OK
                result = LinkResult(
                    source_file=source_file, line=line, url=url,
                    link_type=link_type, status=status,
                    http_code=code,
                    resolved_url=resolved if was_redirected else None,
                )
                # Extract links from any reachable page, including redirects
                links: list[str] = []
                if status in (LinkStatus.OK, LinkStatus.REDIRECT):
                    hrefs: list[str] = await page.eval_on_selector_all(
                        "a[href]",
                        "els => els.map(e => e.href).filter(h => h)",
                    )
                    links = [
                        h for h in hrefs if h and not h.startswith(_SKIP_SCHEMES)
                    ]
                return result, links
            except PlaywrightError as exc:
                return LinkResult(
                    source_file=source_file, line=line, url=url,
                    link_type=link_type, status=LinkStatus.ERROR,
                    error=str(exc),
                ), []
        finally:
            await browser.close()
219
+
220
+
221
+ def _strip(url: str) -> str:
222
+ return url.rstrip("/")
223
+
224
+
225
def scope_filter(urls: list[str], start_url: str) -> list[str]:
    """Keep only URLs on the same domain as start_url.

    Two URLs are on the same domain when their hostnames match
    case-insensitively and their ports match, with a scheme's default port
    (80 for http, 443 for https) treated the same as no port. Credentials in
    the URL are ignored. URLs that cannot be parsed are dropped instead of
    raising; an unparseable ``start_url`` yields an empty list.
    """
    default_ports = {"http": 80, "https": 443}

    def _site(candidate: str) -> tuple[str, int | None] | None:
        try:
            parts = urlparse(candidate)
            port = parts.port
        except ValueError:
            # Malformed IPv6 literal or non-numeric port.
            return None
        if port == default_ports.get(parts.scheme.lower()):
            port = None
        return (parts.hostname or "", port)

    start_site = _site(start_url)
    if start_site is None:
        return []
    return [u for u in urls if _site(u) == start_site]