pull-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pull_cli/links.py ADDED
@@ -0,0 +1,186 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from urllib.parse import urlsplit
6
+
7
+ from bs4 import BeautifulSoup
8
+
9
+ from .assets import ATTACHMENT_PATH_RE
10
+ from .models import AssetRecord, LinkRecord, PageSummary, WarningRecord
11
+ from .paths import relative_path
12
+ from .resolver import page_id_from_url
13
+
14
# Matches every character that is NOT a lowercase ASCII letter, a digit, a
# space or a hyphen; used to strip such characters when building anchor slugs.
HEADING_CHARS_RE = re.compile(r"[^a-z0-9 -]")
15
+
16
+
17
def rewrite_html_links(
    html: str,
    *,
    page: PageSummary,
    page_index_path: str,
    pages_by_id: dict[str, PageSummary],
    page_paths: dict[str, str],
    assets: list[AssetRecord],
    rewrite_links: bool,
) -> tuple[str, list[LinkRecord], list[WarningRecord]]:
    """Rewrite the ``<img src>`` and ``<a href>`` targets of one page.

    Args:
        html: Markup of the page; a falsy value is parsed as an empty document.
        page: Page the markup belongs to; its ``page_id`` is recorded as the
            source of every link and warning.
        page_index_path: Path of the page's output file, used as the base
            when computing relative targets.
        pages_by_id: Known pages keyed by page id.
        page_paths: Output path for each page id.
        assets: Assets whose references may occur in the markup.
        rewrite_links: When false no tag attribute is modified and matched
            images are not recorded; ``<a>`` links are still classified.

    Returns:
        A 3-tuple of the serialized markup (the ``<body>`` element when the
        parser produced one, otherwise the whole document), the collected
        link records, and one warning per link record that carries a
        warning code.
    """
    document = BeautifulSoup(html or "", "lxml")
    collected: list[LinkRecord] = []
    problems: list[WarningRecord] = []
    assets_by_key = _asset_lookup(assets)
    known_anchors = _heading_anchors(document)

    # Images: only those that map onto a known asset are touched or recorded.
    for image in document.find_all("img"):
        source = image.get("src")
        if not isinstance(source, str):
            continue
        source_key = _asset_key(source)
        matched = assets_by_key.get(source_key)
        if not (matched and rewrite_links):
            continue
        local_source = relative_path(page_index_path, matched.local_path)
        image["src"] = local_source
        collected.append(
            LinkRecord(
                original=source,
                normalized=source_key,
                kind="asset",
                source_page_id=page.page_id,
                target_asset_id=matched.asset_id,
                rewritten=local_source,
                status="rewritten",
            )
        )

    # Hyperlinks: every string href yields a record, rewritten or not.
    for link in document.find_all("a"):
        target = link.get("href")
        if not isinstance(target, str):
            continue
        outcome = _rewrite_href(
            target,
            page=page,
            page_index_path=page_index_path,
            pages_by_id=pages_by_id,
            page_paths=page_paths,
            asset_by_original=assets_by_key,
            anchors=known_anchors,
            rewrite_links=rewrite_links,
        )
        collected.append(outcome)
        if rewrite_links and outcome.rewritten:
            link["href"] = outcome.rewritten
        if not outcome.warning:
            continue
        problems.append(
            WarningRecord(
                code=outcome.warning,
                message=f"Link could not be fully rewritten: {target}",
                source_page_id=page.page_id,
                details={"href": target},
            )
        )

    root = document.body or document
    return str(root), collected, problems
80
+
81
+
82
def markdown_anchor(text: str) -> str:
    """Convert heading text or a URL fragment into an anchor slug.

    The text is trimmed and lowercased, underscores and whitespace runs
    become hyphens, every character other than ``a-z``, ``0-9`` and ``-`` is
    dropped, repeated hyphens are collapsed and leading/trailing hyphens are
    removed.

    Args:
        text: Raw heading text, ``id``/``name`` value or fragment.

    Returns:
        The slug; an empty string when nothing usable remains.
    """
    value = text.strip().lower().replace("_", "-")
    # Hyphenate whitespace BEFORE stripping disallowed characters. Tabs,
    # newlines and non-breaking spaces are not in the allowed set, so
    # stripping first would delete them and glue the neighbouring words
    # together ("foo\tbar" -> "foobar") instead of separating them.
    value = re.sub(r"\s+", "-", value)
    value = HEADING_CHARS_RE.sub("", value)
    return re.sub(r"-+", "-", value).strip("-")
88
+
89
+
90
def _rewrite_href(
    href: str,
    *,
    page: PageSummary,
    page_index_path: str,
    pages_by_id: dict[str, PageSummary],
    page_paths: dict[str, str],
    asset_by_original: dict[str, AssetRecord],
    anchors: set[str],
    rewrite_links: bool,
) -> LinkRecord:
    """Classify a single ``href`` and compute its local replacement, if any.

    Checks run in this order; the first match wins:

    1. ``mailto:`` links and Jira links are preserved unchanged.
    2. ``#fragment`` links are slugified and looked up in ``anchors``.
    3. Links whose asset key is in ``asset_by_original`` point at the
       asset's local file.
    4. Links whose page id (per ``page_id_from_url``) is in ``pages_by_id``
       point at that page's output path, keeping any fragment as a slug.
    5. Links matching ``ATTACHMENT_PATH_RE`` that were not resolved above
       are reported as unresolved attachments.
    6. Remaining ``http(s)`` links are "external"; everything else is
       "unknown". Both are preserved.

    Args:
        href: The attribute value as found in the markup.
        page: Page containing the link; supplies the source page id.
        page_index_path: Base path for relative targets.
        pages_by_id: Known pages keyed by page id.
        page_paths: Output path for each page id.
        asset_by_original: Assets keyed as produced by ``_asset_key``.
        anchors: Slugs that exist on the current page.
        rewrite_links: Only consulted for ``#fragment`` links, where it
            decides whether the slugified form or the original is reported
            as the rewritten value.

    Returns:
        The record describing the link. Unresolved records carry a warning
        code.
    """
    if href.startswith("mailto:"):
        return LinkRecord(href, href, "mailto", page.page_id, status="preserved")
    if _is_jira(href):
        return LinkRecord(href, href, "jira", page.page_id, status="preserved")

    if href.startswith("#"):
        anchor = markdown_anchor(href[1:])
        resolved = anchor in anchors
        return LinkRecord(
            href,
            anchor,
            "anchor",
            page.page_id,
            rewritten=f"#{anchor}" if anchor and rewrite_links else href,
            status="rewritten" if resolved else "unresolved",
            warning=None if resolved else "W_LINK_ANCHOR_UNRESOLVED",
        )

    asset = asset_by_original.get(_asset_key(href))
    if asset:
        return LinkRecord(
            href,
            _asset_key(href),
            "attachment",
            page.page_id,
            target_asset_id=asset.asset_id,
            rewritten=relative_path(page_index_path, asset.local_path),
            status="rewritten",
        )

    target_page_id = page_id_from_url(href)
    if target_page_id and target_page_id in pages_by_id:
        target_path = page_paths.get(target_page_id)
        if target_path is None:
            # The page is known but has no output path. Report the link as
            # unresolved instead of letting a KeyError abort the whole page.
            return LinkRecord(
                href,
                target_page_id,
                "page",
                page.page_id,
                target_page_id=target_page_id,
                status="unresolved",
                warning="W_LINK_UNRESOLVED",
            )
        rewritten = relative_path(page_index_path, target_path)
        fragment = urlsplit(href).fragment
        slug = markdown_anchor(fragment) if fragment else ""
        if slug:
            # A fragment that slugifies to nothing is dropped rather than
            # leaving a dangling "#" on the rewritten link.
            rewritten += f"#{slug}"
        return LinkRecord(
            href,
            target_page_id,
            "page",
            page.page_id,
            target_page_id=target_page_id,
            rewritten=rewritten,
            status="rewritten",
        )

    if ATTACHMENT_PATH_RE.search(href):
        return LinkRecord(
            href,
            _asset_key(href),
            "attachment",
            page.page_id,
            status="unresolved",
            warning="W_LINK_UNRESOLVED",
        )

    kind = "external" if href.startswith(("http://", "https://")) else "unknown"
    return LinkRecord(href, href, kind, page.page_id, status="preserved")
152
+
153
+
154
def _heading_anchors(soup: BeautifulSoup) -> set[str]:
    """Collect the anchor slugs available in a parsed document.

    For every ``h1``-``h6`` element both its ``id`` (when set) and its
    visible text (when non-empty) are slugified; the ``name`` attribute of
    any element that has one is slugified as well.

    Args:
        soup: Parsed document to scan.

    Returns:
        The set of slugs produced by ``markdown_anchor``.
    """
    heading_tags = re.compile("^h[1-6]$")
    slugs: set[str] = set()

    for heading in soup.find_all(heading_tags):
        heading_id = heading.get("id")
        if heading_id:
            slugs.add(markdown_anchor(str(heading_id)))
        label = heading.get_text(" ", strip=True)
        if label:
            slugs.add(markdown_anchor(label))

    slugs.update(
        markdown_anchor(str(named["name"]))
        for named in soup.find_all(attrs={"name": True})
    )
    return slugs
165
+
166
+
167
+ def _asset_lookup(assets: list[AssetRecord]) -> dict[str, AssetRecord]:
168
+ lookup: dict[str, AssetRecord] = {}
169
+ for asset in assets:
170
+ for ref in asset.references:
171
+ lookup[_asset_key(ref.original)] = asset
172
+ if asset.source_url:
173
+ lookup[_asset_key(asset.source_url)] = asset
174
+ lookup[Path(asset.filename).name.lower()] = asset
175
+ return lookup
176
+
177
+
178
+ def _asset_key(url: str) -> str:
179
+ parsed = urlsplit(url)
180
+ if parsed.path:
181
+ return parsed.path.lower()
182
+ return url.lower()
183
+
184
+
185
+ def _is_jira(href: str) -> bool:
186
+ return bool(re.search(r"/browse/[A-Z][A-Z0-9]+-\d+", href)) or href.startswith("jira:")