pull-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pull_cli/__init__.py +5 -0
- pull_cli/__main__.py +6 -0
- pull_cli/assets.py +235 -0
- pull_cli/attachment_extractors.py +85 -0
- pull_cli/cli.py +329 -0
- pull_cli/clients/__init__.py +8 -0
- pull_cli/clients/base.py +29 -0
- pull_cli/clients/cloud_v2.py +132 -0
- pull_cli/clients/data_center.py +360 -0
- pull_cli/clients/hybrid.py +15 -0
- pull_cli/config.py +82 -0
- pull_cli/crawler.py +51 -0
- pull_cli/envelope.py +59 -0
- pull_cli/errors.py +50 -0
- pull_cli/extractor.py +344 -0
- pull_cli/guide.py +115 -0
- pull_cli/html_normalizer.py +111 -0
- pull_cli/links.py +186 -0
- pull_cli/macros.py +527 -0
- pull_cli/markdown_writer.py +24 -0
- pull_cli/models.py +232 -0
- pull_cli/paths.py +45 -0
- pull_cli/resolver.py +72 -0
- pull_cli/security.py +103 -0
- pull_cli/validator.py +398 -0
- pull_cli/writer.py +792 -0
- pull_cli-0.1.0.dist-info/METADATA +218 -0
- pull_cli-0.1.0.dist-info/RECORD +31 -0
- pull_cli-0.1.0.dist-info/WHEEL +4 -0
- pull_cli-0.1.0.dist-info/entry_points.txt +3 -0
- pull_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
pull_cli/links.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from urllib.parse import urlsplit
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
|
|
9
|
+
from .assets import ATTACHMENT_PATH_RE
|
|
10
|
+
from .models import AssetRecord, LinkRecord, PageSummary, WarningRecord
|
|
11
|
+
from .paths import relative_path
|
|
12
|
+
from .resolver import page_id_from_url
|
|
13
|
+
|
|
14
|
+
# Matches any single character that is NOT a lowercase ASCII letter, a digit,
# a space or a hyphen. markdown_anchor() uses it to delete such characters
# after lowercasing the text.
HEADING_CHARS_RE = re.compile(r"[^a-z0-9 -]")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def rewrite_html_links(
    html: str,
    *,
    page: PageSummary,
    page_index_path: str,
    pages_by_id: dict[str, PageSummary],
    page_paths: dict[str, str],
    assets: list[AssetRecord],
    rewrite_links: bool,
) -> tuple[str, list[LinkRecord], list[WarningRecord]]:
    """Rewrite image sources and anchor hrefs in ``html`` and report on them.

    The markup is parsed with BeautifulSoup's ``lxml`` parser (``None`` or
    empty input is treated as an empty document).

    * ``<img src>``: when the source maps to a known asset *and*
      ``rewrite_links`` is true, the attribute is replaced by the asset's
      path relative to ``page_index_path`` and an ``"asset"`` link record is
      added. Images that do not match, or any image while ``rewrite_links``
      is false, are left alone and produce no record.
    * ``<a href>``: every string href is classified by ``_rewrite_href`` and
      always recorded. The attribute itself is only replaced when the record
      carries a rewritten value and ``rewrite_links`` is true. A record with
      a warning code additionally yields a ``WarningRecord``.

    Returns:
        A tuple ``(markup, links, warnings)`` where ``markup`` is the
        serialized ``<body>`` element when the parser produced one, otherwise
        the serialized document.
    """
    soup = BeautifulSoup(html or "", "lxml")
    asset_by_original = _asset_lookup(assets)
    anchors = _heading_anchors(soup)
    links: list[LinkRecord] = []
    warnings: list[WarningRecord] = []

    # Pass 1: images that point at a downloaded asset.
    for image in soup.find_all("img"):
        source = image.get("src")
        if not isinstance(source, str):
            continue
        key = _asset_key(source)
        matched = asset_by_original.get(key)
        if not (matched and rewrite_links):
            continue
        local_src = relative_path(page_index_path, matched.local_path)
        image["src"] = local_src
        links.append(
            LinkRecord(
                original=source,
                normalized=key,
                kind="asset",
                source_page_id=page.page_id,
                target_asset_id=matched.asset_id,
                rewritten=local_src,
                status="rewritten",
            )
        )

    # Pass 2: hyperlinks of every kind.
    for link_tag in soup.find_all("a"):
        target = link_tag.get("href")
        if not isinstance(target, str):
            continue
        outcome = _rewrite_href(
            target,
            page=page,
            page_index_path=page_index_path,
            pages_by_id=pages_by_id,
            page_paths=page_paths,
            asset_by_original=asset_by_original,
            anchors=anchors,
            rewrite_links=rewrite_links,
        )
        links.append(outcome)
        if rewrite_links and outcome.rewritten:
            link_tag["href"] = outcome.rewritten
        if not outcome.warning:
            continue
        warnings.append(
            WarningRecord(
                code=outcome.warning,
                message=f"Link could not be fully rewritten: {target}",
                source_page_id=page.page_id,
                details={"href": target},
            )
        )

    document = soup.body or soup
    return str(document), links, warnings
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def markdown_anchor(text: str) -> str:
    """Turn heading text or a fragment into a lowercase, hyphenated slug.

    Steps, in order: trim and lowercase, turn underscores into hyphens, delete
    every character matched by ``HEADING_CHARS_RE``, turn whitespace runs into
    a hyphen, collapse repeated hyphens, and strip hyphens from both ends.
    The result may be an empty string.
    """
    lowered = text.strip().lower()
    cleaned = HEADING_CHARS_RE.sub("", lowered.replace("_", "-"))
    hyphenated = re.sub(r"\s+", "-", cleaned)
    collapsed = re.sub(r"-+", "-", hyphenated)
    return collapsed.strip("-")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _rewrite_href(
    href: str,
    *,
    page: PageSummary,
    page_index_path: str,
    pages_by_id: dict[str, PageSummary],
    page_paths: dict[str, str],
    asset_by_original: dict[str, AssetRecord],
    anchors: set[str],
    rewrite_links: bool,
) -> LinkRecord:
    """Classify a single ``href`` and compute its replacement, if any.

    Checks run in this order and the first match wins:

    1. ``mailto:`` links and Jira links are preserved untouched.
    2. Same-page fragments (``#...``) are slugified with ``markdown_anchor``
       and looked up in ``anchors``; a miss is reported as
       ``W_LINK_ANCHOR_UNRESOLVED``.
    3. Links whose asset key is in ``asset_by_original`` point at the asset's
       local path, relative to ``page_index_path``.
    4. Links to a page present in ``pages_by_id`` point at that page's path
       from ``page_paths`` (plus a slugified fragment when one is present).
    5. Links matching ``ATTACHMENT_PATH_RE`` that were not resolved in step 3
       are reported as ``W_LINK_UNRESOLVED``.
    6. Everything else is preserved as ``"external"`` (http/https) or
       ``"unknown"``.

    This function never mutates the document; the caller decides whether to
    apply ``LinkRecord.rewritten``.
    """
    # Local import: the module's top-level import only brings in ``urlsplit``.
    from urllib.parse import unquote

    if href.startswith("mailto:"):
        return LinkRecord(href, href, "mailto", page.page_id, status="preserved")
    if _is_jira(href):
        return LinkRecord(href, href, "jira", page.page_id, status="preserved")

    if href.startswith("#"):
        # Fragments in hrefs may be percent-encoded ("#My%20Heading") while
        # heading ids/text are not; decode first so both sides slugify alike.
        anchor = markdown_anchor(unquote(href[1:]))
        resolved = anchor in anchors
        rewritten = f"#{anchor}" if anchor and rewrite_links else href
        return LinkRecord(
            href,
            anchor,
            "anchor",
            page.page_id,
            rewritten=rewritten,
            status="rewritten" if resolved else "unresolved",
            warning=None if resolved else "W_LINK_ANCHOR_UNRESOLVED",
        )

    asset_key = _asset_key(href)
    asset = asset_by_original.get(asset_key)
    if asset:
        return LinkRecord(
            href,
            asset_key,
            "attachment",
            page.page_id,
            target_asset_id=asset.asset_id,
            rewritten=relative_path(page_index_path, asset.local_path),
            status="rewritten",
        )

    target_page_id = page_id_from_url(href)
    if target_page_id and target_page_id in pages_by_id:
        target_path = page_paths.get(target_page_id)
        if target_path is None:
            # The page is known but has no output path. Report it instead of
            # raising KeyError and aborting the rewrite of the whole page.
            return LinkRecord(
                href,
                target_page_id,
                "page",
                page.page_id,
                target_page_id=target_page_id,
                status="unresolved",
                warning="W_LINK_UNRESOLVED",
            )
        rewritten = relative_path(page_index_path, target_path)
        fragment = urlsplit(href).fragment
        if fragment:
            # Same decoding rule as for same-page fragments above.
            rewritten += f"#{markdown_anchor(unquote(fragment))}"
        return LinkRecord(
            href,
            target_page_id,
            "page",
            page.page_id,
            target_page_id=target_page_id,
            rewritten=rewritten,
            status="rewritten",
        )

    if ATTACHMENT_PATH_RE.search(href):
        # Looks like an attachment URL, but no downloaded asset matched it.
        return LinkRecord(
            href,
            asset_key,
            "attachment",
            page.page_id,
            status="unresolved",
            warning="W_LINK_UNRESOLVED",
        )

    kind = "external" if href.startswith(("http://", "https://")) else "unknown"
    return LinkRecord(href, href, kind, page.page_id, status="preserved")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _heading_anchors(soup: BeautifulSoup) -> set[str]:
    """Collect the slugified anchor names available in ``soup``.

    For every ``h1``-``h6`` element the slug of its ``id`` attribute (when
    non-empty) and the slug of its text content (when non-empty) are added.
    In addition, the slug of the ``name`` attribute of every element that has
    one is added. All slugs come from ``markdown_anchor``.
    """
    found: set[str] = set()

    for heading in soup.find_all(re.compile("^h[1-6]$")):
        heading_id = heading.get("id")
        if heading_id:
            found.add(markdown_anchor(str(heading_id)))
        label = heading.get_text(" ", strip=True)
        if label:
            found.add(markdown_anchor(label))

    found.update(
        markdown_anchor(str(named["name"]))
        for named in soup.find_all(attrs={"name": True})
    )
    return found
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _asset_lookup(assets: list[AssetRecord]) -> dict[str, AssetRecord]:
    """Index ``assets`` by every key a link might use to refer to them.

    Each asset is registered under the ``_asset_key`` of each of its
    references' ``original`` value, under the ``_asset_key`` of its
    ``source_url`` (when truthy), and under the lowercased final path
    component of its ``filename``. When two registrations produce the same
    key, the later one replaces the earlier one.
    """
    table: dict[str, AssetRecord] = {}
    for record in assets:
        table.update(
            (_asset_key(ref.original), record) for ref in record.references
        )
        source_url = record.source_url
        if source_url:
            table[_asset_key(source_url)] = record
        basename = Path(record.filename).name
        table[basename.lower()] = record
    return table
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _asset_key(url: str) -> str:
|
|
179
|
+
parsed = urlsplit(url)
|
|
180
|
+
if parsed.path:
|
|
181
|
+
return parsed.path.lower()
|
|
182
|
+
return url.lower()
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _is_jira(href: str) -> bool:
|
|
186
|
+
return bool(re.search(r"/browse/[A-Z][A-Z0-9]+-\d+", href)) or href.startswith("jira:")
|