pull-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pull_cli/envelope.py ADDED
@@ -0,0 +1,59 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import time
6
+ import uuid
7
+ from dataclasses import asdict
8
+ from typing import Any
9
+
10
+ from .errors import PullError
11
+ from .models import WarningRecord
12
+ from .security import redact_value
13
+
14
+ SCHEMA_VERSION = "1.0"
15
+
16
+
17
def wants_json(explicit: bool) -> bool:
    """Report whether output should be emitted as JSON.

    JSON is chosen when the caller requested it explicitly, or when the
    ``LLM`` environment variable equals ``"true"`` (compared
    case-insensitively).
    """
    if explicit:
        return explicit
    llm_flag = os.environ.get("LLM", "")
    return llm_flag.lower() == "true"
19
+
20
+
21
def request_id() -> str:
    """Build an identifier shaped like ``req_YYYYMMDD_HHMMSS_<8 hex chars>``.

    The timestamp part comes from ``time.strftime`` and the suffix is the
    first eight hex digits of a freshly generated UUID4.
    """
    stamp = time.strftime("req_%Y%m%d_%H%M%S_")
    suffix = uuid.uuid4().hex[:8]
    return f"{stamp}{suffix}"
23
+
24
+
25
def make_envelope(
    *,
    ok: bool,
    command: str,
    target: dict[str, Any] | None = None,
    result: dict[str, Any] | None = None,
    warnings: list[WarningRecord | dict[str, Any]] | None = None,
    errors: list[PullError | dict[str, Any]] | None = None,
    metrics: dict[str, Any] | None = None,
    request_id_value: str | None = None,
) -> dict[str, Any]:
    """Assemble the JSON response envelope for a command.

    ``WarningRecord`` entries are converted with ``to_dict()`` and
    ``PullError`` entries with ``to_record()``; plain dictionaries are passed
    through untouched. ``result`` is only reported when ``ok`` is true,
    otherwise the envelope carries ``None``. Missing ``target`` and
    ``metrics`` become empty dictionaries, and a request id is generated when
    ``request_id_value`` is not supplied.
    """
    warning_records = []
    for item in warnings or []:
        if isinstance(item, WarningRecord):
            warning_records.append(item.to_dict())
        else:
            warning_records.append(item)

    error_records = []
    for item in errors or []:
        if isinstance(item, PullError):
            error_records.append(item.to_record())
        else:
            error_records.append(item)

    # Keys are inserted in the documented envelope order.
    envelope: dict[str, Any] = {"schema_version": SCHEMA_VERSION}
    envelope["request_id"] = request_id_value or request_id()
    envelope["ok"] = ok
    envelope["command"] = command
    envelope["target"] = target or {}
    envelope["result"] = result if ok else None
    envelope["warnings"] = warning_records
    envelope["errors"] = error_records
    envelope["metrics"] = metrics or {}
    return envelope
52
+
53
+
54
def emit_json(data: dict[str, Any]) -> None:
    """Print ``data`` as one compact JSON line after passing it through ``redact_value``."""
    safe_payload = redact_value(data)
    serialized = json.dumps(safe_payload, ensure_ascii=False, separators=(",", ":"))
    print(serialized)
56
+
57
+
58
def dataclass_dict(value: Any) -> dict[str, Any]:
    """Convert a dataclass instance into a dictionary via ``dataclasses.asdict``."""
    converted: dict[str, Any] = asdict(value)
    return converted
pull_cli/errors.py ADDED
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
# Process exit codes, grouped by failure category.
EXIT_SUCCESS = 0
EXIT_VALIDATION = 10
EXIT_AUTH = 20
EXIT_SOURCE = 30
EXIT_STRICT_PARTIAL = 40
EXIT_IO = 50
EXIT_INTERNAL = 90


@dataclass
class PullError(Exception):
    """Structured error raised by the CLI and reported in the JSON envelope.

    The instance carries a machine-readable ``code``, a human-readable
    ``message``, the process ``exit_code`` and optional recovery hints.
    """

    # Machine-readable error identifier.
    code: str
    # Human-readable description.
    message: str
    # Process exit status associated with this error.
    exit_code: int = EXIT_INTERNAL
    # Whether the caller may retry the operation.
    retryable: bool = False
    # Optional hint shown to the caller.
    suggested_action: str | None = None
    # Free-form structured context.
    details: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls Exception.__init__, so
        # keyword construction would leave ``args`` empty. ``copy`` and
        # ``pickle`` rebuild exceptions as ``cls(*args)``, which then fails
        # for the required fields; mirroring the fields into ``args`` keeps
        # both round-trips working.
        self.args = (
            self.code,
            self.message,
            self.exit_code,
            self.retryable,
            self.suggested_action,
            self.details,
        )

    def __str__(self) -> str:
        """Return ``"<code>: <message>"``."""
        return f"{self.code}: {self.message}"

    def to_record(self) -> dict[str, Any]:
        """Return the envelope representation (``exit_code`` is not included)."""
        return {
            "code": self.code,
            "message": self.message,
            "retryable": self.retryable,
            "suggested_action": self.suggested_action,
            "details": self.details,
        }
35
+
36
+
37
def validation_error(
    code: str,
    message: str,
    *,
    suggested_action: str | None = None,
    details: dict[str, Any] | None = None,
) -> PullError:
    """Create a ``PullError`` whose exit code is ``EXIT_VALIDATION``.

    A missing or empty ``details`` argument is replaced by a new empty
    dictionary.
    """
    payload = details or {}
    return PullError(
        code=code,
        message=message,
        exit_code=EXIT_VALIDATION,
        suggested_action=suggested_action,
        details=payload,
    )
pull_cli/extractor.py ADDED
@@ -0,0 +1,344 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from .assets import discover_asset_candidates, download_assets, skipped_asset_warnings
6
+ from .clients.base import ConfluenceClient
7
+ from .crawler import crawl_pages
8
+ from .errors import EXIT_STRICT_PARTIAL, PullError
9
+ from .html_normalizer import normalize_html
10
+ from .links import rewrite_html_links
11
+ from .macros import MacroContext, MacroRegistry
12
+ from .markdown_writer import rendered_html_to_markdown
13
+ from .models import (
14
+ CommentRecord,
15
+ ExtractionResult,
16
+ PageArtifact,
17
+ PageSummary,
18
+ PullOptions,
19
+ WarningRecord,
20
+ )
21
+ from .paths import relative_path, slugify
22
+ from .security import redact_source_url_text, redact_text
23
+ from .writer import (
24
+ page_markdown_header,
25
+ prepare_output_dir,
26
+ write_bundle,
27
+ write_diagnostics,
28
+ write_manifest,
29
+ write_page_artifact,
30
+ )
31
+
32
+
33
def extract(
    *,
    client: ConfluenceClient,
    root: PageSummary,
    options: PullOptions,
) -> ExtractionResult:
    """Crawl from ``root``, convert every page and write the output package.

    For each crawled page the function fetches the page, optionally collects
    comments, normalizes the rendered HTML, downloads assets, rewrites links,
    converts macros and writes the per-page artifact. After all pages are
    processed it writes the bundle, optional chunks, diagnostics and the
    manifest, then records ``api_calls``, ``pages`` and ``assets`` in
    ``result.metrics``.

    Raises:
        PullError: propagated from ``_enforce_strict_macros`` when the strict
            macro policy rejects a conversion.
    """
    prepare_output_dir(options.output, force=options.force, clean=options.clean)
    summaries = crawl_pages(
        client,
        root,
        tree=options.tree,
        depth=options.depth,
        max_pages=options.max_pages,
    )
    page_paths = _page_paths(summaries, options=options)
    pages_by_id = {}
    for entry in summaries:
        pages_by_id[entry.page_id] = entry
    registry = MacroRegistry()

    bundle_path = options.output / "bundle.md" if options.write_bundle else None
    result = ExtractionResult(
        output_dir=options.output,
        manifest_path=options.output / "manifest.yaml",
        bundle_path=bundle_path,
        pages=[],
        assets=[],
        warnings=[],
        links=[],
        macros=[],
    )

    for summary in summaries:
        page = client.get_page(summary.page_id)
        # Position in the tree always comes from the crawl; title and URL
        # from the crawl are only used when the fetched page lacks them.
        page.order = summary.order
        page.depth = summary.depth
        page.parent_id = summary.parent_id
        page.title = page.title or summary.title
        page.url = page.url or summary.url

        # Package-relative locations of this page's files.
        page_dir = page_paths[page.page_id].removesuffix("/index.md")
        index_md = f"{page_dir}/index.md"
        index_html = None
        if options.write_html:
            index_html = f"{page_dir}/index.html"
        source_path = None
        if options.write_source and page.body_storage:
            source_path = f"{page_dir}/source.storage.xml"
        page_json = f"{page_dir}/page.json"

        comments, comment_warnings = _collect_comments(client, page.page_id, options=options)
        comments_path = f"{page_dir}/comments.md" if comments else None

        rendered = _select_rendered_body(page.body_view, page.body_export_view, page.body_storage)
        normalized_html, html_warnings = normalize_html(rendered, source_page_id=page.page_id)

        attachments = client.list_attachments(page.page_id)
        candidates = discover_asset_candidates(
            normalized_html,
            page_id=page.page_id,
            attachments=attachments,
            options=options,
        )
        assets, asset_warnings = download_assets(
            candidates,
            page_id=page.page_id,
            page_assets_dir=options.output / page_dir / "assets",
            page_assets_path=f"{page_dir}/assets",
            client=client,
            extract_attachments=options.extract_attachments,
        )
        if options.no_assets:
            asset_warnings.extend(skipped_asset_warnings(normalized_html, page_id=page.page_id))

        rewritten_html, links, link_warnings = rewrite_html_links(
            normalized_html,
            page=page,
            page_index_path=index_md,
            pages_by_id=pages_by_id,
            page_paths=page_paths,
            assets=assets,
            rewrite_links=options.rewrite_links,
        )
        if options.redact_manifest or options.redact_source_urls:
            _redact_links(links, redact_source_urls=options.redact_source_urls)
        if options.redact_source_urls:
            # Second normalization pass with URL redaction enabled; the
            # warnings it produces are intentionally not collected.
            rewritten_html, _redaction_warnings = normalize_html(
                rewritten_html,
                source_page_id=page.page_id,
                redact_source_urls=True,
            )

        macro_context = MacroContext(
            page_id=page.page_id,
            attachments=attachments,
            options=options,
            child_links=_child_links(page, summaries, page_paths),
        )
        macros = registry.convert_all(page.body_storage, macro_context)
        _enforce_strict_macros(macros, options=options)
        macro_warnings = []
        for macro in macros:
            for macro_warning in macro.warnings:
                macro_warnings.append(macro_warning)

        visible_markdown = rendered_html_to_markdown(rewritten_html)
        attachment_markdown = _attachment_markdown(assets, page_index_path=index_md)
        if attachment_markdown:
            visible_markdown = visible_markdown.rstrip() + "\n\n" + attachment_markdown + "\n"
        macro_markdown = _macro_recovery_markdown(macros)

        page_warnings = [
            *html_warnings,
            *asset_warnings,
            *link_warnings,
            *macro_warnings,
            *comment_warnings,
        ]
        artifact = PageArtifact(
            page=page,
            order=page.order,
            page_dir=page_dir,
            index_md=index_md,
            index_html=index_html,
            source_path=source_path,
            page_json=page_json,
            markdown="",
            html=rewritten_html,
            assets=assets,
            links=links,
            macros=macros,
            warnings=page_warnings,
            comments_path=comments_path,
            comments=comments,
        )
        # The header is rendered from the artifact itself, so the markdown is
        # filled in after construction.
        recovery_section = ""
        if macro_markdown:
            recovery_section = "\n\n## Macro Recovery\n\n" + macro_markdown + "\n"
        artifact.markdown = (
            page_markdown_header(artifact, options=options) + visible_markdown + recovery_section
        )
        write_page_artifact(options.output, artifact, options=options)

        result.pages.append(artifact)
        result.assets.extend(assets)
        result.links.extend(links)
        result.macros.extend(macros)
        result.warnings.extend(artifact.warnings)

    unresolved = []
    for link in result.links:
        if link.status == "unresolved" or link.warning == "W_LINK_ANCHOR_UNRESOLVED":
            unresolved.append(link.__dict__)

    root_title = result.pages[0].page.title if result.pages else root.title
    write_bundle(result, root_title=root_title, options=options)
    if options.write_chunks:
        _write_chunks(result)
    write_diagnostics(options.output, result.warnings, unresolved)
    write_manifest(
        result,
        options=options,
        root_page_id=root.page_id,
        base_url=client.base_url,
        deployment_type=client.deployment_type,
    )

    result.metrics["api_calls"] = client.api_calls
    result.metrics["pages"] = len(result.pages)
    result.metrics["assets"] = len(result.assets)
    return result
176
+
177
+
178
def _select_rendered_body(view: str | None, export_view: str | None, storage: str | None) -> str:
    """Return the first non-empty body among view, export view and storage, else ``""``."""
    for candidate in (view, export_view, storage):
        if candidate:
            return candidate
    return ""
180
+
181
+
182
def _macro_recovery_markdown(macros) -> str:
    """Join the stripped markdown of all non-ignored macros with blank lines.

    Macros without markdown, macros whose status is ``"ignored"`` and macros
    whose markdown is only whitespace contribute nothing.
    """
    sections = []
    for macro in macros:
        if not macro.markdown or macro.status == "ignored":
            continue
        text = macro.markdown.strip()
        if text:
            sections.append(text)
    return "\n\n".join(sections)
185
+
186
+
187
def _attachment_markdown(assets, *, page_index_path: str) -> str:
    """Render a Markdown "Attachments" table for assets that have an attachment id.

    Each row lists the filename, the local path with a link relative to
    ``page_index_path``, the media type, and any extracted sidecar files.
    Returns an empty string when no asset carries an attachment id.
    """

    def _cell(text: str) -> str:
        # Table rows are split on "|" before inline content is parsed, so a
        # literal pipe (e.g. in an attachment filename) must be escaped or it
        # would start a new column.
        return text.replace("|", "\\|")

    rows = []
    for asset in assets:
        if not asset.attachment_id:
            continue
        asset_link = relative_path(page_index_path, asset.local_path)
        sidecars = ", ".join(
            f"`{sidecar}` ([open]({relative_path(page_index_path, sidecar)}))"
            for sidecar in asset.sidecars
        )
        cells = [
            asset.filename,
            f"`{asset.local_path}` ([open]({asset_link}))",
            asset.media_type or "",
            sidecars,
        ]
        rows.append("| " + " | ".join(_cell(cell) for cell in cells) + " |")
    if not rows:
        return ""
    return "\n".join(
        [
            "## Attachments",
            "",
            "| Filename | Local path | Media type | Extracted sidecars |",
            "| --- | --- | --- | --- |",
            *rows,
        ]
    )
222
+
223
+
224
def _collect_comments(
    client: ConfluenceClient, page_id: str, *, options: PullOptions
) -> tuple[list[CommentRecord], list[WarningRecord]]:
    """Fetch de-duplicated comments for a page when ``options.comments`` is set.

    Returns ``(comments, warnings)``. Fetching is best-effort: any exception
    is turned into a single ``W_COMMENTS_FETCH_FAILED`` warning carrying a
    redacted reason, and no comments are returned.
    """
    if not options.comments:
        return [], []
    try:
        fetched = client.list_comments(page_id)
        deduplicated = _unique_comments(fetched)
    except Exception as exc:  # noqa: BLE001
        failure = WarningRecord(
            code="W_COMMENTS_FETCH_FAILED",
            message="Could not fetch Confluence comments for this page.",
            source_page_id=page_id,
            details={"reason": _redacted_warning_reason(exc, options=options)},
        )
        return [], [failure]
    return deduplicated, []
240
+
241
+
242
def _unique_comments(comments: list[CommentRecord]) -> list[CommentRecord]:
    """Drop comments whose non-empty ``comment_id`` was already seen, keeping order.

    Comments without a ``comment_id`` are always kept.
    """
    kept: list[CommentRecord] = []
    known_ids: set[str] = set()
    for entry in comments:
        identifier = entry.comment_id
        if identifier:
            if identifier in known_ids:
                continue
            known_ids.add(identifier)
        kept.append(entry)
    return kept
252
+
253
+
254
def _redacted_warning_reason(exc: Exception, *, options: PullOptions) -> str:
    """Return ``str(exc)`` passed through ``redact_text``, plus source-URL redaction when enabled."""
    reason = redact_text(str(exc))
    if options.redact_source_urls:
        return redact_source_url_text(reason)
    return reason
257
+
258
+
259
def _redact_links(links, *, redact_source_urls: bool) -> None:
    """Sanitize the ``original`` and ``normalized`` fields of each link in place.

    Each field is replaced by the ``sanitize_url`` result, or by the
    ``redact_text`` result when ``sanitize_url`` returns a falsy value.
    """
    from .security import redact_text, sanitize_url

    for link in links:
        for field_name in ("original", "normalized"):
            raw = getattr(link, field_name)
            cleaned = sanitize_url(raw, redact_source_url=redact_source_urls) or redact_text(raw)
            setattr(link, field_name, cleaned)
265
+
266
+
267
def _enforce_strict_macros(macros, *, options: PullOptions) -> None:
    """Reject partial macro conversions when a strict policy is active.

    Strict mode applies when ``options.macro_policy`` is ``"strict"`` or
    ``options.unknown_macro`` is ``"error"``. A macro counts as a failure when
    its status is ``"placeholder"`` or ``"error"``, or when it carries any
    warnings.

    Raises:
        PullError: with exit code ``EXIT_STRICT_PARTIAL`` and the failing
            macros listed under ``details["macros"]``.
    """
    if options.macro_policy != "strict" and options.unknown_macro != "error":
        return

    failures = []
    for macro in macros:
        if macro.status in {"placeholder", "error"} or macro.warnings:
            failures.append(
                {
                    "macro_id": macro.macro_id,
                    "name": macro.name,
                    "status": macro.status,
                    "warnings": [warning.code for warning in macro.warnings],
                }
            )
    if not failures:
        return
    raise PullError(
        code="ERR_INTERNAL_CONVERSION",
        message="Strict macro policy rejected one or more partial macro conversions.",
        exit_code=EXIT_STRICT_PARTIAL,
        suggested_action="Use --macro-policy expand or --unknown-macro warn to allow placeholders.",
        details={"macros": failures},
    )
289
+
290
+
291
def _page_paths(summaries: list[PageSummary], *, options: PullOptions) -> dict[str, str]:
    """Map each page id to its package-relative ``index.md`` path.

    The ``"auto"`` layout resolves to ``"nested"`` when ``options.tree`` is
    set and to ``"flat"`` otherwise. In the nested layout a page is placed
    under its parent's directory when the parent already has a path; every
    other page goes directly under ``pages/``.
    """
    layout = options.layout
    if layout == "auto":
        layout = "nested" if options.tree else "flat"
    nested = layout == "nested"

    paths: dict[str, str] = {}
    for summary in summaries:
        segment = f"{summary.order:04d}-{slugify(summary.title, fallback=summary.page_id)}"
        parent_index = paths.get(summary.parent_id) if nested and summary.parent_id else None
        if parent_index is not None:
            parent_dir = str(Path(parent_index).parent).replace("\\", "/")
            paths[summary.page_id] = f"{parent_dir}/{segment}/index.md"
        else:
            # NOTE(review): a child listed before its parent lands here and is
            # flattened under pages/; the original had a separate but
            # identical branch for that case, so the outcome is unchanged.
            paths[summary.page_id] = f"pages/{segment}/index.md"
    return paths
307
+
308
+
309
def _child_links(
    page: PageSummary, summaries: list[PageSummary], page_paths: dict[str, str]
) -> list[tuple[str, str]]:
    """Return ``(title, relative path)`` pairs for the direct children of ``page``.

    Paths are computed relative to the page's own index path, or relative to
    ``""`` when the page has no entry in ``page_paths``.
    """
    source_index = page_paths.get(page.page_id, "")
    # ``relative_path`` comes from the module-level import; it used to be
    # re-imported inside the loop body for every matching child.
    return [
        (summary.title, relative_path(source_index, page_paths[summary.page_id]))
        for summary in summaries
        if summary.parent_id == page.page_id
    ]
320
+
321
+
322
def _write_chunks(result: ExtractionResult) -> None:
    """Write ``chunks.jsonl`` with one JSON record per non-empty markdown paragraph.

    Paragraphs are the blank-line-separated blocks of each page's markdown;
    chunk ids are ``<page_id>-<NNNN>`` with a 1-based counter per page.
    """
    import json

    chunks_path = result.output_dir / "chunks.jsonl"
    lines = []
    for artifact in result.pages:
        position = 0
        for raw_block in artifact.markdown.split("\n\n"):
            text = raw_block.strip()
            if not text:
                continue
            position += 1
            record = {
                "schema_version": "1.0",
                "chunk_id": f"{artifact.page.page_id}-{position:04d}",
                "page_id": artifact.page.page_id,
                "title": artifact.page.title,
                "source_path": artifact.index_md,
                "order": artifact.order,
                "text": text,
            }
            lines.append(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")
    chunks_path.write_text("".join(lines), encoding="utf-8")
pull_cli/guide.py ADDED
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+
4
def guide_payload() -> dict[str, object]:
    """Return the machine-readable usage guide for the CLI.

    The payload describes the available commands and options, the output
    layout, the JSON envelope shape, known error and warning codes, example
    invocations and compatibility notes. A new structure is built on every
    call.
    """
    pull_options = {
        "scope": ["--tree", "--depth N", "--max-pages N"],
        "output": [
            "-o/--output PATH",
            "--output-mode simple|full",
            "--force",
            "--clean",
            "--html/--no-html",
            "--source/--no-source",
            "--bundle/--no-bundle",
        ],
        "assets": ["--assets visible|page|all", "--no-assets", "--extract-attachments", "--diagram-sources"],
        "comments": ["--comments"],
        "links": ["--rewrite-links/--no-rewrite-links", "--follow-includes", "--follow-links same-tree|same-space|none"],
        "agent": ["--json", "LLM=true"],
    }
    pull_command = {
        "usage": "pull PAGE_REF [OPTIONS]",
        "selectors": ["PAGE_REF", "--page-id", "--url", "--space + --title"],
        "resolution_order": [
            "explicit --page-id",
            "explicit --url",
            "positional URL",
            "positional numeric page ID",
            "--space + --title",
        ],
        "options": pull_options,
    }
    commands = {
        "pull": pull_command,
        "validate": {"usage": "pull validate MANIFEST_OR_OUTPUT_DIR [--json]"},
        "guide": {"usage": "pull guide [--json]"},
    }

    mode_rules = {
        "simple": "Default. Agent-facing output is the sanitized root AI Markdown, page index.md files, and downloaded assets/sidecars. Control files are still written for validation/provenance but are not linked from the root AI Markdown.",
        "full": "Writes the current full evidence package: bundle.md, page index.html snapshots, source.storage.xml when available, page.json, manifests, diagnostics, and assets.",
        "overrides": "--bundle/--no-bundle, --html/--no-html, --source/--no-source, and --chunks override mode defaults.",
        "clean": "Use --clean when switching modes if you need the physical output tree to contain only files from the new mode.",
    }
    output_schema = {
        "default_mode": "simple",
        "mode_rules": mode_rules,
        "simple_root_files": [
            "<sanitized-root-page-title>.md",
            "<sanitized-root-page-title>.yaml",
            "manifest.yaml",
            "diagnostics/warnings.jsonl",
            "diagnostics/unresolved-links.md",
        ],
        "simple_per_page": ["index.md", "page.json", "assets/", "comments.md when --comments finds comments"],
        "full_extra_files": ["bundle.md", "pages/*/index.html", "pages/*/source.storage.xml"],
        "per_page": ["index.md", "page.json", "assets/", "comments.md when --comments finds comments"],
        "ai_manifest": "Agent handoff files named from the sanitized root page title, with package-root path rules, hierarchical local page markdown paths, assets, sidecars, and diagnostics pointers.",
        "manifest_paths": "Manifest and AI manifest paths are package-root-relative. Resolve them against the directory containing the root AI Markdown/YAML file, not the shell current working directory.",
        "bundle_links": "Local links in bundle.md are rebased to package-root-relative paths.",
        "comments": "--comments is opt-in. It fetches page and inline comments, writes page-local comments.md sidecars only when comments exist, and links them from agent-facing navigation.",
    }

    json_envelope = {
        "schema_version": "1.0",
        "shape": ["schema_version", "request_id", "ok", "command", "target", "result", "warnings", "errors", "metrics"],
        "failure_result": None,
    }

    error_codes = [
        "ERR_VALIDATION_REQUIRED",
        "ERR_VALIDATION_AMBIGUOUS_PAGE",
        "ERR_VALIDATION_INVALID_URL",
        "ERR_VALIDATION_OUTPUT_EXISTS",
        "ERR_AUTH_REQUIRED",
        "ERR_AUTH_FORBIDDEN",
        "ERR_AUTH_EXPIRED",
        "ERR_SOURCE_PAGE_NOT_FOUND",
        "ERR_SOURCE_BODY_UNAVAILABLE",
        "ERR_SOURCE_TREE_TOO_LARGE",
        "ERR_IO_CONNECTION",
        "ERR_IO_TIMEOUT",
        "ERR_IO_WRITE_FAILED",
        "ERR_INTERNAL_CONVERSION",
        "ERR_INTERNAL_API_RESPONSE",
    ]
    warning_codes = [
        "W_MACRO_UNKNOWN",
        "W_MACRO_PARTIAL",
        "W_MACRO_RENDER_EMPTY",
        "W_ASSET_DOWNLOAD_FAILED",
        "W_ASSET_SKIPPED_BY_POLICY",
        "W_ASSET_DIAGRAM_SOURCE_NOT_FOUND",
        "W_ATTACHMENT_TEXT_EXTRACTION_FAILED",
        "W_LINK_UNRESOLVED",
        "W_LINK_ANCHOR_UNRESOLVED",
        "W_LINK_EXTERNAL_PRESERVED",
        "W_PAGE_SKIPPED_PERMISSION",
        "W_PAGE_SKIPPED_LIMIT",
        "W_COMMENTS_FETCH_FAILED",
        "W_BODY_REPRESENTATION_FALLBACK",
        "W_DYNAMIC_MACRO_SNAPSHOT",
        "W_SANITIZED_HTML",
    ]

    examples = [
        "pull 123456 -o pulled",
        "pull --page-id 123456 --output-mode full -o pulled-full",
        "pull \"https://example.atlassian.net/wiki/spaces/EA/pages/123456/Architecture\" -o pulled",
        "pull --page-id 123456 --tree --depth 2 --assets all -o pulled-tree",
        "pull --page-id 123456 --tree --comments -o pulled-comments",
        "pull validate pulled-tree",
        "LLM=true pull --page-id 123456 --json",
    ]
    compatibility_notes = [
        "PULL_* environment variables take precedence over config file values.",
        "CONFPUB_URL, CONFPUB_USER, CONFPUB_TOKEN, and CONFPUB_SSL_VERIFY are accepted as compatibility fallbacks.",
        "The CLI is read-only and does not call LLM services.",
    ]

    return {
        "schema_version": "1.0",
        "commands": commands,
        "output_schema": output_schema,
        "json_envelope": json_envelope,
        "error_codes": error_codes,
        "warning_codes": warning_codes,
        "examples": examples,
        "compatibility_notes": compatibility_notes,
    }
@@ -0,0 +1,111 @@
1
+ from __future__ import annotations
2
+
3
+ from bs4 import BeautifulSoup, NavigableString
4
+
5
+ from .models import WarningRecord
6
+ from .security import SECRET_KEY_PATTERN, redact_source_url_text, redact_text, sanitize_url
7
+
8
# CSS selectors whose matching elements normalize_html removes from the
# rendered page before conversion.
WRITE_UI_SELECTORS = (
    # Attachment containers, tables and their action buttons.
    ".plugin_attachments_container",
    ".plugin_attachments_upload_container",
    ".plugin_attachments_table_container",
    ".attachments-table-drop-zone",
    ".download-all-link",
    ".attachment-buttons",
    "table.attachments",
    # Label editing controls.
    ".labels-edit-container",
    ".show-labels-editor",
    # Per-attachment edit/remove links.
    ".editAttachmentLink",
    ".removeAttachmentLink",
)
21
+
22
+
23
# normalize_html: parse the given HTML with BeautifulSoup ("lxml" parser),
# strip active/unsafe content and secrets, and return a tuple of
# (serialized <body> or whole document, list of WarningRecord).
# Parameters: html (None/empty is parsed as ""), source_page_id (copied into
# the warning), redact_source_urls (forwarded to sanitize_url and enables the
# text-node redaction step below).
+ def normalize_html(
24
+ html: str, *, source_page_id: str, redact_source_urls: bool = False
25
+ ) -> tuple[str, list[WarningRecord]]:
26
+ soup = BeautifulSoup(html or "", "lxml")
27
+ warnings: list[WarningRecord] = []
28
# Despite its name, this flag is set by every modification made below
# (element removal, attribute deletion, and value redaction alike).
+ removed_executable = False
29
# Step 1: remove these elements entirely, including their contents.
+ for tag in soup.find_all(["script", "style", "iframe", "object", "embed", "form"]):
30
+ tag.decompose()
31
+ removed_executable = True
32
# Step 2: remove every element matched by one of the WRITE_UI_SELECTORS.
+ for selector in WRITE_UI_SELECTORS:
33
+ for tag in soup.select(selector):
34
+ tag.decompose()
35
+ removed_executable = True
36
# Step 3: remove <input> elements of type hidden/file, or whose name matches
# SECRET_KEY_PATTERN.
+ for tag in soup.find_all("input"):
37
+ input_type = str(tag.get("type") or "").lower()
38
+ input_name = str(tag.get("name") or "")
39
+ if input_type == "hidden" or input_type == "file" or SECRET_KEY_PATTERN.search(input_name):
40
+ tag.decompose()
41
+ removed_executable = True
42
# Step 4: walk every remaining tag and clean its attributes.
+ for tag in soup.find_all(True):
43
# Iterate over a copy of the attribute names because entries are deleted.
+ for attr in list(tag.attrs):
44
+ attr_lower = attr.lower()
45
# Attributes whose name starts with "on" or matches SECRET_KEY_PATTERN
# are deleted outright.
+ if attr_lower.startswith("on") or SECRET_KEY_PATTERN.search(attr_lower):
46
+ del tag.attrs[attr]
47
+ removed_executable = True
48
+ continue
49
+ value = tag.attrs.get(attr)
50
# Only plain string values are rewritten; non-string values are left as is.
+ if isinstance(value, str):
51
# Values that look like source URLs go through sanitize_url, everything
# else through redact_text.
+ redacted = (
52
+ sanitize_url(value, redact_source_url=redact_source_urls)
53
+ if _is_source_url(value)
54
+ else redact_text(value)
55
+ )
56
+ if redacted != value:
57
+ tag.attrs[attr] = redacted
58
+ removed_executable = True
59
# Link-carrying attributes get an extra pass: javascript: values are deleted
# and source URLs are sanitized again.
# NOTE(review): the javascript: test only strips surrounding whitespace and
# lowercases; a value with tab/newline characters embedded in the scheme is
# not matched by this check - confirm whether that needs hardening.
+ for attr in ("href", "src", "data-file-src"):
60
+ value = tag.get(attr)
61
+ if isinstance(value, str) and value.strip().lower().startswith("javascript:"):
62
+ del tag.attrs[attr]
63
+ removed_executable = True
64
+ continue
65
+ if isinstance(value, str) and _is_source_url(value):
66
+ sanitized = sanitize_url(value, redact_source_url=redact_source_urls)
67
+ if sanitized != value:
68
+ tag.attrs[attr] = sanitized
69
+ removed_executable = True
70
# Step 5 (only with redact_source_urls): run text nodes through
# redact_source_url_text and replace those that change.
+ if redact_source_urls:
71
+ for node in soup.find_all(string=True):
72
+ if isinstance(node, NavigableString):
73
+ redacted = redact_source_url_text(str(node))
74
+ if redacted != str(node):
75
+ node.replace_with(redacted)
76
+ removed_executable = True
77
# Step 6: drop <img> elements whose src is a redaction placeholder and that
# have no alt/title/aria-label text.
# NOTE(review): indentation is lost in this rendering - confirm whether this
# loop runs unconditionally or only inside the redact_source_urls branch.
+ for tag in soup.find_all("img"):
78
+ src = tag.get("src")
79
+ if isinstance(src, str) and _is_redacted_url(src) and not _has_accessible_label(tag):
80
+ tag.decompose()
81
+ removed_executable = True
82
# A single W_SANITIZED_HTML warning is emitted if anything above changed the
# document.
+ if removed_executable:
83
+ warnings.append(
84
+ WarningRecord(
85
+ code="W_SANITIZED_HTML",
86
+ message="Executable or active HTML content was stripped from the rendered page snapshot.",
87
+ source_page_id=source_page_id,
88
+ )
89
+ )
90
# Serialize the <body> element when the parser produced one, otherwise the
# whole soup.
+ body = soup.body or soup
91
+ return str(body), warnings
92
+
93
+
94
def soup_from_html(html: str) -> BeautifulSoup:
    """Parse ``html`` with the ``lxml`` parser; a falsy value is parsed as ``""``."""
    markup = html or ""
    return BeautifulSoup(markup, "lxml")
96
+
97
+
98
def _is_source_url(value: str) -> bool:
    """Report whether ``value`` starts with an http(s), protocol-relative, ``/wiki/`` or ``/download/`` prefix.

    Leading and trailing whitespace is ignored and the comparison is
    case-insensitive.
    """
    candidate = value.strip().lower()
    for prefix in ("http://", "https://", "//", "/wiki/", "/download/"):
        if candidate.startswith(prefix):
            return True
    return False
100
+
101
+
102
def _is_redacted_url(value: str) -> bool:
    """Report whether ``value`` is the redaction placeholder, raw or HTML-escaped.

    Surrounding whitespace is ignored and the comparison is case-insensitive.
    """
    normalized = value.strip().lower()
    return normalized == "<redacted-url>" or normalized == "&lt;redacted-url&gt;"
104
+
105
+
106
def _has_accessible_label(tag) -> bool:
    """Report whether ``tag`` has a non-blank string ``alt``, ``title`` or ``aria-label``."""
    return any(
        isinstance(label, str) and bool(label.strip())
        for label in (tag.get(name) for name in ("alt", "title", "aria-label"))
    )