pull-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pull_cli/writer.py ADDED
@@ -0,0 +1,792 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ import shutil
6
+ from dataclasses import asdict
7
+ from datetime import UTC, datetime
8
+ from html import unescape
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import yaml
13
+
14
+ from .errors import EXIT_VALIDATION, PullError
15
+ from .html_normalizer import normalize_html
16
+ from .markdown_writer import rendered_html_to_markdown
17
+ from .models import (
18
+ AssetRecord,
19
+ CommentRecord,
20
+ ExtractionResult,
21
+ PageArtifact,
22
+ PullOptions,
23
+ WarningRecord,
24
+ )
25
+ from .paths import as_posix, relative_path, slugify
26
+ from .security import (
27
+ SECRET_KEY_PATTERN,
28
+ redact_source_url_text,
29
+ redact_text,
30
+ redact_value,
31
+ sanitize_url,
32
+ )
33
+
34
# Inline Markdown link/image matcher used when rebasing bundle links:
# group 1 is the opening "[text](" (optionally prefixed with "!"),
# group 2 is the raw link target, group 3 is the closing ")".
BUNDLE_LINK_RE = re.compile(r"(!?\[[^\]]*]\()([^)]+)(\))")

# Snapshot dict keys that _sanitize_snapshot always drops. Keys are compared
# after lowercasing and stripping every character outside [a-z0-9].
WRITE_ORIENTED_SNAPSHOT_KEYS = {
    "draft",
    "draftid",
    "edit",
    "editui",
    "edituiv2",
    "isactiveliveeditsession",
    "operations",
    "permissions",
}

# Snapshot dict keys dropped only when source-URL redaction is requested.
REDACTED_SNAPSHOT_KEYS = {
    "draftversion",
    "restrictions",
    "schedulepublishdate",
    "schedulepublishinfo",
}

# Link-style dict keys, likewise dropped only when source-URL redaction is on.
REDACTED_LINK_KEYS = {
    "base",
    "context",
    "self",
    "tinyui",
    "webui",
}
58
+
59
+
60
def prepare_output_dir(output: Path, *, force: bool, clean: bool) -> None:
    """Create the output package skeleton (``pages/`` and ``diagnostics/``).

    With ``clean`` an existing ``output`` tree is removed first. Without
    ``force`` or ``clean`` a non-empty existing directory is rejected.

    Raises:
        PullError: ``output`` exists, is not empty, and neither flag is set.
    """
    if clean and output.exists():
        shutil.rmtree(output)

    # Evaluated after the optional wipe so a cleaned directory never trips it.
    populated = output.exists() and any(output.iterdir())
    if populated and not (force or clean):
        raise PullError(
            code="ERR_VALIDATION_OUTPUT_EXISTS",
            message=f"Output directory already exists and is not empty: {output}",
            exit_code=EXIT_VALIDATION,
            suggested_action="Use --force to add/overwrite files or --clean to replace the directory.",
        )

    output.mkdir(parents=True, exist_ok=True)
    for subdir in ("pages", "diagnostics"):
        (output / subdir).mkdir(exist_ok=True)
73
+
74
+
75
def write_page_artifact(output: Path, artifact: PageArtifact, *, options: PullOptions) -> None:
    """Write every per-page file for ``artifact`` beneath ``output``.

    Always writes the page Markdown and the page JSON. The HTML snapshot,
    storage-format source and comments sidecar are written only when the
    matching option/path/content is present. Snapshots pass through
    ``_sanitize_snapshot`` and the JSON payload through ``redact_value``.
    """
    redact = options.redact_source_urls
    page = artifact.page

    target_dir = output / artifact.page_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    (target_dir / "assets").mkdir(exist_ok=True)

    (output / artifact.index_md).write_text(artifact.markdown, encoding="utf-8")

    if options.write_html and artifact.index_html:
        html_snapshot = _sanitize_snapshot(artifact.html, redact_source_urls=redact)
        (output / artifact.index_html).write_text(html_snapshot, encoding="utf-8")

    if options.write_source and artifact.source_path and page.body_storage:
        storage_snapshot = _sanitize_snapshot(page.body_storage, redact_source_urls=redact)
        (output / artifact.source_path).write_text(storage_snapshot, encoding="utf-8")

    metadata = {
        "page_id": page.page_id,
        "title": page.title,
        "space_key": page.space_key,
        "version": page.version,
        "url": sanitize_url(page.url, redact_source_url=redact),
        "labels": page.labels,
    }
    representations = {
        "has_rendered_html": bool(page.body_view or page.body_export_view),
        "has_storage": bool(page.body_storage),
        "has_adf": bool(page.body_adf),
    }
    page_json_data = {
        "page": _sanitize_snapshot(page.raw, redact_source_urls=redact),
        "metadata": metadata,
        "representations": representations,
        "warnings": [warning.to_dict() for warning in artifact.warnings],
    }
    serialized = json.dumps(
        redact_value(page_json_data, redact_source_urls=redact),
        indent=2,
        sort_keys=True,
    )
    (output / artifact.page_json).write_text(serialized, encoding="utf-8")

    if artifact.comments_path and artifact.comments:
        comments_text = _comments_markdown(artifact, options=options)
        (output / artifact.comments_path).write_text(comments_text, encoding="utf-8")
120
+
121
+
122
def write_manifest(result: ExtractionResult, *, options: PullOptions, root_page_id: str, base_url: str, deployment_type: str) -> None:
    """Serialize the full manifest to ``result.manifest_path`` as YAML, then write the AI manifests.

    The manifest is passed through ``redact_value`` (driven by
    ``options.redact_manifest``) before being dumped with key order preserved.
    """
    manifest = build_manifest(
        result,
        options=options,
        root_page_id=root_page_id,
        base_url=base_url,
        deployment_type=deployment_type,
    )
    redacted_manifest = redact_value(manifest, redact_source_urls=options.redact_manifest)
    manifest_yaml = yaml.safe_dump(redacted_manifest, sort_keys=False, allow_unicode=True)
    result.manifest_path.write_text(manifest_yaml, encoding="utf-8")
    write_ai_manifests(result, options=options)
139
+
140
+
141
def build_manifest(
    result: ExtractionResult,
    *,
    options: PullOptions,
    root_page_id: str,
    base_url: str,
    deployment_type: str,
) -> dict[str, Any]:
    """Assemble the full package manifest as a plain dict.

    Key order is significant: the caller dumps the result to YAML with
    ``sort_keys=False``. Every relative path is relative to the package root.
    """
    ai_paths = _ai_manifest_paths(result)

    page_entries = []
    for artifact in result.pages:
        page = artifact.page
        page_paths = {
            "dir": artifact.page_dir,
            "markdown": artifact.index_md,
            "html": artifact.index_html,
            "source": artifact.source_path,
            "metadata": artifact.page_json,
        }
        entry = {
            "order": artifact.order,
            "page_id": page.page_id,
            "title": page.title,
            "space_key": page.space_key,
            "parent_id": page.parent_id,
            "depth": page.depth,
            "version": page.version,
            "url": page.url,
            "paths": page_paths,
            "assets": [asset.asset_id for asset in artifact.assets],
            "warnings": [warning.to_dict() for warning in artifact.warnings],
            "macro_records": [macro.macro_id for macro in artifact.macros],
        }
        # Comment details are appended last, and only when a sidecar exists.
        if artifact.comments_path and artifact.comments:
            page_paths["comments"] = artifact.comments_path
            entry["comments"] = {
                "count": len(artifact.comments),
                "locations": _comment_locations(artifact.comments, options=options),
            }
        page_entries.append(entry)

    if result.bundle_path:
        bundle_rel = as_posix(result.bundle_path.relative_to(result.output_dir))
    else:
        bundle_rel = None

    page_count = len(result.pages)
    return {
        "schema_version": "1.0",
        "tool": {"name": "pull-cli", "version": _tool_version()},
        "generated_at": datetime.now(UTC).isoformat(),
        "source": {
            "base_url": base_url,
            "deployment_type": deployment_type,
        },
        "root": {"page_id": root_page_id},
        "path_base": {
            "kind": "package_root",
            "root": ".",
            "rule": "All relative paths in this manifest are relative to the output package root.",
        },
        "options": options.manifest_dict(),
        "paths": {
            "manifest": "manifest.yaml",
            "ai_manifest": ai_paths["manifest"],
            "ai_entry": ai_paths["entry"],
            "bundle": bundle_rel,
            "chunks": "chunks.jsonl" if options.write_chunks else None,
            "warnings": "diagnostics/warnings.jsonl",
            "unresolved_links": "diagnostics/unresolved-links.md",
        },
        "pages": page_entries,
        "assets": [asset.to_manifest() for asset in result.assets],
        "links": [asdict(link) for link in result.links],
        "macros": [macro.to_manifest() for macro in result.macros],
        "warnings": [warning.to_dict() for warning in result.warnings],
        "errors": [],
        "completeness": {
            "pages_requested": page_count,
            "pages_written": page_count,
            "assets_downloaded": len(result.assets),
            "warnings": len(result.warnings),
            "rendered_page_first": True,
        },
    }
218
+
219
+
220
def write_ai_manifests(result: ExtractionResult, *, options: PullOptions) -> None:
    """Write the AI navigation manifest (YAML) and its Markdown entry file.

    Also records both destinations on ``result`` as ``ai_manifest_path`` and
    ``ai_entry_path``.
    """
    names = _page_names(result.pages)
    targets = _ai_manifest_paths(result, page_names=names)
    ai_manifest = build_ai_manifest(result, options=options, page_names=names, ai_paths=targets)

    result.ai_manifest_path = result.output_dir / targets["manifest"]
    result.ai_entry_path = result.output_dir / targets["entry"]

    manifest_yaml = yaml.safe_dump(ai_manifest, sort_keys=False, allow_unicode=True)
    result.ai_manifest_path.write_text(manifest_yaml, encoding="utf-8")

    entry_markdown = build_ai_entry_markdown(ai_manifest)
    result.ai_entry_path.write_text(entry_markdown, encoding="utf-8")
234
+
235
+
236
def build_ai_manifest(
    result: ExtractionResult,
    *,
    options: PullOptions,
    page_names: dict[str, str] | None = None,
    ai_paths: dict[str, str] | None = None,
) -> dict[str, Any]:
    """Build the compact AI navigation manifest as a plain dict.

    ``page_names`` and ``ai_paths`` are derived from ``result`` when omitted
    (or empty). Pages reference each other by generated name, not by page id.
    """
    page_names = page_names or _page_names(result.pages)
    ai_paths = ai_paths or _ai_manifest_paths(result, page_names=page_names)

    # Group child names under parents that are themselves part of this pull.
    children_by_parent: dict[str, list[str]] = {}
    for artifact in result.pages:
        parent_id = artifact.page.parent_id
        if not parent_id or parent_id not in page_names:
            continue
        children_by_parent.setdefault(parent_id, []).append(page_names[artifact.page.page_id])

    page_entries = []
    for artifact in result.pages:
        page = artifact.page
        entry = {
            "name": page_names[page.page_id],
            "title": page.title,
            "page_id": page.page_id,
            "parent": page_names.get(page.parent_id or ""),
            "depth": page.depth,
            "markdown": artifact.index_md,
            "children": children_by_parent.get(page.page_id, []),
            "assets": [_ai_asset(asset) for asset in artifact.assets],
            "warnings": len(artifact.warnings),
        }
        if artifact.comments_path and artifact.comments:
            entry["comments"] = artifact.comments_path
            entry["comments_count"] = len(artifact.comments)
        page_entries.append(entry)

    if result.bundle_path:
        bundle_rule = "bundle.md is for linear reading and search; its local links are rebased to package_root."
        bundle_rel = as_posix(result.bundle_path.relative_to(result.output_dir))
    else:
        bundle_rule = None
        bundle_rel = None

    root_name = page_names[result.pages[0].page.page_id] if result.pages else None
    chunks_present = (result.output_dir / "chunks.jsonl").exists()

    return {
        "schema_version": "1.0",
        "output_mode": options.output_mode,
        "purpose": "Minimal AI navigation manifest for this pulled Confluence package.",
        "start_here": "Read this file first, then open page markdown paths or asset sidecars as needed.",
        "artifact_guidance": _artifact_guidance(result, options=options),
        "path_base": {
            "kind": "package_root",
            "root": ".",
            "rule": "Resolve every relative path in this YAML against the directory containing this YAML file, regardless of the agent shell current working directory.",
            "page_markdown_rule": "After opening a page markdown file, resolve links inside that page relative to that page file.",
            "bundle_rule": bundle_rule,
        },
        "root": root_name,
        "entrypoints": {
            "ai_entry": ai_paths["entry"],
            "ai_manifest": ai_paths["manifest"],
            "bundle": bundle_rel,
            "full_manifest": "manifest.yaml",
            "warnings": "diagnostics/warnings.jsonl",
            "unresolved_links": "diagnostics/unresolved-links.md",
            "chunks": "chunks.jsonl" if chunks_present else None,
        },
        "pages": page_entries,
        "diagnostics": {
            "warnings": len(result.warnings),
            "warning_codes": _warning_counts(result.warnings),
            "warnings_path": "diagnostics/warnings.jsonl",
            "unresolved_links_path": "diagnostics/unresolved-links.md",
        },
    }
306
+
307
+
308
def build_ai_entry_markdown(ai_manifest: dict[str, Any]) -> str:
    """Render the AI manifest dict as the human/agent-readable Markdown entry file.

    The wording of steps 3 and 8, the second artifact-guidance bullet, the
    "Core Files" labels and the diagnostics links all depend on whether
    ``output_mode`` is ``"simple"``. The result always ends with one newline.
    """
    simple_mode = ai_manifest.get("output_mode") == "simple"
    raw_entrypoints = ai_manifest.get("entrypoints", {})
    # A non-dict "entrypoints" value is treated as having no entries at all.
    entrypoints = raw_entrypoints if isinstance(raw_entrypoints, dict) else {}
    guidance = ai_manifest.get("artifact_guidance", {})

    if simple_mode:
        step_3 = "3. Resolve every relative path in this file against `PACKAGE_ROOT`."
        step_8 = "8. Treat warning counts below as a signal to run validation before making claims about missing content, broken links, macros, or assets."
        secondary_guidance = _simple_control_files_line(simple_mode)
        core_file_labels = ("bundle", "chunks")
    else:
        step_3 = "3. Resolve every relative path in this file and in the YAML manifest against `PACKAGE_ROOT`."
        step_8 = "8. Check diagnostics when warning counts are nonzero before making claims about missing content, broken links, macros, or assets."
        secondary_guidance = _surfaces_line(
            "Raw reference surfaces",
            guidance.get("raw_reference_surfaces"),
            suffix="; their links may be redacted and are not evidence of failed local rewriting.",
        )
        core_file_labels = (
            "ai_manifest",
            "bundle",
            "full_manifest",
            "warnings",
            "unresolved_links",
            "chunks",
        )

    lines = [
        "# AI Navigation Manifest",
        "",
        str(ai_manifest["start_here"]),
        "",
        f"Root page: `{ai_manifest.get('root')}`",
        "",
        "## Agent Instructions",
        "",
        "1. Set `PACKAGE_ROOT` to the directory containing this file.",
        "2. If you are launched from a repo root or another working directory, keep `PACKAGE_ROOT` as the path base; do not resolve these links against the repo root.",
        step_3,
        "4. Open page Markdown paths under `pages/` for detailed evidence; after opening a page, resolve links inside it relative to that page file.",
        "5. Use the page hierarchy below to choose the smallest relevant page set before reading broad context.",
        _agent_instruction_6(simple_mode=simple_mode, bundle_path=entrypoints.get("bundle")),
        "7. Open asset sidecars when present before inferring image, diagram, PDF, or text attachment content.",
        step_8,
        "",
        "## Artifact Guidance",
        "",
        str(guidance.get("rule", "")),
        "",
        _surfaces_line("Navigation surfaces", guidance.get("navigation_surfaces")),
        secondary_guidance,
        "",
        "## First Checks",
        "",
        "Run `pull validate <PACKAGE_ROOT>` before analysis. If validation fails, inspect the reported file, link, resolution base, candidate path, and diagnostics before trusting generated links or artifacts.",
    ]

    # Only entrypoints that actually have a path are listed as core files.
    core_file_lines = [
        f"- {label}: [{path}]({path})"
        for label in core_file_labels
        if (path := entrypoints.get(label))
    ]
    if core_file_lines:
        lines += ["", "## Core Files", ""]
        lines += core_file_lines

    lines += ["", "## Page Hierarchy", ""]
    _append_page_hierarchy(lines, ai_manifest)

    page_assets = [
        (page["name"], asset)
        for page in ai_manifest.get("pages", [])
        for asset in page.get("assets", [])
    ]
    if page_assets:
        lines += ["", "## Assets", ""]
        for page_name, asset in page_assets:
            sidecars = asset.get("sidecars") or []
            if sidecars:
                sidecar_links = ", ".join(f"[{sidecar}]({sidecar})" for sidecar in sidecars)
                sidecar_text = f"; sidecars: {sidecar_links}"
            else:
                sidecar_text = ""
            lines.append(
                f"- `{page_name}/{asset['name']}`: [{asset['path']}]({asset['path']}){sidecar_text}"
            )

    diagnostics = ai_manifest.get("diagnostics", {})
    lines += ["", "## Diagnostics", "", f"- warnings: {diagnostics.get('warnings', 0)}"]
    if not simple_mode:
        lines.append(_markdown_link_line("warning records", diagnostics.get("warnings_path")))
        lines.append(_markdown_link_line("unresolved links", diagnostics.get("unresolved_links_path")))

    warning_codes = diagnostics.get("warning_codes", {})
    if isinstance(warning_codes, dict) and warning_codes:
        lines += ["", "Warning codes:", ""]
        lines += [f"- `{code}`: {count}" for code, count in sorted(warning_codes.items())]

    return "\n".join(lines).rstrip() + "\n"
400
+
401
+
402
+ def _agent_instruction_6(*, simple_mode: bool, bundle_path: object) -> str:
403
+ if bundle_path:
404
+ return "6. Prefer individual page files for navigation and `bundle.md` for linear reading or search; bundle links are rebased to `PACKAGE_ROOT`."
405
+ if simple_mode:
406
+ return "6. Use individual page files for navigation and reading."
407
+ return "6. Prefer individual page files for navigation; no `bundle.md` was written for this package."
408
+
409
+
410
+ def _surfaces_line(label: str, surfaces: object, *, suffix: str = ".") -> str:
411
+ items = surfaces if isinstance(surfaces, list) else []
412
+ rendered = ", ".join(f"`{item}`" for item in items if isinstance(item, str)) or "none"
413
+ return f"- {label}: {rendered}{suffix}"
414
+
415
+
416
+ def _simple_control_files_line(simple_mode: bool) -> str:
417
+ if simple_mode:
418
+ return "- Control and provenance files are written for tooling but are intentionally not listed as reading targets in simple mode."
419
+ return ""
420
+
421
+
422
def _append_page_hierarchy(lines: list[str], ai_manifest: dict[str, Any]) -> None:
    """Append the page tree of ``ai_manifest`` to ``lines`` as a nested Markdown list.

    The tree starts at the manifest's ``root`` page when it is present in the
    page index. Otherwise it starts at every page without a parent, and as a
    last resort at the first page. Pages never reached from a root are listed
    afterwards under "Unlinked pages:".

    Each nesting level is indented by two spaces. Markdown only nests a "- "
    item under its parent when it is indented at least that far; a single
    space leaves children rendered as siblings.
    """
    pages = [page for page in ai_manifest.get("pages", []) if isinstance(page, dict)]
    by_name = {page.get("name"): page for page in pages if isinstance(page.get("name"), str)}
    root_name = ai_manifest.get("root")
    roots = [by_name[root_name]] if isinstance(root_name, str) and root_name in by_name else []
    if not roots:
        roots = [page for page in pages if not page.get("parent")]
    if not roots and pages:
        roots = [pages[0]]

    # Names already emitted; also guards against cycles in "children" data.
    visited: set[str] = set()

    def append_page(page: dict[str, Any], depth: int) -> None:
        name = page.get("name")
        if not isinstance(name, str):
            return
        indent = "  " * depth
        child_indent = "  " * (depth + 1)
        lines.append(f"{indent}- {_page_hierarchy_line(page)}")
        visited.add(name)
        for child_name in page.get("children", []):
            child = by_name.get(child_name)
            if child is None:
                lines.append(f"{child_indent}- `{child_name}`: missing from page index")
                continue
            if child_name in visited:
                lines.append(f"{child_indent}- `{child_name}`: already listed above")
                continue
            append_page(child, depth + 1)

    for root in roots:
        append_page(root, 0)

    unlisted = [page for page in pages if page.get("name") not in visited]
    if unlisted:
        lines.extend(["", "Unlinked pages:", ""])
        lines.extend(f"- {_page_hierarchy_line(page)}" for page in unlisted)
458
+
459
+
460
+ def _page_hierarchy_line(page: dict[str, Any]) -> str:
461
+ markdown = page.get("markdown", "")
462
+ comments = ""
463
+ if isinstance(page.get("comments"), str):
464
+ comments = f", comments {page.get('comments_count', 0)} ([comments.md]({page['comments']}))"
465
+ return (
466
+ f"`{page.get('name')}`: [{page.get('title')}]({markdown}) "
467
+ f"- path `{markdown}`, depth {page.get('depth')}, assets {len(page.get('assets', []))}, warnings {page.get('warnings')}{comments}"
468
+ )
469
+
470
+
471
+ def write_bundle(result: ExtractionResult, *, root_title: str, options: PullOptions) -> None:
472
+ if not result.bundle_path:
473
+ return
474
+ bundle_path = as_posix(result.bundle_path.relative_to(result.output_dir))
475
+ lines = [
476
+ "# Pulled Confluence Bundle",
477
+ "",
478
+ f"Source root: {root_title}",
479
+ f"Generated: {datetime.now(UTC).isoformat()}",
480
+ f"Pages: {len(result.pages)}",
481
+ f"Assets: {len(result.assets)}",
482
+ f"Warnings: {len(result.warnings)}",
483
+ "Manifest: ./manifest.yaml",
484
+ "",
485
+ "---",
486
+ "",
487
+ ]
488
+ for artifact in result.pages:
489
+ source_url = "<redacted-url>" if options.redact_source_urls else artifact.page.url or ""
490
+ lines.extend(
491
+ [
492
+ f'<!-- pull:page-start id="{artifact.page.page_id}" path="{artifact.index_md}" -->',
493
+ "",
494
+ f"# {artifact.page.title}",
495
+ "",
496
+ f"Source: {source_url}",
497
+ f"Confluence version: {artifact.page.version or 'unknown'}",
498
+ "",
499
+ _rebase_bundle_links(artifact.markdown.strip(), from_file=artifact.index_md, bundle_file=bundle_path),
500
+ "",
501
+ f'<!-- pull:page-end id="{artifact.page.page_id}" -->',
502
+ "",
503
+ "---",
504
+ "",
505
+ ]
506
+ )
507
+ result.bundle_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
508
+
509
+
510
def write_diagnostics(output: Path, warnings: list[WarningRecord], unresolved_links: list[dict[str, Any]]) -> None:
    """Write ``diagnostics/warnings.jsonl`` and ``diagnostics/unresolved-links.md``.

    Args:
        output: Package root; ``diagnostics/`` is created beneath it.
        warnings: Records exposing ``to_dict()``; one sorted-key JSON object
            is written per line.
        unresolved_links: Dicts read via ``source_page_id``, ``original`` and
            ``warning``; rendered as a Markdown bullet list.
    """
    diagnostics = output / "diagnostics"
    # parents=True lets this work even if the package root was never created.
    diagnostics.mkdir(parents=True, exist_ok=True)

    warning_rows = "".join(json.dumps(warning.to_dict(), sort_keys=True) + "\n" for warning in warnings)
    (diagnostics / "warnings.jsonl").write_text(warning_rows, encoding="utf-8")

    lines = ["# Unresolved Links", ""]
    if unresolved_links:
        lines.extend(
            f"- Page `{link.get('source_page_id')}`: `{link.get('original')}` ({link.get('warning')})"
            for link in unresolved_links
        )
    else:
        lines.append("No unresolved local links were recorded.")
    (diagnostics / "unresolved-links.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
525
+
526
+
527
+ def page_markdown_header(artifact: PageArtifact, *, options: PullOptions) -> str:
528
+ source_url = "<redacted-url>" if options.redact_source_urls else artifact.page.url or ""
529
+ lines = [
530
+ "---",
531
+ f'pull_page_id: "{artifact.page.page_id}"',
532
+ f'title: "{artifact.page.title}"',
533
+ f'space: "{artifact.page.space_key or ""}"',
534
+ f"confluence_version: {artifact.page.version or 'null'}",
535
+ f'retrieved_at: "{datetime.now(UTC).isoformat()}"',
536
+ f'source_url: "{source_url}"',
537
+ f"local_assets: {len(artifact.assets)}",
538
+ f"warnings: {len(artifact.warnings)}",
539
+ "---",
540
+ "",
541
+ f"# {artifact.page.title}",
542
+ "",
543
+ f"> Source: Confluence page `{artifact.page.page_id}`, version {artifact.page.version or 'unknown'}.",
544
+ "",
545
+ ]
546
+ if artifact.comments_path and artifact.comments:
547
+ comments_link = relative_path(artifact.index_md, artifact.comments_path)
548
+ lines.extend(
549
+ [
550
+ f"> Comments sidecar: [{len(artifact.comments)} comment(s)]({comments_link}).",
551
+ "",
552
+ ]
553
+ )
554
+ return "\n".join(lines)
555
+
556
+
557
def _tool_version() -> str:
    """Return the package's ``__version__``, imported at call time."""
    from . import __version__ as package_version

    return package_version
561
+
562
+
563
def _page_names(pages: list[PageArtifact]) -> dict[str, str]:
    """Map each page id to a unique slug derived from the page title.

    The slug falls back to the page id via ``slugify``. A slug that is already
    taken gets ``-2``, ``-3``, ... appended until it is unique.
    """
    assigned: dict[str, str] = {}
    taken: set[str] = set()
    for artifact in pages:
        page = artifact.page
        stem = slugify(page.title, fallback=page.page_id)
        candidate = stem
        suffix = 1
        while candidate in taken:
            suffix += 1
            candidate = f"{stem}-{suffix}"
        taken.add(candidate)
        assigned[page.page_id] = candidate
    return assigned
576
+
577
+
578
+ def _ai_manifest_paths(
579
+ result: ExtractionResult, *, page_names: dict[str, str] | None = None
580
+ ) -> dict[str, str]:
581
+ page_names = page_names or _page_names(result.pages)
582
+ root_name = page_names[result.pages[0].page.page_id] if result.pages else "pulled-confluence"
583
+ reserved = {"manifest", "bundle", "chunks"}
584
+ file_stem = f"{root_name}-ai" if root_name in reserved else root_name
585
+ return {"entry": f"{file_stem}.md", "manifest": f"{file_stem}.yaml"}
586
+
587
+
588
def _ai_asset(asset: AssetRecord) -> dict[str, Any]:
    """Summarize an asset record for the AI manifest.

    ``name`` is the slugified file stem, falling back to the asset id.
    """
    file_stem = Path(asset.filename).stem
    summary: dict[str, Any] = {"name": slugify(file_stem, fallback=asset.asset_id)}
    summary["filename"] = asset.filename
    summary["path"] = asset.local_path
    summary["media_type"] = asset.media_type
    summary["sidecars"] = asset.sidecars
    return summary
596
+
597
+
598
+ def _warning_counts(warnings: list[WarningRecord]) -> dict[str, int]:
599
+ counts: dict[str, int] = {}
600
+ for warning in warnings:
601
+ counts[warning.code] = counts.get(warning.code, 0) + 1
602
+ return counts
603
+
604
+
605
def _comments_markdown(artifact: PageArtifact, *, options: PullOptions) -> str:
    """Render a page's comments as one Markdown document ending in a single newline."""
    page = artifact.page
    document = [
        f"# Comments for {page.title}",
        "",
        f"Page ID: `{page.page_id}`",
        f"Comment count: {len(artifact.comments)}",
        "",
    ]
    # Comments are numbered from 1 in the order they are stored.
    for position, comment in enumerate(artifact.comments, start=1):
        document += _comment_markdown_block(position, comment, options=options)
    return "\n".join(document).rstrip() + "\n"
616
+
617
+
618
def _comment_markdown_block(index: int, comment: CommentRecord, *, options: PullOptions) -> list[str]:
    """Return the Markdown lines for one comment: heading, metadata bullets, body.

    Metadata fields that are ``None`` or ``""`` are omitted. An empty body is
    replaced by a placeholder line.
    """
    heading = f"## Comment {index}: `{_comment_field(comment.comment_id, options=options)}`"
    fields = (
        ("location", comment.location),
        ("status", comment.status),
        ("resolution", comment.resolution),
        ("version", comment.version),
        ("author", comment.author),
        ("created", comment.created_at),
        ("updated", comment.updated_at),
        ("parent", comment.parent_id),
    )
    block = [heading, ""]
    block += [
        f"- {label}: {_comment_field(value, options=options)}"
        for label, value in fields
        if value is not None and value != ""
    ]
    body = _comment_body_markdown(comment, options=options)
    block += ["", body or "_No comment body returned._", ""]
    return block
639
+
640
+
641
def _comment_body_markdown(comment: CommentRecord, *, options: PullOptions) -> str:
    """Normalize a comment's HTML body and convert it to stripped Markdown.

    Warnings returned by ``normalize_html`` are discarded here.
    """
    normalized = normalize_html(
        comment.body_html,
        source_page_id=comment.page_id,
        redact_source_urls=options.redact_source_urls,
    )
    clean_html = normalized[0]
    markdown = rendered_html_to_markdown(clean_html)
    return markdown.strip()
648
+
649
+
650
def _comment_field(value: object, *, options: PullOptions) -> str:
    """Sanitize a comment metadata value and flatten it to one trimmed line.

    Any falsy sanitized value becomes the empty string.
    """
    sanitized = _sanitize_snapshot(value, redact_source_urls=options.redact_source_urls)
    flattened = str(sanitized or "").replace("\n", " ")
    return flattened.strip()
653
+
654
+
655
def _comment_locations(comments: list[CommentRecord], *, options: PullOptions) -> list[str]:
    """Return the sorted, de-duplicated sanitized locations of ``comments``.

    Comments with a falsy ``location`` are skipped.
    """
    unique_locations = set()
    for comment in comments:
        if not comment.location:
            continue
        unique_locations.add(_comment_field(comment.location, options=options))
    return sorted(unique_locations)
663
+
664
+
665
+ def _markdown_link_line(label: str, path: object) -> str:
666
+ if not isinstance(path, str) or not path:
667
+ return f"- {label}: unavailable"
668
+ return f"- {label}: [{path}]({path})"
669
+
670
+
671
+ def _artifact_guidance(result: ExtractionResult, *, options: PullOptions) -> dict[str, Any]:
672
+ navigation_surfaces = ["page index.md files"]
673
+ if result.bundle_path:
674
+ navigation_surfaces.append("bundle.md")
675
+ raw_reference_surfaces = ["page.json"]
676
+ if any(artifact.source_path for artifact in result.pages):
677
+ raw_reference_surfaces.insert(0, "source.storage.xml")
678
+ rendered_reference_surfaces = ["index.html"] if any(artifact.index_html for artifact in result.pages) else []
679
+ if result.bundle_path:
680
+ navigation_rule = "Use page Markdown files and bundle.md for navigation."
681
+ else:
682
+ navigation_rule = "Use page Markdown files for navigation."
683
+ if options.output_mode == "simple":
684
+ rule = (
685
+ f"{navigation_rule} Simple mode keeps control and provenance artifacts available for tooling "
686
+ "without listing them as primary reading targets."
687
+ )
688
+ else:
689
+ rule = (
690
+ f"{navigation_rule} Treat raw reference artifacts as source evidence only; their source links may be "
691
+ "redacted and should not be used to judge rewritten local navigation."
692
+ )
693
+ return {
694
+ "rule": rule,
695
+ "navigation_surfaces": navigation_surfaces,
696
+ "raw_reference_surfaces": raw_reference_surfaces,
697
+ "rendered_reference_surfaces": rendered_reference_surfaces,
698
+ }
699
+
700
+
701
def _sanitize_snapshot(value: Any, *, redact_source_urls: bool = False) -> Any:
    """Recursively sanitize a snapshot value before it is written to disk.

    * Strings that look like markup (raw or entity-escaped angle brackets) are
      unescaped, normalized with ``normalize_html`` and passed through
      ``redact_text``. Other strings go through ``redact_text`` and, when they
      start with ``http://`` or ``https://``, ``sanitize_url``.
    * Dicts drop write-oriented keys always and redacted keys when
      ``redact_source_urls`` is set. Values under keys matching
      ``SECRET_KEY_PATTERN`` become ``"<redacted>"``. Keys are stringified.
    * Lists are sanitized element by element.
    * Anything else is handed to ``redact_value``.
    """
    if isinstance(value, dict):
        kept: dict[str, Any] = {}
        for key, child in value.items():
            key_text = str(key)
            if _is_write_oriented_snapshot_key(key_text):
                continue
            if redact_source_urls and _is_redacted_snapshot_key(key_text):
                continue
            if SECRET_KEY_PATTERN.search(key_text):
                kept[key_text] = "<redacted>"
            else:
                kept[key_text] = _sanitize_snapshot(child, redact_source_urls=redact_source_urls)
        return kept

    if isinstance(value, list):
        return [_sanitize_snapshot(item, redact_source_urls=redact_source_urls) for item in value]

    if not isinstance(value, str):
        return redact_value(value)

    has_raw_tags = "<" in value and ">" in value
    has_escaped_tags = "&lt;" in value and "&gt;" in value
    if has_raw_tags or has_escaped_tags:
        normalized, _warnings = normalize_html(
            unescape(value), source_page_id="", redact_source_urls=redact_source_urls
        )
        cleaned = redact_text(normalized)
        if redact_source_urls:
            return redact_source_url_text(cleaned)
        return cleaned

    cleaned = redact_text(value)
    if cleaned.startswith(("http://", "https://")):
        # Fall back to the redacted text if sanitize_url returns nothing.
        return sanitize_url(cleaned, redact_source_url=redact_source_urls) or cleaned
    if redact_source_urls:
        return redact_source_url_text(cleaned)
    return cleaned
728
+
729
+
730
def _is_write_oriented_snapshot_key(key: str) -> bool:
    """Return whether ``key`` is one of the write-oriented snapshot keys.

    ``key`` is lowercased and reduced to its ``[a-z0-9]`` characters first.
    """
    folded = "".join(re.findall(r"[a-z0-9]+", key.lower()))
    return folded in WRITE_ORIENTED_SNAPSHOT_KEYS
733
+
734
+
735
def _is_redacted_snapshot_key(key: str) -> bool:
    """Return whether ``key`` is in either redaction-only key set.

    ``key`` is lowercased and reduced to its ``[a-z0-9]`` characters first.
    """
    folded = "".join(re.findall(r"[a-z0-9]+", key.lower()))
    return any(folded in group for group in (REDACTED_SNAPSHOT_KEYS, REDACTED_LINK_KEYS))
738
+
739
+
740
def _rebase_bundle_links(markdown: str, *, from_file: str, bundle_file: str) -> str:
    """Rewrite every inline link/image target in ``markdown`` for use from the bundle.

    Only text matched by ``BUNDLE_LINK_RE`` is touched; each target is
    rewritten by ``_rebase_bundle_link_target``.
    """

    def rewrite(found: re.Match[str]) -> str:
        opening, target, closing = found.group(1), found.group(2), found.group(3)
        new_target = _rebase_bundle_link_target(target, from_file=from_file, bundle_file=bundle_file)
        return opening + new_target + closing

    return BUNDLE_LINK_RE.sub(rewrite, markdown)
747
+
748
+
749
def _rebase_bundle_link_target(raw_target: str, *, from_file: str, bundle_file: str) -> str:
    """Rebase one Markdown link target from its page file onto the bundle file.

    Surrounding whitespace, ``<...>`` wrapping, a trailing link title and any
    ``#fragment`` are preserved. ``raw_target`` is returned unchanged when it
    is blank, malformed, external/page-local, fragment-only, or when the
    rebased path would start with ``../``.
    """
    core = raw_target.strip()
    if not core:
        return raw_target

    # Keep the exact whitespace that framed the target.
    left_trimmed = raw_target.lstrip()
    leading = raw_target[: len(raw_target) - len(left_trimmed)]
    trailing = raw_target[len(raw_target.rstrip()) :]

    angle_wrapped = core.startswith("<")
    if angle_wrapped:
        closing_at = core.find(">")
        if closing_at == -1:
            return raw_target
        target = core[1:closing_at]
        trailer = core[closing_at + 1 :]
    else:
        target, trailer = _split_markdown_target(core)

    if _is_external_or_page_local(target):
        return raw_target

    path_part, marker, fragment = target.partition("#")
    if not path_part:
        return raw_target

    rebased_path = relative_path(bundle_file, Path(from_file).parent / path_part)
    if rebased_path.startswith("../"):
        return raw_target

    rebased = rebased_path + marker + fragment
    if angle_wrapped:
        rebased = f"<{rebased}>"
    return f"{leading}{rebased}{trailer}{trailing}"
781
+
782
+
783
+ def _split_markdown_target(core: str) -> tuple[str, str]:
784
+ for marker in (' "', " '", "\t\"", "\t'"):
785
+ if marker in core:
786
+ path, title = core.split(marker, 1)
787
+ return path, f"{marker}{title}"
788
+ return core, ""
789
+
790
+
791
+ def _is_external_or_page_local(target: str) -> bool:
792
+ return target in {"redacted-url", "<redacted-url>"} or target.startswith(("#", "/", "http://", "https://", "mailto:", "jira:"))