pull-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pull_cli/__init__.py +5 -0
- pull_cli/__main__.py +6 -0
- pull_cli/assets.py +235 -0
- pull_cli/attachment_extractors.py +85 -0
- pull_cli/cli.py +329 -0
- pull_cli/clients/__init__.py +8 -0
- pull_cli/clients/base.py +29 -0
- pull_cli/clients/cloud_v2.py +132 -0
- pull_cli/clients/data_center.py +360 -0
- pull_cli/clients/hybrid.py +15 -0
- pull_cli/config.py +82 -0
- pull_cli/crawler.py +51 -0
- pull_cli/envelope.py +59 -0
- pull_cli/errors.py +50 -0
- pull_cli/extractor.py +344 -0
- pull_cli/guide.py +115 -0
- pull_cli/html_normalizer.py +111 -0
- pull_cli/links.py +186 -0
- pull_cli/macros.py +527 -0
- pull_cli/markdown_writer.py +24 -0
- pull_cli/models.py +232 -0
- pull_cli/paths.py +45 -0
- pull_cli/resolver.py +72 -0
- pull_cli/security.py +103 -0
- pull_cli/validator.py +398 -0
- pull_cli/writer.py +792 -0
- pull_cli-0.1.0.dist-info/METADATA +218 -0
- pull_cli-0.1.0.dist-info/RECORD +31 -0
- pull_cli-0.1.0.dist-info/WHEEL +4 -0
- pull_cli-0.1.0.dist-info/entry_points.txt +3 -0
- pull_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
pull_cli/writer.py
ADDED
|
@@ -0,0 +1,792 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
import shutil
|
|
6
|
+
from dataclasses import asdict
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from html import unescape
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
from .errors import EXIT_VALIDATION, PullError
|
|
15
|
+
from .html_normalizer import normalize_html
|
|
16
|
+
from .markdown_writer import rendered_html_to_markdown
|
|
17
|
+
from .models import (
|
|
18
|
+
AssetRecord,
|
|
19
|
+
CommentRecord,
|
|
20
|
+
ExtractionResult,
|
|
21
|
+
PageArtifact,
|
|
22
|
+
PullOptions,
|
|
23
|
+
WarningRecord,
|
|
24
|
+
)
|
|
25
|
+
from .paths import as_posix, relative_path, slugify
|
|
26
|
+
from .security import (
|
|
27
|
+
SECRET_KEY_PATTERN,
|
|
28
|
+
redact_source_url_text,
|
|
29
|
+
redact_text,
|
|
30
|
+
redact_value,
|
|
31
|
+
sanitize_url,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Matches an inline Markdown link or image: group 1 is the optional "!" plus
# "[label](", group 2 is the link target, and group 3 is the closing ")".
BUNDLE_LINK_RE = re.compile(r"(!?\[[^\]]*]\()([^)]+)(\))")
# Lower-case key names related to drafts, editing, operations and permissions.
# NOTE(review): presumably consulted when sanitizing raw page snapshots — confirm
# against the consumer, which is not visible in this chunk.
WRITE_ORIENTED_SNAPSHOT_KEYS = {
    "draft",
    "draftid",
    "edit",
    "editui",
    "edituiv2",
    "isactiveliveeditsession",
    "operations",
    "permissions",
}
# Lower-case snapshot key names.
# NOTE(review): the name suggests their values are redacted rather than dropped — confirm.
REDACTED_SNAPSHOT_KEYS = {
    "draftversion",
    "restrictions",
    "schedulepublishdate",
    "schedulepublishinfo",
}
# Lower-case link key names.
# NOTE(review): presumably the keys of a `_links`-style mapping whose URLs are redacted — confirm.
REDACTED_LINK_KEYS = {
    "base",
    "context",
    "self",
    "tinyui",
    "webui",
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def prepare_output_dir(output: Path, *, force: bool, clean: bool) -> None:
    """Create the output package directory with its standard sub-directories.

    Args:
        output: Root directory of the package to write.
        force: Allow writing into an existing, non-empty directory.
        clean: Remove an existing directory tree before recreating it.

    Raises:
        PullError: If ``output`` exists, is not empty, and neither ``force``
            nor ``clean`` was requested.
    """
    if clean and output.exists():
        shutil.rmtree(output)
    # After a clean the directory no longer exists, so this check only fires
    # for a populated directory that the caller did not opt into touching.
    if output.exists() and any(output.iterdir()) and not (force or clean):
        raise PullError(
            code="ERR_VALIDATION_OUTPUT_EXISTS",
            message=f"Output directory already exists and is not empty: {output}",
            exit_code=EXIT_VALIDATION,
            suggested_action="Use --force to add/overwrite files or --clean to replace the directory.",
        )
    output.mkdir(parents=True, exist_ok=True)
    for subdirectory in ("pages", "diagnostics"):
        (output / subdirectory).mkdir(exist_ok=True)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def write_page_artifact(output: Path, artifact: PageArtifact, *, options: PullOptions) -> None:
    """Write every per-page file for one pulled page below ``output``.

    The Markdown file is always written. The HTML snapshot, storage source and
    comments sidecar are written only when enabled and available. ``page.json``
    is always written and combines the sanitized raw page, selected metadata,
    representation flags and the page warnings.
    """
    page = artifact.page
    redact_urls = options.redact_source_urls

    page_root = output / artifact.page_dir
    page_root.mkdir(parents=True, exist_ok=True)
    (page_root / "assets").mkdir(exist_ok=True)

    (output / artifact.index_md).write_text(artifact.markdown, encoding="utf-8")

    if options.write_html and artifact.index_html:
        html_snapshot = _sanitize_snapshot(artifact.html, redact_source_urls=redact_urls)
        (output / artifact.index_html).write_text(html_snapshot, encoding="utf-8")

    if options.write_source and artifact.source_path and page.body_storage:
        storage_snapshot = _sanitize_snapshot(page.body_storage, redact_source_urls=redact_urls)
        (output / artifact.source_path).write_text(storage_snapshot, encoding="utf-8")

    metadata = {
        "page_id": page.page_id,
        "title": page.title,
        "space_key": page.space_key,
        "version": page.version,
        "url": sanitize_url(page.url, redact_source_url=redact_urls),
        "labels": page.labels,
    }
    representations = {
        "has_rendered_html": bool(page.body_view or page.body_export_view),
        "has_storage": bool(page.body_storage),
        "has_adf": bool(page.body_adf),
    }
    page_json_data = {
        "page": _sanitize_snapshot(page.raw, redact_source_urls=redact_urls),
        "metadata": metadata,
        "representations": representations,
        "warnings": [record.to_dict() for record in artifact.warnings],
    }
    # The assembled document gets a second redaction pass before serialization.
    serialized = json.dumps(
        redact_value(page_json_data, redact_source_urls=redact_urls),
        indent=2,
        sort_keys=True,
    )
    (output / artifact.page_json).write_text(serialized, encoding="utf-8")

    if artifact.comments_path and artifact.comments:
        comments_text = _comments_markdown(artifact, options=options)
        (output / artifact.comments_path).write_text(comments_text, encoding="utf-8")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def write_manifest(result: ExtractionResult, *, options: PullOptions, root_page_id: str, base_url: str, deployment_type: str) -> None:
    """Build the full manifest, write it as YAML, then write the AI manifests.

    The manifest is passed through ``redact_value`` (controlled by
    ``options.redact_manifest``) before it is dumped to ``result.manifest_path``.
    """
    manifest = build_manifest(
        result,
        options=options,
        root_page_id=root_page_id,
        base_url=base_url,
        deployment_type=deployment_type,
    )
    redacted_manifest = redact_value(manifest, redact_source_urls=options.redact_manifest)
    manifest_yaml = yaml.safe_dump(redacted_manifest, sort_keys=False, allow_unicode=True)
    result.manifest_path.write_text(manifest_yaml, encoding="utf-8")
    write_ai_manifests(result, options=options)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def build_manifest(
    result: ExtractionResult,
    *,
    options: PullOptions,
    root_page_id: str,
    base_url: str,
    deployment_type: str,
) -> dict[str, Any]:
    """Assemble the full package manifest as a plain dictionary.

    Key order is significant because the caller dumps the result with
    ``sort_keys=False``.

    Returns:
        A dictionary describing the tool, source, options, package-level paths,
        every page, and the asset, link, macro and warning records.
    """
    ai_paths = _ai_manifest_paths(result)

    page_entries = []
    for artifact in result.pages:
        page = artifact.page
        has_comments = bool(artifact.comments_path and artifact.comments)

        file_paths = {
            "dir": artifact.page_dir,
            "markdown": artifact.index_md,
            "html": artifact.index_html,
            "source": artifact.source_path,
            "metadata": artifact.page_json,
        }
        if has_comments:
            file_paths["comments"] = artifact.comments_path

        entry = {
            "order": artifact.order,
            "page_id": page.page_id,
            "title": page.title,
            "space_key": page.space_key,
            "parent_id": page.parent_id,
            "depth": page.depth,
            "version": page.version,
            "url": page.url,
            "paths": file_paths,
            "assets": [record.asset_id for record in artifact.assets],
            "warnings": [record.to_dict() for record in artifact.warnings],
            "macro_records": [record.macro_id for record in artifact.macros],
        }
        if has_comments:
            entry["comments"] = {
                "count": len(artifact.comments),
                "locations": _comment_locations(artifact.comments, options=options),
            }
        page_entries.append(entry)

    if result.bundle_path:
        bundle_entry = as_posix(result.bundle_path.relative_to(result.output_dir))
    else:
        bundle_entry = None
    chunks_entry = "chunks.jsonl" if options.write_chunks else None

    return {
        "schema_version": "1.0",
        "tool": {"name": "pull-cli", "version": _tool_version()},
        "generated_at": datetime.now(UTC).isoformat(),
        "source": {
            "base_url": base_url,
            "deployment_type": deployment_type,
        },
        "root": {"page_id": root_page_id},
        "path_base": {
            "kind": "package_root",
            "root": ".",
            "rule": "All relative paths in this manifest are relative to the output package root.",
        },
        "options": options.manifest_dict(),
        "paths": {
            "manifest": "manifest.yaml",
            "ai_manifest": ai_paths["manifest"],
            "ai_entry": ai_paths["entry"],
            "bundle": bundle_entry,
            "chunks": chunks_entry,
            "warnings": "diagnostics/warnings.jsonl",
            "unresolved_links": "diagnostics/unresolved-links.md",
        },
        "pages": page_entries,
        "assets": [record.to_manifest() for record in result.assets],
        "links": [asdict(record) for record in result.links],
        "macros": [record.to_manifest() for record in result.macros],
        "warnings": [record.to_dict() for record in result.warnings],
        "errors": [],
        "completeness": {
            "pages_requested": len(result.pages),
            "pages_written": len(result.pages),
            "assets_downloaded": len(result.assets),
            "warnings": len(result.warnings),
            "rendered_page_first": True,
        },
    }
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def write_ai_manifests(result: ExtractionResult, *, options: PullOptions) -> None:
    """Write the AI navigation manifest (YAML) and its Markdown entry file.

    The chosen paths are also recorded on ``result.ai_manifest_path`` and
    ``result.ai_entry_path``.
    """
    names = _page_names(result.pages)
    targets = _ai_manifest_paths(result, page_names=names)
    manifest = build_ai_manifest(result, options=options, page_names=names, ai_paths=targets)

    result.ai_manifest_path = result.output_dir / targets["manifest"]
    result.ai_entry_path = result.output_dir / targets["entry"]

    manifest_yaml = yaml.safe_dump(manifest, sort_keys=False, allow_unicode=True)
    result.ai_manifest_path.write_text(manifest_yaml, encoding="utf-8")

    entry_markdown = build_ai_entry_markdown(manifest)
    result.ai_entry_path.write_text(entry_markdown, encoding="utf-8")
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def build_ai_manifest(
    result: ExtractionResult,
    *,
    options: PullOptions,
    page_names: dict[str, str] | None = None,
    ai_paths: dict[str, str] | None = None,
) -> dict[str, Any]:
    """Assemble the compact AI navigation manifest as a plain dictionary.

    Args:
        result: Extraction result holding the pages, warnings and bundle path.
        options: Pull options; ``output_mode`` is copied into the manifest.
        page_names: Optional page-id to name mapping; computed when omitted or empty.
        ai_paths: Optional entry/manifest file names; computed when omitted or empty.

    Returns:
        A dictionary with guidance text, entry points, one entry per page
        (including parent and child names), and diagnostic counts.
    """
    if not page_names:
        page_names = _page_names(result.pages)
    if not ai_paths:
        ai_paths = _ai_manifest_paths(result, page_names=page_names)

    # Group child page names under parents that are part of this package.
    child_names: dict[str, list[str]] = {}
    for artifact in result.pages:
        parent_id = artifact.page.parent_id
        if not parent_id or parent_id not in page_names:
            continue
        child_names.setdefault(parent_id, []).append(page_names[artifact.page.page_id])

    page_entries = []
    for artifact in result.pages:
        page = artifact.page
        entry = {
            "name": page_names[page.page_id],
            "title": page.title,
            "page_id": page.page_id,
            "parent": page_names.get(page.parent_id or ""),
            "depth": page.depth,
            "markdown": artifact.index_md,
            "children": child_names.get(page.page_id, []),
            "assets": [_ai_asset(record) for record in artifact.assets],
            "warnings": len(artifact.warnings),
        }
        if artifact.comments_path and artifact.comments:
            entry["comments"] = artifact.comments_path
            entry["comments_count"] = len(artifact.comments)
        page_entries.append(entry)

    guidance = _artifact_guidance(result, options=options)
    if result.bundle_path:
        bundle_rule = "bundle.md is for linear reading and search; its local links are rebased to package_root."
        bundle_entry = as_posix(result.bundle_path.relative_to(result.output_dir))
    else:
        bundle_rule = None
        bundle_entry = None
    root_name = page_names[result.pages[0].page.page_id] if result.pages else None
    chunks_entry = "chunks.jsonl" if (result.output_dir / "chunks.jsonl").exists() else None

    return {
        "schema_version": "1.0",
        "output_mode": options.output_mode,
        "purpose": "Minimal AI navigation manifest for this pulled Confluence package.",
        "start_here": "Read this file first, then open page markdown paths or asset sidecars as needed.",
        "artifact_guidance": guidance,
        "path_base": {
            "kind": "package_root",
            "root": ".",
            "rule": "Resolve every relative path in this YAML against the directory containing this YAML file, regardless of the agent shell current working directory.",
            "page_markdown_rule": "After opening a page markdown file, resolve links inside that page relative to that page file.",
            "bundle_rule": bundle_rule,
        },
        "root": root_name,
        "entrypoints": {
            "ai_entry": ai_paths["entry"],
            "ai_manifest": ai_paths["manifest"],
            "bundle": bundle_entry,
            "full_manifest": "manifest.yaml",
            "warnings": "diagnostics/warnings.jsonl",
            "unresolved_links": "diagnostics/unresolved-links.md",
            "chunks": chunks_entry,
        },
        "pages": page_entries,
        "diagnostics": {
            "warnings": len(result.warnings),
            "warning_codes": _warning_counts(result.warnings),
            "warnings_path": "diagnostics/warnings.jsonl",
            "unresolved_links_path": "diagnostics/unresolved-links.md",
        },
    }
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def build_ai_entry_markdown(ai_manifest: dict[str, Any]) -> str:
    """Render the Markdown entry document for an AI navigation manifest.

    Sections are emitted in a fixed order: introduction, agent instructions,
    artifact guidance, first checks, core files (when any are available),
    page hierarchy, assets (when any exist) and diagnostics.

    Returns:
        The Markdown text, ending with exactly one trailing newline.
    """
    simple_mode = ai_manifest.get("output_mode") == "simple"
    entrypoints = ai_manifest.get("entrypoints", {})
    entry_lookup = entrypoints if isinstance(entrypoints, dict) else {}
    bundle_path = entry_lookup.get("bundle")

    if simple_mode:
        resolve_step = "3. Resolve every relative path in this file against `PACKAGE_ROOT`."
        warning_step = "8. Treat warning counts below as a signal to run validation before making claims about missing content, broken links, macros, or assets."
    else:
        resolve_step = "3. Resolve every relative path in this file and in the YAML manifest against `PACKAGE_ROOT`."
        warning_step = "8. Check diagnostics when warning counts are nonzero before making claims about missing content, broken links, macros, or assets."

    lines = [
        "# AI Navigation Manifest",
        "",
        str(ai_manifest["start_here"]),
        "",
        f"Root page: `{ai_manifest.get('root')}`",
        "",
    ]
    lines += [
        "## Agent Instructions",
        "",
        "1. Set `PACKAGE_ROOT` to the directory containing this file.",
        "2. If you are launched from a repo root or another working directory, keep `PACKAGE_ROOT` as the path base; do not resolve these links against the repo root.",
        resolve_step,
        "4. Open page Markdown paths under `pages/` for detailed evidence; after opening a page, resolve links inside it relative to that page file.",
        "5. Use the page hierarchy below to choose the smallest relevant page set before reading broad context.",
        _agent_instruction_6(simple_mode=simple_mode, bundle_path=bundle_path),
        "7. Open asset sidecars when present before inferring image, diagram, PDF, or text attachment content.",
        warning_step,
        "",
    ]

    guidance = ai_manifest.get("artifact_guidance", {})
    lines += ["## Artifact Guidance", "", str(guidance.get("rule", "")), ""]
    lines.append(_surfaces_line("Navigation surfaces", guidance.get("navigation_surfaces")))
    if simple_mode:
        lines.append(_simple_control_files_line(simple_mode))
    else:
        lines.append(
            _surfaces_line(
                "Raw reference surfaces",
                guidance.get("raw_reference_surfaces"),
                suffix="; their links may be redacted and are not evidence of failed local rewriting.",
            )
        )
    lines += [
        "",
        "## First Checks",
        "",
        "Run `pull validate <PACKAGE_ROOT>` before analysis. If validation fails, inspect the reported file, link, resolution base, candidate path, and diagnostics before trusting generated links or artifacts.",
    ]

    # Core files: only labels that resolve to a non-empty path are listed.
    if simple_mode:
        core_labels = ("bundle", "chunks")
    else:
        core_labels = ("ai_manifest", "bundle", "full_manifest", "warnings", "unresolved_links", "chunks")
    labelled_targets = ((label, entry_lookup.get(label)) for label in core_labels)
    core_file_lines = [f"- {label}: [{target}]({target})" for label, target in labelled_targets if target]
    if core_file_lines:
        lines += ["", "## Core Files", ""]
        lines += core_file_lines

    lines += ["", "## Page Hierarchy", ""]
    _append_page_hierarchy(lines, ai_manifest)

    # Collect (page name, asset) pairs first, then render them.
    asset_rows = []
    for page in ai_manifest.get("pages", []):
        for asset in page.get("assets", []):
            asset_rows.append((page["name"], asset))
    if asset_rows:
        lines += ["", "## Assets", ""]
        for owner, asset in asset_rows:
            sidecars = asset.get("sidecars") or []
            if sidecars:
                rendered_sidecars = ", ".join(f"[{sidecar}]({sidecar})" for sidecar in sidecars)
                sidecar_text = f"; sidecars: {rendered_sidecars}"
            else:
                sidecar_text = ""
            lines.append(
                f"- `{owner}/{asset['name']}`: [{asset['path']}]({asset['path']}){sidecar_text}"
            )

    lines += ["", "## Diagnostics", "", f"- warnings: {ai_manifest.get('diagnostics', {}).get('warnings', 0)}"]
    if not simple_mode:
        lines.append(
            _markdown_link_line("warning records", ai_manifest.get("diagnostics", {}).get("warnings_path"))
        )
        lines.append(
            _markdown_link_line("unresolved links", ai_manifest.get("diagnostics", {}).get("unresolved_links_path"))
        )
    warning_codes = ai_manifest.get("diagnostics", {}).get("warning_codes", {})
    if isinstance(warning_codes, dict) and warning_codes:
        lines += ["", "Warning codes:", ""]
        lines += [f"- `{code}`: {count}" for code, count in sorted(warning_codes.items())]

    return "\n".join(lines).rstrip() + "\n"
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _agent_instruction_6(*, simple_mode: bool, bundle_path: object) -> str:
|
|
403
|
+
if bundle_path:
|
|
404
|
+
return "6. Prefer individual page files for navigation and `bundle.md` for linear reading or search; bundle links are rebased to `PACKAGE_ROOT`."
|
|
405
|
+
if simple_mode:
|
|
406
|
+
return "6. Use individual page files for navigation and reading."
|
|
407
|
+
return "6. Prefer individual page files for navigation; no `bundle.md` was written for this package."
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _surfaces_line(label: str, surfaces: object, *, suffix: str = ".") -> str:
|
|
411
|
+
items = surfaces if isinstance(surfaces, list) else []
|
|
412
|
+
rendered = ", ".join(f"`{item}`" for item in items if isinstance(item, str)) or "none"
|
|
413
|
+
return f"- {label}: {rendered}{suffix}"
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _simple_control_files_line(simple_mode: bool) -> str:
|
|
417
|
+
if simple_mode:
|
|
418
|
+
return "- Control and provenance files are written for tooling but are intentionally not listed as reading targets in simple mode."
|
|
419
|
+
return ""
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def _append_page_hierarchy(lines: list[str], ai_manifest: dict[str, Any]) -> None:
    """Append a nested bullet list of pages to ``lines`` in place.

    The walk starts at the manifest's ``root`` page when it is indexed.
    Otherwise it starts at every page without a parent, or failing that at the
    first page. Pages never reached are listed under ``Unlinked pages:``.
    """
    pages = [entry for entry in ai_manifest.get("pages", []) if isinstance(entry, dict)]
    by_name = {entry.get("name"): entry for entry in pages if isinstance(entry.get("name"), str)}

    root_name = ai_manifest.get("root")
    if isinstance(root_name, str) and root_name in by_name:
        roots = [by_name[root_name]]
    else:
        roots = [entry for entry in pages if not entry.get("parent")]
        if not roots and pages:
            roots = [pages[0]]

    visited: set[str] = set()

    def walk(node: dict[str, Any], level: int) -> None:
        node_name = node.get("name")
        if not isinstance(node_name, str):
            return
        indent = " " * level
        lines.append(f"{indent}- {_page_hierarchy_line(node)}")
        visited.add(node_name)
        for child_name in node.get("children", []):
            child = by_name.get(child_name)
            if child is None:
                lines.append(f"{indent} - `{child_name}`: missing from page index")
            elif child_name in visited:
                # Guards against listing a page twice (and against cycles).
                lines.append(f"{indent} - `{child_name}`: already listed above")
            else:
                walk(child, level + 1)

    for root in roots:
        walk(root, 0)

    leftovers = [entry for entry in pages if entry.get("name") not in visited]
    if leftovers:
        lines.extend(["", "Unlinked pages:", ""])
        lines.extend(f"- {_page_hierarchy_line(entry)}" for entry in leftovers)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _page_hierarchy_line(page: dict[str, Any]) -> str:
|
|
461
|
+
markdown = page.get("markdown", "")
|
|
462
|
+
comments = ""
|
|
463
|
+
if isinstance(page.get("comments"), str):
|
|
464
|
+
comments = f", comments {page.get('comments_count', 0)} ([comments.md]({page['comments']}))"
|
|
465
|
+
return (
|
|
466
|
+
f"`{page.get('name')}`: [{page.get('title')}]({markdown}) "
|
|
467
|
+
f"- path `{markdown}`, depth {page.get('depth')}, assets {len(page.get('assets', []))}, warnings {page.get('warnings')}{comments}"
|
|
468
|
+
)
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def write_bundle(result: ExtractionResult, *, root_title: str, options: PullOptions) -> None:
    """Write all pages into one Markdown bundle at ``result.bundle_path``.

    Nothing is written when no bundle path is configured. Each page is wrapped
    in ``pull:page-start`` / ``pull:page-end`` HTML comments, and its Markdown
    is passed through ``_rebase_bundle_links`` before being appended.
    """
    if not result.bundle_path:
        return

    bundle_rel = as_posix(result.bundle_path.relative_to(result.output_dir))
    lines = [
        "# Pulled Confluence Bundle",
        "",
        f"Source root: {root_title}",
        f"Generated: {datetime.now(UTC).isoformat()}",
        f"Pages: {len(result.pages)}",
        f"Assets: {len(result.assets)}",
        f"Warnings: {len(result.warnings)}",
        "Manifest: ./manifest.yaml",
        "",
        "---",
        "",
    ]

    for artifact in result.pages:
        page = artifact.page
        if options.redact_source_urls:
            source_url = "<redacted-url>"
        else:
            source_url = page.url or ""
        rebased_body = _rebase_bundle_links(
            artifact.markdown.strip(),
            from_file=artifact.index_md,
            bundle_file=bundle_rel,
        )
        lines += [
            f'<!-- pull:page-start id="{page.page_id}" path="{artifact.index_md}" -->',
            "",
            f"# {page.title}",
            "",
            f"Source: {source_url}",
            f"Confluence version: {page.version or 'unknown'}",
            "",
            rebased_body,
            "",
            f'<!-- pull:page-end id="{page.page_id}" -->',
            "",
            "---",
            "",
        ]

    bundle_text = "\n".join(lines).rstrip() + "\n"
    result.bundle_path.write_text(bundle_text, encoding="utf-8")
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def write_diagnostics(output: Path, warnings: list[WarningRecord], unresolved_links: list[dict[str, Any]]) -> None:
    """Write the diagnostics files below ``output / "diagnostics"``.

    Two files are produced:

    * ``warnings.jsonl``: one JSON object per warning (``warning.to_dict()``),
      keys sorted, one per line. The file is empty when there are no warnings.
    * ``unresolved-links.md``: a Markdown list of unresolved links, or a
      placeholder sentence when none were recorded.

    Args:
        output: Package root directory. It is created if it does not exist yet.
        warnings: Warning records to serialize.
        unresolved_links: Mappings read via the ``source_page_id``,
            ``original`` and ``warning`` keys.
    """
    diagnostics = output / "diagnostics"
    # parents=True keeps this function usable before the package root has been
    # created, instead of failing with FileNotFoundError.
    diagnostics.mkdir(parents=True, exist_ok=True)

    warning_lines = (json.dumps(warning.to_dict(), sort_keys=True) + "\n" for warning in warnings)
    (diagnostics / "warnings.jsonl").write_text("".join(warning_lines), encoding="utf-8")

    lines = ["# Unresolved Links", ""]
    if unresolved_links:
        lines.extend(
            f"- Page `{link.get('source_page_id')}`: `{link.get('original')}` ({link.get('warning')})"
            for link in unresolved_links
        )
    else:
        lines.append("No unresolved local links were recorded.")
    (diagnostics / "unresolved-links.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def page_markdown_header(artifact: PageArtifact, *, options: PullOptions) -> str:
|
|
528
|
+
source_url = "<redacted-url>" if options.redact_source_urls else artifact.page.url or ""
|
|
529
|
+
lines = [
|
|
530
|
+
"---",
|
|
531
|
+
f'pull_page_id: "{artifact.page.page_id}"',
|
|
532
|
+
f'title: "{artifact.page.title}"',
|
|
533
|
+
f'space: "{artifact.page.space_key or ""}"',
|
|
534
|
+
f"confluence_version: {artifact.page.version or 'null'}",
|
|
535
|
+
f'retrieved_at: "{datetime.now(UTC).isoformat()}"',
|
|
536
|
+
f'source_url: "{source_url}"',
|
|
537
|
+
f"local_assets: {len(artifact.assets)}",
|
|
538
|
+
f"warnings: {len(artifact.warnings)}",
|
|
539
|
+
"---",
|
|
540
|
+
"",
|
|
541
|
+
f"# {artifact.page.title}",
|
|
542
|
+
"",
|
|
543
|
+
f"> Source: Confluence page `{artifact.page.page_id}`, version {artifact.page.version or 'unknown'}.",
|
|
544
|
+
"",
|
|
545
|
+
]
|
|
546
|
+
if artifact.comments_path and artifact.comments:
|
|
547
|
+
comments_link = relative_path(artifact.index_md, artifact.comments_path)
|
|
548
|
+
lines.extend(
|
|
549
|
+
[
|
|
550
|
+
f"> Comments sidecar: [{len(artifact.comments)} comment(s)]({comments_link}).",
|
|
551
|
+
"",
|
|
552
|
+
]
|
|
553
|
+
)
|
|
554
|
+
return "\n".join(lines)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
def _tool_version() -> str:
    """Return the ``__version__`` attribute of the enclosing package."""
    # Imported inside the function rather than at module import time.
    from . import __version__ as package_version

    return package_version
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _page_names(pages: list[PageArtifact]) -> dict[str, str]:
    """Map each page id to a unique slug derived from its title.

    The slug comes from ``slugify`` with the page id as fallback. When a slug
    is already taken, ``-2``, ``-3``, ... is appended until it is unique.
    """
    assigned: dict[str, str] = {}
    taken: set[str] = set()
    for artifact in pages:
        page = artifact.page
        stem = slugify(page.title, fallback=page.page_id)
        candidate = stem
        suffix = 1
        while candidate in taken:
            suffix += 1
            candidate = f"{stem}-{suffix}"
        taken.add(candidate)
        assigned[page.page_id] = candidate
    return assigned
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def _ai_manifest_paths(
|
|
579
|
+
result: ExtractionResult, *, page_names: dict[str, str] | None = None
|
|
580
|
+
) -> dict[str, str]:
|
|
581
|
+
page_names = page_names or _page_names(result.pages)
|
|
582
|
+
root_name = page_names[result.pages[0].page.page_id] if result.pages else "pulled-confluence"
|
|
583
|
+
reserved = {"manifest", "bundle", "chunks"}
|
|
584
|
+
file_stem = f"{root_name}-ai" if root_name in reserved else root_name
|
|
585
|
+
return {"entry": f"{file_stem}.md", "manifest": f"{file_stem}.yaml"}
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def _ai_asset(asset: AssetRecord) -> dict[str, Any]:
    """Return the compact AI-manifest description of one asset.

    ``name`` is the slugified file stem, falling back to the asset id.
    """
    short_name = slugify(Path(asset.filename).stem, fallback=asset.asset_id)
    return dict(
        name=short_name,
        filename=asset.filename,
        path=asset.local_path,
        media_type=asset.media_type,
        sidecars=asset.sidecars,
    )
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def _warning_counts(warnings: list[WarningRecord]) -> dict[str, int]:
|
|
599
|
+
counts: dict[str, int] = {}
|
|
600
|
+
for warning in warnings:
|
|
601
|
+
counts[warning.code] = counts.get(warning.code, 0) + 1
|
|
602
|
+
return counts
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def _comments_markdown(artifact: PageArtifact, *, options: PullOptions) -> str:
    """Render all comments of a page as one Markdown document.

    The document opens with the page title, page id and comment count,
    followed by one block per comment numbered from 1.
    """
    header = [
        f"# Comments for {artifact.page.title}",
        "",
        f"Page ID: `{artifact.page.page_id}`",
        f"Comment count: {len(artifact.comments)}",
        "",
    ]
    blocks: list[str] = []
    for position, comment in enumerate(artifact.comments, start=1):
        blocks += _comment_markdown_block(position, comment, options=options)
    return "\n".join(header + blocks).rstrip() + "\n"
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def _comment_markdown_block(index: int, comment: CommentRecord, *, options: PullOptions) -> list[str]:
    """Return the Markdown lines for a single comment.

    The block has a heading with the comment id, one bullet per metadata field
    that is neither ``None`` nor empty, then the comment body or a placeholder
    when the body is empty.
    """
    heading = f"## Comment {index}: `{_comment_field(comment.comment_id, options=options)}`"
    fields = (
        ("location", comment.location),
        ("status", comment.status),
        ("resolution", comment.resolution),
        ("version", comment.version),
        ("author", comment.author),
        ("created", comment.created_at),
        ("updated", comment.updated_at),
        ("parent", comment.parent_id),
    )
    bullets = [
        f"- {label}: {_comment_field(value, options=options)}"
        for label, value in fields
        if value is not None and value != ""
    ]
    body = _comment_body_markdown(comment, options=options)
    return [heading, "", *bullets, "", body or "_No comment body returned._", ""]
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def _comment_body_markdown(comment: CommentRecord, *, options: PullOptions) -> str:
    """Convert a comment's HTML body to stripped Markdown.

    The HTML is first passed through ``normalize_html``; any warnings that
    step returns are discarded.
    """
    normalized = normalize_html(
        comment.body_html,
        source_page_id=comment.page_id,
        redact_source_urls=options.redact_source_urls,
    )
    cleaned_html = normalized[0]
    markdown = rendered_html_to_markdown(cleaned_html)
    return markdown.strip()
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def _comment_field(value: object, *, options: PullOptions) -> str:
    """Sanitize a comment metadata value and flatten it to one line.

    A falsy sanitized value becomes an empty string. Newlines are replaced
    with spaces and surrounding whitespace is stripped.
    """
    sanitized = _sanitize_snapshot(value, redact_source_urls=options.redact_source_urls)
    single_line = str(sanitized or "").replace("\n", " ")
    return single_line.strip()
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
def _comment_locations(comments: list[CommentRecord], *, options: PullOptions) -> list[str]:
    """Return the sorted, de-duplicated sanitized locations of the comments.

    Comments with a falsy ``location`` are ignored.
    """
    distinct: set[str] = set()
    for comment in comments:
        if comment.location:
            distinct.add(_comment_field(comment.location, options=options))
    return sorted(distinct)
|
|
663
|
+
|
|
664
|
+
|
|
665
|
+
def _markdown_link_line(label: str, path: object) -> str:
|
|
666
|
+
if not isinstance(path, str) or not path:
|
|
667
|
+
return f"- {label}: unavailable"
|
|
668
|
+
return f"- {label}: [{path}]({path})"
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def _artifact_guidance(result: ExtractionResult, *, options: PullOptions) -> dict[str, Any]:
|
|
672
|
+
navigation_surfaces = ["page index.md files"]
|
|
673
|
+
if result.bundle_path:
|
|
674
|
+
navigation_surfaces.append("bundle.md")
|
|
675
|
+
raw_reference_surfaces = ["page.json"]
|
|
676
|
+
if any(artifact.source_path for artifact in result.pages):
|
|
677
|
+
raw_reference_surfaces.insert(0, "source.storage.xml")
|
|
678
|
+
rendered_reference_surfaces = ["index.html"] if any(artifact.index_html for artifact in result.pages) else []
|
|
679
|
+
if result.bundle_path:
|
|
680
|
+
navigation_rule = "Use page Markdown files and bundle.md for navigation."
|
|
681
|
+
else:
|
|
682
|
+
navigation_rule = "Use page Markdown files for navigation."
|
|
683
|
+
if options.output_mode == "simple":
|
|
684
|
+
rule = (
|
|
685
|
+
f"{navigation_rule} Simple mode keeps control and provenance artifacts available for tooling "
|
|
686
|
+
"without listing them as primary reading targets."
|
|
687
|
+
)
|
|
688
|
+
else:
|
|
689
|
+
rule = (
|
|
690
|
+
f"{navigation_rule} Treat raw reference artifacts as source evidence only; their source links may be "
|
|
691
|
+
"redacted and should not be used to judge rewritten local navigation."
|
|
692
|
+
)
|
|
693
|
+
return {
|
|
694
|
+
"rule": rule,
|
|
695
|
+
"navigation_surfaces": navigation_surfaces,
|
|
696
|
+
"raw_reference_surfaces": raw_reference_surfaces,
|
|
697
|
+
"rendered_reference_surfaces": rendered_reference_surfaces,
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
def _sanitize_snapshot(value: Any, *, redact_source_urls: bool = False) -> Any:
|
|
702
|
+
if isinstance(value, str):
|
|
703
|
+
if ("<" in value and ">" in value) or ("<" in value and ">" in value):
|
|
704
|
+
text = unescape(value)
|
|
705
|
+
normalized, _warnings = normalize_html(text, source_page_id="", redact_source_urls=redact_source_urls)
|
|
706
|
+
redacted = redact_text(normalized)
|
|
707
|
+
return redact_source_url_text(redacted) if redact_source_urls else redacted
|
|
708
|
+
text = redact_text(value)
|
|
709
|
+
if text.startswith(("http://", "https://")):
|
|
710
|
+
sanitized = sanitize_url(text, redact_source_url=redact_source_urls)
|
|
711
|
+
return sanitized or text
|
|
712
|
+
if redact_source_urls:
|
|
713
|
+
return redact_source_url_text(text)
|
|
714
|
+
return text
|
|
715
|
+
if isinstance(value, dict):
|
|
716
|
+
output: dict[str, Any] = {}
|
|
717
|
+
for key, child in value.items():
|
|
718
|
+
key_text = str(key)
|
|
719
|
+
if _is_write_oriented_snapshot_key(key_text):
|
|
720
|
+
continue
|
|
721
|
+
if redact_source_urls and _is_redacted_snapshot_key(key_text):
|
|
722
|
+
continue
|
|
723
|
+
output[key_text] = "<redacted>" if SECRET_KEY_PATTERN.search(key_text) else _sanitize_snapshot(child, redact_source_urls=redact_source_urls)
|
|
724
|
+
return output
|
|
725
|
+
if isinstance(value, list):
|
|
726
|
+
return [_sanitize_snapshot(child, redact_source_urls=redact_source_urls) for child in value]
|
|
727
|
+
return redact_value(value)
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def _is_write_oriented_snapshot_key(key: str) -> bool:
    """Report whether ``key`` is listed in ``WRITE_ORIENTED_SNAPSHOT_KEYS``.

    The key is compared after lower-casing it and removing every character
    other than ``a``-``z`` and ``0``-``9``, so punctuation and case
    differences do not affect the match.
    """
    compact_key = re.sub(r"[^a-z0-9]", "", key.lower())
    is_listed = compact_key in WRITE_ORIENTED_SNAPSHOT_KEYS
    return is_listed
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def _is_redacted_snapshot_key(key: str) -> bool:
    """Report whether ``key`` appears in either redaction key collection.

    The key is lower-cased and stripped of every character other than
    ``a``-``z`` and ``0``-``9`` before being looked up in
    ``REDACTED_SNAPSHOT_KEYS`` and then ``REDACTED_LINK_KEYS``.
    """
    compact_key = re.sub(r"[^a-z0-9]", "", key.lower())
    if compact_key in REDACTED_SNAPSHOT_KEYS:
        return True
    return compact_key in REDACTED_LINK_KEYS
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
def _rebase_bundle_links(markdown: str, *, from_file: str, bundle_file: str) -> str:
    """Rewrite every link matched by ``BUNDLE_LINK_RE`` in ``markdown``.

    Each match is expected to expose three groups (text before the target,
    the raw target, text after it). Only the target is rewritten, via
    ``_rebase_bundle_link_target``; the surrounding text is kept verbatim.
    """

    def _rewrite(found: re.Match[str]) -> str:
        before, link_target, after = found.groups()
        new_target = _rebase_bundle_link_target(
            link_target,
            from_file=from_file,
            bundle_file=bundle_file,
        )
        return f"{before}{new_target}{after}"

    return BUNDLE_LINK_RE.sub(_rewrite, markdown)
|
|
747
|
+
|
|
748
|
+
|
|
749
|
+
def _rebase_bundle_link_target(raw_target: str, *, from_file: str, bundle_file: str) -> str:
|
|
750
|
+
leading = raw_target[: len(raw_target) - len(raw_target.lstrip())]
|
|
751
|
+
trailing = raw_target[len(raw_target.rstrip()) :]
|
|
752
|
+
core = raw_target.strip()
|
|
753
|
+
if not core:
|
|
754
|
+
return raw_target
|
|
755
|
+
|
|
756
|
+
angle_wrapped = core.startswith("<")
|
|
757
|
+
if angle_wrapped:
|
|
758
|
+
end = core.find(">")
|
|
759
|
+
if end == -1:
|
|
760
|
+
return raw_target
|
|
761
|
+
target = core[1:end]
|
|
762
|
+
trailer = core[end + 1 :]
|
|
763
|
+
else:
|
|
764
|
+
target, trailer = _split_markdown_target(core)
|
|
765
|
+
|
|
766
|
+
if _is_external_or_page_local(target):
|
|
767
|
+
return raw_target
|
|
768
|
+
|
|
769
|
+
path_part, marker, fragment = target.partition("#")
|
|
770
|
+
if not path_part:
|
|
771
|
+
return raw_target
|
|
772
|
+
rebased_path = relative_path(bundle_file, Path(from_file).parent / path_part)
|
|
773
|
+
if rebased_path.startswith("../"):
|
|
774
|
+
return raw_target
|
|
775
|
+
rebased_target = f"{rebased_path}{marker}{fragment}"
|
|
776
|
+
if angle_wrapped:
|
|
777
|
+
rebased_target = f"<{rebased_target}>{trailer}"
|
|
778
|
+
else:
|
|
779
|
+
rebased_target = f"{rebased_target}{trailer}"
|
|
780
|
+
return f"{leading}{rebased_target}{trailing}"
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
def _split_markdown_target(core: str) -> tuple[str, str]:
|
|
784
|
+
for marker in (' "', " '", "\t\"", "\t'"):
|
|
785
|
+
if marker in core:
|
|
786
|
+
path, title = core.split(marker, 1)
|
|
787
|
+
return path, f"{marker}{title}"
|
|
788
|
+
return core, ""
|
|
789
|
+
|
|
790
|
+
|
|
791
|
+
def _is_external_or_page_local(target: str) -> bool:
|
|
792
|
+
return target in {"redacted-url", "<redacted-url>"} or target.startswith(("#", "/", "http://", "https://", "mailto:", "jira:"))
|