pull-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pull_cli/__init__.py +5 -0
- pull_cli/__main__.py +6 -0
- pull_cli/assets.py +235 -0
- pull_cli/attachment_extractors.py +85 -0
- pull_cli/cli.py +329 -0
- pull_cli/clients/__init__.py +8 -0
- pull_cli/clients/base.py +29 -0
- pull_cli/clients/cloud_v2.py +132 -0
- pull_cli/clients/data_center.py +360 -0
- pull_cli/clients/hybrid.py +15 -0
- pull_cli/config.py +82 -0
- pull_cli/crawler.py +51 -0
- pull_cli/envelope.py +59 -0
- pull_cli/errors.py +50 -0
- pull_cli/extractor.py +344 -0
- pull_cli/guide.py +115 -0
- pull_cli/html_normalizer.py +111 -0
- pull_cli/links.py +186 -0
- pull_cli/macros.py +527 -0
- pull_cli/markdown_writer.py +24 -0
- pull_cli/models.py +232 -0
- pull_cli/paths.py +45 -0
- pull_cli/resolver.py +72 -0
- pull_cli/security.py +103 -0
- pull_cli/validator.py +398 -0
- pull_cli/writer.py +792 -0
- pull_cli-0.1.0.dist-info/METADATA +218 -0
- pull_cli-0.1.0.dist-info/RECORD +31 -0
- pull_cli-0.1.0.dist-info/WHEEL +4 -0
- pull_cli-0.1.0.dist-info/entry_points.txt +3 -0
- pull_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
pull_cli/envelope.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
import uuid
|
|
7
|
+
from dataclasses import asdict
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from .errors import PullError
|
|
11
|
+
from .models import WarningRecord
|
|
12
|
+
from .security import redact_value
|
|
13
|
+
|
|
14
|
+
# Value emitted under the "schema_version" key of every envelope built by make_envelope().
SCHEMA_VERSION = "1.0"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def wants_json(explicit: bool) -> bool:
    """Decide whether output should be emitted as JSON.

    JSON is selected when the caller asks for it explicitly, or when the
    ``LLM`` environment variable is set to ``true`` (compared
    case-insensitively).
    """
    env_requests_json = os.environ.get("LLM", "").lower() == "true"
    return explicit or env_requests_json
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def request_id() -> str:
    """Build a request identifier: a local-time stamp plus 8 random hex digits.

    The result looks like ``req_YYYYMMDD_HHMMSS_xxxxxxxx``.
    """
    stamp = time.strftime("req_%Y%m%d_%H%M%S_")
    random_suffix = uuid.uuid4().hex[:8]
    return stamp + random_suffix
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def make_envelope(
    *,
    ok: bool,
    command: str,
    target: dict[str, Any] | None = None,
    result: dict[str, Any] | None = None,
    warnings: list[WarningRecord | dict[str, Any]] | None = None,
    errors: list[PullError | dict[str, Any]] | None = None,
    metrics: dict[str, Any] | None = None,
    request_id_value: str | None = None,
) -> dict[str, Any]:
    """Assemble the response envelope dictionary for a command.

    Args:
        ok: Whether the command succeeded.
        command: Name of the command being reported.
        target: Optional target description; an empty dict is used when falsy.
        result: Command result; only kept when ``ok`` is true.
        warnings: ``WarningRecord`` instances (converted with ``to_dict()``)
            or ready-made dicts, which are passed through unchanged.
        errors: ``PullError`` instances (converted with ``to_record()``) or
            ready-made dicts, which are passed through unchanged.
        metrics: Optional metrics; an empty dict is used when falsy.
        request_id_value: Request id to report; a fresh one is generated
            with ``request_id()`` when falsy.

    Returns:
        The envelope as a plain dictionary.
    """
    warning_records = []
    for item in warnings or []:
        if isinstance(item, WarningRecord):
            item = item.to_dict()
        warning_records.append(item)

    error_records = []
    for item in errors or []:
        if isinstance(item, PullError):
            item = item.to_record()
        error_records.append(item)

    envelope: dict[str, Any] = {"schema_version": SCHEMA_VERSION}
    envelope["request_id"] = request_id_value or request_id()
    envelope["ok"] = ok
    envelope["command"] = command
    envelope["target"] = target or {}
    # A failed command never reports a result payload.
    envelope["result"] = result if ok else None
    envelope["warnings"] = warning_records
    envelope["errors"] = error_records
    envelope["metrics"] = metrics or {}
    return envelope
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def emit_json(data: dict[str, Any]) -> None:
    """Print ``data`` to stdout as one compact JSON line.

    The payload is passed through ``redact_value`` before serialization;
    non-ASCII characters are written as-is rather than escaped.
    """
    redacted = redact_value(data)
    serialized = json.dumps(redacted, ensure_ascii=False, separators=(",", ":"))
    print(serialized)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def dataclass_dict(value: Any) -> dict[str, Any]:
    """Convert a dataclass instance into a dictionary via ``dataclasses.asdict``."""
    converted = asdict(value)
    return converted
|
pull_cli/errors.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
EXIT_SUCCESS = 0
|
|
7
|
+
EXIT_VALIDATION = 10
|
|
8
|
+
EXIT_AUTH = 20
|
|
9
|
+
EXIT_SOURCE = 30
|
|
10
|
+
EXIT_STRICT_PARTIAL = 40
|
|
11
|
+
EXIT_IO = 50
|
|
12
|
+
EXIT_INTERNAL = 90
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class PullError(Exception):
|
|
17
|
+
code: str
|
|
18
|
+
message: str
|
|
19
|
+
exit_code: int = EXIT_INTERNAL
|
|
20
|
+
retryable: bool = False
|
|
21
|
+
suggested_action: str | None = None
|
|
22
|
+
details: dict[str, Any] = field(default_factory=dict)
|
|
23
|
+
|
|
24
|
+
def __str__(self) -> str:
|
|
25
|
+
return f"{self.code}: {self.message}"
|
|
26
|
+
|
|
27
|
+
def to_record(self) -> dict[str, Any]:
|
|
28
|
+
return {
|
|
29
|
+
"code": self.code,
|
|
30
|
+
"message": self.message,
|
|
31
|
+
"retryable": self.retryable,
|
|
32
|
+
"suggested_action": self.suggested_action,
|
|
33
|
+
"details": self.details,
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def validation_error(
    code: str,
    message: str,
    *,
    suggested_action: str | None = None,
    details: dict[str, Any] | None = None,
) -> PullError:
    """Create (but do not raise) a ``PullError`` carrying ``EXIT_VALIDATION``.

    Args:
        code: Machine-readable error code.
        message: Human-readable description.
        suggested_action: Optional hint for resolving the problem.
        details: Optional extra context; an empty dict is used when falsy.

    Returns:
        The constructed ``PullError``.
    """
    extra = details or {}
    error = PullError(
        code=code,
        message=message,
        exit_code=EXIT_VALIDATION,
        suggested_action=suggested_action,
        details=extra,
    )
    return error
|
pull_cli/extractor.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from .assets import discover_asset_candidates, download_assets, skipped_asset_warnings
|
|
6
|
+
from .clients.base import ConfluenceClient
|
|
7
|
+
from .crawler import crawl_pages
|
|
8
|
+
from .errors import EXIT_STRICT_PARTIAL, PullError
|
|
9
|
+
from .html_normalizer import normalize_html
|
|
10
|
+
from .links import rewrite_html_links
|
|
11
|
+
from .macros import MacroContext, MacroRegistry
|
|
12
|
+
from .markdown_writer import rendered_html_to_markdown
|
|
13
|
+
from .models import (
|
|
14
|
+
CommentRecord,
|
|
15
|
+
ExtractionResult,
|
|
16
|
+
PageArtifact,
|
|
17
|
+
PageSummary,
|
|
18
|
+
PullOptions,
|
|
19
|
+
WarningRecord,
|
|
20
|
+
)
|
|
21
|
+
from .paths import relative_path, slugify
|
|
22
|
+
from .security import redact_source_url_text, redact_text
|
|
23
|
+
from .writer import (
|
|
24
|
+
page_markdown_header,
|
|
25
|
+
prepare_output_dir,
|
|
26
|
+
write_bundle,
|
|
27
|
+
write_diagnostics,
|
|
28
|
+
write_manifest,
|
|
29
|
+
write_page_artifact,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def extract(
    *,
    client: ConfluenceClient,
    root: PageSummary,
    options: PullOptions,
) -> ExtractionResult:
    """Pull ``root`` (and whatever pages ``crawl_pages`` yields) into ``options.output``.

    For every crawled page this fetches the page, normalizes its rendered
    HTML, downloads assets, rewrites links, converts macros, builds the page
    Markdown and writes the page artifact.  After all pages are processed it
    writes the bundle, optional chunks, diagnostics and the manifest, and
    records a few counters in ``result.metrics``.

    Args:
        client: Client used to fetch pages, attachments and comments.
        root: Summary of the page the crawl starts from.
        options: Pull settings (output location, layout, redaction, ...).

    Returns:
        The populated ``ExtractionResult``.

    Raises:
        PullError: From ``_enforce_strict_macros`` when strict macro handling
            rejects a conversion.  Errors raised by the called helpers are not
            caught here.
    """
    prepare_output_dir(options.output, force=options.force, clean=options.clean)
    summaries = crawl_pages(
        client,
        root,
        tree=options.tree,
        depth=options.depth,
        max_pages=options.max_pages,
    )
    # Package-relative "…/index.md" path for every crawled page, keyed by page id.
    page_paths = _page_paths(summaries, options=options)
    pages_by_id = {summary.page_id: summary for summary in summaries}
    registry = MacroRegistry()
    result = ExtractionResult(
        output_dir=options.output,
        manifest_path=options.output / "manifest.yaml",
        bundle_path=options.output / "bundle.md" if options.write_bundle else None,
        pages=[],
        assets=[],
        warnings=[],
        links=[],
        macros=[],
    )
    for summary in summaries:
        page = client.get_page(summary.page_id)
        # Crawl position always comes from the summary; title and URL only
        # fall back to the summary when the fetched page lacks them.
        page.order = summary.order
        page.depth = summary.depth
        page.parent_id = summary.parent_id
        page.title = page.title or summary.title
        page.url = page.url or summary.url
        page_dir = page_paths[page.page_id].removesuffix("/index.md")
        index_md = f"{page_dir}/index.md"
        index_html = f"{page_dir}/index.html" if options.write_html else None
        source_path = f"{page_dir}/source.storage.xml" if options.write_source and page.body_storage else None
        page_json = f"{page_dir}/page.json"
        comments, comment_warnings = _collect_comments(client, page.page_id, options=options)
        # A comments sidecar path is only recorded when comments were found.
        comments_path = f"{page_dir}/comments.md" if comments else None
        rendered = _select_rendered_body(page.body_view, page.body_export_view, page.body_storage)
        normalized_html, html_warnings = normalize_html(
            rendered,
            source_page_id=page.page_id,
        )
        attachments = client.list_attachments(page.page_id)
        candidates = discover_asset_candidates(
            normalized_html,
            page_id=page.page_id,
            attachments=attachments,
            options=options,
        )
        assets, asset_warnings = download_assets(
            candidates,
            page_id=page.page_id,
            page_assets_dir=options.output / page_dir / "assets",
            page_assets_path=f"{page_dir}/assets",
            client=client,
            extract_attachments=options.extract_attachments,
        )
        if options.no_assets:
            asset_warnings.extend(skipped_asset_warnings(normalized_html, page_id=page.page_id))
        rewritten_html, links, link_warnings = rewrite_html_links(
            normalized_html,
            page=page,
            page_index_path=index_md,
            pages_by_id=pages_by_id,
            page_paths=page_paths,
            assets=assets,
            rewrite_links=options.rewrite_links,
        )
        if options.redact_manifest or options.redact_source_urls:
            _redact_links(links, redact_source_urls=options.redact_source_urls)
        if options.redact_source_urls:
            # Second normalization pass with URL redaction enabled; the
            # warnings it returns are intentionally not collected.
            rewritten_html, _redaction_warnings = normalize_html(
                rewritten_html,
                source_page_id=page.page_id,
                redact_source_urls=True,
            )
        macro_context = MacroContext(
            page_id=page.page_id,
            attachments=attachments,
            options=options,
            child_links=_child_links(page, summaries, page_paths),
        )
        macros = registry.convert_all(page.body_storage, macro_context)
        # May raise PullError and abort the whole pull.
        _enforce_strict_macros(macros, options=options)
        macro_warnings = [warning for macro in macros for warning in macro.warnings]
        visible_markdown = rendered_html_to_markdown(rewritten_html)
        attachment_markdown = _attachment_markdown(assets, page_index_path=index_md)
        if attachment_markdown:
            visible_markdown = visible_markdown.rstrip() + "\n\n" + attachment_markdown + "\n"
        macro_markdown = _macro_recovery_markdown(macros)
        # The artifact is created with empty markdown first because
        # page_markdown_header() takes the artifact itself as input.
        artifact = PageArtifact(
            page=page,
            order=page.order,
            page_dir=page_dir,
            index_md=index_md,
            index_html=index_html,
            source_path=source_path,
            page_json=page_json,
            markdown="",
            html=rewritten_html,
            assets=assets,
            links=links,
            macros=macros,
            warnings=[*html_warnings, *asset_warnings, *link_warnings, *macro_warnings, *comment_warnings],
            comments_path=comments_path,
            comments=comments,
        )
        artifact.markdown = (
            page_markdown_header(artifact, options=options)
            + visible_markdown
            + ("\n\n## Macro Recovery\n\n" + macro_markdown + "\n" if macro_markdown else "")
        )
        write_page_artifact(options.output, artifact, options=options)
        result.pages.append(artifact)
        result.assets.extend(assets)
        result.links.extend(links)
        result.macros.extend(macros)
        result.warnings.extend(artifact.warnings)

    # Links reported in diagnostics: unresolved links plus unresolved anchors.
    unresolved = [
        link.__dict__
        for link in result.links
        if link.status == "unresolved" or link.warning == "W_LINK_ANCHOR_UNRESOLVED"
    ]
    write_bundle(result, root_title=result.pages[0].page.title if result.pages else root.title, options=options)
    if options.write_chunks:
        _write_chunks(result)
    write_diagnostics(options.output, result.warnings, unresolved)
    write_manifest(
        result,
        options=options,
        root_page_id=root.page_id,
        base_url=client.base_url,
        deployment_type=client.deployment_type,
    )
    # Metrics are filled in after the manifest has been written.
    result.metrics["api_calls"] = client.api_calls
    result.metrics["pages"] = len(result.pages)
    result.metrics["assets"] = len(result.assets)
    return result
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _select_rendered_body(view: str | None, export_view: str | None, storage: str | None) -> str:
|
|
179
|
+
return view or export_view or storage or ""
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _macro_recovery_markdown(macros) -> str:
|
|
183
|
+
blocks = [macro.markdown.strip() for macro in macros if macro.markdown and macro.status != "ignored"]
|
|
184
|
+
return "\n\n".join(block for block in blocks if block)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _attachment_markdown(assets, *, page_index_path: str) -> str:
|
|
188
|
+
rows = []
|
|
189
|
+
for asset in assets:
|
|
190
|
+
if asset.attachment_id:
|
|
191
|
+
asset_link = relative_path(page_index_path, asset.local_path)
|
|
192
|
+
sidecars = (
|
|
193
|
+
", ".join(
|
|
194
|
+
f"`{sidecar}` ([open]({relative_path(page_index_path, sidecar)}))"
|
|
195
|
+
for sidecar in asset.sidecars
|
|
196
|
+
)
|
|
197
|
+
or ""
|
|
198
|
+
)
|
|
199
|
+
rows.append(
|
|
200
|
+
"| "
|
|
201
|
+
+ " | ".join(
|
|
202
|
+
[
|
|
203
|
+
asset.filename,
|
|
204
|
+
f"`{asset.local_path}` ([open]({asset_link}))",
|
|
205
|
+
asset.media_type or "",
|
|
206
|
+
sidecars,
|
|
207
|
+
]
|
|
208
|
+
)
|
|
209
|
+
+ " |"
|
|
210
|
+
)
|
|
211
|
+
if not rows:
|
|
212
|
+
return ""
|
|
213
|
+
return "\n".join(
|
|
214
|
+
[
|
|
215
|
+
"## Attachments",
|
|
216
|
+
"",
|
|
217
|
+
"| Filename | Local path | Media type | Extracted sidecars |",
|
|
218
|
+
"| --- | --- | --- | --- |",
|
|
219
|
+
*rows,
|
|
220
|
+
]
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _collect_comments(
|
|
225
|
+
client: ConfluenceClient, page_id: str, *, options: PullOptions
|
|
226
|
+
) -> tuple[list[CommentRecord], list[WarningRecord]]:
|
|
227
|
+
if not options.comments:
|
|
228
|
+
return [], []
|
|
229
|
+
try:
|
|
230
|
+
return _unique_comments(client.list_comments(page_id)), []
|
|
231
|
+
except Exception as exc: # noqa: BLE001
|
|
232
|
+
return [], [
|
|
233
|
+
WarningRecord(
|
|
234
|
+
code="W_COMMENTS_FETCH_FAILED",
|
|
235
|
+
message="Could not fetch Confluence comments for this page.",
|
|
236
|
+
source_page_id=page_id,
|
|
237
|
+
details={"reason": _redacted_warning_reason(exc, options=options)},
|
|
238
|
+
)
|
|
239
|
+
]
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _unique_comments(comments: list[CommentRecord]) -> list[CommentRecord]:
|
|
243
|
+
output: list[CommentRecord] = []
|
|
244
|
+
seen: set[str] = set()
|
|
245
|
+
for comment in comments:
|
|
246
|
+
if comment.comment_id and comment.comment_id in seen:
|
|
247
|
+
continue
|
|
248
|
+
if comment.comment_id:
|
|
249
|
+
seen.add(comment.comment_id)
|
|
250
|
+
output.append(comment)
|
|
251
|
+
return output
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _redacted_warning_reason(exc: Exception, *, options: PullOptions) -> str:
    """Return the exception text, redacted for use in a warning.

    The text always goes through ``redact_text``; ``redact_source_url_text``
    is applied on top when ``options.redact_source_urls`` is set.
    """
    reason = redact_text(str(exc))
    if options.redact_source_urls:
        reason = redact_source_url_text(reason)
    return reason
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _redact_links(links, *, redact_source_urls: bool) -> None:
    """Redact the ``original`` and ``normalized`` fields of each link in place.

    Each value is replaced by ``sanitize_url(...)``; when that returns a
    falsy result, ``redact_text`` of the value is used instead.
    """
    from .security import redact_text, sanitize_url

    def _scrub(url):
        return sanitize_url(url, redact_source_url=redact_source_urls) or redact_text(url)

    for link in links:
        link.original = _scrub(link.original)
        link.normalized = _scrub(link.normalized)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _enforce_strict_macros(macros, *, options: PullOptions) -> None:
|
|
268
|
+
strict = options.macro_policy == "strict" or options.unknown_macro == "error"
|
|
269
|
+
if not strict:
|
|
270
|
+
return
|
|
271
|
+
failures = [
|
|
272
|
+
{
|
|
273
|
+
"macro_id": macro.macro_id,
|
|
274
|
+
"name": macro.name,
|
|
275
|
+
"status": macro.status,
|
|
276
|
+
"warnings": [warning.code for warning in macro.warnings],
|
|
277
|
+
}
|
|
278
|
+
for macro in macros
|
|
279
|
+
if macro.status in {"placeholder", "error"} or macro.warnings
|
|
280
|
+
]
|
|
281
|
+
if failures:
|
|
282
|
+
raise PullError(
|
|
283
|
+
code="ERR_INTERNAL_CONVERSION",
|
|
284
|
+
message="Strict macro policy rejected one or more partial macro conversions.",
|
|
285
|
+
exit_code=EXIT_STRICT_PARTIAL,
|
|
286
|
+
suggested_action="Use --macro-policy expand or --unknown-macro warn to allow placeholders.",
|
|
287
|
+
details={"macros": failures},
|
|
288
|
+
)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _page_paths(summaries: list[PageSummary], *, options: PullOptions) -> dict[str, str]:
|
|
292
|
+
layout = options.layout
|
|
293
|
+
if layout == "auto":
|
|
294
|
+
layout = "nested" if options.tree else "flat"
|
|
295
|
+
paths: dict[str, str] = {}
|
|
296
|
+
by_id = {summary.page_id: summary for summary in summaries}
|
|
297
|
+
for summary in summaries:
|
|
298
|
+
segment = f"{summary.order:04d}-{slugify(summary.title, fallback=summary.page_id)}"
|
|
299
|
+
if layout == "nested" and summary.parent_id and summary.parent_id in paths:
|
|
300
|
+
parent_dir = str(Path(paths[summary.parent_id]).parent).replace("\\", "/")
|
|
301
|
+
paths[summary.page_id] = f"{parent_dir}/{segment}/index.md"
|
|
302
|
+
elif layout == "nested" and summary.parent_id and summary.parent_id in by_id:
|
|
303
|
+
paths[summary.page_id] = f"pages/{segment}/index.md"
|
|
304
|
+
else:
|
|
305
|
+
paths[summary.page_id] = f"pages/{segment}/index.md"
|
|
306
|
+
return paths
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _child_links(
|
|
310
|
+
page: PageSummary, summaries: list[PageSummary], page_paths: dict[str, str]
|
|
311
|
+
) -> list[tuple[str, str]]:
|
|
312
|
+
links = []
|
|
313
|
+
source_index = page_paths.get(page.page_id, "")
|
|
314
|
+
for summary in summaries:
|
|
315
|
+
if summary.parent_id == page.page_id:
|
|
316
|
+
from .paths import relative_path
|
|
317
|
+
|
|
318
|
+
links.append((summary.title, relative_path(source_index, page_paths[summary.page_id])))
|
|
319
|
+
return links
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _write_chunks(result: ExtractionResult) -> None:
|
|
323
|
+
import json
|
|
324
|
+
|
|
325
|
+
chunks_path = result.output_dir / "chunks.jsonl"
|
|
326
|
+
records = []
|
|
327
|
+
for artifact in result.pages:
|
|
328
|
+
paragraphs = [block.strip() for block in artifact.markdown.split("\n\n") if block.strip()]
|
|
329
|
+
for index, paragraph in enumerate(paragraphs, start=1):
|
|
330
|
+
records.append(
|
|
331
|
+
{
|
|
332
|
+
"schema_version": "1.0",
|
|
333
|
+
"chunk_id": f"{artifact.page.page_id}-{index:04d}",
|
|
334
|
+
"page_id": artifact.page.page_id,
|
|
335
|
+
"title": artifact.page.title,
|
|
336
|
+
"source_path": artifact.index_md,
|
|
337
|
+
"order": artifact.order,
|
|
338
|
+
"text": paragraph,
|
|
339
|
+
}
|
|
340
|
+
)
|
|
341
|
+
chunks_path.write_text(
|
|
342
|
+
"".join(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n" for record in records),
|
|
343
|
+
encoding="utf-8",
|
|
344
|
+
)
|
pull_cli/guide.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def guide_payload() -> dict[str, object]:
    """Return the static usage guide as a plain dictionary.

    The payload lists commands and options, the output layout rules, the
    JSON envelope shape, known error and warning codes, example invocations
    and compatibility notes.  A new dictionary is built on every call.
    """
    pull_options = {
        "scope": ["--tree", "--depth N", "--max-pages N"],
        "output": [
            "-o/--output PATH",
            "--output-mode simple|full",
            "--force",
            "--clean",
            "--html/--no-html",
            "--source/--no-source",
            "--bundle/--no-bundle",
        ],
        "assets": ["--assets visible|page|all", "--no-assets", "--extract-attachments", "--diagram-sources"],
        "comments": ["--comments"],
        "links": ["--rewrite-links/--no-rewrite-links", "--follow-includes", "--follow-links same-tree|same-space|none"],
        "agent": ["--json", "LLM=true"],
    }
    pull_command = {
        "usage": "pull PAGE_REF [OPTIONS]",
        "selectors": ["PAGE_REF", "--page-id", "--url", "--space + --title"],
        "resolution_order": [
            "explicit --page-id",
            "explicit --url",
            "positional URL",
            "positional numeric page ID",
            "--space + --title",
        ],
        "options": pull_options,
    }
    commands = {
        "pull": pull_command,
        "validate": {"usage": "pull validate MANIFEST_OR_OUTPUT_DIR [--json]"},
        "guide": {"usage": "pull guide [--json]"},
    }

    mode_rules = {
        "simple": "Default. Agent-facing output is the sanitized root AI Markdown, page index.md files, and downloaded assets/sidecars. Control files are still written for validation/provenance but are not linked from the root AI Markdown.",
        "full": "Writes the current full evidence package: bundle.md, page index.html snapshots, source.storage.xml when available, page.json, manifests, diagnostics, and assets.",
        "overrides": "--bundle/--no-bundle, --html/--no-html, --source/--no-source, and --chunks override mode defaults.",
        "clean": "Use --clean when switching modes if you need the physical output tree to contain only files from the new mode.",
    }
    output_schema = {
        "default_mode": "simple",
        "mode_rules": mode_rules,
        "simple_root_files": [
            "<sanitized-root-page-title>.md",
            "<sanitized-root-page-title>.yaml",
            "manifest.yaml",
            "diagnostics/warnings.jsonl",
            "diagnostics/unresolved-links.md",
        ],
        "simple_per_page": ["index.md", "page.json", "assets/", "comments.md when --comments finds comments"],
        "full_extra_files": ["bundle.md", "pages/*/index.html", "pages/*/source.storage.xml"],
        "per_page": ["index.md", "page.json", "assets/", "comments.md when --comments finds comments"],
        "ai_manifest": "Agent handoff files named from the sanitized root page title, with package-root path rules, hierarchical local page markdown paths, assets, sidecars, and diagnostics pointers.",
        "manifest_paths": "Manifest and AI manifest paths are package-root-relative. Resolve them against the directory containing the root AI Markdown/YAML file, not the shell current working directory.",
        "bundle_links": "Local links in bundle.md are rebased to package-root-relative paths.",
        "comments": "--comments is opt-in. It fetches page and inline comments, writes page-local comments.md sidecars only when comments exist, and links them from agent-facing navigation.",
    }

    json_envelope = {
        "schema_version": "1.0",
        "shape": ["schema_version", "request_id", "ok", "command", "target", "result", "warnings", "errors", "metrics"],
        "failure_result": None,
    }

    error_codes = [
        "ERR_VALIDATION_REQUIRED",
        "ERR_VALIDATION_AMBIGUOUS_PAGE",
        "ERR_VALIDATION_INVALID_URL",
        "ERR_VALIDATION_OUTPUT_EXISTS",
        "ERR_AUTH_REQUIRED",
        "ERR_AUTH_FORBIDDEN",
        "ERR_AUTH_EXPIRED",
        "ERR_SOURCE_PAGE_NOT_FOUND",
        "ERR_SOURCE_BODY_UNAVAILABLE",
        "ERR_SOURCE_TREE_TOO_LARGE",
        "ERR_IO_CONNECTION",
        "ERR_IO_TIMEOUT",
        "ERR_IO_WRITE_FAILED",
        "ERR_INTERNAL_CONVERSION",
        "ERR_INTERNAL_API_RESPONSE",
    ]
    warning_codes = [
        "W_MACRO_UNKNOWN",
        "W_MACRO_PARTIAL",
        "W_MACRO_RENDER_EMPTY",
        "W_ASSET_DOWNLOAD_FAILED",
        "W_ASSET_SKIPPED_BY_POLICY",
        "W_ASSET_DIAGRAM_SOURCE_NOT_FOUND",
        "W_ATTACHMENT_TEXT_EXTRACTION_FAILED",
        "W_LINK_UNRESOLVED",
        "W_LINK_ANCHOR_UNRESOLVED",
        "W_LINK_EXTERNAL_PRESERVED",
        "W_PAGE_SKIPPED_PERMISSION",
        "W_PAGE_SKIPPED_LIMIT",
        "W_COMMENTS_FETCH_FAILED",
        "W_BODY_REPRESENTATION_FALLBACK",
        "W_DYNAMIC_MACRO_SNAPSHOT",
        "W_SANITIZED_HTML",
    ]

    examples = [
        "pull 123456 -o pulled",
        "pull --page-id 123456 --output-mode full -o pulled-full",
        "pull \"https://example.atlassian.net/wiki/spaces/EA/pages/123456/Architecture\" -o pulled",
        "pull --page-id 123456 --tree --depth 2 --assets all -o pulled-tree",
        "pull --page-id 123456 --tree --comments -o pulled-comments",
        "pull validate pulled-tree",
        "LLM=true pull --page-id 123456 --json",
    ]
    compatibility_notes = [
        "PULL_* environment variables take precedence over config file values.",
        "CONFPUB_URL, CONFPUB_USER, CONFPUB_TOKEN, and CONFPUB_SSL_VERIFY are accepted as compatibility fallbacks.",
        "The CLI is read-only and does not call LLM services.",
    ]

    return {
        "schema_version": "1.0",
        "commands": commands,
        "output_schema": output_schema,
        "json_envelope": json_envelope,
        "error_codes": error_codes,
        "warning_codes": warning_codes,
        "examples": examples,
        "compatibility_notes": compatibility_notes,
    }
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from bs4 import BeautifulSoup, NavigableString
|
|
4
|
+
|
|
5
|
+
from .models import WarningRecord
|
|
6
|
+
from .security import SECRET_KEY_PATTERN, redact_source_url_text, redact_text, sanitize_url
|
|
7
|
+
|
|
8
|
+
# CSS selectors whose matching elements normalize_html() removes from the
# rendered page.
# NOTE(review): judging by the class names these are Confluence's attachment
# and label editing widgets — confirm against real rendered pages.
WRITE_UI_SELECTORS = (
    ".plugin_attachments_container",
    ".plugin_attachments_upload_container",
    ".plugin_attachments_table_container",
    ".attachments-table-drop-zone",
    ".download-all-link",
    ".attachment-buttons",
    "table.attachments",
    ".labels-edit-container",
    ".show-labels-editor",
    ".editAttachmentLink",
    ".removeAttachmentLink",
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def normalize_html(
    html: str, *, source_page_id: str, redact_source_urls: bool = False
) -> tuple[str, list[WarningRecord]]:
    """Sanitize rendered page HTML and report whether anything was changed.

    The markup is parsed with the ``lxml`` parser and then, in order:

    1. ``script``, ``style``, ``iframe``, ``object``, ``embed`` and ``form``
       elements are removed.
    2. Elements matching ``WRITE_UI_SELECTORS`` are removed.
    3. ``input`` elements that are hidden, file uploads, or whose name
       matches ``SECRET_KEY_PATTERN`` are removed.
    4. On every remaining element, attributes starting with ``on`` or
       matching ``SECRET_KEY_PATTERN`` are dropped; string attribute values
       are passed through ``sanitize_url`` (source URLs) or ``redact_text``.
    5. ``href`` / ``src`` / ``data-file-src`` values that resolve to a
       ``javascript:`` URL are dropped.
    6. When ``redact_source_urls`` is set, text nodes are passed through
       ``redact_source_url_text``.
    7. Images whose ``src`` is the redaction placeholder and that have no
       accessible label are removed.

    Args:
        html: Rendered HTML; ``None`` or empty is treated as an empty document.
        source_page_id: Page id recorded on the emitted warning.
        redact_source_urls: Whether source URLs should be redacted.

    Returns:
        ``(html, warnings)`` where ``html`` is the serialized ``<body>`` (or
        the whole document when there is no body) and ``warnings`` holds a
        single ``W_SANITIZED_HTML`` record if any step modified the document.
    """
    soup = BeautifulSoup(html or "", "lxml")
    warnings: list[WarningRecord] = []
    # Set by every step that modifies the tree, not only by script removal.
    removed_executable = False
    for tag in soup.find_all(["script", "style", "iframe", "object", "embed", "form"]):
        tag.decompose()
        removed_executable = True
    for selector in WRITE_UI_SELECTORS:
        for tag in soup.select(selector):
            tag.decompose()
            removed_executable = True
    for tag in soup.find_all("input"):
        input_type = str(tag.get("type") or "").lower()
        input_name = str(tag.get("name") or "")
        if input_type == "hidden" or input_type == "file" or SECRET_KEY_PATTERN.search(input_name):
            tag.decompose()
            removed_executable = True
    for tag in soup.find_all(True):
        # Iterate over a copy because attributes are deleted while looping.
        for attr in list(tag.attrs):
            attr_lower = attr.lower()
            if attr_lower.startswith("on") or SECRET_KEY_PATTERN.search(attr_lower):
                del tag.attrs[attr]
                removed_executable = True
                continue
            value = tag.attrs.get(attr)
            if isinstance(value, str):
                redacted = (
                    sanitize_url(value, redact_source_url=redact_source_urls)
                    if _is_source_url(value)
                    else redact_text(value)
                )
                if redacted != value:
                    tag.attrs[attr] = redacted
                    removed_executable = True
        for attr in ("href", "src", "data-file-src"):
            value = tag.get(attr)
            if isinstance(value, str) and _has_javascript_scheme(value):
                del tag.attrs[attr]
                removed_executable = True
                continue
            if isinstance(value, str) and _is_source_url(value):
                sanitized = sanitize_url(value, redact_source_url=redact_source_urls)
                if sanitized != value:
                    tag.attrs[attr] = sanitized
                    removed_executable = True
    if redact_source_urls:
        for node in soup.find_all(string=True):
            if isinstance(node, NavigableString):
                redacted = redact_source_url_text(str(node))
                if redacted != str(node):
                    node.replace_with(redacted)
                    removed_executable = True
    for tag in soup.find_all("img"):
        src = tag.get("src")
        if isinstance(src, str) and _is_redacted_url(src) and not _has_accessible_label(tag):
            tag.decompose()
            removed_executable = True
    if removed_executable:
        warnings.append(
            WarningRecord(
                code="W_SANITIZED_HTML",
                message="Executable or active HTML content was stripped from the rendered page snapshot.",
                source_page_id=source_page_id,
            )
        )
    body = soup.body or soup
    return str(body), warnings


def _has_javascript_scheme(value: str) -> bool:
    """Return True when a browser would resolve ``value`` as a ``javascript:`` URL."""
    # URL parsers discard leading C0-control/space characters and every ASCII
    # tab or newline before reading the scheme, so a value such as
    # "java<TAB>script:alert(1)" still executes.  Apply the same stripping
    # before comparing; str.strip() is kept so everything the previous plain
    # check matched is still matched.
    leading_ignored = "".join(chr(code) for code in range(0x21))
    candidate = value.strip().lstrip(leading_ignored)
    for ignored in ("\t", "\n", "\r"):
        candidate = candidate.replace(ignored, "")
    return candidate.lower().startswith("javascript:")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def soup_from_html(html: str) -> BeautifulSoup:
    """Parse ``html`` with the ``lxml`` parser; falsy input parses as empty."""
    markup = html or ""
    return BeautifulSoup(markup, "lxml")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _is_source_url(value: str) -> bool:
|
|
99
|
+
return value.strip().lower().startswith(("http://", "https://", "//", "/wiki/", "/download/"))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _is_redacted_url(value: str) -> bool:
|
|
103
|
+
return value.strip().lower() in {"<redacted-url>", "<redacted-url>"}
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _has_accessible_label(tag) -> bool:
|
|
107
|
+
for attr in ("alt", "title", "aria-label"):
|
|
108
|
+
value = tag.get(attr)
|
|
109
|
+
if isinstance(value, str) and value.strip():
|
|
110
|
+
return True
|
|
111
|
+
return False
|