pull-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pull_cli/__init__.py +5 -0
- pull_cli/__main__.py +6 -0
- pull_cli/assets.py +235 -0
- pull_cli/attachment_extractors.py +85 -0
- pull_cli/cli.py +329 -0
- pull_cli/clients/__init__.py +8 -0
- pull_cli/clients/base.py +29 -0
- pull_cli/clients/cloud_v2.py +132 -0
- pull_cli/clients/data_center.py +360 -0
- pull_cli/clients/hybrid.py +15 -0
- pull_cli/config.py +82 -0
- pull_cli/crawler.py +51 -0
- pull_cli/envelope.py +59 -0
- pull_cli/errors.py +50 -0
- pull_cli/extractor.py +344 -0
- pull_cli/guide.py +115 -0
- pull_cli/html_normalizer.py +111 -0
- pull_cli/links.py +186 -0
- pull_cli/macros.py +527 -0
- pull_cli/markdown_writer.py +24 -0
- pull_cli/models.py +232 -0
- pull_cli/paths.py +45 -0
- pull_cli/resolver.py +72 -0
- pull_cli/security.py +103 -0
- pull_cli/validator.py +398 -0
- pull_cli/writer.py +792 -0
- pull_cli-0.1.0.dist-info/METADATA +218 -0
- pull_cli-0.1.0.dist-info/RECORD +31 -0
- pull_cli-0.1.0.dist-info/WHEEL +4 -0
- pull_cli-0.1.0.dist-info/entry_points.txt +3 -0
- pull_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
pull_cli/__init__.py
ADDED
pull_cli/__main__.py
ADDED
pull_cli/assets.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import mimetypes
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from urllib.parse import unquote, urlsplit
|
|
9
|
+
|
|
10
|
+
from bs4 import BeautifulSoup
|
|
11
|
+
|
|
12
|
+
from .attachment_extractors import extract_text_sidecar, write_extracted_markdown
|
|
13
|
+
from .clients.base import ConfluenceClient
|
|
14
|
+
from .models import AssetRecord, AssetReference, AttachmentRecord, PullOptions, WarningRecord
|
|
15
|
+
from .paths import safe_filename, unique_name
|
|
16
|
+
from .security import sanitize_url
|
|
17
|
+
|
|
18
|
+
# Matches attachment download paths of the form
# "/download/attachments/<page_id>/<filename>". The filename group runs up to
# the first "?" or "#", so a trailing query string or fragment is excluded.
ATTACHMENT_PATH_RE = re.compile(r"/download/attachments/(?P<page_id>[^/]+)/(?P<filename>[^?#]+)")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class AssetCandidate:
    """One downloadable reference discovered for a page.

    Candidates are produced by ``discover_asset_candidates`` and consumed by
    ``download_assets``.
    """

    # Reference exactly as found: an <img> src, an <a> href, or an
    # attachment's download URL / web URL / filename.
    original: str
    # How the candidate was found: "visible-image", "linked-attachment",
    # or "page-attachment".
    role: str
    # Where the reference came from: "src", "href", or "attachment".
    html_attribute: str
    # Matching attachment record, when the reference resolved to one.
    attachment: AttachmentRecord | None = None
    # Filename derived from the reference or taken from the attachment.
    filename: str | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def discover_asset_candidates(
    html: str,
    *,
    page_id: str,
    attachments: list[AttachmentRecord],
    options: PullOptions,
) -> list[AssetCandidate]:
    """Collect the assets referenced by a page that should be downloaded.

    Three sources are scanned, in this order:

    1. ``<img src>`` values that are neither external nor Confluence chrome.
    2. ``<a href>`` values that resolve to a known attachment or look like an
       attachment download path.
    3. Every attachment of the page, when the asset policy is "page" or "all".

    Candidates are de-duplicated through ``_add_candidate``. ``page_id`` is
    accepted for interface symmetry but is not read here.

    Returns an empty list when ``options.no_assets`` is set.
    """
    if options.no_assets:
        return []

    document = BeautifulSoup(html or "", "lxml")

    # Lookup tables: attachments by lowercase filename and by normalized URL.
    by_name = {record.filename.lower(): record for record in attachments}
    by_url = {
        _normalize_url(record.download_url): record
        for record in attachments
        if record.download_url
    }

    found: list[AssetCandidate] = []
    dedupe_keys: set[tuple[str, str]] = set()

    def lookup(reference: str, name: str) -> AttachmentRecord | None:
        # Prefer an exact URL match, then fall back to the filename.
        return by_url.get(_normalize_url(reference)) or by_name.get(name.lower())

    # 1. Images embedded in the page body.
    for image in document.find_all("img"):
        src = image.get("src")
        if not isinstance(src, str):
            continue
        if _is_external(src) or _is_confluence_chrome_asset(src):
            continue
        name = _filename_from_url(src)
        _add_candidate(
            found,
            dedupe_keys,
            AssetCandidate(src, "visible-image", "src", lookup(src, name), name),
        )

    # 2. Links pointing at attachments.
    for anchor in document.find_all("a"):
        href = anchor.get("href")
        if not isinstance(href, str):
            continue
        name = _filename_from_url(href)
        matched = lookup(href, name)
        if not matched and not ATTACHMENT_PATH_RE.search(href):
            continue
        _add_candidate(
            found,
            dedupe_keys,
            AssetCandidate(href, "linked-attachment", "href", matched, name),
        )

    # 3. All page attachments, when the policy asks for them.
    if options.asset_policy in {"page", "all"}:
        for record in attachments:
            source = record.download_url or record.web_url or record.filename
            _add_candidate(
                found,
                dedupe_keys,
                AssetCandidate(source, "page-attachment", "attachment", record, record.filename),
            )

    return found
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def download_assets(
    candidates: list[AssetCandidate],
    *,
    page_id: str,
    page_assets_dir: Path,
    page_assets_path: str,
    client: ConfluenceClient,
    extract_attachments: bool = False,
) -> tuple[list[AssetRecord], list[WarningRecord]]:
    """Download every candidate into ``page_assets_dir`` and describe the results.

    Each candidate is fetched through ``client`` (as an attachment when one is
    linked, otherwise by URL), written to disk under a sanitized, unique
    filename, and hashed with SHA-256. When ``extract_attachments`` is true a
    text sidecar is attempted for each downloaded file.

    Download failures and sidecar extraction failures are recorded as warnings
    and do not abort the loop; an error while writing the file is not caught.

    Returns ``(asset records, warnings)``. ``page_assets_path`` is the prefix
    used for the ``local_path`` and sidecar paths stored in the records.
    """
    records: list[AssetRecord] = []
    problems: list[WarningRecord] = []
    taken_names: set[str] = set()
    page_assets_dir.mkdir(parents=True, exist_ok=True)

    for position, candidate in enumerate(candidates, start=1):
        linked = candidate.attachment

        if linked:
            requested_name = linked.filename
        else:
            requested_name = candidate.filename or "asset"
        filename = unique_name(safe_filename(requested_name), taken_names)

        try:
            if linked:
                payload = client.download_attachment(linked)
            else:
                payload = client.download_url(candidate.original)
        except Exception as exc:  # noqa: BLE001
            # Best effort: report the failure and move on to the next asset.
            problems.append(
                WarningRecord(
                    code="W_ASSET_DOWNLOAD_FAILED",
                    message=f"Could not download asset {filename}.",
                    source_page_id=page_id,
                    details={"source_url": sanitize_url(candidate.original), "reason": str(exc)},
                )
            )
            continue

        destination = page_assets_dir / filename
        destination.write_bytes(payload)
        checksum = hashlib.sha256(payload).hexdigest()

        if linked:
            media_type = linked.media_type
        else:
            media_type = mimetypes.guess_type(filename)[0]

        sidecar_paths: list[str] = []
        if extract_attachments:
            try:
                text = extract_text_sidecar(destination)
                if text:
                    written = write_extracted_markdown(destination, text)
                    sidecar_paths.append(f"{page_assets_path}/{written.name}")
            except Exception as exc:  # noqa: BLE001
                problems.append(
                    WarningRecord(
                        code="W_ATTACHMENT_TEXT_EXTRACTION_FAILED",
                        message=f"Could not extract text sidecar for {filename}.",
                        source_page_id=page_id,
                        details={"reason": str(exc), "filename": filename},
                    )
                )

        source_url = sanitize_url(linked.download_url if linked else candidate.original)
        reference = AssetReference(
            page_id=page_id,
            html_attribute=candidate.html_attribute,
            original=sanitize_url(candidate.original) or candidate.original,
        )
        records.append(
            AssetRecord(
                asset_id=f"asset-{page_id}-{position}",
                source_page_id=page_id,
                attachment_id=linked.attachment_id if linked else None,
                filename=filename,
                media_type=media_type,
                local_path=f"{page_assets_path}/{filename}",
                sha256=checksum,
                role=candidate.role,
                source_url=source_url,
                references=[reference],
                sidecars=sidecar_paths,
            )
        )

    return records, problems
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def skipped_asset_warnings(html: str, *, page_id: str) -> list[WarningRecord]:
    """Build one "skipped by policy" warning per asset reference in ``html``.

    Every ``<img>`` with a string ``src`` yields a warning, followed by every
    ``<a>`` whose ``href`` matches the attachment download path pattern.
    """
    document = BeautifulSoup(html or "", "lxml")

    def skipped(message: str, url: str) -> WarningRecord:
        return WarningRecord(
            code="W_ASSET_SKIPPED_BY_POLICY",
            message=message,
            source_page_id=page_id,
            details={"source_url": sanitize_url(url)},
        )

    image_sources = (image.get("src") for image in document.find_all("img"))
    image_warnings = [
        skipped("Image asset download was skipped by --no-assets.", src)
        for src in image_sources
        if isinstance(src, str)
    ]

    link_targets = (anchor.get("href") for anchor in document.find_all("a"))
    link_warnings = [
        skipped("Attachment download was skipped by --no-assets.", href)
        for href in link_targets
        if isinstance(href, str) and ATTACHMENT_PATH_RE.search(href)
    ]

    # Image warnings first, then attachment-link warnings.
    return image_warnings + link_warnings
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _add_candidate(
|
|
191
|
+
candidates: list[AssetCandidate], seen: set[tuple[str, str]], candidate: AssetCandidate
|
|
192
|
+
) -> None:
|
|
193
|
+
key_value = (
|
|
194
|
+
f"attachment:{candidate.attachment.attachment_id}"
|
|
195
|
+
if candidate.attachment
|
|
196
|
+
else _normalize_url(candidate.original)
|
|
197
|
+
)
|
|
198
|
+
key = (key_value, "asset")
|
|
199
|
+
if key in seen:
|
|
200
|
+
return
|
|
201
|
+
seen.add(key)
|
|
202
|
+
candidates.append(candidate)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _filename_from_url(url: str) -> str:
    """Derive a filename from an asset URL.

    For attachment download paths the filename is the part after
    ``/download/attachments/<page_id>/``; otherwise it is the last path
    segment. Query string and fragment are ignored. Falls back to "asset"
    when the path has no final segment.

    The pattern is matched against the raw (still percent-encoded) path and
    the result is decoded exactly once. Decoding before matching and then
    decoding the match again would turn ``a%2541.png`` into ``aA.png`` and
    would cut a name at an encoded ``?`` or ``#``.
    """
    raw_path = urlsplit(url).path
    match = ATTACHMENT_PATH_RE.search(raw_path)
    if match:
        return unquote(match.group("filename"))
    return Path(unquote(raw_path)).name or "asset"
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _normalize_url(url: str | None) -> str:
|
|
215
|
+
if not url:
|
|
216
|
+
return ""
|
|
217
|
+
parsed = urlsplit(url)
|
|
218
|
+
return unquote(parsed.path).lower()
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _is_external(url: str) -> bool:
|
|
222
|
+
return url.startswith(("http://", "https://")) and "/download/attachments/" not in url
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _is_confluence_chrome_asset(url: str) -> bool:
|
|
226
|
+
parsed = urlsplit(url)
|
|
227
|
+
return any(
|
|
228
|
+
marker in parsed.path
|
|
229
|
+
for marker in (
|
|
230
|
+
"/images/icons/",
|
|
231
|
+
"/s/",
|
|
232
|
+
"/download/resources/",
|
|
233
|
+
"/plugins/servlet/",
|
|
234
|
+
)
|
|
235
|
+
)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# Lowercase file suffixes (with the leading dot) that extract_text_sidecar
# reads directly as text instead of handing to a format-specific extractor.
TEXT_SUFFIXES = {".txt", ".md", ".csv", ".tsv", ".json", ".xml", ".svg", ".log", ".yaml", ".yml"}
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def extract_text_sidecar(path: Path) -> str | None:
    """Extract plain text from the file at ``path`` based on its suffix.

    Plain-text suffixes are read directly; ``.pdf``, ``.docx``, ``.xlsx`` and
    ``.pptx`` go through their dedicated extractors. The suffix comparison is
    case-insensitive. Returns ``None`` for any other suffix, or whatever the
    chosen extractor returns.
    """
    extension = path.suffix.lower()
    if extension in TEXT_SUFFIXES:
        return _read_text(path)

    document_extractors = {
        ".pdf": _extract_pdf,
        ".docx": _extract_docx,
        ".xlsx": _extract_xlsx,
        ".pptx": _extract_pptx,
    }
    extractor = document_extractors.get(extension)
    if extractor is None:
        return None
    return extractor(path)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def write_extracted_markdown(path: Path, text: str) -> Path:
    """Write ``text`` to a Markdown sidecar next to ``path`` and return it.

    The sidecar is named ``<stem>.extracted.md`` and starts with a heading
    naming the source file; ``text`` is stripped of surrounding whitespace.

    NOTE(review): the name is built from the stem only, so two files that
    differ only by suffix (for example ``report.pdf`` and ``report.docx``)
    map to the same sidecar path.
    """
    heading = f"# Extracted Text: {path.name}"
    body = text.strip()
    sidecar = path.with_name(path.stem + ".extracted.md")
    sidecar.write_text(f"{heading}\n\n{body}\n", encoding="utf-8")
    return sidecar
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _read_text(path: Path) -> str:
|
|
31
|
+
return path.read_text(encoding="utf-8", errors="replace")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _extract_pdf(path: Path) -> str | None:
|
|
35
|
+
try:
|
|
36
|
+
from pypdf import PdfReader
|
|
37
|
+
except ImportError:
|
|
38
|
+
return None
|
|
39
|
+
reader = PdfReader(str(path))
|
|
40
|
+
return "\n\n".join(page.extract_text() or "" for page in reader.pages).strip() or None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _extract_docx(path: Path) -> str | None:
|
|
44
|
+
try:
|
|
45
|
+
import docx
|
|
46
|
+
except ImportError:
|
|
47
|
+
return None
|
|
48
|
+
document = docx.Document(str(path))
|
|
49
|
+
return "\n".join(paragraph.text for paragraph in document.paragraphs if paragraph.text).strip() or None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _extract_xlsx(path: Path) -> str | None:
|
|
53
|
+
try:
|
|
54
|
+
import openpyxl
|
|
55
|
+
except ImportError:
|
|
56
|
+
return None
|
|
57
|
+
workbook = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
|
|
58
|
+
lines: list[str] = []
|
|
59
|
+
for sheet in workbook.worksheets:
|
|
60
|
+
lines.append(f"## Sheet: {sheet.title}")
|
|
61
|
+
for row in sheet.iter_rows(values_only=True):
|
|
62
|
+
values = ["" if value is None else str(value) for value in row]
|
|
63
|
+
if any(values):
|
|
64
|
+
lines.append("\t".join(values))
|
|
65
|
+
return "\n".join(lines).strip() or None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _extract_pptx(path: Path) -> str | None:
|
|
69
|
+
try:
|
|
70
|
+
from pptx import Presentation
|
|
71
|
+
except ImportError:
|
|
72
|
+
return None
|
|
73
|
+
presentation = Presentation(str(path))
|
|
74
|
+
lines: list[str] = []
|
|
75
|
+
for slide_number, slide in enumerate(presentation.slides, start=1):
|
|
76
|
+
lines.append(f"## Slide {slide_number}")
|
|
77
|
+
for shape in slide.shapes:
|
|
78
|
+
if hasattr(shape, "text") and shape.text:
|
|
79
|
+
lines.append(shape.text)
|
|
80
|
+
if getattr(shape, "has_table", False):
|
|
81
|
+
rows = []
|
|
82
|
+
for row in shape.table.rows:
|
|
83
|
+
rows.append([cell.text for cell in row.cells])
|
|
84
|
+
lines.append(json.dumps(rows, ensure_ascii=False))
|
|
85
|
+
return "\n".join(lines).strip() or None
|
pull_cli/cli.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import Sequence
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from . import __version__
|
|
11
|
+
from .clients import build_client
|
|
12
|
+
from .config import resolve_config
|
|
13
|
+
from .envelope import emit_json, make_envelope, wants_json
|
|
14
|
+
from .errors import EXIT_INTERNAL, EXIT_SUCCESS, EXIT_VALIDATION, PullError
|
|
15
|
+
from .extractor import extract
|
|
16
|
+
from .guide import guide_payload
|
|
17
|
+
from .models import PullOptions, TargetSelection
|
|
18
|
+
from .resolver import resolve_target
|
|
19
|
+
from .security import sanitize_url
|
|
20
|
+
from .validator import validate_package
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PullArgumentParser(argparse.ArgumentParser):
    """Argument parser that can report usage errors as a JSON envelope.

    ``json_mode`` selects the JSON error path; ``command`` is the command name
    written into that envelope.
    """

    def __init__(self, *args, json_mode: bool = False, command: str = "pull", **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.command = command
        self.json_mode = json_mode

    def error(self, message: str) -> None:
        """Report an argument error and exit.

        Outside JSON mode this defers to argparse's own handling. In JSON mode
        it emits an error envelope and raises ``SystemExit(EXIT_VALIDATION)``.
        """
        if not self.json_mode:
            super().error(message)
            return

        failure = PullError(
            code="ERR_VALIDATION_INVALID_ARGUMENT",
            message=message,
            exit_code=EXIT_VALIDATION,
            suggested_action="Run pull --help or pull guide --json for valid arguments.",
            details={"argument_error": message},
        )
        envelope = make_envelope(ok=False, command=self.command, errors=[failure])
        emit_json(envelope)
        raise SystemExit(EXIT_VALIDATION)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the CLI and return its process exit code.

    The first argument selects the subcommand: ``validate``, ``guide`` or
    ``version``; anything else is handled as a pull. ``argv`` defaults to
    ``sys.argv[1:]``.

    A ``PullError`` is reported (as a JSON envelope when JSON output is
    requested, otherwise on stderr) and its exit code returned. Ctrl-C returns
    130. Any other exception is reported as an internal error and returns
    ``EXIT_INTERNAL``.
    """
    args = list(argv if argv is not None else sys.argv[1:])
    # Name the subcommand that actually ran in error envelopes, matching what
    # PullArgumentParser reports for argument errors.
    command = args[0] if args and args[0] in ("validate", "guide") else "pull"
    try:
        if command == "validate":
            return _main_validate(args[1:])
        if command == "guide":
            return _main_guide(args[1:])
        if args and args[0] == "version":
            print(f"pull-cli {__version__}")
            return EXIT_SUCCESS
        return _main_pull(args)
    except PullError as exc:
        if _argv_wants_json(args):
            emit_json(make_envelope(ok=False, command=command, errors=[exc]))
        else:
            print(f"{exc.code}: {exc.message}", file=sys.stderr)
            if exc.suggested_action:
                print(f"Suggested action: {exc.suggested_action}", file=sys.stderr)
        return exc.exit_code
    except KeyboardInterrupt:
        return 130
    except Exception as exc:  # noqa: BLE001
        # Top-level boundary: turn any unexpected failure into a structured error.
        error = PullError(
            code="ERR_INTERNAL_CONVERSION",
            message="An internal error occurred.",
            exit_code=EXIT_INTERNAL,
            details={"reason": str(exc)},
        )
        if _argv_wants_json(args):
            emit_json(make_envelope(ok=False, command=command, errors=[error]))
        else:
            print(f"{error.code}: {error.message}", file=sys.stderr)
        return EXIT_INTERNAL
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _main_pull(argv: Sequence[str]) -> int:
    """Parse pull arguments, run the extraction, and report the outcome.

    Builds the connection config, target selection and pull options from the
    parsed arguments, resolves the target page, and extracts it. The client is
    always closed afterwards. The elapsed time is stored in the result metrics
    as ``duration_ms``. Output is a JSON envelope in JSON mode, otherwise a
    one-line summary. Returns ``EXIT_SUCCESS``.
    """
    parsed = _pull_parser(json_mode=_argv_wants_json(argv)).parse_args(argv)
    json_mode = wants_json(parsed.json)
    started = time.perf_counter()

    config = resolve_config(
        base_url=parsed.base_url,
        user=parsed.user,
        token=parsed.token,
        cloud_id=parsed.cloud_id,
        ssl_verify=parsed.ssl_verify,
        config_path=parsed.config,
    )
    selection = TargetSelection(
        positional=parsed.page_ref,
        page_id=parsed.page_id,
        url=parsed.url,
        space=parsed.space,
        title=parsed.title,
    )
    options = PullOptions(
        output=Path(parsed.output),
        force=parsed.force,
        clean=parsed.clean,
        tree=parsed.tree,
        depth=parsed.depth,
        max_pages=parsed.max_pages,
        layout=parsed.layout,
        output_mode=parsed.output_mode,
        write_bundle=parsed.bundle,
        write_html=parsed.html,
        write_source=parsed.source,
        write_chunks=parsed.chunks,
        asset_policy=parsed.assets,
        no_assets=parsed.no_assets,
        extract_attachments=parsed.extract_attachments,
        comments=parsed.comments,
        diagram_sources=parsed.diagram_sources,
        render_mode=parsed.render_mode,
        macro_policy=parsed.macro_policy,
        unknown_macro=parsed.unknown_macro,
        rewrite_links=parsed.rewrite_links,
        follow_includes=parsed.follow_includes,
        follow_links=parsed.follow_links,
        include_non_page_children=parsed.include_non_page_children,
        redact_source_urls=parsed.redact_source_urls,
        redact_manifest=parsed.redact_manifest,
        strict=parsed.strict,
    )

    client = build_client(config)
    try:
        root = resolve_target(selection, client)
        result = extract(client=client, root=root, options=options)
    finally:
        client.close()

    elapsed_seconds = time.perf_counter() - started
    result.metrics["duration_ms"] = int(elapsed_seconds * 1000)

    # Report the first pulled page as the target, or the resolved root when
    # nothing was pulled.
    reported = result.pages[0].page if result.pages else root
    page_count = len(result.pages)
    asset_count = len(result.assets)
    warning_count = len(result.warnings)

    payload = make_envelope(
        ok=True,
        command="pull",
        target={
            "page_id": reported.page_id,
            "url": sanitize_url(reported.url, redact_source_url=options.redact_source_urls),
        },
        result={
            "output_dir": str(result.output_dir),
            "manifest": str(result.manifest_path),
            "ai_entry": str(result.ai_entry_path) if result.ai_entry_path else None,
            "bundle": str(result.bundle_path) if result.bundle_path else None,
            "output_mode": options.output_mode,
            "pages": page_count,
            "assets": asset_count,
            "warnings": warning_count,
        },
        warnings=result.warnings,
        metrics=result.metrics,
    )

    if json_mode:
        emit_json(payload)
        return EXIT_SUCCESS

    print(
        f"Pulled {page_count} page(s), {asset_count} asset(s), "
        f"{warning_count} warning(s) into {result.output_dir}"
    )
    return EXIT_SUCCESS
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _main_validate(argv: Sequence[str]) -> int:
    """Validate a pulled package and report the outcome.

    A missing path argument is reported as ``ERR_VALIDATION_REQUIRED`` and
    returns ``EXIT_VALIDATION``. Otherwise the package is validated and the
    result is emitted as a JSON envelope (JSON mode) or as human-readable
    lines. Returns 0 when validation passed and 10 when it did not.
    """
    parser = PullArgumentParser(
        prog="pull validate",
        description="Validate a pulled Confluence package.",
        json_mode=_argv_wants_json(argv),
        command="validate",
    )
    parser.add_argument("path", nargs="?", metavar="MANIFEST_OR_OUTPUT_DIR")
    parser.add_argument("--json", action="store_true", help="Emit a structured JSON envelope.")
    parsed = parser.parse_args(argv)
    json_mode = wants_json(parsed.json)

    if not parsed.path:
        missing = PullError(
            code="ERR_VALIDATION_REQUIRED",
            message="Missing required MANIFEST_OR_OUTPUT_DIR argument.",
            exit_code=EXIT_VALIDATION,
            suggested_action="Pass an output directory or manifest.yaml path.",
        )
        payload = make_envelope(ok=False, command="validate", target={}, errors=[missing])
        if json_mode:
            emit_json(payload)
        else:
            print(f"{missing.code}: {missing.message}", file=sys.stderr)
            print("Usage: pull validate MANIFEST_OR_OUTPUT_DIR [--json]", file=sys.stderr)
        return EXIT_VALIDATION

    validation = validate_package(Path(parsed.path))
    warning_count = len(validation.warnings)
    payload = make_envelope(
        ok=validation.ok,
        command="validate",
        target={"path": parsed.path},
        result={
            "manifest": str(validation.manifest_path),
            "output_dir": str(validation.output_dir),
            "errors": len(validation.errors),
            "warnings": warning_count,
            "validation_warnings": warning_count,
            "package_warnings": validation.metrics.get("package_warnings", 0),
        },
        warnings=validation.warnings,
        errors=validation.errors,
        metrics=validation.metrics,
    )

    if json_mode:
        emit_json(payload)
    elif validation.ok:
        print(
            f"Validation passed for {validation.manifest_path} "
            f"({validation.metrics.get('pages', 0)} page(s), {validation.metrics.get('assets', 0)} asset(s))."
        )
    else:
        for problem in validation.errors:
            print(f"{problem['code']}: {problem['message']}", file=sys.stderr)
            details = problem.get("details", {})
            # Print whichever locating details are present, in a fixed order.
            for key in ("file", "link", "resolution_base", "candidate_path"):
                value = details.get(key)
                if value:
                    print(f" {key}: {value}", file=sys.stderr)
        print("Rerun with --json for the structured validation envelope.", file=sys.stderr)

    return 0 if validation.ok else 10
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _main_guide(argv: Sequence[str]) -> int:
    """Print the CLI guide and return ``EXIT_SUCCESS``.

    In JSON mode the guide payload is printed as compact JSON; otherwise a
    fixed human-readable summary is printed.
    """
    parser = PullArgumentParser(
        prog="pull guide",
        description="Emit agent-readable CLI and output schema.",
        json_mode=_argv_wants_json(argv),
        command="guide",
    )
    parser.add_argument("--json", action="store_true", help="Emit only the guide payload as JSON.")
    parsed = parser.parse_args(argv)
    payload = guide_payload()

    if wants_json(parsed.json):
        print(json.dumps(payload, ensure_ascii=False, separators=(",", ":")))
        return EXIT_SUCCESS

    summary = (
        "pull-cli guide",
        "Commands: pull PAGE_REF [OPTIONS], pull validate PATH, pull guide --json",
        "Use --json or LLM=true for stable agent envelopes on pull/validate.",
        "Recommended agent flow: pull guide --json, pull ... --json, pull validate <output-dir> --json.",
        "Default output mode is simple: root AI Markdown, page Markdown, assets, and validation control files.",
        "Use --output-mode full for bundle.md, page HTML snapshots, and source.storage.xml; use --clean to remove stale files when switching modes.",
        "Start analysis from <sanitized-root-page-title>.md in the output package.",
        "Manifest and AI manifest paths are package-root-relative; page links are page-file-relative.",
        "Run pull guide --json for the full machine-readable schema, error codes, and warning codes.",
    )
    for line in summary:
        print(line)
    return EXIT_SUCCESS
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _pull_parser(*, json_mode: bool = False) -> argparse.ArgumentParser:
    """Build the argument parser for the ``pull`` command.

    ``json_mode`` is forwarded to ``PullArgumentParser`` so that argument
    errors are reported as JSON envelopes when requested.
    """
    parser = PullArgumentParser(
        prog="pull",
        description="Pull Confluence pages into local AI-consumable evidence packages.",
        epilog=(
            "Commands: pull PAGE_REF [OPTIONS]; pull validate MANIFEST_OR_OUTPUT_DIR [--json]; "
            "pull guide [--json]; pull version. "
            "Default output mode is simple; use --output-mode full for bundle/html/source artifacts. "
            "Agent flow: pull guide --json, pull ... --json, pull validate <output-dir> --json."
        ),
        json_mode=json_mode,
        command="pull",
    )
    add = parser.add_argument

    def flag(name: str, **extra) -> None:
        # Plain on/off switch.
        add(name, action="store_true", **extra)

    def toggle(name: str, dest: str, default: bool | None) -> None:
        # Paired --name / --no-name switch.
        add(name, dest=dest, action=argparse.BooleanOptionalAction, default=default)

    # Target selection.
    add("page_ref", nargs="?", metavar="PAGE_REF", help="Confluence page ID or page URL.")
    add("--page-id", dest="page_id", help="Confluence page/content ID.")
    add("--url", help="Confluence page URL.")
    add("--space", help="Confluence space key, used with --title.")
    add("--title", help="Confluence page title, used with --space.")

    # Tree traversal.
    flag("--tree", help="Pull descendant page hierarchy.")
    add("--depth", type=int, help="Tree depth limit; 0 equals single page.")
    add("--max-pages", type=int, default=500, help="Safety cap for tree pulls.")
    flag("--include-non-page-children")

    # Output location and artifacts.
    add("-o", "--output", default="pulled-confluence", help="Output directory.")
    flag("--force", help="Overwrite files in an existing output directory.")
    flag("--clean", help="Delete stale files in the output directory first.")
    add("--layout", choices=["auto", "nested", "flat"], default="auto")
    add(
        "--output-mode",
        choices=["simple", "full"],
        default="simple",
        help="Output artifact profile. simple writes quiet agent-facing Markdown by default; full writes all evidence artifacts.",
    )
    toggle("--bundle", "bundle", None)
    toggle("--html", "html", None)
    toggle("--source", "source", None)
    flag("--chunks")

    # Assets and attachments.
    add("--assets", choices=["visible", "page", "all"], default="visible")
    flag("--no-assets")
    flag("--extract-attachments")
    flag("--comments", help="Fetch page and inline comments into page-local comments.md sidecars.")
    flag("--diagram-sources")

    # Rendering and macros.
    add("--render-mode", choices=["hybrid", "view", "export-view", "styled-view", "storage"], default="hybrid")
    add("--macro-policy", choices=["expand", "placeholder", "strict"], default="expand")
    add("--unknown-macro", choices=["warn", "error", "ignore"], default="warn")

    # Link handling.
    toggle("--rewrite-links", "rewrite_links", True)
    flag("--follow-includes")
    add("--follow-links", choices=["same-tree", "same-space", "none"], default="none")

    # Connection settings.
    add("--base-url", help="Confluence base URL.")
    add("--user", help="Confluence username/email.")
    add("--token", help="Confluence API token or PAT. Prefer env vars.")
    add("--cloud-id", help="Optional Confluence Cloud ID.")
    add("--ssl-verify", help="true, false, or path to an enterprise CA bundle.")
    add("--config", help="Optional config YAML path.")

    # Reporting and strictness.
    flag("--json", help="Emit a structured JSON object on stdout.")
    add("--version", action="version", version=f"pull-cli {__version__}")
    flag("--quiet", help="Accepted but currently no-op; reserved for progress suppression.")
    flag("--verbose", help="Accepted but currently no-op; reserved for extra diagnostics.")
    flag("--redact-source-urls")
    flag("--redact-manifest")
    flag("--strict", help="Treat strict extraction failures as errors.")
    return parser
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _argv_wants_json(argv: Sequence[str]) -> bool:
|
|
329
|
+
return "--json" in argv or wants_json(False)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .base import ConfluenceClient
|
|
4
|
+
from .cloud_v2 import CloudV2Client
|
|
5
|
+
from .data_center import DataCenterClient
|
|
6
|
+
from .hybrid import build_client
|
|
7
|
+
|
|
8
|
+
# Public names re-exported by this package.
__all__ = ["CloudV2Client", "ConfluenceClient", "DataCenterClient", "build_client"]
|