pull-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pull_cli/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """AI-optimized Confluence extraction CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.1.0"
pull_cli/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ from .cli import main
4
+
5
# Run the CLI when this module is executed as a script (``python -m pull_cli``);
# the integer returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
pull_cli/assets.py ADDED
@@ -0,0 +1,235 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import mimetypes
5
+ import re
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from urllib.parse import unquote, urlsplit
9
+
10
+ from bs4 import BeautifulSoup
11
+
12
+ from .attachment_extractors import extract_text_sidecar, write_extracted_markdown
13
+ from .clients.base import ConfluenceClient
14
+ from .models import AssetRecord, AssetReference, AttachmentRecord, PullOptions, WarningRecord
15
+ from .paths import safe_filename, unique_name
16
+ from .security import sanitize_url
17
+
18
# Matches a "/download/attachments/<page_id>/<filename>" path segment. The
# filename group runs up to the first "?" or "#" (or the end of the string).
ATTACHMENT_PATH_RE = re.compile(r"/download/attachments/(?P<page_id>[^/]+)/(?P<filename>[^?#]+)")
19
+
20
+
21
@dataclass
class AssetCandidate:
    """An asset reference found for a page, before any download is attempted."""

    # URL the reference was found under (for page attachments: download URL,
    # web URL or, as a last resort, the filename).
    original: str
    # Usage of the asset; this module assigns "visible-image",
    # "linked-attachment" or "page-attachment".
    role: str
    # Origin of the reference; this module assigns "src", "href" or "attachment".
    html_attribute: str
    # Attachment metadata when the reference resolved to a known attachment.
    attachment: AttachmentRecord | None = None
    # Filename derived from the URL or taken from the attachment, if any.
    filename: str | None = None
28
+
29
+
30
def discover_asset_candidates(
    html: str,
    *,
    page_id: str,
    attachments: list[AttachmentRecord],
    options: PullOptions,
) -> list[AssetCandidate]:
    """Collect the de-duplicated asset candidates referenced by a page.

    Candidates are gathered in three passes: ``<img src>`` values that are
    neither external nor Confluence chrome, ``<a href>`` values that resolve
    to a known attachment or look like an attachment download path, and,
    when ``options.asset_policy`` is ``"page"`` or ``"all"``, every
    attachment in ``attachments``.

    Args:
        html: Rendered page HTML; ``None``/empty is parsed as an empty document.
        page_id: Accepted for interface symmetry; not used by this function.
        attachments: Attachment metadata used to resolve references.
        options: Pull options; ``no_assets`` short-circuits to an empty list.

    Returns:
        Candidates in discovery order.
    """
    if options.no_assets:
        return []

    document = BeautifulSoup(html or "", "lxml")

    # Two lookup tables: by lower-cased filename and by normalized download URL.
    by_name: dict[str, AttachmentRecord] = {}
    by_url: dict[str, AttachmentRecord] = {}
    for record in attachments:
        by_name[record.filename.lower()] = record
        if record.download_url:
            by_url[_normalize_url(record.download_url)] = record

    def resolve(url: str, name: str) -> AttachmentRecord | None:
        # A URL match wins; the filename match is the fallback.
        return by_url.get(_normalize_url(url)) or by_name.get(name.lower())

    found: list[AssetCandidate] = []
    seen: set[tuple[str, str]] = set()

    for image in document.find_all("img"):
        src = image.get("src")
        if not isinstance(src, str):
            continue
        if _is_external(src) or _is_confluence_chrome_asset(src):
            continue
        name = _filename_from_url(src)
        _add_candidate(
            found,
            seen,
            AssetCandidate(src, "visible-image", "src", resolve(src, name), name),
        )

    for anchor in document.find_all("a"):
        href = anchor.get("href")
        if not isinstance(href, str):
            continue
        name = _filename_from_url(href)
        record = resolve(href, name)
        if record or ATTACHMENT_PATH_RE.search(href):
            _add_candidate(found, seen, AssetCandidate(href, "linked-attachment", "href", record, name))

    if options.asset_policy in {"page", "all"}:
        for record in attachments:
            source = record.download_url or record.web_url or record.filename
            _add_candidate(
                found,
                seen,
                AssetCandidate(source, "page-attachment", "attachment", record, record.filename),
            )

    return found
84
+
85
+
86
def download_assets(
    candidates: list[AssetCandidate],
    *,
    page_id: str,
    page_assets_dir: Path,
    page_assets_path: str,
    client: ConfluenceClient,
    extract_attachments: bool = False,
) -> tuple[list[AssetRecord], list[WarningRecord]]:
    """Download each candidate into ``page_assets_dir`` and describe the outcome.

    A failed download becomes a ``W_ASSET_DOWNLOAD_FAILED`` warning and the
    candidate is skipped. With ``extract_attachments`` set, a text sidecar is
    attempted for every stored file; a failure there becomes a
    ``W_ATTACHMENT_TEXT_EXTRACTION_FAILED`` warning and the asset is still
    recorded.

    Args:
        candidates: Candidates to download, in order.
        page_id: ID of the owning page, recorded on assets and warnings.
        page_assets_dir: Directory the files are written to (created if missing).
        page_assets_path: Path prefix used for the recorded ``local_path`` and
            sidecar entries.
        client: Client used to fetch attachment or URL content.
        extract_attachments: Whether to attempt text sidecar extraction.

    Returns:
        ``(assets, warnings)``.
    """
    records: list[AssetRecord] = []
    problems: list[WarningRecord] = []
    taken_names: set[str] = set()
    page_assets_dir.mkdir(parents=True, exist_ok=True)

    # The running position is part of the asset ID, so a failed download
    # leaves a gap instead of renumbering later assets.
    for position, candidate in enumerate(candidates, start=1):
        record = candidate.attachment
        if record:
            raw_name = record.filename
        else:
            raw_name = candidate.filename or "asset"
        filename = unique_name(safe_filename(raw_name), taken_names)

        try:
            if record:
                payload = client.download_attachment(record)
            else:
                payload = client.download_url(candidate.original)
        except Exception as exc:  # noqa: BLE001
            problems.append(
                WarningRecord(
                    code="W_ASSET_DOWNLOAD_FAILED",
                    message=f"Could not download asset {filename}.",
                    source_page_id=page_id,
                    details={"source_url": sanitize_url(candidate.original), "reason": str(exc)},
                )
            )
            continue

        destination = page_assets_dir / filename
        destination.write_bytes(payload)
        checksum = hashlib.sha256(payload).hexdigest()
        if record:
            media_type = record.media_type
        else:
            media_type = mimetypes.guess_type(filename)[0]

        sidecars: list[str] = []
        if extract_attachments:
            try:
                text = extract_text_sidecar(destination)
                if text:
                    written = write_extracted_markdown(destination, text)
                    sidecars.append(f"{page_assets_path}/{written.name}")
            except Exception as exc:  # noqa: BLE001
                problems.append(
                    WarningRecord(
                        code="W_ATTACHMENT_TEXT_EXTRACTION_FAILED",
                        message=f"Could not extract text sidecar for {filename}.",
                        source_page_id=page_id,
                        details={"reason": str(exc), "filename": filename},
                    )
                )

        records.append(
            AssetRecord(
                asset_id=f"asset-{page_id}-{position}",
                source_page_id=page_id,
                attachment_id=record.attachment_id if record else None,
                filename=filename,
                media_type=media_type,
                local_path=f"{page_assets_path}/{filename}",
                sha256=checksum,
                role=candidate.role,
                source_url=sanitize_url(record.download_url if record else candidate.original),
                references=[
                    AssetReference(
                        page_id=page_id,
                        html_attribute=candidate.html_attribute,
                        original=sanitize_url(candidate.original) or candidate.original,
                    )
                ],
                sidecars=sidecars,
            )
        )

    return records, problems
160
+
161
+
162
def skipped_asset_warnings(html: str, *, page_id: str) -> list[WarningRecord]:
    """Build ``W_ASSET_SKIPPED_BY_POLICY`` warnings for assets left undownloaded.

    One warning is produced per ``<img>`` with a string ``src`` and per
    ``<a>`` whose ``href`` matches the attachment download path pattern.
    Image warnings come first, then link warnings, each in document order.
    """
    document = BeautifulSoup(html or "", "lxml")

    def skipped(message: str, url: str) -> WarningRecord:
        return WarningRecord(
            code="W_ASSET_SKIPPED_BY_POLICY",
            message=message,
            source_page_id=page_id,
            details={"source_url": sanitize_url(url)},
        )

    collected: list[WarningRecord] = []

    image_sources = [image.get("src") for image in document.find_all("img")]
    collected.extend(
        skipped("Image asset download was skipped by --no-assets.", src)
        for src in image_sources
        if isinstance(src, str)
    )

    link_targets = [anchor.get("href") for anchor in document.find_all("a")]
    collected.extend(
        skipped("Attachment download was skipped by --no-assets.", href)
        for href in link_targets
        if isinstance(href, str) and ATTACHMENT_PATH_RE.search(href)
    )

    return collected
188
+
189
+
190
def _add_candidate(
    candidates: list[AssetCandidate], seen: set[tuple[str, str]], candidate: AssetCandidate
) -> None:
    """Append ``candidate`` to ``candidates`` unless an equivalent one was seen.

    Candidates backed by an attachment are identified by the attachment ID;
    all others by the normalized form of their original URL. ``seen`` is
    updated in place.
    """
    if candidate.attachment:
        identity = f"attachment:{candidate.attachment.attachment_id}"
    else:
        identity = _normalize_url(candidate.original)

    marker = (identity, "asset")
    if marker not in seen:
        seen.add(marker)
        candidates.append(candidate)
203
+
204
+
205
def _filename_from_url(url: str) -> str:
    """Return the decoded filename a URL points at, or ``"asset"`` if it has none.

    For attachment download paths the filename is the part after the page ID;
    for any other URL it is the last component of the path. Query string and
    fragment are ignored.
    """
    raw_path = urlsplit(url).path
    # Match against the still-encoded path and decode exactly once. Decoding
    # first would turn an encoded "#" or "?" inside the name into a literal
    # one that the pattern treats as the end of the filename, and decoding
    # twice would corrupt names containing a literal "%xx" sequence.
    match = ATTACHMENT_PATH_RE.search(raw_path)
    if match:
        return unquote(match.group("filename"))
    return Path(unquote(raw_path)).name or "asset"
212
+
213
+
214
def _normalize_url(url: str | None) -> str:
    """Reduce a URL to its percent-decoded, lower-cased path.

    Scheme, host, query and fragment are discarded; a missing or empty URL
    normalizes to the empty string.
    """
    return unquote(urlsplit(url).path).lower() if url else ""
219
+
220
+
221
def _is_external(url: str) -> bool:
    """Return True for absolute web URLs that are not attachment downloads.

    URL schemes are case-insensitive, and a protocol-relative reference
    (``//host/path``) names a remote host just like an ``http(s)`` URL, so
    both count as absolute. Anything containing ``/download/attachments/``
    is never external.
    """
    names_remote_host = url.lower().startswith(("http://", "https://", "//"))
    return names_remote_host and "/download/attachments/" not in url
223
+
224
+
225
def _is_confluence_chrome_asset(url: str) -> bool:
    """Return True when the URL path contains one of the known chrome markers.

    Only the path is inspected; query string and fragment are ignored.
    """
    chrome_markers = (
        "/images/icons/",
        "/s/",
        "/download/resources/",
        "/plugins/servlet/",
    )
    path = urlsplit(url).path
    for marker in chrome_markers:
        if marker in path:
            return True
    return False
pull_cli/attachment_extractors.py ADDED
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
# Lower-case file suffixes (with leading dot) that extract_text_sidecar reads
# directly as text instead of handing to a format-specific extractor.
TEXT_SUFFIXES = {".txt", ".md", ".csv", ".tsv", ".json", ".xml", ".svg", ".log", ".yaml", ".yml"}
7
+
8
+
9
def extract_text_sidecar(path: Path) -> str | None:
    """Extract text from ``path`` based on its (case-insensitive) suffix.

    Plain-text suffixes are read directly; ``.pdf``, ``.docx``, ``.xlsx`` and
    ``.pptx`` go to their dedicated extractors. Any other suffix yields
    ``None``.
    """
    suffix = path.suffix.lower()
    if suffix in TEXT_SUFFIXES:
        return _read_text(path)

    extractors = {
        ".pdf": _extract_pdf,
        ".docx": _extract_docx,
        ".xlsx": _extract_xlsx,
        ".pptx": _extract_pptx,
    }
    extractor = extractors.get(suffix)
    return extractor(path) if extractor else None
22
+
23
+
24
def write_extracted_markdown(path: Path, text: str) -> Path:
    """Write ``text`` as a Markdown sidecar next to ``path`` and return its path.

    The sidecar is named ``<stem>.extracted.md`` and starts with a heading
    naming the source file; ``text`` is stripped of surrounding whitespace.
    """
    # NOTE(review): the name is derived from the stem only, so two files that
    # differ just by suffix (report.pdf / report.docx) share one sidecar path.
    heading = f"# Extracted Text: {path.name}"
    destination = path.with_name(f"{path.stem}.extracted.md")
    destination.write_text(f"{heading}\n\n{text.strip()}\n", encoding="utf-8")
    return destination
28
+
29
+
30
def _read_text(path: Path) -> str:
    """Read ``path`` as UTF-8 text, replacing undecodable bytes.

    ``utf-8-sig`` decodes exactly like ``utf-8`` but drops a leading byte
    order mark, so a BOM written by some editors and spreadsheet exports
    does not end up as a stray U+FEFF at the start of the extracted text.
    """
    return path.read_text(encoding="utf-8-sig", errors="replace")
32
+
33
+
34
def _extract_pdf(path: Path) -> str | None:
    """Return the text of all PDF pages joined by blank lines.

    Returns ``None`` when ``pypdf`` is not installed or no text was found.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        return None

    page_texts = [page.extract_text() or "" for page in PdfReader(str(path)).pages]
    combined = "\n\n".join(page_texts).strip()
    return combined or None
41
+
42
+
43
def _extract_docx(path: Path) -> str | None:
    """Return the non-empty paragraph texts of a .docx file, one per line.

    Returns ``None`` when the ``docx`` module is not installed or the
    document has no paragraph text.
    """
    try:
        import docx
    except ImportError:
        return None

    paragraphs = docx.Document(str(path)).paragraphs
    non_empty = [paragraph.text for paragraph in paragraphs if paragraph.text]
    combined = "\n".join(non_empty).strip()
    return combined or None
50
+
51
+
52
def _extract_xlsx(path: Path) -> str | None:
    """Return workbook contents as text: a heading per sheet, one line per row.

    Cell values are tab-separated, with empty cells rendered as empty
    strings; rows without any value are skipped. Returns ``None`` when
    ``openpyxl`` is not installed or nothing was produced.
    """
    try:
        import openpyxl
    except ImportError:
        return None

    workbook = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
    lines: list[str] = []
    try:
        for sheet in workbook.worksheets:
            lines.append(f"## Sheet: {sheet.title}")
            for row in sheet.iter_rows(values_only=True):
                values = ["" if value is None else str(value) for value in row]
                if any(values):
                    lines.append("\t".join(values))
    finally:
        # A read-only workbook keeps the underlying file open until it is
        # closed explicitly; release it even if reading a sheet fails.
        workbook.close()
    return "\n".join(lines).strip() or None
66
+
67
+
68
def _extract_pptx(path: Path) -> str | None:
    """Return slide contents as text: a heading per slide, then shape texts.

    Shapes reporting ``has_table`` additionally contribute their cell texts
    as a JSON array of rows. Returns ``None`` when ``pptx`` is not installed
    or nothing was produced.
    """
    try:
        from pptx import Presentation
    except ImportError:
        return None

    deck = Presentation(str(path))
    collected: list[str] = []
    for number, slide in enumerate(deck.slides, start=1):
        collected.append(f"## Slide {number}")
        for shape in slide.shapes:
            shape_text = getattr(shape, "text", None)
            if shape_text:
                collected.append(shape_text)
            if getattr(shape, "has_table", False):
                table_rows = [[cell.text for cell in row.cells] for row in shape.table.rows]
                collected.append(json.dumps(table_rows, ensure_ascii=False))

    combined = "\n".join(collected).strip()
    return combined or None
pull_cli/cli.py ADDED
@@ -0,0 +1,329 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+ import time
7
+ from collections.abc import Sequence
8
+ from pathlib import Path
9
+
10
+ from . import __version__
11
+ from .clients import build_client
12
+ from .config import resolve_config
13
+ from .envelope import emit_json, make_envelope, wants_json
14
+ from .errors import EXIT_INTERNAL, EXIT_SUCCESS, EXIT_VALIDATION, PullError
15
+ from .extractor import extract
16
+ from .guide import guide_payload
17
+ from .models import PullOptions, TargetSelection
18
+ from .resolver import resolve_target
19
+ from .security import sanitize_url
20
+ from .validator import validate_package
21
+
22
+
23
class PullArgumentParser(argparse.ArgumentParser):
    """Argument parser that can report usage errors as a JSON envelope."""

    def __init__(self, *args, json_mode: bool = False, command: str = "pull", **kwargs) -> None:
        """Create the parser.

        Args:
            json_mode: Report argument errors as a JSON envelope instead of
                argparse's usage message.
            command: Command name placed in the error envelope.
        """
        super().__init__(*args, **kwargs)
        self.json_mode, self.command = json_mode, command

    def error(self, message: str) -> None:
        """Report an argument error and terminate the process.

        Outside JSON mode this defers to argparse's own handling. In JSON
        mode it emits an ``ERR_VALIDATION_INVALID_ARGUMENT`` envelope and
        raises ``SystemExit(EXIT_VALIDATION)``.
        """
        if not self.json_mode:
            super().error(message)
            return

        failure = PullError(
            code="ERR_VALIDATION_INVALID_ARGUMENT",
            message=message,
            exit_code=EXIT_VALIDATION,
            suggested_action="Run pull --help or pull guide --json for valid arguments.",
            details={"argument_error": message},
        )
        emit_json(make_envelope(ok=False, command=self.command, errors=[failure]))
        raise SystemExit(EXIT_VALIDATION)
41
+
42
+
43
def main(argv: Sequence[str] | None = None) -> int:
    """Run the CLI and return the process exit code.

    ``validate``, ``guide`` or ``version`` as the first argument selects that
    sub-command; anything else is handled as a pull. Failures are reported as
    a JSON envelope when JSON output is requested and as text on stderr
    otherwise.

    Args:
        argv: Arguments without the program name; defaults to ``sys.argv[1:]``.

    Returns:
        The sub-command's exit code, the ``PullError``'s own exit code,
        130 on keyboard interrupt, or ``EXIT_INTERNAL`` for any other
        exception.
    """
    args = list(argv if argv is not None else sys.argv[1:])
    subcommand = args[0] if args else None
    # Error envelopes name the sub-command that actually ran, so they agree
    # with the command PullArgumentParser reports for argument errors.
    command = subcommand if subcommand in {"validate", "guide", "version"} else "pull"
    try:
        if subcommand == "validate":
            return _main_validate(args[1:])
        if subcommand == "guide":
            return _main_guide(args[1:])
        if subcommand == "version":
            print(f"pull-cli {__version__}")
            return EXIT_SUCCESS
        return _main_pull(args)
    except PullError as exc:
        if _argv_wants_json(args):
            emit_json(make_envelope(ok=False, command=command, errors=[exc]))
        else:
            print(f"{exc.code}: {exc.message}", file=sys.stderr)
            if exc.suggested_action:
                print(f"Suggested action: {exc.suggested_action}", file=sys.stderr)
        return exc.exit_code
    except KeyboardInterrupt:
        # Conventional exit status for termination by SIGINT (128 + 2).
        return 130
    except Exception as exc:  # noqa: BLE001
        error = PullError(
            code="ERR_INTERNAL_CONVERSION",
            message="An internal error occurred.",
            exit_code=EXIT_INTERNAL,
            details={"reason": str(exc)},
        )
        if _argv_wants_json(args):
            emit_json(make_envelope(ok=False, command=command, errors=[error]))
        else:
            print(f"{error.code}: {error.message}", file=sys.stderr)
        return EXIT_INTERNAL
76
+
77
+
78
def _build_pull_options(ns: argparse.Namespace) -> PullOptions:
    """Translate parsed pull arguments into a ``PullOptions`` value."""
    return PullOptions(
        output=Path(ns.output),
        force=ns.force,
        clean=ns.clean,
        tree=ns.tree,
        depth=ns.depth,
        max_pages=ns.max_pages,
        layout=ns.layout,
        output_mode=ns.output_mode,
        write_bundle=ns.bundle,
        write_html=ns.html,
        write_source=ns.source,
        write_chunks=ns.chunks,
        asset_policy=ns.assets,
        no_assets=ns.no_assets,
        extract_attachments=ns.extract_attachments,
        comments=ns.comments,
        diagram_sources=ns.diagram_sources,
        render_mode=ns.render_mode,
        macro_policy=ns.macro_policy,
        unknown_macro=ns.unknown_macro,
        rewrite_links=ns.rewrite_links,
        follow_includes=ns.follow_includes,
        follow_links=ns.follow_links,
        include_non_page_children=ns.include_non_page_children,
        redact_source_urls=ns.redact_source_urls,
        redact_manifest=ns.redact_manifest,
        strict=ns.strict,
    )


def _main_pull(argv: Sequence[str]) -> int:
    """Handle the default ``pull`` command and return ``EXIT_SUCCESS``.

    Parses the arguments, resolves configuration and target, runs the
    extraction with a client that is always closed afterwards, then reports
    the outcome as a JSON envelope or a one-line summary on stdout.
    """
    ns = _pull_parser(json_mode=_argv_wants_json(argv)).parse_args(argv)
    json_mode = wants_json(ns.json)
    started = time.perf_counter()

    config = resolve_config(
        base_url=ns.base_url,
        user=ns.user,
        token=ns.token,
        cloud_id=ns.cloud_id,
        ssl_verify=ns.ssl_verify,
        config_path=ns.config,
    )
    selection = TargetSelection(
        positional=ns.page_ref,
        page_id=ns.page_id,
        url=ns.url,
        space=ns.space,
        title=ns.title,
    )
    options = _build_pull_options(ns)

    client = build_client(config)
    try:
        root = resolve_target(selection, client)
        result = extract(client=client, root=root, options=options)
    finally:
        client.close()

    # The duration covers configuration, target resolution and extraction.
    result.metrics["duration_ms"] = int((time.perf_counter() - started) * 1000)

    # Describe the first extracted page when there is one, else the resolved root.
    described = result.pages[0].page if result.pages else root
    target = {
        "page_id": described.page_id,
        "url": sanitize_url(described.url, redact_source_url=options.redact_source_urls),
    }
    page_count = len(result.pages)
    asset_count = len(result.assets)
    warning_count = len(result.warnings)
    summary = {
        "output_dir": str(result.output_dir),
        "manifest": str(result.manifest_path),
        "ai_entry": str(result.ai_entry_path) if result.ai_entry_path else None,
        "bundle": str(result.bundle_path) if result.bundle_path else None,
        "output_mode": options.output_mode,
        "pages": page_count,
        "assets": asset_count,
        "warnings": warning_count,
    }
    payload = make_envelope(
        ok=True,
        command="pull",
        target=target,
        result=summary,
        warnings=result.warnings,
        metrics=result.metrics,
    )

    if not json_mode:
        print(
            f"Pulled {page_count} page(s), {asset_count} asset(s), "
            f"{warning_count} warning(s) into {result.output_dir}"
        )
    else:
        emit_json(payload)
    return EXIT_SUCCESS
166
+
167
+
168
def _main_validate(argv: Sequence[str]) -> int:
    """Handle ``pull validate`` and return its exit code.

    Returns ``EXIT_VALIDATION`` when the path argument is missing, 0 when the
    package validates and 10 when it does not. The outcome is emitted as a
    JSON envelope in JSON mode; otherwise a summary goes to stdout on success
    and the individual errors go to stderr on failure.
    """
    parser = PullArgumentParser(
        prog="pull validate",
        description="Validate a pulled Confluence package.",
        json_mode=_argv_wants_json(argv),
        command="validate",
    )
    parser.add_argument("path", nargs="?", metavar="MANIFEST_OR_OUTPUT_DIR")
    parser.add_argument("--json", action="store_true", help="Emit a structured JSON envelope.")
    ns = parser.parse_args(argv)
    json_mode = wants_json(ns.json)

    if not ns.path:
        missing = PullError(
            code="ERR_VALIDATION_REQUIRED",
            message="Missing required MANIFEST_OR_OUTPUT_DIR argument.",
            exit_code=EXIT_VALIDATION,
            suggested_action="Pass an output directory or manifest.yaml path.",
        )
        envelope = make_envelope(ok=False, command="validate", target={}, errors=[missing])
        if json_mode:
            emit_json(envelope)
        else:
            print(f"{missing.code}: {missing.message}", file=sys.stderr)
            print("Usage: pull validate MANIFEST_OR_OUTPUT_DIR [--json]", file=sys.stderr)
        return EXIT_VALIDATION

    report = validate_package(Path(ns.path))
    warning_count = len(report.warnings)
    envelope = make_envelope(
        ok=report.ok,
        command="validate",
        target={"path": ns.path},
        result={
            "manifest": str(report.manifest_path),
            "output_dir": str(report.output_dir),
            "errors": len(report.errors),
            "warnings": warning_count,
            "validation_warnings": warning_count,
            "package_warnings": report.metrics.get("package_warnings", 0),
        },
        warnings=report.warnings,
        errors=report.errors,
        metrics=report.metrics,
    )

    if json_mode:
        emit_json(envelope)
    elif report.ok:
        pages = report.metrics.get("pages", 0)
        assets = report.metrics.get("assets", 0)
        print(f"Validation passed for {report.manifest_path} ({pages} page(s), {assets} asset(s)).")
    else:
        # Detail keys are printed in this fixed order, and only when set.
        detail_keys = ("file", "link", "resolution_base", "candidate_path")
        for issue in report.errors:
            print(f"{issue['code']}: {issue['message']}", file=sys.stderr)
            details = issue.get("details", {})
            for key in detail_keys:
                if details.get(key):
                    print(f" {key}: {details[key]}", file=sys.stderr)
        print("Rerun with --json for the structured validation envelope.", file=sys.stderr)

    return 0 if report.ok else 10
231
+
232
+
233
def _main_guide(argv: Sequence[str]) -> int:
    """Handle ``pull guide`` and return ``EXIT_SUCCESS``.

    In JSON mode the guide payload is printed as compact JSON; otherwise a
    short human-readable overview is printed.
    """
    parser = PullArgumentParser(
        prog="pull guide",
        description="Emit agent-readable CLI and output schema.",
        json_mode=_argv_wants_json(argv),
        command="guide",
    )
    parser.add_argument("--json", action="store_true", help="Emit only the guide payload as JSON.")
    ns = parser.parse_args(argv)
    payload = guide_payload()

    if wants_json(ns.json):
        print(json.dumps(payload, ensure_ascii=False, separators=(",", ":")))
        return EXIT_SUCCESS

    overview = (
        "pull-cli guide",
        "Commands: pull PAGE_REF [OPTIONS], pull validate PATH, pull guide --json",
        "Use --json or LLM=true for stable agent envelopes on pull/validate.",
        "Recommended agent flow: pull guide --json, pull ... --json, pull validate <output-dir> --json.",
        "Default output mode is simple: root AI Markdown, page Markdown, assets, and validation control files.",
        "Use --output-mode full for bundle.md, page HTML snapshots, and source.storage.xml; use --clean to remove stale files when switching modes.",
        "Start analysis from <sanitized-root-page-title>.md in the output package.",
        "Manifest and AI manifest paths are package-root-relative; page links are page-file-relative.",
        "Run pull guide --json for the full machine-readable schema, error codes, and warning codes.",
    )
    for line in overview:
        print(line)
    return EXIT_SUCCESS
256
+
257
+
258
def _pull_parser(*, json_mode: bool = False) -> argparse.ArgumentParser:
    """Build the argument parser for the default ``pull`` command.

    Args:
        json_mode: Forwarded to ``PullArgumentParser`` so argument errors are
            reported as a JSON envelope.
    """
    epilog = (
        "Commands: pull PAGE_REF [OPTIONS]; pull validate MANIFEST_OR_OUTPUT_DIR [--json]; "
        "pull guide [--json]; pull version. "
        "Default output mode is simple; use --output-mode full for bundle/html/source artifacts. "
        "Agent flow: pull guide --json, pull ... --json, pull validate <output-dir> --json."
    )
    parser = PullArgumentParser(
        prog="pull",
        description="Pull Confluence pages into local AI-consumable evidence packages.",
        epilog=epilog,
        json_mode=json_mode,
        command="pull",
    )
    add = parser.add_argument

    def flag(*names: str, **extra) -> None:
        # Boolean switch that defaults to False.
        add(*names, action="store_true", **extra)

    def choice(name: str, options: list[str], default: str, **extra) -> None:
        # Option restricted to a fixed set of values.
        add(name, choices=options, default=default, **extra)

    def toggle(name: str, default: bool | None) -> None:
        # Paired --x / --no-x option; destinations are derived from the name.
        add(name, action=argparse.BooleanOptionalAction, default=default)

    # Target selection.
    add("page_ref", nargs="?", metavar="PAGE_REF", help="Confluence page ID or page URL.")
    add("--page-id", help="Confluence page/content ID.")
    add("--url", help="Confluence page URL.")
    add("--space", help="Confluence space key, used with --title.")
    add("--title", help="Confluence page title, used with --space.")

    # Tree traversal.
    flag("--tree", help="Pull descendant page hierarchy.")
    add("--depth", type=int, help="Tree depth limit; 0 equals single page.")
    add("--max-pages", type=int, default=500, help="Safety cap for tree pulls.")
    flag("--include-non-page-children")

    # Output location and artifact selection.
    add("-o", "--output", default="pulled-confluence", help="Output directory.")
    flag("--force", help="Overwrite files in an existing output directory.")
    flag("--clean", help="Delete stale files in the output directory first.")
    choice("--layout", ["auto", "nested", "flat"], "auto")
    choice(
        "--output-mode",
        ["simple", "full"],
        "simple",
        help="Output artifact profile. simple writes quiet agent-facing Markdown by default; full writes all evidence artifacts.",
    )
    for artifact in ("--bundle", "--html", "--source"):
        toggle(artifact, None)
    flag("--chunks")

    # Assets, attachments and comments.
    choice("--assets", ["visible", "page", "all"], "visible")
    flag("--no-assets")
    flag("--extract-attachments")
    flag("--comments", help="Fetch page and inline comments into page-local comments.md sidecars.")
    flag("--diagram-sources")

    # Rendering.
    choice("--render-mode", ["hybrid", "view", "export-view", "styled-view", "storage"], "hybrid")
    choice("--macro-policy", ["expand", "placeholder", "strict"], "expand")
    choice("--unknown-macro", ["warn", "error", "ignore"], "warn")

    # Link handling.
    toggle("--rewrite-links", True)
    flag("--follow-includes")
    choice("--follow-links", ["same-tree", "same-space", "none"], "none")

    # Connection settings.
    add("--base-url", help="Confluence base URL.")
    add("--user", help="Confluence username/email.")
    add("--token", help="Confluence API token or PAT. Prefer env vars.")
    add("--cloud-id", help="Optional Confluence Cloud ID.")
    add("--ssl-verify", help="true, false, or path to an enterprise CA bundle.")
    add("--config", help="Optional config YAML path.")

    # Reporting and redaction.
    flag("--json", help="Emit a structured JSON object on stdout.")
    add("--version", action="version", version=f"pull-cli {__version__}")
    flag("--quiet", help="Accepted but currently no-op; reserved for progress suppression.")
    flag("--verbose", help="Accepted but currently no-op; reserved for extra diagnostics.")
    for redaction in ("--redact-source-urls", "--redact-manifest"):
        flag(redaction)
    flag("--strict", help="Treat strict extraction failures as errors.")
    return parser
326
+
327
+
328
def _argv_wants_json(argv: Sequence[str]) -> bool:
    """Decide on JSON output before the arguments have been parsed.

    A literal ``--json`` anywhere in ``argv`` is sufficient; otherwise the
    decision is delegated to ``wants_json(False)``.
    """
    if "--json" in argv:
        return True
    return wants_json(False)
pull_cli/clients/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+ from .base import ConfluenceClient
4
+ from .cloud_v2 import CloudV2Client
5
+ from .data_center import DataCenterClient
6
+ from .hybrid import build_client
7
+
8
+ __all__ = ["CloudV2Client", "ConfluenceClient", "DataCenterClient", "build_client"]