PyPI - pull-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pull-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

pull_cli/__init__.py +5 -0
pull_cli/__main__.py +6 -0
pull_cli/assets.py +235 -0
pull_cli/attachment_extractors.py +85 -0
pull_cli/cli.py +329 -0
pull_cli/clients/__init__.py +8 -0
pull_cli/clients/base.py +29 -0
pull_cli/clients/cloud_v2.py +132 -0
pull_cli/clients/data_center.py +360 -0
pull_cli/clients/hybrid.py +15 -0
pull_cli/config.py +82 -0
pull_cli/crawler.py +51 -0
pull_cli/envelope.py +59 -0
pull_cli/errors.py +50 -0
pull_cli/extractor.py +344 -0
pull_cli/guide.py +115 -0
pull_cli/html_normalizer.py +111 -0
pull_cli/links.py +186 -0
pull_cli/macros.py +527 -0
pull_cli/markdown_writer.py +24 -0
pull_cli/models.py +232 -0
pull_cli/paths.py +45 -0
pull_cli/resolver.py +72 -0
pull_cli/security.py +103 -0
pull_cli/validator.py +398 -0
pull_cli/writer.py +792 -0
pull_cli-0.1.0.dist-info/METADATA +218 -0
pull_cli-0.1.0.dist-info/RECORD +31 -0
pull_cli-0.1.0.dist-info/WHEEL +4 -0
pull_cli-0.1.0.dist-info/entry_points.txt +3 -0
pull_cli-0.1.0.dist-info/licenses/LICENSE +21 -0

pull_cli/clients/base.py ADDED Viewed

@@ -0,0 +1,29 @@
+from __future__ import annotations
+from typing import Protocol
+from pull_cli.models import AttachmentRecord, CommentRecord, PageRecord, PageSummary
+class ConfluenceClient(Protocol):
+    # Structural interface for Confluence client adapters: three attributes plus
+    # read-only lookup and download operations. Method bodies are stubs (`...`).
+    # URL of the Confluence instance the client talks to.
+    base_url: str
+    # Label for the kind of deployment the client targets.
+    deployment_type: str
+    # Counter of API calls; presumably incremented by implementations per request.
+    api_calls: int
+    # Return the full record for one page id.
+    def get_page(self, page_id: str) -> PageRecord: ...
+    # Return summaries of pages matching a space and a title.
+    def find_page(self, space: str, title: str) -> list[PageSummary]: ...
+    # Return summaries of the child pages of a page.
+    def get_children(self, page_id: str) -> list[PageSummary]: ...
+    # Return summaries of descendant pages; `depth` optionally limits how deep.
+    def get_descendants(self, page_id: str, depth: int | None = None) -> list[PageSummary]: ...
+    # Return the attachment records of a page.
+    def list_attachments(self, page_id: str) -> list[AttachmentRecord]: ...
+    # Return the comment records of a page.
+    def list_comments(self, page_id: str) -> list[CommentRecord]: ...
+    # Return the bytes of one attachment.
+    def download_attachment(self, attachment: AttachmentRecord) -> bytes: ...
+    # Return the bytes found at a URL.
+    def download_url(self, url: str) -> bytes: ...
+    # Release whatever resources the client holds.
+    def close(self) -> None: ...

pull_cli/clients/cloud_v2.py ADDED Viewed

@@ -0,0 +1,132 @@
+from __future__ import annotations
+from typing import Any
+from atlassian import Confluence
+from pull_cli.models import AttachmentRecord, Config, PageRecord, PageSummary
+from pull_cli.security import redact_value
+from .data_center import DataCenterClient
+class CloudV2Client(DataCenterClient):
+    """Confluence Cloud adapter backed by atlassian-python-api.
+    The installed atlassian-python-api package exposes the legacy `Confluence`
+    class in this environment. We use its public helpers for v1 content endpoints
+    and its low-level `get` method for Cloud v2 endpoints until the documented
+    `ConfluenceCloud` class is available in the package index.
+    """
+    deployment_type = "cloud"
+    def __init__(self, config: Config, *, api: Confluence | None = None) -> None:
+        """Run the inherited setup, then derive the site root URL.
+        ``_site_url`` is ``base_url`` with a single trailing ``/wiki`` removed.
+        """
+        super().__init__(config, api=api)
+        wiki_suffix = "/wiki"
+        site_root = self.base_url
+        if site_root.endswith(wiki_suffix):
+            site_root = site_root[: -len(wiki_suffix)]
+        self._site_url = site_root
+    def _build_api(self, config: Config) -> Confluence:
+        """Create the ``Confluence`` API object in cloud mode with retry/backoff enabled.
+        A user plus token is passed as username/password; a token alone is passed as
+        ``token``; without a token no credentials are supplied.
+        """
+        credentials: dict[str, object] = {}
+        if config.token:
+            if config.user:
+                credentials = {"username": config.user, "password": config.token}
+            else:
+                credentials = {"token": config.token}
+        return Confluence(
+            url=self.base_url,
+            verify_ssl=config.ssl_verify,
+            timeout=30,
+            cloud=True,
+            backoff_and_retry=True,
+            retry_status_codes=[429, 502, 503, 504],
+            max_backoff_retries=3,
+            max_backoff_seconds=8,
+            backoff_factor=0.25,
+            backoff_jitter=0,
+            **credentials,
+        )
+    def _v2_url(self, *parts: str) -> str:
+        """Return the absolute ``/wiki/api/v2`` URL for the given path segments."""
+        segments = ("wiki", "api", "v2") + parts
+        return self._site_url + "/" + "/".join(segments)
+    def _cloud_v2_get(self, *parts: str, params: dict[str, object] | None = None) -> dict[str, Any]:
+        """GET a v2 endpoint and return its payload, or ``{}`` when the payload is not a dict."""
+        endpoint = self._v2_url(*parts)
+        payload = self._call(self._api.get, endpoint, params=params, absolute=True)
+        if isinstance(payload, dict):
+            return payload
+        return {}
+    def get_page(self, page_id: str) -> PageRecord:
+        """Load a page through the inherited lookup and annotate it with the v2 payload.
+        The redacted v2 response is stored under ``raw["cloud_v2"]``. Its storage body
+        is used only when the inherited lookup produced no storage body.
+        """
+        page = super().get_page(page_id)
+        v2_payload = self._cloud_v2_get("pages", page_id, params={"body-format": "storage"})
+        if not v2_payload:
+            return page
+        page.raw["cloud_v2"] = redact_value(v2_payload)
+        if page.body_storage:
+            return page
+        body = v2_payload.get("body")
+        storage = body.get("storage") if isinstance(body, dict) else {}
+        if isinstance(storage, dict) and isinstance(storage.get("value"), str):
+            page.body_storage = storage["value"]
+        return page
+    def get_children(self, page_id: str) -> list[PageSummary]:
+        """List the child pages of ``page_id`` through the Cloud v2 API.
+        Every response page is collected by following ``_links.next``; previously only
+        the first response (at most 100 entries) was read, so larger parents were
+        silently truncated. When v2 yields no usable entries the inherited lookup is
+        used instead.
+        Args:
+            page_id: Id of the parent page.
+        Returns:
+            One summary per child entry that is a dict with an ``id``.
+        Raises:
+            PullError: Propagated from ``_call`` when a request fails.
+        """
+        url: str | None = self._v2_url("pages", page_id, "children")
+        params: dict[str, object] | None = {"limit": 100}
+        visited: set[str] = set()
+        summaries: list[PageSummary] = []
+        while url and url not in visited:
+            # Remember requested URLs so a repeated next link cannot loop forever.
+            visited.add(url)
+            data = self._call(self._api.get, url, params=params, absolute=True)
+            results = data.get("results") if isinstance(data, dict) else None
+            if not isinstance(results, list):
+                break
+            for item in results:
+                if not isinstance(item, dict) or not item.get("id"):
+                    continue
+                item_links = item.get("_links")
+                summaries.append(
+                    PageSummary(
+                        page_id=str(item.get("id")),
+                        title=str(item.get("title") or "Untitled"),
+                        space_key=_space_key(item),
+                        url=self._absolute_url(str(item_links.get("webui", "")))
+                        if isinstance(item_links, dict)
+                        else None,
+                        parent_id=page_id,
+                    )
+                )
+            page_links = data.get("_links")
+            next_link = page_links.get("next") if isinstance(page_links, dict) else None
+            url = self._absolute_url(next_link) if isinstance(next_link, str) else None
+            # The next link already carries the cursor and limit query string.
+            params = None
+        if summaries:
+            return summaries
+        return super().get_children(page_id)
+    def list_attachments(self, page_id: str) -> list[AttachmentRecord]:
+        """List the attachments of ``page_id`` through the Cloud v2 API.
+        Every response page is collected by following ``_links.next``; previously only
+        the first response (at most 250 entries) was read. When v2 yields no usable
+        entries the inherited lookup is used instead.
+        Args:
+            page_id: Id of the page that owns the attachments.
+        Returns:
+            One record per attachment entry that is a dict with an ``id``.
+        Raises:
+            PullError: Propagated from ``_call`` when a request fails.
+        """
+        url: str | None = self._v2_url("pages", page_id, "attachments")
+        params: dict[str, object] | None = {"limit": 250}
+        visited: set[str] = set()
+        attachments: list[AttachmentRecord] = []
+        while url and url not in visited:
+            # Remember requested URLs so a repeated next link cannot loop forever.
+            visited.add(url)
+            data = self._call(self._api.get, url, params=params, absolute=True)
+            results = data.get("results") if isinstance(data, dict) else None
+            if not isinstance(results, list):
+                break
+            for item in results:
+                if not isinstance(item, dict) or not item.get("id"):
+                    continue
+                item_links = item.get("_links")
+                attachments.append(
+                    AttachmentRecord(
+                        attachment_id=str(item.get("id")),
+                        page_id=page_id,
+                        filename=str(item.get("title") or item.get("filename") or "attachment"),
+                        media_type=str(item.get("mediaType") or "") or None,
+                        download_url=self._absolute_url(str(item.get("downloadLink") or ""))
+                        if item.get("downloadLink")
+                        else None,
+                        web_url=self._absolute_url(str(item_links.get("webui", "")))
+                        if isinstance(item_links, dict)
+                        else None,
+                        file_size=int(item["fileSize"]) if isinstance(item.get("fileSize"), int) else None,
+                        raw=redact_value(item),
+                    )
+                )
+            page_links = data.get("_links")
+            next_link = page_links.get("next") if isinstance(page_links, dict) else None
+            url = self._absolute_url(next_link) if isinstance(next_link, str) else None
+            # The next link already carries the cursor and limit query string.
+            params = None
+        if attachments:
+            return attachments
+        return super().list_attachments(page_id)
+    def download_attachment(self, attachment: AttachmentRecord) -> bytes:
+        """Fetch an attachment's bytes from the content download endpoint under the site URL."""
+        endpoint = (
+            f"{self._site_url}/wiki/rest/api/content/{attachment.page_id}"
+            f"/child/attachment/{attachment.attachment_id}/download"
+        )
+        request_options = {
+            "headers": {"Accept": "*/*"},
+            "not_json_response": True,
+            "absolute": True,
+        }
+        return self._call(self._api.get, endpoint, **request_options)
+def _space_key(item: dict[str, Any]) -> str | None:
+    """Return the space key from a nested ``space`` dict or a flat ``spaceKey``, else ``None``."""
+    space = item.get("space")
+    nested_key = space.get("key") if isinstance(space, dict) else None
+    key = str(nested_key or item.get("spaceKey") or "")
+    return key if key else None

pull_cli/clients/data_center.py ADDED Viewed

@@ -0,0 +1,360 @@
+from __future__ import annotations
+from collections.abc import Iterable
+from urllib.parse import quote, urlencode, urljoin
+import requests
+from atlassian import Confluence
+from atlassian.errors import ApiError, ApiPermissionError
+from pull_cli.errors import EXIT_AUTH, EXIT_IO, EXIT_SOURCE, PullError
+from pull_cli.models import AttachmentRecord, CommentRecord, Config, PageRecord, PageSummary
+from pull_cli.security import redact_value, sanitize_url
+class DataCenterClient:
+    deployment_type = "data_center"
+    def __init__(self, config: Config, *, api: Confluence | None = None) -> None:
+        """Store the normalised base URL, reset the call counter, and attach an API object.
+        Raises:
+            PullError: When the configuration carries no base URL.
+        """
+        configured_url = config.base_url
+        if not configured_url:
+            raise PullError(
+                code="ERR_VALIDATION_REQUIRED",
+                message="A Confluence base URL is required.",
+                exit_code=10,
+                suggested_action="Set --base-url, PULL_URL, or CONFPUB_URL.",
+            )
+        # Trailing slashes are dropped so later joins can add exactly one.
+        self.base_url = configured_url.rstrip("/")
+        self.api_calls = 0
+        # A supplied API object wins; otherwise one is built from the configuration.
+        self._api = api or self._build_api(config)
+    def _build_api(self, config: Config) -> Confluence:
+        """Create the ``Confluence`` API object with retry/backoff enabled.
+        A user plus token is passed as username/password; a token alone is passed as
+        ``token``; without a token no credentials are supplied.
+        """
+        credentials: dict[str, object] = {}
+        if config.token:
+            if config.user:
+                credentials = {"username": config.user, "password": config.token}
+            else:
+                credentials = {"token": config.token}
+        return Confluence(
+            url=self.base_url,
+            verify_ssl=config.ssl_verify,
+            timeout=30,
+            backoff_and_retry=True,
+            retry_status_codes=[429, 502, 503, 504],
+            max_backoff_retries=3,
+            max_backoff_seconds=8,
+            backoff_factor=0.25,
+            backoff_jitter=0,
+            **credentials,
+        )
+    def close(self) -> None:
+        """Call the API object's ``close`` when it exposes a callable one."""
+        closer = getattr(self._api, "close", None)
+        if not callable(closer):
+            return
+        closer()
+    def _absolute_url(self, value: str | None) -> str | None:
+        """Resolve ``value`` against the base URL.
+        Empty input gives ``None``; ``http://``/``https://`` URLs are returned as-is;
+        anything else is joined onto the base URL.
+        """
+        if not value:
+            return None
+        if value.startswith(("http://", "https://")):
+            return value
+        root = self.base_url
+        if value.startswith("/wiki/") and root.endswith("/wiki"):
+            # The path already starts with /wiki, so join from the site root instead.
+            root = root.removesuffix("/wiki")
+        return urljoin(root + "/", value.lstrip("/"))
+    def _call(self, operation, *args, **kwargs):
+        """Run one API operation, count it, and translate failures into ``PullError``.
+        Args:
+            operation: Callable to invoke.
+            *args: Positional arguments forwarded to ``operation``.
+            **kwargs: Keyword arguments forwarded to ``operation``.
+        Returns:
+            Whatever ``operation`` returns.
+        Raises:
+            PullError: For timeouts, HTTP and API errors, and other request failures.
+        """
+        self.api_calls += 1
+        try:
+            return operation(*args, **kwargs)
+        except requests.Timeout as exc:
+            raise PullError(
+                code="ERR_IO_TIMEOUT",
+                message="Timed out while contacting Confluence.",
+                exit_code=EXIT_IO,
+                retryable=True,
+                suggested_action="Retry the command or reduce scope.",
+            ) from exc
+        except (ApiError, requests.HTTPError) as exc:
+            # ApiPermissionError derives from ApiError. Any ApiError may carry the
+            # original HTTPError in ``reason``; _status_code unwraps it, so a wrapped
+            # 401/403/404 is classified the same way as a bare HTTPError.
+            status = _status_code(exc)
+            if status is None and isinstance(exc, ApiPermissionError):
+                # A permission error without a response is still an access failure.
+                status = 403
+            if status in {401, 403}:
+                raise PullError(
+                    code="ERR_AUTH_FORBIDDEN" if status == 403 else "ERR_AUTH_REQUIRED",
+                    message="Confluence authentication failed or the page is not visible.",
+                    exit_code=EXIT_AUTH,
+                    suggested_action="Check credentials and page permissions.",
+                    details=_error_details(exc),
+                ) from exc
+            if status == 404:
+                raise PullError(
+                    code="ERR_SOURCE_PAGE_NOT_FOUND",
+                    message="The requested Confluence page was not found.",
+                    exit_code=EXIT_SOURCE,
+                    details=_error_details(exc),
+                ) from exc
+            if status is None and isinstance(exc, ApiError):
+                # No HTTP status available: keep the generic API error report.
+                raise PullError(
+                    code="ERR_INTERNAL_API_RESPONSE",
+                    message="Confluence API returned an error.",
+                    exit_code=EXIT_IO,
+                    details={"reason": str(exc)},
+                ) from exc
+            raise PullError(
+                code="ERR_INTERNAL_API_RESPONSE",
+                message=f"Confluence returned HTTP {status or 'error'}.",
+                exit_code=EXIT_IO,
+                retryable=status in {429, 502, 503, 504},
+                details=_error_details(exc),
+            ) from exc
+        except requests.RequestException as exc:
+            raise PullError(
+                code="ERR_IO_CONNECTION",
+                message="Could not contact Confluence.",
+                exit_code=EXIT_IO,
+                retryable=True,
+                details={"reason": str(exc)},
+            ) from exc
+    def _get_paged(
+        self,
+        path: str,
+        *,
+        params: dict[str, object] | None = None,
+        page_size: int = 100,
+    ) -> Iterable[dict[str, object]]:
+        """Yield every dict item of a ``start``/``limit`` paginated collection.
+        Paging continues while the response advertises ``_links.next`` or the page is
+        as long as the limit the server reports (falling back to ``page_size``). The
+        earlier check compared only against ``page_size``, so a server that capped the
+        limit made the first page look like the last.
+        Args:
+            path: Endpoint path handed to the API object's ``get``.
+            params: Extra query parameters; they override ``limit``/``start`` on clash.
+            page_size: Requested number of items per request.
+        Yields:
+            Each entry of ``results`` that is a dict.
+        Raises:
+            PullError: Propagated from ``_call`` when a request fails.
+        """
+        start = 0
+        while True:
+            merged: dict[str, object] = {"limit": page_size, "start": start}
+            if params:
+                merged.update(params)
+            data = self._call(self._api.get, path, params=merged)
+            if not isinstance(data, dict):
+                return
+            results = data.get("results") or []
+            for item in results:
+                if isinstance(item, dict):
+                    yield item
+            if not results:
+                # An empty page always ends the walk, whatever the links say.
+                break
+            links = data.get("_links")
+            has_next = isinstance(links, dict) and bool(links.get("next"))
+            served_limit = data.get("limit")
+            if isinstance(served_limit, bool) or not isinstance(served_limit, int) or served_limit <= 0:
+                served_limit = page_size
+            if not has_next and len(results) < served_limit:
+                break
+            start += len(results)
+    def get_page(self, page_id: str) -> PageRecord:
+        """Fetch one page by id with body, version, space, label, link and ancestor expansions."""
+        expansions = (
+            "body.view",
+            "body.export_view",
+            "body.storage",
+            "version",
+            "space",
+            "metadata.labels",
+            "_links",
+            "ancestors",
+        )
+        payload = self._call(self._api.get_page_by_id, page_id, expand=",".join(expansions))
+        return self._parse_page(payload)
+    def find_page(self, space: str, title: str) -> list[PageSummary]:
+        """Return summaries of the pages in ``space`` whose title is ``title``."""
+        criteria = {"spaceKey": space, "title": title, "type": "page", "expand": "space,_links"}
+        matches = []
+        for entry in self._get_paged("rest/api/content", params=criteria):
+            matches.append(self._parse_summary(entry))
+        return matches
+    def get_children(self, page_id: str) -> list[PageSummary]:
+        """Return summaries of all child pages of ``page_id``.
+        Children are requested in batches of 100 with an advancing ``start`` until a
+        short batch arrives; previously a single request was made, so children past
+        the first 100 were silently dropped.
+        Args:
+            page_id: Id of the parent page.
+        Returns:
+            One summary per child entry that is a dict, with ``parent_id`` set to
+            ``page_id``. Entries repeating an already seen non-empty id are skipped.
+        Raises:
+            PullError: Propagated from ``_call`` when a request fails.
+        """
+        page_size = 100
+        start = 0
+        seen: set[str] = set()
+        summaries: list[PageSummary] = []
+        while True:
+            children = self._call(
+                self._api.get_page_child_by_type,
+                page_id=page_id,
+                type="page",
+                start=start,
+                limit=page_size,
+                expand="space,_links,ancestors",
+            )
+            if children is None:
+                break
+            if isinstance(children, dict):
+                children = children.get("results", [])
+            batch = list(children or [])
+            fresh = 0
+            for item in batch:
+                if not isinstance(item, dict):
+                    continue
+                summary = self._parse_summary(item, parent_id=page_id)
+                if summary.page_id:
+                    if summary.page_id in seen:
+                        continue
+                    seen.add(summary.page_id)
+                    fresh += 1
+                summaries.append(summary)
+            # Stop on a short batch, or when a full batch brought nothing new
+            # (guards against a backend that ignores ``start``).
+            if len(batch) < page_size or not fresh:
+                break
+            start += len(batch)
+        return summaries
+    def get_descendants(self, page_id: str, depth: int | None = None) -> list[PageSummary]:
+        """Return summaries of the descendant pages of ``page_id``.
+        Ancestors are now requested so each summary's ``depth`` can be set relative to
+        ``page_id`` (a direct child has depth 1). Previously ``depth`` was never
+        populated here, so the depth filter had nothing to act on.
+        Args:
+            page_id: Id of the root page.
+            depth: Maximum relative depth to keep; ``None`` keeps everything.
+        Returns:
+            The descendant summaries, filtered by ``depth`` when it is given.
+        Raises:
+            PullError: Propagated from ``_call`` when a request fails.
+        """
+        path = f"rest/api/content/{page_id}/descendant/page"
+        root_id = str(page_id)
+        summaries: list[PageSummary] = []
+        for item in self._get_paged(path, params={"expand": "space,_links,ancestors"}):
+            summary = self._parse_summary(item)
+            ancestors = item.get("ancestors")
+            if isinstance(ancestors, list):
+                ancestor_ids = [str(entry.get("id")) for entry in ancestors if isinstance(entry, dict)]
+                if root_id in ancestor_ids:
+                    # Distance from the root: ancestors listed from the root onward.
+                    summary.depth = len(ancestor_ids) - ancestor_ids.index(root_id)
+            summaries.append(summary)
+        if depth is None:
+            return summaries
+        return [summary for summary in summaries if summary.depth <= depth]
+    def list_attachments(self, page_id: str) -> list[AttachmentRecord]:
+        """Return one record per attachment entry listed under ``page_id``."""
+        endpoint = f"rest/api/content/{page_id}/child/attachment"
+        entries = self._get_paged(
+            endpoint,
+            params={"expand": "version,_links,extensions"},
+            page_size=100,
+        )
+        return [self._parse_attachment(entry, page_id) for entry in entries]
+    def list_comments(self, page_id: str) -> list[CommentRecord]:
+        """Return the comments of ``page_id``, first without a location filter, then inline.
+        Comments without an id, or whose id was already collected, are skipped.
+        """
+        collected: dict[str, CommentRecord] = {}
+        for location in (None, "inline"):
+            fallback = location or "footer"
+            for entry in self._get_comment_pages(page_id, location=location):
+                comment = self._parse_comment(entry, page_id, fallback_location=fallback)
+                key = comment.comment_id
+                if key and key not in collected:
+                    collected[key] = comment
+        # Dict insertion order keeps the comments in first-seen order.
+        return list(collected.values())
+    def download_attachment(self, attachment: AttachmentRecord) -> bytes:
+        """Download an attachment through its ``download_url``.
+        Raises:
+            PullError: When the attachment carries no download URL.
+        """
+        source = attachment.download_url
+        if source:
+            return self.download_url(source)
+        raise PullError(
+            code="ERR_SOURCE_BODY_UNAVAILABLE",
+            message=f"Attachment {attachment.filename} has no download URL.",
+            exit_code=EXIT_SOURCE,
+        )
+    def download_url(self, url: str) -> bytes:
+        """Fetch ``url`` (resolved against the base URL when relative) as a non-JSON response."""
+        # NOTE(review): an already-absolute URL is requested unchanged through the API
+        # object, which presumably carries the configured credentials - confirm that
+        # callers never pass hosts that must not receive them.
+        target = self._absolute_url(url)
+        if not target:
+            target = url
+        return self._call(self._api.get, target, not_json_response=True, absolute=True)
+    def _get_comment_pages(self, page_id: str, *, location: str | None) -> Iterable[dict[str, object]]:
+        """Yield every comment entry of ``page_id``, optionally limited to one location.
+        Paging continues while the response advertises ``_links.next`` or the page is
+        as long as the limit the server reports (falling back to the requested 100).
+        The earlier check compared only against the requested size, so a server that
+        capped the limit made the first page look like the last.
+        Args:
+            page_id: Id of the page whose comments are listed.
+            location: Location filter sent with the request; omitted when falsy.
+        Yields:
+            Each entry of ``results`` that is a dict.
+        Raises:
+            PullError: Propagated from ``_call`` when a request fails.
+        """
+        start = 0
+        page_size = 100
+        while True:
+            kwargs: dict[str, object] = {
+                "content_id": page_id,
+                "expand": "body.view,version,history,container,extensions.inlineProperties,extensions.resolution",
+                "start": start,
+                "limit": page_size,
+                "depth": "all",
+            }
+            if location:
+                kwargs["location"] = location
+            data = self._call(self._api.get_page_comments, **kwargs)
+            if not isinstance(data, dict):
+                return
+            results = data.get("results") or []
+            for item in results:
+                if isinstance(item, dict):
+                    yield item
+            if not results:
+                # An empty page always ends the walk, whatever the links say.
+                break
+            links = data.get("_links")
+            has_next = isinstance(links, dict) and bool(links.get("next"))
+            served_limit = data.get("limit")
+            if isinstance(served_limit, bool) or not isinstance(served_limit, int) or served_limit <= 0:
+                served_limit = page_size
+            if not has_next and len(results) < served_limit:
+                break
+            start += len(results)
+    def _parse_summary(self, data: dict[str, object], *, parent_id: str | None = None) -> PageSummary:
+        """Build a ``PageSummary`` from a content payload.
+        An explicit ``parent_id`` wins; otherwise the id of the last ancestor entry is
+        used when that entry is a dict.
+        """
+        raw_links = data.get("_links")
+        links = raw_links if isinstance(raw_links, dict) else {}
+        raw_ancestors = data.get("ancestors")
+        ancestors = raw_ancestors if isinstance(raw_ancestors, list) else []
+        raw_space = data.get("space")
+        space = raw_space if isinstance(raw_space, dict) else {}
+        resolved_parent = parent_id
+        if not resolved_parent and ancestors and isinstance(ancestors[-1], dict):
+            resolved_parent = str(ancestors[-1].get("id") or "") or None
+        page_url = self._absolute_url(str(links.get("webui") or "")) if links else None
+        return PageSummary(
+            page_id=str(data.get("id") or data.get("contentId") or ""),
+            title=str(data.get("title") or "Untitled"),
+            space_key=str(space.get("key") or data.get("spaceKey") or "") or None,
+            url=page_url,
+            parent_id=resolved_parent,
+        )
+    def _parse_page(self, data: dict[str, object]) -> PageRecord:
+        """Build a ``PageRecord`` from a content payload; non-dict sections count as empty."""
+        def section(source: dict[str, object], key: str) -> dict[str, object]:
+            # Return the nested dict under ``key``, or an empty dict for any other shape.
+            candidate = source.get(key)
+            return candidate if isinstance(candidate, dict) else {}
+        summary = self._parse_summary(data)
+        body = section(data, "body")
+        version_number = section(data, "version").get("number")
+        label_entries = section(section(data, "metadata"), "labels").get("results", [])
+        labels = []
+        for entry in label_entries:
+            if isinstance(entry, dict) and entry.get("name"):
+                labels.append(str(entry.get("name")))
+        return PageRecord(
+            page_id=summary.page_id,
+            title=summary.title,
+            space_key=summary.space_key,
+            url=summary.url,
+            parent_id=summary.parent_id,
+            version=int(version_number) if isinstance(version_number, int) else None,
+            body_view=_body_value(body, "view"),
+            body_export_view=_body_value(body, "export_view"),
+            body_storage=_body_value(body, "storage"),
+            labels=labels,
+            raw=redact_value(data),
+        )
+    def _parse_attachment(self, data: dict[str, object], page_id: str) -> AttachmentRecord:
+        """Build an ``AttachmentRecord`` from an attachment payload; non-dict sections count as empty."""
+        def section(key: str) -> dict[str, object]:
+            # Return the nested dict under ``key``, or an empty dict for any other shape.
+            candidate = data.get(key)
+            return candidate if isinstance(candidate, dict) else {}
+        links = section("_links")
+        metadata = section("metadata")
+        size = section("extensions").get("fileSize")
+        download = None
+        web = None
+        if links:
+            download = self._absolute_url(str(links.get("download") or ""))
+            web = self._absolute_url(str(links.get("webui") or ""))
+        return AttachmentRecord(
+            attachment_id=str(data.get("id") or ""),
+            page_id=page_id,
+            filename=str(data.get("title") or data.get("filename") or "attachment"),
+            media_type=str(metadata.get("mediaType") or data.get("mediaType") or "") or None,
+            download_url=download,
+            web_url=web,
+            file_size=int(size) if isinstance(size, int) else None,
+            raw=redact_value(data),
+        )
+    def _parse_comment(
+        self, data: dict[str, object], page_id: str, *, fallback_location: str
+    ) -> CommentRecord:
+        """Build a ``CommentRecord`` from a comment payload.
+        ``fallback_location`` is used when the payload's extensions name no location.
+        The author comes from the history creator, else from the version author.
+        """
+        def section(source: dict[str, object], key: str) -> dict[str, object]:
+            # Return the nested dict under ``key``, or an empty dict for any other shape.
+            candidate = source.get(key)
+            return candidate if isinstance(candidate, dict) else {}
+        view = section(section(data, "body"), "view")
+        version = section(data, "version")
+        history = section(data, "history")
+        extensions = section(data, "extensions")
+        parent = section(data, "parent")
+        version_number = version.get("number")
+        author = _person_display_name(history.get("createdBy"))
+        if not author:
+            author = _person_display_name(version.get("by"))
+        return CommentRecord(
+            comment_id=str(data.get("id") or ""),
+            page_id=page_id,
+            body_html=str(view.get("value") or ""),
+            location=str(extensions.get("location") or fallback_location or "") or None,
+            status=str(data.get("status") or "") or None,
+            version=int(version_number) if isinstance(version_number, int) else None,
+            author=author,
+            created_at=str(history.get("createdDate") or "") or None,
+            updated_at=str(version.get("when") or "") or None,
+            parent_id=str(parent.get("id") or data.get("parentId") or "") or None,
+            resolution=_resolution_label(section(extensions, "resolution")),
+            raw=redact_value(data),
+        )
+def _body_value(body: dict[str, object], name: str) -> str | None:
+    """Return ``body[name]["value"]`` when it is a string, otherwise ``None``."""
+    representation = body.get(name)
+    if not isinstance(representation, dict):
+        return None
+    text = representation.get("value")
+    return text if isinstance(text, str) else None
+def _person_display_name(value: object) -> str | None:
+    """Return the first non-empty of ``displayName``, ``publicName``, ``username`` as text."""
+    if not isinstance(value, dict):
+        return None
+    for field in ("displayName", "publicName", "username"):
+        candidate = value.get(field)
+        if candidate:
+            return str(candidate) or None
+    return None
+def _resolution_label(value: dict[str, object]) -> str | None:
+    """Return a label for a resolution dict.
+    A truthy ``status`` wins, then a truthy ``state``; otherwise a boolean ``resolved``
+    maps to ``"resolved"``/``"unresolved"``. Anything else gives ``None``.
+    """
+    if not value:
+        return None
+    label = value.get("status") or value.get("state")
+    if label:
+        return str(label)
+    resolved = value.get("resolved")
+    if isinstance(resolved, bool):
+        return "resolved" if resolved else "unresolved"
+    return None
+def _status_code(exc: Exception) -> int | None:
+    """Return the HTTP status carried by ``exc.response`` or, failing that, ``exc.reason.response``."""
+    for holder in (exc, getattr(exc, "reason", None)):
+        response = getattr(holder, "response", None)
+        if response is not None:
+            # The first response found decides, even when it lacks a status code.
+            return getattr(response, "status_code", None)
+    return None
+def _error_details(exc: Exception) -> dict[str, object]:
+    """Collect reportable details about a failed request.
+    The response is taken from ``exc.response`` or, failing that, from
+    ``exc.reason.response``. Presence is tested with ``is None`` because a
+    ``requests`` response evaluates as false for 4xx/5xx statuses; the earlier
+    truthiness test discarded the response exactly when an error had occurred.
+    Args:
+        exc: The exception raised by the API call.
+    Returns:
+        A dict with ``reason`` and, when a response is available, ``status_code``
+        and (when the response exposes its request) the sanitised ``url``.
+    """
+    response = getattr(exc, "response", None)
+    if response is None:
+        response = getattr(getattr(exc, "reason", None), "response", None)
+    details: dict[str, object] = {"reason": str(exc)}
+    if response is not None:
+        details["status_code"] = getattr(response, "status_code", None)
+        request = getattr(response, "request", None)
+        if request is not None:
+            details["url"] = sanitize_url(str(getattr(request, "url", "")))
+    return details
+def query(params: dict[str, object]) -> str:
+    """Encode ``params`` as a query string; sequences expand to repeated keys, spaces become ``%20``."""
+    encoded = urlencode(params, quote_via=quote, doseq=True)
+    return encoded

pull_cli/clients/hybrid.py ADDED Viewed

@@ -0,0 +1,15 @@
+from __future__ import annotations
+from pull_cli.models import Config
+from .cloud_v2 import CloudV2Client
+from .data_center import DataCenterClient
+def build_client(config: Config):
+    """Return the client implementation matching the configured deployment.
+    ``"cloud"`` always selects ``CloudV2Client``. ``"auto"`` selects it when the base
+    URL's hostname ends with ``.atlassian.net``; the check is made on the parsed
+    hostname, case-insensitively, rather than as a substring of the whole URL, so a
+    path or query that merely mentions the domain no longer switches clients.
+    Every other case yields ``DataCenterClient``.
+    """
+    from urllib.parse import urlsplit
+    deployment = config.deployment
+    if deployment == "cloud":
+        return CloudV2Client(config)
+    if deployment == "auto" and config.base_url:
+        candidate = str(config.base_url).strip()
+        if "://" not in candidate:
+            # Scheme-less values still need a netloc for hostname parsing.
+            candidate = "//" + candidate
+        try:
+            host = urlsplit(candidate).hostname or ""
+        except ValueError:
+            host = ""
+        if host.rstrip(".").endswith(".atlassian.net"):
+            return CloudV2Client(config)
+    return DataCenterClient(config)

pull_cli/config.py ADDED Viewed

@@ -0,0 +1,82 @@
+from __future__ import annotations
+import os
+from pathlib import Path
+from typing import Any
+import yaml
+from .models import Config
+def _coerce_ssl_verify(value: str | bool | None) -> bool | str:
+    """Normalise an SSL-verification setting.
+    Args:
+        value: A boolean, a boolean-like word, a number, a path-like string, or nothing.
+    Returns:
+        ``True`` for missing or blank input; the boolean itself for booleans; the
+        truthiness of numbers (a config file may supply ``0``/``1``); ``True``/``False``
+        for recognised words; otherwise the stripped text, returned unchanged for the
+        caller to interpret (presumably a CA bundle path).
+    """
+    if value is None:
+        return True
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, (int, float)):
+        # Numeric scalars have no ``strip``; treat them by truthiness.
+        return bool(value)
+    text = str(value).strip()
+    if not text:
+        return True
+    lowered = text.lower()
+    if lowered in {"true", "1", "yes", "y", "on"}:
+        return True
+    if lowered in {"false", "0", "no", "n", "off"}:
+        return False
+    return text
+def _load_config_file(path: Path | None) -> dict[str, Any]:
+    """Read a YAML config file into a dict.
+    Returns ``{}`` when no path is given, the path does not exist, or the document
+    is empty or not a mapping.
+    """
+    if not path or not path.exists():
+        return {}
+    loaded = yaml.safe_load(path.read_text(encoding="utf-8"))
+    if isinstance(loaded, dict) and loaded:
+        return loaded
+    return {}
+def resolve_config(
+    *,
+    base_url: str | None = None,
+    user: str | None = None,
+    token: str | None = None,
+    cloud_id: str | None = None,
+    ssl_verify: str | bool | None = None,
+    config_path: str | Path | None = None,
+    env: dict[str, str] | None = None,
+) -> Config:
+    """Merge explicit arguments, environment variables and the config file into a ``Config``.
+    Precedence for each setting: explicit argument, ``PULL_*`` variable, config file,
+    then ``CONFPUB_*`` variable (``cloud_id`` has no ``CONFPUB_*`` fallback and
+    ``deployment`` is read from the file only).
+    Args:
+        base_url: Explicit base URL.
+        user: Explicit user name.
+        token: Explicit token.
+        cloud_id: Explicit cloud id.
+        ssl_verify: Explicit SSL setting; any non-``None`` value wins.
+        config_path: Optional config file location; ``~`` is expanded.
+        env: Environment mapping to read; defaults to ``os.environ``.
+    Returns:
+        The resolved configuration, with trailing slashes stripped from ``base_url``.
+    """
+    env_map = env if env is not None else os.environ
+    path = Path(config_path).expanduser() if config_path else None
+    file_data = _load_config_file(path)
+    if ssl_verify is not None:
+        chosen_ssl_verify = ssl_verify
+    else:
+        ssl_candidates = (
+            env_map.get("PULL_SSL_VERIFY"),
+            file_data.get("ssl_verify"),
+            env_map.get("CONFPUB_SSL_VERIFY"),
+        )
+        # An explicit ``False`` from the config file is a real setting; the former
+        # ``or`` chain treated it as missing and left verification switched on.
+        chosen_ssl_verify = next(
+            (candidate for candidate in ssl_candidates if candidate is False or candidate),
+            None,
+        )
+    resolved = Config(
+        base_url=(
+            base_url
+            or env_map.get("PULL_URL")
+            or file_data.get("base_url")
+            or env_map.get("CONFPUB_URL")
+        ),
+        user=(
+            user
+            or env_map.get("PULL_USER")
+            or file_data.get("user")
+            or env_map.get("CONFPUB_USER")
+        ),
+        token=(
+            token
+            or env_map.get("PULL_TOKEN")
+            or file_data.get("token")
+            or env_map.get("CONFPUB_TOKEN")
+        ),
+        cloud_id=cloud_id or env_map.get("PULL_CLOUD_ID") or file_data.get("cloud_id"),
+        ssl_verify=_coerce_ssl_verify(chosen_ssl_verify),
+        # A null or empty ``deployment`` entry falls back to auto-detection.
+        deployment=file_data.get("deployment") or "auto",
+        config_path=path,
+    )
+    if resolved.base_url:
+        resolved.base_url = resolved.base_url.rstrip("/")
+    return resolved

pull_cli/crawler.py ADDED Viewed

@@ -0,0 +1,51 @@
+from __future__ import annotations
+from collections import deque
+from .clients.base import ConfluenceClient
+from .errors import EXIT_SOURCE, PullError
+from .models import PageSummary
+def crawl_pages(
+    client: ConfluenceClient,
+    root: PageSummary,
+    *,
+    tree: bool,
+    depth: int | None,
+    max_pages: int,
+) -> list[PageSummary]:
+    """Collect ``root`` and, in tree mode, its descendants in breadth-first order.
+    Each returned summary gets ``depth``, ``parent_id`` and a 1-based ``order``.
+    Args:
+        client: Source of ``get_children`` lookups.
+        root: Starting page; its ``depth`` is reset to 0 and its ``parent_id`` to ``None``.
+        tree: When false, only ``root`` is returned.
+        depth: Maximum depth to descend to; ``None`` means unlimited, 0 means root only.
+        max_pages: Cap on the number of collected pages.
+    Returns:
+        The collected summaries, root first.
+    Raises:
+        PullError: When another page would exceed ``max_pages``.
+    """
+    root.depth = 0
+    root.parent_id = None
+    ordered: list[PageSummary] = [root]
+    if tree and depth != 0:
+        visited = {root.page_id}
+        pending: deque[PageSummary] = deque(ordered)
+        while pending:
+            current = pending.popleft()
+            # Pages sitting at the depth limit are kept but not expanded.
+            if depth is not None and current.depth >= depth:
+                continue
+            for child in client.get_children(current.page_id):
+                if child.page_id in visited:
+                    continue
+                if len(ordered) >= max_pages:
+                    raise PullError(
+                        code="ERR_SOURCE_TREE_TOO_LARGE",
+                        message=f"Tree extraction exceeded the max page cap of {max_pages}.",
+                        exit_code=EXIT_SOURCE,
+                        suggested_action="Use --max-pages with a higher value or reduce --depth.",
+                        details={"max_pages": max_pages},
+                    )
+                child.parent_id = current.page_id
+                child.depth = current.depth + 1
+                visited.add(child.page_id)
+                ordered.append(child)
+                pending.append(child)
+    for position, summary in enumerate(ordered, start=1):
+        summary.order = position
+    return ordered