pull-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol
4
+
5
+ from pull_cli.models import AttachmentRecord, CommentRecord, PageRecord, PageSummary
6
+
7
+
8
class ConfluenceClient(Protocol):
    """Structural interface that every Confluence client adapter satisfies.

    Any object exposing these attributes and methods can be used wherever a
    ``ConfluenceClient`` is expected; no inheritance is required.
    """

    # Attributes every adapter exposes.
    base_url: str
    deployment_type: str
    api_calls: int

    def get_page(self, page_id: str) -> PageRecord:
        """Return the record for the page identified by ``page_id``."""

    def find_page(self, space: str, title: str) -> list[PageSummary]:
        """Return summaries of pages matching ``title`` within ``space``."""

    def get_children(self, page_id: str) -> list[PageSummary]:
        """Return summaries of the child pages of ``page_id``."""

    def get_descendants(self, page_id: str, depth: int | None = None) -> list[PageSummary]:
        """Return summaries of pages below ``page_id``, optionally limited by ``depth``."""

    def list_attachments(self, page_id: str) -> list[AttachmentRecord]:
        """Return the attachment records of ``page_id``."""

    def list_comments(self, page_id: str) -> list[CommentRecord]:
        """Return the comment records of ``page_id``."""

    def download_attachment(self, attachment: AttachmentRecord) -> bytes:
        """Return the binary content of ``attachment``."""

    def download_url(self, url: str) -> bytes:
        """Return the binary content found at ``url``."""

    def close(self) -> None:
        """Release whatever resources the adapter holds."""
@@ -0,0 +1,132 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from atlassian import Confluence
6
+
7
+ from pull_cli.models import AttachmentRecord, Config, PageRecord, PageSummary
8
+ from pull_cli.security import redact_value
9
+
10
+ from .data_center import DataCenterClient
11
+
12
+
13
class CloudV2Client(DataCenterClient):
    """Confluence Cloud adapter backed by atlassian-python-api.

    The installed atlassian-python-api package exposes the legacy `Confluence`
    class in this environment. We use its public helpers for v1 content endpoints
    and its low-level `get` method for Cloud v2 endpoints until the documented
    `ConfluenceCloud` class is available in the package index.

    v2 collection endpoints are cursor-paginated; listings follow the
    ``_links.next`` link so results beyond the first page are not dropped.
    """

    deployment_type = "cloud"

    def __init__(self, config: Config, *, api: Confluence | None = None) -> None:
        """Initialise the shared client state and derive the site root URL.

        Args:
            config: Resolved settings, passed through to the base class.
            api: Optional pre-built API object, passed through to the base class.
        """
        super().__init__(config, api=api)
        # v2 URLs are built from the site root, so drop a trailing "/wiki".
        self._site_url = self.base_url.removesuffix("/wiki")

    def _build_api(self, config: Config) -> Confluence:
        """Create the ``Confluence`` handle with ``cloud=True`` and retry settings.

        With both user and token configured the pair is sent as
        username/password; with only a token it is passed as ``token``.
        """
        kwargs = {
            "url": self.base_url,
            "verify_ssl": config.ssl_verify,
            "timeout": 30,
            "cloud": True,
            "backoff_and_retry": True,
            "retry_status_codes": [429, 502, 503, 504],
            "max_backoff_retries": 3,
            "max_backoff_seconds": 8,
            "backoff_factor": 0.25,
            "backoff_jitter": 0,
        }
        if config.token and config.user:
            kwargs["username"] = config.user
            kwargs["password"] = config.token
        elif config.token:
            kwargs["token"] = config.token
        return Confluence(**kwargs)

    def _v2_url(self, *parts: str) -> str:
        """Return the absolute ``/wiki/api/v2/...`` URL for the given path parts."""
        return self._site_url + "/" + "/".join(["wiki", "api", "v2", *parts])

    def _cloud_v2_get(self, *parts: str, params: dict[str, object] | None = None) -> dict[str, Any]:
        """GET one v2 resource; a non-dict payload is returned as ``{}``."""
        data = self._call(
            self._api.get,
            self._v2_url(*parts),
            params=params,
            absolute=True,
        )
        return data if isinstance(data, dict) else {}

    def _cloud_v2_results(
        self, *parts: str, params: dict[str, object] | None = None
    ) -> list[dict[str, Any]]:
        """Collect the dict items of a v2 collection across all cursor pages.

        Returns an empty list when the first response carries no ``results``
        list, which lets callers fall back to the v1 endpoint.
        """
        items: list[dict[str, Any]] = []
        visited: set[str] = set()
        data = self._cloud_v2_get(*parts, params=params)
        while True:
            results = data.get("results")
            if not isinstance(results, list):
                break
            items.extend(entry for entry in results if isinstance(entry, dict))
            links = data.get("_links")
            next_link = links.get("next") if isinstance(links, dict) else None
            if not results or not isinstance(next_link, str) or not next_link:
                break
            next_url = self._absolute_url(next_link)
            # Guard against a server that keeps handing back the same cursor.
            if not next_url or next_url in visited:
                break
            visited.add(next_url)
            # The next link already carries the cursor and limit, so no params.
            fetched = self._call(self._api.get, next_url, absolute=True)
            data = fetched if isinstance(fetched, dict) else {}
        return items

    def _item_web_url(self, item: dict[str, Any]) -> str | None:
        """Return the absolute ``_links.webui`` URL of ``item``, or ``None``."""
        links = item.get("_links")
        if not isinstance(links, dict):
            return None
        # ``or ""`` keeps a null link from being rendered as the text "None".
        return self._absolute_url(str(links.get("webui") or ""))

    def get_page(self, page_id: str) -> PageRecord:
        """Fetch a page via the v1 helper and annotate it with v2 metadata.

        The redacted v2 payload is stored under ``raw["cloud_v2"]``; the v2
        storage body is used only when the v1 record has no storage body.
        """
        # Prefer the v1 helper for rich combined body expansion; annotate with v2 metadata.
        page = super().get_page(page_id)
        data = self._cloud_v2_get("pages", page_id, params={"body-format": "storage"})
        if data:
            page.raw["cloud_v2"] = redact_value(data)
            if not page.body_storage:
                body = data.get("body") if isinstance(data.get("body"), dict) else {}
                storage = body.get("storage") if isinstance(body, dict) else {}
                if isinstance(storage, dict) and isinstance(storage.get("value"), str):
                    page.body_storage = storage["value"]
        return page

    def get_children(self, page_id: str) -> list[PageSummary]:
        """Return all child pages via v2, falling back to v1 when v2 yields none."""
        summaries = [
            PageSummary(
                page_id=str(item.get("id")),
                title=str(item.get("title") or "Untitled"),
                space_key=_space_key(item),
                url=self._item_web_url(item),
                parent_id=page_id,
            )
            for item in self._cloud_v2_results(
                "pages", page_id, "children", params={"limit": 100}
            )
            if item.get("id")
        ]
        if summaries:
            return summaries
        return super().get_children(page_id)

    def list_attachments(self, page_id: str) -> list[AttachmentRecord]:
        """Return all attachments via v2, falling back to v1 when v2 yields none."""
        attachments = [
            AttachmentRecord(
                attachment_id=str(item.get("id")),
                page_id=page_id,
                filename=str(item.get("title") or item.get("filename") or "attachment"),
                media_type=str(item.get("mediaType") or "") or None,
                download_url=self._absolute_url(str(item.get("downloadLink") or ""))
                if item.get("downloadLink")
                else None,
                web_url=self._item_web_url(item),
                file_size=int(item["fileSize"]) if isinstance(item.get("fileSize"), int) else None,
                raw=redact_value(item),
            )
            for item in self._cloud_v2_results(
                "pages", page_id, "attachments", params={"limit": 250}
            )
            if item.get("id")
        ]
        if attachments:
            return attachments
        return super().list_attachments(page_id)

    def download_attachment(self, attachment: AttachmentRecord) -> bytes:
        """Download an attachment through the v1 ``child/attachment/.../download`` URL."""
        return self._call(
            self._api.get,
            f"{self._site_url}/wiki/rest/api/content/{attachment.page_id}/child/attachment/{attachment.attachment_id}/download",
            headers={"Accept": "*/*"},
            not_json_response=True,
            absolute=True,
        )
128
+
129
+
130
def _space_key(item: dict[str, Any]) -> str | None:
    """Return the space key of ``item`` as text, or ``None`` when absent.

    The nested ``space.key`` is preferred; the flat ``spaceKey`` field is the
    fallback.
    """
    nested = item.get("space")
    if not isinstance(nested, dict):
        nested = {}
    chosen = nested.get("key") or item.get("spaceKey") or ""
    text = str(chosen)
    return text if text else None
@@ -0,0 +1,360 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable
4
+ from urllib.parse import quote, urlencode, urljoin
5
+
6
+ import requests
7
+ from atlassian import Confluence
8
+ from atlassian.errors import ApiError, ApiPermissionError
9
+
10
+ from pull_cli.errors import EXIT_AUTH, EXIT_IO, EXIT_SOURCE, PullError
11
+ from pull_cli.models import AttachmentRecord, CommentRecord, Config, PageRecord, PageSummary
12
+ from pull_cli.security import redact_value, sanitize_url
13
+
14
+
15
class DataCenterClient:
    """Confluence adapter built on atlassian-python-api's ``Confluence`` class.

    Every remote operation goes through :meth:`_call`, which counts requests in
    ``api_calls`` and converts library/transport exceptions into ``PullError``.
    """

    deployment_type = "data_center"

    def __init__(self, config: Config, *, api: Confluence | None = None) -> None:
        """Validate the base URL and create (or adopt) the API handle.

        Args:
            config: Resolved settings; ``base_url`` is required.
            api: Optional pre-built API object used instead of ``_build_api``.

        Raises:
            PullError: ``ERR_VALIDATION_REQUIRED`` when no base URL is set.
        """
        if not config.base_url:
            raise PullError(
                code="ERR_VALIDATION_REQUIRED",
                message="A Confluence base URL is required.",
                exit_code=10,
                suggested_action="Set --base-url, PULL_URL, or CONFPUB_URL.",
            )
        self.base_url = config.base_url.rstrip("/")
        self.api_calls = 0
        self._api = api or self._build_api(config)

    def _build_api(self, config: Config) -> Confluence:
        """Create the ``Confluence`` handle with timeout and retry settings.

        With both user and token configured the pair is sent as
        username/password; with only a token it is passed as ``token``.
        """
        kwargs = {
            "url": self.base_url,
            "verify_ssl": config.ssl_verify,
            "timeout": 30,
            "backoff_and_retry": True,
            "retry_status_codes": [429, 502, 503, 504],
            "max_backoff_retries": 3,
            "max_backoff_seconds": 8,
            "backoff_factor": 0.25,
            "backoff_jitter": 0,
        }
        if config.token and config.user:
            kwargs["username"] = config.user
            kwargs["password"] = config.token
        elif config.token:
            kwargs["token"] = config.token
        return Confluence(**kwargs)

    def close(self) -> None:
        """Call the API handle's ``close`` method when it has a callable one."""
        close = getattr(self._api, "close", None)
        if callable(close):
            close()

    def _absolute_url(self, value: str | None) -> str | None:
        """Resolve ``value`` against ``base_url``; empty input gives ``None``.

        Absolute http(s) URLs are returned unchanged. A ``/wiki/...`` path is
        joined to the site root when ``base_url`` itself ends in ``/wiki`` so
        the context path is not duplicated.
        """
        if not value:
            return None
        if value.startswith(("http://", "https://")):
            return value
        if value.startswith("/wiki/") and self.base_url.endswith("/wiki"):
            return urljoin(self.base_url.removesuffix("/wiki") + "/", value.lstrip("/"))
        return urljoin(self.base_url + "/", value.lstrip("/"))

    def _call(self, operation, *args, **kwargs):
        """Invoke ``operation`` and translate failures into ``PullError``.

        Increments ``api_calls`` before the call. ``ApiError`` instances are
        classified by the HTTP status found on the exception or on its
        ``reason``, so a wrapped 401/403/404 gets the auth / not-found code
        rather than the generic API error.

        Raises:
            PullError: for timeouts, HTTP/API errors and connection failures.
        """
        self.api_calls += 1
        try:
            return operation(*args, **kwargs)
        except requests.Timeout as exc:
            raise PullError(
                code="ERR_IO_TIMEOUT",
                message="Timed out while contacting Confluence.",
                exit_code=EXIT_IO,
                retryable=True,
                suggested_action="Retry the command or reduce scope.",
            ) from exc
        except (ApiError, requests.HTTPError) as exc:
            status = _status_code(exc)
            if status in {401, 403}:
                raise PullError(
                    code="ERR_AUTH_FORBIDDEN" if status == 403 else "ERR_AUTH_REQUIRED",
                    message="Confluence authentication failed or the page is not visible.",
                    exit_code=EXIT_AUTH,
                    suggested_action="Check credentials and page permissions.",
                    details=_error_details(exc),
                ) from exc
            if status == 404:
                raise PullError(
                    code="ERR_SOURCE_PAGE_NOT_FOUND",
                    message="The requested Confluence page was not found.",
                    exit_code=EXIT_SOURCE,
                    details=_error_details(exc),
                ) from exc
            if (
                status is None
                and isinstance(exc, ApiError)
                and not isinstance(exc, ApiPermissionError)
            ):
                # No HTTP status to report: keep the generic API error shape.
                raise PullError(
                    code="ERR_INTERNAL_API_RESPONSE",
                    message="Confluence API returned an error.",
                    exit_code=EXIT_IO,
                    details={"reason": str(exc)},
                ) from exc
            raise PullError(
                code="ERR_INTERNAL_API_RESPONSE",
                message=f"Confluence returned HTTP {status or 'error'}.",
                exit_code=EXIT_IO,
                retryable=status in {429, 502, 503, 504},
                details=_error_details(exc),
            ) from exc
        except requests.RequestException as exc:
            raise PullError(
                code="ERR_IO_CONNECTION",
                message="Could not contact Confluence.",
                exit_code=EXIT_IO,
                retryable=True,
                details={"reason": str(exc)},
            ) from exc

    @staticmethod
    def _page_exhausted(data: dict[str, object], count: int, requested: int) -> bool:
        """Return True when a paged response was the last page.

        Args:
            data: Response envelope (may be empty when only a list is known).
            count: Number of results in this page.
            requested: The ``limit`` that was asked for.

        An empty page always ends paging and a ``_links.next`` entry always
        continues it. Otherwise the page is last when it is shorter than the
        effective limit, which is the server-reported ``limit`` when the server
        capped the requested one.
        """
        if count == 0:
            return True
        links = data.get("_links")
        if isinstance(links, dict) and links.get("next"):
            return False
        effective = requested
        reported = data.get("limit")
        if (
            isinstance(reported, int)
            and not isinstance(reported, bool)
            and 0 < reported < requested
        ):
            effective = reported
        return count < effective

    @staticmethod
    def _relative_depth(item: dict[str, object], ancestor_id: str) -> int | None:
        """Return how many levels ``item`` sits below ``ancestor_id``.

        Computed from the item's ``ancestors`` list, whose last entry is the
        immediate parent (the same convention ``_parse_summary`` relies on).
        Returns ``None`` when the list is missing or does not contain
        ``ancestor_id``.
        """
        ancestors = item.get("ancestors")
        if not isinstance(ancestors, list):
            return None
        ids = [
            str(entry.get("id") or "") if isinstance(entry, dict) else ""
            for entry in ancestors
        ]
        target = str(ancestor_id)
        if target not in ids:
            return None
        # Distance from the end of the chain: the immediate parent gives 1.
        return ids[::-1].index(target) + 1

    def _get_paged(
        self,
        path: str,
        *,
        params: dict[str, object] | None = None,
        page_size: int = 100,
    ) -> Iterable[dict[str, object]]:
        """Yield the dict results of a ``start``/``limit`` paged endpoint.

        Args:
            path: Endpoint path handed to the API handle's ``get``.
            params: Extra query parameters; ``start`` is always controlled here.
            page_size: Requested ``limit`` per request.
        """
        start = 0
        while True:
            # ``start`` is applied last so caller params cannot freeze paging.
            merged = {"limit": page_size, **(params or {}), "start": start}
            data = self._call(self._api.get, path, params=merged)
            if not isinstance(data, dict):
                return
            results = data.get("results") or []
            for item in results:
                if isinstance(item, dict):
                    yield item
            if self._page_exhausted(data, len(results), page_size):
                break
            start += len(results)

    def get_page(self, page_id: str) -> PageRecord:
        """Fetch one page with body, version, space, label and ancestor expansions."""
        expand = "body.view,body.export_view,body.storage,version,space,metadata.labels,_links,ancestors"
        data = self._call(self._api.get_page_by_id, page_id, expand=expand)
        return self._parse_page(data)

    def find_page(self, space: str, title: str) -> list[PageSummary]:
        """Return summaries of pages in ``space`` whose title equals ``title``."""
        path = "rest/api/content"
        params = {"spaceKey": space, "title": title, "type": "page", "expand": "space,_links"}
        return [self._parse_summary(item) for item in self._get_paged(path, params=params)]

    def get_children(self, page_id: str) -> list[PageSummary]:
        """Return summaries of all direct child pages of ``page_id``.

        Children are requested in batches of 100 until a short or empty batch
        arrives, so pages with more than 100 children are not truncated.
        """
        page_size = 100
        start = 0
        seen: set[str] = set()
        summaries: list[PageSummary] = []
        while True:
            batch = self._call(
                self._api.get_page_child_by_type,
                page_id=page_id,
                type="page",
                start=start,
                limit=page_size,
                expand="space,_links,ancestors",
            )
            if batch is None:
                break
            if isinstance(batch, dict):
                envelope = batch
                results = batch.get("results") or []
            else:
                envelope = {}
                results = list(batch)
            parsed = [
                self._parse_summary(item, parent_id=page_id)
                for item in results
                if isinstance(item, dict)
            ]
            batch_ids = {summary.page_id for summary in parsed}
            if start and batch_ids <= seen:
                # The same page came back again; stop instead of looping.
                break
            seen |= batch_ids
            summaries.extend(parsed)
            if self._page_exhausted(envelope, len(results), page_size):
                break
            start += len(results)
        return summaries

    def get_descendants(self, page_id: str, depth: int | None = None) -> list[PageSummary]:
        """Return summaries of pages below ``page_id``.

        Args:
            page_id: Root of the subtree.
            depth: When given, keep only descendants at most this many levels
                below ``page_id``.

        With ``depth`` set, the ``ancestors`` expansion is requested and each
        summary's ``depth`` is assigned from it. A descendant whose depth
        cannot be determined is kept.
        """
        path = f"rest/api/content/{page_id}/descendant/page"
        if depth is None:
            return [self._parse_summary(item) for item in self._get_paged(path)]
        selected: list[PageSummary] = []
        for item in self._get_paged(path, params={"expand": "ancestors"}):
            summary = self._parse_summary(item)
            level = self._relative_depth(item, page_id)
            if level is not None:
                # NOTE(review): assumes PageSummary.depth is assignable, as the
                # crawler already assigns it — confirm against the model.
                summary.depth = level
                if level > depth:
                    continue
            selected.append(summary)
        return selected

    def list_attachments(self, page_id: str) -> list[AttachmentRecord]:
        """Return the attachment records of ``page_id``."""
        attachments: list[AttachmentRecord] = []
        for item in self._get_paged(
            f"rest/api/content/{page_id}/child/attachment",
            params={"expand": "version,_links,extensions"},
            page_size=100,
        ):
            attachments.append(self._parse_attachment(item, page_id))
        return attachments

    def list_comments(self, page_id: str) -> list[CommentRecord]:
        """Return the comments of ``page_id``, de-duplicated by comment id.

        Two passes are made: one without a location filter (labelled "footer"
        when the comment does not state its location) and one for "inline".
        """
        comments: list[CommentRecord] = []
        seen: set[str] = set()
        for location in (None, "inline"):
            for item in self._get_comment_pages(page_id, location=location):
                comment = self._parse_comment(item, page_id, fallback_location=location or "footer")
                if not comment.comment_id or comment.comment_id in seen:
                    continue
                seen.add(comment.comment_id)
                comments.append(comment)
        return comments

    def download_attachment(self, attachment: AttachmentRecord) -> bytes:
        """Download ``attachment`` via its ``download_url``.

        Raises:
            PullError: ``ERR_SOURCE_BODY_UNAVAILABLE`` when no URL is recorded.
        """
        if not attachment.download_url:
            raise PullError(
                code="ERR_SOURCE_BODY_UNAVAILABLE",
                message=f"Attachment {attachment.filename} has no download URL.",
                exit_code=EXIT_SOURCE,
            )
        return self.download_url(attachment.download_url)

    def download_url(self, url: str) -> bytes:
        """Fetch the raw (non-JSON) response for ``url``, resolved against ``base_url``."""
        absolute = self._absolute_url(url) or url
        return self._call(self._api.get, absolute, not_json_response=True, absolute=True)

    def _get_comment_pages(self, page_id: str, *, location: str | None) -> Iterable[dict[str, object]]:
        """Yield the comment dicts of ``page_id``, optionally filtered by ``location``."""
        start = 0
        page_size = 100
        while True:
            kwargs: dict[str, object] = {
                "content_id": page_id,
                "expand": "body.view,version,history,container,extensions.inlineProperties,extensions.resolution",
                "start": start,
                "limit": page_size,
                "depth": "all",
            }
            if location:
                kwargs["location"] = location
            data = self._call(self._api.get_page_comments, **kwargs)
            if not isinstance(data, dict):
                return
            results = data.get("results") or []
            for item in results:
                if isinstance(item, dict):
                    yield item
            if self._page_exhausted(data, len(results), page_size):
                break
            start += len(results)

    def _parse_summary(self, data: dict[str, object], *, parent_id: str | None = None) -> PageSummary:
        """Build a ``PageSummary`` from a content dict.

        ``parent_id`` wins when given; otherwise the id of the last ancestor
        entry is used when ancestors are present.
        """
        links = data.get("_links") if isinstance(data.get("_links"), dict) else {}
        ancestors = data.get("ancestors") if isinstance(data.get("ancestors"), list) else []
        parsed_parent = parent_id
        if not parsed_parent and ancestors:
            last = ancestors[-1]
            if isinstance(last, dict):
                parsed_parent = str(last.get("id") or "") or None
        space = data.get("space") if isinstance(data.get("space"), dict) else {}
        return PageSummary(
            page_id=str(data.get("id") or data.get("contentId") or ""),
            title=str(data.get("title") or "Untitled"),
            space_key=str(space.get("key") or data.get("spaceKey") or "") or None,
            url=self._absolute_url(str(links.get("webui") or "")) if links else None,
            parent_id=parsed_parent,
        )

    def _parse_page(self, data: dict[str, object]) -> PageRecord:
        """Build a ``PageRecord`` (bodies, version, labels, redacted raw) from a content dict."""
        summary = self._parse_summary(data)
        body = data.get("body") if isinstance(data.get("body"), dict) else {}
        version = data.get("version") if isinstance(data.get("version"), dict) else {}
        metadata = data.get("metadata") if isinstance(data.get("metadata"), dict) else {}
        labels_data = metadata.get("labels") if isinstance(metadata.get("labels"), dict) else {}
        labels = [
            str(item.get("name"))
            for item in labels_data.get("results", [])
            if isinstance(item, dict) and item.get("name")
        ]
        return PageRecord(
            page_id=summary.page_id,
            title=summary.title,
            space_key=summary.space_key,
            url=summary.url,
            parent_id=summary.parent_id,
            version=int(version["number"]) if isinstance(version.get("number"), int) else None,
            body_view=_body_value(body, "view"),
            body_export_view=_body_value(body, "export_view"),
            body_storage=_body_value(body, "storage"),
            labels=labels,
            raw=redact_value(data),
        )

    def _parse_attachment(self, data: dict[str, object], page_id: str) -> AttachmentRecord:
        """Build an ``AttachmentRecord`` for ``page_id`` from an attachment dict."""
        links = data.get("_links") if isinstance(data.get("_links"), dict) else {}
        metadata = data.get("metadata") if isinstance(data.get("metadata"), dict) else {}
        media_type = str(metadata.get("mediaType") or data.get("mediaType") or "") or None
        extensions = data.get("extensions") if isinstance(data.get("extensions"), dict) else {}
        return AttachmentRecord(
            attachment_id=str(data.get("id") or ""),
            page_id=page_id,
            filename=str(data.get("title") or data.get("filename") or "attachment"),
            media_type=media_type,
            download_url=self._absolute_url(str(links.get("download") or "")) if links else None,
            web_url=self._absolute_url(str(links.get("webui") or "")) if links else None,
            file_size=int(extensions["fileSize"]) if isinstance(extensions.get("fileSize"), int) else None,
            raw=redact_value(data),
        )

    def _parse_comment(
        self, data: dict[str, object], page_id: str, *, fallback_location: str
    ) -> CommentRecord:
        """Build a ``CommentRecord`` for ``page_id`` from a comment dict.

        ``fallback_location`` is used when the comment's extensions carry no
        ``location`` value.
        """
        body = data.get("body") if isinstance(data.get("body"), dict) else {}
        view = body.get("view") if isinstance(body.get("view"), dict) else {}
        version = data.get("version") if isinstance(data.get("version"), dict) else {}
        history = data.get("history") if isinstance(data.get("history"), dict) else {}
        extensions = data.get("extensions") if isinstance(data.get("extensions"), dict) else {}
        resolution = extensions.get("resolution") if isinstance(extensions.get("resolution"), dict) else {}
        parent = data.get("parent") if isinstance(data.get("parent"), dict) else {}
        return CommentRecord(
            comment_id=str(data.get("id") or ""),
            page_id=page_id,
            body_html=str(view.get("value") or ""),
            location=str(extensions.get("location") or fallback_location or "") or None,
            status=str(data.get("status") or "") or None,
            version=int(version["number"]) if isinstance(version.get("number"), int) else None,
            author=_person_display_name(history.get("createdBy")) or _person_display_name(version.get("by")),
            created_at=str(history.get("createdDate") or "") or None,
            updated_at=str(version.get("when") or "") or None,
            parent_id=str(parent.get("id") or data.get("parentId") or "") or None,
            resolution=_resolution_label(resolution),
            raw=redact_value(data),
        )
311
+
312
+
313
def _body_value(body: dict[str, object], name: str) -> str | None:
    """Return ``body[name]["value"]`` when it is a string, else ``None``."""
    representation = body.get(name)
    if not isinstance(representation, dict):
        return None
    text = representation.get("value")
    return text if isinstance(text, str) else None
318
+
319
+
320
def _person_display_name(value: object) -> str | None:
    """Return a readable name from a person dict, or ``None``.

    The first non-empty of ``displayName``, ``publicName`` and ``username`` is
    used; anything that is not a dict yields ``None``.
    """
    if isinstance(value, dict):
        for field in ("displayName", "publicName", "username"):
            candidate = value.get(field)
            if candidate:
                return str(candidate) or None
    return None
324
+
325
+
326
def _resolution_label(value: dict[str, object]) -> str | None:
    """Return a text label for a comment resolution dict, or ``None``.

    A truthy ``status`` (then ``state``) is returned as text; otherwise a
    boolean ``resolved`` flag maps to "resolved" / "unresolved".
    """
    if not value:
        return None
    label = value.get("status") or value.get("state")
    if label:
        return str(label)
    flag = value.get("resolved")
    if isinstance(flag, bool):
        return "resolved" if flag else "unresolved"
    return None
335
+
336
+
337
def _status_code(exc: Exception) -> int | None:
    """Return the HTTP status attached to ``exc``, or ``None``.

    The exception's own ``response`` is checked first, then the ``response``
    of its ``reason`` attribute. Presence is tested with ``is not None``
    rather than by truthiness.
    """
    for holder in (exc, getattr(exc, "reason", None)):
        response = getattr(holder, "response", None)
        if response is not None:
            return getattr(response, "status_code", None)
    return None
346
+
347
+
348
def _error_details(exc: Exception) -> dict[str, object]:
    """Return a details dict (reason, status code, sanitized URL) for ``exc``.

    The response is taken from ``exc.response`` or, failing that, from
    ``exc.reason.response``. ``status_code`` is added when a response is
    found, and ``url`` when that response carries a request.
    """
    # Test presence with ``is None``: a requests ``Response`` is falsy for
    # 4xx/5xx, so ``a or b`` would discard exactly the error responses.
    response = getattr(exc, "response", None)
    if response is None:
        response = getattr(getattr(exc, "reason", None), "response", None)
    details: dict[str, object] = {"reason": str(exc)}
    if response is not None:
        details["status_code"] = getattr(response, "status_code", None)
        request = getattr(response, "request", None)
        if request is not None:
            details["url"] = sanitize_url(str(getattr(request, "url", "")))
    return details
357
+
358
+
359
def query(params: dict[str, object]) -> str:
    """Encode ``params`` as a URL query string.

    Sequence values expand into repeated keys (``doseq``) and ``quote`` is
    used for escaping, so spaces become ``%20`` rather than ``+``.
    """
    options = {"doseq": True, "quote_via": quote}
    encoded = urlencode(params, **options)
    return encoded
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ from pull_cli.models import Config
4
+
5
+ from .cloud_v2 import CloudV2Client
6
+ from .data_center import DataCenterClient
7
+
8
+
9
def build_client(config: Config):
    """Return the client adapter matching ``config.deployment``.

    ``"cloud"`` always selects ``CloudV2Client``. ``"auto"`` selects it only
    when the base URL's host is under ``atlassian.net``. Every other case
    yields ``DataCenterClient``.
    """
    deployment = config.deployment
    if deployment == "cloud" or (
        deployment == "auto" and _is_cloud_url(config.base_url)
    ):
        return CloudV2Client(config)
    return DataCenterClient(config)


def _is_cloud_url(base_url: str | None) -> bool:
    """Return True when the host of ``base_url`` ends with ``.atlassian.net``.

    Only the parsed hostname is inspected, so the marker appearing in a path,
    a query string or a look-alike domain does not count.
    """
    from urllib.parse import urlsplit

    if not base_url:
        return False
    # A scheme-less value needs a leading "//" to be parsed as a host.
    target = base_url if "://" in base_url else f"//{base_url}"
    try:
        host = urlsplit(target).hostname or ""
    except ValueError:
        return False
    return host.lower().rstrip(".").endswith(".atlassian.net")
pull_cli/config.py ADDED
@@ -0,0 +1,82 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import yaml
8
+
9
+ from .models import Config
10
+
11
+
12
def _coerce_ssl_verify(value: str | bool | None) -> bool | str:
    """Normalise an SSL-verify setting to a bool or a non-boolean string.

    Args:
        value: Raw setting. ``None`` and blank text mean "not set".

    Returns:
        ``True`` when unset, the bool itself for bools, ``True``/``False``
        for the recognised on/off words (case-insensitive), and otherwise the
        stripped text. Callers pass that text on as the ``verify_ssl`` setting;
        presumably a CA bundle path — verify against the HTTP layer.

    Non-string scalars (for example a YAML ``1`` or ``0``) are converted to
    text first instead of failing on ``.strip()``.
    """
    if value is None:
        return True
    if isinstance(value, bool):
        return value
    text = str(value).strip()
    if not text:
        # Blank or whitespace-only input is "unset", not an empty path.
        return True
    lowered = text.lower()
    if lowered in {"true", "1", "yes", "y", "on"}:
        return True
    if lowered in {"false", "0", "no", "n", "off"}:
        return False
    return text
23
+
24
+
25
def _load_config_file(path: Path | None) -> dict[str, Any]:
    """Return the mapping stored in the YAML file at ``path``.

    An empty dict is returned when no path is given, the path does not exist,
    the document is empty, or its top level is not a mapping.
    """
    if not path or not path.exists():
        return {}
    parsed = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    return parsed if isinstance(parsed, dict) else {}
34
+
35
+
36
def resolve_config(
    *,
    base_url: str | None = None,
    user: str | None = None,
    token: str | None = None,
    cloud_id: str | None = None,
    ssl_verify: str | bool | None = None,
    config_path: str | Path | None = None,
    env: dict[str, str] | None = None,
) -> Config:
    """Merge explicit arguments, environment variables and a config file.

    Precedence for each setting is: explicit argument, ``PULL_*`` variable,
    config-file value, then the legacy ``CONFPUB_*`` variable.

    Args:
        base_url, user, token, cloud_id, ssl_verify: Explicit overrides.
        config_path: Optional YAML file; ``~`` is expanded.
        env: Mapping used instead of ``os.environ`` when given.

    Returns:
        The resolved ``Config`` with any trailing "/" stripped from ``base_url``.
    """
    env_map = env if env is not None else os.environ
    path = Path(config_path).expanduser() if config_path else None
    file_data = _load_config_file(path)

    if ssl_verify is not None:
        raw_ssl_verify = ssl_verify
    else:
        # ``or`` chaining would skip an explicit ``ssl_verify: false`` from the
        # file because False is falsy; only unset and blank values are skipped.
        raw_ssl_verify = _first_configured(
            env_map.get("PULL_SSL_VERIFY"),
            file_data.get("ssl_verify"),
            env_map.get("CONFPUB_SSL_VERIFY"),
        )
    if raw_ssl_verify is not None and not isinstance(raw_ssl_verify, (bool, str)):
        # YAML scalars such as 0 or 1 are handed on as text.
        raw_ssl_verify = str(raw_ssl_verify)

    resolved = Config(
        base_url=(
            base_url
            or env_map.get("PULL_URL")
            or file_data.get("base_url")
            or env_map.get("CONFPUB_URL")
        ),
        user=(
            user
            or env_map.get("PULL_USER")
            or file_data.get("user")
            or env_map.get("CONFPUB_USER")
        ),
        token=(
            token
            or env_map.get("PULL_TOKEN")
            or file_data.get("token")
            or env_map.get("CONFPUB_TOKEN")
        ),
        cloud_id=cloud_id or env_map.get("PULL_CLOUD_ID") or file_data.get("cloud_id"),
        ssl_verify=_coerce_ssl_verify(raw_ssl_verify),
        # A present-but-null ``deployment`` key falls back to "auto" as well.
        deployment=file_data.get("deployment") or "auto",
        config_path=path,
    )
    if resolved.base_url:
        resolved.base_url = resolved.base_url.rstrip("/")
    return resolved


def _first_configured(*candidates: object) -> object:
    """Return the first candidate that is neither ``None`` nor ``""``.

    Unlike an ``or`` chain, falsy but meaningful values such as ``False`` or
    ``0`` are returned. ``None`` is returned when nothing is configured.
    """
    for candidate in candidates:
        if candidate is None or candidate == "":
            continue
        return candidate
    return None
pull_cli/crawler.py ADDED
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import deque
4
+
5
+ from .clients.base import ConfluenceClient
6
+ from .errors import EXIT_SOURCE, PullError
7
+ from .models import PageSummary
8
+
9
+
10
def crawl_pages(
    client: ConfluenceClient,
    root: PageSummary,
    *,
    tree: bool,
    depth: int | None,
    max_pages: int,
) -> list[PageSummary]:
    """Collect ``root`` and, optionally, its descendants in breadth-first order.

    Args:
        client: Source of child pages via ``get_children``.
        root: Starting page; its ``depth`` becomes 0 and ``parent_id`` ``None``.
        tree: When false, only ``root`` is returned.
        depth: Maximum level to descend to; ``None`` means unlimited and ``0``
            returns only ``root``.
        max_pages: Cap on the number of collected pages, ``root`` included.

    Returns:
        The collected pages with ``depth``, ``parent_id`` and a 1-based
        ``order`` assigned. Pages whose id was already seen are skipped.

    Raises:
        PullError: ``ERR_SOURCE_TREE_TOO_LARGE`` when another page would
            exceed ``max_pages``.
    """
    root.depth = 0
    root.parent_id = None
    collected: list[PageSummary] = [root]

    if tree and depth != 0:
        visited = {root.page_id}
        pending: deque[PageSummary] = deque(collected)
        while pending:
            current = pending.popleft()
            # Pages sitting at the depth limit are kept but not expanded.
            if depth is not None and current.depth >= depth:
                continue
            for page in client.get_children(current.page_id):
                if page.page_id in visited:
                    continue
                if len(collected) >= max_pages:
                    raise PullError(
                        code="ERR_SOURCE_TREE_TOO_LARGE",
                        message=f"Tree extraction exceeded the max page cap of {max_pages}.",
                        exit_code=EXIT_SOURCE,
                        suggested_action="Use --max-pages with a higher value or reduce --depth.",
                        details={"max_pages": max_pages},
                    )
                page.parent_id = current.page_id
                page.depth = current.depth + 1
                visited.add(page.page_id)
                collected.append(page)
                pending.append(page)

    for position, page in enumerate(collected, start=1):
        page.order = position
    return collected