pull-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pull_cli/models.py ADDED
@@ -0,0 +1,232 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import asdict, dataclass, field
4
+ from pathlib import Path
5
+ from typing import Any, Literal
6
+
7
# Closed sets of string values used as field types on PullOptions.
AssetPolicy = Literal["visible", "page", "all"]
OutputMode = Literal["simple", "full"]
RenderMode = Literal[
    "hybrid",
    "view",
    "export-view",
    "styled-view",
    "storage",
]
MacroPolicy = Literal["expand", "placeholder", "strict"]
UnknownMacroPolicy = Literal["warn", "error", "ignore"]
12
+
13
+
14
@dataclass
class PullOptions:
    """Option values for a pull run.

    ``write_bundle``, ``write_html`` and ``write_source`` are tri-state: when
    left as ``None`` they are resolved in ``__post_init__`` to ``True`` if
    ``output_mode`` is ``"full"`` and to ``False`` otherwise. An explicit
    ``True``/``False`` is kept as given.
    """

    # Only required field; rendered with str() in manifest_dict().
    output: Path
    force: bool = False
    clean: bool = False
    tree: bool = False
    depth: int | None = None
    max_pages: int = 500
    layout: Literal["auto", "nested", "flat"] = "auto"
    output_mode: OutputMode = "simple"
    # Tri-state flags, resolved in __post_init__.
    write_bundle: bool | None = None
    write_html: bool | None = None
    write_source: bool | None = None
    write_chunks: bool = False
    asset_policy: AssetPolicy = "visible"
    no_assets: bool = False
    extract_attachments: bool = False
    comments: bool = False
    diagram_sources: bool = False
    render_mode: RenderMode = "hybrid"
    macro_policy: MacroPolicy = "expand"
    unknown_macro: UnknownMacroPolicy = "warn"
    rewrite_links: bool = True
    follow_includes: bool = False
    follow_links: Literal["same-tree", "same-space", "none"] = "none"
    include_non_page_children: bool = False
    redact_source_urls: bool = False
    redact_manifest: bool = False
    strict: bool = False

    def __post_init__(self) -> None:
        """Fill every unset ``write_*`` flag from ``output_mode``."""
        default_on = self.output_mode == "full"
        for flag in ("write_bundle", "write_html", "write_source"):
            if getattr(self, flag) is None:
                setattr(self, flag, default_on)

    def manifest_dict(self) -> dict[str, Any]:
        """Return all fields as a dict, with ``output`` converted to ``str``."""
        return {**asdict(self), "output": str(self.output)}
57
+
58
+
59
@dataclass
class Config:
    """Configuration values; every field is optional and has a default."""

    base_url: str | None = None
    user: str | None = None
    token: str | None = None
    cloud_id: str | None = None
    # Either a boolean or a string; how the string is interpreted is decided by the consumer.
    ssl_verify: bool | str = True
    deployment: Literal["auto", "cloud", "data_center"] = "auto"
    config_path: Path | None = None

    @property
    def has_auth(self) -> bool:
        """Report whether ``token`` is set to a non-empty value."""
        return True if self.token else False
72
+
73
+
74
@dataclass
class TargetSelection:
    """Page selectors; each one defaults to ``None`` when not supplied."""

    positional: str | None = None
    page_id: str | None = None
    url: str | None = None
    # space and title are only useful as a pair.
    space: str | None = None
    title: str | None = None
81
+
82
+
83
@dataclass
class PageSummary:
    """Identifying fields of a page: ``page_id`` and ``title`` are required."""

    page_id: str
    title: str
    space_key: str | None = None
    url: str | None = None
    parent_id: str | None = None
    # Both counters start at zero.
    depth: int = 0
    order: int = 0
92
+
93
+
94
@dataclass
class PageRecord(PageSummary):
    """A ``PageSummary`` extended with version, body variants, labels and raw data."""

    version: int | None = None
    # Body variants; each is independently optional.
    body_view: str | None = None
    body_export_view: str | None = None
    body_storage: str | None = None
    body_adf: dict[str, Any] | None = None
    # default_factory gives every instance its own container.
    labels: list[str] = field(default_factory=list)
    raw: dict[str, Any] = field(default_factory=dict)
103
+
104
+
105
@dataclass
class AttachmentRecord:
    """An attachment identified by its own ID, its page ID and a filename."""

    attachment_id: str
    page_id: str
    filename: str
    media_type: str | None = None
    download_url: str | None = None
    web_url: str | None = None
    file_size: int | None = None
    # default_factory gives every instance its own dict.
    raw: dict[str, Any] = field(default_factory=dict)
115
+
116
+
117
@dataclass
class CommentRecord:
    """A comment: ID, owning page ID and HTML body are required, the rest optional."""

    comment_id: str
    page_id: str
    body_html: str
    location: str | None = None
    status: str | None = None
    version: int | None = None
    author: str | None = None
    # Timestamps are kept as strings.
    created_at: str | None = None
    updated_at: str | None = None
    parent_id: str | None = None
    resolution: str | None = None
    # default_factory gives every instance its own dict.
    raw: dict[str, Any] = field(default_factory=dict)
131
+
132
+
133
@dataclass
class WarningRecord:
    """A warning made of a code, a message, an optional page ID and free-form details."""

    code: str
    message: str
    source_page_id: str | None = None
    details: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the record's fields as a plain dict."""
        as_mapping: dict[str, Any] = asdict(self)
        return as_mapping
142
+
143
+
144
@dataclass
class AssetReference:
    """Three required strings: a page ID, an HTML attribute name and the original value."""

    page_id: str
    html_attribute: str
    original: str
149
+
150
+
151
@dataclass
class AssetRecord:
    """An asset entry; the first eight fields are required, some of them nullable."""

    asset_id: str
    source_page_id: str | None
    attachment_id: str | None
    filename: str
    media_type: str | None
    local_path: str
    sha256: str | None
    role: str
    source_url: str | None = None
    # default_factory gives every instance its own lists.
    references: list[AssetReference] = field(default_factory=list)
    sidecars: list[str] = field(default_factory=list)

    def to_manifest(self) -> dict[str, Any]:
        """Return the record as a plain dict with ``references`` as a list of dicts."""
        manifest = asdict(self)
        manifest["references"] = list(map(asdict, self.references))
        return manifest
169
+
170
+
171
@dataclass
class LinkRecord:
    """A link entry: original and normalized forms, a kind and the source page ID."""

    original: str
    normalized: str
    kind: str
    source_page_id: str
    target_page_id: str | None = None
    target_asset_id: str | None = None
    rewritten: str | None = None
    # Starts out as "preserved".
    status: Literal["rewritten", "preserved", "unresolved", "skipped"] = "preserved"
    warning: str | None = None
182
+
183
+
184
@dataclass
class MacroRecord:
    """A macro entry: ID, name, adapter and source page ID, plus an outcome."""

    macro_id: str
    name: str
    adapter: str
    source_page_id: str
    # Starts out as "converted".
    status: Literal["converted", "placeholder", "ignored", "error"] = "converted"
    markdown: str | None = None
    # default_factory gives every instance its own containers.
    params: dict[str, str] = field(default_factory=dict)
    warnings: list[WarningRecord] = field(default_factory=list)

    def to_manifest(self) -> dict[str, Any]:
        """Return the record as a plain dict with each warning rendered via ``to_dict()``."""
        manifest = asdict(self)
        manifest["warnings"] = [entry.to_dict() for entry in self.warnings]
        return manifest
199
+
200
+
201
@dataclass
class PageArtifact:
    """One page's record together with its paths, rendered content and collected records."""

    page: PageRecord
    order: int
    # Path-like values are stored as strings; two of them are nullable.
    page_dir: str
    index_md: str
    index_html: str | None
    source_path: str | None
    page_json: str
    # Rendered content.
    markdown: str
    html: str
    # Per-page collections; default_factory gives every instance its own lists.
    assets: list[AssetRecord] = field(default_factory=list)
    links: list[LinkRecord] = field(default_factory=list)
    macros: list[MacroRecord] = field(default_factory=list)
    warnings: list[WarningRecord] = field(default_factory=list)
    comments_path: str | None = None
    comments: list[CommentRecord] = field(default_factory=list)
218
+
219
+
220
@dataclass
class ExtractionResult:
    """Aggregate result: output locations plus the collected pages, assets and records."""

    output_dir: Path
    manifest_path: Path
    bundle_path: Path | None
    # Required collections, in constructor order.
    pages: list[PageArtifact]
    assets: list[AssetRecord]
    warnings: list[WarningRecord]
    links: list[LinkRecord]
    macros: list[MacroRecord]
    # Optional extras.
    metrics: dict[str, Any] = field(default_factory=dict)
    ai_entry_path: Path | None = None
    ai_manifest_path: Path | None = None
pull_cli/paths.py ADDED
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ from pathlib import Path
6
+
7
+
8
def slugify(value: str, *, fallback: str = "page") -> str:
    """Turn ``value`` into a lowercase ``a-z0-9`` slug of at most 80 characters.

    Every run of characters outside ``a-z0-9`` (after lowercasing) becomes a
    single ``-``; leading and trailing hyphens are removed.

    Args:
        value: Text to convert.
        fallback: Returned when nothing usable is left.

    Returns:
        The slug, or ``fallback`` when the slug would be empty.
    """
    slug = re.sub(r"[^a-z0-9]+", "-", value.strip().lower()).strip("-")
    # The 80-character cut can land right after a separator, so strip again:
    # otherwise a truncated slug could end in "-".
    return slug[:80].rstrip("-") or fallback
13
+
14
+
15
def safe_filename(value: str, *, fallback: str = "asset") -> str:
    """Reduce ``value`` to a single filename component of at most 180 characters.

    Only the last path component is kept. Runs of control characters and of
    ``< > : " | ? * \\ /`` become ``-``, whitespace runs collapse to one space,
    and leading/trailing spaces and dots are removed.

    Args:
        value: Candidate filename, possibly including directories.
        fallback: Returned when nothing usable is left.

    Returns:
        The cleaned name, or ``fallback`` when the cleaned name is empty.
    """
    name = Path(value).name.strip()
    name = re.sub(r"[\x00-\x1f<>:\"|?*\\/]+", "-", name)
    name = re.sub(r"\s+", " ", name).strip(" .")
    # The 180-character cut can expose a trailing space or dot again, so
    # strip once more after truncating.
    return name[:180].rstrip(" .") or fallback
20
+
21
+
22
def unique_name(name: str, used: set[str]) -> str:
    """Return ``name`` or a numbered variant absent from ``used``, and add it to ``used``.

    When ``name`` is taken, candidates ``<stem>-<n><suffix>`` are tried with
    ``n`` counting up from 2; an empty stem is replaced by ``"asset"``.
    ``used`` is mutated in place.
    """
    chosen = name
    if chosen in used:
        parsed = Path(name)
        base = parsed.stem or "asset"
        extension = parsed.suffix
        index = 2
        chosen = f"{base}-{index}{extension}"
        while chosen in used:
            index += 1
            chosen = f"{base}-{index}{extension}"
    used.add(chosen)
    return chosen
36
+
37
+
38
def as_posix(path: str | Path) -> str:
    """Return ``path`` as a string with forward-slash separators."""
    normalized = Path(path)
    return normalized.as_posix()
40
+
41
+
42
def relative_path(from_file: str | Path, to_file: str | Path) -> str:
    """Return the forward-slash path to ``to_file`` relative to ``from_file``'s directory."""
    base_dir = Path(from_file).parent
    relative = os.path.relpath(Path(to_file), start=base_dir)
    return Path(relative).as_posix()
pull_cli/resolver.py ADDED
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from urllib.parse import parse_qs, unquote, urlsplit
5
+
6
+ from .clients.base import ConfluenceClient
7
+ from .errors import EXIT_SOURCE, PullError, validation_error
8
+ from .models import PageSummary, TargetSelection
9
+
10
# Two alternatives: digits after "/pages/" (optionally preceded by
# "viewpage.action?pageId="), captured as "id", or digits after "?pageId=" /
# "&pageId=", captured as "query_id".
PAGE_URL_RE = re.compile(r"/pages/(?:viewpage\.action\?pageId=)?(?P<id>\d+)|[?&]pageId=(?P<query_id>\d+)")
# A string made of digits only.
NUMERIC_ID_RE = re.compile(r"^\d+$")
12
+
13
+
14
def is_url(value: str | None) -> bool:
    """Report whether ``value`` starts with ``http://`` or ``https://`` (case-sensitive)."""
    if not value:
        return False
    return value.startswith(("http://", "https://"))
16
+
17
+
18
def page_id_from_url(url: str) -> str | None:
    """Extract a numeric page ID from ``url``.

    The ``pageId`` query parameter is checked first; if it is absent or not
    numeric, the percent-decoded path is searched with ``PAGE_URL_RE``.

    Args:
        url: URL to inspect.

    Returns:
        The page ID as a string of digits, or ``None`` when none is found.
    """
    parsed = urlsplit(url)
    query_ids = parse_qs(parsed.query).get("pageId")
    if query_ids:
        candidate = query_ids[0].strip()
        # Path-derived IDs must be digits (``\d+``); apply the same rule to
        # the query value so arbitrary text is never returned as a page ID.
        if candidate.isdecimal():
            return candidate
    match = PAGE_URL_RE.search(unquote(parsed.path))
    if match:
        return match.group("id") or match.group("query_id")
    return None
28
+
29
+
30
def resolve_target(selection: TargetSelection, client: ConfluenceClient) -> PageSummary:
    """Resolve ``selection`` to a single ``PageSummary``.

    Selectors are tried in this order: ``page_id``, ``url``, a ``positional``
    value that is a URL, a ``positional`` value that is all digits, and
    finally ``space`` together with ``title``. Only the last one calls
    ``client.find_page``; for the others the summary's title is set to the
    page ID.

    Raises:
        PullError: ``client.find_page`` returned no match.
        The error built by ``validation_error``: more than one page matched,
        a URL carried no page ID, or no usable selector was given.
    """
    explicit_id = selection.page_id
    if explicit_id:
        return PageSummary(page_id=explicit_id, title=explicit_id)

    if selection.url:
        return _summary_from_url(selection.url)

    positional = selection.positional
    if positional:
        if is_url(positional):
            return _summary_from_url(positional)
        if NUMERIC_ID_RE.match(positional):
            return PageSummary(page_id=positional, title=positional)

    # Anything else needs both a space and a title to search by.
    if not (selection.space and selection.title):
        raise validation_error(
            "ERR_VALIDATION_REQUIRED",
            "A page selector is required.",
            suggested_action="Pass PAGE_REF, --page-id, --url, or --space with --title.",
        )

    candidates = client.find_page(selection.space, selection.title)
    if not candidates:
        raise PullError(
            code="ERR_SOURCE_PAGE_NOT_FOUND",
            message=f"No Confluence page matched space {selection.space!r} and title {selection.title!r}.",
            exit_code=EXIT_SOURCE,
            suggested_action="Check the space key/title or use --page-id.",
        )
    if len(candidates) > 1:
        raise validation_error(
            "ERR_VALIDATION_AMBIGUOUS_PAGE",
            "Multiple Confluence pages matched the requested title.",
            suggested_action="Use --page-id or --url for an exact page.",
            details={"candidates": [candidate.__dict__ for candidate in candidates]},
        )
    return candidates[0]
61
+
62
+
63
def _summary_from_url(url: str) -> PageSummary:
    """Build a ``PageSummary`` from the page ID found in ``url``.

    The ID is used as the title as well, and ``url`` is stored unchanged.
    Raises the error built by ``validation_error`` when ``page_id_from_url``
    finds no ID.
    """
    page_id = page_id_from_url(url)
    if page_id:
        return PageSummary(page_id=page_id, title=page_id, url=url)
    raise validation_error(
        "ERR_VALIDATION_INVALID_URL",
        "The URL does not look like a Confluence page URL with a page ID.",
        suggested_action="Use a canonical Confluence page URL or --page-id.",
        details={"url": url},
    )
pull_cli/security.py ADDED
@@ -0,0 +1,103 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from collections.abc import Mapping, Sequence
5
+ from typing import Any
6
+ from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
7
+
8
# Matches a secret-looking word (token, password, jwt, ...) that is delimited
# by the start/end of the string or by one of "_", "-", ".".
SECRET_KEY_PATTERN = re.compile(
    r"(^|[_\-.])(authorization|cookie|token|password|secret|signature|session|jwt|pat|access[_-]?key)([_\-.]|$)",
    re.IGNORECASE,
)
# The first two patterns match whole "Bearer <token>" / "Basic <token>"
# credentials and define no capture groups; the last two match "key=value"
# shapes and do define groups. redact_text() relies on that distinction.
# NOTE(review): the Bearer/Basic patterns are case-insensitive and therefore
# also match ordinary prose such as "basic setup".
SECRET_TEXT_PATTERNS = [
    re.compile(r"\bBearer\s+[A-Za-z0-9._~+/=-]+", re.IGNORECASE),
    re.compile(r"\bBasic\s+[A-Za-z0-9._~+/=-]+", re.IGNORECASE),
    re.compile(r"(?i)(access_token|api_token|jwt|token|signature|atl_token)=([^&\s]+)"),
    re.compile(r"(?i)\bname=[\"']?(atl_token|token|signature|jwt|password|secret)[\"']?"),
]
# An http(s) URL embedded in free text, up to whitespace, a quote, an angle
# bracket or ")".
SOURCE_URL_TEXT_PATTERN = re.compile(r"https?://[^\s<>'\")]+", re.IGNORECASE)
# A whole <input ...> tag whose name attribute is one of the listed secret names.
SECRET_HTML_INPUT_PATTERN = re.compile(
    r"(?is)<input\b(?=[^>]*\bname=[\"']?(?:atl_token|token|signature|jwt|password|secret)[\"']?)[^>]*>"
)
# Lower-case query parameter names whose values are always redacted.
SENSITIVE_QUERY_KEYS = {
    "access_token",
    "api_token",
    "atl_token",
    "auth",
    "authorization",
    "downloadtoken",
    "expires",
    "jwt",
    "signature",
    "sig",
    "token",
    "x-amz-algorithm",
    "x-amz-credential",
    "x-amz-date",
    "x-amz-expires",
    "x-amz-security-token",
    "x-amz-signature",
    "x-amz-signedheaders",
}


def _redact_secret_match(match: re.Match[str]) -> str:
    """Return the replacement text for one SECRET_TEXT_PATTERNS match."""
    text = match.group(0)
    # Decide by pattern shape, not by whether the matched text contains "=":
    # Bearer/Basic tokens may themselves contain "=" (base64 padding), and
    # splitting such a match on "=" would keep the token in the output.
    if match.re.groups and "=" in text:
        return text.split("=", 1)[0] + "=<redacted>"
    return "<redacted-auth>"


def redact_text(value: str) -> str:
    """Return ``value`` with secret-looking fragments replaced by placeholders.

    Matching ``<input>`` tags are replaced as a whole, ``Bearer``/``Basic``
    credentials become ``<redacted-auth>``, and ``key=value`` matches keep
    the text before the first ``=`` followed by ``=<redacted>``.

    Args:
        value: Text to scrub.

    Returns:
        The scrubbed text; input without matches is returned unchanged.
    """
    redacted = SECRET_HTML_INPUT_PATTERN.sub("<input name=<redacted> value=<redacted>>", value)
    for pattern in SECRET_TEXT_PATTERNS:
        redacted = pattern.sub(_redact_secret_match, redacted)
    return redacted
49
+
50
+
51
def redact_source_url_text(value: str) -> str:
    """Replace every ``SOURCE_URL_TEXT_PATTERN`` match in ``value`` with ``<redacted-url>``."""
    placeholder = "<redacted-url>"
    return re.sub(SOURCE_URL_TEXT_PATTERN, placeholder, value)
53
+
54
+
55
def sanitize_url(url: str | None, *, redact_source_url: bool = False) -> str | None:
    """Return ``url`` with credentials and sensitive query values redacted.

    Args:
        url: URL to sanitise; ``None`` and ``""`` are returned unchanged.
        redact_source_url: When true, any non-empty URL is replaced entirely
            by ``<redacted-url>``.

    Returns:
        The sanitised URL. If ``urlsplit`` rejects the input, the input is
        only passed through ``redact_text``.
    """
    if not url:
        return url
    if redact_source_url:
        return "<redacted-url>"
    try:
        parts = urlsplit(url)
    except ValueError:
        return redact_text(url)

    # Credentials written as "user:password@host" sit in the netloc, which
    # neither the query handling nor redact_text() would otherwise touch.
    _userinfo, separator, host = parts.netloc.rpartition("@")
    netloc = f"<redacted>@{host}" if separator else parts.netloc

    query_pairs = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        if key.lower() in SENSITIVE_QUERY_KEYS or SECRET_KEY_PATTERN.search(key):
            query_pairs.append((key, "<redacted>"))
        else:
            query_pairs.append((key, redact_text(value)))
    sanitized = urlunsplit(
        (parts.scheme, netloc, parts.path, urlencode(query_pairs), parts.fragment)
    )
    # Final pass catches secret-looking text outside the query string as well.
    return redact_text(sanitized)
74
+
75
+
76
def redact_value(value: Any, *, redact_source_urls: bool = False) -> Any:
    """Recursively redact secrets in strings, mappings and sequences.

    Strings starting with ``http://``, ``https://``, ``/wiki/`` or
    ``/download/`` go through ``sanitize_url``; other strings go through
    ``redact_text`` and, when ``redact_source_urls`` is set, through
    ``redact_source_url_text``. Mappings become dicts with string keys, and a
    key matching ``SECRET_KEY_PATTERN`` gets the value ``<redacted>``.
    Non-bytes sequences become lists. Every other value is returned as is.
    """
    if isinstance(value, str):
        looks_like_url = value.startswith(("http://", "https://", "/wiki/", "/download/"))
        if looks_like_url:
            return sanitize_url(value, redact_source_url=redact_source_urls)
        scrubbed = redact_text(value)
        if redact_source_urls:
            return redact_source_url_text(scrubbed)
        return scrubbed

    if isinstance(value, Mapping):
        return {
            str(key): (
                "<redacted>"
                if SECRET_KEY_PATTERN.search(str(key))
                else redact_value(child, redact_source_urls=redact_source_urls)
            )
            for key, child in value.items()
        }

    # bytes-like values are sequences too, but are passed through untouched.
    if isinstance(value, (bytes, bytearray)) or not isinstance(value, Sequence):
        return value
    return [redact_value(item, redact_source_urls=redact_source_urls) for item in value]
94
+
95
+
96
def contains_secret_text(value: str) -> bool:
    """Report whether ``value`` still contains secret-looking text.

    True when ``SECRET_KEY_PATTERN`` matches anywhere in ``value``, or when
    any ``SECRET_TEXT_PATTERNS`` match contains neither ``<redacted>`` nor
    ``<redacted-auth>``.
    """
    if SECRET_KEY_PATTERN.search(value):
        return True
    placeholders = ("<redacted>", "<redacted-auth>")
    return any(
        all(marker not in hit.group(0) for marker in placeholders)
        for pattern in SECRET_TEXT_PATTERNS
        for hit in pattern.finditer(value)
    )