PyPI - pull-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pull-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

pull_cli/__init__.py +5 -0
pull_cli/__main__.py +6 -0
pull_cli/assets.py +235 -0
pull_cli/attachment_extractors.py +85 -0
pull_cli/cli.py +329 -0
pull_cli/clients/__init__.py +8 -0
pull_cli/clients/base.py +29 -0
pull_cli/clients/cloud_v2.py +132 -0
pull_cli/clients/data_center.py +360 -0
pull_cli/clients/hybrid.py +15 -0
pull_cli/config.py +82 -0
pull_cli/crawler.py +51 -0
pull_cli/envelope.py +59 -0
pull_cli/errors.py +50 -0
pull_cli/extractor.py +344 -0
pull_cli/guide.py +115 -0
pull_cli/html_normalizer.py +111 -0
pull_cli/links.py +186 -0
pull_cli/macros.py +527 -0
pull_cli/markdown_writer.py +24 -0
pull_cli/models.py +232 -0
pull_cli/paths.py +45 -0
pull_cli/resolver.py +72 -0
pull_cli/security.py +103 -0
pull_cli/validator.py +398 -0
pull_cli/writer.py +792 -0
pull_cli-0.1.0.dist-info/METADATA +218 -0
pull_cli-0.1.0.dist-info/RECORD +31 -0
pull_cli-0.1.0.dist-info/WHEEL +4 -0
pull_cli-0.1.0.dist-info/entry_points.txt +3 -0
pull_cli-0.1.0.dist-info/licenses/LICENSE +21 -0

pull_cli/models.py ADDED Viewed

@@ -0,0 +1,232 @@
+from __future__ import annotations
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Literal
+# Literal aliases naming the string values accepted by the matching
+# PullOptions fields (asset_policy, output_mode, render_mode, macro_policy,
+# unknown_macro).
+AssetPolicy = Literal["visible", "page", "all"]
+OutputMode = Literal["simple", "full"]
+RenderMode = Literal["hybrid", "view", "export-view", "styled-view", "storage"]
+MacroPolicy = Literal["expand", "placeholder", "strict"]
+UnknownMacroPolicy = Literal["warn", "error", "ignore"]
+@dataclass
+class PullOptions:
+    """Option values carried as a single object; ``output`` is the only required field.
+    ``write_bundle``, ``write_html`` and ``write_source`` are tri-state: a value of
+    ``None`` is resolved in ``__post_init__`` to ``True`` only when ``output_mode``
+    is ``"full"``; an explicit ``True``/``False`` is kept as given.
+    """
+    output: Path
+    force: bool = False
+    clean: bool = False
+    tree: bool = False
+    depth: int | None = None
+    max_pages: int = 500
+    layout: Literal["auto", "nested", "flat"] = "auto"
+    output_mode: OutputMode = "simple"
+    write_bundle: bool | None = None
+    write_html: bool | None = None
+    write_source: bool | None = None
+    write_chunks: bool = False
+    asset_policy: AssetPolicy = "visible"
+    no_assets: bool = False
+    extract_attachments: bool = False
+    comments: bool = False
+    diagram_sources: bool = False
+    render_mode: RenderMode = "hybrid"
+    macro_policy: MacroPolicy = "expand"
+    unknown_macro: UnknownMacroPolicy = "warn"
+    rewrite_links: bool = True
+    follow_includes: bool = False
+    follow_links: Literal["same-tree", "same-space", "none"] = "none"
+    include_non_page_children: bool = False
+    redact_source_urls: bool = False
+    redact_manifest: bool = False
+    strict: bool = False
+    def __post_init__(self) -> None:
+        """Resolve every ``write_*`` toggle still set to ``None`` from ``output_mode``."""
+        default_on = self.output_mode == "full"
+        for toggle in ("write_bundle", "write_html", "write_source"):
+            if getattr(self, toggle) is None:
+                setattr(self, toggle, default_on)
+    def manifest_dict(self) -> dict[str, Any]:
+        """Return all options as a plain dict with ``output`` converted to ``str``."""
+        # Re-assigning an existing key keeps its original position in the dict.
+        return {**asdict(self), "output": str(self.output)}
+@dataclass
+class Config:
+    """Connection settings; every field is optional and has a default."""
+    base_url: str | None = None
+    user: str | None = None
+    token: str | None = None
+    cloud_id: str | None = None
+    ssl_verify: bool | str = True
+    deployment: Literal["auto", "cloud", "data_center"] = "auto"
+    config_path: Path | None = None
+    @property
+    def has_auth(self) -> bool:
+        """Report whether a non-empty ``token`` is set (``user`` is not consulted)."""
+        return True if self.token else False
+@dataclass
+class TargetSelection:
+    """The ways a page can be selected; every selector is optional.
+    The class only stores the values and does not validate that any selector was given.
+    """
+    positional: str | None = None
+    page_id: str | None = None
+    url: str | None = None
+    space: str | None = None
+    title: str | None = None
+@dataclass
+class PageSummary:
+    """Identifying data for one page: required ``page_id`` and ``title`` plus optional placement fields."""
+    page_id: str
+    title: str
+    space_key: str | None = None
+    url: str | None = None
+    parent_id: str | None = None
+    # Both counters default to 0; presumably position within a crawled tree -- confirm against the crawler.
+    depth: int = 0
+    order: int = 0
+@dataclass
+class PageRecord(PageSummary):
+    """``PageSummary`` extended with version, body content, labels and a raw payload.
+    Every added field has a default, as required after the defaulted parent fields.
+    """
+    version: int | None = None
+    # Optional page bodies; the names suggest the view, export-view, storage and
+    # ADF representations -- confirm against the client implementations.
+    body_view: str | None = None
+    body_export_view: str | None = None
+    body_storage: str | None = None
+    body_adf: dict[str, Any] | None = None
+    labels: list[str] = field(default_factory=list)
+    # Free-form dict, empty by default; presumably the unprocessed API response -- verify.
+    raw: dict[str, Any] = field(default_factory=dict)
+@dataclass
+class AttachmentRecord:
+    """One attachment tied to a page: ``attachment_id``, ``page_id`` and ``filename`` are required."""
+    attachment_id: str
+    page_id: str
+    filename: str
+    media_type: str | None = None
+    download_url: str | None = None
+    web_url: str | None = None
+    file_size: int | None = None
+    # Free-form dict, empty by default; presumably the unprocessed API response -- verify.
+    raw: dict[str, Any] = field(default_factory=dict)
+@dataclass
+class CommentRecord:
+    """One comment tied to a page: ``comment_id``, ``page_id`` and ``body_html`` are required."""
+    comment_id: str
+    page_id: str
+    body_html: str
+    location: str | None = None
+    status: str | None = None
+    version: int | None = None
+    author: str | None = None
+    # Timestamps are stored as strings; the format is not fixed by this class.
+    created_at: str | None = None
+    updated_at: str | None = None
+    # presumably the parent comment's id for threaded replies -- TODO confirm
+    parent_id: str | None = None
+    resolution: str | None = None
+    raw: dict[str, Any] = field(default_factory=dict)
+@dataclass
+class WarningRecord:
+    """A warning with a ``code``, a ``message`` and optional page id and detail dict."""
+    code: str
+    message: str
+    source_page_id: str | None = None
+    details: dict[str, Any] = field(default_factory=dict)
+    def to_dict(self) -> dict[str, Any]:
+        """Return the warning as a plain dict built by ``dataclasses.asdict``."""
+        payload = asdict(self)
+        return payload
+@dataclass
+class AssetReference:
+    """One reference to an asset: the page id, the HTML attribute name and the original value."""
+    page_id: str
+    html_attribute: str
+    original: str
+@dataclass
+class AssetRecord:
+    """One asset with its identifiers, local path, hash and the references that point at it."""
+    asset_id: str
+    source_page_id: str | None
+    attachment_id: str | None
+    filename: str
+    media_type: str | None
+    local_path: str
+    sha256: str | None
+    role: str
+    source_url: str | None = None
+    references: list[AssetReference] = field(default_factory=list)
+    sidecars: list[str] = field(default_factory=list)
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the record as a plain dict with ``references`` as a list of dicts."""
+        reference_dicts = [asdict(ref) for ref in self.references]
+        # Overwriting the existing key keeps the field order produced by asdict.
+        return {**asdict(self), "references": reference_dicts}
+@dataclass
+class LinkRecord:
+    """One link found on a page, with its original and normalized forms and its outcome."""
+    original: str
+    normalized: str
+    kind: str
+    source_page_id: str
+    target_page_id: str | None = None
+    target_asset_id: str | None = None
+    rewritten: str | None = None
+    # Outcome marker; a record that nobody updates stays "preserved".
+    status: Literal["rewritten", "preserved", "unresolved", "skipped"] = "preserved"
+    warning: str | None = None
+@dataclass
+class MacroRecord:
+    """One macro occurrence: identity, adapter name, status, optional markdown and warnings."""
+    macro_id: str
+    name: str
+    adapter: str
+    source_page_id: str
+    status: Literal["converted", "placeholder", "ignored", "error"] = "converted"
+    markdown: str | None = None
+    params: dict[str, str] = field(default_factory=dict)
+    warnings: list[WarningRecord] = field(default_factory=list)
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the record as a plain dict with each warning rendered by its ``to_dict``."""
+        manifest = asdict(self)
+        warning_dicts = [item.to_dict() for item in self.warnings]
+        manifest.update(warnings=warning_dicts)
+        return manifest
+@dataclass
+class PageArtifact:
+    """A page record together with its rendered content, path strings and per-page record lists."""
+    page: PageRecord
+    order: int
+    # Path fields are plain strings; whether they are relative to the output
+    # directory is not fixed by this class -- confirm against the writer.
+    page_dir: str
+    index_md: str
+    index_html: str | None
+    source_path: str | None
+    page_json: str
+    markdown: str
+    html: str
+    assets: list[AssetRecord] = field(default_factory=list)
+    links: list[LinkRecord] = field(default_factory=list)
+    macros: list[MacroRecord] = field(default_factory=list)
+    warnings: list[WarningRecord] = field(default_factory=list)
+    comments_path: str | None = None
+    comments: list[CommentRecord] = field(default_factory=list)
+@dataclass
+class ExtractionResult:
+    """Aggregate result: output paths plus the collected pages, assets, warnings, links and macros."""
+    output_dir: Path
+    manifest_path: Path
+    # Required argument, but may be None.
+    bundle_path: Path | None
+    pages: list[PageArtifact]
+    assets: list[AssetRecord]
+    warnings: list[WarningRecord]
+    links: list[LinkRecord]
+    macros: list[MacroRecord]
+    metrics: dict[str, Any] = field(default_factory=dict)
+    ai_entry_path: Path | None = None
+    ai_manifest_path: Path | None = None

pull_cli/paths.py ADDED Viewed

@@ -0,0 +1,45 @@
+from __future__ import annotations
+import os
+import re
+from pathlib import Path
+def slugify(value: str, *, fallback: str = "page") -> str:
+    """Turn ``value`` into a lowercase ASCII slug of at most 80 characters.
+    Every run of characters outside ``a-z0-9`` becomes a single ``-`` and
+    leading/trailing hyphens are removed. ``fallback`` is returned when nothing
+    usable remains (for example an empty or all-punctuation value).
+    """
+    slug = re.sub(r"[^a-z0-9]+", "-", value.strip().lower()).strip("-")
+    # The 80-character cut can land right after a separator, so strip again to
+    # keep the "no trailing hyphen" guarantee for long inputs too.
+    return slug[:80].rstrip("-") or fallback
+def safe_filename(value: str, *, fallback: str = "asset") -> str:
+    """Reduce ``value`` to a single file name of at most 180 characters.
+    Only the final path component is kept. Control characters and
+    ``< > : " | ? * \\ /`` are replaced by ``-``, whitespace runs collapse to one
+    space, and surrounding spaces and dots are removed. ``fallback`` is returned
+    when nothing remains (for example ``""`` or ``".."``).
+    """
+    name = Path(value).name.strip()
+    name = re.sub(r"[\x00-\x1f<>:\"|?*\\/]+", "-", name)
+    name = re.sub(r"\s+", " ", name).strip(" .")
+    # The 180-character cut can expose a new trailing space or dot, so strip
+    # the tail once more after truncating.
+    return name[:180].rstrip(" .") or fallback
+def unique_name(name: str, used: set[str]) -> str:
+    """Return ``name`` or a numbered variant not yet in ``used``, and record it there.
+    On a clash the suffix ``-2``, ``-3``, ... is inserted before the extension
+    (``a.png`` -> ``a-2.png``). ``used`` is mutated in place.
+    """
+    if name in used:
+        parsed = Path(name)
+        base = parsed.stem or "asset"
+        extension = parsed.suffix
+        number = 2
+        candidate = f"{base}-{number}{extension}"
+        # Count upward until a free variant is found.
+        while candidate in used:
+            number += 1
+            candidate = f"{base}-{number}{extension}"
+        name = candidate
+    used.add(name)
+    return name
+def as_posix(path: str | Path) -> str:
+    """Return ``path`` as a string with forward slashes, via ``Path.as_posix``."""
+    wrapped = Path(path)
+    return wrapped.as_posix()
+def relative_path(from_file: str | Path, to_file: str | Path) -> str:
+    """Return the forward-slash path leading from ``from_file``'s directory to ``to_file``."""
+    base_dir = Path(from_file).parent
+    hop = os.path.relpath(Path(to_file), start=base_dir)
+    return Path(hop).as_posix()

pull_cli/resolver.py ADDED Viewed

@@ -0,0 +1,72 @@
+from __future__ import annotations
+import re
+from urllib.parse import parse_qs, unquote, urlsplit
+from .clients.base import ConfluenceClient
+from .errors import EXIT_SOURCE, PullError, validation_error
+from .models import PageSummary, TargetSelection
+# Finds a numeric page id either directly after "/pages/" (optionally written as
+# "/pages/viewpage.action?pageId=<id>") in group "id", or after "?pageId=" /
+# "&pageId=" in group "query_id".
+PAGE_URL_RE = re.compile(r"/pages/(?:viewpage\.action\?pageId=)?(?P<id>\d+)|[?&]pageId=(?P<query_id>\d+)")
+# Matches a value that consists of digits only.
+NUMERIC_ID_RE = re.compile(r"^\d+$")
+def is_url(value: str | None) -> bool:
+    """Report whether ``value`` starts with ``http://`` or ``https://`` (case-sensitive)."""
+    if not value:
+        return False
+    return value.startswith(("http://", "https://"))
+def page_id_from_url(url: str) -> str | None:
+    """Extract a numeric page id from ``url``, or return ``None`` when there is none.
+    The ``pageId`` query parameter is tried first, then the percent-decoded path is
+    searched with ``PAGE_URL_RE``. A URL that cannot be parsed also yields ``None``.
+    """
+    try:
+        parsed = urlsplit(url)
+    except ValueError:
+        # urlsplit rejects some malformed URLs (e.g. an unclosed IPv6 bracket);
+        # report "no page id" rather than letting the ValueError escape.
+        return None
+    query_ids = parse_qs(parsed.query).get("pageId")
+    if query_ids:
+        candidate = query_ids[0]
+        # Only digits count as a page id, matching what the path regex accepts;
+        # anything else falls through to the path search.
+        if candidate.isascii() and candidate.isdigit():
+            return candidate
+    match = PAGE_URL_RE.search(unquote(parsed.path))
+    if match:
+        return match.group("id") or match.group("query_id")
+    return None
+def resolve_target(selection: TargetSelection, client: ConfluenceClient) -> PageSummary:
+    """Turn a ``TargetSelection`` into a ``PageSummary``.
+    Selectors are tried in this order: ``page_id``, ``url``, ``positional`` (as a
+    URL, then as a digits-only id), and finally ``space`` with ``title``, which is
+    the only branch that calls ``client.find_page``. Summaries built from an id
+    or URL use the page id as a placeholder title.
+    Raises ``PullError`` when the space/title lookup finds nothing, and a
+    validation error when it finds several pages or no usable selector was given.
+    """
+    if selection.page_id:
+        return PageSummary(page_id=selection.page_id, title=selection.page_id)
+    if selection.url:
+        return _summary_from_url(selection.url)
+    positional = selection.positional
+    if positional:
+        if is_url(positional):
+            return _summary_from_url(positional)
+        if NUMERIC_ID_RE.match(positional):
+            return PageSummary(page_id=positional, title=positional)
+    if not (selection.space and selection.title):
+        raise validation_error(
+            "ERR_VALIDATION_REQUIRED",
+            "A page selector is required.",
+            suggested_action="Pass PAGE_REF, --page-id, --url, or --space with --title.",
+        )
+    candidates = client.find_page(selection.space, selection.title)
+    if not candidates:
+        raise PullError(
+            code="ERR_SOURCE_PAGE_NOT_FOUND",
+            message=f"No Confluence page matched space {selection.space!r} and title {selection.title!r}.",
+            exit_code=EXIT_SOURCE,
+            suggested_action="Check the space key/title or use --page-id.",
+        )
+    if len(candidates) > 1:
+        raise validation_error(
+            "ERR_VALIDATION_AMBIGUOUS_PAGE",
+            "Multiple Confluence pages matched the requested title.",
+            suggested_action="Use --page-id or --url for an exact page.",
+            details={"candidates": [candidate.__dict__ for candidate in candidates]},
+        )
+    return candidates[0]
+def _summary_from_url(url: str) -> PageSummary:
+    """Build a ``PageSummary`` from ``url``, using the page id as a placeholder title.
+    Raises a validation error (``ERR_VALIDATION_INVALID_URL``) when no page id can
+    be extracted from the URL.
+    """
+    page_id = page_id_from_url(url)
+    if page_id:
+        return PageSummary(page_id=page_id, title=page_id, url=url)
+    raise validation_error(
+        "ERR_VALIDATION_INVALID_URL",
+        "The URL does not look like a Confluence page URL with a page ID.",
+        suggested_action="Use a canonical Confluence page URL or --page-id.",
+        details={"url": url},
+    )

pull_cli/security.py ADDED Viewed

@@ -0,0 +1,103 @@
+from __future__ import annotations
+import re
+from collections.abc import Mapping, Sequence
+from typing import Any
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+# Matches a sensitive word (token, password, cookie, ...) when it is a whole
+# name or a segment delimited by "_", "-" or "."; case-insensitive.
+SECRET_KEY_PATTERN = re.compile(
+    r"(^|[_\-.])(authorization|cookie|token|password|secret|signature|session|jwt|pat|access[_-]?key)([_\-.]|$)",
+    re.IGNORECASE,
+)
+# Free-text secret shapes, in order: "Bearer <credential>", "Basic <credential>",
+# "<token-like key>=<value>", and a name= attribute naming a secret field.
+SECRET_TEXT_PATTERNS = [
+    re.compile(r"\bBearer\s+[A-Za-z0-9._~+/=-]+", re.IGNORECASE),
+    re.compile(r"\bBasic\s+[A-Za-z0-9._~+/=-]+", re.IGNORECASE),
+    re.compile(r"(?i)(access_token|api_token|jwt|token|signature|atl_token)=([^&\s]+)"),
+    re.compile(r"(?i)\bname=[\"']?(atl_token|token|signature|jwt|password|secret)[\"']?"),
+]
+# Any http(s) URL, ending at whitespace, "<", ">", a quote or ")".
+SOURCE_URL_TEXT_PATTERN = re.compile(r"https?://[^\s<>'\")]+", re.IGNORECASE)
+# A whole <input ...> tag whose name attribute is one of the secret field names.
+SECRET_HTML_INPUT_PATTERN = re.compile(
+    r"(?is)<input\b(?=[^>]*\bname=[\"']?(?:atl_token|token|signature|jwt|password|secret)[\"']?)[^>]*>"
+)
+# Query parameter names, written in lower case, treated as sensitive.
+SENSITIVE_QUERY_KEYS = {
+    "access_token",
+    "api_token",
+    "atl_token",
+    "auth",
+    "authorization",
+    "downloadtoken",
+    "expires",
+    "jwt",
+    "signature",
+    "sig",
+    "token",
+    "x-amz-algorithm",
+    "x-amz-credential",
+    "x-amz-date",
+    "x-amz-expires",
+    "x-amz-security-token",
+    "x-amz-signature",
+    "x-amz-signedheaders",
+}
+def _redact_secret_match(match: re.Match[str]) -> str:
+    """Return the replacement text for one ``SECRET_TEXT_PATTERNS`` match."""
+    key, separator, _ = match.group(0).partition("=")
+    # A key=value match has no whitespace before its "=". An auth-scheme match
+    # ("Bearer <credential>") always has whitespace first, and any "=" in it is
+    # credential padding. Treating that padding as a key/value separator would
+    # copy the credential into the output.
+    if separator and not any(char.isspace() for char in key):
+        return key + "=<redacted>"
+    return "<redacted-auth>"
+def redact_text(value: str) -> str:
+    """Return ``value`` with secret-looking fragments replaced by redaction markers.
+    Secret ``<input>`` tags are replaced wholesale, ``key=value`` secrets keep the
+    key and become ``key=<redacted>``, and ``Bearer``/``Basic`` credentials become
+    ``<redacted-auth>``.
+    NOTE(review): the auth patterns match ``Basic``/``Bearer`` followed by any
+    word, so ordinary prose such as "basic setup" is redacted as well.
+    """
+    redacted = SECRET_HTML_INPUT_PATTERN.sub("<input name=<redacted> value=<redacted>>", value)
+    for pattern in SECRET_TEXT_PATTERNS:
+        redacted = pattern.sub(_redact_secret_match, redacted)
+    return redacted
+def redact_source_url_text(value: str) -> str:
+    """Replace every http(s) URL found in ``value`` with ``<redacted-url>``."""
+    scrubbed = SOURCE_URL_TEXT_PATTERN.sub("<redacted-url>", value)
+    return scrubbed
+def sanitize_url(url: str | None, *, redact_source_url: bool = False) -> str | None:
+    """Return ``url`` with embedded credentials and secret query values removed.
+    Falsy input is returned unchanged. With ``redact_source_url`` the whole URL
+    becomes ``<redacted-url>``. Otherwise ``user:password@`` userinfo is dropped,
+    values of sensitive query keys are replaced, other query values are passed
+    through ``redact_text``, and the rebuilt URL gets a final ``redact_text`` pass.
+    """
+    if not url:
+        return url
+    if redact_source_url:
+        return "<redacted-url>"
+    try:
+        parts = urlsplit(url)
+    except ValueError:
+        # Unparseable input is still scrubbed with the text patterns.
+        return redact_text(url)
+    # Keep only the host[:port] part so credentials in the netloc never survive.
+    host = parts.netloc.rpartition("@")[2]
+    query_pairs = []
+    for key, value in parse_qsl(parts.query, keep_blank_values=True):
+        if key.lower() in SENSITIVE_QUERY_KEYS or SECRET_KEY_PATTERN.search(key):
+            query_pairs.append((key, "<redacted>"))
+        else:
+            query_pairs.append((key, redact_text(value)))
+    # NOTE: urlencode percent-encodes the marker ("%3Credacted%3E"); the final
+    # text pass rewrites it only for keys covered by the text patterns.
+    sanitized = urlunsplit(
+        (parts.scheme, host, parts.path, urlencode(query_pairs), parts.fragment)
+    )
+    return redact_text(sanitized)
+def redact_value(value: Any, *, redact_source_urls: bool = False) -> Any:
+    """Recursively redact secrets in strings, mappings and sequences.
+    Strings starting with ``http://``, ``https://``, ``/wiki/`` or ``/download/``
+    go through ``sanitize_url``; other strings through ``redact_text`` (plus URL
+    masking when ``redact_source_urls`` is set). Mappings come back as ``dict``
+    with string keys, and the value under a secret-looking key is replaced
+    outright. Non-bytes sequences come back as ``list``. Anything else is
+    returned unchanged.
+    """
+    if isinstance(value, str):
+        looks_like_url = value.startswith(("http://", "https://", "/wiki/", "/download/"))
+        if looks_like_url:
+            return sanitize_url(value, redact_source_url=redact_source_urls)
+        cleaned = redact_text(value)
+        if redact_source_urls:
+            return redact_source_url_text(cleaned)
+        return cleaned
+    if isinstance(value, Mapping):
+        redacted_map: dict[str, Any] = {}
+        for raw_key, child in value.items():
+            label = str(raw_key)
+            redacted_map[label] = (
+                "<redacted>"
+                if SECRET_KEY_PATTERN.search(label)
+                else redact_value(child, redact_source_urls=redact_source_urls)
+            )
+        return redacted_map
+    is_binary = isinstance(value, (bytes, bytearray))
+    if isinstance(value, Sequence) and not is_binary:
+        return [redact_value(item, redact_source_urls=redact_source_urls) for item in value]
+    return value
+def contains_secret_text(value: str) -> bool:
+    """Report whether ``value`` still contains something that looks like a secret.
+    True when ``SECRET_KEY_PATTERN`` matches anywhere, or when any
+    ``SECRET_TEXT_PATTERNS`` match does not already contain a redaction marker.
+    """
+    if SECRET_KEY_PATTERN.search(value):
+        return True
+    markers = ("<redacted>", "<redacted-auth>")
+    return any(
+        all(marker not in hit.group(0) for marker in markers)
+        for pattern in SECRET_TEXT_PATTERNS
+        for hit in pattern.finditer(value)
+    )