PyPI - magicmd - Versions diffs - 0.1.0__py3-none-any.whl - Mend

magicmd 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

magicmd/__init__.py +2 -0
magicmd/assets.py +149 -0
magicmd/cli.py +422 -0
magicmd/config.py +78 -0
magicmd/detect.py +8 -0
magicmd/diagnostics.py +118 -0
magicmd/fetchers/__init__.py +1 -0
magicmd/fetchers/browser.py +51 -0
magicmd/fetchers/http.py +17 -0
magicmd/models.py +53 -0
magicmd/output.py +58 -0
magicmd/platforms/__init__.py +1 -0
magicmd/platforms/base.py +19 -0
magicmd/platforms/csdn.py +76 -0
magicmd/platforms/generic.py +54 -0
magicmd/platforms/juejin.py +95 -0
magicmd/platforms/registry.py +73 -0
magicmd/platforms/shared/__init__.py +0 -0
magicmd/platforms/shared/content.py +440 -0
magicmd/platforms/shared/markdown.py +95 -0
magicmd/platforms/shared/metadata.py +38 -0
magicmd/platforms/wechat.py +57 -0
magicmd/quality.py +199 -0
magicmd/renderers/__init__.py +1 -0
magicmd/renderers/markdown.py +62 -0
magicmd/templates/magicmd.example.toml +41 -0
magicmd-0.1.0.dist-info/METADATA +315 -0
magicmd-0.1.0.dist-info/RECORD +31 -0
magicmd-0.1.0.dist-info/WHEEL +4 -0
magicmd-0.1.0.dist-info/entry_points.txt +2 -0
magicmd-0.1.0.dist-info/licenses/LICENSE +21 -0

magicmd/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ __version__ = "0.1.0"
2	+

magicmd/assets.py ADDED Viewed

@@ -0,0 +1,149 @@
+from __future__ import annotations
+import re
+from pathlib import Path
+import httpx
+from magicmd.models import Article, ImageAsset
def infer_image_extension(url: str, content_type: str = "") -> str:
    """Pick a file extension for a downloaded image.

    The URL is consulted first: a ``wx_fmt=<ext>`` parameter, then a trailing
    ``.<ext>`` right before the query string or the end of the URL. A candidate
    taken from the URL is only trusted when it is a known image extension, so
    inputs such as ``https://example.com`` or ``/render.php?id=1`` no longer
    yield ``com`` / ``php``. Otherwise the ``Content-Type`` value decides.

    Args:
        url: Image URL that was requested.
        content_type: Value of the response ``Content-Type`` header, if any.

    Returns:
        The extension without a leading dot; ``"png"`` when nothing matches.
    """
    known_extensions = {
        "jpg", "png", "gif", "webp", "bmp", "svg",
        "avif", "ico", "tif", "tiff", "heic", "heif",
    }
    match = re.search(r"wx_fmt=(\w+)", url) or re.search(r"\.(\w{3,4})(?:\?|$)", url)
    if match:
        candidate = match.group(1).lower().replace("jpeg", "jpg")
        # Unknown candidates (e.g. ``wx_fmt=other``, ``.com``, ``.php``) fall
        # through to the content-type lookup instead of becoming a file suffix.
        if candidate in known_extensions:
            return candidate
    lowered_type = content_type.lower()
    content_type_markers = (
        ("jpeg", "jpg"),
        ("jpg", "jpg"),
        ("png", "png"),
        ("gif", "gif"),
        ("webp", "webp"),
        ("svg", "svg"),
        ("avif", "avif"),
        ("bmp", "bmp"),
    )
    for marker, extension in content_type_markers:
        if marker in lowered_type:
            return extension
    return "png"
+def _separate_markdown_images(markdown: str) -> str:
+    image = r"!\[[^\]]*\]\([^)\n]+\)"
+    markdown = re.sub(r"!\[\]\(\)", "", markdown)
+    markdown = re.sub(rf"([^\[\n])({image})", r"\1\n\n\2", markdown)
+    markdown = re.sub(rf"({image})([^\]\n])", r"\1\n\n\2", markdown)
+    markdown = re.sub(rf"({image})\s*({image})", r"\1\n\n\2", markdown)
+    return re.sub(r"\n{4,}", "\n\n\n", markdown).strip()
def rewrite_markdown_image_links(markdown: str, images: list[ImageAsset]) -> str:
    """Point Markdown image links at their downloaded copies.

    For every image that has a ``local_path``, each ``![alt](source_url)``
    occurrence is rewritten to ``![alt](local_path)`` with the alt text kept.
    Images without a ``local_path`` are left alone. The result is then passed
    through ``_separate_markdown_images``.
    """
    rewritten = markdown
    for asset in images:
        local = asset.local_path
        if not local:
            continue
        link = re.compile(r"!\[([^\]]*)\]\(" + re.escape(asset.source_url) + r"\)")
        # A callable replacement keeps backslashes in the path literal.
        rewritten = link.sub(
            lambda found, local=local: f"![{found.group(1)}]({local})",
            rewritten,
        )
    return _separate_markdown_images(rewritten)
+def _video_transport():
+    return None
+def _image_transport():
+    return None
+def _unescape_markdown_url(url: str) -> str:
+    return re.sub(r"\\([_&=?.:/+\-])", r"\1", url)
+def _infer_video_extension(url: str, content_type: str = "") -> str:
+    match = re.search(r"\.(mp4|mov|webm|m4v)(?:\?|$)", url, re.I)
+    if match:
+        return match.group(1).lower()
+    if "webm" in content_type:
+        return "webm"
+    if "quicktime" in content_type:
+        return "mov"
+    return "mp4"
def download_videos(article: Article, package_dir: Path, video_dir_name: str = "videos") -> Article:
    """Download videos linked as ``[视频](url)`` and relink the Markdown locally.

    Each distinct URL is fetched once and stored as
    ``<video_dir_name>/video_NNN.<ext>`` under ``package_dir``. The response
    body is streamed to disk in chunks, so a large video is never held in
    memory as a whole. A failed download is recorded as a
    ``video_download_failed:<url>:<error>`` warning and the link keeps
    pointing at the remote URL.

    Args:
        article: Article whose ``content_markdown`` is scanned for video links.
        package_dir: Root directory of the output package.
        video_dir_name: Sub-directory (relative to ``package_dir``) for videos.

    Returns:
        ``article`` itself when there are no video links, otherwise a copy
        with updated ``content_markdown`` and extraction warnings.
    """
    matches = list(re.finditer(r"\[视频\]\((https?://[^)\n]+)\)", article.content_markdown))
    if not matches:
        return article
    video_dir = package_dir / video_dir_name
    video_dir.mkdir(parents=True, exist_ok=True)
    markdown = article.content_markdown
    warnings = list(article.extraction.warnings)
    # Maps unescaped URL -> link target (local path on success, URL on failure).
    seen: dict[str, str] = {}
    transport = _video_transport()
    client_kwargs = {"timeout": 30.0, "follow_redirects": True}
    if transport is not None:
        client_kwargs["transport"] = transport
    request_headers = {
        "Referer": article.source_url,
        "User-Agent": "Mozilla/5.0 MagicMD",
        "Accept": "video/mp4,video/*,*/*",
    }
    with httpx.Client(**client_kwargs) as client:
        for match in matches:
            markdown_url = match.group(1)
            url = _unescape_markdown_url(markdown_url)
            if url not in seen:
                target: Path | None = None
                try:
                    with client.stream("GET", url, headers=request_headers) as response:
                        response.raise_for_status()
                        ext = _infer_video_extension(url, response.headers.get("content-type", ""))
                        local_path = f"{video_dir_name}/video_{len(seen) + 1:03d}.{ext}"
                        target = package_dir / local_path
                        with target.open("wb") as handle:
                            for chunk in response.iter_bytes():
                                handle.write(chunk)
                    seen[url] = local_path
                except Exception as exc:
                    # Best effort: do not leave a truncated file behind.
                    if target is not None:
                        try:
                            target.unlink(missing_ok=True)
                        except OSError:
                            pass
                    warnings.append(f"video_download_failed:{url}:{exc}")
                    seen[url] = url
            markdown = markdown.replace(f"[视频]({markdown_url})", f"[视频]({seen[url]})")
    next_article = article.model_copy(update={"content_markdown": markdown})
    # NOTE(review): if model_copy is shallow, this also updates the warnings
    # seen through the original ``article``; confirm that is acceptable.
    next_article.extraction.warnings = warnings
    return next_article
def download_images(
    article: Article,
    package_dir: Path,
    image_dir_name: str = "images",
    filename_pattern: str = "img_{index:03d}.{ext}",
) -> Article:
    """Download the article's images and relink the Markdown to local files.

    Images are saved under ``package_dir / image_dir_name`` with names built
    from ``filename_pattern`` (``index`` is the 1-based position, ``ext`` the
    inferred extension). Protocol-relative URLs (``//host/...``) are fetched
    over https. A failed download adds an
    ``image_download_failed:<url>:<error>`` warning and keeps the image
    entry unchanged.

    Returns:
        ``article`` itself when it has no images, otherwise a copy with
        updated ``images``, ``content_markdown`` and extraction warnings.
    """
    if not article.images:
        return article
    target_dir = package_dir / image_dir_name
    target_dir.mkdir(parents=True, exist_ok=True)
    processed: list[ImageAsset] = []
    collected_warnings = list(article.extraction.warnings)
    custom_transport = _image_transport()
    options = {"timeout": 20.0, "follow_redirects": True}
    if custom_transport is not None:
        options["transport"] = custom_transport
    with httpx.Client(**options) as client:
        for position, asset in enumerate(article.images, start=1):
            if asset.source_url.startswith("//"):
                url = f"https:{asset.source_url}"
            else:
                url = asset.source_url
            try:
                response = client.get(url, headers={"Referer": article.source_url})
                response.raise_for_status()
                extension = infer_image_extension(url, response.headers.get("content-type", ""))
                relative_path = f"{image_dir_name}/{filename_pattern.format(index=position, ext=extension)}"
                (package_dir / relative_path).write_bytes(response.content)
                processed.append(asset.model_copy(update={"local_path": relative_path}))
            except Exception as exc:
                collected_warnings.append(f"image_download_failed:{url}:{exc}")
                processed.append(asset)
    updated = article.model_copy(update={"images": processed})
    updated.content_markdown = rewrite_markdown_image_links(updated.content_markdown, processed)
    updated.extraction.warnings = collected_warnings
    return updated

magicmd/cli.py ADDED Viewed

@@ -0,0 +1,422 @@
+from __future__ import annotations
+import json
+import shutil
+import sys
+from time import perf_counter
+from importlib import resources
+from pathlib import Path
+from typing import Any, Optional
+import typer
+import click
+from rich.console import Console
+from rich.text import Text
+from magicmd import __version__
+from magicmd.config import load_config
+from magicmd.detect import detect_platform
+from magicmd.diagnostics import (
+    build_doctor_report,
+    render_doctor_report,
+    save_debug_html,
+    save_extraction_report,
+)
+from magicmd.fetchers.browser import fetch_browser
+from magicmd.fetchers.http import fetch_http
+from magicmd.output import write_article_files, write_article_package
+from magicmd.platforms.registry import get_platform_adapter
+from magicmd.quality import (
+    build_failure_quality,
+    build_package_quality,
+    build_skipped_quality,
+    write_batch_report,
+)
+app = typer.Typer(help="Convert public article links into Markdown packages.", no_args_is_help=True)
def _version_callback(value: bool):
    """Print ``MagicMD <version>`` and stop the CLI when ``--version`` is set."""
    if not value:
        return
    typer.echo(f"MagicMD {__version__}")
    raise typer.Exit()
# Root callback: it exists only to attach the eager ``--version`` option,
# which is handled by ``_version_callback``. The callback body does nothing.
@app.callback()
def main(
    version: bool = typer.Option(
        False,
        "--version",
        callback=_version_callback,
        is_eager=True,
        help="Show MagicMD version and exit.",
    ),
):
    return None
class ConversionStageError(click.ClickException):
    """Failure of a named conversion stage.

    The message is ``"<stage>: <error>"``. ``stage`` and ``original_error``
    keep the stage name and the wrapped exception for callers.
    """

    def __init__(self, stage: str, error: Exception):
        super().__init__(f"{stage}: {error}")
        self.stage = stage
        self.original_error = error
class ProgressReporter:
    """Run operations, optionally showing a spinner and a completion line."""

    def __init__(self, enabled: bool = False, console: Console | None = None):
        self.enabled = enabled
        self.console = console or Console(no_color=False)

    def run(self, index: int, total: int, message: str, operation):
        """Call ``operation()`` and return its result.

        When reporting is enabled, a status spinner is shown while the
        operation runs and a ``✓ [index/total] message`` line is printed after
        it returns. When disabled, the operation is simply called.
        """
        if not self.enabled:
            return operation()
        step = f"[{index}/{total}] {message}"
        with self.console.status(f"[cyan]⠋ {step}...[/cyan]", spinner="dots"):
            outcome = operation()
        done = Text()
        done.append("✓", style="green")
        done.append(f" {step}")
        self.console.print(done)
        return outcome
def _run_conversion_stage(
    progress: ProgressReporter,
    stage: str,
    index: int,
    total: int,
    message: str,
    operation,
):
    """Run one pipeline step through ``progress`` and tag failures with ``stage``.

    A ``ConversionStageError`` raised by the operation propagates unchanged.
    Any other exception is wrapped in a ``ConversionStageError`` for ``stage``.
    """
    try:
        outcome = progress.run(index, total, message, operation)
    except ConversionStageError:
        raise
    except Exception as exc:
        raise ConversionStageError(stage, exc) from exc
    return outcome
def parse_article(platform: str, html: str, url: str):
    """Parse ``html`` with the adapter registered for ``platform``.

    If the registry lookup raises ``KeyError``, the ``"generic"`` adapter is
    used instead.
    """
    try:
        adapter = get_platform_adapter(platform)
    except KeyError:
        adapter = get_platform_adapter("generic")
    parser = adapter.parser
    return parser(html, url)
def fetch_for_platform(url: str, platform: str, config_path: Optional[Path] = None) -> str:
    """Fetch ``url`` with the fetcher configured for ``platform``.

    Platforms whose configured ``browser`` is ``"camoufox"`` go through
    ``fetch_browser``. Everything else, including platforms with no config
    entry, uses ``fetch_http``.
    """
    config = load_config(config_path)
    settings = config.platforms.get(platform)
    if not settings or settings.browser != "camoufox":
        return fetch_http(
            url,
            timeout_seconds=config.fetch.timeout_seconds,
            user_agent=config.fetch.user_agent,
        )
    return fetch_browser(
        url,
        wait_selector=settings.wait_selector,
        # The config value is in seconds; the browser fetcher takes milliseconds.
        timeout_ms=config.fetch.browser_timeout_seconds * 1000,
        attempts=config.fetch.browser_attempts,
    )
def entrypoint():
    """Run the CLI, allowing a bare URL as shorthand for ``convert <url>``.

    The app runs with ``standalone_mode=False`` so errors can be rendered
    here. In that mode the exit code of a raised ``Exit`` is *returned*
    rather than applied, so a non-zero return value is turned into the
    process exit status (otherwise ``doctor`` failing with ``typer.Exit(1)``
    would still exit 0).

    Raises:
        SystemExit: With the error's ``exit_code`` for exceptions that can
            render themselves, with 1 on abort, or with the non-zero code
            returned by the app.
    """
    if len(sys.argv) > 1 and sys.argv[1].startswith(("http://", "https://")):
        sys.argv.insert(1, "convert")
    try:
        result = app(standalone_mode=False)
    except click.Abort as exc:
        # Mirror standalone mode instead of surfacing a traceback.
        typer.echo("Aborted!", err=True)
        raise SystemExit(1) from exc
    except Exception as exc:
        # Duck-typed check for exceptions that know how to display themselves.
        if hasattr(exc, "show") and hasattr(exc, "exit_code"):
            exc.show()
            raise SystemExit(exc.exit_code) from exc
        raise
    # Commands return None; an int here is the code of an Exit raised inside.
    if isinstance(result, int) and not isinstance(result, bool) and result != 0:
        raise SystemExit(result)
def _resolve_output(output: Path | None, config_path: Optional[Path]) -> Path:
    """Return ``output`` when given, else the configured output directory."""
    if output is None:
        return Path(load_config(config_path).output.directory)
    return output
def _ensure_platform_enabled(platform: str, config_path: Optional[Path]) -> None:
    """Raise ``click.ClickException`` if ``platform`` is configured as disabled.

    Platforms without a config entry are treated as enabled.
    """
    settings = load_config(config_path).platforms.get(platform)
    if not settings:
        return
    if settings.enabled:
        return
    raise click.ClickException(f"Platform disabled: {platform}")
def _batch_context(url: str, platform: str, config_path: Optional[Path]) -> dict[str, Any]:
    """Describe how ``url`` will be fetched, for inclusion in the batch report.

    Returns a dict with the resolved ``platform``, the ``fetcher`` name
    (``"http"`` when the platform has no config entry), ``max_attempts``
    (the browser attempt count for ``"camoufox"``, otherwise 1) and
    ``retry_enabled``.
    """
    config = load_config(config_path)
    if platform == "auto":
        resolved = detect_platform(url)
    else:
        resolved = platform
    settings = config.platforms.get(resolved)
    fetcher = settings.browser if settings else "http"
    attempts = 1
    if fetcher == "camoufox":
        attempts = config.fetch.browser_attempts
    return {
        "platform": resolved,
        "fetcher": fetcher,
        "max_attempts": attempts,
        "retry_enabled": attempts > 1,
    }
+def _decorate_batch_result(
+    item: dict[str, Any],
+    context: dict[str, Any],
+    elapsed_ms: int,
+    stage: str,
+) -> dict[str, Any]:
+    result = dict(item)
+    result.update(context)
+    result["elapsed_ms"] = elapsed_ms
+    result["stage"] = _quality_failure_stage(result, stage) if result.get("status") == "fail" else stage
+    return result
+def _quality_failure_stage(item: dict[str, Any], fallback: str) -> str:
+    error = str(item.get("error") or "")
+    if error.endswith("_content_not_found"):
+        return "parse"
+    return fallback
+def _find_existing_package(output: Path, url: str) -> Path | None:
+    if not output.exists():
+        return None
+    for metadata_path in sorted(output.glob("*/metadata.json")):
+        package_dir = metadata_path.parent
+        if not (package_dir / "article.md").exists():
+            continue
+        try:
+            metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(metadata, dict):
+            continue
+        if url in {metadata.get("source_url"), metadata.get("canonical_url")}:
+            return package_dir
+    return None
+def _should_save_debug_html(debug: bool, save_mode: str, warnings: list[str]) -> bool:
+    normalized = save_mode.lower()
+    return debug or normalized == "always" or (normalized == "on_failure" and bool(warnings))
def convert_url(
    url: str,
    output: Path,
    platform: str = "auto",
    config_path: Optional[Path] = None,
    debug: bool = False,
    overwrite: bool = False,
    download_images_enabled: bool = True,
    show_progress: bool = False,
) -> Path:
    """Convert one article URL into a Markdown package and return its directory.

    The pipeline has six reported steps: detect platform, fetch, parse, write
    package, download media (or skip), save extraction report. Failures inside
    a step surface as ``ConversionStageError`` carrying the step's stage name.

    Args:
        url: Article URL.
        output: Directory that receives the package.
        platform: Platform name, or ``"auto"`` to detect it from the URL.
        config_path: Optional config file.
        debug: Always save the fetched HTML.
        overwrite: Overwrite an existing package (also enabled by the config).
        download_images_enabled: Allow media download (the config must also
            allow it).
        show_progress: Print progress lines.
    """
    total_steps = 6
    progress = ProgressReporter(show_progress)
    config = load_config(config_path)

    def stage(name: str, step: int, message: str, operation):
        return _run_conversion_stage(progress, name, step, total_steps, message, operation)

    def resolve_platform():
        return detect_platform(url) if platform == "auto" else platform

    resolved_platform = stage("detect", 1, "Detecting platform", resolve_platform)
    # A disabled platform is reported as a failure of the "detect" stage.
    try:
        _ensure_platform_enabled(resolved_platform, config_path)
    except Exception as exc:
        raise ConversionStageError("detect", exc) from exc

    html = stage(
        "fetch",
        2,
        f"Fetching article ({resolved_platform})",
        lambda: fetch_for_platform(url, resolved_platform, config_path),
    )
    article = stage(
        "parse",
        3,
        "Parsing article",
        lambda: parse_article(resolved_platform, html, url),
    )

    def write_package():
        return write_article_package(
            article,
            output,
            overwrite=overwrite or config.output.overwrite,
            markdown_config=config.markdown,
        )

    package_dir = stage("write", 4, "Writing Markdown package", write_package)

    if _should_save_debug_html(debug, config.output.save_debug_html, article.extraction.warnings):
        save_debug_html(package_dir, html)

    if download_images_enabled and config.images.download:
        from magicmd.assets import download_images, download_videos

        def fetch_media():
            with_images = download_images(
                article,
                package_dir,
                config.images.directory,
                config.images.filename_pattern,
            )
            return download_videos(with_images, package_dir)

        article = stage("media", 5, "Downloading media", fetch_media)
        # Rewrite the package files so they reference the local media paths.
        write_article_files(article, package_dir, markdown_config=config.markdown)
    else:
        progress.run(5, total_steps, "Skipping image download", lambda: article)

    stage(
        "report",
        6,
        "Saving extraction report",
        lambda: save_extraction_report(package_dir, article.to_metadata()["extraction"]),
    )
    return package_dir
# ``convert`` command: convert a single URL, then fail with a ClickException
# if the quality check on the written package reports status "fail".
# (Kept as comments: Typer would show a docstring as the command's help text.)
@app.command()
def convert(
    url: str,
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory."),
    platform: str = typer.Option("auto", "--platform", help="auto, wechat, juejin, csdn, generic."),
    config_path: Optional[Path] = typer.Option(None, "--config", help="Config file path."),
    no_images: bool = typer.Option(False, "--no-images", help="Do not download images."),
    debug: bool = typer.Option(False, "--debug", help="Save debug HTML."),
    overwrite: bool = typer.Option(False, "--overwrite", help="Overwrite output package."),
):
    target_root = _resolve_output(output, config_path)
    package_dir = convert_url(
        url,
        target_root,
        platform=platform,
        config_path=config_path,
        debug=debug,
        overwrite=overwrite,
        download_images_enabled=not no_images,
        show_progress=True,
    )
    quality = build_package_quality(url, package_dir)
    if quality["status"] != "fail":
        typer.echo(f"Created output package: {package_dir}")
        return
    # The package stays on disk so the failed extraction can be inspected.
    raise click.ClickException(
        f"Extraction failed: {quality.get('error')}. Debug package saved at: {package_dir}"
    )
# ``batch`` command: convert every URL listed in ``file`` (one per line; blank
# lines and lines starting with "#" are ignored), record one result per URL,
# and write a batch report. A failing URL is reported and does not stop the run.
# (Kept as comments: Typer would show a docstring as the command's help text.)
@app.command()
def batch(
    file: Path,
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory."),
    platform: str = typer.Option("auto", "--platform", help="auto, wechat, juejin, csdn, generic."),
    config_path: Optional[Path] = typer.Option(None, "--config", help="Config file path."),
    no_images: bool = typer.Option(False, "--no-images", help="Do not download images."),
    debug: bool = typer.Option(False, "--debug", help="Save debug HTML."),
    overwrite: bool = typer.Option(False, "--overwrite", help="Overwrite output package."),
    skip_existing: bool = typer.Option(False, "--skip-existing", help="Skip URLs already present in output metadata."),
):
    resolved_output = _resolve_output(output, config_path)
    urls = []
    for raw_line in file.read_text(encoding="utf-8").splitlines():
        candidate = raw_line.strip()
        if candidate and not candidate.startswith("#"):
            urls.append(candidate)

    def elapsed_ms_since(start: float) -> int:
        return int((perf_counter() - start) * 1000)

    results = []
    for url in urls:
        started_at = perf_counter()
        context = _batch_context(url, platform, config_path)
        try:
            existing_package = (
                _find_existing_package(resolved_output, url) if skip_existing else None
            )
            if existing_package:
                elapsed_ms = elapsed_ms_since(started_at)
                skipped = build_skipped_quality(url, existing_package)
                results.append(_decorate_batch_result(skipped, context, elapsed_ms, "skip"))
                typer.echo(f"SKIP {url} -> {existing_package}")
                continue
            package_dir = convert_url(
                url,
                resolved_output,
                platform=platform,
                config_path=config_path,
                debug=debug,
                overwrite=overwrite,
                download_images_enabled=not no_images,
                show_progress=True,
            )
            elapsed_ms = elapsed_ms_since(started_at)
            quality = build_package_quality(url, package_dir)
            results.append(_decorate_batch_result(quality, context, elapsed_ms, "complete"))
            typer.echo(f"OK {url} -> {package_dir}")
        except Exception as exc:
            elapsed_ms = elapsed_ms_since(started_at)
            # Stage errors name the failing step; anything else is "convert".
            stage = exc.stage if isinstance(exc, ConversionStageError) else "convert"
            failure = build_failure_quality(url, exc)
            results.append(_decorate_batch_result(failure, context, elapsed_ms, stage))
            typer.echo(f"FAIL {url}: {exc}", err=True)
    report_paths = write_batch_report(results, resolved_output)
    typer.echo(f"Batch report: {report_paths['markdown']}")
+config_app = typer.Typer(help="Manage MagicMD config.")
+app.add_typer(config_app, name="config")
# ``config init`` command: create a config file at ``path`` unless one already
# exists. Template sources, in order: the template bundled in the package, a
# ``.magicmd.example.toml`` two directories above this module, then a minimal
# built-in ``[output]`` section.
# (Kept as comments: Typer would show a docstring as the command's help text.)
@config_app.command("init")
def config_init(path: Path = typer.Option(Path(".magicmd.toml"), "--path", help="Config path.")):
    if path.exists():
        typer.echo(f"Config already exists: {path}")
        return
    bundled = resources.files("magicmd").joinpath("templates/magicmd.example.toml")
    if bundled.is_file():
        path.write_text(bundled.read_text(encoding="utf-8"), encoding="utf-8")
    else:
        repo_example = Path(__file__).resolve().parents[2] / ".magicmd.example.toml"
        if repo_example.exists():
            shutil.copyfile(repo_example, path)
        else:
            path.write_text("[output]\ndirectory = \"output\"\n", encoding="utf-8")
    typer.echo(f"Created config: {path}")
# ``doctor`` command: print the diagnostics report and exit with status 1
# when the report's ``ok`` flag is false.
# (Kept as comments: Typer would show a docstring as the command's help text.)
@app.command()
def doctor(
    config_path: Optional[Path] = typer.Option(None, "--config", help="Config file path."),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory to check."),
):
    report = build_doctor_report(config_path=config_path, output_dir=output)
    # nl=False: the rendered report is echoed without an extra newline.
    typer.echo(render_doctor_report(report), nl=False)
    if report["ok"]:
        return
    typer.echo("MagicMD doctor found issues.", err=True)
    raise typer.Exit(1)

magicmd/config.py ADDED Viewed

@@ -0,0 +1,78 @@
+from __future__ import annotations
+import tomllib
+from pathlib import Path
+from pydantic import BaseModel, Field
+from magicmd.platforms.registry import platform_adapters
# Settings for where and how packages are written (the ``output`` section).
class OutputConfig(BaseModel):
    # Default output directory, used by the CLI when no --output is given.
    directory: str = "output"
    # Overwrite existing packages; OR-ed with the CLI --overwrite flag.
    overwrite: bool = False
    # When to keep the fetched HTML: the CLI recognises "always" and
    # "on_failure" (case-insensitive); any other value never saves it.
    save_debug_html: str = "on_failure"
# Markdown rendering options (the ``markdown`` section). The CLI passes this
# object as ``markdown_config`` to the package writers.
# NOTE(review): the field meanings below are inferred from the names; confirm
# against the renderer.
class MarkdownConfig(BaseModel):
    # Presumably the name of the template to render with.
    template: str = "default"
    # Presumably the front-matter format.
    front_matter: str = "yaml"
    # Presumably toggles a block describing the article's source.
    include_source_block: bool = True
    # Presumably shifts heading levels by this amount.
    heading_offset: int = 0
# Media download options (the ``images`` section).
class ImagesConfig(BaseModel):
    # Media is downloaded only when this is true and --no-images is not given.
    download: bool = True
    # Sub-directory of the package that receives downloaded images.
    directory: str = "images"
    # ``str.format`` pattern for image file names; receives ``index`` and ``ext``.
    filename_pattern: str = "img_{index:03d}.{ext}"
    # NOTE(review): not referenced by the download code shown in assets.py,
    # which fetches images one at a time; confirm whether it is used elsewhere.
    concurrency: int = 5
# Fetching options (the ``fetch`` section).
class FetchConfig(BaseModel):
    # Timeout passed to the HTTP fetcher, in seconds.
    timeout_seconds: int = 20
    # Browser fetch timeout in seconds; converted to milliseconds by the CLI.
    browser_timeout_seconds: int = 15
    # Number of attempts passed to the browser fetcher.
    browser_attempts: int = 2
    # User agent passed to the HTTP fetcher.
    # NOTE(review): "default" is presumably a sentinel resolved by the fetcher.
    user_agent: str = "default"
# Per-platform options (entries of the ``platforms`` table).
class PlatformConfig(BaseModel):
    # Conversion is refused for a platform whose entry sets this to false.
    enabled: bool = True
    # Fetcher selection: "camoufox" uses the browser fetcher; any other value
    # uses plain HTTP.
    browser: str = "http"
    # Selector passed to the browser fetcher as ``wait_selector``.
    wait_selector: str = ""
# Root configuration model; each field maps to one top-level config section.
class MagicMDConfig(BaseModel):
    output: OutputConfig = Field(default_factory=OutputConfig)
    markdown: MarkdownConfig = Field(default_factory=MarkdownConfig)
    images: ImagesConfig = Field(default_factory=ImagesConfig)
    fetch: FetchConfig = Field(default_factory=FetchConfig)
    # Defaults to one entry per registered platform adapter, seeded with the
    # adapter's own default browser and wait selector.
    platforms: dict[str, PlatformConfig] = Field(
        default_factory=lambda: {
            adapter.name: PlatformConfig(
                browser=adapter.default_browser,
                wait_selector=adapter.default_wait_selector,
            )
            for adapter in platform_adapters()
        }
    )
+def _deep_merge(base: dict, override: dict) -> dict:
+    merged = dict(base)
+    for key, value in override.items():
+        if isinstance(value, dict) and isinstance(merged.get(key), dict):
+            merged[key] = _deep_merge(merged[key], value)
+        else:
+            merged[key] = value
+    return merged
def load_config(path: str | Path | None = None) -> MagicMDConfig:
    """Load the configuration, layering a TOML file over the defaults.

    When ``path`` is empty or the file does not exist, the defaults are
    returned. Otherwise the parsed TOML is deep-merged over the defaults and
    the result is validated.
    """
    defaults = MagicMDConfig().model_dump()
    config_file = Path(path) if path else None
    if config_file is None or not config_file.exists():
        return MagicMDConfig.model_validate(defaults)
    overrides = tomllib.loads(config_file.read_text(encoding="utf-8"))
    return MagicMDConfig.model_validate(_deep_merge(defaults, overrides))

magicmd/detect.py ADDED Viewed

@@ -0,0 +1,8 @@
+from urllib.parse import urlparse
+from magicmd.platforms.registry import match_platform_by_host
def detect_platform(url: str) -> str:
    """Return the platform name that ``match_platform_by_host`` gives for ``url``.

    The host is taken from ``hostname`` rather than ``netloc`` so that
    userinfo and an explicit port (``host:443``) are not passed to the host
    matcher. A URL without a host yields an empty host string.
    """
    host = (urlparse(url).hostname or "").lower()
    return match_platform_by_host(host)