markdown-webscraper 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markdown_webscraper/__init__.py +4 -0
- markdown_webscraper/cli.py +26 -0
- markdown_webscraper/config.py +38 -0
- markdown_webscraper/fetcher.py +64 -0
- markdown_webscraper/html_utils.py +70 -0
- markdown_webscraper/pipeline.py +139 -0
- markdown_webscraper-0.1.0.dist-info/METADATA +170 -0
- markdown_webscraper-0.1.0.dist-info/RECORD +12 -0
- markdown_webscraper-0.1.0.dist-info/WHEEL +5 -0
- markdown_webscraper-0.1.0.dist-info/entry_points.txt +2 -0
- markdown_webscraper-0.1.0.dist-info/licenses/LICENSE +21 -0
- markdown_webscraper-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
from . import WebsiteScraper, load_config
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def parse_args() -> argparse.Namespace:
    """Parse the CLI arguments for the scraper.

    Returns:
        Namespace with a single ``config`` attribute holding the path to
        the JSON configuration file.
    """
    parser = argparse.ArgumentParser(
        description="Scrape websites to HTML and Markdown."
    )
    config_help = "Path to config.json (default: config.json)"
    parser.add_argument("--config", default="config.json", help=config_help)
    return parser.parse_args()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def main() -> None:
    """CLI entry point: load the config, run the scraper, print a summary."""
    args = parse_args()
    stats = WebsiteScraper(load_config(args.config)).run()
    print(
        f"Done. pages_fetched={stats.pages_fetched}, "
        f"html_files_saved={stats.html_files_saved}, "
        f"markdown_files_saved={stats.markdown_files_saved}"
    )
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True)
class ScraperConfig:
    """Immutable scraper configuration, typically built by ``load_config``."""

    # Directory where raw .html files (and downloaded .pdf/.txt) are written.
    raw_html_dir: Path
    # Directory where converted .md files are written.
    markdown_dir: Path
    # Root URLs crawled recursively (same-host, same-path-prefix links followed).
    wildcard_websites: list[str]
    # URLs fetched exactly once, no link-following.
    individual_websites: list[str]
    # Strip <header>/<footer> elements from fetched HTML before saving.
    remove_header_footer: bool
    # Also emit a Markdown conversion next to each saved HTML file.
    markdown_convert: bool
    # Seconds to sleep after each page fetch (0 disables).
    time_delay: float
    # Overall wall-clock budget for a whole run, in seconds (0 disables).
    total_timeout: float
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _clean_url_list(urls: list[str]) -> list[str]:
|
|
21
|
+
return [url.strip() for url in urls if isinstance(url, str) and url.strip()]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load_config(config_path: str | Path) -> ScraperConfig:
    """Read a JSON file at *config_path* and build a ``ScraperConfig``.

    ``raw_html_dir`` and ``markdown_dir`` are required keys; everything
    else falls back to a sensible default.
    """
    raw_text = Path(config_path).read_text(encoding="utf-8")
    raw = json.loads(raw_text)

    return ScraperConfig(
        raw_html_dir=Path(raw["raw_html_dir"]),
        markdown_dir=Path(raw["markdown_dir"]),
        wildcard_websites=_clean_url_list(raw.get("wildcard_websites", [])),
        individual_websites=_clean_url_list(raw.get("individual_websites", [])),
        remove_header_footer=bool(raw.get("remove_header_footer", False)),
        markdown_convert=bool(raw.get("markdown_convert", True)),
        time_delay=float(raw.get("time_delay", 0)),
        total_timeout=float(raw.get("total_timeout", 0)),
    )
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Protocol
|
|
5
|
+
|
|
6
|
+
from botasaurus.browser import Driver, browser
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class FetchedPage:
    """Result of fetching one page through the browser."""

    # URL that was asked for.
    requested_url: str
    # Final URL reported by the browser (may differ after redirects).
    resolved_url: str
    # Full page HTML at fetch time.
    html: str
    # Anchor hrefs collected from the page.
    links: list[str]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PageFetcher(Protocol):
    """Structural interface for anything that can fetch web pages."""

    def fetch(self, url: str) -> FetchedPage:
        """Fetch *url* and return its contents and outgoing links."""
        ...

    def close(self) -> None:
        """Release any underlying resources (e.g. a browser session)."""
        ...
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@browser(
    headless=True,  # no visible browser window
    reuse_driver=True,  # keep one browser instance alive across calls
    close_on_crash=True,
    raise_exception=True,  # propagate failures instead of swallowing them
    output=None,  # suppress botasaurus's default output files
)
def _fetch_with_botasaurus(driver: Driver, payload: dict[str, str]) -> dict:
    """Botasaurus browser task: load ``payload["url"]`` and return page data.

    Returns a plain dict with the requested URL, the browser's final URL,
    the page HTML, and all links found on the page.
    NOTE(review): dict in / dict out follows the botasaurus task calling
    convention — confirm against the installed botasaurus version.
    """
    url = payload["url"]
    driver.get(url)

    # Explicitly use human-like movement + click as required.
    driver.enable_human_mode()
    moved = driver.move_mouse_to_element("body")
    if moved:
        driver.click("body", skip_move=False)

    html = driver.page_html
    links = driver.get_all_links()
    return {
        "requested_url": url,
        "resolved_url": driver.current_url,
        "html": html,
        "links": links,
    }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class BotasaurusFetcher:
    """``PageFetcher`` implementation backed by the shared Botasaurus task."""

    def fetch(self, url: str) -> FetchedPage:
        """Fetch *url* with the reused browser and wrap the raw dict result."""
        result = _fetch_with_botasaurus({"url": url})
        collected_links = list(result.get("links", []))
        return FetchedPage(
            requested_url=result["requested_url"],
            resolved_url=result["resolved_url"],
            html=result["html"],
            links=collected_links,
        )

    def close(self) -> None:
        """Shut down the reused browser if the task exposes a close hook."""
        closer = getattr(_fetch_with_botasaurus, "close", None)
        if closer is not None:
            closer()
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from urllib.parse import urljoin, urlsplit, urlunsplit
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
from markdownify import markdownify as md
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def normalize_url(url: str) -> str:
    """Canonicalize *url*: lowercase scheme/host, default path to "/", drop fragment.

    Raises:
        ValueError: if the scheme is not http or https.
    """
    parts = urlsplit(url.strip())
    if parts.scheme not in {"http", "https"}:
        raise ValueError(f"Unsupported URL scheme: {url}")
    canonical_path = parts.path if parts.path else "/"
    scheme = parts.scheme.lower()
    host = parts.netloc.lower()
    return urlunsplit((scheme, host, canonical_path, parts.query, ""))
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def normalize_link(base_url: str, href: str) -> str | None:
    """Resolve *href* against *base_url* and normalize it.

    Returns None for empty hrefs or links that are not http(s).
    """
    if not href:
        return None
    resolved = urljoin(base_url, href)
    if urlsplit(resolved).scheme in {"http", "https"}:
        return normalize_url(resolved)
    return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def is_within_scope(candidate_url: str, root_url: str) -> bool:
    """True when *candidate_url* is on the same host and under *root_url*'s path."""
    candidate = urlsplit(normalize_url(candidate_url))
    root = urlsplit(normalize_url(root_url))
    if candidate.netloc != root.netloc:
        return False

    base_path = root.path or "/"
    if base_path == "/":
        # Whole-site root: everything on this host is in scope.
        return True
    if base_path.endswith("/"):
        return candidate.path.startswith(base_path)
    # Exact page itself, or anything nested below it.
    return candidate.path == base_path or candidate.path.startswith(base_path + "/")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def prune_header_footer(html: str) -> str:
    """Return *html* with every <header> and <footer> element removed."""
    document = BeautifulSoup(html, "html.parser")
    for element in document.find_all(["header", "footer"]):
        element.decompose()
    return str(document)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def to_markdown(html: str) -> str:
    """Convert *html* to Markdown, rendering headings in ATX (``#``) style."""
    return md(html, heading_style="ATX")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _safe_segment(segment: str) -> str:
|
|
55
|
+
cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", segment)
|
|
56
|
+
return cleaned.strip("._") or "index"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def url_to_output_path(url: str, output_dir: Path, extension: str) -> Path:
    """Map *url* to a file path under *output_dir* with the given extension.

    The host becomes the top-level directory; path segments become
    sanitized subdirectories; directory-style URLs get an "index" file.
    """
    parts = urlsplit(normalize_url(url))
    segments = [piece for piece in parts.path.split("/") if piece]
    if not segments or parts.path.endswith("/"):
        segments.append("index")

    stem = _safe_segment(segments[-1])
    if parts.query:
        # Keep query-distinct pages from overwriting each other.
        stem = f"{stem}__q_{_safe_segment(parts.query)}"

    parents = [_safe_segment(piece) for piece in segments[:-1]]
    host_dir = output_dir / _safe_segment(parts.netloc)
    return host_dir / Path(*parents) / f"{stem}.{extension}"
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
import requests
|
|
5
|
+
import signal
|
|
6
|
+
from collections import deque
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Callable
|
|
10
|
+
|
|
11
|
+
from .config import ScraperConfig
|
|
12
|
+
from .fetcher import BotasaurusFetcher, FetchedPage, PageFetcher
|
|
13
|
+
from .html_utils import (
|
|
14
|
+
is_within_scope,
|
|
15
|
+
normalize_link,
|
|
16
|
+
normalize_url,
|
|
17
|
+
prune_header_footer,
|
|
18
|
+
to_markdown,
|
|
19
|
+
url_to_output_path,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class CrawlStats:
    """Mutable counters accumulated over one scraping run."""

    # Pages requested through the fetcher.
    pages_fetched: int = 0
    # Raw .html files written to disk.
    html_files_saved: int = 0
    # Converted .md files written to disk.
    markdown_files_saved: int = 0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class WebsiteScraper:
    """Crawl configured websites, saving raw HTML and optional Markdown.

    Individual URLs are fetched once; wildcard root URLs are crawled
    breadth-first, following only same-host links under the root's path.
    """

    def __init__(
        self,
        config: ScraperConfig,
        fetcher: PageFetcher | None = None,
        sleeper: Callable[[float], None] = time.sleep,
    ) -> None:
        self.config = config
        self.fetcher = fetcher or BotasaurusFetcher()
        self.sleeper = sleeper
        self.stats = CrawlStats()
        self._visited: set[str] = set()

    def _handle_timeout(self, signum, frame):
        """SIGALRM handler: abort the run once the total time budget is spent."""
        print("\nTotal timeout reached. Quitting...")
        # Raise SystemExit directly instead of calling the interactive-only
        # exit() builtin (absent under `python -S`); the finally-block in
        # run() still gets to clean up the fetcher.
        raise SystemExit(0)

    def run(self) -> CrawlStats:
        """Scrape all configured sites and return the accumulated stats."""
        self.config.raw_html_dir.mkdir(parents=True, exist_ok=True)
        self.config.markdown_dir.mkdir(parents=True, exist_ok=True)

        # NOTE(review): SIGALRM is POSIX-only, so the total_timeout feature
        # will fail on Windows — confirm target platforms.
        if self.config.total_timeout > 0:
            signal.signal(signal.SIGALRM, self._handle_timeout)
            signal.alarm(int(self.config.total_timeout))

        try:
            for url in self.config.individual_websites:
                self._scrape_one(url)

            for root_url in self.config.wildcard_websites:
                self._scrape_recursive(root_url)
        finally:
            if self.config.total_timeout > 0:
                signal.alarm(0)  # cancel any pending alarm
            self.fetcher.close()

        return self.stats

    def _scrape_recursive(self, root_url: str) -> None:
        """Breadth-first crawl from *root_url*, staying within its scope."""
        queue: deque[str] = deque([normalize_url(root_url)])
        while queue:
            current = queue.popleft()
            if current in self._visited:
                continue
            if not is_within_scope(current, root_url):
                continue

            fetched = self._scrape_one(current)
            for href in fetched.links:
                child = normalize_link(fetched.resolved_url, href)
                if child and child not in self._visited and is_within_scope(child, root_url):
                    queue.append(child)

    def _scrape_one(self, url: str) -> FetchedPage:
        """Fetch one URL, save its HTML (and Markdown), and return the page.

        Returns an empty sentinel page when the URL was already visited.
        """
        normalized = normalize_url(url)
        if normalized in self._visited:
            return FetchedPage(normalized, normalized, "", [])

        fetched = self.fetcher.fetch(normalized)
        # Mark BOTH the requested URL and the resolved one as visited;
        # recording only the resolved URL meant a redirecting URL was
        # re-fetched every time it was queued.
        self._visited.add(normalized)
        self._visited.add(normalize_url(fetched.resolved_url))
        self.stats.pages_fetched += 1

        resolved_url_lower = fetched.resolved_url.lower()
        if resolved_url_lower.endswith((".pdf", ".txt")):
            # Binary/plain files are streamed to disk, not converted.
            self._download_file(fetched.resolved_url)
            return fetched

        html = fetched.html
        if self.config.remove_header_footer:
            html = prune_header_footer(html)

        html_path = url_to_output_path(fetched.resolved_url, self.config.raw_html_dir, "html")
        self._write_text_file(html_path, html)
        self.stats.html_files_saved += 1

        if self.config.markdown_convert:
            markdown_path = url_to_output_path(fetched.resolved_url, self.config.markdown_dir, "md")
            self._write_text_file(markdown_path, to_markdown(html))
            self.stats.markdown_files_saved += 1

        if self.config.time_delay > 0:
            self.sleeper(self.config.time_delay)

        return fetched

    def _download_file(self, url: str) -> None:
        """Stream a non-HTML file (e.g. .pdf/.txt) into the raw HTML directory."""
        from urllib.parse import urlparse

        parsed = urlparse(url)
        path = parsed.path
        if not path or path == "/":
            path = "/index"

        file_path = self.config.raw_html_dir / parsed.netloc / Path(path.lstrip("/"))
        file_path.parent.mkdir(parents=True, exist_ok=True)

        # timeout keeps a stalled server from hanging the whole crawl;
        # the context manager guarantees the connection is released.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

    @staticmethod
    def _write_text_file(path: Path, content: str) -> None:
        """Write *content* as UTF-8 at *path*, creating parent directories."""
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(content, encoding="utf-8")
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markdown_webscraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scrape websites to raw HTML with Botasaurus and convert to Markdown with markdownify.
|
|
5
|
+
Author: markdown_webscraper contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://pypi.org/project/markdown_webscraper/
|
|
8
|
+
Project-URL: Source, https://github.com/your-org/markdown_webscraper
|
|
9
|
+
Project-URL: Issues, https://github.com/your-org/markdown_webscraper/issues
|
|
10
|
+
Keywords: webscraping,markdown,botasaurus,html
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: botasaurus>=4.0.97
|
|
20
|
+
Requires-Dist: markdownify>=1.2.2
|
|
21
|
+
Requires-Dist: beautifulsoup4>=4.14.3
|
|
22
|
+
Provides-Extra: test
|
|
23
|
+
Requires-Dist: pytest>=9.0.3; extra == "test"
|
|
24
|
+
Requires-Dist: pytest-mock>=3.15.1; extra == "test"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# markdown_webscraper
|
|
28
|
+
|
|
29
|
+
Scrape websites with `botasaurus`, save raw `.html`, then convert to `.md` with `markdownify`.
|
|
30
|
+
|
|
31
|
+
## API Reference
|
|
32
|
+
|
|
33
|
+
### Core Classes
|
|
34
|
+
|
|
35
|
+
#### `markdown_webscraper.WebsiteScraper`
|
|
36
|
+
The main class for running the scraping process.
|
|
37
|
+
|
|
38
|
+
**Constructor:**
|
|
39
|
+
`WebsiteScraper(config: ScraperConfig, fetcher: PageFetcher | None = None, sleeper: Callable[[float], None] = time.sleep)`
|
|
40
|
+
|
|
41
|
+
* `config`: A `ScraperConfig` object containing scraping parameters.
|
|
42
|
+
* `fetcher`: An optional implementation of `PageFetcher`. Defaults to `BotasaurusFetcher`.
|
|
43
|
+
* `sleeper`: A function to handle time delays. Defaults to `time.sleep`.
|
|
44
|
+
|
|
45
|
+
**Methods:**
|
|
46
|
+
* `run() -> CrawlStats`: Starts the scraping process based on the provided configuration. Returns `CrawlStats` containing the results.
|
|
47
|
+
|
|
48
|
+
#### `markdown_webscraper.ScraperConfig`
|
|
49
|
+
A dataclass representing the scraper configuration.
|
|
50
|
+
|
|
51
|
+
**Attributes:**
|
|
52
|
+
* `raw_html_dir (Path)`: Directory to save raw HTML files.
|
|
53
|
+
* `markdown_dir (Path)`: Directory to save converted Markdown files.
|
|
54
|
+
* `wildcard_websites (list[str])`: List of root URLs for recursive scraping.
|
|
55
|
+
* `individual_websites (list[str])`: List of specific URLs to scrape.
|
|
56
|
+
* `remove_header_footer (bool)`: Whether to prune `<header>` and `<footer>` tags.
|
|
57
|
+
* `markdown_convert (bool)`: Whether to convert HTML to Markdown.
|
|
58
|
+
* `time_delay (float)`: Delay between requests in seconds.
|
|
59
|
+
* `total_timeout (float)`: Maximum time in seconds for the entire scraping process.
|
|
60
|
+
|
|
61
|
+
#### `markdown_webscraper.CrawlStats`
|
|
62
|
+
A dataclass containing statistics from a completed crawl.
|
|
63
|
+
|
|
64
|
+
**Attributes:**
|
|
65
|
+
* `pages_fetched (int)`: Total number of pages requested.
|
|
66
|
+
* `html_files_saved (int)`: Total number of HTML files written to disk.
|
|
67
|
+
* `markdown_files_saved (int)`: Total number of Markdown files written to disk.
|
|
68
|
+
|
|
69
|
+
### Utilities
|
|
70
|
+
|
|
71
|
+
#### `markdown_webscraper.load_config(config_path: str | Path) -> ScraperConfig`
|
|
72
|
+
Loads a `ScraperConfig` from a JSON file.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Usage Example
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from pathlib import Path
|
|
80
|
+
from markdown_webscraper import WebsiteScraper, load_config
|
|
81
|
+
|
|
82
|
+
# Load configuration from a JSON file
|
|
83
|
+
config = load_config("config.json")
|
|
84
|
+
|
|
85
|
+
# Initialize and run the scraper
|
|
86
|
+
scraper = WebsiteScraper(config=config)
|
|
87
|
+
stats = scraper.run()
|
|
88
|
+
|
|
89
|
+
print(f"Scraped {stats.pages_fetched} pages.")
|
|
90
|
+
print(f"Saved {stats.markdown_files_saved} markdown files.")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## Local Development
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
python3 -m venv .venv
|
|
99
|
+
. .venv/bin/activate
|
|
100
|
+
pip install -r requirements.txt
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Run with local script:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
python scrape.py --config config.json
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Run as installed package CLI:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
markdown-webscraper --config config.json
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Configuration
|
|
116
|
+
|
|
117
|
+
The CLI expects a JSON config file:
|
|
118
|
+
|
|
119
|
+
```json
|
|
120
|
+
{
|
|
121
|
+
"raw_html_dir": "/home/brosnan/markdown_webscraper/raw_html/",
|
|
122
|
+
"markdown_dir": "/home/brosnan/markdown_webscraper/markdown/",
|
|
123
|
+
"wildcard_websites": ["https://www.allaboutcircuits.com/textbook"],
|
|
124
|
+
"individual_websites": ["https://example.com/", "https://www.ti.com/lit/ds/sprs590g/sprs590g.pdf"],
|
|
125
|
+
"remove_header_footer": true,
|
|
126
|
+
"markdown_convert": true,
|
|
127
|
+
"time_delay": 2,
|
|
128
|
+
"total_timeout": 180
|
|
129
|
+
}
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Tests
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
pytest tests/unit -q
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Integration example.com:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
RUN_INTEGRATION=1 pytest tests/integration/test_live_scrape.py::test_integration_example_com -m integration -q
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Integration allaboutcircuits textbook:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
RUN_INTEGRATION=1 RUN_FULL_TEXTBOOK_INTEGRATION=1 pytest tests/integration/test_live_scrape.py::test_integration_allaboutcircuits_textbook_recursive -m integration -q
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Build and Publish to PyPI
|
|
151
|
+
|
|
152
|
+
1. Update version in `pyproject.toml`.
|
|
153
|
+
2. Build distributions:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
python -m pip install --upgrade build twine
|
|
157
|
+
python -m build
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
3. Check artifacts:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
python -m twine check dist/*
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
4. Upload:
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
python -m twine upload dist/*
|
|
170
|
+
```
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
markdown_webscraper/__init__.py,sha256=WVRWOcR5Isv75UupBahGyBEDPWv_qsUIrFYakSx9jNE,172
|
|
2
|
+
markdown_webscraper/cli.py,sha256=oT7cY0t_Y2Mh8aq3ycARq-ri6WMxLwie_43llP6wrI4,712
|
|
3
|
+
markdown_webscraper/config.py,sha256=6Oso_8SgOvpnOr1mDAESK8gOrUT4FUVRnxRSbE0AHE0,1218
|
|
4
|
+
markdown_webscraper/fetcher.py,sha256=Jv0q8F7P0ywrQ5PP7IcvY_0FsEVWp9d7n53dRr933lw,1517
|
|
5
|
+
markdown_webscraper/html_utils.py,sha256=YPEuNJtvQe4Kmxcwxp6NjU86NXsOja-vRNgkUbmJLwg,2261
|
|
6
|
+
markdown_webscraper/pipeline.py,sha256=gOLg70gNsjcmhWobNE-CuAstUUnx-Zh2XXSdhBk_nrE,4581
|
|
7
|
+
markdown_webscraper-0.1.0.dist-info/licenses/LICENSE,sha256=npHRLyaZCtBy1kAuEABAeB5iW5u4svkjgIJ6Fg3Yr0c,1089
|
|
8
|
+
markdown_webscraper-0.1.0.dist-info/METADATA,sha256=Wpa69lRM_Xnw8itU1TwJGMUVi1P4rcqLaGxSr53eBJc,4825
|
|
9
|
+
markdown_webscraper-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
markdown_webscraper-0.1.0.dist-info/entry_points.txt,sha256=gjuNjkX3BZMUysDrCGC-l6PT0QUgWQgOX_Jx3o7BaXI,69
|
|
11
|
+
markdown_webscraper-0.1.0.dist-info/top_level.txt,sha256=EnXORQP0EnKsHpmwLWZGKCooprvkJE_ouP0OtI2y-d0,20
|
|
12
|
+
markdown_webscraper-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 markdown_webscraper contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
markdown_webscraper
|