markdown-webscraper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 markdown_webscraper contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,170 @@
1
+ Metadata-Version: 2.4
2
+ Name: markdown_webscraper
3
+ Version: 0.1.0
4
+ Summary: Scrape websites to raw HTML with Botasaurus and convert to Markdown with markdownify.
5
+ Author: markdown_webscraper contributors
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://pypi.org/project/markdown_webscraper/
8
+ Project-URL: Source, https://github.com/your-org/markdown_webscraper
9
+ Project-URL: Issues, https://github.com/your-org/markdown_webscraper/issues
10
+ Keywords: webscraping,markdown,botasaurus,html
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Topic :: Internet :: WWW/HTTP
15
+ Classifier: Topic :: Text Processing :: Markup
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: botasaurus>=4.0.97
20
+ Requires-Dist: markdownify>=1.2.2
21
+ Requires-Dist: beautifulsoup4>=4.14.3
22
+ Provides-Extra: test
23
+ Requires-Dist: pytest>=9.0.3; extra == "test"
24
+ Requires-Dist: pytest-mock>=3.15.1; extra == "test"
25
+ Dynamic: license-file
26
+
27
+ # markdown_webscraper
28
+
29
+ Scrape websites with `botasaurus`, save raw `.html`, then convert to `.md` with `markdownify`.
30
+
31
+ ## API Reference
32
+
33
+ ### Core Classes
34
+
35
+ #### `markdown_webscraper.WebsiteScraper`
36
+ The main class for running the scraping process.
37
+
38
+ **Constructor:**
39
+ `WebsiteScraper(config: ScraperConfig, fetcher: PageFetcher | None = None, sleeper: Callable[[float], None] = time.sleep)`
40
+
41
+ * `config`: A `ScraperConfig` object containing scraping parameters.
42
+ * `fetcher`: An optional implementation of `PageFetcher`. Defaults to `BotasaurusFetcher`.
43
+ * `sleeper`: A function to handle time delays. Defaults to `time.sleep`.
44
+
45
+ **Methods:**
46
+ * `run() -> CrawlStats`: Starts the scraping process based on the provided configuration. Returns `CrawlStats` containing the results.
47
+
48
+ #### `markdown_webscraper.ScraperConfig`
49
+ A dataclass representing the scraper configuration.
50
+
51
+ **Attributes:**
52
+ * `raw_html_dir (Path)`: Directory to save raw HTML files.
53
+ * `markdown_dir (Path)`: Directory to save converted Markdown files.
54
+ * `wildcard_websites (list[str])`: List of root URLs for recursive scraping.
55
+ * `individual_websites (list[str])`: List of specific URLs to scrape.
56
+ * `remove_header_footer (bool)`: Whether to prune `<header>` and `<footer>` tags.
57
+ * `markdown_convert (bool)`: Whether to convert HTML to Markdown.
58
+ * `time_delay (float)`: Delay between requests in seconds.
59
+ * `total_timeout (float)`: Maximum time in seconds for the entire scraping process.
60
+
61
+ #### `markdown_webscraper.CrawlStats`
62
+ A dataclass containing statistics from a completed crawl.
63
+
64
+ **Attributes:**
65
+ * `pages_fetched (int)`: Total number of pages requested.
66
+ * `html_files_saved (int)`: Total number of HTML files written to disk.
67
+ * `markdown_files_saved (int)`: Total number of Markdown files written to disk.
68
+
69
+ ### Utilities
70
+
71
+ #### `markdown_webscraper.load_config(config_path: str | Path) -> ScraperConfig`
72
+ Loads a `ScraperConfig` from a JSON file.
73
+
74
+ ---
75
+
76
+ ## Usage Example
77
+
78
+ ```python
79
+ from pathlib import Path
80
+ from markdown_webscraper import WebsiteScraper, load_config
81
+
82
+ # Load configuration from a JSON file
83
+ config = load_config("config.json")
84
+
85
+ # Initialize and run the scraper
86
+ scraper = WebsiteScraper(config=config)
87
+ stats = scraper.run()
88
+
89
+ print(f"Scraped {stats.pages_fetched} pages.")
90
+ print(f"Saved {stats.markdown_files_saved} markdown files.")
91
+ ```
92
+
93
+ ---
94
+
95
+ ## Local Development
96
+
97
+ ```bash
98
+ python3 -m venv .venv
99
+ . .venv/bin/activate
100
+ pip install -r requirements.txt
101
+ ```
102
+
103
+ Run with local script:
104
+
105
+ ```bash
106
+ python scrape.py --config config.json
107
+ ```
108
+
109
+ Run as installed package CLI:
110
+
111
+ ```bash
112
+ markdown-webscraper --config config.json
113
+ ```
114
+
115
+ ## Configuration
116
+
117
+ The CLI expects a JSON config file:
118
+
119
+ ```json
120
+ {
121
+ "raw_html_dir": "/home/brosnan/markdown_webscraper/raw_html/",
122
+ "markdown_dir": "/home/brosnan/markdown_webscraper/markdown/",
123
+ "wildcard_websites": ["https://www.allaboutcircuits.com/textbook", ""],
124
+ "individual_websites": ["https://example.com/", "https://www.ti.com/lit/ds/sprs590g/sprs590g.pdf"],
125
+ "remove_header_footer": true,
126
+ "markdown_convert": true,
127
+ "time_delay": 2,
128
+ "total_timeout": 180
129
+ }
130
+ ```
131
+
132
+ ## Tests
133
+
134
+ ```bash
135
+ pytest tests/unit -q
136
+ ```
137
+
138
+ Integration example.com:
139
+
140
+ ```bash
141
+ RUN_INTEGRATION=1 pytest tests/integration/test_live_scrape.py::test_integration_example_com -m integration -q
142
+ ```
143
+
144
+ Integration allaboutcircuits textbook:
145
+
146
+ ```bash
147
+ RUN_INTEGRATION=1 RUN_FULL_TEXTBOOK_INTEGRATION=1 pytest tests/integration/test_live_scrape.py::test_integration_allaboutcircuits_textbook_recursive -m integration -q
148
+ ```
149
+
150
+ ## Build and Publish to PyPI
151
+
152
+ 1. Update version in `pyproject.toml`.
153
+ 2. Build distributions:
154
+
155
+ ```bash
156
+ python -m pip install --upgrade build twine
157
+ python -m build
158
+ ```
159
+
160
+ 3. Check artifacts:
161
+
162
+ ```bash
163
+ python -m twine check dist/*
164
+ ```
165
+
166
+ 4. Upload:
167
+
168
+ ```bash
169
+ python -m twine upload dist/*
170
+ ```
@@ -0,0 +1,144 @@
1
+ # markdown_webscraper
2
+
3
+ Scrape websites with `botasaurus`, save raw `.html`, then convert to `.md` with `markdownify`.
4
+
5
+ ## API Reference
6
+
7
+ ### Core Classes
8
+
9
+ #### `markdown_webscraper.WebsiteScraper`
10
+ The main class for running the scraping process.
11
+
12
+ **Constructor:**
13
+ `WebsiteScraper(config: ScraperConfig, fetcher: PageFetcher | None = None, sleeper: Callable[[float], None] = time.sleep)`
14
+
15
+ * `config`: A `ScraperConfig` object containing scraping parameters.
16
+ * `fetcher`: An optional implementation of `PageFetcher`. Defaults to `BotasaurusFetcher`.
17
+ * `sleeper`: A function to handle time delays. Defaults to `time.sleep`.
18
+
19
+ **Methods:**
20
+ * `run() -> CrawlStats`: Starts the scraping process based on the provided configuration. Returns `CrawlStats` containing the results.
21
+
22
+ #### `markdown_webscraper.ScraperConfig`
23
+ A dataclass representing the scraper configuration.
24
+
25
+ **Attributes:**
26
+ * `raw_html_dir (Path)`: Directory to save raw HTML files.
27
+ * `markdown_dir (Path)`: Directory to save converted Markdown files.
28
+ * `wildcard_websites (list[str])`: List of root URLs for recursive scraping.
29
+ * `individual_websites (list[str])`: List of specific URLs to scrape.
30
+ * `remove_header_footer (bool)`: Whether to prune `<header>` and `<footer>` tags.
31
+ * `markdown_convert (bool)`: Whether to convert HTML to Markdown.
32
+ * `time_delay (float)`: Delay between requests in seconds.
33
+ * `total_timeout (float)`: Maximum time in seconds for the entire scraping process.
34
+
35
+ #### `markdown_webscraper.CrawlStats`
36
+ A dataclass containing statistics from a completed crawl.
37
+
38
+ **Attributes:**
39
+ * `pages_fetched (int)`: Total number of pages requested.
40
+ * `html_files_saved (int)`: Total number of HTML files written to disk.
41
+ * `markdown_files_saved (int)`: Total number of Markdown files written to disk.
42
+
43
+ ### Utilities
44
+
45
+ #### `markdown_webscraper.load_config(config_path: str | Path) -> ScraperConfig`
46
+ Loads a `ScraperConfig` from a JSON file.
47
+
48
+ ---
49
+
50
+ ## Usage Example
51
+
52
+ ```python
53
+ from pathlib import Path
54
+ from markdown_webscraper import WebsiteScraper, load_config
55
+
56
+ # Load configuration from a JSON file
57
+ config = load_config("config.json")
58
+
59
+ # Initialize and run the scraper
60
+ scraper = WebsiteScraper(config=config)
61
+ stats = scraper.run()
62
+
63
+ print(f"Scraped {stats.pages_fetched} pages.")
64
+ print(f"Saved {stats.markdown_files_saved} markdown files.")
65
+ ```
66
+
67
+ ---
68
+
69
+ ## Local Development
70
+
71
+ ```bash
72
+ python3 -m venv .venv
73
+ . .venv/bin/activate
74
+ pip install -r requirements.txt
75
+ ```
76
+
77
+ Run with local script:
78
+
79
+ ```bash
80
+ python scrape.py --config config.json
81
+ ```
82
+
83
+ Run as installed package CLI:
84
+
85
+ ```bash
86
+ markdown-webscraper --config config.json
87
+ ```
88
+
89
+ ## Configuration
90
+
91
+ The CLI expects a JSON config file:
92
+
93
+ ```json
94
+ {
95
+ "raw_html_dir": "/home/brosnan/markdown_webscraper/raw_html/",
96
+ "markdown_dir": "/home/brosnan/markdown_webscraper/markdown/",
97
+ "wildcard_websites": ["https://www.allaboutcircuits.com/textbook", ""],
98
+ "individual_websites": ["https://example.com/", "https://www.ti.com/lit/ds/sprs590g/sprs590g.pdf"],
99
+ "remove_header_footer": true,
100
+ "markdown_convert": true,
101
+ "time_delay": 2,
102
+ "total_timeout": 180
103
+ }
104
+ ```
105
+
106
+ ## Tests
107
+
108
+ ```bash
109
+ pytest tests/unit -q
110
+ ```
111
+
112
+ Integration example.com:
113
+
114
+ ```bash
115
+ RUN_INTEGRATION=1 pytest tests/integration/test_live_scrape.py::test_integration_example_com -m integration -q
116
+ ```
117
+
118
+ Integration allaboutcircuits textbook:
119
+
120
+ ```bash
121
+ RUN_INTEGRATION=1 RUN_FULL_TEXTBOOK_INTEGRATION=1 pytest tests/integration/test_live_scrape.py::test_integration_allaboutcircuits_textbook_recursive -m integration -q
122
+ ```
123
+
124
+ ## Build and Publish to PyPI
125
+
126
+ 1. Update version in `pyproject.toml`.
127
+ 2. Build distributions:
128
+
129
+ ```bash
130
+ python -m pip install --upgrade build twine
131
+ python -m build
132
+ ```
133
+
134
+ 3. Check artifacts:
135
+
136
+ ```bash
137
+ python -m twine check dist/*
138
+ ```
139
+
140
+ 4. Upload:
141
+
142
+ ```bash
143
+ python -m twine upload dist/*
144
+ ```
@@ -0,0 +1,4 @@
1
"""Public package API: configuration loading and the crawl pipeline."""

from .config import ScraperConfig, load_config
from .pipeline import CrawlStats, WebsiteScraper

__all__ = ["ScraperConfig", "load_config", "CrawlStats", "WebsiteScraper"]
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+
5
+ from . import WebsiteScraper, load_config
6
+
7
+
8
def parse_args() -> argparse.Namespace:
    """Build the CLI parser and parse ``sys.argv`` into a Namespace."""
    cli = argparse.ArgumentParser(description="Scrape websites to HTML and Markdown.")
    cli.add_argument(
        "--config",
        default="config.json",
        help="Path to config.json (default: config.json)",
    )
    return cli.parse_args()
16
+
17
+
18
def main() -> None:
    """CLI entry point: load the config, run the scraper, print totals."""
    options = parse_args()
    scraper = WebsiteScraper(load_config(options.config))
    stats = scraper.run()
    summary = (
        f"Done. pages_fetched={stats.pages_fetched}, "
        f"html_files_saved={stats.html_files_saved}, "
        f"markdown_files_saved={stats.markdown_files_saved}"
    )
    print(summary)
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+
8
@dataclass(frozen=True)
class ScraperConfig:
    """Immutable scraper settings, typically produced by ``load_config()``."""

    # Directory where raw .html files (and downloaded .pdf/.txt) are written.
    raw_html_dir: Path
    # Directory where converted .md files are written.
    markdown_dir: Path
    # Root URLs crawled recursively (same host, same path prefix).
    wildcard_websites: list[str]
    # Specific URLs fetched exactly once each.
    individual_websites: list[str]
    # When True, <header>/<footer> tags are pruned before saving.
    remove_header_footer: bool
    # When True, each saved HTML page is also converted to Markdown.
    markdown_convert: bool
    # Seconds to sleep between page fetches.
    time_delay: float
    # Overall crawl budget in seconds; 0 disables the limit.
    total_timeout: float
18
+
19
+
20
+ def _clean_url_list(urls: list[str]) -> list[str]:
21
+ return [url.strip() for url in urls if isinstance(url, str) and url.strip()]
22
+
23
+
24
def load_config(config_path: str | Path) -> ScraperConfig:
    """Load a :class:`ScraperConfig` from a JSON file.

    Args:
        config_path: Path to the JSON configuration file.

    Returns:
        A populated ``ScraperConfig``. Optional keys fall back to defaults;
        URL lists are stripped of blank and non-string entries.

    Raises:
        KeyError: if a required key ("raw_html_dir" or "markdown_dir") is missing.
        json.JSONDecodeError: if the file is not valid JSON.
        OSError: if the file cannot be read.
    """
    path = Path(config_path)
    with path.open("r", encoding="utf-8") as infile:
        raw = json.load(infile)

    return ScraperConfig(
        raw_html_dir=Path(raw["raw_html_dir"]),
        markdown_dir=Path(raw["markdown_dir"]),
        wildcard_websites=_clean_url_list(raw.get("wildcard_websites", [])),
        individual_websites=_clean_url_list(raw.get("individual_websites", [])),
        remove_header_footer=bool(raw.get("remove_header_footer", False)),
        markdown_convert=bool(raw.get("markdown_convert", True)),
        time_delay=float(raw.get("time_delay", 0)),
        total_timeout=float(raw.get("total_timeout", 0)),
    )
@@ -0,0 +1,64 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Protocol
5
+
6
+ from botasaurus.browser import Driver, browser
7
+
8
+
9
@dataclass(frozen=True)
class FetchedPage:
    """Immutable result of fetching a single page."""

    # The URL that was asked for.
    requested_url: str
    # The URL the driver reports after navigation (may differ from requested_url).
    resolved_url: str
    # Full page source as returned by the driver.
    html: str
    # All link hrefs collected from the page.
    links: list[str]
15
+
16
+
17
class PageFetcher(Protocol):
    """Structural interface for anything that can fetch pages for the scraper."""

    def fetch(self, url: str) -> FetchedPage:
        """Retrieve ``url`` and return the page together with its links."""
        ...

    def close(self) -> None:
        """Release any underlying resources (e.g. a browser session)."""
        ...
23
+
24
+
25
@browser(
    headless=True,
    reuse_driver=True,
    close_on_crash=True,
    raise_exception=True,
    output=None,
)
def _fetch_with_botasaurus(driver: Driver, payload: dict[str, str]) -> dict:
    """Botasaurus browser task: load one URL and return its HTML and links.

    Args:
        driver: Browser driver injected by the ``@browser`` decorator.
        payload: Dict with a single "url" key naming the page to load.

    Returns:
        Dict with "requested_url", "resolved_url" (the driver's current URL
        after navigation), "html" (full page source), and "links" (all
        anchor hrefs found on the page).
    """
    url = payload["url"]
    driver.get(url)

    # Explicitly use human-like movement + click as required.
    driver.enable_human_mode()
    moved = driver.move_mouse_to_element("body")
    if moved:
        # skip_move=False keeps the humanized cursor path for the click too.
        driver.click("body", skip_move=False)

    html = driver.page_html
    links = driver.get_all_links()
    return {
        "requested_url": url,
        "resolved_url": driver.current_url,
        "html": html,
        "links": links,
    }
50
+
51
+
52
class BotasaurusFetcher:
    """``PageFetcher`` backed by the module-level Botasaurus browser task."""

    def fetch(self, url: str) -> FetchedPage:
        """Fetch ``url`` through the shared browser and wrap the result."""
        result = _fetch_with_botasaurus({"url": url})
        return FetchedPage(
            requested_url=result["requested_url"],
            resolved_url=result["resolved_url"],
            html=result["html"],
            links=list(result.get("links", [])),
        )

    def close(self) -> None:
        """Shut down the reused browser if the decorated task exposes close()."""
        closer = getattr(_fetch_with_botasaurus, "close", None)
        if closer is not None:
            closer()
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from urllib.parse import urljoin, urlsplit, urlunsplit
6
+
7
+ from bs4 import BeautifulSoup
8
+ from markdownify import markdownify as md
9
+
10
+
11
+ def normalize_url(url: str) -> str:
12
+ parsed = urlsplit(url.strip())
13
+ if parsed.scheme not in {"http", "https"}:
14
+ raise ValueError(f"Unsupported URL scheme: {url}")
15
+ path = parsed.path or "/"
16
+ return urlunsplit((parsed.scheme.lower(), parsed.netloc.lower(), path, parsed.query, ""))
17
+
18
+
19
def normalize_link(base_url: str, href: str) -> str | None:
    """Resolve ``href`` against ``base_url``; None for empty or non-http links."""
    if not href:
        return None
    resolved = urljoin(base_url, href)
    if urlsplit(resolved).scheme in {"http", "https"}:
        return normalize_url(resolved)
    return None
27
+
28
+
29
def is_within_scope(candidate_url: str, root_url: str) -> bool:
    """Return True when candidate is on the root's host under the root's path."""
    candidate = urlsplit(normalize_url(candidate_url))
    root = urlsplit(normalize_url(root_url))

    # Different host means out of scope regardless of path.
    if candidate.netloc != root.netloc:
        return False

    prefix = root.path or "/"
    if prefix == "/":
        return True
    if not prefix.endswith("/"):
        # Exact page, or anything nested beneath it.
        return candidate.path == prefix or candidate.path.startswith(f"{prefix}/")
    return candidate.path.startswith(prefix)
41
+
42
+
43
def prune_header_footer(html: str) -> str:
    """Strip every <header> and <footer> element from the document."""
    soup = BeautifulSoup(html, "html.parser")
    for tag_name in ("header", "footer"):
        for node in soup.find_all(tag_name):
            node.decompose()
    return str(soup)
48
+
49
+
50
def to_markdown(html: str) -> str:
    """Convert an HTML string to Markdown using ATX-style (#) headings."""
    return md(html, heading_style="ATX")
52
+
53
+
54
+ def _safe_segment(segment: str) -> str:
55
+ cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", segment)
56
+ return cleaned.strip("._") or "index"
57
+
58
+
59
def url_to_output_path(url: str, output_dir: Path, extension: str) -> Path:
    """Map a URL to a sanitized file path under ``output_dir``.

    The host becomes the top directory, path segments become subdirectories,
    and the query (if any) is folded into the file stem.
    """
    parsed = urlsplit(normalize_url(url))
    segments = [part for part in parsed.path.split("/") if part]
    if parsed.path.endswith("/") or not segments:
        segments.append("index")

    stem = _safe_segment(segments[-1])
    if parsed.query:
        stem = f"{stem}__q_{_safe_segment(parsed.query)}"

    subdirs = [_safe_segment(part) for part in segments[:-1]]
    return output_dir / _safe_segment(parsed.netloc) / Path(*subdirs) / f"{stem}.{extension}"
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ import requests
5
+ import signal
6
+ from collections import deque
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Callable
10
+
11
+ from .config import ScraperConfig
12
+ from .fetcher import BotasaurusFetcher, FetchedPage, PageFetcher
13
+ from .html_utils import (
14
+ is_within_scope,
15
+ normalize_link,
16
+ normalize_url,
17
+ prune_header_footer,
18
+ to_markdown,
19
+ url_to_output_path,
20
+ )
21
+
22
+
23
@dataclass
class CrawlStats:
    """Mutable counters accumulated over one ``WebsiteScraper.run()`` call."""

    # Total pages requested through the fetcher (HTML pages and file URLs).
    pages_fetched: int = 0
    # Raw HTML documents written under raw_html_dir.
    html_files_saved: int = 0
    # Markdown conversions written under markdown_dir.
    markdown_files_saved: int = 0
28
+
29
+
30
class WebsiteScraper:
    """Orchestrates the crawl: fetch pages, save HTML, optionally emit Markdown.

    Visits every URL in ``config.individual_websites`` once, then crawls each
    root in ``config.wildcard_websites`` breadth-first, staying within the
    root's host and path prefix (see ``is_within_scope``).
    """

    def __init__(
        self,
        config: ScraperConfig,
        fetcher: PageFetcher | None = None,
        sleeper: Callable[[float], None] = time.sleep,
    ) -> None:
        self.config = config
        self.fetcher = fetcher or BotasaurusFetcher()
        # Injectable so tests can avoid real delays.
        self.sleeper = sleeper
        self.stats = CrawlStats()
        # Normalized URLs (both requested and resolved forms) already processed.
        self._visited: set[str] = set()

    def _handle_timeout(self, signum, frame):
        """SIGALRM handler: abort the crawl once total_timeout elapses."""
        print("\nTotal timeout reached. Quitting...")
        # Raise instead of calling the site-module `exit()` builtin, which is
        # not guaranteed to exist in every interpreter; SystemExit unwinds
        # through run()'s finally block so the fetcher still gets closed.
        raise SystemExit(0)

    def run(self) -> CrawlStats:
        """Execute the configured crawl and return accumulated statistics."""
        self.config.raw_html_dir.mkdir(parents=True, exist_ok=True)
        self.config.markdown_dir.mkdir(parents=True, exist_ok=True)

        # SIGALRM is POSIX-only; skip the watchdog where it is unavailable
        # (e.g. Windows) instead of crashing at startup.
        use_alarm = self.config.total_timeout > 0 and hasattr(signal, "SIGALRM")
        if use_alarm:
            signal.signal(signal.SIGALRM, self._handle_timeout)
            signal.alarm(int(self.config.total_timeout))

        try:
            for url in self.config.individual_websites:
                self._scrape_one(url)

            for root_url in self.config.wildcard_websites:
                self._scrape_recursive(root_url)
        finally:
            if use_alarm:
                signal.alarm(0)
            self.fetcher.close()

        return self.stats

    def _scrape_recursive(self, root_url: str) -> None:
        """Breadth-first crawl from ``root_url``, limited by ``is_within_scope``."""
        queue: deque[str] = deque([normalize_url(root_url)])
        while queue:
            current = queue.popleft()
            if current in self._visited:
                continue
            if not is_within_scope(current, root_url):
                continue

            fetched = self._scrape_one(current)
            for href in fetched.links:
                child = normalize_link(fetched.resolved_url, href)
                if child and child not in self._visited and is_within_scope(child, root_url):
                    queue.append(child)

    def _scrape_one(self, url: str) -> FetchedPage:
        """Fetch one URL, persist its HTML/Markdown, and return the page.

        Already-visited URLs yield an empty placeholder page without a fetch.
        """
        normalized = normalize_url(url)
        if normalized in self._visited:
            return FetchedPage(normalized, normalized, "", [])

        fetched = self.fetcher.fetch(normalized)
        # Record both the requested and the resolved URL so a page reached
        # via redirect is not fetched a second time under either name.
        self._visited.add(normalized)
        self._visited.add(normalize_url(fetched.resolved_url))
        self.stats.pages_fetched += 1

        resolved_url_lower = fetched.resolved_url.lower()
        if resolved_url_lower.endswith((".pdf", ".txt")):
            # Binary/plain documents are streamed to disk, not parsed.
            self._download_file(fetched.resolved_url)
            return fetched

        html = fetched.html
        if self.config.remove_header_footer:
            html = prune_header_footer(html)

        html_path = url_to_output_path(fetched.resolved_url, self.config.raw_html_dir, "html")
        self._write_text_file(html_path, html)
        self.stats.html_files_saved += 1

        if self.config.markdown_convert:
            markdown_path = url_to_output_path(fetched.resolved_url, self.config.markdown_dir, "md")
            self._write_text_file(markdown_path, to_markdown(html))
            self.stats.markdown_files_saved += 1

        if self.config.time_delay > 0:
            self.sleeper(self.config.time_delay)

        return fetched

    def _download_file(self, url: str) -> None:
        """Stream a file (.pdf/.txt) into raw_html_dir, mirroring the URL path.

        Raises ``requests.HTTPError`` on a non-2xx response.
        """
        from urllib.parse import urlparse

        # A timeout prevents a stalled server from hanging the entire crawl;
        # the context manager guarantees the connection is released.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            parsed = urlparse(url)
            path = parsed.path
            if not path or path == "/":
                path = "/index"

            file_path = self.config.raw_html_dir / parsed.netloc / Path(path.lstrip("/"))
            file_path.parent.mkdir(parents=True, exist_ok=True)

            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

    @staticmethod
    def _write_text_file(path: Path, content: str) -> None:
        """Write text as UTF-8, creating parent directories as needed."""
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(content, encoding="utf-8")
@@ -0,0 +1,170 @@
1
+ Metadata-Version: 2.4
2
+ Name: markdown_webscraper
3
+ Version: 0.1.0
4
+ Summary: Scrape websites to raw HTML with Botasaurus and convert to Markdown with markdownify.
5
+ Author: markdown_webscraper contributors
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://pypi.org/project/markdown_webscraper/
8
+ Project-URL: Source, https://github.com/your-org/markdown_webscraper
9
+ Project-URL: Issues, https://github.com/your-org/markdown_webscraper/issues
10
+ Keywords: webscraping,markdown,botasaurus,html
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Topic :: Internet :: WWW/HTTP
15
+ Classifier: Topic :: Text Processing :: Markup
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: botasaurus>=4.0.97
20
+ Requires-Dist: markdownify>=1.2.2
21
+ Requires-Dist: beautifulsoup4>=4.14.3
22
+ Provides-Extra: test
23
+ Requires-Dist: pytest>=9.0.3; extra == "test"
24
+ Requires-Dist: pytest-mock>=3.15.1; extra == "test"
25
+ Dynamic: license-file
26
+
27
+ # markdown_webscraper
28
+
29
+ Scrape websites with `botasaurus`, save raw `.html`, then convert to `.md` with `markdownify`.
30
+
31
+ ## API Reference
32
+
33
+ ### Core Classes
34
+
35
+ #### `markdown_webscraper.WebsiteScraper`
36
+ The main class for running the scraping process.
37
+
38
+ **Constructor:**
39
+ `WebsiteScraper(config: ScraperConfig, fetcher: PageFetcher | None = None, sleeper: Callable[[float], None] = time.sleep)`
40
+
41
+ * `config`: A `ScraperConfig` object containing scraping parameters.
42
+ * `fetcher`: An optional implementation of `PageFetcher`. Defaults to `BotasaurusFetcher`.
43
+ * `sleeper`: A function to handle time delays. Defaults to `time.sleep`.
44
+
45
+ **Methods:**
46
+ * `run() -> CrawlStats`: Starts the scraping process based on the provided configuration. Returns `CrawlStats` containing the results.
47
+
48
+ #### `markdown_webscraper.ScraperConfig`
49
+ A dataclass representing the scraper configuration.
50
+
51
+ **Attributes:**
52
+ * `raw_html_dir (Path)`: Directory to save raw HTML files.
53
+ * `markdown_dir (Path)`: Directory to save converted Markdown files.
54
+ * `wildcard_websites (list[str])`: List of root URLs for recursive scraping.
55
+ * `individual_websites (list[str])`: List of specific URLs to scrape.
56
+ * `remove_header_footer (bool)`: Whether to prune `<header>` and `<footer>` tags.
57
+ * `markdown_convert (bool)`: Whether to convert HTML to Markdown.
58
+ * `time_delay (float)`: Delay between requests in seconds.
59
+ * `total_timeout (float)`: Maximum time in seconds for the entire scraping process.
60
+
61
+ #### `markdown_webscraper.CrawlStats`
62
+ A dataclass containing statistics from a completed crawl.
63
+
64
+ **Attributes:**
65
+ * `pages_fetched (int)`: Total number of pages requested.
66
+ * `html_files_saved (int)`: Total number of HTML files written to disk.
67
+ * `markdown_files_saved (int)`: Total number of Markdown files written to disk.
68
+
69
+ ### Utilities
70
+
71
+ #### `markdown_webscraper.load_config(config_path: str | Path) -> ScraperConfig`
72
+ Loads a `ScraperConfig` from a JSON file.
73
+
74
+ ---
75
+
76
+ ## Usage Example
77
+
78
+ ```python
79
+ from pathlib import Path
80
+ from markdown_webscraper import WebsiteScraper, load_config
81
+
82
+ # Load configuration from a JSON file
83
+ config = load_config("config.json")
84
+
85
+ # Initialize and run the scraper
86
+ scraper = WebsiteScraper(config=config)
87
+ stats = scraper.run()
88
+
89
+ print(f"Scraped {stats.pages_fetched} pages.")
90
+ print(f"Saved {stats.markdown_files_saved} markdown files.")
91
+ ```
92
+
93
+ ---
94
+
95
+ ## Local Development
96
+
97
+ ```bash
98
+ python3 -m venv .venv
99
+ . .venv/bin/activate
100
+ pip install -r requirements.txt
101
+ ```
102
+
103
+ Run with local script:
104
+
105
+ ```bash
106
+ python scrape.py --config config.json
107
+ ```
108
+
109
+ Run as installed package CLI:
110
+
111
+ ```bash
112
+ markdown-webscraper --config config.json
113
+ ```
114
+
115
+ ## Configuration
116
+
117
+ The CLI expects a JSON config file:
118
+
119
+ ```json
120
+ {
121
+ "raw_html_dir": "/home/brosnan/markdown_webscraper/raw_html/",
122
+ "markdown_dir": "/home/brosnan/markdown_webscraper/markdown/",
123
+ "wildcard_websites": ["https://www.allaboutcircuits.com/textbook", ""],
124
+ "individual_websites": ["https://example.com/", "https://www.ti.com/lit/ds/sprs590g/sprs590g.pdf"],
125
+ "remove_header_footer": true,
126
+ "markdown_convert": true,
127
+ "time_delay": 2,
128
+ "total_timeout": 180
129
+ }
130
+ ```
131
+
132
+ ## Tests
133
+
134
+ ```bash
135
+ pytest tests/unit -q
136
+ ```
137
+
138
+ Integration example.com:
139
+
140
+ ```bash
141
+ RUN_INTEGRATION=1 pytest tests/integration/test_live_scrape.py::test_integration_example_com -m integration -q
142
+ ```
143
+
144
+ Integration allaboutcircuits textbook:
145
+
146
+ ```bash
147
+ RUN_INTEGRATION=1 RUN_FULL_TEXTBOOK_INTEGRATION=1 pytest tests/integration/test_live_scrape.py::test_integration_allaboutcircuits_textbook_recursive -m integration -q
148
+ ```
149
+
150
+ ## Build and Publish to PyPI
151
+
152
+ 1. Update version in `pyproject.toml`.
153
+ 2. Build distributions:
154
+
155
+ ```bash
156
+ python -m pip install --upgrade build twine
157
+ python -m build
158
+ ```
159
+
160
+ 3. Check artifacts:
161
+
162
+ ```bash
163
+ python -m twine check dist/*
164
+ ```
165
+
166
+ 4. Upload:
167
+
168
+ ```bash
169
+ python -m twine upload dist/*
170
+ ```
@@ -0,0 +1,15 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ markdown_webscraper/__init__.py
5
+ markdown_webscraper/cli.py
6
+ markdown_webscraper/config.py
7
+ markdown_webscraper/fetcher.py
8
+ markdown_webscraper/html_utils.py
9
+ markdown_webscraper/pipeline.py
10
+ markdown_webscraper.egg-info/PKG-INFO
11
+ markdown_webscraper.egg-info/SOURCES.txt
12
+ markdown_webscraper.egg-info/dependency_links.txt
13
+ markdown_webscraper.egg-info/entry_points.txt
14
+ markdown_webscraper.egg-info/requires.txt
15
+ markdown_webscraper.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ markdown-webscraper = markdown_webscraper.cli:main
@@ -0,0 +1,7 @@
1
+ botasaurus>=4.0.97
2
+ markdownify>=1.2.2
3
+ beautifulsoup4>=4.14.3
4
+
5
+ [test]
6
+ pytest>=9.0.3
7
+ pytest-mock>=3.15.1
@@ -0,0 +1 @@
1
+ markdown_webscraper
@@ -0,0 +1,44 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "markdown_webscraper"
7
+ version = "0.1.0"
8
+ description = "Scrape websites to raw HTML with Botasaurus and convert to Markdown with markdownify."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "markdown_webscraper contributors" }
14
+ ]
15
+ keywords = ["webscraping", "markdown", "botasaurus", "html"]
16
+ classifiers = [
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3 :: Only",
19
+ "Operating System :: OS Independent",
20
+ "Topic :: Internet :: WWW/HTTP",
21
+ "Topic :: Text Processing :: Markup",
22
+ ]
23
dependencies = [
    "botasaurus>=4.0.97",
    "markdownify>=1.2.2",
    "beautifulsoup4>=4.14.3",
    "requests>=2.31.0",
]
28
+
29
+ [project.optional-dependencies]
30
+ test = [
31
+ "pytest>=9.0.3",
32
+ "pytest-mock>=3.15.1",
33
+ ]
34
+
35
+ [project.scripts]
36
+ markdown-webscraper = "markdown_webscraper.cli:main"
37
+
38
+ [project.urls]
39
+ Homepage = "https://pypi.org/project/markdown_webscraper/"
40
+ Source = "https://github.com/your-org/markdown_webscraper"
41
+ Issues = "https://github.com/your-org/markdown_webscraper/issues"
42
+
43
+ [tool.setuptools.packages.find]
44
+ include = ["markdown_webscraper*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+