sitewalker 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Neil Johnson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: sitewalker
3
+ Version: 0.2.0
4
+ Summary: Crawl a website and create a structured map of its pages
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Author: Neil Johnson
8
+ Author-email: neil@cadent.net
9
+ Requires-Python: >=3.9,<4.0
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
19
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
20
+ Requires-Dist: urllib3 (>=2.2.3,<3.0.0)
21
+ Description-Content-Type: text/markdown
22
+
23
+ # sitewalker
24
+
25
+ Crawl a website and create a structured map of its pages.
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pipx install sitewalker
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ```bash
36
+ # Map all pages on a site (single-level crawl)
37
+ sitewalker example.com
38
+
39
+ # Recursive crawl of all internal pages
40
+ sitewalker example.com -r
41
+
42
+ # Collect external links
43
+ sitewalker example.com -e
44
+
45
+ # Recursive crawl with external link collection
46
+ sitewalker example.com -r -e
47
+
48
+ # Only crawl web pages (skip images, PDFs, etc.)
49
+ sitewalker example.com -r -p
50
+
51
+ # Crawl an HTTP-only site (e.g., LAN staging server)
52
+ sitewalker http://staging.lan --allow-private
53
+
54
+ # Verbose output for debugging
55
+ sitewalker example.com -r -v
56
+ ```
57
+
58
+ The target accepts a bare domain (`example.com`) or a full URL (`http://example.com`). Bare domains default to HTTPS — if the connection fails, sitewalker exits with a message to provide the full URL.
59
+
60
+ ## Options
61
+
62
+ | Flag | Description | Default |
63
+ |------|-------------|---------|
64
+ | `-r`, `--recursive` | Recursively crawl internal links | Off |
65
+ | `-e`, `--external-links` | Collect external links | Off |
66
+ | `-p`, `--pages` | Only crawl web pages (HTML, PHP, etc.) | Off |
67
+ | `-v`, `--verbose` | Enable verbose/debug output | Off |
68
+ | `-t`, `--timeout` | Request timeout in seconds | 30 |
69
+ | `--max-pages` | Maximum number of pages to crawl | 1000 |
70
+ | `--max-depth` | Maximum crawl depth for recursive mode | 10 |
71
+ | `--allow-private` | Allow crawling domains that resolve to private IPs | Off |
72
+ | `--ignore-robots` | Ignore robots.txt rules | Off |
73
+
74
+ ## Output
75
+
76
+ Results are saved to a CSV file named `{domain}_{timestamp}.csv` with columns:
77
+
78
+ - **URL** — the page URL
79
+ - **Title** — the page's `<title>` tag content
80
+ - **Status Code** — HTTP response status
81
+
82
+ When using `-e`, external links are saved to a separate `{domain}_{timestamp}_external_links.csv`.
83
+
84
+ ## Security
85
+
86
+ - **SSRF protection**: Domains that resolve to private/reserved IP addresses are blocked by default. Use `--allow-private` to override for legitimate internal use.
87
+ - **robots.txt**: Respected by default. Use `--ignore-robots` to override.
88
+ - **CSV injection**: Output values are sanitized to prevent spreadsheet formula injection.
89
+ - **Crawl limits**: Recursive crawls are bounded by `--max-pages` and `--max-depth` to prevent resource exhaustion.
90
+
91
+ ## Roadmap
92
+
93
+ - `--format json` — JSON output format
94
+ - `--check-links` — broken link detection
95
+ - `--images --check-alt` — image inventory with alt text auditing
96
+
97
+ ## License
98
+
99
+ MIT
100
+
@@ -0,0 +1,77 @@
1
+ # sitewalker
2
+
3
+ Crawl a website and create a structured map of its pages.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pipx install sitewalker
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```bash
14
+ # Map all pages on a site (single-level crawl)
15
+ sitewalker example.com
16
+
17
+ # Recursive crawl of all internal pages
18
+ sitewalker example.com -r
19
+
20
+ # Collect external links
21
+ sitewalker example.com -e
22
+
23
+ # Recursive crawl with external link collection
24
+ sitewalker example.com -r -e
25
+
26
+ # Only crawl web pages (skip images, PDFs, etc.)
27
+ sitewalker example.com -r -p
28
+
29
+ # Crawl an HTTP-only site (e.g., LAN staging server)
30
+ sitewalker http://staging.lan --allow-private
31
+
32
+ # Verbose output for debugging
33
+ sitewalker example.com -r -v
34
+ ```
35
+
36
+ The target accepts a bare domain (`example.com`) or a full URL (`http://example.com`). Bare domains default to HTTPS — if the connection fails, sitewalker exits with a message to provide the full URL.
37
+
38
+ ## Options
39
+
40
+ | Flag | Description | Default |
41
+ |------|-------------|---------|
42
+ | `-r`, `--recursive` | Recursively crawl internal links | Off |
43
+ | `-e`, `--external-links` | Collect external links | Off |
44
+ | `-p`, `--pages` | Only crawl web pages (HTML, PHP, etc.) | Off |
45
+ | `-v`, `--verbose` | Enable verbose/debug output | Off |
46
+ | `-t`, `--timeout` | Request timeout in seconds | 30 |
47
+ | `--max-pages` | Maximum number of pages to crawl | 1000 |
48
+ | `--max-depth` | Maximum crawl depth for recursive mode | 10 |
49
+ | `--allow-private` | Allow crawling domains that resolve to private IPs | Off |
50
+ | `--ignore-robots` | Ignore robots.txt rules | Off |
51
+
52
+ ## Output
53
+
54
+ Results are saved to a CSV file named `{domain}_{timestamp}.csv` with columns:
55
+
56
+ - **URL** — the page URL
57
+ - **Title** — the page's `<title>` tag content
58
+ - **Status Code** — HTTP response status
59
+
60
+ When using `-e`, external links are saved to a separate `{domain}_{timestamp}_external_links.csv`.
61
+
62
+ ## Security
63
+
64
+ - **SSRF protection**: Domains that resolve to private/reserved IP addresses are blocked by default. Use `--allow-private` to override for legitimate internal use.
65
+ - **robots.txt**: Respected by default. Use `--ignore-robots` to override.
66
+ - **CSV injection**: Output values are sanitized to prevent spreadsheet formula injection.
67
+ - **Crawl limits**: Recursive crawls are bounded by `--max-pages` and `--max-depth` to prevent resource exhaustion.
68
+
69
+ ## Roadmap
70
+
71
+ - `--format json` — JSON output format
72
+ - `--check-links` — broken link detection
73
+ - `--images --check-alt` — image inventory with alt text auditing
74
+
75
+ ## License
76
+
77
+ MIT
@@ -0,0 +1,26 @@
1
+ [tool.poetry]
2
+ name = "sitewalker"
3
+ version = "0.2.0"
4
+ description = "Crawl a website and create a structured map of its pages"
5
+ authors = ["Neil Johnson <neil@cadent.net>"]
6
+ readme = "README.md"
7
+ license = "MIT"
8
+ packages = [{include = "sitewalker", from = "src"}]
9
+
10
+ [tool.poetry.scripts]
11
+ sitewalker = "sitewalker.cli:main"
12
+
13
+ [tool.poetry.dependencies]
14
+ python = "^3.9"
15
+ requests = "^2.32.3"
16
+ beautifulsoup4 = "^4.12.3"
17
+ urllib3 = "^2.2.3"
18
+
19
+ [tool.poetry.group.dev.dependencies]
20
+ pytest = "^8.3.4"
21
+ pytest-cov = "^6.0.0"
22
+ responses = "^0.25.3"
23
+
24
+ [build-system]
25
+ requires = ["poetry-core"]
26
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1 @@
1
+ """sitewalker — Crawl a website and create a structured map of its pages."""
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import argparse
5
+ import logging
6
+ from datetime import datetime
7
+ from urllib.parse import urlparse
8
+ import requests
9
+ from sitewalker.crawler import WebsiteCrawler
10
+
11
+
12
def setup_logging(verbose: bool):
    """Configure root logging: DEBUG when verbose, INFO otherwise."""
    level = logging.DEBUG if verbose else logging.INFO
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s',
            '%Y-%m-%d %H:%M:%S'
        )
    )
    root = logging.getLogger()
    root.addHandler(handler)
    root.setLevel(level)
27
+
28
+
29
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for sitewalker."""
    parser = argparse.ArgumentParser(
        description="Crawl a website and create a structured map of its pages"
    )
    parser.add_argument(
        "target",
        help="Domain or URL to crawl (e.g., example.com or http://example.com)"
    )
    parser.add_argument(
        "-e", "--external-links",
        action="store_true",
        help="Check for external links on the domain"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose output"
    )
    parser.add_argument(
        "-r", "--recursive",
        action="store_true",
        help="Recursively crawl internal links"
    )
    parser.add_argument(
        "-p", "--pages",
        action="store_true",
        help="Only crawl web pages (HTML, PHP, etc.) and skip other file types"
    )
    parser.add_argument(
        "-t", "--timeout",
        type=int,
        default=30,
        help="Request timeout in seconds (default: 30)"
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=1000,
        help="Maximum number of pages to crawl (default: 1000)"
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=10,
        help="Maximum crawl depth for recursive mode (default: 10)"
    )
    parser.add_argument(
        "--allow-private",
        action="store_true",
        help="Allow crawling domains that resolve to private/reserved IPs"
    )
    parser.add_argument(
        "--ignore-robots",
        action="store_true",
        help="Ignore robots.txt rules when crawling"
    )
    return parser


def _resolve_target(target: str) -> str:
    """Return a full URL for *target*.

    Full http/https URLs pass through unchanged. Bare domains are probed
    over HTTPS; if the connection fails, exit with a hint to supply the
    full (http) URL.
    """
    if urlparse(target).scheme in ('http', 'https'):
        return target
    probe_url = f"https://{target}"
    try:
        requests.head(probe_url, timeout=5, allow_redirects=True)
    except requests.ConnectionError:
        logging.error(
            f"Could not connect to {probe_url}\n"
            f"If this site uses HTTP, provide the full URL:\n"
            f"  sitewalker http://{target}"
        )
        sys.exit(1)
    return probe_url


def main():
    """CLI entry point: parse arguments, run the crawler, save CSV results."""
    args = _build_parser().parse_args()
    setup_logging(args.verbose)

    try:
        target = _resolve_target(args.target)

        # Build a filesystem-safe name from the host. ':' is also replaced:
        # the netloc may carry a port, and ':' is invalid in Windows filenames.
        netloc = urlparse(target).netloc
        safe_domain = (netloc.replace('/', '_').replace('\\', '_')
                       .replace(':', '_').replace('..', '_'))

        crawler = WebsiteCrawler(target, timeout=args.timeout,
                                 allow_private=args.allow_private,
                                 ignore_robots=args.ignore_robots)
        timestamp = datetime.now().strftime("%Y-%m-%dT%H%M")

        if args.external_links:
            crawler.crawl(
                collect_external=True,
                recursive=args.recursive,
                pages_only=args.pages,
                max_pages=args.max_pages,
                max_depth=args.max_depth
            )
            external_links_file = f"{safe_domain}_{timestamp}_external_links.csv"
            crawler.save_external_links_results(external_links_file)
            logging.info(f"External links saved to {external_links_file}")
        else:
            crawler.crawl(recursive=args.recursive, pages_only=args.pages,
                          max_pages=args.max_pages, max_depth=args.max_depth)
            output_file = f"{safe_domain}_{timestamp}.csv"
            crawler.save_results(output_file)
            logging.info(f"Crawling complete! Results saved to {output_file}")

        logging.info(f"Total pages crawled: {len(crawler.visited_urls)}")

    # SystemExit from _resolve_target passes through (it is a BaseException).
    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        sys.exit(1)
141
+
142
+
143
# coverage.py only honors "# pragma: no cover" on the line it appears on,
# so the pragma must sit on the guard itself, not on its own line above.
if __name__ == "__main__":  # pragma: no cover
    main()
@@ -0,0 +1,295 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import csv
4
+ import ipaddress
5
+ import socket
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from urllib.parse import urljoin, urlparse
9
+ from urllib.robotparser import RobotFileParser
10
+ from typing import Set, List, Tuple
11
+ import time
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # List of file extensions that are considered web pages
17
+ PAGE_EXTENSIONS = {
18
+ '', # for URLs ending in '/'
19
+ 'html',
20
+ 'htm',
21
+ 'php',
22
+ 'asp',
23
+ 'aspx',
24
+ 'jsp',
25
+ 'shtml',
26
+ 'phtml',
27
+ 'xhtml',
28
+ 'jspx',
29
+ 'do',
30
+ 'cfm',
31
+ 'cgi'
32
+ }
33
+
34
class URLProcessingError(Exception):
    """Raised when a URL cannot be processed: empty, malformed, or not http(s)."""
    pass
37
+
38
class CrawlingError(Exception):
    """Raised for crawl-level failures, e.g. a domain that cannot be resolved."""
    pass
41
+
42
class SSRFProtectionError(Exception):
    """Raised when a domain resolves to a private/reserved IP address."""
    pass
45
+
46
+
47
def validate_domain_ssrf(domain: str) -> None:
    """Check that a domain does not resolve to a private or reserved IP.

    Every address the domain resolves to is inspected. Loopback, private,
    link-local, reserved, multicast, and unspecified (0.0.0.0 / ::) ranges
    are all rejected.

    Raises:
        SSRFProtectionError: if any resolved address is non-public.
        CrawlingError: if the domain cannot be resolved at all.
    """
    try:
        results = socket.getaddrinfo(domain, None)
    except socket.gaierror as e:
        raise CrawlingError(f"Cannot resolve domain '{domain}': {e}") from e

    for family, _, _, _, sockaddr in results:
        # IPv6 sockaddrs may carry a zone suffix ("fe80::1%eth0") that
        # ipaddress.ip_address() rejects — strip it before parsing.
        addr = sockaddr[0].split('%', 1)[0]
        ip = ipaddress.ip_address(addr)
        if (ip.is_private or ip.is_loopback or ip.is_link_local
                or ip.is_reserved or ip.is_multicast or ip.is_unspecified):
            raise SSRFProtectionError(
                f"Domain '{domain}' resolves to private/reserved IP {ip}. "
                f"Use --allow-private to override."
            )
64
+
65
class WebsiteCrawler:
    """Crawl a website and collect (URL, title, status-code) records.

    Internal pages are visited (optionally recursively, bounded by
    max_pages/max_depth); external links can be collected separately.
    Results are written to CSV with values sanitized against spreadsheet
    formula injection.
    """

    # Identifies the crawler to servers and for robots.txt matching.
    USER_AGENT = 'Mozilla/5.0 (compatible; sitewalker/0.2.0; +https://github.com/cadentdev/sitewalker)'

    def __init__(self, target: str, timeout: int = 30, allow_private: bool = False,
                 ignore_robots: bool = False):
        """Initialize the crawler.

        Args:
            target: Full URL (http://example.com) or bare domain
                (example.com). Bare domains are assumed to use HTTPS.
            timeout: Per-request timeout in seconds.
            allow_private: Skip the SSRF check for domains that resolve to
                private/reserved IPs.
            ignore_robots: Do not fetch or honor robots.txt.

        Raises:
            SSRFProtectionError: domain resolves to a private IP and
                allow_private is False.
            CrawlingError: domain cannot be resolved.
            URLProcessingError: the resulting base URL is invalid.
        """
        # Parse target: accept full URL (http://example.com) or bare domain.
        parsed = urlparse(target)
        if parsed.scheme in ('http', 'https'):
            self.domain = parsed.netloc
            self.base_url = f"{parsed.scheme}://{parsed.netloc}"
        else:
            # Bare domain — assume HTTPS
            self.domain = target
            self.base_url = f"https://{target}"

        if not allow_private:
            validate_domain_ssrf(self.domain)

        # Normalize the base URL (drops query, fragment, trailing slash).
        self.base_url, _ = self.process_url(self.base_url)
        self.visited_urls: Set[str] = set()
        self.results: List[Tuple[str, str, int]] = []
        self.external_links: Set[str] = set()
        self.pages_only: bool = False
        self.timeout = timeout
        self.ignore_robots = ignore_robots
        self.robot_parser: RobotFileParser | None = None
        # Defaults so _crawl_page() is safe even if crawl() was never called
        # (previously these attributes only existed after crawl()).
        self.max_pages: int = 1000
        self.max_depth: int = 10
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.USER_AGENT})

    def process_url(self, url: str) -> Tuple[str, bool]:
        """Process and validate a URL.

        Returns:
            (cleaned_url, is_internal). The cleaned URL has query strings,
            fragments, and trailing slashes removed. is_internal is True
            when the URL's host is the target domain or a subdomain of it.

        Raises:
            URLProcessingError: if the URL is empty, malformed, or not
                http(s).
        """
        if not url:
            raise URLProcessingError("Empty URL")

        try:
            parsed_url = urlparse(url)
            if not parsed_url.scheme or not parsed_url.netloc:
                raise URLProcessingError("Invalid URL format")
            if parsed_url.scheme not in ('http', 'https'):
                raise URLProcessingError("Unsupported protocol")

            # Clean URL: drop fragments/query params and trailing slashes.
            path = parsed_url.path
            path = '' if (not path or path == '/') else path.rstrip('/')
            clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{path}"

            # Exact-host or subdomain match. The previous substring test
            # ("example.com" in netloc) wrongly matched hosts such as
            # "notexample.com" and "example.com.evil.com".
            host = (parsed_url.hostname or '').lower()
            base_host = self.domain.split(':', 1)[0].lower()
            is_internal = host == base_host or host.endswith('.' + base_host)

            return clean_url, is_internal

        except Exception as e:
            # Wrap everything (including our own raises above) so callers
            # only ever see URLProcessingError, as before.
            raise URLProcessingError(f"URL processing error: {str(e)}")

    def is_page(self, url: str) -> bool:
        """Return True if the URL looks like a web page (vs. a file asset)."""
        try:
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                return False

            path = parsed.path.rstrip('/')

            # Bare host or directory index ('/') is a page.
            if not path:
                return True

            # Only the last path segment can carry a file extension; a dot
            # in an earlier segment (e.g. "/v1.2/page") is not one.
            last_segment = path.rsplit('/', 1)[-1]
            if '.' in last_segment:
                ext = last_segment.rsplit('.', 1)[-1].lower()
                return ext in PAGE_EXTENSIONS

            # Extensionless URLs are considered pages.
            return True

        except Exception as e:
            logger.debug(f"Error checking if URL is page: {str(e)}")
            return False

    def _load_robots_txt(self) -> None:
        """Fetch and parse robots.txt for the target domain (no-op if ignored)."""
        if self.ignore_robots:
            return
        robots_url = f"{self.base_url}/robots.txt"
        try:
            resp = self.session.get(robots_url, timeout=self.timeout)
            if resp.status_code == 200:
                rp = RobotFileParser()
                rp.set_url(robots_url)
                rp.parse(resp.text.splitlines())
                self.robot_parser = rp
                logger.info(f"Loaded robots.txt from {robots_url}")
            else:
                logger.info(f"No robots.txt found at {robots_url} (HTTP {resp.status_code})")
        except Exception as e:
            # Best-effort: a missing/unreadable robots.txt must not stop the crawl.
            logger.warning(f"Could not load robots.txt from {robots_url}: {e}")
            self.robot_parser = None

    def _is_allowed_by_robots(self, url: str) -> bool:
        """Check if a URL is allowed by robots.txt rules."""
        if self.ignore_robots or self.robot_parser is None:
            return True
        return self.robot_parser.can_fetch(self.USER_AGENT, url)

    def crawl(self, collect_external: bool = False, recursive: bool = False,
              pages_only: bool = False, max_pages: int = 1000, max_depth: int = 10) -> None:
        """
        Crawl the website starting from the base URL.

        In non-recursive mode:
        - Crawls the base URL and follows internal links found on that page
        - Does not follow links found on subsequent pages

        In recursive mode:
        - Crawls the base URL and follows all internal links recursively
        - Continues until all reachable internal pages are visited, bounded
          by max_pages and max_depth
        """
        self.pages_only = pages_only
        self.max_pages = max_pages
        self.max_depth = max_depth
        self._load_robots_txt()
        logger.info(f"Starting crawl of {self.base_url}")
        logger.info(f"Mode: {'Recursive' if recursive else 'Single-level'} crawl, "
                    f"{'collecting' if collect_external else 'ignoring'} external links, "
                    f"{'pages only' if pages_only else 'all files'}, "
                    f"max_pages={max_pages}, max_depth={max_depth}")
        self._crawl_page(self.base_url, collect_external, recursive, depth=0)
        logger.info(f"Crawl complete. Visited {len(self.visited_urls)} pages")
        if collect_external:
            logger.info(f"Found {len(self.external_links)} unique external links")

    def _crawl_page(self, url: str, collect_external: bool, recursive: bool, depth: int = 0) -> None:
        """Crawl a single page, record its title/status, and process its links."""
        if len(self.visited_urls) >= self.max_pages:
            logger.info(f"Reached max_pages limit ({self.max_pages})")
            return
        if depth > self.max_depth:
            logger.debug(f"Reached max_depth limit ({self.max_depth}) at {url}")
            return
        try:
            clean_url, is_internal = self.process_url(url)
            if not is_internal or clean_url in self.visited_urls:
                return

            # Check robots.txt rules
            if not self._is_allowed_by_robots(clean_url):
                logger.debug(f"Blocked by robots.txt: {clean_url}")
                return

            # Skip non-page URLs if pages_only is True
            if self.pages_only and not self.is_page(clean_url):
                logger.debug(f"Skipping non-page URL: {clean_url}")
                return

            self.visited_urls.add(clean_url)
            logger.debug(f"Crawling {clean_url}")

            response = self.session.get(clean_url, timeout=self.timeout)
            response.raise_for_status()

            # Only feed HTML to BeautifulSoup; parsing binary payloads
            # (images, PDFs) wastes time and yields no links. A missing
            # Content-Type header is treated as HTML, as before.
            content_type = response.headers.get('Content-Type', 'text/html')
            if 'html' in content_type.lower():
                soup = BeautifulSoup(response.text, 'html.parser')

                # soup.title.string is None when <title> is empty or contains
                # markup, which previously crashed into the Error path;
                # get_text() is always a string.
                if soup.title:
                    title = soup.title.get_text(strip=True) or "No title"
                else:
                    title = "No title"
                self.results.append((clean_url, title, response.status_code))

                # Process links
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link['href'])
                    try:
                        next_clean_url, next_is_internal = self.process_url(next_url)

                        if next_is_internal and recursive:
                            if next_clean_url not in self.visited_urls:
                                self._crawl_page(next_clean_url, collect_external, recursive, depth + 1)
                        elif not next_is_internal and collect_external:
                            self.external_links.add(next_clean_url)

                    except URLProcessingError:
                        continue
            else:
                self.results.append((clean_url, "No title", response.status_code))

        except requests.HTTPError as e:
            logger.error(f"HTTP Error crawling {url}: {str(e)}")
            self.results.append((url, "Error", e.response.status_code))
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            self.results.append((url, "Error", 0))

        time.sleep(1)  # Be polite

    @staticmethod
    def _sanitize_csv_value(value: str) -> str:
        """Sanitize a value for safe CSV output.

        Prevents CSV injection by prefixing dangerous characters that
        spreadsheet applications interpret as formulas.
        """
        if isinstance(value, str) and value and value[0] in ('=', '+', '-', '@', '\t', '\r'):
            return "'" + value
        return value

    def save_results(self, output_file: str) -> None:
        """Save crawl results (URL, Title, Status Code) to a CSV file."""
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['URL', 'Title', 'Status Code'])
            for url, title, status in self.results:
                writer.writerow([
                    self._sanitize_csv_value(url),
                    self._sanitize_csv_value(title),
                    status
                ])
        logger.info(f"Results saved to {output_file}")

    def save_external_links_results(self, filename: str) -> None:
        """Save collected external links to a CSV file."""
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['External URL'])
            for url in sorted(self.external_links):
                writer.writerow([self._sanitize_csv_value(url)])
        # Previously logged the literal placeholder "(unknown)".
        logger.info(f"External links saved to {filename}")