sitewalker 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sitewalker-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Neil Johnson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sitewalker
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Crawl a website and create a structured map of its pages
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: Neil Johnson
|
|
8
|
+
Author-email: neil@cadent.net
|
|
9
|
+
Requires-Python: >=3.9,<4.0
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
|
19
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
|
20
|
+
Requires-Dist: urllib3 (>=2.2.3,<3.0.0)
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# sitewalker
|
|
24
|
+
|
|
25
|
+
Crawl a website and create a structured map of its pages.
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pipx install sitewalker
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# Map all pages on a site (single-level crawl)
|
|
37
|
+
sitewalker example.com
|
|
38
|
+
|
|
39
|
+
# Recursive crawl of all internal pages
|
|
40
|
+
sitewalker example.com -r
|
|
41
|
+
|
|
42
|
+
# Collect external links
|
|
43
|
+
sitewalker example.com -e
|
|
44
|
+
|
|
45
|
+
# Recursive crawl with external link collection
|
|
46
|
+
sitewalker example.com -r -e
|
|
47
|
+
|
|
48
|
+
# Only crawl web pages (skip images, PDFs, etc.)
|
|
49
|
+
sitewalker example.com -r -p
|
|
50
|
+
|
|
51
|
+
# Crawl an HTTP-only site (e.g., LAN staging server)
|
|
52
|
+
sitewalker http://staging.lan --allow-private
|
|
53
|
+
|
|
54
|
+
# Verbose output for debugging
|
|
55
|
+
sitewalker example.com -r -v
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The target accepts a bare domain (`example.com`) or a full URL (`http://example.com`). Bare domains default to HTTPS — if the connection fails, sitewalker exits with a message to provide the full URL.
|
|
59
|
+
|
|
60
|
+
## Options
|
|
61
|
+
|
|
62
|
+
| Flag | Description | Default |
|
|
63
|
+
|------|-------------|---------|
|
|
64
|
+
| `-r`, `--recursive` | Recursively crawl internal links | Off |
|
|
65
|
+
| `-e`, `--external-links` | Collect external links | Off |
|
|
66
|
+
| `-p`, `--pages` | Only crawl web pages (HTML, PHP, etc.) | Off |
|
|
67
|
+
| `-v`, `--verbose` | Enable verbose/debug output | Off |
|
|
68
|
+
| `-t`, `--timeout` | Request timeout in seconds | 30 |
|
|
69
|
+
| `--max-pages` | Maximum number of pages to crawl | 1000 |
|
|
70
|
+
| `--max-depth` | Maximum crawl depth for recursive mode | 10 |
|
|
71
|
+
| `--allow-private` | Allow crawling domains that resolve to private IPs | Off |
|
|
72
|
+
| `--ignore-robots` | Ignore robots.txt rules | Off |
|
|
73
|
+
|
|
74
|
+
## Output
|
|
75
|
+
|
|
76
|
+
Results are saved to a CSV file named `{domain}_{timestamp}.csv` with columns:
|
|
77
|
+
|
|
78
|
+
- **URL** — the page URL
|
|
79
|
+
- **Title** — the page's `<title>` tag content
|
|
80
|
+
- **Status Code** — HTTP response status
|
|
81
|
+
|
|
82
|
+
When using `-e`, external links are saved to a separate `{domain}_{timestamp}_external_links.csv`.
|
|
83
|
+
|
|
84
|
+
## Security
|
|
85
|
+
|
|
86
|
+
- **SSRF protection**: Domains that resolve to private/reserved IP addresses are blocked by default. Use `--allow-private` to override for legitimate internal use.
|
|
87
|
+
- **robots.txt**: Respected by default. Use `--ignore-robots` to override.
|
|
88
|
+
- **CSV injection**: Output values are sanitized to prevent spreadsheet formula injection.
|
|
89
|
+
- **Crawl limits**: Recursive crawls are bounded by `--max-pages` and `--max-depth` to prevent resource exhaustion.
|
|
90
|
+
|
|
91
|
+
## Roadmap
|
|
92
|
+
|
|
93
|
+
- `--format json` — JSON output format
|
|
94
|
+
- `--check-links` — broken link detection
|
|
95
|
+
- `--images --check-alt` — image inventory with alt text auditing
|
|
96
|
+
|
|
97
|
+
## License
|
|
98
|
+
|
|
99
|
+
MIT
|
|
100
|
+
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# sitewalker
|
|
2
|
+
|
|
3
|
+
Crawl a website and create a structured map of its pages.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pipx install sitewalker
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# Map all pages on a site (single-level crawl)
|
|
15
|
+
sitewalker example.com
|
|
16
|
+
|
|
17
|
+
# Recursive crawl of all internal pages
|
|
18
|
+
sitewalker example.com -r
|
|
19
|
+
|
|
20
|
+
# Collect external links
|
|
21
|
+
sitewalker example.com -e
|
|
22
|
+
|
|
23
|
+
# Recursive crawl with external link collection
|
|
24
|
+
sitewalker example.com -r -e
|
|
25
|
+
|
|
26
|
+
# Only crawl web pages (skip images, PDFs, etc.)
|
|
27
|
+
sitewalker example.com -r -p
|
|
28
|
+
|
|
29
|
+
# Crawl an HTTP-only site (e.g., LAN staging server)
|
|
30
|
+
sitewalker http://staging.lan --allow-private
|
|
31
|
+
|
|
32
|
+
# Verbose output for debugging
|
|
33
|
+
sitewalker example.com -r -v
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The target accepts a bare domain (`example.com`) or a full URL (`http://example.com`). Bare domains default to HTTPS — if the connection fails, sitewalker exits with a message to provide the full URL.
|
|
37
|
+
|
|
38
|
+
## Options
|
|
39
|
+
|
|
40
|
+
| Flag | Description | Default |
|
|
41
|
+
|------|-------------|---------|
|
|
42
|
+
| `-r`, `--recursive` | Recursively crawl internal links | Off |
|
|
43
|
+
| `-e`, `--external-links` | Collect external links | Off |
|
|
44
|
+
| `-p`, `--pages` | Only crawl web pages (HTML, PHP, etc.) | Off |
|
|
45
|
+
| `-v`, `--verbose` | Enable verbose/debug output | Off |
|
|
46
|
+
| `-t`, `--timeout` | Request timeout in seconds | 30 |
|
|
47
|
+
| `--max-pages` | Maximum number of pages to crawl | 1000 |
|
|
48
|
+
| `--max-depth` | Maximum crawl depth for recursive mode | 10 |
|
|
49
|
+
| `--allow-private` | Allow crawling domains that resolve to private IPs | Off |
|
|
50
|
+
| `--ignore-robots` | Ignore robots.txt rules | Off |
|
|
51
|
+
|
|
52
|
+
## Output
|
|
53
|
+
|
|
54
|
+
Results are saved to a CSV file named `{domain}_{timestamp}.csv` with columns:
|
|
55
|
+
|
|
56
|
+
- **URL** — the page URL
|
|
57
|
+
- **Title** — the page's `<title>` tag content
|
|
58
|
+
- **Status Code** — HTTP response status
|
|
59
|
+
|
|
60
|
+
When using `-e`, external links are saved to a separate `{domain}_{timestamp}_external_links.csv`.
|
|
61
|
+
|
|
62
|
+
## Security
|
|
63
|
+
|
|
64
|
+
- **SSRF protection**: Domains that resolve to private/reserved IP addresses are blocked by default. Use `--allow-private` to override for legitimate internal use.
|
|
65
|
+
- **robots.txt**: Respected by default. Use `--ignore-robots` to override.
|
|
66
|
+
- **CSV injection**: Output values are sanitized to prevent spreadsheet formula injection.
|
|
67
|
+
- **Crawl limits**: Recursive crawls are bounded by `--max-pages` and `--max-depth` to prevent resource exhaustion.
|
|
68
|
+
|
|
69
|
+
## Roadmap
|
|
70
|
+
|
|
71
|
+
- `--format json` — JSON output format
|
|
72
|
+
- `--check-links` — broken link detection
|
|
73
|
+
- `--images --check-alt` — image inventory with alt text auditing
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "sitewalker"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Crawl a website and create a structured map of its pages"
|
|
5
|
+
authors = ["Neil Johnson <neil@cadent.net>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
packages = [{include = "sitewalker", from = "src"}]
|
|
9
|
+
|
|
10
|
+
[tool.poetry.scripts]
|
|
11
|
+
sitewalker = "sitewalker.cli:main"
|
|
12
|
+
|
|
13
|
+
[tool.poetry.dependencies]
|
|
14
|
+
python = "^3.9"
|
|
15
|
+
requests = "^2.32.3"
|
|
16
|
+
beautifulsoup4 = "^4.12.3"
|
|
17
|
+
urllib3 = "^2.2.3"
|
|
18
|
+
|
|
19
|
+
[tool.poetry.group.dev.dependencies]
|
|
20
|
+
pytest = "^8.3.4"
|
|
21
|
+
pytest-cov = "^6.0.0"
|
|
22
|
+
responses = "^0.25.3"
|
|
23
|
+
|
|
24
|
+
[build-system]
|
|
25
|
+
requires = ["poetry-core"]
|
|
26
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""sitewalker — Crawl a website and create a structured map of its pages."""
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import argparse
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
import requests
|
|
9
|
+
from sitewalker.crawler import WebsiteCrawler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def setup_logging(verbose: bool) -> None:
    """Configure root-logger output based on verbosity.

    Safe to call more than once: the stream handler is only attached when
    the root logger has none, so repeated invocations (e.g. from tests)
    do not duplicate every log line.

    Args:
        verbose: log at DEBUG level when True, otherwise INFO.
    """
    root_logger = logging.getLogger()

    # Attach our handler only once — the original unconditionally added a
    # new StreamHandler per call, duplicating output on repeated calls.
    if not root_logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s',
            '%Y-%m-%d %H:%M:%S'
        ))
        root_logger.addHandler(handler)

    root_logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def main():
    """CLI entry point: parse arguments, crawl the target, write CSV output.

    Exits with status 1 if the HTTPS probe of a bare domain fails or if
    any error occurs during the crawl.
    """
    parser = argparse.ArgumentParser(
        description="Crawl a website and create a structured map of its pages"
    )
    parser.add_argument(
        "target",
        help="Domain or URL to crawl (e.g., example.com or http://example.com)"
    )
    parser.add_argument(
        "-e", "--external-links",
        action="store_true",
        help="Check for external links on the domain"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose output"
    )
    parser.add_argument(
        "-r", "--recursive",
        action="store_true",
        help="Recursively crawl internal links"
    )
    parser.add_argument(
        "-p", "--pages",
        action="store_true",
        help="Only crawl web pages (HTML, PHP, etc.) and skip other file types"
    )
    parser.add_argument(
        "-t", "--timeout",
        type=int,
        default=30,
        help="Request timeout in seconds (default: 30)"
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=1000,
        help="Maximum number of pages to crawl (default: 1000)"
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=10,
        help="Maximum crawl depth for recursive mode (default: 10)"
    )
    parser.add_argument(
        "--allow-private",
        action="store_true",
        help="Allow crawling domains that resolve to private/reserved IPs"
    )
    parser.add_argument(
        "--ignore-robots",
        action="store_true",
        help="Ignore robots.txt rules when crawling"
    )

    args = parser.parse_args()
    setup_logging(args.verbose)

    try:
        target = args.target
        parsed = urlparse(target)

        # Bare domain (no scheme): probe HTTPS before committing to it.
        if parsed.scheme not in ('http', 'https'):
            probe_url = f"https://{target}"
            try:
                requests.head(probe_url, timeout=5, allow_redirects=True)
                target = probe_url
            except requests.RequestException:
                # RequestException covers connection failures, SSL errors
                # AND timeouts — catching only ConnectionError (as before)
                # let timeouts fall through to the generic handler without
                # the actionable hint below.
                logging.error(
                    f"Could not connect to {probe_url}\n"
                    f"If this site uses HTTP, provide the full URL:\n"
                    f"  sitewalker http://{target}"
                )
                sys.exit(1)

        # Extract the netloc for a safe filename; strip path separators so
        # a hostile hostname cannot become a path traversal in the output.
        parsed = urlparse(target)
        safe_domain = parsed.netloc.replace('/', '_').replace('\\', '_').replace('..', '_')

        crawler = WebsiteCrawler(target, timeout=args.timeout,
                                 allow_private=args.allow_private,
                                 ignore_robots=args.ignore_robots)
        timestamp = datetime.now().strftime("%Y-%m-%dT%H%M")

        # Crawl once; collect externals only when requested. Previously the
        # -e branch skipped save_results entirely, so the main CSV promised
        # by the README was never written in that mode.
        crawler.crawl(
            collect_external=args.external_links,
            recursive=args.recursive,
            pages_only=args.pages,
            max_pages=args.max_pages,
            max_depth=args.max_depth
        )

        output_file = f"{safe_domain}_{timestamp}.csv"
        crawler.save_results(output_file)
        logging.info(f"Crawling complete! Results saved to {output_file}")

        if args.external_links:
            external_links_file = f"{safe_domain}_{timestamp}_external_links.csv"
            crawler.save_external_links_results(external_links_file)
            logging.info(f"External links saved to {external_links_file}")

        logging.info(f"Total pages crawled: {len(crawler.visited_urls)}")

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":  # pragma: no cover
    main()
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import ipaddress
|
|
5
|
+
import socket
|
|
6
|
+
import requests
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
from urllib.parse import urljoin, urlparse
|
|
9
|
+
from urllib.robotparser import RobotFileParser
|
|
10
|
+
from typing import Set, List, Tuple
|
|
11
|
+
import time
|
|
12
|
+
import logging
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# File extensions (lowercase, no leading dot) that count as web pages.
# The empty string covers URLs whose path ends in '/' (directory index).
PAGE_EXTENSIONS = {
    '', 'html', 'htm', 'php', 'asp', 'aspx', 'jsp', 'shtml',
    'phtml', 'xhtml', 'jspx', 'do', 'cfm', 'cgi',
}
|
|
33
|
+
|
|
34
|
+
class URLProcessingError(Exception):
    """Raised when a URL is empty, malformed, or uses an unsupported scheme."""


class CrawlingError(Exception):
    """Raised for crawl-level failures such as an unresolvable domain."""


class SSRFProtectionError(Exception):
    """Raised when a domain resolves to a private/reserved IP address."""


def validate_domain_ssrf(domain: str) -> None:
    """Check that *domain* does not resolve to a non-public IP address.

    Every address the name resolves to is inspected. Loopback, private,
    link-local, reserved, multicast, and unspecified (0.0.0.0 / ::) ranges
    are all rejected — the last two were previously missed even though
    either can be abused for SSRF.

    Args:
        domain: hostname or IP literal to validate (no port).

    Raises:
        SSRFProtectionError: if any resolved address is non-public.
        CrawlingError: if the domain cannot be resolved at all.
    """
    # Keep the try body minimal so an SSRFProtectionError raised below can
    # never be confused with a resolution failure.
    try:
        results = socket.getaddrinfo(domain, None)
    except socket.gaierror as e:
        raise CrawlingError(f"Cannot resolve domain '{domain}': {e}") from e

    for family, _, _, _, sockaddr in results:
        ip = ipaddress.ip_address(sockaddr[0])
        if (ip.is_private or ip.is_loopback or ip.is_link_local
                or ip.is_reserved or ip.is_multicast or ip.is_unspecified):
            raise SSRFProtectionError(
                f"Domain '{domain}' resolves to private/reserved IP {ip}. "
                f"Use --allow-private to override."
            )
|
|
64
|
+
|
|
65
|
+
class WebsiteCrawler:
    """Crawl a website and build a structured map of its internal pages.

    Crawled pages accumulate as ``(url, title, status_code)`` tuples in
    ``self.results``; external links (when collected) in
    ``self.external_links``. Use :meth:`crawl` then :meth:`save_results`.
    """

    USER_AGENT = 'Mozilla/5.0 (compatible; sitewalker/0.2.0; +https://github.com/cadentdev/sitewalker)'

    def __init__(self, target: str, timeout: int = 30, allow_private: bool = False,
                 ignore_robots: bool = False):
        """Initialize the crawler.

        Args:
            target: full URL (``http://example.com``) or bare domain
                (``example.com``); bare domains are assumed to be HTTPS.
            timeout: per-request timeout in seconds.
            allow_private: skip the SSRF check for private/reserved IPs.
            ignore_robots: skip robots.txt loading and enforcement.

        Raises:
            SSRFProtectionError: if the domain resolves to a non-public IP
                and ``allow_private`` is False.
            CrawlingError: if the domain cannot be resolved.
        """
        # Parse target: accept full URL or bare domain.
        parsed = urlparse(target)
        if parsed.scheme in ('http', 'https'):
            self.domain = parsed.netloc
            self.base_url = f"{parsed.scheme}://{parsed.netloc}"
        else:
            # Bare domain — assume HTTPS
            self.domain = target
            self.base_url = f"https://{target}"

        if not allow_private:
            # Resolve only the hostname: the netloc may carry a port, which
            # getaddrinfo would reject as an unresolvable name.
            host = urlparse(self.base_url).hostname or self.domain
            validate_domain_ssrf(host)

        # Normalize the base URL (strip trailing slash, query, fragment)
        self.base_url, _ = self.process_url(self.base_url)
        self.visited_urls: Set[str] = set()
        self.results: List[Tuple[str, str, int]] = []
        self.external_links: Set[str] = set()
        self.pages_only: bool = False
        self.timeout = timeout
        self.ignore_robots = ignore_robots
        # Annotation kept as a string: attribute annotations are evaluated
        # at runtime, and the bare `X | None` union would raise TypeError on
        # Python 3.9 (which pyproject still supports).
        self.robot_parser: "RobotFileParser | None" = None
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.USER_AGENT})

    def process_url(self, url: str) -> Tuple[str, bool]:
        """Validate and canonicalize a URL.

        Query string, fragment, and trailing slashes are stripped so
        equivalent URLs dedupe to one canonical form.

        Returns:
            ``(clean_url, is_internal)`` — ``is_internal`` is True when the
            URL's host equals the crawl domain or is a subdomain of it.

        Raises:
            URLProcessingError: if the URL is empty, malformed, or uses a
                non-HTTP(S) scheme.
        """
        if not url:
            raise URLProcessingError("Empty URL")

        try:
            parsed_url = urlparse(url)
            if not parsed_url.scheme or not parsed_url.netloc:
                raise URLProcessingError("Invalid URL format")
            if parsed_url.scheme not in ('http', 'https'):
                raise URLProcessingError("Unsupported protocol")

            path = parsed_url.path
            path = '' if (not path or path == '/') else path.rstrip('/')
            clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{path}"

            # Exact-host or subdomain match on hostnames (ports ignored).
            # The previous substring test (`self.domain in netloc`) wrongly
            # treated e.g. evilexample.com as internal to example.com.
            host = (parsed_url.hostname or '').lower()
            own_host = self.domain.split(':', 1)[0].lower()
            is_internal = host == own_host or host.endswith('.' + own_host)

            return clean_url, is_internal

        except URLProcessingError:
            # Don't re-wrap our own errors in a second layer of message.
            raise
        except Exception as e:
            raise URLProcessingError(f"URL processing error: {str(e)}") from e

    def is_page(self, url: str) -> bool:
        """Return True if the URL looks like a web page rather than a file.

        Directory-style URLs and extension-less paths count as pages;
        otherwise the final path segment's extension is checked against
        PAGE_EXTENSIONS. The extension is taken from the last segment only,
        so a dotted directory (``/v1.2/docs``) no longer misclassifies.
        """
        try:
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                return False

            path = parsed.path.rstrip('/')

            # Bare root or URLs ending with '/' are directory indexes.
            if not path:
                return True

            # Only the final path segment can carry a file extension.
            filename = path.rsplit('/', 1)[-1]
            if '.' in filename:
                return filename.rsplit('.', 1)[-1].lower() in PAGE_EXTENSIONS

            # URLs without extensions are considered pages.
            return True

        except Exception as e:
            logger.debug(f"Error checking if URL is page: {str(e)}")
            return False

    def _load_robots_txt(self) -> None:
        """Fetch and parse robots.txt for the target domain.

        Leaves ``self.robot_parser`` as None (fail-open) when robots.txt is
        absent, unreachable, or ``ignore_robots`` is set.
        """
        if self.ignore_robots:
            return
        robots_url = f"{self.base_url}/robots.txt"
        try:
            resp = self.session.get(robots_url, timeout=self.timeout)
            if resp.status_code == 200:
                rp = RobotFileParser()
                rp.set_url(robots_url)
                rp.parse(resp.text.splitlines())
                self.robot_parser = rp
                logger.info(f"Loaded robots.txt from {robots_url}")
            else:
                logger.info(f"No robots.txt found at {robots_url} (HTTP {resp.status_code})")
        except Exception as e:
            logger.warning(f"Could not load robots.txt from {robots_url}: {e}")
            self.robot_parser = None

    def _is_allowed_by_robots(self, url: str) -> bool:
        """Return True if robots.txt permits fetching *url* (fail-open)."""
        if self.ignore_robots or self.robot_parser is None:
            return True
        return self.robot_parser.can_fetch(self.USER_AGENT, url)

    def crawl(self, collect_external: bool = False, recursive: bool = False,
              pages_only: bool = False, max_pages: int = 1000, max_depth: int = 10) -> None:
        """Crawl the website starting from the base URL.

        In non-recursive mode only links found on the base page are
        followed; in recursive mode all reachable internal pages are
        visited, bounded by ``max_pages`` and ``max_depth``.

        Args:
            collect_external: record off-domain links in ``external_links``.
            recursive: follow internal links beyond the first level.
            pages_only: skip URLs whose extension is not a page type.
            max_pages: hard cap on pages visited.
            max_depth: hard cap on recursion depth.
        """
        self.pages_only = pages_only
        self.max_pages = max_pages
        self.max_depth = max_depth
        self._load_robots_txt()
        logger.info(f"Starting crawl of {self.base_url}")
        logger.info(f"Mode: {'Recursive' if recursive else 'Single-level'} crawl, "
                    f"{'collecting' if collect_external else 'ignoring'} external links, "
                    f"{'pages only' if pages_only else 'all files'}, "
                    f"max_pages={max_pages}, max_depth={max_depth}")
        self._crawl_page(self.base_url, collect_external, recursive, depth=0)
        logger.info(f"Crawl complete. Visited {len(self.visited_urls)} pages")
        if collect_external:
            logger.info(f"Found {len(self.external_links)} unique external links")

    def _crawl_page(self, url: str, collect_external: bool, recursive: bool, depth: int = 0) -> None:
        """Crawl a single page, record its title/status, and process links."""
        if len(self.visited_urls) >= self.max_pages:
            logger.info(f"Reached max_pages limit ({self.max_pages})")
            return
        if depth > self.max_depth:
            logger.debug(f"Reached max_depth limit ({self.max_depth}) at {url}")
            return
        try:
            clean_url, is_internal = self.process_url(url)
            if not is_internal or clean_url in self.visited_urls:
                return

            # Check robots.txt rules
            if not self._is_allowed_by_robots(clean_url):
                logger.debug(f"Blocked by robots.txt: {clean_url}")
                return

            # Skip non-page URLs if pages_only is True
            if self.pages_only and not self.is_page(clean_url):
                logger.debug(f"Skipping non-page URL: {clean_url}")
                return

            self.visited_urls.add(clean_url)
            logger.debug(f"Crawling {clean_url}")

            response = self.session.get(clean_url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # soup.title.string is None when <title> holds nested markup, so
            # guard both levels to avoid an AttributeError on .strip().
            if soup.title and soup.title.string:
                title = soup.title.string.strip()
            else:
                title = "No title"
            self.results.append((clean_url, title, response.status_code))

            # Process links
            for link in soup.find_all('a', href=True):
                next_url = urljoin(url, link['href'])
                try:
                    next_clean_url, next_is_internal = self.process_url(next_url)

                    if next_is_internal and recursive:
                        if next_clean_url not in self.visited_urls:
                            self._crawl_page(next_clean_url, collect_external, recursive, depth + 1)
                    elif not next_is_internal and collect_external:
                        self.external_links.add(next_clean_url)

                except URLProcessingError:
                    continue

        except requests.HTTPError as e:
            logger.error(f"HTTP Error crawling {url}: {str(e)}")
            self.results.append((url, "Error", e.response.status_code))
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            self.results.append((url, "Error", 0))

        time.sleep(1)  # Be polite

    @staticmethod
    def _sanitize_csv_value(value: str) -> str:
        """Sanitize a value for safe CSV output.

        Prevents CSV injection by prefixing dangerous characters that
        spreadsheet applications interpret as formulas.
        """
        if isinstance(value, str) and value and value[0] in ('=', '+', '-', '@', '\t', '\r'):
            return "'" + value
        return value

    def save_results(self, output_file: str) -> None:
        """Save crawl results (URL, Title, Status Code) to a CSV file."""
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['URL', 'Title', 'Status Code'])
            for url, title, status in self.results:
                writer.writerow([
                    self._sanitize_csv_value(url),
                    self._sanitize_csv_value(title),
                    status
                ])
        logger.info(f"Results saved to {output_file}")

    def save_external_links_results(self, filename: str) -> None:
        """Save collected external links to a CSV file."""
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['External URL'])
            for url in sorted(self.external_links):
                writer.writerow([self._sanitize_csv_value(url)])
        # Log the actual filename (previously logged the literal "(unknown)").
        logger.info(f"External links saved to {filename}")
|