ansferatu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ansferatu/__init__.py +19 -0
- ansferatu/__main__.py +4 -0
- ansferatu/cli.py +252 -0
- ansferatu/profiles/CommonExtractor.py +309 -0
- ansferatu/profiles/CommonFetcher.py +112 -0
- ansferatu/profiles/CommonHTMLHandler.py +174 -0
- ansferatu/profiles/FileSaver.py +43 -0
- ansferatu/profiles/FormDetector.py +526 -0
- ansferatu/profiles/FormFiller.py +355 -0
- ansferatu/profiles/FormFilter.py +45 -0
- ansferatu/profiles/HeadlessCandidate.py +171 -0
- ansferatu/profiles/HeadlessExtractor.py +171 -0
- ansferatu/profiles/HeadlessFormInteractor.py +418 -0
- ansferatu/profiles/JsonlWriter.py +87 -0
- ansferatu/profiles/MyProxies.py +18 -0
- ansferatu/profiles/ResponseFilter.py +62 -0
- ansferatu/profiles/UrlFilter.py +46 -0
- ansferatu/profiles/VisitLimit.py +112 -0
- ansferatu/profiles/__init__.py +0 -0
- ansferatu/profiles/form_helpers/__init__.py +1 -0
- ansferatu/profiles/form_helpers/actions.py +432 -0
- ansferatu/profiles/form_helpers/browser_lifecycle.py +76 -0
- ansferatu/profiles/form_helpers/buttons.py +216 -0
- ansferatu/profiles/form_helpers/constants.py +48 -0
- ansferatu/profiles/form_helpers/field_classifier.py +469 -0
- ansferatu/profiles/form_helpers/filters.py +56 -0
- ansferatu/profiles/form_helpers/form_classifier.py +98 -0
- ansferatu/profiles/form_helpers/overlay.py +129 -0
- ansferatu/profiles/form_helpers/signature.py +64 -0
- ansferatu/profiles/form_helpers/visited_forms.py +47 -0
- ansferatu/profiles/modes.py +149 -0
- ansferatu/profiles/network_constants.py +35 -0
- ansferatu/profiles/resource_check.py +91 -0
- ansferatu/profiles/response_dedup.py +196 -0
- ansferatu/spider/__init__.py +8 -0
- ansferatu/spider/common/__init__.py +0 -0
- ansferatu/spider/common/url.py +512 -0
- ansferatu/spider/concurrent/__init__.py +8 -0
- ansferatu/spider/concurrent/threads_inst/__init__.py +14 -0
- ansferatu/spider/concurrent/threads_inst/base.py +180 -0
- ansferatu/spider/concurrent/threads_inst/extract.py +40 -0
- ansferatu/spider/concurrent/threads_inst/fetch.py +72 -0
- ansferatu/spider/concurrent/threads_inst/form_interact.py +44 -0
- ansferatu/spider/concurrent/threads_inst/headless.py +44 -0
- ansferatu/spider/concurrent/threads_inst/html_handle.py +39 -0
- ansferatu/spider/concurrent/threads_inst/proxies.py +33 -0
- ansferatu/spider/concurrent/threads_inst/save.py +38 -0
- ansferatu/spider/concurrent/threads_pool.py +356 -0
- ansferatu/spider/instances/__init__.py +12 -0
- ansferatu/spider/instances/inst_extract.py +29 -0
- ansferatu/spider/instances/inst_fetch.py +46 -0
- ansferatu/spider/instances/inst_form_interact.py +31 -0
- ansferatu/spider/instances/inst_headless.py +31 -0
- ansferatu/spider/instances/inst_html_handle.py +29 -0
- ansferatu/spider/instances/inst_proxies.py +31 -0
- ansferatu/spider/instances/inst_save.py +32 -0
- ansferatu/spider/utilities/__init__.py +10 -0
- ansferatu/spider/utilities/cfilter.py +44 -0
- ansferatu/spider/utilities/cresult.py +133 -0
- ansferatu/spider/utilities/ctask.py +179 -0
- ansferatu/spider/utilities/functions.py +84 -0
- ansferatu/spider/wappalyzer/__init__.py +0 -0
- ansferatu/spider/wappalyzer/all.json +15481 -0
- ansferatu/spider/wappalyzer/functional.json +295 -0
- ansferatu/spider/wappalyzer/scanner.json +700 -0
- ansferatu/spider/wappalyzer/wappalyzer.py +249 -0
- ansferatu/spider/wappalyzer/webpage.py +118 -0
- ansferatu-0.1.0.dist-info/METADATA +409 -0
- ansferatu-0.1.0.dist-info/RECORD +73 -0
- ansferatu-0.1.0.dist-info/WHEEL +5 -0
- ansferatu-0.1.0.dist-info/entry_points.txt +2 -0
- ansferatu-0.1.0.dist-info/licenses/LICENSE +21 -0
- ansferatu-0.1.0.dist-info/top_level.txt +1 -0
ansferatu/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Ansferatu - multifunctional tool for HTTP reconnaissance, web crawling and directory bruteforce.
|
|
2
|
+
|
|
3
|
+
Public library API
|
|
4
|
+
------------------
|
|
5
|
+
The most common entry points for using Ansferatu programmatically are the
|
|
6
|
+
high-level orchestration helpers::
|
|
7
|
+
|
|
8
|
+
from ansferatu import common_crawler, common_brute_from_file
|
|
9
|
+
|
|
10
|
+
For lower-level control you can build a spider directly::
|
|
11
|
+
|
|
12
|
+
from ansferatu.spider import WebSpider, TaskFetch
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
# Package version string; it is also listed in ``__all__`` below.
__version__ = "0.1.0"

# Re-export the two high-level helpers from ``ansferatu.profiles.modes`` so
# they can be imported directly from the package root.
from ansferatu.profiles.modes import common_crawler, common_brute_from_file

# Names exported by ``from ansferatu import *``.
__all__ = ["common_crawler", "common_brute_from_file", "__version__"]
|
ansferatu/__main__.py
ADDED
ansferatu/cli.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from ansferatu.spider.common.url import Url
|
|
7
|
+
from ansferatu.profiles.modes import common_crawler
|
|
8
|
+
from ansferatu.profiles.modes import common_brute_from_file
|
|
9
|
+
|
|
10
|
+
# Capture a timestamp once, at import time. AnsferatuRunner converts it to a
# POSIX timestamp that ends up in the report file name.
now = datetime.now()

# Configure the root logger: INFO level, tab-separated time / level / message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s\t%(levelname)s\t%(message)s")
# Root logger shared by the code in this module.
logger = logging.getLogger()
|
|
16
|
+
|
|
17
|
+
class AnsferatuRunner:
    """Drive one Ansferatu run from parsed command-line arguments.

    The constructor resolves the start URLs (from the input file or from the
    URL list given on the command line), derives the scan scope when none was
    supplied, and makes sure the output directory exists. :meth:`run` then
    dispatches to the selected mode.
    """

    def __init__(self, args):
        """Prepare the run.

        Args:
            args: Namespace produced by ``parse_arguments``. The attributes
                read here are ``input_file``, ``url_list_initial``,
                ``scope_list`` and ``output_dir``.
        """
        logger.info("Welcome to Ansferatu")
        self.timestamp_start = datetime.timestamp(now)

        self.runner_args = args

        if self.runner_args.input_file:
            # Read-only access is sufficient, and the context manager closes
            # the handle. Blank lines are skipped so that they do not turn
            # into empty start URLs.
            with open(self.runner_args.input_file, "r") as url_file:
                self.url_list = [line.strip() for line in url_file if line.strip()]
        else:
            # Fall back to an empty list so create_scope() can always iterate.
            self.url_list = self.runner_args.url_list_initial or []

        if not self.runner_args.scope_list:
            self.new_scope = self.create_scope()
        else:
            self.new_scope = self.runner_args.scope_list

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.runner_args.output_dir, exist_ok=True)

    def create_scope(self):
        """Return the ``domain`` of every start URL, as a list of strings."""
        return [str(Url(url).domain) for url in self.url_list]

    def run(self):
        """Execute the selected mode, then call :meth:`dump_result`."""
        if self.runner_args.mode == "crawl":
            self.crawl()
        elif self.runner_args.mode == "brute":
            self.brute()
        self.dump_result()

    def crawl(self):
        """Invoke ``common_crawler`` with the crawl-related arguments."""
        common_crawler(
            self.url_list,
            self.new_scope,
            max_deep=self.runner_args.max_deep,
            exclude_codes_list=self.runner_args.exclude_codes_list,
            visit_count_limit=self.runner_args.visit_count_limit,
            threads=self.runner_args.threads,
            # getattr() keeps this working for namespaces that lack the
            # headless-related options.
            headless=getattr(self.runner_args, 'headless', False),
            fill_forms=getattr(self.runner_args, 'fill_forms', False),
            output_file=self.runner_args.jsonl_output,
            skip_dedup=self.runner_args.skip_dedup,
            headless_workers=getattr(self.runner_args, 'headless_workers', 1),
            max_headless_mem_mb=getattr(self.runner_args, 'max_headless_mem_mb', None),
        )

    def brute(self):
        """Invoke ``common_brute_from_file`` with the brute-related arguments."""
        common_brute_from_file(
            self.url_list,
            self.runner_args.exclude_codes_list,
            _dictionary_file=self.runner_args.wordlist_big,
            threads=self.runner_args.threads,
            output_file=self.runner_args.jsonl_output,
            skip_dedup=self.runner_args.skip_dedup,
        )

    def dump_result(self):
        """Compute the per-domain report path for every scope entry.

        NOTE(review): no report is written here; only the path is computed.
        Confirm whether report generation is still to be implemented.
        """
        for domain in self.new_scope:
            logger.info("creating output file...")
            report_name = f"final-{domain}+{self.timestamp_start}.{self.runner_args.report_mode}"
            # os.path.join() also works when output_dir has no trailing
            # separator (plain concatenation glued the name onto the
            # directory name in that case).
            result_file = os.path.join(self.runner_args.output_dir, report_name)
            logger.debug("report path: %s", result_file)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def parse_arguments(argv=None):
    """Parse the command line of the ``ansferatu`` command.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with the parsed options. ``mode`` holds the
        chosen sub-command (``"crawl"`` or ``"brute"``).

    Raises:
        SystemExit: Raised by argparse on invalid input, including a missing
            sub-command.
    """
    parser = argparse.ArgumentParser()

    # Top-level arguments that apply to the whole script.
    parser.add_argument("--verbose", help="Common top-level parameter", action="store_true", required=False)

    subparsers = parser.add_subparsers(help="Desired action to perform", dest="mode")
    # Without a sub-command none of the shared options exist on the
    # namespace, so require one and let argparse print a usage error.
    subparsers.required = True

    # Parent parser carrying the options shared by every sub-command.
    # Note ``add_help=False``: the sub-parsers add their own -h/--help.
    parent_parser = argparse.ArgumentParser(add_help=False)
    parent_parser.add_argument(
        "-o",
        "--output",
        metavar="Output Directory",
        type=str,
        dest="output_dir",
        default="/tmp/",
        help="Output directory for reports. By default: /tmp/",
    )
    parent_parser.add_argument(
        "--jsonl",
        metavar="FILE",
        type=str,
        dest="jsonl_output",
        default=None,
        help="Path to JSONL output file (Nuclei-compatible proxify format)",
    )
    parent_parser.add_argument(
        "-s",
        "--scope",
        metavar="InScope domains",
        dest="scope_list",
        nargs="+",
        default=[],
        help="List of domains that are in scope of current scan. Split by space. "
             "Example: news.example.com blog.example.com admin.example.com",
    )
    parent_parser.add_argument(
        "--report",
        choices=["html", "json"],
        default="html",
        dest="report_mode",
        help="Type of report format",
    )
    parent_parser.add_argument(
        "--exclude_codes",
        default=[403, 404, 401],
        nargs="*",
        # Convert user-supplied codes to int so they have the same type as
        # the integer defaults above.
        type=int,
        dest="exclude_codes_list",
        help="List of filtered HTTP answer's codes",
    )
    parent_parser.add_argument(
        "--threads",
        "-t",
        metavar="Max_threads",
        type=int,
        dest="threads",
        default=10,
        help="Number of HTTP threads",
    )
    parent_parser.add_argument(
        "--sd",
        "--skip-deduplication",
        action="store_true",
        default=False,
        dest="skip_dedup",
        help="Disable response deduplication (keep all responses)",
    )

    # Exactly one source of start URLs must be given.
    group = parent_parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-u",
        "--urls",
        metavar="URLs",
        type=str,
        dest="url_list_initial",
        nargs="*",
        help="URLs or list of URLs for start. Example: https://mail.ru/",
    )
    group.add_argument(
        "-f",
        "--file",
        metavar="Input file with URLs",
        type=str,
        dest="input_file",
        help="Input file contains URLs (1 URL per line)",
    )

    parser_crawl = subparsers.add_parser(
        "crawl",
        parents=[parent_parser],
        help="Crawl the start URLs and collect the links found",
    )
    parser_crawl.add_argument(
        "--limit",
        metavar="Limits",
        type=int,
        dest="visit_count_limit",
        default=10,
        help="Limit of request to subdirectories",
    )
    parser_crawl.add_argument(
        "--deep",
        metavar="Max_deep",
        type=int,
        dest="max_deep",
        default=2,
        help="Max deep of request to subdirectories",
    )
    parser_crawl.add_argument(
        "--headless",
        action="store_true",
        default=False,
        help="Enable Playwright headless extraction for root/absolute URLs",
    )
    parser_crawl.add_argument(
        "--fill-forms",
        action="store_true",
        default=False,
        dest="fill_forms",
        help="Enable form detection and interaction (requires --headless). "
             "Detects forms on pages, fills fields with smart defaults, and submits them.",
    )
    parser_crawl.add_argument(
        "--headless-workers",
        metavar="N",
        type=int,
        dest="headless_workers",
        default=1,
        help="Number of parallel headless workers (HeadlessExtractor + FormInteractor). "
             "Each worker spawns its own Chromium (~250MB RAM, ~1 CPU under load). "
             "Default 1 preserves legacy behaviour; raise to 3-5 to drain a backlog faster.",
    )
    parser_crawl.add_argument(
        "--max-headless-mem-mb",
        metavar="MB",
        type=int,
        dest="max_headless_mem_mb",
        default=None,
        # "%%" is an escaped percent sign: argparse %-formats help strings.
        help="Hard cap on estimated headless RAM usage (MB). When set and the "
             "estimate (workers * 2 browsers * ~350MB) exceeds it, startup aborts. "
             "When unset, an advisory warning is logged if estimate exceeds 70%% "
             "of currently available RAM.",
    )

    parser_brute = subparsers.add_parser(
        "brute",
        parents=[parent_parser],
        help="Brute-force directories on the start URLs using a wordlist",
    )
    parser_brute.add_argument(
        "-w",
        "--wordlist",
        required=True,
        metavar="=wordlist file",
        type=str,
        dest="wordlist_big",
        help="Path to wordlist file for brute",
    )

    return parser.parse_args(argv)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def main(argv=None):
    """Console entry point for the ``ansferatu`` command.

    Parses *argv* (``None`` is passed through to ``parse_arguments``), builds
    an ``AnsferatuRunner`` from the result and runs it.
    """
    AnsferatuRunner(parse_arguments(argv)).run()
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# Run the command-line entry point when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
from ansferatu import spider
|
|
2
|
+
import re
|
|
3
|
+
from ansferatu.spider.common.url import Url
|
|
4
|
+
|
|
5
|
+
# Upper bound, in bytes, on the response body that is fed into the heavy
# URL-extraction regex. Running ``common_regexp_compiled`` (many
# alternations, VERBOSE mode) over minified JS bundles of 2–10 MB is several
# seconds of pure Python regex holding the GIL, which starves other threads
# (notably the Playwright form interactor). 500 KB is enough to catch nearly
# all real URLs in HTML/JS — anything further is almost always duplicates.
# Bodies larger than this are truncated to this many bytes before decoding.
MAX_EXTRACT_BODY_BYTES = 500 * 1024

# Response types the extractor mines for URLs; ``htm_extract`` skips body
# decoding and the regex sweep for any type that is not in this set.
# ``other`` is the fallback label ``detect_response_type`` returns when it
# recognises nothing more specific — such responses are rare but occasionally
# contain useful URLs (e.g. raw redirect bodies).
# NOTE(review): as written in this file, ``detect_response_type`` only ever
# returns labels that are members of this set, so the skip path looks
# unreachable (images, fonts, etc. are labelled ``other``) — confirm intent.
_EXTRACTABLE_TYPES = frozenset({"html", "js", "robots", "sitemap", "other"})
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class CommonExtractor(spider.Extractor):
|
|
23
|
+
"""
|
|
24
|
+
Class for getting new links from response HTMLs, JS files, etc.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
def __init__(self, max_deep=0, scope_list=None):
    """Store the scope settings and compile the extraction patterns.

    Args:
        max_deep: Stored as ``self._max_deep``.
        scope_list: Scope entries stored as ``self._scope_list``; ``None``
            is replaced by a new empty list.
    """
    super().__init__()
    if scope_list is None:
        scope_list = []
    self._max_deep = max_deep
    self._scope_list = scope_list

    # Content-Type values treated as JavaScript / HTML by detect_response_type.
    self.js_content_types = ["text/javascript", "application/x-ecmascript", "application/x-javascript",
                             "application/javascript", "text/ecmascript", "text/x-ecmascript", "text/x-javascript",
                             "application/ecmascript", "text/jscript"]

    self.html_content_types = ["text/html"]

    # Regex used in linkfinder.
    # Group 1 is the quoted URL/path; the final alternative (unquoted
    # href=/src= values) captures into the last group instead.
    self.common_regexp_str = r"""

      (?:"|')                               # Start newline delimiter

      (
        ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
        [^"'/]{1,}\.                        # Match a domainname (any character + dot)
        [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path

        |

        ((?:/|\.\./|\./)                    # Start with /,../,./
        [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
        [^"'><,;|()]{1,})                   # Rest of the characters can't be

        |

        ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
        [a-zA-Z0-9_\-/]{1,}                 # Resource name
        \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
        (?:[\?|/][^"|']{0,}|))              # ? mark with parameters

        |

        ([a-zA-Z0-9_\-]{1,}                 # filename
        \.(?:|html|php|txt|htm|aspx|
        asp|js|css|pgsql|mysql|pdf|
        cgi|inc|gif|jpg|swf|xml|cfm|
        xhtml|wmv|zip|axd|gz|png|doc|
        shtml|jsp|ico|exe|csi|config|
        jpeg|ashx|log|xls|old|mp3|tar
        |ini|asa|tgz|flv|php3|bak|rar|
        asmx|xlsx|page|phtml|dll|asax|
        pl|csv|ppt|bmp|sql|new|avi|psd|
        rss|wav|action|db|dat|do|xsl|
        class|mdb|include|cs|htc|mov|
        mpg|rdf|rtf|ascx|files|jar|vb|
        mp4|local|docx|php5|wci|readme|
        cfg|cfc|lck|ttf|jhtml|mpeg|php4|
        tif|json|zif|shtm|sitemap|tmp|
        backup|conf|settings|cab|asx|
        msi|bin|htaccess|java|jsf|bat|
        print|ics|svc|vbs|img|inf|ajax|
        chm|m3u|py|sh|store|webinfo|jad|
        stm|webresource|lock|phps|pptx|
        xsd|crt|hmtl|index|iso|taf|war|
        xslt|go|gpx|ihtml|odt|sample|spider|
        cer|html|lib|lnk|mhtml|pgp|
        text|view|asc|dtd|html1|ogg|out|
        pgt|php|rb|rhtml|wsdl|apsx|asf|
        dot|git|hta|php2|phtm|psql|reg|
        rpm|tiff|cfml|dta|jp|php~|ps|
        raw|svg|svn|thtml|xhtm|aspx|
        bhtml|bml|ca|cache|htmls|htx|
        jpe|jspf|access|app|asd|asm|bak2|
        deb|epub|htlm|jnlp|js2|jspx|
        php1|phpp|pop3|pwd|pyc|session|
        setup|swp|temp)                     # . + extension
        (?:\?[^"|']{0,}|))                  # ? mark with parameters

      )

      (?:"|')                               # End newline delimiter

      |

      (?<=(?<=href=)|(?<=src=))((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})

    """
    self.common_regexp_compiled = re.compile(self.common_regexp_str, re.VERBOSE)

    # Deliberately NOT compiled with re.VERBOSE: verbose mode discards the
    # literal space after the colon, which left the captured value with its
    # leading whitespace. Optional spaces/tabs are now skipped explicitly
    # and the capture stops before a trailing "\r".
    # Group 1 = Allow value, group 2 = Disallow value (numbering unchanged).
    self.robots_regexp_str = r"Allow:[ \t]*([^\r\n]*)|Disallow:[ \t]*([^\r\n]*)"
    self.robots_regexp_compiled = re.compile(self.robots_regexp_str)

    # re.VERBOSE does not alter the two patterns below: they contain no
    # whitespace, and "#" appears only inside character classes.
    self.sitemap_regexp_str = "<loc>(.*?)</loc>"
    self.sitemap_regexp_compiled = re.compile(self.sitemap_regexp_str, re.VERBOSE)

    self.regex_common2 = re.compile(r'https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', re.VERBOSE)

    # Regex for finding standalone domain names (not full URLs).
    # Targets bare domains in quoted strings — JS configs, arrays, HTML attributes, etc.
    # e.g. "nordvpn.com", 'china-with-nord.org', "sub.domain.example.net"
    # Matches are post-filtered against ``allowed_tlds`` to discard
    # file-name false positives.
    self.domain_regexp_str = r"""
      (?:"|')                                   # Opening quote
      (?![a-zA-Z]{1,10}://)                     # NOT a full URL (http://, ftp://, etc.)
      (?!//)                                    # NOT a protocol-relative URL
      (
        (?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?  # Domain label (RFC-compliant)
        \.){1,10}                               # 1-10 labels, each followed by dot
        [a-zA-Z]{2,15}                          # TLD: 2-15 alpha chars
      )
      (?:"|')                                   # Closing quote
    """
    self.domain_regexp_compiled = re.compile(self.domain_regexp_str, re.VERBOSE)

    # Whitelist of popular TLDs — only domains ending with these are accepted.
    # Keeps precision high: no .js/.html/.png false positives possible.
    self.allowed_tlds = {
        # Classic gTLDs
        'com', 'org', 'net', 'edu', 'gov', 'biz',
        # New gTLDs (tech / startup / business)
        'io', 'ai', 'app', 'dev', 'co',
        'xyz', 'tech', 'site', 'online', 'website', 'cloud',
        'network', 'group', 'review',
        'security', 'services', 'solutions', 'support', 'systems',
        'wiki',
        # Major country-code TLDs (Europe)
        'uk', 'de', 'fr', 'nl', 'it', 'es', 'pt', 'eu', 'ch', 'at', 'be',
        'se', 'no', 'dk', 'fi', 'ie', 'pl', 'cz', 'ro', 'hu', 'bg', 'hr',
        'sk', 'si', 'lt', 'lv', 'ee', 'gr', 'ua', 'by', 'ru',
        # Major country-code TLDs (Americas); 'co' is already listed above
        'us', 'ca', 'mx', 'br', 'ar', 'cl',
        # Major country-code TLDs (Asia-Pacific)
        'cn', 'jp', 'kr', 'in', 'au', 'nz', 'sg', 'hk', 'tw', 'th',
        'ph', 'id', 'my', 'vn', 'pk',
        # Major country-code TLDs (Middle East / Africa)
        'il', 'ae', 'sa', 'tr', 'eg', 'za', 'ng', 'ke',
    }
|
|
162
|
+
|
|
163
|
+
def regexp_search_common(self, data):
    """Return URL-like strings found in *data*.

    The result is the matches of ``self.common_regexp_compiled`` followed by
    the full matches of ``self.regex_common2``.

    ``common_regexp_compiled`` is a top-level alternation: the quoted forms
    capture into group 1, whereas the unquoted ``href=``/``src=`` form
    captures into the pattern's last group. Reading only group 1 turned
    every unquoted match into ``None``; the value is now taken from whichever
    group actually matched.

    Args:
        data: Text to scan.

    Returns:
        A list of non-empty strings (duplicates are kept).
    """
    items = []
    for match in self.common_regexp_compiled.finditer(data):
        found = match.group(1)
        if found is None and match.lastindex:
            # Only the unquoted alternative matched; its capture is the
            # last group that took part in the match.
            found = match.group(match.lastindex)
        if found:
            items.append(found)

    items.extend(match.group(0) for match in self.regex_common2.finditer(data))
    return items
|
|
167
|
+
|
|
168
|
+
def regexp_search_domains(self, data):
    """Collect bare domain names (not full URLs) that appear in *data*.

    Every group-1 capture of ``self.domain_regexp_compiled`` is a candidate.
    A candidate is kept only when the part after its last dot, lowercased,
    is a member of ``self.allowed_tlds``.

    Returns:
        A list of the kept domains, lowercased and without duplicates
        (element order is not defined).
    """
    unique_domains = set()
    for match in self.domain_regexp_compiled.finditer(data):
        candidate = match.group(1)
        suffix = candidate.rpartition('.')[2]
        if suffix.lower() not in self.allowed_tlds:
            continue
        unique_domains.add(candidate.lower())
    return list(unique_domains)
|
|
183
|
+
|
|
184
|
+
def regexp_search_robots(self, data):
    """Return the paths named by ``Allow:`` and ``Disallow:`` rules in *data*.

    ``self.robots_regexp_compiled`` captures the ``Allow`` value in group 1
    and the ``Disallow`` value in group 2. Reading only group 2 returned
    ``None`` for every ``Allow`` rule, so those paths were lost. Both groups
    are used now.

    Args:
        data: Text of a robots.txt response.

    Returns:
        A list of path strings in order of appearance, stripped of
        surrounding whitespace (including a trailing ``\\r``). Rules with an
        empty value are skipped.
    """
    paths = []
    for match in self.robots_regexp_compiled.finditer(data):
        allow_value, disallow_value = match.group(1), match.group(2)
        raw_value = allow_value if allow_value is not None else disallow_value
        path = (raw_value or "").strip()
        if path:
            paths.append(path)
    return paths
|
|
187
|
+
|
|
188
|
+
def regexp_search_sitemap(self, data):
    """Return group 1 of every ``self.sitemap_regexp_compiled`` match in *data*.

    With the pattern compiled in ``__init__`` this is the text between each
    ``<loc>`` and ``</loc>`` pair, in order of appearance.
    """
    locations = []
    for match in self.sitemap_regexp_compiled.finditer(data):
        locations.append(match.group(1))
    return locations
|
|
191
|
+
|
|
192
|
+
def detect_response_type(self, response):
    """Classify *response* as ``"js"``, ``"html"``, ``"robots"``, ``"sitemap"`` or ``"other"``.

    The decision uses the media type from the ``Content-Type`` header
    (parameters such as ``charset`` are dropped) together with substring
    checks on the response URL. The checks run in the order listed above and
    the first hit wins.

    A response without a ``Content-Type`` header is handled like one with an
    empty media type, so only the URL-based checks can match; previously the
    missing header left ``content_type`` unbound and raised
    ``UnboundLocalError``.

    Args:
        response: Object exposing ``url`` and a mapping-like ``headers``.

    Returns:
        One of the five labels as a string.
    """
    _url = str(response.url)

    # Media types are case-insensitive and may carry padding or parameters.
    header_value = response.headers.get("Content-Type") or ""
    content_type = header_value.split(';')[0].strip().lower()

    if content_type in self.js_content_types or ".js" in _url:
        return "js"

    if content_type in self.html_content_types or ".html" in _url:
        return "html"

    if "robots.txt" in _url:
        return "robots"

    # Sitemaps are served both as text/xml and as application/xml.
    if content_type in ("text/xml", "application/xml") and "sitemap" in _url:
        return "sitemap"

    return "other"
|
|
213
|
+
|
|
214
|
+
def htm_extract(self, task_extract: spider.TaskExtract) -> spider.ResultExtract:
    """Derive follow-up fetch tasks from the response held by *task_extract*.

    Steps:
      1. Classify the response with ``detect_response_type``.
      2. Run the URL regexes over the stringified headers and, for
         extractable types, over the (possibly truncated) body.
      3. Normalise every hit with ``spider.get_url_legal`` and add the
         entries returned by ``Url.list_all_dirs()`` for it.
      4. For each resulting URL add its ``Url.absolute`` value plus the
         ``sitemap.xml`` / ``robots.txt`` variants of that value.
      5. Add ``https://<domain>/`` seeds (again with sitemap/robots
         variants) for bare domains found in headers and body.
      6. Keep what passes ``spider.check_url_legal_and_in_scope`` and wrap
         each surviving URL in a ``spider.TaskFetch``.

    Returns:
        ``spider.ResultExtract`` with ``state_code=1`` and the task list.
    """
    resp = task_extract.content
    page_url = resp.url
    headers_text = str(resp.headers)

    # STEP 1: classify the response.
    kind = self.detect_response_type(resp)

    # STEP 2: regex sweep over headers, then over the body when worthwhile.
    header_hits = self.regexp_search_common(headers_text)

    body_hits = []
    body_domains = []
    if kind in _EXTRACTABLE_TYPES:
        # Measure the raw bytes first so that oversized bodies are cut
        # before they are decoded and scanned.
        try:
            size = 0 if resp.content is None else len(resp.content)
        except Exception:
            size = 0

        if size > MAX_EXTRACT_BODY_BYTES:
            codec = resp.encoding or "utf-8"
            text = resp.content[:MAX_EXTRACT_BODY_BYTES].decode(codec, errors="replace")
        else:
            text = str(resp.text)

        # "other" has no dedicated searcher, so its body yields no URL hits.
        searchers = {
            "robots": self.regexp_search_robots,
            "sitemap": self.regexp_search_sitemap,
            "js": self.regexp_search_common,
            "html": self.regexp_search_common,
        }
        searcher = searchers.get(kind)
        if searcher is not None:
            body_hits = searcher(text)

        body_domains = self.regexp_search_domains(text)

    # Bare domains become https:// seeds plus their sitemap/robots files.
    domain_seeds = []
    for domain in self.regexp_search_domains(headers_text) + body_domains:
        root = "https://" + domain + "/"
        domain_seeds += [root, root + "sitemap.xml", root + "robots.txt"]

    # STEP 3: normalise the hits and expand each one into its directories.
    discovered = []
    for candidate in body_hits + header_hits:
        if candidate is None:
            continue
        legal_url = spider.get_url_legal(candidate, base_url=page_url)
        discovered.append(legal_url)
        discovered.extend(Url(legal_url).list_all_dirs())

    # STEP 4: add the absolute form plus sitemap/robots for every URL.
    site_files = []
    for found in discovered:
        absolute = Url(found).absolute
        site_files += [absolute, absolute + "sitemap.xml", absolute + "robots.txt"]

    # STEP 5: merge, de-duplicate and keep only legal in-scope URLs.
    in_scope = set()
    for link in discovered + site_files + domain_seeds:
        if spider.check_url_legal_and_in_scope(link, self._scope_list):
            in_scope.add(link)

    tasks = [spider.TaskFetch.from_task_extract(task_extract, url_new=link) for link in in_scope]

    return spider.ResultExtract(state_code=1, task_fetch_list=tasks)
|
|
309
|
+
|