ansferatu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ansferatu/__init__.py +19 -0
  2. ansferatu/__main__.py +4 -0
  3. ansferatu/cli.py +252 -0
  4. ansferatu/profiles/CommonExtractor.py +309 -0
  5. ansferatu/profiles/CommonFetcher.py +112 -0
  6. ansferatu/profiles/CommonHTMLHandler.py +174 -0
  7. ansferatu/profiles/FileSaver.py +43 -0
  8. ansferatu/profiles/FormDetector.py +526 -0
  9. ansferatu/profiles/FormFiller.py +355 -0
  10. ansferatu/profiles/FormFilter.py +45 -0
  11. ansferatu/profiles/HeadlessCandidate.py +171 -0
  12. ansferatu/profiles/HeadlessExtractor.py +171 -0
  13. ansferatu/profiles/HeadlessFormInteractor.py +418 -0
  14. ansferatu/profiles/JsonlWriter.py +87 -0
  15. ansferatu/profiles/MyProxies.py +18 -0
  16. ansferatu/profiles/ResponseFilter.py +62 -0
  17. ansferatu/profiles/UrlFilter.py +46 -0
  18. ansferatu/profiles/VisitLimit.py +112 -0
  19. ansferatu/profiles/__init__.py +0 -0
  20. ansferatu/profiles/form_helpers/__init__.py +1 -0
  21. ansferatu/profiles/form_helpers/actions.py +432 -0
  22. ansferatu/profiles/form_helpers/browser_lifecycle.py +76 -0
  23. ansferatu/profiles/form_helpers/buttons.py +216 -0
  24. ansferatu/profiles/form_helpers/constants.py +48 -0
  25. ansferatu/profiles/form_helpers/field_classifier.py +469 -0
  26. ansferatu/profiles/form_helpers/filters.py +56 -0
  27. ansferatu/profiles/form_helpers/form_classifier.py +98 -0
  28. ansferatu/profiles/form_helpers/overlay.py +129 -0
  29. ansferatu/profiles/form_helpers/signature.py +64 -0
  30. ansferatu/profiles/form_helpers/visited_forms.py +47 -0
  31. ansferatu/profiles/modes.py +149 -0
  32. ansferatu/profiles/network_constants.py +35 -0
  33. ansferatu/profiles/resource_check.py +91 -0
  34. ansferatu/profiles/response_dedup.py +196 -0
  35. ansferatu/spider/__init__.py +8 -0
  36. ansferatu/spider/common/__init__.py +0 -0
  37. ansferatu/spider/common/url.py +512 -0
  38. ansferatu/spider/concurrent/__init__.py +8 -0
  39. ansferatu/spider/concurrent/threads_inst/__init__.py +14 -0
  40. ansferatu/spider/concurrent/threads_inst/base.py +180 -0
  41. ansferatu/spider/concurrent/threads_inst/extract.py +40 -0
  42. ansferatu/spider/concurrent/threads_inst/fetch.py +72 -0
  43. ansferatu/spider/concurrent/threads_inst/form_interact.py +44 -0
  44. ansferatu/spider/concurrent/threads_inst/headless.py +44 -0
  45. ansferatu/spider/concurrent/threads_inst/html_handle.py +39 -0
  46. ansferatu/spider/concurrent/threads_inst/proxies.py +33 -0
  47. ansferatu/spider/concurrent/threads_inst/save.py +38 -0
  48. ansferatu/spider/concurrent/threads_pool.py +356 -0
  49. ansferatu/spider/instances/__init__.py +12 -0
  50. ansferatu/spider/instances/inst_extract.py +29 -0
  51. ansferatu/spider/instances/inst_fetch.py +46 -0
  52. ansferatu/spider/instances/inst_form_interact.py +31 -0
  53. ansferatu/spider/instances/inst_headless.py +31 -0
  54. ansferatu/spider/instances/inst_html_handle.py +29 -0
  55. ansferatu/spider/instances/inst_proxies.py +31 -0
  56. ansferatu/spider/instances/inst_save.py +32 -0
  57. ansferatu/spider/utilities/__init__.py +10 -0
  58. ansferatu/spider/utilities/cfilter.py +44 -0
  59. ansferatu/spider/utilities/cresult.py +133 -0
  60. ansferatu/spider/utilities/ctask.py +179 -0
  61. ansferatu/spider/utilities/functions.py +84 -0
  62. ansferatu/spider/wappalyzer/__init__.py +0 -0
  63. ansferatu/spider/wappalyzer/all.json +15481 -0
  64. ansferatu/spider/wappalyzer/functional.json +295 -0
  65. ansferatu/spider/wappalyzer/scanner.json +700 -0
  66. ansferatu/spider/wappalyzer/wappalyzer.py +249 -0
  67. ansferatu/spider/wappalyzer/webpage.py +118 -0
  68. ansferatu-0.1.0.dist-info/METADATA +409 -0
  69. ansferatu-0.1.0.dist-info/RECORD +73 -0
  70. ansferatu-0.1.0.dist-info/WHEEL +5 -0
  71. ansferatu-0.1.0.dist-info/entry_points.txt +2 -0
  72. ansferatu-0.1.0.dist-info/licenses/LICENSE +21 -0
  73. ansferatu-0.1.0.dist-info/top_level.txt +1 -0
ansferatu/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """Ansferatu - multifunctional tool for HTTP reconnaissance, web crawling and directory bruteforce.
2
+
3
+ Public library API
4
+ ------------------
5
+ The most common entry points for using Ansferatu programmatically are the
6
+ high-level orchestration helpers::
7
+
8
+ from ansferatu import common_crawler, common_brute_from_file
9
+
10
+ For lower-level control you can build a spider directly::
11
+
12
+ from ansferatu.spider import WebSpider, TaskFetch
13
+ """
14
+
15
# Package version; matches the version of the distributed wheel (0.1.0).
__version__ = "0.1.0"

# Re-export the two high-level orchestration helpers so callers can write
# ``from ansferatu import common_crawler, common_brute_from_file``.
from ansferatu.profiles.modes import common_brute_from_file, common_crawler

# Names exported by ``from ansferatu import *``.
__all__ = ["common_crawler", "common_brute_from_file", "__version__"]
ansferatu/__main__.py ADDED
@@ -0,0 +1,4 @@
1
"""Module entry point: delegates to the CLI when the package is executed."""
from ansferatu.cli import main

# Only start the CLI when executed as a script, never on plain import.
if __name__ == "__main__":
    main()
ansferatu/cli.py ADDED
@@ -0,0 +1,252 @@
1
+ import argparse
2
+
3
+ from datetime import datetime
4
+ import logging
5
+ import os
6
+ from ansferatu.spider.common.url import Url
7
+ from ansferatu.profiles.modes import common_crawler
8
+ from ansferatu.profiles.modes import common_brute_from_file
9
+
10
# Captured once, at import time. AnsferatuRunner derives its start timestamp
# (used in report file names) from this value.
now = datetime.now()

# Root logger configuration: INFO level, tab-separated
# "<time> <level> <message>" records.
logging.basicConfig(
    format="%(asctime)s\t%(levelname)s\t%(message)s",
    level=logging.INFO,
)
logger = logging.getLogger()
16
+
17
class AnsferatuRunner:
    """Drive one command-line invocation of Ansferatu.

    The constructor resolves the start URLs (from ``--file`` or ``--urls``),
    derives the scope when none was given and makes sure the output
    directory exists. ``run()`` then dispatches to the crawl or brute mode.
    """

    def __init__(self, args):
        """Prepare a run from the parsed CLI namespace.

        Args:
            args: ``argparse.Namespace`` produced by ``parse_arguments``.
                Reads ``input_file``, ``url_list_initial``, ``scope_list``
                and ``output_dir``.

        Raises:
            OSError: if the input file cannot be read or the output
                directory cannot be created.
        """
        logger.info("Welcome to Ansferatu")
        # ``now`` is the module-level timestamp taken at import time.
        self.timestamp_start = datetime.timestamp(now)

        self.runner_args = args

        if self.runner_args.input_file:
            # Read-only mode inside a context manager: the handle is always
            # closed and read-only files are accepted ("r+" demanded write
            # permission for no reason).
            with open(self.runner_args.input_file, "r") as url_file:
                raw_lines = url_file.read().splitlines()
            # Blank lines and stray whitespace would otherwise be treated
            # as (empty) URLs further down the pipeline.
            self.url_list = [line.strip() for line in raw_lines if line.strip()]
        else:
            self.url_list = self.runner_args.url_list_initial

        if not self.runner_args.scope_list:
            self.new_scope = self.create_scope()
        else:
            self.new_scope = self.runner_args.scope_list

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.runner_args.output_dir, exist_ok=True)

    def create_scope(self):
        """Return the domain of every start URL, in input order."""
        return [str(Url(url).domain) for url in self.url_list]

    def run(self):
        """Execute the selected mode, then run the result dump step."""
        if self.runner_args.mode == "crawl":
            self.crawl()
        elif self.runner_args.mode == "brute":
            self.brute()
        self.dump_result()

    def crawl(self):
        """Start ``common_crawler`` with the options taken from the CLI."""
        common_crawler(
            self.url_list,
            self.new_scope,
            max_deep=self.runner_args.max_deep,
            exclude_codes_list=self.runner_args.exclude_codes_list,
            visit_count_limit=self.runner_args.visit_count_limit,
            threads=self.runner_args.threads,
            # getattr() keeps namespaces built without the headless options
            # working (e.g. when the runner is constructed programmatically).
            headless=getattr(self.runner_args, 'headless', False),
            fill_forms=getattr(self.runner_args, 'fill_forms', False),
            output_file=self.runner_args.jsonl_output,
            skip_dedup=self.runner_args.skip_dedup,
            headless_workers=getattr(self.runner_args, 'headless_workers', 1),
            max_headless_mem_mb=getattr(self.runner_args, 'max_headless_mem_mb', None),
        )

    def brute(self):
        """Start ``common_brute_from_file`` with the options taken from the CLI."""
        common_brute_from_file(
            self.url_list,
            self.runner_args.exclude_codes_list,
            _dictionary_file=self.runner_args.wordlist_big,
            threads=self.runner_args.threads,
            output_file=self.runner_args.jsonl_output,
            skip_dedup=self.runner_args.skip_dedup,
        )

    def _report_path(self, domain):
        """Return the report file path for *domain* inside the output directory."""
        file_name = f"final-{domain}+{self.timestamp_start}.{self.runner_args.report_mode}"
        # os.path.join copes with output directories given without a
        # trailing separator ("out" no longer becomes "outfinal-...").
        return os.path.join(self.runner_args.output_dir, file_name)

    def dump_result(self):
        """Log the result dump step for every domain in scope.

        NOTE(review): only the target path is computed here; nothing in
        this class writes a report file. Confirm whether the writer lives
        elsewhere or is still to be implemented.
        """
        for domain in self.new_scope:
            logger.info("creating output file...")
            logger.debug("report path: %s", self._report_path(domain))
83
+
84
+
85
def parse_arguments(argv=None):
    """Parse the ``ansferatu`` command line.

    Two sub-commands are available, ``crawl`` and ``brute``. Both inherit
    the shared options (output, scope, threads, input URLs, ...) from a
    common parent parser.

    Args:
        argv: list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: the parsed options. ``mode`` holds the selected
        sub-command name.

    Raises:
        SystemExit: raised by argparse on invalid input, including a
            missing sub-command.
    """
    parser = argparse.ArgumentParser()

    # Usual arguments which are applicable for the whole script / top-level args
    parser.add_argument("--verbose", help="Common top-level parameter", action="store_true", required=False)

    # The sub-command is mandatory: every shared option lives on the
    # sub-parsers, so without one the namespace would lack the attributes
    # the runner reads (input_file, scope_list, ...) and it would crash
    # with AttributeError instead of printing a usage error.
    subparsers = parser.add_subparsers(help="Desired action to perform", dest="mode", required=True)

    # Parent parser holding the options shared by every sub-command.
    # ``add_help=False`` prevents a duplicate -h/--help definition.
    parent_parser = argparse.ArgumentParser(add_help=False)
    parent_parser.add_argument(
        "-o",
        "--output",
        metavar="Output Directory",
        type=str,
        dest="output_dir",
        default="/tmp/",
        help="Output directory for reports. By default: /tmp/",
    )
    parent_parser.add_argument(
        "--jsonl",
        metavar="FILE",
        type=str,
        dest="jsonl_output",
        default=None,
        help="Path to JSONL output file (Nuclei-compatible proxify format)",
    )
    parent_parser.add_argument(
        "-s",
        "--scope",
        metavar="InScope domains",
        dest="scope_list",
        nargs="+",
        default=[],
        help="List of domains that are in scope of current scan. Split by space. "
             "Example: news.example.com blog.example.com admin.example.com",
    )
    parent_parser.add_argument(
        "--report",
        choices=["html", "json"],
        default="html",
        dest="report_mode",
        help="Type of report format",
    )
    parent_parser.add_argument(
        "--exclude_codes",
        default=[403, 404, 401],
        nargs="*",
        # Without type=int, user-supplied codes arrive as strings while the
        # defaults are ints, so the two cases would compare differently.
        type=int,
        dest="exclude_codes_list",
        help="List of filtered HTTP answer's codes",
    )
    parent_parser.add_argument(
        "--threads",
        "-t",
        metavar="Max_threads",
        type=int,
        dest="threads",
        default=10,
        help="Number of HTTP threads",
    )
    parent_parser.add_argument(
        "--sd",
        "--skip-deduplication",
        action="store_true",
        default=False,
        dest="skip_dedup",
        help="Disable response deduplication (keep all responses)",
    )

    # Exactly one source of start URLs must be given.
    group = parent_parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-u",
        "--urls",
        metavar="URLs",
        type=str,
        dest="url_list_initial",
        nargs="*",
        help="URLs or list of URLs for start. Example: https://mail.ru/",
    )
    group.add_argument(
        "-f",
        "--file",
        metavar="Input file with URLs",
        type=str,
        dest="input_file",
        help="Input file contains URLs (1 URL per line)",
    )

    parser_crawl = subparsers.add_parser(
        "crawl",
        parents=[parent_parser],
        help="Crawl the start URLs and collect in-scope links",
    )
    parser_crawl.add_argument(
        "--limit",
        metavar="Limits",
        type=int,
        dest="visit_count_limit",
        default=10,
        help="Limit of request to subdirectories",
    )
    parser_crawl.add_argument(
        "--deep",
        metavar="Max_deep",
        type=int,
        dest="max_deep",
        default=2,
        help="Max deep of request to subdirectories",
    )
    parser_crawl.add_argument(
        "--headless",
        action="store_true",
        default=False,
        help="Enable Playwright headless extraction for root/absolute URLs",
    )
    parser_crawl.add_argument(
        "--fill-forms",
        action="store_true",
        default=False,
        dest="fill_forms",
        help="Enable form detection and interaction (requires --headless). "
             "Detects forms on pages, fills fields with smart defaults, and submits them.",
    )
    parser_crawl.add_argument(
        "--headless-workers",
        metavar="N",
        type=int,
        dest="headless_workers",
        default=1,
        help="Number of parallel headless workers (HeadlessExtractor + FormInteractor). "
             "Each worker spawns its own Chromium (~250MB RAM, ~1 CPU under load). "
             "Default 1 preserves legacy behaviour; raise to 3-5 to drain a backlog faster.",
    )
    parser_crawl.add_argument(
        "--max-headless-mem-mb",
        metavar="MB",
        type=int,
        dest="max_headless_mem_mb",
        default=None,
        # "%%" is a literal percent sign: argparse %-formats help strings.
        help="Hard cap on estimated headless RAM usage (MB). When set and the "
             "estimate (workers * 2 browsers * ~350MB) exceeds it, startup aborts. "
             "When unset, an advisory warning is logged if estimate exceeds 70%% "
             "of currently available RAM.",
    )

    parser_brute = subparsers.add_parser(
        "brute",
        parents=[parent_parser],
        help="Brute-force paths on the start URLs using a wordlist",
    )
    parser_brute.add_argument(
        "-w",
        "--wordlist",
        required=True,
        metavar="=wordlist file",
        type=str,
        dest="wordlist_big",
        help="Path to wordlist file for brute",
    )

    return parser.parse_args(argv)
242
+
243
+
244
def main(argv=None):
    """Console entry point for the ``ansferatu`` command.

    Parses *argv* (``None`` means the process arguments), builds the runner
    from the resulting namespace and executes it.
    """
    AnsferatuRunner(parse_arguments(argv)).run()


if __name__ == "__main__":
    main()
@@ -0,0 +1,309 @@
1
+ from ansferatu import spider
2
+ import re
3
+ from ansferatu.spider.common.url import Url
4
+
5
# Upper bound on how much of a response body is fed to the URL-extraction
# regexes. Rationale given by the original author: running
# ``common_regexp_compiled`` (many alternations, VERBOSE mode) over minified
# JS bundles of 2-10 MB takes seconds of pure-Python regex work while holding
# the GIL, which starves other threads. Bodies above this size are cut to
# their first 500 KB before matching.
MAX_EXTRACT_BODY_BYTES = 1024 * 500

# Response-type labels whose bodies are decoded and searched. A response
# whose detected type is not in this set has only its headers searched.
# ``other`` is the label ``detect_response_type`` gives to a response it
# cannot classify as js / html / robots / sitemap.
_EXTRACTABLE_TYPES = frozenset(("html", "js", "robots", "sitemap", "other"))
20
+
21
+
22
class CommonExtractor(spider.Extractor):
    """Extract follow-up URLs from fetched responses.

    Response headers and, depending on the detected response type, the body
    are searched with regular expressions for URLs, relative paths and bare
    domain names. Every candidate is normalised, expanded with its parent
    directories plus ``sitemap.xml`` / ``robots.txt`` probes, filtered by
    scope and returned as new fetch tasks.
    """

    def __init__(self, max_deep=0, scope_list=None):
        """Compile the extraction patterns and store the crawl limits.

        Args:
            max_deep: stored as ``_max_deep``; not read anywhere in this class.
            scope_list: domains considered in scope; ``None`` means an
                empty list.
        """
        super().__init__()
        if scope_list is None:
            scope_list = []
        self._max_deep = max_deep
        self._scope_list = scope_list

        self.js_content_types = ["text/javascript", "application/x-ecmascript", "application/x-javascript",
                                 "application/javascript", "text/ecmascript", "text/x-ecmascript", "text/x-javascript",
                                 "application/ecmascript", "text/jscript"]

        self.html_content_types = ["text/html"]

        # Regex used in linkfinder.
        # Group 1 is the quoted candidate. The trailing ``href=`` / ``src=``
        # alternative captures in group 6 instead, so ``group(1)`` is None
        # for those matches.
        self.common_regexp_str = r"""

          (?:"|')                               # Start newline delimiter

          (
            ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
            [^"'/]{1,}\.                        # Match a domainname (any character + dot)
            [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path

            |

            ((?:/|\.\./|\./)                    # Start with /,../,./
            [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
            [^"'><,;|()]{1,})                   # Rest of the characters can't be

            |

            ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
            [a-zA-Z0-9_\-/]{1,}                 # Resource name
            \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
            (?:[\?|/][^"|']{0,}|))              # ? mark with parameters

            |

            ([a-zA-Z0-9_\-]{1,}                 # filename
            \.(?:|html|php|txt|htm|aspx|
            asp|js|css|pgsql|mysql|pdf|
            cgi|inc|gif|jpg|swf|xml|cfm|
            xhtml|wmv|zip|axd|gz|png|doc|
            shtml|jsp|ico|exe|csi|config|
            jpeg|ashx|log|xls|old|mp3|tar
            |ini|asa|tgz|flv|php3|bak|rar|
            asmx|xlsx|page|phtml|dll|asax|
            pl|csv|ppt|bmp|sql|new|avi|psd|
            rss|wav|action|db|dat|do|xsl|
            class|mdb|include|cs|htc|mov|
            mpg|rdf|rtf|ascx|files|jar|vb|
            mp4|local|docx|php5|wci|readme|
            cfg|cfc|lck|ttf|jhtml|mpeg|php4|
            tif|json|zif|shtm|sitemap|tmp|
            backup|conf|settings|cab|asx|
            msi|bin|htaccess|java|jsf|bat|
            print|ics|svc|vbs|img|inf|ajax|
            chm|m3u|py|sh|store|webinfo|jad|
            stm|webresource|lock|phps|pptx|
            xsd|crt|hmtl|index|iso|taf|war|
            xslt|go|gpx|ihtml|odt|sample|spider|
            cer|html|lib|lnk|mhtml|pgp|
            text|view|asc|dtd|html1|ogg|out|
            pgt|php|rb|rhtml|wsdl|apsx|asf|
            dot|git|hta|php2|phtm|psql|reg|
            rpm|tiff|cfml|dta|jp|php~|ps|
            raw|svg|svn|thtml|xhtm|aspx|
            bhtml|bml|ca|cache|htmls|htx|
            jpe|jspf|access|app|asd|asm|bak2|
            deb|epub|htlm|jnlp|js2|jspx|
            php1|phpp|pop3|pwd|pyc|session|
            setup|swp|temp)                     # . + extension
            (?:\?[^"|']{0,}|))                  # ? mark with parameters

          )

          (?:"|')                               # End newline delimiter

          |

          (?<=(?<=href=)|(?<=src=))((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})

        """
        self.common_regexp_compiled = re.compile(self.common_regexp_str, re.VERBOSE)

        # NOTE: under re.VERBOSE the literal space after each colon is
        # ignored, so the captured path keeps any leading whitespace;
        # regexp_search_robots strips it.
        self.robots_regexp_str = "Allow: (.*)|Disallow: (.*)"
        self.robots_regexp_compiled = re.compile(self.robots_regexp_str, re.VERBOSE)

        self.sitemap_regexp_str = "<loc>(.*?)</loc>"
        self.sitemap_regexp_compiled = re.compile(self.sitemap_regexp_str, re.VERBOSE)

        self.regex_common2 = re.compile(r'https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', re.VERBOSE)

        # Regex for finding standalone domain names (not full URLs).
        # Targets bare domains in quoted strings - JS configs, arrays, HTML attributes, etc.
        # e.g. "nordvpn.com", 'china-with-nord.org', "sub.domain.example.net"
        # Matches are post-filtered against the ``allowed_tlds`` whitelist.
        self.domain_regexp_str = r"""
          (?:"|')                               # Opening quote
          (?![a-zA-Z]{1,10}://)                 # NOT a full URL (http://, ftp://, etc.)
          (?!//)                                # NOT a protocol-relative URL
          (
            (?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?  # Domain label (RFC-compliant)
            \.){1,10}                           # 1-10 labels, each followed by dot
            [a-zA-Z]{2,15}                      # TLD: 2-15 alpha chars
          )
          (?:"|')                               # Closing quote
        """
        self.domain_regexp_compiled = re.compile(self.domain_regexp_str, re.VERBOSE)

        # Whitelist of popular TLDs - only domains ending with these are accepted.
        # Keeps precision high: no .js/.html/.png false positives possible.
        self.allowed_tlds = {
            # Classic gTLDs
            'com', 'org', 'net', 'edu', 'gov', 'biz',
            # New gTLDs (tech / startup / business)
            'io', 'ai', 'app', 'dev', 'co',
            'xyz', 'tech', 'site', 'online', 'website', 'cloud',
            'network', 'group', 'review',
            'security', 'services', 'solutions', 'support', 'systems',
            'wiki',
            # Major country-code TLDs (Europe)
            'uk', 'de', 'fr', 'nl', 'it', 'es', 'pt', 'eu', 'ch', 'at', 'be',
            'se', 'no', 'dk', 'fi', 'ie', 'pl', 'cz', 'ro', 'hu', 'bg', 'hr',
            'sk', 'si', 'lt', 'lv', 'ee', 'gr', 'ua', 'by', 'ru',
            # Major country-code TLDs (Americas); 'co' is already listed above
            'us', 'ca', 'mx', 'br', 'ar', 'cl',
            # Major country-code TLDs (Asia-Pacific)
            'cn', 'jp', 'kr', 'in', 'au', 'nz', 'sg', 'hk', 'tw', 'th',
            'ph', 'id', 'my', 'vn', 'pk',
            # Major country-code TLDs (Middle East / Africa)
            'il', 'ae', 'sa', 'tr', 'eg', 'za', 'ng', 'ke',
        }

    def regexp_search_common(self, data):
        """Return URL-like candidates found in *data*.

        The result concatenates group 1 of every linkfinder match with the
        full text of every ``regex_common2`` match. Entries may be ``None``
        (matches of the unquoted ``href=`` / ``src=`` alternative have no
        group 1); ``htm_extract`` drops them.
        """
        items = [m.group(1) for m in self.common_regexp_compiled.finditer(data)]
        items2 = [m.group(0) for m in self.regex_common2.finditer(data)]
        return items + items2

    def regexp_search_domains(self, data):
        """Find standalone domain names (not full URLs) in text.

        Targets bare domains in quoted strings. Only domains whose TLD is in
        the ``allowed_tlds`` whitelist are kept.

        Returns:
            list[str]: deduplicated, lowercased domains in arbitrary order.
        """
        found = set()
        for match in self.domain_regexp_compiled.finditer(data):
            domain = match.group(1)
            tld = domain.rsplit('.', 1)[-1].lower()
            if tld in self.allowed_tlds:
                found.add(domain.lower())
        return list(found)

    def regexp_search_robots(self, data):
        """Return the paths of all ``Allow:`` and ``Disallow:`` rules in *data*.

        ``Allow`` paths land in group 1 and ``Disallow`` paths in group 2 of
        the pattern; reading only group 2 used to turn every ``Allow`` rule
        into ``None``. Surrounding whitespace (including a trailing ``\\r``)
        is stripped and empty rules are skipped.
        """
        items = []
        for match in self.robots_regexp_compiled.finditer(data):
            path = (match.group(1) or match.group(2) or "").strip()
            if path:
                items.append(path)
        return items

    def regexp_search_sitemap(self, data):
        """Return the content of every ``<loc>...</loc>`` element in *data*."""
        return [m.group(1) for m in self.sitemap_regexp_compiled.finditer(data)]

    def detect_response_type(self, response):
        """Classify *response* as ``js``, ``html``, ``robots``, ``sitemap`` or ``other``.

        The decision uses the media type of the ``Content-Type`` header and
        substrings of the URL. A response without that header is classified
        from its URL alone and falls back to ``other``. Previously the method
        returned ``None`` for such responses, so their bodies were skipped
        entirely.

        Returns:
            str: one of the five labels above; never ``None``.
        """
        _url = str(response.url)

        if "Content-Type" in response.headers:
            # Drop parameters such as "; charset=utf-8" and normalise.
            content_type = response.headers["Content-Type"].split(';')[0].strip().lower()
        else:
            content_type = ""

        if content_type in self.js_content_types or ".js" in _url:
            return "js"
        if content_type in self.html_content_types or ".html" in _url:
            return "html"
        if "robots.txt" in _url:
            return "robots"
        if content_type == "text/xml" and "sitemap" in _url:
            return "sitemap"
        # NOTE(review): unrecognised media types (images, JSON, ...) also end
        # up here, and "other" is in _EXTRACTABLE_TYPES, so their bodies are
        # still decoded and searched for domains - confirm this is intended.
        return "other"

    def htm_extract(self, task_extract: spider.TaskExtract) -> spider.ResultExtract:
        """Build the follow-up fetch tasks for one fetched response.

        Args:
            task_extract: extraction task; its ``content`` attribute is the
                response object (``url``, ``headers``, ``content``,
                ``encoding`` and ``text`` are read).

        Returns:
            spider.ResultExtract: ``state_code=1`` plus one ``TaskFetch`` per
            unique, legal, in-scope URL.
        """
        response = task_extract.content
        url_now = response.url
        html_headers = str(response.headers)

        ## STEP 1: Define response type
        response_type = self.detect_response_type(response)

        ## STEP 2: Finding new URLs with regexp
        regexp_headers = self.regexp_search_common(html_headers)

        if response_type not in _EXTRACTABLE_TYPES:
            # Fast path: no body decode and no regex sweep for types the
            # extractor does not mine.
            regexp_body = []
            new_domains2 = []
        else:
            # Check the raw size before decoding so that oversized bodies
            # never reach the regexes in full.
            try:
                body_size = len(response.content) if response.content is not None else 0
            except Exception:
                # Deliberate best effort: an unreadable body counts as empty.
                body_size = 0

            if body_size > MAX_EXTRACT_BODY_BYTES:
                # Decode only the prefix. The original note says the fetcher
                # primes ``response.encoding`` to utf-8; utf-8 is also the
                # fallback used here.
                encoding = response.encoding or "utf-8"
                html_text = response.content[:MAX_EXTRACT_BODY_BYTES].decode(
                    encoding, errors="replace")
            else:
                html_text = str(response.text)

            if response_type == "robots":
                regexp_body = self.regexp_search_robots(html_text)
            elif response_type == "sitemap":
                regexp_body = self.regexp_search_sitemap(html_text)
            elif response_type in ("js", "html"):
                regexp_body = self.regexp_search_common(html_text)
            else:  # "other": only bare domains are mined from the body
                regexp_body = []

            new_domains2 = self.regexp_search_domains(html_text)

        ## Extract new domains and create new urls:
        new_domains1 = self.regexp_search_domains(html_headers)
        new_domains = new_domains1 + new_domains2

        new_urls = []
        for domain in new_domains:
            base = "https://" + domain + "/"
            new_urls.append(base)
            new_urls.append(base + "sitemap.xml")
            new_urls.append(base + "robots.txt")

        ## STEP 3: Garbage filtration
        url_list_regexp = []
        for _url in (regexp_body + regexp_headers):
            if _url is None:
                continue
            _url_ = spider.get_url_legal(_url, base_url=url_now)
            url_list_regexp.append(_url_)
            # Also queue every parent directory of the discovered URL.
            url_list_regexp.extend(Url(_url_).list_all_dirs())

        ## STEP 4: Add absolute urls, sitemaps and robots to url list
        additional_url_list = []
        for url1 in url_list_regexp:
            absl_url = Url(url1).absolute
            additional_url_list.append(absl_url)
            additional_url_list.append(absl_url + "sitemap.xml")
            additional_url_list.append(absl_url + "robots.txt")

        ## STEP 5: Combine all lists
        combined_url_list = url_list_regexp + additional_url_list + new_urls

        # The set comprehension deduplicates; scope and legality are checked per URL.
        final_url_list = {_url for _url in combined_url_list if
                          spider.check_url_legal_and_in_scope(_url, self._scope_list)}
        task_fetch_list = [spider.TaskFetch.from_task_extract(task_extract, url_new=url) for url in final_url_list]

        return spider.ResultExtract(state_code=1, task_fetch_list=task_fetch_list)
309
+