ansferatu 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ansferatu-0.1.0/LICENSE +21 -0
- ansferatu-0.1.0/PKG-INFO +409 -0
- ansferatu-0.1.0/README.md +353 -0
- ansferatu-0.1.0/ansferatu/__init__.py +19 -0
- ansferatu-0.1.0/ansferatu/__main__.py +4 -0
- ansferatu-0.1.0/ansferatu/cli.py +252 -0
- ansferatu-0.1.0/ansferatu/profiles/CommonExtractor.py +309 -0
- ansferatu-0.1.0/ansferatu/profiles/CommonFetcher.py +112 -0
- ansferatu-0.1.0/ansferatu/profiles/CommonHTMLHandler.py +174 -0
- ansferatu-0.1.0/ansferatu/profiles/FileSaver.py +43 -0
- ansferatu-0.1.0/ansferatu/profiles/FormDetector.py +526 -0
- ansferatu-0.1.0/ansferatu/profiles/FormFiller.py +355 -0
- ansferatu-0.1.0/ansferatu/profiles/FormFilter.py +45 -0
- ansferatu-0.1.0/ansferatu/profiles/HeadlessCandidate.py +171 -0
- ansferatu-0.1.0/ansferatu/profiles/HeadlessExtractor.py +171 -0
- ansferatu-0.1.0/ansferatu/profiles/HeadlessFormInteractor.py +418 -0
- ansferatu-0.1.0/ansferatu/profiles/JsonlWriter.py +87 -0
- ansferatu-0.1.0/ansferatu/profiles/MyProxies.py +18 -0
- ansferatu-0.1.0/ansferatu/profiles/ResponseFilter.py +62 -0
- ansferatu-0.1.0/ansferatu/profiles/UrlFilter.py +46 -0
- ansferatu-0.1.0/ansferatu/profiles/VisitLimit.py +112 -0
- ansferatu-0.1.0/ansferatu/profiles/__init__.py +0 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/__init__.py +1 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/actions.py +432 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/browser_lifecycle.py +76 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/buttons.py +216 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/constants.py +48 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/field_classifier.py +469 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/filters.py +56 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/form_classifier.py +98 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/overlay.py +129 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/signature.py +64 -0
- ansferatu-0.1.0/ansferatu/profiles/form_helpers/visited_forms.py +47 -0
- ansferatu-0.1.0/ansferatu/profiles/modes.py +149 -0
- ansferatu-0.1.0/ansferatu/profiles/network_constants.py +35 -0
- ansferatu-0.1.0/ansferatu/profiles/resource_check.py +91 -0
- ansferatu-0.1.0/ansferatu/profiles/response_dedup.py +196 -0
- ansferatu-0.1.0/ansferatu/spider/__init__.py +8 -0
- ansferatu-0.1.0/ansferatu/spider/common/__init__.py +0 -0
- ansferatu-0.1.0/ansferatu/spider/common/url.py +512 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/__init__.py +8 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/__init__.py +14 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/base.py +180 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/extract.py +40 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/fetch.py +72 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/form_interact.py +44 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/headless.py +44 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/html_handle.py +39 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/proxies.py +33 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/save.py +38 -0
- ansferatu-0.1.0/ansferatu/spider/concurrent/threads_pool.py +356 -0
- ansferatu-0.1.0/ansferatu/spider/instances/__init__.py +12 -0
- ansferatu-0.1.0/ansferatu/spider/instances/inst_extract.py +29 -0
- ansferatu-0.1.0/ansferatu/spider/instances/inst_fetch.py +46 -0
- ansferatu-0.1.0/ansferatu/spider/instances/inst_form_interact.py +31 -0
- ansferatu-0.1.0/ansferatu/spider/instances/inst_headless.py +31 -0
- ansferatu-0.1.0/ansferatu/spider/instances/inst_html_handle.py +29 -0
- ansferatu-0.1.0/ansferatu/spider/instances/inst_proxies.py +31 -0
- ansferatu-0.1.0/ansferatu/spider/instances/inst_save.py +32 -0
- ansferatu-0.1.0/ansferatu/spider/utilities/__init__.py +10 -0
- ansferatu-0.1.0/ansferatu/spider/utilities/cfilter.py +44 -0
- ansferatu-0.1.0/ansferatu/spider/utilities/cresult.py +133 -0
- ansferatu-0.1.0/ansferatu/spider/utilities/ctask.py +179 -0
- ansferatu-0.1.0/ansferatu/spider/utilities/functions.py +84 -0
- ansferatu-0.1.0/ansferatu/spider/wappalyzer/__init__.py +0 -0
- ansferatu-0.1.0/ansferatu/spider/wappalyzer/all.json +15481 -0
- ansferatu-0.1.0/ansferatu/spider/wappalyzer/functional.json +295 -0
- ansferatu-0.1.0/ansferatu/spider/wappalyzer/scanner.json +700 -0
- ansferatu-0.1.0/ansferatu/spider/wappalyzer/wappalyzer.py +249 -0
- ansferatu-0.1.0/ansferatu/spider/wappalyzer/webpage.py +118 -0
- ansferatu-0.1.0/ansferatu.egg-info/PKG-INFO +409 -0
- ansferatu-0.1.0/ansferatu.egg-info/SOURCES.txt +82 -0
- ansferatu-0.1.0/ansferatu.egg-info/dependency_links.txt +1 -0
- ansferatu-0.1.0/ansferatu.egg-info/entry_points.txt +2 -0
- ansferatu-0.1.0/ansferatu.egg-info/requires.txt +15 -0
- ansferatu-0.1.0/ansferatu.egg-info/top_level.txt +1 -0
- ansferatu-0.1.0/pyproject.toml +53 -0
- ansferatu-0.1.0/setup.cfg +4 -0
- ansferatu-0.1.0/tests/test_browser_lifecycle.py +215 -0
- ansferatu-0.1.0/tests/test_headless_candidate.py +490 -0
- ansferatu-0.1.0/tests/test_resource_check.py +95 -0
- ansferatu-0.1.0/tests/test_response_dedup.py +1004 -0
- ansferatu-0.1.0/tests/test_runner.py +169 -0
- ansferatu-0.1.0/tests/test_thread_safety.py +201 -0
ansferatu-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 frostbits-security
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
ansferatu-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ansferatu
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multifunctional tool for HTTP reconnaissance, web crawling and web directory bruteforce.
|
|
5
|
+
Author: frostbits-security
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2022 frostbits-security
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/frostbits-security/ansferatu
|
|
29
|
+
Project-URL: Repository, https://github.com/frostbits-security/ansferatu
|
|
30
|
+
Keywords: crawler,spider,bruteforce,reconnaissance,security,web
|
|
31
|
+
Classifier: Development Status :: 4 - Beta
|
|
32
|
+
Classifier: Intended Audience :: Information Technology
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Operating System :: OS Independent
|
|
35
|
+
Classifier: Programming Language :: Python :: 3
|
|
36
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
37
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
38
|
+
Classifier: Topic :: Security
|
|
39
|
+
Requires-Python: >=3.8
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
License-File: LICENSE
|
|
42
|
+
Requires-Dist: requests
|
|
43
|
+
Requires-Dist: urllib3
|
|
44
|
+
Requires-Dist: beautifulsoup4
|
|
45
|
+
Requires-Dist: simhash
|
|
46
|
+
Requires-Dist: tldextract
|
|
47
|
+
Requires-Dist: validators
|
|
48
|
+
Requires-Dist: psutil
|
|
49
|
+
Provides-Extra: headless
|
|
50
|
+
Requires-Dist: playwright; extra == "headless"
|
|
51
|
+
Provides-Extra: dev
|
|
52
|
+
Requires-Dist: pytest; extra == "dev"
|
|
53
|
+
Requires-Dist: build; extra == "dev"
|
|
54
|
+
Requires-Dist: twine; extra == "dev"
|
|
55
|
+
Dynamic: license-file
|
|
56
|
+
|
|
57
|
+
Multifunctional tool for http reconnaissance, web crawling, web directory bruteforce.
|
|
58
|
+
Based on [PSpider](https://github.com/xianhu/PSpider)
|
|
59
|
+
|
|
60
|
+
Killer features:
|
|
61
|
+
1. Fast multiurl crawling
|
|
62
|
+
2. Fast multiurl directory bruteforce
|
|
63
|
+
3. Find new domains without DNS bruteforce. (for example https://mail.ru --> 105 Domains of *.mail.ru)
|
|
64
|
+
4. To Do: dynamic dictionary creation for brute-force
|
|
65
|
+
5. To Do: deduplication based on Simhash
|
|
66
|
+
6. Headless browsing and form filling as an additional option
|
|
67
|
+
7. To Do: add proper output to jsonl + html reports
|
|
68
|
+
8. To Do: Collect query parameters (for get and post)
|
|
69
|
+
9. To Do: better deduplication based on page hash
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
### Installation
|
|
75
|
+
|
|
76
|
+
Ansferatu is a regular Python package. It requires Python 3.8+.
|
|
77
|
+
|
|
78
|
+
**From PyPI:**
|
|
79
|
+
```bash
|
|
80
|
+
pip3 install ansferatu
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
**From source / GitHub:**
|
|
84
|
+
```bash
|
|
85
|
+
pip3 install git+https://github.com/frostbits-security/ansferatu.git
|
|
86
|
+
# or, from a local checkout:
|
|
87
|
+
pip3 install .
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Headless / form-filling support (optional).** The `--headless` and
|
|
91
|
+
`--fill-forms` modes rely on [Playwright](https://playwright.dev/python/).
|
|
92
|
+
Install the optional extra and download the Chromium runtime:
|
|
93
|
+
```bash
|
|
94
|
+
pip3 install 'ansferatu[headless]'
|
|
95
|
+
playwright install chromium
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Installing the package exposes an `ansferatu` console command (equivalent to
|
|
99
|
+
`python3 -m ansferatu`).
|
|
100
|
+
|
|
101
|
+
### How to run
|
|
102
|
+
|
|
103
|
+
After installation, run via the `ansferatu` command:
|
|
104
|
+
```bash
|
|
105
|
+
ansferatu crawl --url https://mail.ru -o ./results/ --limit 1
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
#### Use as a library
|
|
109
|
+
|
|
110
|
+
The package can be imported into other Python tools:
|
|
111
|
+
```python
|
|
112
|
+
from ansferatu import common_crawler, common_brute_from_file
|
|
113
|
+
|
|
114
|
+
common_crawler(
|
|
115
|
+
url_list=["https://example.com"],
|
|
116
|
+
scope=["example.com"],
|
|
117
|
+
exclude_codes_list=[403, 404, 401],
|
|
118
|
+
visit_count_limit=10,
|
|
119
|
+
max_deep=2,
|
|
120
|
+
threads=10,
|
|
121
|
+
output_file="results.jsonl",
|
|
122
|
+
)
|
|
123
|
+
```
|
|
124
|
+
For lower-level control, build the spider directly:
|
|
125
|
+
```python
|
|
126
|
+
from ansferatu.spider import WebSpider, TaskFetch
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
#### Docker
|
|
130
|
+
|
|
131
|
+
Build docker image:
|
|
132
|
+
```bash
|
|
133
|
+
docker build -t ansferatu .
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Run the container (the image's entrypoint is the `ansferatu` command):
|
|
137
|
+
```bash
|
|
138
|
+
docker run --rm -it -v /tmp/ansferatu_out:/ansferatu/results ansferatu \
|
|
139
|
+
crawl --url https://mail.ru -o /ansferatu/results/ --limit 1
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
#### Modes
|
|
143
|
+
|
|
144
|
+
**crawl** - run crawl for web sites. Main parameter is "visit_count_limit"
|
|
145
|
+
```
|
|
146
|
+
ansferatu crawl --url https://deti.mail.ru -o /home/sabotaged/BB/mail.ru/
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**crawl --headless** - same crawl but with Playwright headless extraction for qualifying pages.
|
|
150
|
+
Requires the headless extra: `pip install 'ansferatu[headless]' && playwright install chromium`.
|
|
151
|
+
```bash
|
|
152
|
+
ansferatu crawl --headless --url https://example.com -o ./results/
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**crawl --fill-forms** - extends headless crawl with form detection and interaction.
|
|
156
|
+
Detects `<form>` elements on pages, fills fields with smart defaults (email, password, search, etc.),
|
|
157
|
+
submits forms and clicks buttons, then captures the resulting POST responses and new URLs.
|
|
158
|
+
Implies `--headless`.
|
|
159
|
+
```bash
|
|
160
|
+
ansferatu crawl --fill-forms --url https://example.com -o ./results/
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
**brute** - classic web directories bruteforce. Needs wordlist.
|
|
164
|
+
```bash
|
|
165
|
+
ansferatu brute --url https://news.mail.ru -w ./wordlists/fuzz_big.txt -o /home/sabotaged/BB/mail.ru/
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
#### Modes task flow (queues and owners)
|
|
169
|
+
|
|
170
|
+
**crawl** puts start tasks into `QueueFetch`, then the queues are filled and drained by the workers shown below:
|
|
171
|
+
```mermaid
|
|
172
|
+
flowchart LR
|
|
173
|
+
start([Start Task]) -->|set_start_task| qf[QueueFetch<br/>priority keys deep url repeat]
|
|
174
|
+
qf --> fetchers[Fetchers<br/>multi-threading]
|
|
175
|
+
fetchers -->|TaskExtract| qe[QueueExtract<br/>priority keys deep url content]
|
|
176
|
+
fetchers -->|TaskHTMLHandle| qh[QueueHTMLHandle<br/>priority keys deep url content]
|
|
177
|
+
qe --> extractor[Extractor]
|
|
178
|
+
extractor -->|TaskFetch| qf
|
|
179
|
+
qh --> html[HTML Handler]
|
|
180
|
+
html -->|TaskSave if item| qs[QueueSave<br/>priority keys deep url item]
|
|
181
|
+
qs --> saver[Saver]
|
|
182
|
+
|
|
183
|
+
proxieser[Proxieser] -.->|optional| qp[QueueProxies]
|
|
184
|
+
qp -.->|optional| fetchers
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
**crawl --headless** extends the regular crawl with a Playwright-based headless browser pipeline.
|
|
188
|
+
Qualifying pages (decided by `HeadlessCandidate`) are routed to a single-threaded headless
|
|
189
|
+
engine instead of the normal Extractor + HTML Handler path. The headless engine intercepts
|
|
190
|
+
CDP network events to discover URLs and captures the fully-rendered page for the HTML Handler.
|
|
191
|
+
|
|
192
|
+
```mermaid
|
|
193
|
+
flowchart LR
|
|
194
|
+
start([Start Task]) -->|set_start_task| qf[QueueFetch<br/>priority keys deep url repeat]
|
|
195
|
+
qf --> fetchers[Fetchers<br/>multi-thread]
|
|
196
|
+
|
|
197
|
+
fetchers -->|HeadlessCandidate?| decision{is<br/>candidate?}
|
|
198
|
+
|
|
199
|
+
decision -->|No| qe[QueueExtract]
|
|
200
|
+
decision -->|No| qh[QueueHTMLHandle]
|
|
201
|
+
decision -->|Yes| qhl[QueueHeadless<br/>dedup: VisitLimit]
|
|
202
|
+
|
|
203
|
+
qhl --> headless[HeadlessThread<br/>single thread<br/>Playwright + CDP]
|
|
204
|
+
|
|
205
|
+
headless -->|intercepted URLs<br/>TaskFetch| qf
|
|
206
|
+
headless -->|normalized page<br/>TaskHTMLHandle| qh
|
|
207
|
+
|
|
208
|
+
qe --> extractor[Extractor]
|
|
209
|
+
extractor -->|TaskFetch| qf
|
|
210
|
+
|
|
211
|
+
qh --> html[HTML Handler<br/>_normalize_content]
|
|
212
|
+
html -->|TaskSave| qs[QueueSave]
|
|
213
|
+
qs --> saver[Saver]
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Key points:
|
|
217
|
+
- **HeadlessCandidate** decides which fetched pages qualify. Currently: root/index-like URLs
|
|
218
|
+
(`is_absolute`) and HTML responses with status 200/301/302.
|
|
219
|
+
- **HeadlessExtractor** (Playwright) uses lazy browser init on the worker thread to avoid
|
|
220
|
+
thread-affinity issues. It hooks `page.on("request")` to capture all network URLs,
|
|
221
|
+
then returns both discovered `TaskFetch` items and a `TaskHTMLHandle` with a normalized
|
|
222
|
+
dict (`status_code`, `url`, `html_text`, `headers`, `title`, etc.).
|
|
223
|
+
- **CommonHTMLHandler** accepts both `requests.Response` objects (regular path) and the
|
|
224
|
+
normalized dict (headless path) via `_normalize_content()`.
|
|
225
|
+
- **Deduplication**: `VisitLimit.check_headless_visited()` prevents the same URL from being
|
|
226
|
+
sent to headless twice. `UrlFilter` continues to deduplicate the fetch queue as usual.
|
|
227
|
+
- When a fetched URL qualifies for headless, it skips the regular Extractor and HTML Handler;
|
|
228
|
+
only the headless pipeline processes it.
|
|
229
|
+
|
|
230
|
+
**crawl --fill-forms** extends the headless pipeline with a two-phase form interaction system.
|
|
231
|
+
Phase 1 (cheap): `HeadlessExtractor` calls `FormDetector.detect(page)` on the already-loaded page
|
|
232
|
+
to produce universal form descriptors. Phase 2 (expensive, deferred): `HeadlessFormInteractor`
|
|
233
|
+
picks up form tasks from a dedicated queue, opens the page in a separate browser, fills fields
|
|
234
|
+
via `FormFiller`, submits, and captures results.
|
|
235
|
+
|
|
236
|
+
```mermaid
|
|
237
|
+
flowchart LR
|
|
238
|
+
start([Start Task]) -->|set_start_task| qf[QueueFetch<br/>priority keys deep url repeat]
|
|
239
|
+
qf --> fetchers[Fetchers<br/>multi-thread]
|
|
240
|
+
|
|
241
|
+
fetchers -->|HeadlessCandidate?| decision{is<br/>candidate?}
|
|
242
|
+
|
|
243
|
+
decision -->|No| qe[QueueExtract]
|
|
244
|
+
decision -->|No| qh[QueueHTMLHandle]
|
|
245
|
+
decision -->|Yes| qhl[QueueHeadless<br/>dedup: VisitLimit]
|
|
246
|
+
|
|
247
|
+
qhl --> headless[HeadlessThread<br/>single thread<br/>Playwright + CDP]
|
|
248
|
+
|
|
249
|
+
headless -->|intercepted URLs<br/>TaskFetch| qf
|
|
250
|
+
headless -->|normalized page<br/>TaskHTMLHandle| qh
|
|
251
|
+
headless -->|form descriptors<br/>TaskFormInteract| qfi[QueueFormInteract]
|
|
252
|
+
|
|
253
|
+
qfi --> forminteract[FormInteractThread<br/>single thread<br/>separate Playwright browser]
|
|
254
|
+
forminteract -->|POST response URLs<br/>TaskFetch| qf
|
|
255
|
+
forminteract -->|POST response page<br/>TaskHTMLHandle| qh
|
|
256
|
+
|
|
257
|
+
qe --> extractor[Extractor]
|
|
258
|
+
extractor -->|TaskFetch| qf
|
|
259
|
+
|
|
260
|
+
qh --> html[HTML Handler<br/>_normalize_content]
|
|
261
|
+
html -->|TaskSave| qs[QueueSave]
|
|
262
|
+
qs --> saver[Saver]
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
Key points for form interaction:
|
|
266
|
+
- **FormDetector** scans the already-loaded page DOM for `<form>` elements. Pure detection,
|
|
267
|
+
no extra navigation (~50ms overhead). Returns universal form descriptors.
|
|
268
|
+
- **Form descriptor schema**: `{form_selector, action, method, fields[], buttons[], page_url}`.
|
|
269
|
+
Designed to be self-contained so `HeadlessFormInteractor` needs no extra DOM inspection.
|
|
270
|
+
- **FormFiller** maps input types/names to smart defaults (email, password, search, etc.).
|
|
271
|
+
Supports custom value overrides via dict.
|
|
272
|
+
- **HeadlessFormInteractor** runs in a dedicated thread with its own Playwright browser.
|
|
273
|
+
It navigates to the page, fills fields, submits/clicks, and captures network traffic +
|
|
274
|
+
the resulting page data. Results flow back through the normal URL_FETCH and HTM_HANDLE queues.
|
|
275
|
+
- **Budget cap**: `FormDetector.max_forms_per_page` (default 5) and
|
|
276
|
+
`HeadlessFormInteractor.max_interactions_per_page` prevent runaway on form-heavy pages.
|
|
277
|
+
- The form interaction pipeline is fully independent from the headless extraction pipeline —
|
|
278
|
+
separate queue, separate thread, separate browser instance.
|
|
279
|
+
|
|
280
|
+
**brute** skips extraction and only handles/saves results from fetches:
|
|
281
|
+
```mermaid
|
|
282
|
+
flowchart LR
|
|
283
|
+
start([Start Task]) -->|set_start_task| qf[QueueFetch<br/>priority keys deep url repeat]
|
|
284
|
+
qf --> fetchers[Fetchers<br/>multi-threading]
|
|
285
|
+
fetchers -->|TaskHTMLHandle| qh[QueueHTMLHandle<br/>priority keys deep url content]
|
|
286
|
+
qh --> html[HTML Handler]
|
|
287
|
+
html -->|TaskSave if item| qs[QueueSave<br/>priority keys deep url item]
|
|
288
|
+
qs --> saver[Saver]
|
|
289
|
+
|
|
290
|
+
proxieser[Proxieser] -.->|optional| qp[QueueProxies]
|
|
291
|
+
qp -.->|optional| fetchers
|
|
292
|
+
```
|
|
293
|
+
#### How to change settings
|
|
294
|
+
Besides parsing the console arguments, ansferatu has a settings file for:
|
|
295
|
+
- blacklist extensions for requests
|
|
296
|
+
- blacklist extensions for parsing
|
|
297
|
+
- HTTP request workers num
|
|
298
|
+
- CPU consumed workers num
|
|
299
|
+
- HTTP error_limit
|
|
300
|
+
- limit of requests to one host
|
|
301
|
+
- HTTP request headers
|
|
302
|
+
- ignored content-types for report
|
|
303
|
+
- deduplication mode
|
|
304
|
+
|
|
305
|
+
The default file is stored in modules\settings\default_config.yaml
|
|
306
|
+
|
|
307
|
+
If you want to update settings, it's best to copy the file modules\settings\default_config.yaml to modules\settings\config.yaml and then edit config.yaml file.
|
|
308
|
+
|
|
309
|
+
#### How we avoid loops
|
|
310
|
+
|
|
311
|
+
`checkRecursion()` - checks whether something has gone wrong and requests have started repeating the same path again and again, like: /blog/article/blog/article/... This sometimes happens because the URL extraction process is imperfect.
|
|
312
|
+
|
|
313
|
+
`check_limits()` - checks how many times we have accessed the parent directory.
|
|
314
|
+
How it works. Let's use http://www.example.com/blog/articles/my_article_1.php as example.
|
|
315
|
+
1. We check how many times we visit http://www.example.com/blog/articles/
|
|
316
|
+
2. If it crosses crawl_limit, we mark this path as over_limit_pages.
|
|
317
|
+
3. We add +1 to the crawl limit counter of the upper path (http://www.example.com/blog/).
|
|
318
|
+
4. Go to step 1 (if this path also contains a large number of URLs, we avoid this loop too)
|
|
319
|
+
|
|
320
|
+
Step by step, we eventually ban visiting this website once all limits have been crossed.
|
|
321
|
+
|
|
322
|
+
#### How retries work
|
|
323
|
+
We have two types of error limit:
|
|
324
|
+
1. For retrying a URL
|
|
325
|
+
2. For adding the same URL to the queue again
|
|
326
|
+
|
|
327
|
+
Retries limit should be less than error limit.
|
|
328
|
+
|
|
329
|
+
When we get a connection error for a URL, we retry it until the retries limit is reached and then leave this URL alone for a while.
|
|
330
|
+
Then we continue to add URLs to the queue (maybe the host starts answering after a while), and if it is still unavailable we ban it. But if the URL answers, we reset the count.
|
|
331
|
+
|
|
332
|
+
#### Wappalyzer role
|
|
333
|
+
|
|
334
|
+
Wappalyzer works with the app.json file. This file contains a regexp database for searching for anything in the server response (cookies, headers, scripts, text in HTML, etc.).
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
The idea is to use Wappalyzer’s regex engine for “bad place” searching:
|
|
338
|
+
|
|
339
|
+
- All inputs
|
|
340
|
+
```
|
|
341
|
+
<input type="email">
|
|
342
|
+
<input type="password">
|
|
343
|
+
<input type="search">
|
|
344
|
+
<input type="submit">
|
|
345
|
+
```
|
|
346
|
+
- SSRF
|
|
347
|
+
```
|
|
348
|
+
formcontrolname="url"
|
|
349
|
+
```
|
|
350
|
+
- Submit buttons
|
|
351
|
+
```
|
|
352
|
+
<button class="aa" type="submit">Search</button>
|
|
353
|
+
```
|
|
354
|
+
- File uploads
|
|
355
|
+
```
|
|
356
|
+
<input type="file">
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
Wappalyzer could be used as a simple vulnerability scanner:
|
|
360
|
+
1. Send specific request
|
|
361
|
+
2. Regexp search in server's answer.
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
#### Deduplication
|
|
365
|
+
- Content length + word_count
|
|
366
|
+
- Content length prediction (not fully tested)
|
|
367
|
+
- To Do: Similarity check
|
|
368
|
+
- Check changes in HTML (search for new functions)
|
|
369
|
+
|
|
370
|
+
### Development
|
|
371
|
+
|
|
372
|
+
Editable install (changes to the source are picked up immediately):
|
|
373
|
+
```bash
|
|
374
|
+
pip3 install -e '.[headless,dev]'
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
Run the test suite:
|
|
378
|
+
```bash
|
|
379
|
+
pytest
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
### Building & publishing to PyPI
|
|
383
|
+
|
|
384
|
+
The project is configured with `pyproject.toml` (PEP 621). To build the
|
|
385
|
+
distribution artifacts (source distribution + wheel):
|
|
386
|
+
```bash
|
|
387
|
+
pip3 install build
|
|
388
|
+
python3 -m build # writes dist/ansferatu-<version>.tar.gz and .whl
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
Validate and upload with [Twine](https://twine.readthedocs.io/):
|
|
392
|
+
```bash
|
|
393
|
+
pip3 install twine
|
|
394
|
+
twine check dist/*
|
|
395
|
+
|
|
396
|
+
# Test upload first (recommended): https://test.pypi.org
|
|
397
|
+
twine upload --repository testpypi dist/*
|
|
398
|
+
|
|
399
|
+
# Real upload
|
|
400
|
+
twine upload dist/*
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
Notes:
|
|
404
|
+
- Bump `version` in `pyproject.toml` (and `__version__` in `ansferatu/__init__.py`)
|
|
405
|
+
before each release; PyPI rejects re-uploads of an existing version.
|
|
406
|
+
- Uploading requires a PyPI account and an API token (configure it via
|
|
407
|
+
`~/.pypirc` or the `TWINE_USERNAME=__token__` / `TWINE_PASSWORD=<token>`
|
|
408
|
+
environment variables).
|
|
409
|
+
- The package name `ansferatu` must be available on PyPI for the first upload.
|