ansferatu 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. ansferatu-0.1.0/LICENSE +21 -0
  2. ansferatu-0.1.0/PKG-INFO +409 -0
  3. ansferatu-0.1.0/README.md +353 -0
  4. ansferatu-0.1.0/ansferatu/__init__.py +19 -0
  5. ansferatu-0.1.0/ansferatu/__main__.py +4 -0
  6. ansferatu-0.1.0/ansferatu/cli.py +252 -0
  7. ansferatu-0.1.0/ansferatu/profiles/CommonExtractor.py +309 -0
  8. ansferatu-0.1.0/ansferatu/profiles/CommonFetcher.py +112 -0
  9. ansferatu-0.1.0/ansferatu/profiles/CommonHTMLHandler.py +174 -0
  10. ansferatu-0.1.0/ansferatu/profiles/FileSaver.py +43 -0
  11. ansferatu-0.1.0/ansferatu/profiles/FormDetector.py +526 -0
  12. ansferatu-0.1.0/ansferatu/profiles/FormFiller.py +355 -0
  13. ansferatu-0.1.0/ansferatu/profiles/FormFilter.py +45 -0
  14. ansferatu-0.1.0/ansferatu/profiles/HeadlessCandidate.py +171 -0
  15. ansferatu-0.1.0/ansferatu/profiles/HeadlessExtractor.py +171 -0
  16. ansferatu-0.1.0/ansferatu/profiles/HeadlessFormInteractor.py +418 -0
  17. ansferatu-0.1.0/ansferatu/profiles/JsonlWriter.py +87 -0
  18. ansferatu-0.1.0/ansferatu/profiles/MyProxies.py +18 -0
  19. ansferatu-0.1.0/ansferatu/profiles/ResponseFilter.py +62 -0
  20. ansferatu-0.1.0/ansferatu/profiles/UrlFilter.py +46 -0
  21. ansferatu-0.1.0/ansferatu/profiles/VisitLimit.py +112 -0
  22. ansferatu-0.1.0/ansferatu/profiles/__init__.py +0 -0
  23. ansferatu-0.1.0/ansferatu/profiles/form_helpers/__init__.py +1 -0
  24. ansferatu-0.1.0/ansferatu/profiles/form_helpers/actions.py +432 -0
  25. ansferatu-0.1.0/ansferatu/profiles/form_helpers/browser_lifecycle.py +76 -0
  26. ansferatu-0.1.0/ansferatu/profiles/form_helpers/buttons.py +216 -0
  27. ansferatu-0.1.0/ansferatu/profiles/form_helpers/constants.py +48 -0
  28. ansferatu-0.1.0/ansferatu/profiles/form_helpers/field_classifier.py +469 -0
  29. ansferatu-0.1.0/ansferatu/profiles/form_helpers/filters.py +56 -0
  30. ansferatu-0.1.0/ansferatu/profiles/form_helpers/form_classifier.py +98 -0
  31. ansferatu-0.1.0/ansferatu/profiles/form_helpers/overlay.py +129 -0
  32. ansferatu-0.1.0/ansferatu/profiles/form_helpers/signature.py +64 -0
  33. ansferatu-0.1.0/ansferatu/profiles/form_helpers/visited_forms.py +47 -0
  34. ansferatu-0.1.0/ansferatu/profiles/modes.py +149 -0
  35. ansferatu-0.1.0/ansferatu/profiles/network_constants.py +35 -0
  36. ansferatu-0.1.0/ansferatu/profiles/resource_check.py +91 -0
  37. ansferatu-0.1.0/ansferatu/profiles/response_dedup.py +196 -0
  38. ansferatu-0.1.0/ansferatu/spider/__init__.py +8 -0
  39. ansferatu-0.1.0/ansferatu/spider/common/__init__.py +0 -0
  40. ansferatu-0.1.0/ansferatu/spider/common/url.py +512 -0
  41. ansferatu-0.1.0/ansferatu/spider/concurrent/__init__.py +8 -0
  42. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/__init__.py +14 -0
  43. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/base.py +180 -0
  44. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/extract.py +40 -0
  45. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/fetch.py +72 -0
  46. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/form_interact.py +44 -0
  47. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/headless.py +44 -0
  48. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/html_handle.py +39 -0
  49. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/proxies.py +33 -0
  50. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_inst/save.py +38 -0
  51. ansferatu-0.1.0/ansferatu/spider/concurrent/threads_pool.py +356 -0
  52. ansferatu-0.1.0/ansferatu/spider/instances/__init__.py +12 -0
  53. ansferatu-0.1.0/ansferatu/spider/instances/inst_extract.py +29 -0
  54. ansferatu-0.1.0/ansferatu/spider/instances/inst_fetch.py +46 -0
  55. ansferatu-0.1.0/ansferatu/spider/instances/inst_form_interact.py +31 -0
  56. ansferatu-0.1.0/ansferatu/spider/instances/inst_headless.py +31 -0
  57. ansferatu-0.1.0/ansferatu/spider/instances/inst_html_handle.py +29 -0
  58. ansferatu-0.1.0/ansferatu/spider/instances/inst_proxies.py +31 -0
  59. ansferatu-0.1.0/ansferatu/spider/instances/inst_save.py +32 -0
  60. ansferatu-0.1.0/ansferatu/spider/utilities/__init__.py +10 -0
  61. ansferatu-0.1.0/ansferatu/spider/utilities/cfilter.py +44 -0
  62. ansferatu-0.1.0/ansferatu/spider/utilities/cresult.py +133 -0
  63. ansferatu-0.1.0/ansferatu/spider/utilities/ctask.py +179 -0
  64. ansferatu-0.1.0/ansferatu/spider/utilities/functions.py +84 -0
  65. ansferatu-0.1.0/ansferatu/spider/wappalyzer/__init__.py +0 -0
  66. ansferatu-0.1.0/ansferatu/spider/wappalyzer/all.json +15481 -0
  67. ansferatu-0.1.0/ansferatu/spider/wappalyzer/functional.json +295 -0
  68. ansferatu-0.1.0/ansferatu/spider/wappalyzer/scanner.json +700 -0
  69. ansferatu-0.1.0/ansferatu/spider/wappalyzer/wappalyzer.py +249 -0
  70. ansferatu-0.1.0/ansferatu/spider/wappalyzer/webpage.py +118 -0
  71. ansferatu-0.1.0/ansferatu.egg-info/PKG-INFO +409 -0
  72. ansferatu-0.1.0/ansferatu.egg-info/SOURCES.txt +82 -0
  73. ansferatu-0.1.0/ansferatu.egg-info/dependency_links.txt +1 -0
  74. ansferatu-0.1.0/ansferatu.egg-info/entry_points.txt +2 -0
  75. ansferatu-0.1.0/ansferatu.egg-info/requires.txt +15 -0
  76. ansferatu-0.1.0/ansferatu.egg-info/top_level.txt +1 -0
  77. ansferatu-0.1.0/pyproject.toml +53 -0
  78. ansferatu-0.1.0/setup.cfg +4 -0
  79. ansferatu-0.1.0/tests/test_browser_lifecycle.py +215 -0
  80. ansferatu-0.1.0/tests/test_headless_candidate.py +490 -0
  81. ansferatu-0.1.0/tests/test_resource_check.py +95 -0
  82. ansferatu-0.1.0/tests/test_response_dedup.py +1004 -0
  83. ansferatu-0.1.0/tests/test_runner.py +169 -0
  84. ansferatu-0.1.0/tests/test_thread_safety.py +201 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 frostbits-security
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,409 @@
1
+ Metadata-Version: 2.4
2
+ Name: ansferatu
3
+ Version: 0.1.0
4
+ Summary: Multifunctional tool for HTTP reconnaissance, web crawling and web directory bruteforce.
5
+ Author: frostbits-security
6
+ License: MIT License
7
+
8
+ Copyright (c) 2022 frostbits-security
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/frostbits-security/ansferatu
29
+ Project-URL: Repository, https://github.com/frostbits-security/ansferatu
30
+ Keywords: crawler,spider,bruteforce,reconnaissance,security,web
31
+ Classifier: Development Status :: 4 - Beta
32
+ Classifier: Intended Audience :: Information Technology
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Operating System :: OS Independent
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3 :: Only
37
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
38
+ Classifier: Topic :: Security
39
+ Requires-Python: >=3.8
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: requests
43
+ Requires-Dist: urllib3
44
+ Requires-Dist: beautifulsoup4
45
+ Requires-Dist: simhash
46
+ Requires-Dist: tldextract
47
+ Requires-Dist: validators
48
+ Requires-Dist: psutil
49
+ Provides-Extra: headless
50
+ Requires-Dist: playwright; extra == "headless"
51
+ Provides-Extra: dev
52
+ Requires-Dist: pytest; extra == "dev"
53
+ Requires-Dist: build; extra == "dev"
54
+ Requires-Dist: twine; extra == "dev"
55
+ Dynamic: license-file
56
+
57
+ Multifunctional tool for http reconnaissance, web crawling, web directory bruteforce.
58
+ Based on [PSpider](https://github.com/xianhu/PSpider)
59
+
60
+ Killer features:
61
+ 1. Fast multiurl crawling
62
+ 2. Fast multiurl directory bruteforce
63
+ 3. Find new domains without DNS bruteforce. (for example https://mail.ru --> 105 Domains of *.mail.ru)
64
+ 4. To Do: dynamic dictionary creation for brute-force
65
+ 5. To Do: deduplication based on Simhash
66
+ 6. Headless browsing and form filling as an additional option
67
+ 7. To Do: add proper output to jsonl + html reports
68
+ 8. To Do: Collect query parameters (for get and post)
69
+ 9. To Do: better deduplication based on page hash
70
+
71
+
72
+
73
+
74
+ ### Installation
75
+
76
+ Ansferatu is a regular Python package. It requires Python 3.8+.
77
+
78
+ **From PyPI:**
79
+ ```bash
80
+ pip3 install ansferatu
81
+ ```
82
+
83
+ **From source / GitHub:**
84
+ ```bash
85
+ pip3 install git+https://github.com/frostbits-security/ansferatu.git
86
+ # or, from a local checkout:
87
+ pip3 install .
88
+ ```
89
+
90
+ **Headless / form-filling support (optional).** The `--headless` and
91
+ `--fill-forms` modes rely on [Playwright](https://playwright.dev/python/).
92
+ Install the optional extra and download the Chromium runtime:
93
+ ```bash
94
+ pip3 install 'ansferatu[headless]'
95
+ playwright install chromium
96
+ ```
97
+
98
+ Installing the package exposes an `ansferatu` console command (equivalent to
99
+ `python3 -m ansferatu`).
100
+
101
+ ### How to run
102
+
103
+ After installation, run via the `ansferatu` command:
104
+ ```bash
105
+ ansferatu crawl --url https://mail.ru -o ./results/ --limit 1
106
+ ```
107
+
108
+ #### Use as a library
109
+
110
+ The package can be imported into other Python tools:
111
+ ```python
112
+ from ansferatu import common_crawler, common_brute_from_file
113
+
114
+ common_crawler(
115
+ url_list=["https://example.com"],
116
+ scope=["example.com"],
117
+ exclude_codes_list=[403, 404, 401],
118
+ visit_count_limit=10,
119
+ max_deep=2,
120
+ threads=10,
121
+ output_file="results.jsonl",
122
+ )
123
+ ```
124
+ For lower-level control, build the spider directly:
125
+ ```python
126
+ from ansferatu.spider import WebSpider, TaskFetch
127
+ ```
128
+
129
+ #### Docker
130
+
131
+ Build docker image:
132
+ ```bash
133
+ docker build -t ansferatu .
134
+ ```
135
+
136
+ Run the container (the image's entrypoint is the `ansferatu` command):
137
+ ```bash
138
+ docker run --rm -it -v /tmp/ansferatu_out:/ansferatu/results ansferatu \
139
+ crawl --url https://mail.ru -o /ansferatu/results/ --limit 1
140
+ ```
141
+
142
+ #### Modes
143
+
144
+ **crawl** - run crawl for web sites. Main parameter is "visit_count_limit"
145
+ ```
146
+ ansferatu crawl --url https://deti.mail.ru -o /home/sabotaged/BB/mail.ru/
147
+ ```
148
+
149
+ **crawl --headless** - same crawl but with Playwright headless extraction for qualifying pages.
150
+ Requires the headless extra: `pip install 'ansferatu[headless]' && playwright install chromium`.
151
+ ```bash
152
+ ansferatu crawl --headless --url https://example.com -o ./results/
153
+ ```
154
+
155
+ **crawl --fill-forms** - extends headless crawl with form detection and interaction.
156
+ Detects `<form>` elements on pages, fills fields with smart defaults (email, password, search, etc.),
157
+ submits forms and clicks buttons, then captures the resulting POST responses and new URLs.
158
+ Implies `--headless`.
159
+ ```bash
160
+ ansferatu crawl --fill-forms --url https://example.com -o ./results/
161
+ ```
162
+
163
+ **brute** - classic web directories bruteforce. Needs wordlist.
164
+ ```bash
165
+ ansferatu brute --url https://news.mail.ru -w ./wordlists/fuzz_big.txt -o /home/sabotaged/BB/mail.ru/
166
+ ```
167
+
168
+ #### Modes task flow (queues and owners)
169
+
170
+ **crawl** puts start tasks into `QueueFetch`, then the queues are filled and drained by the workers shown below:
171
+ ```mermaid
172
+ flowchart LR
173
+ start([Start Task]) -->|set_start_task| qf[QueueFetch<br/>priority keys deep url repeat]
174
+ qf --> fetchers[Fetchers<br/>multi-threading]
175
+ fetchers -->|TaskExtract| qe[QueueExtract<br/>priority keys deep url content]
176
+ fetchers -->|TaskHTMLHandle| qh[QueueHTMLHandle<br/>priority keys deep url content]
177
+ qe --> extractor[Extractor]
178
+ extractor -->|TaskFetch| qf
179
+ qh --> html[HTML Handler]
180
+ html -->|TaskSave if item| qs[QueueSave<br/>priority keys deep url item]
181
+ qs --> saver[Saver]
182
+
183
+ proxieser[Proxieser] -.->|optional| qp[QueueProxies]
184
+ qp -.->|optional| fetchers
185
+ ```
186
+
187
+ **crawl --headless** extends the regular crawl with a Playwright-based headless browser pipeline.
188
+ Qualifying pages (decided by `HeadlessCandidate`) are routed to a single-threaded headless
189
+ engine instead of the normal Extractor + HTML Handler path. The headless engine intercepts
190
+ CDP network events to discover URLs and captures the fully-rendered page for the HTML Handler.
191
+
192
+ ```mermaid
193
+ flowchart LR
194
+ start([Start Task]) -->|set_start_task| qf[QueueFetch<br/>priority keys deep url repeat]
195
+ qf --> fetchers[Fetchers<br/>multi-thread]
196
+
197
+ fetchers -->|HeadlessCandidate?| decision{is<br/>candidate?}
198
+
199
+ decision -->|No| qe[QueueExtract]
200
+ decision -->|No| qh[QueueHTMLHandle]
201
+ decision -->|Yes| qhl[QueueHeadless<br/>dedup: VisitLimit]
202
+
203
+ qhl --> headless[HeadlessThread<br/>single thread<br/>Playwright + CDP]
204
+
205
+ headless -->|intercepted URLs<br/>TaskFetch| qf
206
+ headless -->|normalized page<br/>TaskHTMLHandle| qh
207
+
208
+ qe --> extractor[Extractor]
209
+ extractor -->|TaskFetch| qf
210
+
211
+ qh --> html[HTML Handler<br/>_normalize_content]
212
+ html -->|TaskSave| qs[QueueSave]
213
+ qs --> saver[Saver]
214
+ ```
215
+
216
+ Key points:
217
+ - **HeadlessCandidate** decides which fetched pages qualify. Currently: root/index-like URLs
218
+ (`is_absolute`) and HTML responses with status 200/301/302.
219
+ - **HeadlessExtractor** (Playwright) uses lazy browser init on the worker thread to avoid
220
+ thread-affinity issues. It hooks `page.on("request")` to capture all network URLs,
221
+ then returns both discovered `TaskFetch` items and a `TaskHTMLHandle` with a normalized
222
+ dict (`status_code`, `url`, `html_text`, `headers`, `title`, etc.).
223
+ - **CommonHTMLHandler** accepts both `requests.Response` objects (regular path) and the
224
+ normalized dict (headless path) via `_normalize_content()`.
225
+ - **Deduplication**: `VisitLimit.check_headless_visited()` prevents the same URL from being
226
+ sent to headless twice. `UrlFilter` continues to deduplicate the fetch queue as usual.
227
+ - When a fetched URL qualifies for headless, it skips the regular Extractor and HTML Handler;
228
+ only the headless pipeline processes it.
229
+
230
+ **crawl --fill-forms** extends the headless pipeline with a two-phase form interaction system.
231
+ Phase 1 (cheap): `HeadlessExtractor` calls `FormDetector.detect(page)` on the already-loaded page
232
+ to produce universal form descriptors. Phase 2 (expensive, deferred): `HeadlessFormInteractor`
233
+ picks up form tasks from a dedicated queue, opens the page in a separate browser, fills fields
234
+ via `FormFiller`, submits, and captures results.
235
+
236
+ ```mermaid
237
+ flowchart LR
238
+ start([Start Task]) -->|set_start_task| qf[QueueFetch<br/>priority keys deep url repeat]
239
+ qf --> fetchers[Fetchers<br/>multi-thread]
240
+
241
+ fetchers -->|HeadlessCandidate?| decision{is<br/>candidate?}
242
+
243
+ decision -->|No| qe[QueueExtract]
244
+ decision -->|No| qh[QueueHTMLHandle]
245
+ decision -->|Yes| qhl[QueueHeadless<br/>dedup: VisitLimit]
246
+
247
+ qhl --> headless[HeadlessThread<br/>single thread<br/>Playwright + CDP]
248
+
249
+ headless -->|intercepted URLs<br/>TaskFetch| qf
250
+ headless -->|normalized page<br/>TaskHTMLHandle| qh
251
+ headless -->|form descriptors<br/>TaskFormInteract| qfi[QueueFormInteract]
252
+
253
+ qfi --> forminteract[FormInteractThread<br/>single thread<br/>separate Playwright browser]
254
+ forminteract -->|POST response URLs<br/>TaskFetch| qf
255
+ forminteract -->|POST response page<br/>TaskHTMLHandle| qh
256
+
257
+ qe --> extractor[Extractor]
258
+ extractor -->|TaskFetch| qf
259
+
260
+ qh --> html[HTML Handler<br/>_normalize_content]
261
+ html -->|TaskSave| qs[QueueSave]
262
+ qs --> saver[Saver]
263
+ ```
264
+
265
+ Key points for form interaction:
266
+ - **FormDetector** scans the already-loaded page DOM for `<form>` elements. Pure detection,
267
+ no extra navigation (~50ms overhead). Returns universal form descriptors.
268
+ - **Form descriptor schema**: `{form_selector, action, method, fields[], buttons[], page_url}`.
269
+ Designed to be self-contained so `HeadlessFormInteractor` needs no extra DOM inspection.
270
+ - **FormFiller** maps input types/names to smart defaults (email, password, search, etc.).
271
+ Supports custom value overrides via dict.
272
+ - **HeadlessFormInteractor** runs in a dedicated thread with its own Playwright browser.
273
+ It navigates to the page, fills fields, submits/clicks, and captures network traffic +
274
+ the resulting page data. Results flow back through the normal URL_FETCH and HTM_HANDLE queues.
275
+ - **Budget cap**: `FormDetector.max_forms_per_page` (default 5) and
276
+ `HeadlessFormInteractor.max_interactions_per_page` prevent runaway on form-heavy pages.
277
+ - The form interaction pipeline is fully independent from the headless extraction pipeline —
278
+ separate queue, separate thread, separate browser instance.
279
+
280
+ **brute** skips extraction and only handles/saves results from fetches:
281
+ ```mermaid
282
+ flowchart LR
283
+ start([Start Task]) -->|set_start_task| qf[QueueFetch<br/>priority keys deep url repeat]
284
+ qf --> fetchers[Fetchers<br/>multi-threading]
285
+ fetchers -->|TaskHTMLHandle| qh[QueueHTMLHandle<br/>priority keys deep url content]
286
+ qh --> html[HTML Handler]
287
+ html -->|TaskSave if item| qs[QueueSave<br/>priority keys deep url item]
288
+ qs --> saver[Saver]
289
+
290
+ proxieser[Proxieser] -.->|optional| qp[QueueProxies]
291
+ qp -.->|optional| fetchers
292
+ ```
293
+ #### How to change settings
294
+ Besides parsing the console arguments, ansferatu has a settings file for:
295
+ - blacklist extensions for requests
296
+ - blacklist extensions for parsing
297
+ - HTTP request workers num
298
+ - CPU consumed workers num
299
+ - HTTP error_limit
300
+ - limit of request to one host
301
+ - HTTP request headers
302
+ - ignored content-types for report
303
+ - deduplication mode
304
+
305
+ The default file is stored in modules\settings\default_config.yaml
306
+
307
+ If you want to update settings, it's best to copy the file modules\settings\default_config.yaml to modules\settings\config.yaml and then edit config.yaml file.
308
+
309
+ #### How we avoid loops
310
+
311
+ `checkRecursion()` - checks whether something has gone wrong and requests have started repeating the same path again and again, e.g. /blog/article/blog/article/... This sometimes happens because the URL extraction process is imperfect.
312
+
313
+ `check_limits()` - checks how many times we have accessed the parent directory.
314
+ How it works. Let's use http://www.example.com/blog/articles/my_article_1.php as example.
315
+ 1. We check how many times we visit http://www.example.com/blog/articles/
316
+ 2. If the count exceeds crawl_limit, we mark this path as over_limit_pages.
317
+ 3. We add +1 to crawl limit to upper path (http://www.example.com/blog/).
318
+ 4. Go to step 1 (if this path also contains a large number of URLs, we avoid that loop too)
319
+
320
+ Step by step, we eventually ban visiting the website altogether once all limits have been exceeded.
321
+
322
+ #### How retries work
323
+ We have two types of error limit:
324
+ 1. For retrying a URL
325
+ 2. For adding the same URL to the queue
326
+
327
+ Retries limit should be less than error limit.
328
+
329
+ When we get a connection error for a URL, we retry it until the retries limit is reached and then leave this URL alone for a while.
330
+ Then we continue to add URLs to the queue (the host may start answering after a while), and if it is still unavailable we ban it. But if the URL answers, we reset the count.
331
+
332
+ #### Wappalyzer role
333
+
334
+ Wappalyzer works with the app.json file. This file contains a regexp database for searching for anything in the server response (cookies, headers, scripts, text in HTML, etc.).
335
+
336
+
337
+ The idea is to use Wappalyzer’s regex engine to search for “bad places”:
338
+
339
+ - All inputs
340
+ ```
341
+ <input type="email">
342
+ <input type="password">
343
+ <input type="search">
344
+ <input type="submit">
345
+ ```
346
+ - SSRF
347
+ ```
348
+ formcontrolname="url"
349
+ ```
350
+ - Submit buttons
351
+ ```
352
+ <button class="aa" type="submit">Search</button>
353
+ ```
354
+ - File uploads
355
+ ```
356
+ <input type="file">
357
+ ```
358
+
359
+ Wappalyzer could be used as a simple vulnerability scanner:
360
+ 1. Send specific request
361
+ 2. Regexp search in server's answer.
362
+
363
+
364
+ #### Deduplication
365
+ - Content length + word_count
366
+ - Content length prediction (not fully tested)
367
+ - To Do: Similarity check
368
+ - Check changes in HTML (search for new functions)
369
+
370
+ ### Development
371
+
372
+ Editable install (changes to the source are picked up immediately):
373
+ ```bash
374
+ pip3 install -e '.[headless,dev]'
375
+ ```
376
+
377
+ Run the test suite:
378
+ ```bash
379
+ pytest
380
+ ```
381
+
382
+ ### Building & publishing to PyPI
383
+
384
+ The project is configured with `pyproject.toml` (PEP 621). To build the
385
+ distribution artifacts (source distribution + wheel):
386
+ ```bash
387
+ pip3 install build
388
+ python3 -m build # writes dist/ansferatu-<version>.tar.gz and .whl
389
+ ```
390
+
391
+ Validate and upload with [Twine](https://twine.readthedocs.io/):
392
+ ```bash
393
+ pip3 install twine
394
+ twine check dist/*
395
+
396
+ # Test upload first (recommended): https://test.pypi.org
397
+ twine upload --repository testpypi dist/*
398
+
399
+ # Real upload
400
+ twine upload dist/*
401
+ ```
402
+
403
+ Notes:
404
+ - Bump `version` in `pyproject.toml` (and `__version__` in `ansferatu/__init__.py`)
405
+ before each release; PyPI rejects re-uploads of an existing version.
406
+ - Uploading requires a PyPI account and an API token (configure it via
407
+ `~/.pypirc` or the `TWINE_USERNAME=__token__` / `TWINE_PASSWORD=<token>`
408
+ environment variables).
409
+ - The package name `ansferatu` must be available on PyPI for the first upload.