scraperecon 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraperecon-0.1.0/.github/workflows/publish.yml +34 -0
- scraperecon-0.1.0/.gitignore +90 -0
- scraperecon-0.1.0/LICENSE +21 -0
- scraperecon-0.1.0/PKG-INFO +150 -0
- scraperecon-0.1.0/README.md +138 -0
- scraperecon-0.1.0/pyproject.toml +19 -0
- scraperecon-0.1.0/scraperecon/__init__.py +0 -0
- scraperecon-0.1.0/scraperecon/data/indicators.json +14 -0
- scraperecon-0.1.0/scraperecon/data/signatures.json +61 -0
- scraperecon-0.1.0/scraperecon/main.py +230 -0
- scraperecon-0.1.0/scraperecon/pipeline.py +49 -0
- scraperecon-0.1.0/scraperecon/report.py +47 -0
- scraperecon-0.1.0/scraperecon/signatures.py +7 -0
- scraperecon-0.1.0/scraperecon/stages/__init__.py +0 -0
- scraperecon-0.1.0/scraperecon/stages/plain.py +67 -0
- scraperecon-0.1.0/scraperecon/stages/ratelimit.py +146 -0
- scraperecon-0.1.0/scraperecon/stages/tls.py +64 -0
- scraperecon-0.1.0/scraperecon/stages/vendor.py +101 -0
- scraperecon-0.1.0/scraperecon/types.py +82 -0
- scraperecon-0.1.0/scraperecon/utils.py +14 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
pypi-publish:
|
|
9
|
+
name: Build and publish to PyPI
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment:
|
|
12
|
+
name: pypi
|
|
13
|
+
url: https://pypi.org/p/scraperecon
|
|
14
|
+
permissions:
|
|
15
|
+
id-token: write # IMPORTANT: mandatory for trusted publishing
|
|
16
|
+
contents: read # Needed for checkout
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- name: Checkout repository
|
|
20
|
+
uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up Python
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: "3.11"
|
|
26
|
+
|
|
27
|
+
- name: Install pypa/build
|
|
28
|
+
run: python -m pip install --upgrade pip build
|
|
29
|
+
|
|
30
|
+
- name: Build distribution
|
|
31
|
+
run: python -m build
|
|
32
|
+
|
|
33
|
+
- name: Publish to PyPI
|
|
34
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
build/
|
|
11
|
+
develop-eggs/
|
|
12
|
+
dist/
|
|
13
|
+
downloads/
|
|
14
|
+
eggs/
|
|
15
|
+
.eggs/
|
|
16
|
+
lib/
|
|
17
|
+
lib64/
|
|
18
|
+
parts/
|
|
19
|
+
sdist/
|
|
20
|
+
var/
|
|
21
|
+
wheels/
|
|
22
|
+
share/python-wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
MANIFEST
|
|
27
|
+
|
|
28
|
+
# PyInstaller
|
|
29
|
+
# Usually these files are written by a python script from a template
|
|
30
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
31
|
+
*.manifest
|
|
32
|
+
*.spec
|
|
33
|
+
|
|
34
|
+
# Installer logs
|
|
35
|
+
pip-log.txt
|
|
36
|
+
pip-delete-this-directory.txt
|
|
37
|
+
|
|
38
|
+
# Unit test / coverage reports
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.nox/
|
|
42
|
+
.coverage
|
|
43
|
+
.coverage.*
|
|
44
|
+
.cache
|
|
45
|
+
nosetests.xml
|
|
46
|
+
coverage.xml
|
|
47
|
+
*.cover
|
|
48
|
+
*.py,cover
|
|
49
|
+
.hypothesis/
|
|
50
|
+
.pytest_cache/
|
|
51
|
+
cover/
|
|
52
|
+
|
|
53
|
+
# Environments
|
|
54
|
+
.env
|
|
55
|
+
.venv
|
|
56
|
+
env/
|
|
57
|
+
venv/
|
|
58
|
+
ENV/
|
|
59
|
+
env.bak/
|
|
60
|
+
venv.bak/
|
|
61
|
+
|
|
62
|
+
# IDEs and Editors
|
|
63
|
+
.vscode/
|
|
64
|
+
.idea/
|
|
65
|
+
*.swp
|
|
66
|
+
*.swo
|
|
67
|
+
*~
|
|
68
|
+
.project
|
|
69
|
+
.pydevproject
|
|
70
|
+
|
|
71
|
+
# OS generated files
|
|
72
|
+
.DS_Store
|
|
73
|
+
.DS_Store?
|
|
74
|
+
._*
|
|
75
|
+
.Spotlight-V100
|
|
76
|
+
.Trashes
|
|
77
|
+
ehthumbs.db
|
|
78
|
+
Thumbs.db
|
|
79
|
+
|
|
80
|
+
# mypy
|
|
81
|
+
.mypy_cache/
|
|
82
|
+
.dmypy.json
|
|
83
|
+
dmypy.json
|
|
84
|
+
.pyre/
|
|
85
|
+
|
|
86
|
+
# Pyre type checker
|
|
87
|
+
.pyre/
|
|
88
|
+
|
|
89
|
+
# ruff / linters
|
|
90
|
+
.ruff_cache/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shaheer Sarfaraz
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scraperecon
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool that runs a sequential reconnaissance pipeline against a target URL before a developer writes a scraper.
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: curl-cffi>=0.7
|
|
8
|
+
Requires-Dist: httpx>=0.27
|
|
9
|
+
Requires-Dist: rich>=13
|
|
10
|
+
Requires-Dist: typer>=0.12
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# scraperecon
|
|
14
|
+
|
|
15
|
+
Run this before you write a scraper. It tells you what bot protection a site has, whether plain HTTP or TLS impersonation is enough to get through, and how aggressively it rate limits — before you've written a single line of scraper code.
|
|
16
|
+
|
|
17
|
+
<img width="1117" height="409" alt="image" src="https://github.com/user-attachments/assets/aa7d0670-c612-4cfb-a579-b610b9c04163" />
|
|
18
|
+
|
|
19
|
+
<br/>
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
scraperecon https://target.com
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
scraperecon — https://target.com
|
|
29
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
30
|
+
|
|
31
|
+
Stage 1 — Plain HTTP (httpx, scraper User-Agent)
|
|
32
|
+
Status: 403 Forbidden
|
|
33
|
+
Time: 212ms
|
|
34
|
+
Verdict: Blocked
|
|
35
|
+
|
|
36
|
+
Stage 2 — TLS Impersonation (chrome131)
|
|
37
|
+
Status: 200 OK
|
|
38
|
+
Time: 389ms
|
|
39
|
+
Verdict: Open
|
|
40
|
+
Note: TLS fingerprint was the blocker ✓
|
|
41
|
+
|
|
42
|
+
Stage 3 — Vendor Detection
|
|
43
|
+
Vendor: Cloudflare
|
|
44
|
+
Confidence: High
|
|
45
|
+
Signals: cf-ray header, __cf_bm cookie, challenges.cloudflare.com in body
|
|
46
|
+
|
|
47
|
+
Stage 4 — Rate Limit Probe
|
|
48
|
+
Skipped (pass --probe-rate to enable)
|
|
49
|
+
|
|
50
|
+
Recommendation
|
|
51
|
+
Use curl_cffi with chrome131 TLS profile
|
|
52
|
+
No CAPTCHA detected at probe volume
|
|
53
|
+
Proxy rotation not required at low request rates
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## What it does
|
|
59
|
+
|
|
60
|
+
scraperecon runs four stages against a URL in order, stopping early where it can.
|
|
61
|
+
|
|
62
|
+
**Stage 1 — Plain HTTP**
|
|
63
|
+
|
|
64
|
+
A basic GET with no tricks. If this comes back clean, you don't need anything else — plain `httpx` or `requests` will work fine and you can stop here.
|
|
65
|
+
|
|
66
|
+
It also checks whether a 200 response is actually real content or a JS challenge page. Cloudflare in particular loves returning 200 with a challenge rather than a 403. scraperecon catches that and marks it `Challenged` instead of lying to you with a green `Open`.
|
|
67
|
+
|
|
68
|
+
**Stage 2 — TLS Impersonation**
|
|
69
|
+
|
|
70
|
+
If Stage 1 was blocked or challenged, it retries using `curl_cffi` impersonating Chrome's TLS fingerprint. A lot of bot detection happens at the TLS handshake level — Python's `requests` library has a completely different fingerprint from a real browser, and that alone is enough to get you blocked on many sites before the server has even looked at your headers. If Stage 2 passes where Stage 1 didn't, you know exactly what the fix is.
|
|
71
|
+
|
|
72
|
+
**Stage 3 — Vendor Detection**
|
|
73
|
+
|
|
74
|
+
Inspects headers, cookies, and the response body for known signatures and tells you which bot protection vendor is running. This matters because Cloudflare, DataDome, Akamai, and PerimeterX all require different bypass strategies. Knowing which one you're dealing with upfront saves you from trying things that were never going to work.
|
|
75
|
+
|
|
76
|
+
**Stage 4 — Rate Limit Probe** _(opt-in)_
|
|
77
|
+
|
|
78
|
+
Fires N requests with configurable concurrency and watches what happens — hard 429s, silent response time degradation, mid-session redirects. Off by default because blasting a site without thinking about it is bad practice. Pass `--probe-rate` when you actually need the data.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Install
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pipx install scraperecon
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Usage
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
scraperecon https://target.com
|
|
94
|
+
scraperecon https://target.com --probe-rate
|
|
95
|
+
scraperecon https://target.com --probe-rate --concurrency 10 --requests 50
|
|
96
|
+
scraperecon https://target.com --impersonate firefox120
|
|
97
|
+
scraperecon https://target.com --save
|
|
98
|
+
scraperecon https://target.com --json | jq .recommendation
```
|
|
99
|
+
|
|
100
|
+
| Flag | Default | Description |
|
|
101
|
+
| --------------- | --------- | ------------------------------------------------------------------------------------ |
|
|
102
|
+
| `--probe-rate` | off | Run Stage 4 rate limit probe |
|
|
103
|
+
| `--concurrency` | 5 | Workers for rate probe |
|
|
104
|
+
| `--requests` | 20 | Total requests for rate probe |
|
|
105
|
+
| `--impersonate` | chrome131 | TLS profile for Stage 2. Options: `chrome131`, `chrome120`, `firefox120`, `safari17` |
|
|
106
|
+
| `--timeout` | 10 | Per-request timeout in seconds |
|
|
107
|
+
| `--json` | off | Machine-readable JSON output |
|
|
108
|
+
| `--save` | off | Save the full HTML responses to local files (`<domain>_stage1.html`, etc.) |
|
|
109
|
+
| `--skip-tls` | off | Skip Stage 2 |
|
|
110
|
+
| `--skip-vendor` | off | Skip Stage 3 |
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Reading the recommendation
|
|
115
|
+
|
|
116
|
+
At the end of every run you get a plain-English recommendation based on what was found.
|
|
117
|
+
|
|
118
|
+
- **Plain HTTP should be sufficient** — `httpx` or `requests` will work. No special setup needed.
|
|
119
|
+
- **Use curl_cffi with `<profile>`** — TLS fingerprinting is blocking you. Switch to `curl_cffi` with the listed profile.
|
|
120
|
+
- **May need browser automation** — both plain and TLS requests were blocked. You're likely looking at a full JS challenge (Turnstile, hCaptcha). Playwright with a stealth plugin is probably your next move.
|
|
121
|
+
- **Proxy rotation recommended** — the rate probe hit throttling. At any real request volume you'll need rotating proxies.
|
|
122
|
+
- **CAPTCHA detected** — the response body contained CAPTCHA indicators. Automated solving or a managed scraping service required.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Adding vendor signatures
|
|
127
|
+
|
|
128
|
+
Signatures live in `scraperecon/data/signatures.json`. It's a flat JSON file — no code required. If you know a signal that's missing, open a PR.
|
|
129
|
+
|
|
130
|
+
```json
|
|
131
|
+
{
|
|
132
|
+
"name": "YourVendor",
|
|
133
|
+
"signals": [
|
|
134
|
+
{ "type": "header_present", "key": "x-your-vendor", "weight": 0.8 },
|
|
135
|
+
{ "type": "cookie_name", "value": "your_cookie", "weight": 0.6 }
|
|
136
|
+
]
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Signal types: `header_present`, `header_value`, `cookie_name`, `body_contains`, `status_code`.
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## What it won't do
|
|
145
|
+
|
|
146
|
+
scraperecon is a recon tool, not a scraping library. It tells you what you need — it doesn't do it for you. No CAPTCHA solving, no Playwright integration, no proxy support, no persistent history.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
Every scraper project starts with the same 20 minutes of manual work: try curl, get blocked, try curl_cffi, check the headers, fire some requests and see what happens. This automates that.
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# scraperecon
|
|
2
|
+
|
|
3
|
+
Run this before you write a scraper. It tells you what bot protection a site has, whether plain HTTP or TLS impersonation is enough to get through, and how aggressively it rate limits — before you've written a single line of scraper code.
|
|
4
|
+
|
|
5
|
+
<img width="1117" height="409" alt="image" src="https://github.com/user-attachments/assets/aa7d0670-c612-4cfb-a579-b610b9c04163" />
|
|
6
|
+
|
|
7
|
+
<br/>
|
|
8
|
+
|
|
9
|
+
## Usage
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
scraperecon https://target.com
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
scraperecon — https://target.com
|
|
17
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
18
|
+
|
|
19
|
+
Stage 1 — Plain HTTP (httpx, scraper User-Agent)
|
|
20
|
+
Status: 403 Forbidden
|
|
21
|
+
Time: 212ms
|
|
22
|
+
Verdict: Blocked
|
|
23
|
+
|
|
24
|
+
Stage 2 — TLS Impersonation (chrome131)
|
|
25
|
+
Status: 200 OK
|
|
26
|
+
Time: 389ms
|
|
27
|
+
Verdict: Open
|
|
28
|
+
Note: TLS fingerprint was the blocker ✓
|
|
29
|
+
|
|
30
|
+
Stage 3 — Vendor Detection
|
|
31
|
+
Vendor: Cloudflare
|
|
32
|
+
Confidence: High
|
|
33
|
+
Signals: cf-ray header, __cf_bm cookie, challenges.cloudflare.com in body
|
|
34
|
+
|
|
35
|
+
Stage 4 — Rate Limit Probe
|
|
36
|
+
Skipped (pass --probe-rate to enable)
|
|
37
|
+
|
|
38
|
+
Recommendation
|
|
39
|
+
Use curl_cffi with chrome131 TLS profile
|
|
40
|
+
No CAPTCHA detected at probe volume
|
|
41
|
+
Proxy rotation not required at low request rates
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## What it does
|
|
47
|
+
|
|
48
|
+
scraperecon runs four stages against a URL in order, stopping early where it can.
|
|
49
|
+
|
|
50
|
+
**Stage 1 — Plain HTTP**
|
|
51
|
+
|
|
52
|
+
A basic GET with no tricks. If this comes back clean, you don't need anything else — plain `httpx` or `requests` will work fine and you can stop here.
|
|
53
|
+
|
|
54
|
+
It also checks whether a 200 response is actually real content or a JS challenge page. Cloudflare in particular loves returning 200 with a challenge rather than a 403. scraperecon catches that and marks it `Challenged` instead of lying to you with a green `Open`.
|
|
55
|
+
|
|
56
|
+
**Stage 2 — TLS Impersonation**
|
|
57
|
+
|
|
58
|
+
If Stage 1 was blocked or challenged, it retries using `curl_cffi` impersonating Chrome's TLS fingerprint. A lot of bot detection happens at the TLS handshake level — Python's `requests` library has a completely different fingerprint from a real browser, and that alone is enough to get you blocked on many sites before the server has even looked at your headers. If Stage 2 passes where Stage 1 didn't, you know exactly what the fix is.
|
|
59
|
+
|
|
60
|
+
**Stage 3 — Vendor Detection**
|
|
61
|
+
|
|
62
|
+
Inspects headers, cookies, and the response body for known signatures and tells you which bot protection vendor is running. This matters because Cloudflare, DataDome, Akamai, and PerimeterX all require different bypass strategies. Knowing which one you're dealing with upfront saves you from trying things that were never going to work.
|
|
63
|
+
|
|
64
|
+
**Stage 4 — Rate Limit Probe** _(opt-in)_
|
|
65
|
+
|
|
66
|
+
Fires N requests with configurable concurrency and watches what happens — hard 429s, silent response time degradation, mid-session redirects. Off by default because blasting a site without thinking about it is bad practice. Pass `--probe-rate` when you actually need the data.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Install
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pipx install scraperecon
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Usage
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
scraperecon https://target.com
|
|
82
|
+
scraperecon https://target.com --probe-rate
|
|
83
|
+
scraperecon https://target.com --probe-rate --concurrency 10 --requests 50
|
|
84
|
+
scraperecon https://target.com --impersonate firefox120
|
|
85
|
+
scraperecon https://target.com --save
|
|
86
|
+
scraperecon https://target.com --json | jq .recommendation
```
|
|
87
|
+
|
|
88
|
+
| Flag | Default | Description |
|
|
89
|
+
| --------------- | --------- | ------------------------------------------------------------------------------------ |
|
|
90
|
+
| `--probe-rate` | off | Run Stage 4 rate limit probe |
|
|
91
|
+
| `--concurrency` | 5 | Workers for rate probe |
|
|
92
|
+
| `--requests` | 20 | Total requests for rate probe |
|
|
93
|
+
| `--impersonate` | chrome131 | TLS profile for Stage 2. Options: `chrome131`, `chrome120`, `firefox120`, `safari17` |
|
|
94
|
+
| `--timeout` | 10 | Per-request timeout in seconds |
|
|
95
|
+
| `--json` | off | Machine-readable JSON output |
|
|
96
|
+
| `--save` | off | Save the full HTML responses to local files (`<domain>_stage1.html`, etc.) |
|
|
97
|
+
| `--skip-tls` | off | Skip Stage 2 |
|
|
98
|
+
| `--skip-vendor` | off | Skip Stage 3 |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Reading the recommendation
|
|
103
|
+
|
|
104
|
+
At the end of every run you get a plain-English recommendation based on what was found.
|
|
105
|
+
|
|
106
|
+
- **Plain HTTP should be sufficient** — `httpx` or `requests` will work. No special setup needed.
|
|
107
|
+
- **Use curl_cffi with `<profile>`** — TLS fingerprinting is blocking you. Switch to `curl_cffi` with the listed profile.
|
|
108
|
+
- **May need browser automation** — both plain and TLS requests were blocked. You're likely looking at a full JS challenge (Turnstile, hCaptcha). Playwright with a stealth plugin is probably your next move.
|
|
109
|
+
- **Proxy rotation recommended** — the rate probe hit throttling. At any real request volume you'll need rotating proxies.
|
|
110
|
+
- **CAPTCHA detected** — the response body contained CAPTCHA indicators. Automated solving or a managed scraping service required.
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Adding vendor signatures
|
|
115
|
+
|
|
116
|
+
Signatures live in `scraperecon/data/signatures.json`. It's a flat JSON file — no code required. If you know a signal that's missing, open a PR.
|
|
117
|
+
|
|
118
|
+
```json
|
|
119
|
+
{
|
|
120
|
+
"name": "YourVendor",
|
|
121
|
+
"signals": [
|
|
122
|
+
{ "type": "header_present", "key": "x-your-vendor", "weight": 0.8 },
|
|
123
|
+
{ "type": "cookie_name", "value": "your_cookie", "weight": 0.6 }
|
|
124
|
+
]
|
|
125
|
+
}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Signal types: `header_present`, `header_value`, `cookie_name`, `body_contains`, `status_code`.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## What it won't do
|
|
133
|
+
|
|
134
|
+
scraperecon is a recon tool, not a scraping library. It tells you what you need — it doesn't do it for you. No CAPTCHA solving, no Playwright integration, no proxy support, no persistent history.
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
Every scraper project starts with the same 20 minutes of manual work: try curl, get blocked, try curl_cffi, check the headers, fire some requests and see what happens. This automates that.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "scraperecon"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A CLI tool that runs a sequential reconnaissance pipeline against a target URL before a developer writes a scraper."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"httpx>=0.27",
|
|
13
|
+
"curl-cffi>=0.7",
|
|
14
|
+
"typer>=0.12",
|
|
15
|
+
"rich>=13"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
scraperecon = "scraperecon.main:app"
|
|
File without changes
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
{
|
|
2
|
+
"vendors": [
|
|
3
|
+
{
|
|
4
|
+
"name": "Cloudflare",
|
|
5
|
+
"signals": [
|
|
6
|
+
{ "type": "header_present", "key": "cf-ray", "weight": 0.6 },
|
|
7
|
+
{ "type": "cookie_name", "value": "__cf_bm", "weight": 0.3 },
|
|
8
|
+
{ "type": "cookie_name", "value": "cf_clearance", "weight": 0.3 },
|
|
9
|
+
{ "type": "header_value", "key": "server", "value": "cloudflare", "weight": 0.2 },
|
|
10
|
+
{ "type": "body_contains", "value": "cf-browser-verification", "weight": 0.4 },
|
|
11
|
+
{ "type": "body_contains", "value": "challenges.cloudflare.com", "weight": 0.5 }
|
|
12
|
+
]
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"name": "DataDome",
|
|
16
|
+
"signals": [
|
|
17
|
+
{ "type": "cookie_name", "value": "datadome", "weight": 0.8 },
|
|
18
|
+
{ "type": "header_present", "key": "x-datadome-cid", "weight": 0.5 },
|
|
19
|
+
{ "type": "body_contains", "value": "datadome", "weight": 0.4 }
|
|
20
|
+
]
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"name": "Akamai",
|
|
24
|
+
"signals": [
|
|
25
|
+
{ "type": "cookie_name", "value": "_abck", "weight": 0.7 },
|
|
26
|
+
{ "type": "cookie_name", "value": "bm_sz", "weight": 0.5 },
|
|
27
|
+
{ "type": "header_present", "key": "x-akamai-edgescape", "weight": 0.4 }
|
|
28
|
+
]
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"name": "PerimeterX",
|
|
32
|
+
"signals": [
|
|
33
|
+
{ "type": "cookie_name", "value": "_px", "weight": 0.6 },
|
|
34
|
+
{ "type": "cookie_name", "value": "_pxhd", "weight": 0.4 },
|
|
35
|
+
{ "type": "body_contains", "value": "px-captcha", "weight": 0.5 }
|
|
36
|
+
]
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"name": "Kasada",
|
|
40
|
+
"signals": [
|
|
41
|
+
{ "type": "header_present", "key": "x-kasada-pow", "weight": 0.8 },
|
|
42
|
+
{ "type": "body_contains", "value": "kasada", "weight": 0.5 }
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"name": "Imperva",
|
|
47
|
+
"signals": [
|
|
48
|
+
{ "type": "cookie_name", "value": "incap_ses", "weight": 0.6 },
|
|
49
|
+
{ "type": "cookie_name", "value": "visid_incap", "weight": 0.5 },
|
|
50
|
+
{ "type": "header_present", "key": "x-iinfo", "weight": 0.4 }
|
|
51
|
+
]
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"name": "AWS WAF",
|
|
55
|
+
"signals": [
|
|
56
|
+
{ "type": "cookie_name", "value": "aws-waf-token", "weight": 0.8 },
|
|
57
|
+
{ "type": "body_contains", "value": "awswaf", "weight": 0.5 }
|
|
58
|
+
]
|
|
59
|
+
}
|
|
60
|
+
]
|
|
61
|
+
}
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import dataclasses
|
|
4
|
+
from enum import Enum
|
|
5
|
+
import typer
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.text import Text
|
|
8
|
+
from .pipeline import run_pipeline
|
|
9
|
+
from .types import ReconReport, Verdict, Confidence
|
|
10
|
+
|
|
11
|
+
app = typer.Typer(add_completion=False)
|
|
12
|
+
|
|
13
|
+
def _default_json(obj):
|
|
14
|
+
if isinstance(obj, Enum):
|
|
15
|
+
return obj.value
|
|
16
|
+
if dataclasses.is_dataclass(obj):
|
|
17
|
+
return dataclasses.asdict(obj)
|
|
18
|
+
raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
|
|
19
|
+
|
|
20
|
+
def format_verdict(verdict: Verdict) -> Text:
|
|
21
|
+
if verdict == Verdict.OPEN:
|
|
22
|
+
return Text("Open", style="bold green")
|
|
23
|
+
elif verdict == Verdict.BLOCKED:
|
|
24
|
+
return Text("Blocked", style="bold red")
|
|
25
|
+
elif verdict == Verdict.CHALLENGED:
|
|
26
|
+
return Text("Challenged", style="bold magenta")
|
|
27
|
+
elif verdict in (Verdict.UNCERTAIN, Verdict.SKIPPED):
|
|
28
|
+
return Text(verdict.value, style="bold yellow")
|
|
29
|
+
elif verdict == Verdict.ERROR:
|
|
30
|
+
return Text("Error", style="bold red")
|
|
31
|
+
else:
|
|
32
|
+
return Text(verdict.value, style="bold")
|
|
33
|
+
|
|
34
|
+
def print_human(report: ReconReport):
    """Print *report* as a human-readable, stage-by-stage summary.

    Stage results and the final recommendation are written to stdout;
    per-stage error messages are written to stderr.

    The target URL, error messages and the TLS profile name are escaped
    before being embedded in Rich markup, so bracketed text in them is
    shown literally instead of being parsed as a style tag.
    """
    # Function-scope import: rich is already a dependency of this module.
    from rich.markup import escape

    console = Console()
    err_console = Console(stderr=True)

    console.print(f"[bold]scraperecon v0.1.0[/bold] — {escape(str(report.target))}")
    console.print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    console.print()

    # Stage 1
    console.print("[bold]Stage 1 — Plain HTTP (httpx, scraper User-Agent)[/bold]")
    if report.plain.error:
        err_console.print(f" [red]Error:[/red] {escape(str(report.plain.error))}")
        console.print(" Verdict: ", format_verdict(report.plain.verdict))
    else:
        console.print(f" Status: {report.plain.status}")
        console.print(f" Time: {report.plain.response_time_ms}ms")
        console.print(" Verdict: ", format_verdict(report.plain.verdict))
    console.print()

    # Stage 2
    if report.tls:
        profile = escape(str(report.tls.profile_used))
        console.print(f"[bold]Stage 2 — TLS Impersonation ({profile})[/bold]")
        if report.tls.error:
            err_console.print(f" [red]Error:[/red] {escape(str(report.tls.error))}")
            console.print(" Verdict: ", format_verdict(report.tls.verdict))
        elif report.tls.verdict == Verdict.SKIPPED:
            console.print(" [yellow]Skipped (Stage 1 was Open)[/yellow]")
        else:
            console.print(f" Status: {report.tls.status}")
            console.print(f" Time: {report.tls.response_time_ms}ms")
            console.print(" Verdict: ", format_verdict(report.tls.verdict))
            if report.tls.tls_was_blocker:
                console.print(" Note: [green]TLS fingerprint was the blocker ✓[/green]")
    else:
        console.print("[bold]Stage 2 — TLS Impersonation[/bold]")
        console.print(" [yellow]Skipped[/yellow]")
    console.print()

    # Stage 3
    console.print("[bold]Stage 3 — Vendor Detection[/bold]")
    if report.vendor:
        if report.vendor.vendor:
            # Same None guard print_json applies to the confidence field.
            confidence = report.vendor.confidence.value if report.vendor.confidence else "Unknown"
            console.print(f" Vendor: [bold]{report.vendor.vendor}[/bold]")
            console.print(f" Confidence: {confidence}")
            console.print(f" Signals: {', '.join(report.vendor.matched_signals)}")
        else:
            console.print(" [dim]No known vendor detected[/dim]")
    else:
        console.print(" [yellow]Skipped[/yellow]")
    console.print()

    # Stage 4
    console.print("[bold]Stage 4 — Rate Limit Probe[/bold]")
    if report.rate_limit:
        console.print(f" Requests: {report.rate_limit.successful} successful, {report.rate_limit.blocked} blocked (out of {report.rate_limit.total_requests})")
        console.print(f" Median ms: {report.rate_limit.median_response_ms}ms")
        if report.rate_limit.block_type:
            console.print(f" Block Type: [bold red]{report.rate_limit.block_type.value}[/bold red]")
            if report.rate_limit.estimated_safe_rps:
                console.print(f" Est. Safe: ~{report.rate_limit.estimated_safe_rps:.1f} req/s")
        else:
            console.print(" [green]No rate limit detected at this volume[/green]")
    else:
        console.print(" [yellow]Skipped (pass --probe-rate to enable)[/yellow]")
    console.print()

    # Recommendations
    console.print("[bold]Recommendation[/bold]")
    rec = report.recommendation
    if rec.use_tls_impersonation:
        rec_profile = escape(str(rec.profile or 'appropriate'))
        console.print(f" [green]Use curl_cffi with {rec_profile} TLS profile[/green]")
    else:
        console.print(" [green]Plain HTTP (httpx/requests) should be sufficient[/green]")

    if rec.captcha_detected:
        console.print(" [red]CAPTCHA/Challenge detected[/red]")
    else:
        console.print(" No CAPTCHA detected")

    if rec.proxy_recommended:
        console.print(" [yellow]Proxy rotation recommended due to rate limits[/yellow]")
    else:
        console.print(" Proxy rotation not strictly required at tested volume")

    for note in rec.notes:
        console.print(f" Note: {note}")
|
|
120
|
+
|
|
121
|
+
def print_json(report: ReconReport):
    """Print *report* to stdout as an indented (2-space) JSON document.

    Top-level keys are ``target``, ``stages`` and ``recommendation``.
    Each of ``stages.plain``, ``stages.tls``, ``stages.vendor`` and
    ``stages.rate_limit`` is ``null`` unless the report carries a result
    for it; ``vendor`` is additionally ``null`` when no vendor was named.
    An ``error`` key is added to ``plain``/``tls`` only when that stage
    recorded an error.
    """
    # Map to the requested JSON format
    out = {
        "target": report.target,
        "stages": {
            "plain": None,
            "tls": None,
            "vendor": None,
            "rate_limit": None,
        },
        "recommendation": dataclasses.asdict(report.recommendation),
    }
    stages = out["stages"]

    plain = report.plain
    if plain:
        stages["plain"] = {
            "verdict": plain.verdict.value,
            "status": plain.status,
            "response_time_ms": plain.response_time_ms,
        }
        if plain.error:
            stages["plain"]["error"] = plain.error

    tls = report.tls
    if tls:
        stages["tls"] = {
            "verdict": tls.verdict.value,
            "status": tls.status,
            "response_time_ms": tls.response_time_ms,
            "tls_was_blocker": tls.tls_was_blocker,
            "profile": tls.profile_used,
        }
        if tls.error:
            stages["tls"]["error"] = tls.error

    vendor = report.vendor
    if vendor and vendor.vendor:
        stages["vendor"] = {
            "vendor": vendor.vendor,
            "confidence": vendor.confidence.value if vendor.confidence else None,
            "matched_signals": vendor.matched_signals,
        }

    rate = report.rate_limit
    if rate:
        stages["rate_limit"] = {
            "total_requests": rate.total_requests,
            "successful": rate.successful,
            "blocked": rate.blocked,
            "block_type": rate.block_type.value if rate.block_type else None,
            "estimated_safe_rps": rate.estimated_safe_rps,
            "retry_after_secs": rate.retry_after_secs,
            "median_response_ms": rate.median_response_ms,
        }

    # asdict() leaves Enum members untouched, so route anything json cannot
    # encode natively through the module's _default_json hook.
    print(json.dumps(out, indent=2, default=_default_json))
|
|
173
|
+
|
|
174
|
+
def _version_callback(value: bool) -> None:
    """Print the version and exit as soon as ``--version`` is parsed."""
    if value:
        print("scraperecon v0.1.0")
        raise typer.Exit()


@app.command()
def main(
    url: str = typer.Argument(..., help="Target URL"),
    probe_rate: bool = typer.Option(False, "--probe-rate", help="Run stage 4 (rate limit probe)"),
    concurrency: int = typer.Option(5, "--concurrency", help="Workers for rate probe"),
    requests: int = typer.Option(20, "--requests", help="Total requests for rate probe"),
    impersonate: str = typer.Option("chrome131", "--impersonate", help="TLS profile for stage 2"),
    timeout: int = typer.Option(10, "--timeout", help="Per-request timeout in seconds"),
    json_out: bool = typer.Option(False, "--json", help="Output machine-readable JSON"),
    skip_tls: bool = typer.Option(False, "--skip-tls", help="Skip stage 2"),
    skip_vendor: bool = typer.Option(False, "--skip-vendor", help="Skip stage 3"),
    save: bool = typer.Option(False, "--save", help="Save the full HTML responses to local files"),
    # Eager + callback: without this, `--version` alone fails because the
    # required URL argument is validated before the function body runs.
    version: bool = typer.Option(
        False,
        "--version",
        help="Print version",
        callback=_version_callback,
        is_eager=True,
    ),
):
    """Run the recon pipeline against URL and print the report."""
    # Kept for direct (non-CLI) calls, where the option callback does not run.
    if version:
        print("scraperecon v0.1.0")
        raise typer.Exit()

    report = run_pipeline(
        url=url,
        probe_rate=probe_rate,
        concurrency=concurrency,
        requests=requests,
        impersonate=impersonate,
        timeout=timeout,
        skip_tls=skip_tls,
        skip_vendor=skip_vendor,
    )

    if json_out:
        print_json(report)
    else:
        print_human(report)

    if save:
        from urllib.parse import urlparse

        # File names are derived from the host part; ':' (port separator) is
        # replaced so the name is valid on filesystems that reject it.
        domain = urlparse(report.target).netloc or "target"
        domain = domain.replace(":", "_")

        # With --json, stdout must contain only the JSON document, so the
        # human-readable save summary goes to stderr instead.
        console = Console(stderr=json_out)
        console.print()
        console.print("[bold]Saved Files[/bold]")

        if report.plain and not report.plain.error and report.plain.full_body:
            fname = f"{domain}_stage1.html"
            with open(fname, "w", encoding="utf-8") as f:
                f.write(report.plain.full_body)
            console.print(f" [green]Stage 1 saved to:[/green] {fname}")

        if report.tls and report.tls.verdict not in (Verdict.SKIPPED, Verdict.ERROR) and report.tls.full_body:
            fname = f"{domain}_stage2.html"
            with open(fname, "w", encoding="utf-8") as f:
                f.write(report.tls.full_body)
            console.print(f" [green]Stage 2 saved to:[/green] {fname}")
|
|
228
|
+
|
|
229
|
+
if __name__ == "__main__":
|
|
230
|
+
app()
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from .stages import plain, tls, vendor, ratelimit
|
|
3
|
+
from .report import build_recommendation
|
|
4
|
+
from .types import ReconReport
|
|
5
|
+
|
|
6
|
+
def run_pipeline(
    url: str,
    probe_rate: bool,
    concurrency: int,
    requests: int,
    impersonate: str,
    timeout: int,
    skip_tls: bool,
    skip_vendor: bool
) -> ReconReport:
    """Run the recon stages in order and bundle their results.

    Stage 1 (plain request) always runs. Stage 2 (TLS impersonation) is
    skipped when ``skip_tls`` is set or stage 1 reported an error. Stage 3
    (vendor detection) is skipped when ``skip_vendor`` is set. Stage 4 (rate
    probe) runs only when ``probe_rate`` is set and at least one of the first
    two stages finished without an error.

    Returns:
        A ``ReconReport`` holding each stage result (``None`` for stages that
        did not run) plus the derived recommendation.
    """
    # Stage 1
    stage1 = plain.run(url, timeout)

    # Stage 2
    stage2 = None
    if not (skip_tls or stage1.error):
        stage2 = tls.run(url, stage1, impersonate, timeout)

    # Stage 3
    stage3 = None if skip_vendor else vendor.run(stage1, stage2)

    # Stage 4
    stage1_ok = stage1 and not stage1.error
    stage2_ok = stage2 and not stage2.error
    stage4 = None
    if probe_rate and (stage1_ok or stage2_ok):
        stage4 = asyncio.run(
            ratelimit.run(url, True, requests, concurrency, stage1, stage2, impersonate)
        )

    return ReconReport(
        target=url,
        plain=stage1,
        tls=stage2,
        vendor=stage3,
        rate_limit=stage4,
        recommendation=build_recommendation(stage1, stage2, stage3, stage4),
    )
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from .types import Recommendation, ReconReport, PlainResult, TlsResult, VendorResult, RateLimitResult, Verdict, BlockType
|
|
3
|
+
|
|
4
|
+
def build_recommendation(
    plain: PlainResult,
    tls: Optional[TlsResult],
    vendor: Optional[VendorResult],
    rate: Optional[RateLimitResult]
) -> Recommendation:
    """Derive scraping advice from the individual stage results.

    Args:
        plain: Stage 1 result.
        tls: Stage 2 result, or ``None`` if the stage did not run.
        vendor: Stage 3 result, or ``None`` if the stage did not run.
        rate: Stage 4 result, or ``None`` if the stage did not run.

    Returns:
        A ``Recommendation`` with the impersonation, captcha and proxy flags
        and any free-text notes.
    """
    denied = (Verdict.BLOCKED, Verdict.CHALLENGED)
    no_response = (Verdict.SKIPPED, Verdict.ERROR)

    notes = []
    impersonate = False
    chosen_profile = None

    # An open plain request needs no impersonation at all.
    if plain.verdict != Verdict.OPEN:
        if tls and tls.tls_was_blocker:
            impersonate = True
            chosen_profile = tls.profile_used
        elif plain.verdict in denied and tls and tls.verdict in denied:
            impersonate = True
            notes.append("Both stages blocked/challenged: may need browser automation (Playwright + stealth)")

    if vendor and vendor.vendor == "Cloudflare":
        notes.append("Cloudflare detected: consider Playwright + stealth plugin if curl_cffi fails")

    # Any block type seen during the rate probe triggers the proxy advice.
    needs_proxy = bool(rate and rate.block_type is not None)

    # Inspect the stage 2 body when that stage produced a response,
    # otherwise fall back to the stage 1 body.
    source = tls if (tls and tls.verdict not in no_response) else plain
    body = source.body_preview.lower()
    saw_captcha = "captcha" in body or "challenge" in body

    return Recommendation(
        use_tls_impersonation=impersonate,
        profile=chosen_profile,
        captcha_detected=saw_captcha,
        proxy_recommended=needs_proxy,
        notes=notes,
    )
|
|
File without changes
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import httpx
|
|
3
|
+
from ..types import PlainResult, Verdict
|
|
4
|
+
from ..utils import is_challenge_body
|
|
5
|
+
|
|
6
|
+
# Request headers shared by the probe stages (this module, the TLS stage and
# the rate-limit workers all send them). The User-Agent names the tool itself.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; scraperecon/0.1)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5"
}
|
|
11
|
+
|
|
12
|
+
def get_verdict(status: int, body_preview: str = "") -> Verdict:
    """Classify a response from its status code and body text.

    A body that matches a challenge indicator wins over any status code.
    Otherwise: 2xx is OPEN, 301/302/307/308 is REDIRECTED, 403/429/503 is
    BLOCKED, and everything else is UNCERTAIN.
    """
    if is_challenge_body(body_preview):
        return Verdict.CHALLENGED
    if status in (301, 302, 307, 308):
        return Verdict.REDIRECTED
    if status in (403, 429, 503):
        return Verdict.BLOCKED
    return Verdict.OPEN if 200 <= status <= 299 else Verdict.UNCERTAIN
|
|
23
|
+
|
|
24
|
+
def run(url: str, timeout: int) -> PlainResult:
    """Stage 1: fetch ``url`` with a plain httpx client and classify the reply.

    Redirects are followed, so the verdict describes the final response.

    Args:
        url: Target URL.
        timeout: Per-request timeout in seconds, passed to ``httpx.Client``.

    Returns:
        A ``PlainResult``. Any exception raised while fetching or reading the
        response is reported as ``Verdict.ERROR`` with ``error`` set to the
        exception text; this function does not raise for request failures.
    """
    start_time = time.perf_counter()
    try:
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            resp = client.get(url, headers=DEFAULT_HEADERS)

        response_time_ms = int((time.perf_counter() - start_time) * 1000)

        body = resp.text
        body_preview = body[:2048]

        # Classify on the same 2048-character preview the TLS stage uses.
        # Scanning the full body here but only the preview there could mark
        # the same page CHALLENGED in stage 1 and OPEN in stage 2, which
        # would wrongly report TLS fingerprinting as the blocker.
        verdict = get_verdict(resp.status_code, body_preview)

        return PlainResult(
            verdict=verdict,
            status=resp.status_code,
            response_time_ms=response_time_ms,
            headers={k.lower(): v for k, v in resp.headers.items()},
            cookies=list(resp.cookies.keys()),
            body_preview=body_preview,
            final_url=str(resp.url),
            full_body=body,
        )
    except Exception as e:
        # Deliberate best effort: a failed probe is a result, not a crash.
        response_time_ms = int((time.perf_counter() - start_time) * 1000)
        return PlainResult(
            verdict=Verdict.ERROR,
            status=None,
            response_time_ms=response_time_ms,
            headers={},
            cookies=[],
            body_preview="",
            final_url=url,
            error=str(e),
        )
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import time
|
|
3
|
+
from typing import Optional
|
|
4
|
+
from curl_cffi import requests
|
|
5
|
+
import httpx
|
|
6
|
+
from ..types import RateLimitResult, BlockType, PlainResult, TlsResult, Verdict
|
|
7
|
+
from .plain import DEFAULT_HEADERS
|
|
8
|
+
|
|
9
|
+
async def _worker(queue: asyncio.Queue, results: list, client, is_httpx: bool):
    """Drain ``queue``, issuing one GET per queued URL and recording the outcome.

    Each outcome is appended to ``results`` as a dict with the keys
    ``status``, ``elapsed`` (milliseconds), ``is_soft_block`` and
    ``retry_after``. A request that raises is recorded with status ``0``.
    The worker returns once the queue is empty.
    """
    redirect_codes = (301, 302, 307, 308)

    while True:
        try:
            url = queue.get_nowait()
        except asyncio.QueueEmpty:
            break

        started = time.perf_counter()
        try:
            if is_httpx:
                resp = await client.get(url, headers=DEFAULT_HEADERS)
            else:
                # The non-httpx client needs redirects switched off explicitly.
                resp = await client.get(url, headers=DEFAULT_HEADERS, allow_redirects=False)

            status = resp.status_code
            retry_after = resp.headers.get("retry-after")
            elapsed = int((time.perf_counter() - started) * 1000)
            outcome = {
                "status": status,
                "elapsed": elapsed,
                "is_soft_block": status in (429, 503) or status in redirect_codes,
                "retry_after": retry_after,
            }
        except Exception:
            outcome = {
                "status": 0,
                "elapsed": int((time.perf_counter() - started) * 1000),
                "is_soft_block": False,
                "retry_after": None,
            }

        results.append(outcome)
        queue.task_done()
|
|
49
|
+
|
|
50
|
+
async def run(
    url: str,
    probe_rate: bool,
    requests_count: int,
    concurrency: int,
    plain_result: PlainResult,
    tls_result: Optional[TlsResult],
    profile: str
) -> Optional[RateLimitResult]:
    """Stage 4: send a burst of requests and look for rate limiting.

    Args:
        url: Target URL; it is requested ``requests_count`` times.
        probe_rate: When false the stage is a no-op and ``None`` is returned.
        requests_count: Total number of requests to send.
        concurrency: Number of worker tasks sharing the request queue.
        plain_result: Stage 1 result (not read by this function).
        tls_result: Stage 2 result; decides which HTTP client is used.
        profile: Impersonation profile for the curl_cffi session.

    Returns:
        A ``RateLimitResult`` summarising the burst, or ``None`` when
        ``probe_rate`` is false.
    """
    if not probe_rate:
        return None

    # Use the plain client whenever stage 2 produced no usable response.
    # ERROR is included (matching the vendor and recommendation logic) so a
    # failed impersonated request does not select the same failing client.
    use_plain = tls_result is None or tls_result.verdict in (Verdict.SKIPPED, Verdict.ERROR)

    queue = asyncio.Queue()
    for _ in range(requests_count):
        queue.put_nowait(url)

    results = []

    if use_plain:
        # NOTE(review): verify=False disables certificate verification here,
        # while stage 1 verifies certificates - confirm this is intended.
        session = httpx.AsyncClient(verify=False)
    else:
        session = requests.AsyncSession(impersonate=profile)

    async with session as client:
        tasks = []
        for _ in range(concurrency):
            tasks.append(asyncio.create_task(_worker(queue, results, client, use_plain)))
            await asyncio.sleep(0.1)  # start workers 100 ms apart
        await asyncio.gather(*tasks)

    successful = sum(1 for r in results if 200 <= r["status"] < 300)
    blocked = sum(1 for r in results if r["status"] >= 400 or r["status"] == 0)

    block_type = None
    retry_after_secs = None
    statuses = [r["status"] for r in results]

    # Precedence: explicit 429, then 403, then redirect-style soft blocks.
    if 429 in statuses:
        block_type = BlockType.RATE_LIMITED
        # Take the first 429 that carries a numeric Retry-After, rather than
        # giving up when the first 429 happens to lack the header.
        retry_after_secs = next(
            (
                int(r["retry_after"])
                for r in results
                if r["status"] == 429 and r["retry_after"] and r["retry_after"].isdigit()
            ),
            None,
        )
    elif 403 in statuses:
        block_type = BlockType.HARD_BLOCK
    elif any(r["is_soft_block"] and r["status"] not in (429, 503) for r in results):
        block_type = BlockType.SOFT_REDIRECT

    times = sorted(r["elapsed"] for r in results)
    median_response_ms = times[len(times) // 2] if times else 0

    # No explicit block seen: compare the median latency of the first and
    # last halves of the run; a >3x slowdown is treated as silent throttling.
    if block_type is None and len(results) >= 2:
        half = len(results) // 2
        first_half = sorted(r["elapsed"] for r in results[:half])
        second_half = sorted(r["elapsed"] for r in results[-half:])

        m1 = first_half[len(first_half) // 2] if first_half else 0
        m2 = second_half[len(second_half) // 2] if second_half else 0

        if m1 > 0 and m2 > m1 * 3:
            block_type = BlockType.SILENT

    if results:
        avg_response_time_secs = (sum(r["elapsed"] for r in results) / len(results)) / 1000
    else:
        avg_response_time_secs = 1.0
    if avg_response_time_secs == 0:
        avg_response_time_secs = 0.001  # avoid dividing by zero below

    estimated_safe_rps = None
    if block_type is not None and requests_count > 0:
        # Success ratio scaled by the throughput the workers could sustain.
        estimated_safe_rps = (successful / requests_count) * (concurrency / avg_response_time_secs)

    return RateLimitResult(
        total_requests=requests_count,
        successful=successful,
        blocked=blocked,
        block_type=block_type,
        estimated_safe_rps=estimated_safe_rps,
        retry_after_secs=retry_after_secs,
        median_response_ms=median_response_ms,
    )
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from curl_cffi import requests
|
|
3
|
+
from ..types import TlsResult, Verdict, PlainResult
|
|
4
|
+
from .plain import DEFAULT_HEADERS, get_verdict
|
|
5
|
+
|
|
6
|
+
def run(url: str, plain_result: PlainResult, profile: str, timeout: int) -> TlsResult:
    """Stage 2: repeat the request with a browser-impersonating TLS profile.

    When stage 1 was already OPEN no request is made and a SKIPPED result is
    returned. Otherwise the URL is fetched through curl_cffi with
    ``impersonate=profile``; ``tls_was_blocker`` is set when stage 1 was
    BLOCKED or CHALLENGED and this request comes back OPEN. Any exception is
    reported as a ``Verdict.ERROR`` result with ``error`` set.
    """
    if plain_result.verdict == Verdict.OPEN:
        return TlsResult(
            verdict=Verdict.SKIPPED,
            status=None,
            response_time_ms=0,
            headers={},
            cookies=[],
            body_preview="",
            tls_was_blocker=False,
            profile_used=profile,
        )

    began = time.perf_counter()
    try:
        resp = requests.get(
            url,
            headers=DEFAULT_HEADERS,
            impersonate=profile,
            timeout=timeout,
            allow_redirects=True,
        )
        elapsed_ms = int((time.perf_counter() - began) * 1000)

        lowered_headers = {k.lower(): v for k, v in resp.headers.items()}
        cookie_names = list(resp.cookies.keys())
        preview = resp.text[:2048]
        verdict = get_verdict(resp.status_code, preview)

        # Impersonation "fixed" the request only if stage 1 was denied and
        # this one got through.
        stage1_denied = plain_result.verdict in (Verdict.BLOCKED, Verdict.CHALLENGED)
        bypassed = stage1_denied and verdict == Verdict.OPEN

        return TlsResult(
            verdict=verdict,
            status=resp.status_code,
            response_time_ms=elapsed_ms,
            headers=lowered_headers,
            cookies=cookie_names,
            body_preview=preview,
            tls_was_blocker=bypassed,
            profile_used=profile,
            full_body=resp.text,
        )
    except Exception as exc:
        return TlsResult(
            verdict=Verdict.ERROR,
            status=None,
            response_time_ms=int((time.perf_counter() - began) * 1000),
            headers={},
            cookies=[],
            body_preview="",
            tls_was_blocker=False,
            profile_used=profile,
            error=str(exc),
        )
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from ..types import VendorResult, Confidence, PlainResult, TlsResult, Verdict
|
|
2
|
+
from ..signatures import load_signatures
|
|
3
|
+
|
|
4
|
+
def run(plain_result: PlainResult, tls_result: TlsResult | None) -> VendorResult:
    """Stage 3: score the response against the known vendor signatures.

    The stage 2 response is inspected when that stage produced one (verdict
    neither SKIPPED nor ERROR); otherwise the stage 1 response is used. Each
    vendor's score is the sum of its matched signal weights divided by the
    sum of all its signal weights. The best-scoring vendor is reported with
    HIGH (>= 0.8), MEDIUM (>= 0.5) or LOW (>= 0.3) confidence; below 0.3 no
    vendor is reported.
    """
    if tls_result and tls_result.verdict not in (Verdict.SKIPPED, Verdict.ERROR):
        source = tls_result
    else:
        source = plain_result

    sigs = load_signatures()

    headers = source.headers
    status = source.status
    body_lc = source.body_preview.lower()
    cookies_lc = [name.lower() for name in source.cookies]

    def describe_match(sig):
        """Return a human-readable label if ``sig`` matches, else ``None``."""
        kind = sig["type"]
        if kind == "header_present":
            key = sig["key"].lower()
            return f"{key} header" if key in headers else None
        if kind == "header_value":
            key = sig["key"].lower()
            val = sig["value"].lower()
            if key in headers and val in headers[key].lower():
                return f"{val} in {key} header"
            return None
        if kind == "cookie_name":
            if sig["value"].lower() in cookies_lc:
                return f"{sig['value']} cookie"
            return None
        if kind == "body_contains":
            if sig["value"].lower() in body_lc:
                return f"{sig['value']} in body"
            return None
        if kind == "status_code":
            if status == sig["value"]:
                return f"status {sig['value']}"
            return None
        return None

    all_scores = []
    best_vendor = None
    best_score = 0.0
    best_signals = []

    for entry in sigs.get("vendors", []):
        name = entry["name"]
        signals = entry.get("signals", [])

        ceiling = sum(s.get("weight", 0) for s in signals)
        if ceiling == 0:
            # Nothing to normalise against; the vendor is left unscored.
            continue

        score = 0.0
        hits = []
        for sig in signals:
            label = describe_match(sig)
            weight = sig.get("weight", 0)
            if label is not None:
                score += weight
                hits.append(label)

        ratio = score / ceiling
        all_scores.append((name, ratio))

        if ratio > best_score:
            best_score, best_vendor, best_signals = ratio, name, hits

    for floor, level in ((0.8, Confidence.HIGH), (0.5, Confidence.MEDIUM), (0.3, Confidence.LOW)):
        if best_score >= floor:
            return VendorResult(
                vendor=best_vendor,
                confidence=level,
                matched_signals=best_signals,
                all_scores=all_scores,
            )

    return VendorResult(
        vendor=None,
        confidence=None,
        matched_signals=[],
        all_scores=all_scores,
    )
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
class Verdict(Enum):
    """Outcome classification attached to a stage 1 / stage 2 result."""

    OPEN = "Open"
    BLOCKED = "Blocked"
    CHALLENGED = "Challenged"
    REDIRECTED = "Redirected"
    UNCERTAIN = "Uncertain"
    # The stage made no request.
    SKIPPED = "Skipped"
    # The request raised an exception.
    ERROR = "Error"
|
|
13
|
+
|
|
14
|
+
class Confidence(Enum):
    """Confidence level attached to a vendor detection."""

    HIGH = "High"
    MEDIUM = "Medium"
    LOW = "Low"
|
|
18
|
+
|
|
19
|
+
class BlockType(Enum):
    """Kind of blocking reported by the rate-limit probe."""

    HARD_BLOCK = "HardBlock"
    RATE_LIMITED = "RateLimited"
    SOFT_REDIRECT = "SoftRedirect"
    SILENT = "Silent"
|
|
24
|
+
|
|
25
|
+
@dataclass
class PlainResult:
    """Result of the stage 1 (plain HTTP) request."""

    verdict: Verdict
    # HTTP status code, or None when the request failed.
    status: Optional[int]
    response_time_ms: int
    headers: dict[str, str]
    # Cookie names only.
    cookies: list[str]
    body_preview: str
    final_url: str
    full_body: str = ""
    # Exception text when the request failed, otherwise None.
    error: Optional[str] = None
|
|
36
|
+
|
|
37
|
+
@dataclass
class TlsResult:
    """Result of the stage 2 (TLS impersonation) request."""

    verdict: Verdict
    # HTTP status code, or None when no response was obtained.
    status: Optional[int]
    response_time_ms: int
    headers: dict[str, str]
    # Cookie names only.
    cookies: list[str]
    body_preview: str
    tls_was_blocker: bool
    profile_used: str
    full_body: str = ""
    # Exception text when the request failed, otherwise None.
    error: Optional[str] = None
|
|
49
|
+
|
|
50
|
+
@dataclass
class VendorResult:
    """Result of the stage 3 (vendor detection) scoring."""

    # Name of the detected vendor, or None when nothing was detected.
    vendor: Optional[str]
    confidence: Optional[Confidence]
    matched_signals: list[str]
    # (vendor name, score) pairs for the vendors that were scored.
    all_scores: list[tuple[str, float]]
|
|
56
|
+
|
|
57
|
+
@dataclass
class RateLimitResult:
    """Result of the stage 4 (rate-limit probe) burst."""

    total_requests: int
    successful: int
    blocked: int
    # None when no blocking was observed.
    block_type: Optional[BlockType]
    estimated_safe_rps: Optional[float]
    retry_after_secs: Optional[int]
    median_response_ms: int
|
|
66
|
+
|
|
67
|
+
@dataclass
class Recommendation:
    """Scraping advice derived from the stage results."""

    use_tls_impersonation: bool
    # Impersonation profile to use; may be None.
    profile: Optional[str]
    captcha_detected: bool
    proxy_recommended: bool
    # Free-text hints; each instance gets its own list.
    notes: list[str] = field(default_factory=list)
|
|
74
|
+
|
|
75
|
+
@dataclass
class ReconReport:
    """Everything produced by one pipeline run against a single target."""

    # The URL that was probed.
    target: str
    # Per-stage results; a stage that did not run is None.
    plain: Optional[PlainResult]
    tls: Optional[TlsResult]
    vendor: Optional[VendorResult]
    rate_limit: Optional[RateLimitResult]
    recommendation: Recommendation
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import importlib.resources
|
|
3
|
+
from . import data
|
|
4
|
+
|
|
5
|
+
def load_indicators() -> list[str]:
    """Load the indicator strings from the packaged ``indicators.json``."""
    return json.loads(importlib.resources.read_text(data, "indicators.json"))
|
|
8
|
+
|
|
9
|
+
def is_challenge_body(body: str) -> bool:
    """Return True if ``body`` contains any known indicator string.

    Matching is case-insensitive. An empty or missing body never matches.
    """
    if not body:
        return False
    body_lower = body.lower()
    # Lowercase the indicators as well: with only the body lowered, an
    # indicator containing an uppercase character could never match.
    return any(indicator.lower() in body_lower for indicator in load_indicators())
|