scraperecon 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ pypi-publish:
9
+ name: Build and publish to PyPI
10
+ runs-on: ubuntu-latest
11
+ environment:
12
+ name: pypi
13
+ url: https://pypi.org/p/scraperecon
14
+ permissions:
15
+ id-token: write # IMPORTANT: mandatory for trusted publishing
16
+ contents: read # Needed for checkout
17
+
18
+ steps:
19
+ - name: Checkout repository
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Set up Python
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: "3.11"
26
+
27
+ - name: Install pypa/build
28
+ run: python -m pip install --upgrade pip build
29
+
30
+ - name: Build distribution
31
+ run: python -m build
32
+
33
+ - name: Publish to PyPI
34
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,90 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ share/python-wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+ cover/
52
+
53
+ # Environments
54
+ .env
55
+ .venv
56
+ env/
57
+ venv/
58
+ ENV/
59
+ env.bak/
60
+ venv.bak/
61
+
62
+ # IDEs and Editors
63
+ .vscode/
64
+ .idea/
65
+ *.swp
66
+ *.swo
67
+ *~
68
+ .project
69
+ .pydevproject
70
+
71
+ # OS generated files
72
+ .DS_Store
73
+ .DS_Store?
74
+ ._*
75
+ .Spotlight-V100
76
+ .Trashes
77
+ ehthumbs.db
78
+ Thumbs.db
79
+
80
+ # mypy
81
+ .mypy_cache/
82
+ .dmypy.json
83
+ dmypy.json
84
+ .pyre/
85
+
86
+ # Pyre type checker
87
+ .pyre/
88
+
89
+ # ruff / linters
90
+ .ruff_cache/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shaheer Sarfaraz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,150 @@
1
+ Metadata-Version: 2.4
2
+ Name: scraperecon
3
+ Version: 0.1.0
4
+ Summary: A CLI tool that runs a sequential reconnaissance pipeline against a target URL before a developer writes a scraper.
5
+ License-File: LICENSE
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: curl-cffi>=0.7
8
+ Requires-Dist: httpx>=0.27
9
+ Requires-Dist: rich>=13
10
+ Requires-Dist: typer>=0.12
11
+ Description-Content-Type: text/markdown
12
+
13
+ # scraperecon
14
+
15
+ Run this before you write a scraper. It tells you what bot protection a site has, whether plain HTTP or TLS impersonation is enough to get through, and how aggressively it rate limits — before you've written a single line of scraper code.
16
+
17
+ <img width="1117" height="409" alt="image" src="https://github.com/user-attachments/assets/aa7d0670-c612-4cfb-a579-b610b9c04163" />
18
+
19
+ <br/>
20
+
21
+ ## Usage
22
+
23
+ ```bash
24
+ scraperecon https://target.com
25
+ ```
26
+
27
+ ```
28
+ scraperecon — https://target.com
29
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
30
+
31
+ Stage 1 — Plain HTTP (httpx, scraper User-Agent)
32
+ Status: 403 Forbidden
33
+ Time: 212ms
34
+ Verdict: Blocked
35
+
36
+ Stage 2 — TLS Impersonation (chrome131)
37
+ Status: 200 OK
38
+ Time: 389ms
39
+ Verdict: Open
40
+ Note: TLS fingerprint was the blocker ✓
41
+
42
+ Stage 3 — Vendor Detection
43
+ Vendor: Cloudflare
44
+ Confidence: High
45
+ Signals: cf-ray header, __cf_bm cookie, challenges.cloudflare.com in body
46
+
47
+ Stage 4 — Rate Limit Probe
48
+ Skipped (pass --probe-rate to enable)
49
+
50
+ Recommendation
51
+ Use curl_cffi with chrome131 TLS profile
52
+ No CAPTCHA detected at probe volume
53
+ Proxy rotation not required at low request rates
54
+ ```
55
+
56
+ ---
57
+
58
+ ## What it does
59
+
60
+ scraperecon runs four stages against a URL in order, stopping early where it can.
61
+
62
+ **Stage 1 — Plain HTTP**
63
+
64
+ A basic GET with no tricks. If this comes back clean, you don't need anything else — plain `httpx` or `requests` will work fine and you can stop here.
65
+
66
+ It also checks whether a 200 response is actually real content or a JS challenge page. Cloudflare in particular loves returning 200 with a challenge rather than a 403. scraperecon catches that and marks it `Challenged` instead of lying to you with a green `Open`.
67
+
68
+ **Stage 2 — TLS Impersonation**
69
+
70
+ If Stage 1 was blocked or challenged, it retries using `curl_cffi` impersonating Chrome's TLS fingerprint. A lot of bot detection happens at the TLS handshake level — Python's `requests` library has a completely different fingerprint from a real browser, and that alone is enough to get you blocked on many sites before the server has even looked at your headers. If Stage 2 passes where Stage 1 didn't, you know exactly what the fix is.
71
+
72
+ **Stage 3 — Vendor Detection**
73
+
74
+ Inspects headers, cookies, and the response body for known signatures and tells you which bot protection vendor is running. This matters because Cloudflare, DataDome, Akamai, and PerimeterX all require different bypass strategies. Knowing which one you're dealing with upfront saves you from trying things that were never going to work.
75
+
76
+ **Stage 4 — Rate Limit Probe** _(opt-in)_
77
+
78
+ Fires N requests with configurable concurrency and watches what happens — hard 429s, silent response time degradation, mid-session redirects. Off by default because blasting a site without thinking about it is bad practice. Pass `--probe-rate` when you actually need the data.
79
+
80
+ ---
81
+
82
+ ## Install
83
+
84
+ ```bash
85
+ pipx install scraperecon
86
+ ```
87
+
88
+ ---
89
+
90
+ ## Usage
91
+
92
+ ```bash
93
+ scraperecon https://target.com
94
+ scraperecon https://target.com --probe-rate
95
+ scraperecon https://target.com --probe-rate --concurrency 10 --requests 50
96
+ scraperecon https://target.com --impersonate firefox120
97
+ scraperecon https://target.com --save
98
+ scraperecon https://target.com --json | jq .recommendation
+ ```
99
+
100
+ | Flag | Default | Description |
101
+ | --------------- | --------- | ------------------------------------------------------------------------------------ |
102
+ | `--probe-rate` | off | Run Stage 4 rate limit probe |
103
+ | `--concurrency` | 5 | Workers for rate probe |
104
+ | `--requests` | 20 | Total requests for rate probe |
105
+ | `--impersonate` | chrome131 | TLS profile for Stage 2. Options: `chrome131`, `chrome120`, `firefox120`, `safari17` |
106
+ | `--timeout` | 10 | Per-request timeout in seconds |
107
+ | `--json` | off | Machine-readable JSON output |
108
+ | `--save` | off | Save the full HTML responses to local files (`<domain>_stage1.html`, etc.) |
109
+ | `--skip-tls` | off | Skip Stage 2 |
110
+ | `--skip-vendor` | off | Skip Stage 3 |
111
+
112
+ ---
113
+
114
+ ## Reading the recommendation
115
+
116
+ At the end of every run you get a plain-English recommendation based on what was found.
117
+
118
+ - **Plain HTTP should be sufficient** — `httpx` or `requests` will work. No special setup needed.
119
+ - **Use curl_cffi with `<profile>`** — TLS fingerprinting is blocking you. Switch to `curl_cffi` with the listed profile.
120
+ - **May need browser automation** — both plain and TLS requests were blocked. You're likely looking at a full JS challenge (Turnstile, hCaptcha). Playwright with a stealth plugin is probably your next move.
121
+ - **Proxy rotation recommended** — the rate probe hit throttling. At any real request volume you'll need rotating proxies.
122
+ - **CAPTCHA detected** — the response body contained CAPTCHA indicators. Automated solving or a managed scraping service required.
123
+
124
+ ---
125
+
126
+ ## Adding vendor signatures
127
+
128
+ Signatures live in `scraperecon/data/signatures.json`. It's a flat JSON file — no code required. If you know a signal that's missing, open a PR.
129
+
130
+ ```json
131
+ {
132
+ "name": "YourVendor",
133
+ "signals": [
134
+ { "type": "header_present", "key": "x-your-vendor", "weight": 0.8 },
135
+ { "type": "cookie_name", "value": "your_cookie", "weight": 0.6 }
136
+ ]
137
+ }
138
+ ```
139
+
140
+ Signal types: `header_present`, `header_value`, `cookie_name`, `body_contains`, `status_code`.
141
+
142
+ ---
143
+
144
+ ## What it won't do
145
+
146
+ scraperecon is a recon tool, not a scraping library. It tells you what you need — it doesn't do it for you. No CAPTCHA solving, no Playwright integration, no proxy support, no persistent history.
147
+
148
+ ---
149
+
150
+ Every scraper project starts with the same 20 minutes of manual work: try curl, get blocked, try curl_cffi, check the headers, fire some requests and see what happens. This automates that.
@@ -0,0 +1,138 @@
1
+ # scraperecon
2
+
3
+ Run this before you write a scraper. It tells you what bot protection a site has, whether plain HTTP or TLS impersonation is enough to get through, and how aggressively it rate limits — before you've written a single line of scraper code.
4
+
5
+ <img width="1117" height="409" alt="image" src="https://github.com/user-attachments/assets/aa7d0670-c612-4cfb-a579-b610b9c04163" />
6
+
7
+ <br/>
8
+
9
+ ## Usage
10
+
11
+ ```bash
12
+ scraperecon https://target.com
13
+ ```
14
+
15
+ ```
16
+ scraperecon — https://target.com
17
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
18
+
19
+ Stage 1 — Plain HTTP (httpx, scraper User-Agent)
20
+ Status: 403 Forbidden
21
+ Time: 212ms
22
+ Verdict: Blocked
23
+
24
+ Stage 2 — TLS Impersonation (chrome131)
25
+ Status: 200 OK
26
+ Time: 389ms
27
+ Verdict: Open
28
+ Note: TLS fingerprint was the blocker ✓
29
+
30
+ Stage 3 — Vendor Detection
31
+ Vendor: Cloudflare
32
+ Confidence: High
33
+ Signals: cf-ray header, __cf_bm cookie, challenges.cloudflare.com in body
34
+
35
+ Stage 4 — Rate Limit Probe
36
+ Skipped (pass --probe-rate to enable)
37
+
38
+ Recommendation
39
+ Use curl_cffi with chrome131 TLS profile
40
+ No CAPTCHA detected at probe volume
41
+ Proxy rotation not required at low request rates
42
+ ```
43
+
44
+ ---
45
+
46
+ ## What it does
47
+
48
+ scraperecon runs four stages against a URL in order, stopping early where it can.
49
+
50
+ **Stage 1 — Plain HTTP**
51
+
52
+ A basic GET with no tricks. If this comes back clean, you don't need anything else — plain `httpx` or `requests` will work fine and you can stop here.
53
+
54
+ It also checks whether a 200 response is actually real content or a JS challenge page. Cloudflare in particular loves returning 200 with a challenge rather than a 403. scraperecon catches that and marks it `Challenged` instead of lying to you with a green `Open`.
55
+
56
+ **Stage 2 — TLS Impersonation**
57
+
58
+ If Stage 1 was blocked or challenged, it retries using `curl_cffi` impersonating Chrome's TLS fingerprint. A lot of bot detection happens at the TLS handshake level — Python's `requests` library has a completely different fingerprint from a real browser, and that alone is enough to get you blocked on many sites before the server has even looked at your headers. If Stage 2 passes where Stage 1 didn't, you know exactly what the fix is.
59
+
60
+ **Stage 3 — Vendor Detection**
61
+
62
+ Inspects headers, cookies, and the response body for known signatures and tells you which bot protection vendor is running. This matters because Cloudflare, DataDome, Akamai, and PerimeterX all require different bypass strategies. Knowing which one you're dealing with upfront saves you from trying things that were never going to work.
63
+
64
+ **Stage 4 — Rate Limit Probe** _(opt-in)_
65
+
66
+ Fires N requests with configurable concurrency and watches what happens — hard 429s, silent response time degradation, mid-session redirects. Off by default because blasting a site without thinking about it is bad practice. Pass `--probe-rate` when you actually need the data.
67
+
68
+ ---
69
+
70
+ ## Install
71
+
72
+ ```bash
73
+ pipx install scraperecon
74
+ ```
75
+
76
+ ---
77
+
78
+ ## Usage
79
+
80
+ ```bash
81
+ scraperecon https://target.com
82
+ scraperecon https://target.com --probe-rate
83
+ scraperecon https://target.com --probe-rate --concurrency 10 --requests 50
84
+ scraperecon https://target.com --impersonate firefox120
85
+ scraperecon https://target.com --save
86
+ scraperecon https://target.com --json | jq .recommendation
+ ```
87
+
88
+ | Flag | Default | Description |
89
+ | --------------- | --------- | ------------------------------------------------------------------------------------ |
90
+ | `--probe-rate` | off | Run Stage 4 rate limit probe |
91
+ | `--concurrency` | 5 | Workers for rate probe |
92
+ | `--requests` | 20 | Total requests for rate probe |
93
+ | `--impersonate` | chrome131 | TLS profile for Stage 2. Options: `chrome131`, `chrome120`, `firefox120`, `safari17` |
94
+ | `--timeout` | 10 | Per-request timeout in seconds |
95
+ | `--json` | off | Machine-readable JSON output |
96
+ | `--save` | off | Save the full HTML responses to local files (`<domain>_stage1.html`, etc.) |
97
+ | `--skip-tls` | off | Skip Stage 2 |
98
+ | `--skip-vendor` | off | Skip Stage 3 |
99
+
100
+ ---
101
+
102
+ ## Reading the recommendation
103
+
104
+ At the end of every run you get a plain-English recommendation based on what was found.
105
+
106
+ - **Plain HTTP should be sufficient** — `httpx` or `requests` will work. No special setup needed.
107
+ - **Use curl_cffi with `<profile>`** — TLS fingerprinting is blocking you. Switch to `curl_cffi` with the listed profile.
108
+ - **May need browser automation** — both plain and TLS requests were blocked. You're likely looking at a full JS challenge (Turnstile, hCaptcha). Playwright with a stealth plugin is probably your next move.
109
+ - **Proxy rotation recommended** — the rate probe hit throttling. At any real request volume you'll need rotating proxies.
110
+ - **CAPTCHA detected** — the response body contained CAPTCHA indicators. Automated solving or a managed scraping service required.
111
+
112
+ ---
113
+
114
+ ## Adding vendor signatures
115
+
116
+ Signatures live in `scraperecon/data/signatures.json`. It's a flat JSON file — no code required. If you know a signal that's missing, open a PR.
117
+
118
+ ```json
119
+ {
120
+ "name": "YourVendor",
121
+ "signals": [
122
+ { "type": "header_present", "key": "x-your-vendor", "weight": 0.8 },
123
+ { "type": "cookie_name", "value": "your_cookie", "weight": 0.6 }
124
+ ]
125
+ }
126
+ ```
127
+
128
+ Signal types: `header_present`, `header_value`, `cookie_name`, `body_contains`, `status_code`.
129
+
130
+ ---
131
+
132
+ ## What it won't do
133
+
134
+ scraperecon is a recon tool, not a scraping library. It tells you what you need — it doesn't do it for you. No CAPTCHA solving, no Playwright integration, no proxy support, no persistent history.
135
+
136
+ ---
137
+
138
+ Every scraper project starts with the same 20 minutes of manual work: try curl, get blocked, try curl_cffi, check the headers, fire some requests and see what happens. This automates that.
@@ -0,0 +1,19 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "scraperecon"
7
+ version = "0.1.0"
8
+ description = "A CLI tool that runs a sequential reconnaissance pipeline against a target URL before a developer writes a scraper."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "httpx>=0.27",
13
+ "curl-cffi>=0.7",
14
+ "typer>=0.12",
15
+ "rich>=13"
16
+ ]
17
+
18
+ [project.scripts]
19
+ scraperecon = "scraperecon.main:app"
File without changes
@@ -0,0 +1,14 @@
1
+ [
2
+ "cf-browser-verification",
3
+ "challenges.cloudflare.com",
4
+ "turnstile",
5
+ "cf_chl",
6
+ "jschl_vc",
7
+ "ray id",
8
+ "enable javascript",
9
+ "checking your browser",
10
+ "datadome",
11
+ "px-captcha",
12
+ "captcha",
13
+ "challenge"
14
+ ]
@@ -0,0 +1,61 @@
1
+ {
2
+ "vendors": [
3
+ {
4
+ "name": "Cloudflare",
5
+ "signals": [
6
+ { "type": "header_present", "key": "cf-ray", "weight": 0.6 },
7
+ { "type": "cookie_name", "value": "__cf_bm", "weight": 0.3 },
8
+ { "type": "cookie_name", "value": "cf_clearance", "weight": 0.3 },
9
+ { "type": "header_value", "key": "server", "value": "cloudflare", "weight": 0.2 },
10
+ { "type": "body_contains", "value": "cf-browser-verification", "weight": 0.4 },
11
+ { "type": "body_contains", "value": "challenges.cloudflare.com", "weight": 0.5 }
12
+ ]
13
+ },
14
+ {
15
+ "name": "DataDome",
16
+ "signals": [
17
+ { "type": "cookie_name", "value": "datadome", "weight": 0.8 },
18
+ { "type": "header_present", "key": "x-datadome-cid", "weight": 0.5 },
19
+ { "type": "body_contains", "value": "datadome", "weight": 0.4 }
20
+ ]
21
+ },
22
+ {
23
+ "name": "Akamai",
24
+ "signals": [
25
+ { "type": "cookie_name", "value": "_abck", "weight": 0.7 },
26
+ { "type": "cookie_name", "value": "bm_sz", "weight": 0.5 },
27
+ { "type": "header_present", "key": "x-akamai-edgescape", "weight": 0.4 }
28
+ ]
29
+ },
30
+ {
31
+ "name": "PerimeterX",
32
+ "signals": [
33
+ { "type": "cookie_name", "value": "_px", "weight": 0.6 },
34
+ { "type": "cookie_name", "value": "_pxhd", "weight": 0.4 },
35
+ { "type": "body_contains", "value": "px-captcha", "weight": 0.5 }
36
+ ]
37
+ },
38
+ {
39
+ "name": "Kasada",
40
+ "signals": [
41
+ { "type": "header_present", "key": "x-kasada-pow", "weight": 0.8 },
42
+ { "type": "body_contains", "value": "kasada", "weight": 0.5 }
43
+ ]
44
+ },
45
+ {
46
+ "name": "Imperva",
47
+ "signals": [
48
+ { "type": "cookie_name", "value": "incap_ses", "weight": 0.6 },
49
+ { "type": "cookie_name", "value": "visid_incap", "weight": 0.5 },
50
+ { "type": "header_present", "key": "x-iinfo", "weight": 0.4 }
51
+ ]
52
+ },
53
+ {
54
+ "name": "AWS WAF",
55
+ "signals": [
56
+ { "type": "cookie_name", "value": "aws-waf-token", "weight": 0.8 },
57
+ { "type": "body_contains", "value": "awswaf", "weight": 0.5 }
58
+ ]
59
+ }
60
+ ]
61
+ }
@@ -0,0 +1,230 @@
1
+ import sys
2
+ import json
3
+ import dataclasses
4
+ from enum import Enum
5
+ import typer
6
+ from rich.console import Console
7
+ from rich.text import Text
8
+ from .pipeline import run_pipeline
9
+ from .types import ReconReport, Verdict, Confidence
10
+
11
# Typer application object; the command is registered on it with @app.command().
# Shell-completion installation options are disabled.
app = typer.Typer(add_completion=False)
12
+
13
+ def _default_json(obj):
14
+ if isinstance(obj, Enum):
15
+ return obj.value
16
+ if dataclasses.is_dataclass(obj):
17
+ return dataclasses.asdict(obj)
18
+ raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
19
+
20
def format_verdict(verdict: Verdict) -> Text:
    """Render a verdict as a styled ``rich`` ``Text`` label for terminal output."""
    # Verdicts with a fixed label and colour, checked in the same order as before.
    fixed_labels = (
        (Verdict.OPEN, "Open", "bold green"),
        (Verdict.BLOCKED, "Blocked", "bold red"),
        (Verdict.CHALLENGED, "Challenged", "bold magenta"),
    )
    for member, label, style in fixed_labels:
        if verdict == member:
            return Text(label, style=style)

    # These two reuse the enum's own value as the label.
    if verdict in (Verdict.UNCERTAIN, Verdict.SKIPPED):
        return Text(verdict.value, style="bold yellow")
    if verdict == Verdict.ERROR:
        return Text("Error", style="bold red")

    # Any other verdict: show its value without a colour.
    return Text(verdict.value, style="bold")
33
+
34
def _print_plain_stage(console, err_console, plain):
    """Print the Stage 1 (plain HTTP) section."""
    console.print("[bold]Stage 1 — Plain HTTP (httpx, scraper User-Agent)[/bold]")
    if plain.error:
        # The error text goes to stderr; the verdict line still goes to stdout.
        err_console.print(f" [red]Error:[/red] {plain.error}")
    else:
        console.print(f" Status: {plain.status}")
        console.print(f" Time: {plain.response_time_ms}ms")
    console.print(" Verdict: ", format_verdict(plain.verdict))
    console.print()


def _print_tls_stage(console, err_console, tls):
    """Print the Stage 2 (TLS impersonation) section, or a skip notice."""
    if not tls:
        console.print("[bold]Stage 2 — TLS Impersonation[/bold]")
        console.print(" [yellow]Skipped[/yellow]")
        console.print()
        return

    console.print(f"[bold]Stage 2 — TLS Impersonation ({tls.profile_used})[/bold]")
    if tls.error:
        err_console.print(f" [red]Error:[/red] {tls.error}")
        console.print(" Verdict: ", format_verdict(tls.verdict))
    elif tls.verdict == Verdict.SKIPPED:
        console.print(" [yellow]Skipped (Stage 1 was Open)[/yellow]")
    else:
        console.print(f" Status: {tls.status}")
        console.print(f" Time: {tls.response_time_ms}ms")
        console.print(" Verdict: ", format_verdict(tls.verdict))
        if tls.tls_was_blocker:
            console.print(" Note: [green]TLS fingerprint was the blocker ✓[/green]")
    console.print()


def _print_vendor_stage(console, vendor):
    """Print the Stage 3 (vendor detection) section."""
    console.print("[bold]Stage 3 — Vendor Detection[/bold]")
    if not vendor:
        console.print(" [yellow]Skipped[/yellow]")
    elif not vendor.vendor:
        console.print(" [dim]No known vendor detected[/dim]")
    else:
        console.print(f" Vendor: [bold]{vendor.vendor}[/bold]")
        console.print(f" Confidence: {vendor.confidence.value}")
        console.print(f" Signals: {', '.join(vendor.matched_signals)}")
    console.print()


def _print_rate_stage(console, rate):
    """Print the Stage 4 (rate limit probe) section."""
    console.print("[bold]Stage 4 — Rate Limit Probe[/bold]")
    if not rate:
        console.print(" [yellow]Skipped (pass --probe-rate to enable)[/yellow]")
        console.print()
        return

    console.print(f" Requests: {rate.successful} successful, {rate.blocked} blocked (out of {rate.total_requests})")
    console.print(f" Median ms: {rate.median_response_ms}ms")
    if not rate.block_type:
        console.print(" [green]No rate limit detected at this volume[/green]")
    else:
        console.print(f" Block Type: [bold red]{rate.block_type.value}[/bold red]")
        if rate.estimated_safe_rps:
            console.print(f" Est. Safe: ~{rate.estimated_safe_rps:.1f} req/s")
    console.print()


def _print_recommendation(console, rec):
    """Print the closing recommendation section."""
    console.print("[bold]Recommendation[/bold]")

    if rec.use_tls_impersonation:
        profile_label = rec.profile or 'appropriate'
        console.print(f" [green]Use curl_cffi with {profile_label} TLS profile[/green]")
    else:
        console.print(" [green]Plain HTTP (httpx/requests) should be sufficient[/green]")

    console.print(
        " [red]CAPTCHA/Challenge detected[/red]"
        if rec.captcha_detected
        else " No CAPTCHA detected"
    )
    console.print(
        " [yellow]Proxy rotation recommended due to rate limits[/yellow]"
        if rec.proxy_recommended
        else " Proxy rotation not strictly required at tested volume"
    )

    for note in rec.notes:
        console.print(f" Note: {note}")


def print_human(report: ReconReport):
    """Print the report as a human-readable, colourised terminal summary.

    Stage sections are printed in pipeline order, followed by the
    recommendation. Stage error messages are written to stderr; everything
    else is written to stdout.
    """
    console = Console()
    err_console = Console(stderr=True)

    console.print(f"[bold]scraperecon v0.1.0[/bold] — {report.target}")
    console.print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    console.print()

    _print_plain_stage(console, err_console, report.plain)
    _print_tls_stage(console, err_console, report.tls)
    _print_vendor_stage(console, report.vendor)
    _print_rate_stage(console, report.rate_limit)
    _print_recommendation(console, report.recommendation)
120
+
121
def print_json(report: ReconReport):
    """Print the report to stdout as indented JSON.

    All four stage keys are always present under ``"stages"``. A stage with
    no result, or a vendor stage that identified no vendor, is emitted as
    ``null``. An ``"error"`` key is added to the plain/tls entries only when
    that stage recorded an error.
    """
    recommendation = dataclasses.asdict(report.recommendation)
    stages = {"plain": None, "tls": None, "vendor": None, "rate_limit": None}

    plain = report.plain
    if plain:
        plain_entry = {
            "verdict": plain.verdict.value,
            "status": plain.status,
            "response_time_ms": plain.response_time_ms,
        }
        if plain.error:
            plain_entry["error"] = plain.error
        stages["plain"] = plain_entry

    tls = report.tls
    if tls:
        tls_entry = {
            "verdict": tls.verdict.value,
            "status": tls.status,
            "response_time_ms": tls.response_time_ms,
            "tls_was_blocker": tls.tls_was_blocker,
            "profile": tls.profile_used,
        }
        if tls.error:
            tls_entry["error"] = tls.error
        stages["tls"] = tls_entry

    vendor = report.vendor
    if vendor and vendor.vendor:
        confidence = vendor.confidence.value if vendor.confidence else None
        stages["vendor"] = {
            "vendor": vendor.vendor,
            "confidence": confidence,
            "matched_signals": vendor.matched_signals,
        }

    rate = report.rate_limit
    if rate:
        block_type = rate.block_type.value if rate.block_type else None
        stages["rate_limit"] = {
            "total_requests": rate.total_requests,
            "successful": rate.successful,
            "blocked": rate.blocked,
            "block_type": block_type,
            "estimated_safe_rps": rate.estimated_safe_rps,
            "retry_after_secs": rate.retry_after_secs,
            "median_response_ms": rate.median_response_ms,
        }

    payload = {
        "target": report.target,
        "stages": stages,
        "recommendation": recommendation,
    }
    print(json.dumps(payload, indent=2))
173
+
174
def _version_callback(value: bool):
    """Print the version and exit as soon as ``--version`` is parsed.

    Registered as an eager option callback so that ``scraperecon --version``
    works without supplying the otherwise-required URL argument.
    """
    if value:
        print("scraperecon v0.1.0")
        raise typer.Exit()


@app.command()
def main(
    url: str = typer.Argument(..., help="Target URL"),
    probe_rate: bool = typer.Option(False, "--probe-rate", help="Run stage 4 (rate limit probe)"),
    concurrency: int = typer.Option(5, "--concurrency", help="Workers for rate probe"),
    requests: int = typer.Option(20, "--requests", help="Total requests for rate probe"),
    impersonate: str = typer.Option("chrome131", "--impersonate", help="TLS profile for stage 2"),
    timeout: int = typer.Option(10, "--timeout", help="Per-request timeout in seconds"),
    json_out: bool = typer.Option(False, "--json", help="Output machine-readable JSON"),
    skip_tls: bool = typer.Option(False, "--skip-tls", help="Skip stage 2"),
    skip_vendor: bool = typer.Option(False, "--skip-vendor", help="Skip stage 3"),
    save: bool = typer.Option(False, "--save", help="Save the full HTML responses to local files"),
    version: bool = typer.Option(
        False,
        "--version",
        help="Print version",
        callback=_version_callback,
        is_eager=True,
    ),
):
    """Run the reconnaissance pipeline against URL and print the report."""
    # Normally handled by the eager callback above; kept so that a direct
    # call with version=True behaves as before.
    if version:
        print("scraperecon v0.1.0")
        raise typer.Exit()

    report = run_pipeline(
        url=url,
        probe_rate=probe_rate,
        concurrency=concurrency,
        requests=requests,
        impersonate=impersonate,
        timeout=timeout,
        skip_tls=skip_tls,
        skip_vendor=skip_vendor,
    )

    if json_out:
        print_json(report)
    else:
        print_human(report)

    if save:
        from urllib.parse import urlparse

        # File names are derived from the host part of the target URL; ':'
        # (port separator) is replaced because it is not portable in file names.
        domain = urlparse(report.target).netloc or "target"
        domain = domain.replace(":", "_")

        # In JSON mode stdout must contain only the JSON document (it is meant
        # to be piped into tools such as jq), so status lines go to stderr.
        console = Console(stderr=json_out)
        console.print()
        console.print("[bold]Saved Files[/bold]")

        if report.plain and not report.plain.error and report.plain.full_body:
            fname = f"{domain}_stage1.html"
            with open(fname, "w", encoding="utf-8") as f:
                f.write(report.plain.full_body)
            console.print(f" [green]Stage 1 saved to:[/green] {fname}")

        if report.tls and report.tls.verdict not in (Verdict.SKIPPED, Verdict.ERROR) and report.tls.full_body:
            fname = f"{domain}_stage2.html"
            with open(fname, "w", encoding="utf-8") as f:
                f.write(report.tls.full_body)
            console.print(f" [green]Stage 2 saved to:[/green] {fname}")
228
+
229
# Allow running this module directly as a script.
if __name__ == "__main__":
    app()
@@ -0,0 +1,49 @@
1
+ import asyncio
2
+ from .stages import plain, tls, vendor, ratelimit
3
+ from .report import build_recommendation
4
+ from .types import ReconReport
5
+
6
def run_pipeline(
    url: str,
    probe_rate: bool,
    concurrency: int,
    requests: int,
    impersonate: str,
    timeout: int,
    skip_tls: bool,
    skip_vendor: bool
) -> ReconReport:
    """Run the recon stages in order against ``url`` and assemble the report.

    Stage 2 is not run when ``skip_tls`` is set or Stage 1 recorded an error.
    Stage 3 is not run when ``skip_vendor`` is set. Stage 4 runs only when
    ``probe_rate`` is set and at least one of the first two stages finished
    without an error. Stages that are not run are stored as ``None``.
    """
    # Stage 1: plain HTTP request.
    plain_res = plain.run(url, timeout)

    # Stage 2: TLS impersonation.
    tls_res = None
    if not (skip_tls or plain_res.error):
        tls_res = tls.run(url, plain_res, impersonate, timeout)

    # Stage 3: vendor detection over the results gathered so far.
    vendor_res = None if skip_vendor else vendor.run(plain_res, tls_res)

    # Stage 4: optional rate limit probe (async, driven to completion here).
    rate_res = None
    if probe_rate and ((plain_res and not plain_res.error) or (tls_res and not tls_res.error)):
        probe = ratelimit.run(
            url, True, requests, concurrency, plain_res, tls_res, impersonate
        )
        rate_res = asyncio.run(probe)

    recommendation = build_recommendation(plain_res, tls_res, vendor_res, rate_res)

    return ReconReport(
        target=url,
        plain=plain_res,
        tls=tls_res,
        vendor=vendor_res,
        rate_limit=rate_res,
        recommendation=recommendation,
    )
@@ -0,0 +1,47 @@
1
+ from typing import Optional
2
+ from .types import Recommendation, ReconReport, PlainResult, TlsResult, VendorResult, RateLimitResult, Verdict, BlockType
3
+
4
def build_recommendation(
    plain: PlainResult,
    tls: Optional[TlsResult],
    vendor: Optional[VendorResult],
    rate: Optional[RateLimitResult]
) -> Recommendation:
    """Derive the final recommendation from the individual stage results.

    Args:
        plain: Stage 1 result.
        tls: Stage 2 result, or ``None`` when the stage did not run.
        vendor: Stage 3 result, or ``None`` when the stage did not run.
        rate: Stage 4 result, or ``None`` when the stage did not run.

    Returns:
        A ``Recommendation`` with the TLS-impersonation advice (and profile
        when TLS impersonation was what got through), a CAPTCHA/challenge
        flag, a proxy flag, and free-form notes.
    """
    use_tls_impersonation = False
    profile = None
    captcha_detected = False
    proxy_recommended = False
    notes = []

    blocked_like = (Verdict.BLOCKED, Verdict.CHALLENGED)
    if plain.verdict == Verdict.OPEN:
        use_tls_impersonation = False
    elif tls and tls.tls_was_blocker:
        use_tls_impersonation = True
        profile = tls.profile_used
    elif plain.verdict in blocked_like and tls and tls.verdict in blocked_like:
        use_tls_impersonation = True
        notes.append("Both stages blocked/challenged: may need browser automation (Playwright + stealth)")

    if vendor and vendor.vendor == "Cloudflare":
        notes.append("Cloudflare detected: consider Playwright + stealth plugin if curl_cffi fails")

    # Any block type reported by the rate probe means throttling was observed.
    if rate and rate.block_type is not None:
        proxy_recommended = True

    # Prefer the TLS stage body when that stage produced a response; otherwise
    # fall back to the plain stage. `or ""` keeps this safe if a result
    # carries no body preview (None) instead of raising AttributeError.
    if tls and tls.verdict not in (Verdict.SKIPPED, Verdict.ERROR):
        body_to_check = (tls.body_preview or "").lower()
    else:
        body_to_check = (plain.body_preview or "").lower()

    if "captcha" in body_to_check or "challenge" in body_to_check:
        captcha_detected = True

    return Recommendation(
        use_tls_impersonation=use_tls_impersonation,
        profile=profile,
        captcha_detected=captcha_detected,
        proxy_recommended=proxy_recommended,
        notes=notes
    )
+ )
@@ -0,0 +1,7 @@
1
+ import json
2
+ import importlib.resources
3
+ from . import data
4
+
5
def load_signatures() -> dict:
    """Load and parse the ``signatures.json`` resource bundled in the ``data`` package.

    Returns:
        The parsed JSON document.
    """
    # files() replaces the legacy importlib.resources.read_text(), which is
    # deprecated on Python 3.11/3.12. The encoding is given explicitly so the
    # file is always decoded as UTF-8.
    resource = importlib.resources.files(data).joinpath("signatures.json")
    return json.loads(resource.read_text(encoding="utf-8"))
File without changes
@@ -0,0 +1,67 @@
1
+ import time
2
+ import httpx
3
+ from ..types import PlainResult, Verdict
4
+ from ..utils import is_challenge_body
5
+
6
# Request headers sent by the probe stages in this package. The User-Agent
# names the tool ("scraperecon/0.1") instead of copying a real browser string.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; scraperecon/0.1)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5"
}
11
+
12
def get_verdict(status: int, body_preview: str = "") -> Verdict:
    """Classify a response from its status code and body text.

    A body that matches a challenge indicator wins over the status code.

    Args:
        status: HTTP status code of the response.
        body_preview: Response text to scan for challenge indicators.

    Returns:
        CHALLENGED for a challenge body, OPEN for 2xx, REDIRECTED for
        301/302/307/308, BLOCKED for 403/429/503, UNCERTAIN otherwise.
    """
    if is_challenge_body(body_preview):
        return Verdict.CHALLENGED

    # The status groups below are disjoint, so each can return directly.
    if status in (301, 302, 307, 308):
        return Verdict.REDIRECTED
    if status in (403, 429, 503):
        return Verdict.BLOCKED
    if 200 <= status <= 299:
        return Verdict.OPEN
    return Verdict.UNCERTAIN
23
+
24
def run(url: str, timeout: int) -> PlainResult:
    """Fetch *url* once with a plain httpx client and classify the response.

    Redirects are followed, so the verdict reflects the final response.

    Args:
        url: Target URL.
        timeout: Timeout passed to the httpx client.

    Returns:
        A PlainResult. Any exception is reported as a result with verdict
        ERROR, ``status=None`` and the exception text in ``error``.
    """
    started = time.perf_counter()

    def elapsed_ms() -> int:
        return int((time.perf_counter() - started) * 1000)

    try:
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            resp = client.get(url, headers=DEFAULT_HEADERS)

        duration = elapsed_ms()

        lowered_headers = {name.lower(): value for name, value in resp.headers.items()}
        cookie_names = list(resp.cookies.keys())
        preview = resp.text[:2048]
        landed_on = str(resp.url)

        # Whether or not redirects were followed, the final status and the
        # full body decide the verdict.
        outcome = get_verdict(resp.status_code, resp.text)

        return PlainResult(
            verdict=outcome,
            status=resp.status_code,
            response_time_ms=duration,
            headers=lowered_headers,
            cookies=cookie_names,
            body_preview=preview,
            final_url=landed_on,
            full_body=resp.text
        )
    except Exception as e:
        return PlainResult(
            verdict=Verdict.ERROR,
            status=None,
            response_time_ms=elapsed_ms(),
            headers={},
            cookies=[],
            body_preview="",
            final_url=url,
            error=str(e)
        )
@@ -0,0 +1,146 @@
1
+ import asyncio
2
+ import time
3
+ from typing import Optional
4
+ from curl_cffi import requests
5
+ import httpx
6
+ from ..types import RateLimitResult, BlockType, PlainResult, TlsResult, Verdict
7
+ from .plain import DEFAULT_HEADERS
8
+
9
async def _worker(queue: asyncio.Queue, results: list, client, is_httpx: bool):
    """Drain *queue*, issuing one GET per queued URL and recording the outcome.

    Each request appends a dict with the keys ``status``, ``elapsed`` (ms),
    ``is_soft_block`` and ``retry_after`` to *results*. A request that raises
    is recorded with status 0.

    Args:
        queue: Queue of URLs to request; the worker stops once it is empty.
        results: Shared list that receives one record per request.
        client: Async HTTP client exposing ``get``.
        is_httpx: True when *client* is an httpx client. Otherwise the call
            passes ``allow_redirects=False`` so redirect statuses stay visible.
    """
    redirect_codes = (301, 302, 307, 308)
    extra_kwargs = {} if is_httpx else {"allow_redirects": False}

    while True:
        try:
            url = queue.get_nowait()
        except asyncio.QueueEmpty:
            return

        began = time.perf_counter()
        try:
            resp = await client.get(url, headers=DEFAULT_HEADERS, **extra_kwargs)
            code = resp.status_code
            retry_header = resp.headers.get("retry-after")
            took_ms = int((time.perf_counter() - began) * 1000)
            record = {
                "status": code,
                "elapsed": took_ms,
                # 429/503 and redirect statuses all count as soft blocks.
                "is_soft_block": code in (429, 503) or code in redirect_codes,
                "retry_after": retry_header
            }
        except Exception:
            # Best effort: a failed request is still recorded, with status 0.
            record = {
                "status": 0,
                "elapsed": int((time.perf_counter() - began) * 1000),
                "is_soft_block": False,
                "retry_after": None
            }

        results.append(record)
        queue.task_done()
49
+
50
def _median_ms(samples) -> int:
    """Return the upper-middle value of *samples* once sorted, or 0 if empty."""
    ordered = sorted(samples)
    return ordered[len(ordered) // 2] if ordered else 0


def _classify_block(results: list) -> tuple:
    """Return ``(block_type, retry_after_secs)`` for the probe *results*.

    Precedence: any 429 -> RATE_LIMITED, else any 403 -> HARD_BLOCK, else any
    redirect-style soft block -> SOFT_REDIRECT, else SILENT when latency more
    than triples between the first and second half of the results.
    """
    throttled = [r for r in results if r["status"] == 429]
    if throttled:
        # Use the first 429 that carries a numeric Retry-After, not only the
        # first 429 overall, so a later header is still picked up.
        # isdecimal() is used because isdigit() also accepts characters such
        # as superscripts that int() cannot parse.
        retry_after_secs = next(
            (
                int(r["retry_after"])
                for r in throttled
                if r["retry_after"] and r["retry_after"].isdecimal()
            ),
            None,
        )
        return BlockType.RATE_LIMITED, retry_after_secs

    if any(r["status"] == 403 for r in results):
        return BlockType.HARD_BLOCK, None

    if any(r["is_soft_block"] and r["status"] not in (429, 503) for r in results):
        return BlockType.SOFT_REDIRECT, None

    if len(results) >= 2:
        half = len(results) // 2
        early = _median_ms(r["elapsed"] for r in results[:half])
        late = _median_ms(r["elapsed"] for r in results[-half:])
        if early > 0 and late > early * 3:
            return BlockType.SILENT, None

    return None, None


async def run(
    url: str,
    probe_rate: bool,
    requests_count: int,
    concurrency: int,
    plain_result: PlainResult,
    tls_result: Optional[TlsResult],
    profile: str
) -> Optional[RateLimitResult]:
    """Send a burst of requests to *url* and summarise how the target reacts.

    Args:
        url: Target URL; it is requested ``requests_count`` times.
        probe_rate: When False the probe is skipped and None is returned.
        requests_count: Total number of requests to send.
        concurrency: Number of worker tasks sharing the request queue.
        plain_result: Result of the plain stage (not read by this function).
        tls_result: Result of the TLS stage. When it is None or SKIPPED the
            probe uses httpx; otherwise a curl_cffi session impersonating
            *profile* is used.
        profile: curl_cffi impersonation profile name.

    Returns:
        A RateLimitResult, or None when ``probe_rate`` is False.
        ``estimated_safe_rps`` is only filled in when a block type was found.
    """
    if not probe_rate:
        return None

    use_plain = tls_result is None or tls_result.verdict == Verdict.SKIPPED

    queue = asyncio.Queue()
    for _ in range(requests_count):
        queue.put_nowait(url)

    results = []

    if use_plain:
        # NOTE(review): certificate verification is disabled here, unlike the
        # plain stage's httpx.Client -- confirm this is intentional.
        session = httpx.AsyncClient(verify=False)
    else:
        session = requests.AsyncSession(impersonate=profile)

    async with session as client:
        tasks = []
        for _ in range(concurrency):
            tasks.append(asyncio.create_task(_worker(queue, results, client, use_plain)))
            # Start workers 100 ms apart instead of all at once.
            await asyncio.sleep(0.1)
        await asyncio.gather(*tasks)

    successful = sum(1 for r in results if 200 <= r["status"] < 300)
    # Status 0 marks a request that raised inside the worker.
    blocked = sum(1 for r in results if r["status"] >= 400 or r["status"] == 0)

    block_type, retry_after_secs = _classify_block(results)
    median_response_ms = _median_ms(r["elapsed"] for r in results)

    if results:
        avg_response_time_secs = (sum(r["elapsed"] for r in results) / len(results)) / 1000
    else:
        avg_response_time_secs = 1.0
    if avg_response_time_secs == 0:
        # Avoid dividing by zero when every request took under 1 ms.
        avg_response_time_secs = 0.001

    estimated_safe_rps = None
    if block_type is not None and requests_count > 0:
        # Scale the observed throughput by the share of requests that succeeded.
        estimated_safe_rps = (successful / requests_count) * (concurrency / avg_response_time_secs)

    return RateLimitResult(
        total_requests=requests_count,
        successful=successful,
        blocked=blocked,
        block_type=block_type,
        estimated_safe_rps=estimated_safe_rps,
        retry_after_secs=retry_after_secs,
        median_response_ms=median_response_ms
    )
@@ -0,0 +1,64 @@
1
+ import time
2
+ from curl_cffi import requests
3
+ from ..types import TlsResult, Verdict, PlainResult
4
+ from .plain import DEFAULT_HEADERS, get_verdict
5
+
6
def run(url: str, plain_result: PlainResult, profile: str, timeout: int) -> TlsResult:
    """Retry *url* with a curl_cffi impersonation profile and classify the response.

    The stage is skipped (verdict SKIPPED) when the plain stage was already OPEN.

    Args:
        url: Target URL.
        plain_result: Result of the plain stage, used for the skip decision
            and to work out whether the TLS fingerprint was the blocker.
        profile: curl_cffi impersonation profile name.
        timeout: Request timeout passed to curl_cffi.

    Returns:
        A TlsResult. Any exception is reported as a result with verdict
        ERROR and the exception text in ``error``.
    """
    if plain_result.verdict == Verdict.OPEN:
        # Nothing to bypass: report the stage as skipped.
        return TlsResult(
            verdict=Verdict.SKIPPED,
            status=None,
            response_time_ms=0,
            headers={},
            cookies=[],
            body_preview="",
            tls_was_blocker=False,
            profile_used=profile
        )

    started = time.perf_counter()

    def elapsed_ms() -> int:
        return int((time.perf_counter() - started) * 1000)

    try:
        resp = requests.get(
            url,
            headers=DEFAULT_HEADERS,
            impersonate=profile,
            timeout=timeout,
            allow_redirects=True
        )
        duration = elapsed_ms()

        lowered_headers = {name.lower(): value for name, value in resp.headers.items()}
        cookie_names = list(resp.cookies.keys())
        preview = resp.text[:2048]
        outcome = get_verdict(resp.status_code, preview)

        # The TLS fingerprint was the blocker if the plain request was
        # stopped but the impersonated one got through.
        plain_stopped = plain_result.verdict in (Verdict.BLOCKED, Verdict.CHALLENGED)
        fingerprint_blocked = plain_stopped and outcome == Verdict.OPEN

        return TlsResult(
            verdict=outcome,
            status=resp.status_code,
            response_time_ms=duration,
            headers=lowered_headers,
            cookies=cookie_names,
            body_preview=preview,
            tls_was_blocker=fingerprint_blocked,
            profile_used=profile,
            full_body=resp.text
        )
    except Exception as e:
        return TlsResult(
            verdict=Verdict.ERROR,
            status=None,
            response_time_ms=elapsed_ms(),
            headers={},
            cookies=[],
            body_preview="",
            tls_was_blocker=False,
            profile_used=profile,
            error=str(e)
        )
@@ -0,0 +1,101 @@
1
+ from ..types import VendorResult, Confidence, PlainResult, TlsResult, Verdict
2
+ from ..signatures import load_signatures
3
+
4
def _match_signal(sig, headers, cookies_lower, body_lower, status):
    """Return a description of *sig* if it matches the response, else None."""
    kind = sig["type"]

    if kind == "header_present":
        key = sig["key"].lower()
        if key in headers:
            return f"{key} header"
    elif kind == "header_value":
        key = sig["key"].lower()
        val = sig["value"].lower()
        if key in headers and val in headers[key].lower():
            return f"{val} in {key} header"
    elif kind == "cookie_name":
        val = sig["value"].lower()
        if val in cookies_lower:
            return f"{sig['value']} cookie"
    elif kind == "body_contains":
        val = sig["value"].lower()
        if val in body_lower:
            return f"{sig['value']} in body"
    elif kind == "status_code":
        if status == sig["value"]:
            return f"status {sig['value']}"

    return None


def run(plain_result: PlainResult, tls_result: TlsResult | None) -> VendorResult:
    """Score the response against each vendor's signatures and pick the best match.

    Args:
        plain_result: Result of the plain stage.
        tls_result: Result of the TLS stage, or None. It is preferred over the
            plain result unless its verdict is SKIPPED or ERROR.

    Returns:
        A VendorResult. Each vendor's score is its matched signal weight
        divided by its total weight. The best vendor is reported with HIGH
        (>= 0.8), MEDIUM (>= 0.5) or LOW (>= 0.3) confidence; below 0.3 no
        vendor is reported. ``all_scores`` is filled either way.
    """
    # Prefer Stage 2 result if it ran and was not skipped, else use Stage 1.
    if tls_result and tls_result.verdict not in (Verdict.SKIPPED, Verdict.ERROR):
        observed = tls_result
    else:
        observed = plain_result

    headers = observed.headers
    cookies = observed.cookies
    body = observed.body_preview
    status = observed.status

    catalogue = load_signatures()

    body_lower = body.lower()
    cookies_lower = [name.lower() for name in cookies]

    all_scores = []
    top_vendor = None
    top_score = 0.0
    top_signals = []

    for entry in catalogue.get("vendors", []):
        vendor_name = entry["name"]
        signals = entry.get("signals", [])

        total_weight = sum(s.get("weight", 0) for s in signals)
        if total_weight == 0:
            # A vendor without weighted signals cannot be scored.
            continue

        earned = 0.0
        hits = []
        for sig in signals:
            description = _match_signal(sig, headers, cookies_lower, body_lower, status)
            if description is not None:
                earned += sig.get("weight", 0)
                hits.append(description)

        normalized = earned / total_weight
        all_scores.append((vendor_name, normalized))

        # Strict comparison: the first vendor to reach a score keeps it on ties.
        if normalized > top_score:
            top_score = normalized
            top_vendor = vendor_name
            top_signals = hits

    confidence_ladder = (
        (0.8, Confidence.HIGH),
        (0.5, Confidence.MEDIUM),
        (0.3, Confidence.LOW),
    )
    for floor, level in confidence_ladder:
        if top_score >= floor:
            return VendorResult(
                vendor=top_vendor,
                confidence=level,
                matched_signals=top_signals,
                all_scores=all_scores
            )

    return VendorResult(
        vendor=None,
        confidence=None,
        matched_signals=[],
        all_scores=all_scores
    )
@@ -0,0 +1,82 @@
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum
3
+ from typing import Optional
4
+
5
class Verdict(Enum):
    """Outcome of a single probe stage."""

    OPEN = "Open"  # 2xx response
    BLOCKED = "Blocked"  # 403, 429 or 503 response
    CHALLENGED = "Challenged"  # body matched a challenge indicator
    REDIRECTED = "Redirected"  # 301, 302, 307 or 308 response
    UNCERTAIN = "Uncertain"  # any other status code
    SKIPPED = "Skipped"  # the stage was not run
    ERROR = "Error"  # the request raised an exception
13
+
14
class Confidence(Enum):
    """Confidence level attached to a vendor detection."""

    HIGH = "High"  # normalized score >= 0.8
    MEDIUM = "Medium"  # normalized score >= 0.5
    LOW = "Low"  # normalized score >= 0.3
18
+
19
class BlockType(Enum):
    """Kind of blocking behaviour found by the rate-limit probe."""

    HARD_BLOCK = "HardBlock"  # a 403 response was seen
    RATE_LIMITED = "RateLimited"  # a 429 response was seen
    SOFT_REDIRECT = "SoftRedirect"  # a redirect status was seen
    SILENT = "Silent"  # latency more than tripled in the second half of the probe
24
+
25
@dataclass
class PlainResult:
    """Result of the plain HTTP request stage."""

    verdict: Verdict
    status: Optional[int]  # None when the request raised
    response_time_ms: int
    headers: dict[str, str]  # header names are lower-cased
    cookies: list[str]  # cookie names only
    body_preview: str  # first 2048 characters of the response text
    final_url: str  # URL after redirects; the requested URL on error
    full_body: str = ""
    error: Optional[str] = None  # exception text when verdict is ERROR
36
+
37
@dataclass
class TlsResult:
    """Result of the TLS-impersonation request stage."""

    verdict: Verdict
    status: Optional[int]  # None when the stage was skipped or the request raised
    response_time_ms: int
    headers: dict[str, str]  # header names are lower-cased
    cookies: list[str]  # cookie names only
    body_preview: str  # first 2048 characters of the response text
    tls_was_blocker: bool  # plain stage was blocked/challenged but this stage was open
    profile_used: str  # impersonation profile passed to the stage
    full_body: str = ""
    error: Optional[str] = None  # exception text when verdict is ERROR
49
+
50
@dataclass
class VendorResult:
    """Result of matching a response against vendor signatures."""

    vendor: Optional[str]  # best-scoring vendor name, or None if no score reached 0.3
    confidence: Optional[Confidence]  # None when no vendor is reported
    matched_signals: list[str]  # descriptions of the signals matched for the vendor
    all_scores: list[tuple[str, float]]  # (vendor name, normalized score) per scored vendor
56
+
57
@dataclass
class RateLimitResult:
    """Summary of the rate-limit probe."""

    total_requests: int  # number of requests the probe was asked to send
    successful: int  # responses with a 2xx status
    blocked: int  # responses with status >= 400, plus requests that failed
    block_type: Optional[BlockType]  # None when no blocking was detected
    estimated_safe_rps: Optional[float]  # only set when a block type was detected
    retry_after_secs: Optional[int]  # numeric Retry-After value from a 429 response
    median_response_ms: int
66
+
67
@dataclass
class Recommendation:
    """Scraping advice derived from the stage results."""

    use_tls_impersonation: bool
    profile: Optional[str]  # impersonation profile to use, when one is known
    captcha_detected: bool  # "captcha" or "challenge" appeared in the inspected body
    proxy_recommended: bool  # the rate-limit probe detected a block type
    notes: list[str] = field(default_factory=list)  # free-form hints
74
+
75
@dataclass
class ReconReport:
    """Bundle of every stage result for one target, plus the recommendation."""

    target: str
    plain: Optional[PlainResult]
    tls: Optional[TlsResult]
    vendor: Optional[VendorResult]
    rate_limit: Optional[RateLimitResult]
    recommendation: Recommendation
@@ -0,0 +1,14 @@
1
+ import json
2
+ import importlib.resources
3
+ from . import data
4
+
5
def load_indicators() -> list[str]:
    """Load the bundled challenge-page indicator strings.

    Reads ``indicators.json`` from the package's ``data`` sub-package and
    returns the parsed JSON document.

    Returns:
        The decoded JSON content of ``indicators.json``.
    """
    # importlib.resources.read_text() is the legacy API and emits a
    # DeprecationWarning on Python 3.11/3.12; files() is its replacement.
    # The encoding is passed explicitly because the legacy helper defaulted
    # to UTF-8 and that must not silently become locale-dependent.
    resource = importlib.resources.files(data).joinpath("indicators.json")
    return json.loads(resource.read_text(encoding="utf-8"))
8
+
9
def is_challenge_body(body: str) -> bool:
    """Return True if *body* contains any known challenge indicator.

    Matching is case-insensitive. An empty or missing body never matches.

    Args:
        body: Response text to inspect.

    Returns:
        True when at least one indicator from ``load_indicators()`` occurs
        in the body, otherwise False.
    """
    if not body:
        return False
    body_lower = body.lower()
    # Lower-case the indicator as well as the body: otherwise an indicator
    # written with upper-case letters could never match.
    return any(indicator.lower() in body_lower for indicator in load_indicators())