sectalon 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sectalon-0.1.0/PKG-INFO +246 -0
- sectalon-0.1.0/README.md +233 -0
- sectalon-0.1.0/pyproject.toml +26 -0
- sectalon-0.1.0/sectalon.egg-info/PKG-INFO +246 -0
- sectalon-0.1.0/sectalon.egg-info/SOURCES.txt +9 -0
- sectalon-0.1.0/sectalon.egg-info/dependency_links.txt +1 -0
- sectalon-0.1.0/sectalon.egg-info/entry_points.txt +2 -0
- sectalon-0.1.0/sectalon.egg-info/requires.txt +2 -0
- sectalon-0.1.0/sectalon.egg-info/top_level.txt +1 -0
- sectalon-0.1.0/setup.cfg +4 -0
- sectalon-0.1.0/talon_v1.py +442 -0
sectalon-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sectalon
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Talon CLI: phishing URL analysis with Playwright, Ollama/OpenAI, and evidence capture.
|
|
5
|
+
Author: tejkdno1
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/tejkdno1/Talon
|
|
8
|
+
Project-URL: Repository, https://github.com/tejkdno1/Talon
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: playwright>=1.52.0
|
|
12
|
+
Requires-Dist: openai>=1.0.0
|
|
13
|
+
|
|
14
|
+
# ๐ฆ
Talon (Alpha)
|
|
15
|
+
|
|
16
|
+
> **The Autonomous AI Phishing Hunter**
|
|
17
|
+
> Detonate suspicious URLs, capture evidence, and get a fast phishing-risk verdict.
|
|
18
|
+
|
|
19
|
+
[](https://opensource.org/licenses/MIT)
|
|
20
|
+
[](https://www.python.org/downloads/)
|
|
21
|
+
[](https://playwright.dev/)
|
|
22
|
+
[](https://www.docker.com/)
|
|
23
|
+
|
|
24
|
+
Talon is a practical phishing URL analysis tool that:
|
|
25
|
+
- detonates suspicious links in headless Chromium,
|
|
26
|
+
- captures forensic evidence (screenshot + DOM snapshot),
|
|
27
|
+
- returns an LLM-assisted phishing risk verdict (with heuristic fallback).
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## โจ Features (V1)
|
|
32
|
+
|
|
33
|
+
- **๐ต๏ธ URL detonation:** opens a target URL safely in Playwright.
|
|
34
|
+
- **๐ Redirect awareness:** records the final resolved URL after redirects.
|
|
35
|
+
- **๐งพ Evidence capture:** stores full-page screenshot and DOM snapshot.
|
|
36
|
+
- **๐ค LLM analysis:** uses an LLM for smarter risk reasoning.
|
|
37
|
+
- **๐ Structured output:** writes a JSON report with score, level, reasons, and method.
|
|
38
|
+
- **๐งฑ Docker sandbox mode:** runs analysis in a hardened container profile.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## โ๏ธ How It Works
|
|
43
|
+
|
|
44
|
+
1. **Ingest**: receive a URL input.
|
|
45
|
+
2. **Detonate**: load it in headless Chromium.
|
|
46
|
+
3. **Collect**: save final URL, HTTP status, title, screenshot, and DOM.
|
|
47
|
+
4. **Verdict**: run LLM analysis (`ollama` / `openai`) with heuristic fallback.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## ๐ Project Structure
|
|
52
|
+
|
|
53
|
+
```text
|
|
54
|
+
.
|
|
55
|
+
โโโ talon_v1.py
|
|
56
|
+
โโโ requirements.txt
|
|
57
|
+
โโโ Dockerfile
|
|
58
|
+
โโโ docker-compose.sandbox.yml
|
|
59
|
+
โโโ .gitignore
|
|
60
|
+
โโโ evidence/ # generated at runtime (ignored by git)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## ๐ Local Setup
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
git clone https://github.com/tejkdno1/Talon.git
|
|
69
|
+
cd Talon
|
|
70
|
+
python3 -m pip install -r requirements.txt
|
|
71
|
+
python3 -m playwright install chromium
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## ๐ฅ Install As CLI Package (`talon`)
|
|
77
|
+
|
|
78
|
+
You can install Talon on another machine as a CLI package and run it via `talon`.
|
|
79
|
+
|
|
80
|
+
From GitHub:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
python3 -m pip install "git+https://github.com/tejkdno1/Talon.git"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Or with `pipx` (recommended for CLI tools):
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
pipx install "git+https://github.com/tejkdno1/Talon.git"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Then run:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
talon "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Notes:
|
|
99
|
+
- Package name is `sectalon`, command name is `talon`.
|
|
100
|
+
- First run auto-downloads Chromium if missing.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## ๐ฆ One-File Executable (Open-Source Friendly)
|
|
105
|
+
|
|
106
|
+
Build a single binary (no Python required on target machine):
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
./build_onefile.sh
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Output:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
dist/talon
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Run it directly:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
./dist/talon "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Notes:
|
|
125
|
+
- Keep Ollama running locally when using `--llm-provider ollama`.
|
|
126
|
+
- For OpenAI provider, set `OPENAI_API_KEY` as usual.
|
|
127
|
+
- On first run, Chromium auto-installs to `~/.cache/ms-playwright`.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## โถ๏ธ Quick Start (Host Run)
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
python3 talon_v1.py "https://example.com"
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Local Ollama (Gemma4) - recommended for your setup
|
|
138
|
+
|
|
139
|
+
Start Ollama and pull model:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
ollama pull gemma4
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Run Talon using Ollama backend:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
export TALON_LLM_PROVIDER="ollama"
|
|
149
|
+
export TALON_LLM_MODEL="gemma4"
|
|
150
|
+
export OLLAMA_HOST="http://localhost:11434"
|
|
151
|
+
export OLLAMA_TIMEOUT_SEC="180"
|
|
152
|
+
python3 talon_v1.py "https://example.com"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### OpenAI (optional)
|
|
156
|
+
|
|
157
|
+
If you want cloud LLM instead:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
export OPENAI_API_KEY="your_api_key_here"
|
|
161
|
+
export TALON_LLM_PROVIDER="openai"
|
|
162
|
+
python3 talon_v1.py "https://example.com"
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Optional:
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
python3 talon_v1.py "example.com/login" --output-dir evidence --timeout-ms 20000
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Force heuristic-only mode:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
python3 talon_v1.py "https://example.com" --no-llm
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Optional model override:
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
export TALON_LLM_MODEL="gemma4"
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Optional provider override per run:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
python3 talon_v1.py "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## ๐ก๏ธ Docker Sandbox Run (Recommended)
|
|
192
|
+
|
|
193
|
+
Build once:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
docker compose -f docker-compose.sandbox.yml build
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Run analysis:
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
TARGET_URL="https://leadscruise.com" docker compose -f docker-compose.sandbox.yml run --rm talon
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
For Docker + host Ollama, default `OLLAMA_HOST` is set to:
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
http://host.docker.internal:11434
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
You can override it if needed:
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
OLLAMA_HOST="http://host.docker.internal:11434" TARGET_URL="https://example.com" docker compose -f docker-compose.sandbox.yml run --rm talon
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
The sandbox profile includes:
|
|
218
|
+
- read-only root filesystem,
|
|
219
|
+
- all Linux capabilities dropped,
|
|
220
|
+
- `no-new-privileges`,
|
|
221
|
+
- CPU/memory/PID limits,
|
|
222
|
+
- output only through mounted `./evidence`.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## ๐ฆ Output Artifacts
|
|
227
|
+
|
|
228
|
+
Each scan now creates a dedicated run folder:
|
|
229
|
+
|
|
230
|
+
- `evidence/run_<timestamp>/report.json`
|
|
231
|
+
- `evidence/run_<timestamp>/screenshot.png`
|
|
232
|
+
- `evidence/run_<timestamp>/dom.html`
|
|
233
|
+
|
|
234
|
+
Each `report.json` includes `analysis_method`:
|
|
235
|
+
- `llm-ollama`
|
|
236
|
+
- `llm-openai`
|
|
237
|
+
- `heuristic`
|
|
238
|
+
|
|
239
|
+
Run logs are also appended to:
|
|
240
|
+
- `logs/runs.jsonl` (one JSON entry per scan)
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## ๐ Security Note
|
|
245
|
+
|
|
246
|
+
Docker sandboxing significantly reduces risk compared to running directly on the host, but no sandbox is perfect. For high-risk investigations, use a dedicated VM and isolated network segment.
|
sectalon-0.1.0/README.md
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# 🦅 Talon (Alpha)
|
|
2
|
+
|
|
3
|
+
> **The Autonomous AI Phishing Hunter**
|
|
4
|
+
> Detonate suspicious URLs, capture evidence, and get a fast phishing-risk verdict.
|
|
5
|
+
|
|
6
|
+
[](https://opensource.org/licenses/MIT)
|
|
7
|
+
[](https://www.python.org/downloads/)
|
|
8
|
+
[](https://playwright.dev/)
|
|
9
|
+
[](https://www.docker.com/)
|
|
10
|
+
|
|
11
|
+
Talon is a practical phishing URL analysis tool that:
|
|
12
|
+
- detonates suspicious links in headless Chromium,
|
|
13
|
+
- captures forensic evidence (screenshot + DOM snapshot),
|
|
14
|
+
- returns an LLM-assisted phishing risk verdict (with heuristic fallback).
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## โจ Features (V1)
|
|
19
|
+
|
|
20
|
+
- **๐ต๏ธ URL detonation:** opens a target URL safely in Playwright.
|
|
21
|
+
- **๐ Redirect awareness:** records the final resolved URL after redirects.
|
|
22
|
+
- **๐งพ Evidence capture:** stores full-page screenshot and DOM snapshot.
|
|
23
|
+
- **๐ค LLM analysis:** uses an LLM for smarter risk reasoning.
|
|
24
|
+
- **๐ Structured output:** writes a JSON report with score, level, reasons, and method.
|
|
25
|
+
- **๐งฑ Docker sandbox mode:** runs analysis in a hardened container profile.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## โ๏ธ How It Works
|
|
30
|
+
|
|
31
|
+
1. **Ingest**: receive a URL input.
|
|
32
|
+
2. **Detonate**: load it in headless Chromium.
|
|
33
|
+
3. **Collect**: save final URL, HTTP status, title, screenshot, and DOM.
|
|
34
|
+
4. **Verdict**: run LLM analysis (`ollama` / `openai`) with heuristic fallback.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## ๐ Project Structure
|
|
39
|
+
|
|
40
|
+
```text
|
|
41
|
+
.
|
|
42
|
+
โโโ talon_v1.py
|
|
43
|
+
โโโ requirements.txt
|
|
44
|
+
โโโ Dockerfile
|
|
45
|
+
โโโ docker-compose.sandbox.yml
|
|
46
|
+
โโโ .gitignore
|
|
47
|
+
โโโ evidence/ # generated at runtime (ignored by git)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## ๐ Local Setup
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
git clone https://github.com/tejkdno1/Talon.git
|
|
56
|
+
cd Talon
|
|
57
|
+
python3 -m pip install -r requirements.txt
|
|
58
|
+
python3 -m playwright install chromium
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## ๐ฅ Install As CLI Package (`talon`)
|
|
64
|
+
|
|
65
|
+
You can install Talon on another machine as a CLI package and run it via `talon`.
|
|
66
|
+
|
|
67
|
+
From GitHub:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
python3 -m pip install "git+https://github.com/tejkdno1/Talon.git"
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or with `pipx` (recommended for CLI tools):
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pipx install "git+https://github.com/tejkdno1/Talon.git"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Then run:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
talon "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Notes:
|
|
86
|
+
- Package name is `sectalon`, command name is `talon`.
|
|
87
|
+
- First run auto-downloads Chromium if missing.
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## ๐ฆ One-File Executable (Open-Source Friendly)
|
|
92
|
+
|
|
93
|
+
Build a single binary (no Python required on target machine):
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
./build_onefile.sh
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Output:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
dist/talon
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Run it directly:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
./dist/talon "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Notes:
|
|
112
|
+
- Keep Ollama running locally when using `--llm-provider ollama`.
|
|
113
|
+
- For OpenAI provider, set `OPENAI_API_KEY` as usual.
|
|
114
|
+
- On first run, Chromium auto-installs to `~/.cache/ms-playwright`.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## โถ๏ธ Quick Start (Host Run)
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
python3 talon_v1.py "https://example.com"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Local Ollama (Gemma4) - recommended for your setup
|
|
125
|
+
|
|
126
|
+
Start Ollama and pull model:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
ollama pull gemma4
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Run Talon using Ollama backend:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
export TALON_LLM_PROVIDER="ollama"
|
|
136
|
+
export TALON_LLM_MODEL="gemma4"
|
|
137
|
+
export OLLAMA_HOST="http://localhost:11434"
|
|
138
|
+
export OLLAMA_TIMEOUT_SEC="180"
|
|
139
|
+
python3 talon_v1.py "https://example.com"
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### OpenAI (optional)
|
|
143
|
+
|
|
144
|
+
If you want cloud LLM instead:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
export OPENAI_API_KEY="your_api_key_here"
|
|
148
|
+
export TALON_LLM_PROVIDER="openai"
|
|
149
|
+
python3 talon_v1.py "https://example.com"
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Optional:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
python3 talon_v1.py "example.com/login" --output-dir evidence --timeout-ms 20000
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Force heuristic-only mode:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
python3 talon_v1.py "https://example.com" --no-llm
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Optional model override:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
export TALON_LLM_MODEL="gemma4"
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Optional provider override per run:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
python3 talon_v1.py "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## ๐ก๏ธ Docker Sandbox Run (Recommended)
|
|
179
|
+
|
|
180
|
+
Build once:
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
docker compose -f docker-compose.sandbox.yml build
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Run analysis:
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
TARGET_URL="https://leadscruise.com" docker compose -f docker-compose.sandbox.yml run --rm talon
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
For Docker + host Ollama, default `OLLAMA_HOST` is set to:
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
http://host.docker.internal:11434
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
You can override it if needed:
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
OLLAMA_HOST="http://host.docker.internal:11434" TARGET_URL="https://example.com" docker compose -f docker-compose.sandbox.yml run --rm talon
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
The sandbox profile includes:
|
|
205
|
+
- read-only root filesystem,
|
|
206
|
+
- all Linux capabilities dropped,
|
|
207
|
+
- `no-new-privileges`,
|
|
208
|
+
- CPU/memory/PID limits,
|
|
209
|
+
- output only through mounted `./evidence`.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## ๐ฆ Output Artifacts
|
|
214
|
+
|
|
215
|
+
Each scan now creates a dedicated run folder:
|
|
216
|
+
|
|
217
|
+
- `evidence/run_<timestamp>/report.json`
|
|
218
|
+
- `evidence/run_<timestamp>/screenshot.png`
|
|
219
|
+
- `evidence/run_<timestamp>/dom.html`
|
|
220
|
+
|
|
221
|
+
Each `report.json` includes `analysis_method`:
|
|
222
|
+
- `llm-ollama`
|
|
223
|
+
- `llm-openai`
|
|
224
|
+
- `heuristic`
|
|
225
|
+
|
|
226
|
+
Run logs are also appended to:
|
|
227
|
+
- `logs/runs.jsonl` (one JSON entry per scan)
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## ๐ Security Note
|
|
232
|
+
|
|
233
|
+
Docker sandboxing significantly reduces risk compared to running directly on the host, but no sandbox is perfect. For high-risk investigations, use a dedicated VM and isolated network segment.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sectalon"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Talon CLI: phishing URL analysis with Playwright, Ollama/OpenAI, and evidence capture."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "tejkdno1" }]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"playwright>=1.52.0",
|
|
15
|
+
"openai>=1.0.0",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Homepage = "https://github.com/tejkdno1/Talon"
|
|
20
|
+
Repository = "https://github.com/tejkdno1/Talon"
|
|
21
|
+
|
|
22
|
+
[project.scripts]
|
|
23
|
+
talon = "talon_v1:main"
|
|
24
|
+
|
|
25
|
+
[tool.setuptools]
|
|
26
|
+
py-modules = ["talon_v1"]
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sectalon
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Talon CLI: phishing URL analysis with Playwright, Ollama/OpenAI, and evidence capture.
|
|
5
|
+
Author: tejkdno1
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/tejkdno1/Talon
|
|
8
|
+
Project-URL: Repository, https://github.com/tejkdno1/Talon
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: playwright>=1.52.0
|
|
12
|
+
Requires-Dist: openai>=1.0.0
|
|
13
|
+
|
|
14
|
+
# ๐ฆ
Talon (Alpha)
|
|
15
|
+
|
|
16
|
+
> **The Autonomous AI Phishing Hunter**
|
|
17
|
+
> Detonate suspicious URLs, capture evidence, and get a fast phishing-risk verdict.
|
|
18
|
+
|
|
19
|
+
[](https://opensource.org/licenses/MIT)
|
|
20
|
+
[](https://www.python.org/downloads/)
|
|
21
|
+
[](https://playwright.dev/)
|
|
22
|
+
[](https://www.docker.com/)
|
|
23
|
+
|
|
24
|
+
Talon is a practical phishing URL analysis tool that:
|
|
25
|
+
- detonates suspicious links in headless Chromium,
|
|
26
|
+
- captures forensic evidence (screenshot + DOM snapshot),
|
|
27
|
+
- returns an LLM-assisted phishing risk verdict (with heuristic fallback).
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## โจ Features (V1)
|
|
32
|
+
|
|
33
|
+
- **๐ต๏ธ URL detonation:** opens a target URL safely in Playwright.
|
|
34
|
+
- **๐ Redirect awareness:** records the final resolved URL after redirects.
|
|
35
|
+
- **๐งพ Evidence capture:** stores full-page screenshot and DOM snapshot.
|
|
36
|
+
- **๐ค LLM analysis:** uses an LLM for smarter risk reasoning.
|
|
37
|
+
- **๐ Structured output:** writes a JSON report with score, level, reasons, and method.
|
|
38
|
+
- **๐งฑ Docker sandbox mode:** runs analysis in a hardened container profile.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## โ๏ธ How It Works
|
|
43
|
+
|
|
44
|
+
1. **Ingest**: receive a URL input.
|
|
45
|
+
2. **Detonate**: load it in headless Chromium.
|
|
46
|
+
3. **Collect**: save final URL, HTTP status, title, screenshot, and DOM.
|
|
47
|
+
4. **Verdict**: run LLM analysis (`ollama` / `openai`) with heuristic fallback.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## ๐ Project Structure
|
|
52
|
+
|
|
53
|
+
```text
|
|
54
|
+
.
|
|
55
|
+
โโโ talon_v1.py
|
|
56
|
+
โโโ requirements.txt
|
|
57
|
+
โโโ Dockerfile
|
|
58
|
+
โโโ docker-compose.sandbox.yml
|
|
59
|
+
โโโ .gitignore
|
|
60
|
+
โโโ evidence/ # generated at runtime (ignored by git)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## ๐ Local Setup
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
git clone https://github.com/tejkdno1/Talon.git
|
|
69
|
+
cd Talon
|
|
70
|
+
python3 -m pip install -r requirements.txt
|
|
71
|
+
python3 -m playwright install chromium
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## ๐ฅ Install As CLI Package (`talon`)
|
|
77
|
+
|
|
78
|
+
You can install Talon on another machine as a CLI package and run it via `talon`.
|
|
79
|
+
|
|
80
|
+
From GitHub:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
python3 -m pip install "git+https://github.com/tejkdno1/Talon.git"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Or with `pipx` (recommended for CLI tools):
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
pipx install "git+https://github.com/tejkdno1/Talon.git"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Then run:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
talon "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Notes:
|
|
99
|
+
- Package name is `sectalon`, command name is `talon`.
|
|
100
|
+
- First run auto-downloads Chromium if missing.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## ๐ฆ One-File Executable (Open-Source Friendly)
|
|
105
|
+
|
|
106
|
+
Build a single binary (no Python required on target machine):
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
./build_onefile.sh
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Output:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
dist/talon
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Run it directly:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
./dist/talon "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Notes:
|
|
125
|
+
- Keep Ollama running locally when using `--llm-provider ollama`.
|
|
126
|
+
- For OpenAI provider, set `OPENAI_API_KEY` as usual.
|
|
127
|
+
- On first run, Chromium auto-installs to `~/.cache/ms-playwright`.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## โถ๏ธ Quick Start (Host Run)
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
python3 talon_v1.py "https://example.com"
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Local Ollama (Gemma4) - recommended for your setup
|
|
138
|
+
|
|
139
|
+
Start Ollama and pull model:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
ollama pull gemma4
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Run Talon using Ollama backend:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
export TALON_LLM_PROVIDER="ollama"
|
|
149
|
+
export TALON_LLM_MODEL="gemma4"
|
|
150
|
+
export OLLAMA_HOST="http://localhost:11434"
|
|
151
|
+
export OLLAMA_TIMEOUT_SEC="180"
|
|
152
|
+
python3 talon_v1.py "https://example.com"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### OpenAI (optional)
|
|
156
|
+
|
|
157
|
+
If you want cloud LLM instead:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
export OPENAI_API_KEY="your_api_key_here"
|
|
161
|
+
export TALON_LLM_PROVIDER="openai"
|
|
162
|
+
python3 talon_v1.py "https://example.com"
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Optional:
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
python3 talon_v1.py "example.com/login" --output-dir evidence --timeout-ms 20000
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Force heuristic-only mode:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
python3 talon_v1.py "https://example.com" --no-llm
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Optional model override:
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
export TALON_LLM_MODEL="gemma4"
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Optional provider override per run:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
python3 talon_v1.py "https://example.com" --llm-provider ollama --llm-model gemma4
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## ๐ก๏ธ Docker Sandbox Run (Recommended)
|
|
192
|
+
|
|
193
|
+
Build once:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
docker compose -f docker-compose.sandbox.yml build
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Run analysis:
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
TARGET_URL="https://leadscruise.com" docker compose -f docker-compose.sandbox.yml run --rm talon
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
For Docker + host Ollama, default `OLLAMA_HOST` is set to:
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
http://host.docker.internal:11434
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
You can override it if needed:
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
OLLAMA_HOST="http://host.docker.internal:11434" TARGET_URL="https://example.com" docker compose -f docker-compose.sandbox.yml run --rm talon
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
The sandbox profile includes:
|
|
218
|
+
- read-only root filesystem,
|
|
219
|
+
- all Linux capabilities dropped,
|
|
220
|
+
- `no-new-privileges`,
|
|
221
|
+
- CPU/memory/PID limits,
|
|
222
|
+
- output only through mounted `./evidence`.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## ๐ฆ Output Artifacts
|
|
227
|
+
|
|
228
|
+
Each scan now creates a dedicated run folder:
|
|
229
|
+
|
|
230
|
+
- `evidence/run_<timestamp>/report.json`
|
|
231
|
+
- `evidence/run_<timestamp>/screenshot.png`
|
|
232
|
+
- `evidence/run_<timestamp>/dom.html`
|
|
233
|
+
|
|
234
|
+
Each `report.json` includes `analysis_method`:
|
|
235
|
+
- `llm-ollama`
|
|
236
|
+
- `llm-openai`
|
|
237
|
+
- `heuristic`
|
|
238
|
+
|
|
239
|
+
Run logs are also appended to:
|
|
240
|
+
- `logs/runs.jsonl` (one JSON entry per scan)
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## ๐ Security Note
|
|
245
|
+
|
|
246
|
+
Docker sandboxing significantly reduces risk compared to running directly on the host, but no sandbox is perfect. For high-risk investigations, use a dedicated VM and isolated network segment.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
talon_v1
|
sectalon-0.1.0/talon_v1.py
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Talon V1: URL detonation + evidence capture + heuristic/LLM verdict."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
import ipaddress
import json
import os
import re
import sys
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse
from urllib.request import Request, urlopen
|
|
17
|
+
|
|
18
|
+
# For one-file executable builds, ensure browsers are stored in a persistent path.
|
|
19
|
+
if "PLAYWRIGHT_BROWSERS_PATH" not in os.environ:
|
|
20
|
+
os.environ["PLAYWRIGHT_BROWSERS_PATH"] = str(Path.home() / ".cache" / "ms-playwright")
|
|
21
|
+
|
|
22
|
+
from playwright.sync_api import sync_playwright
|
|
23
|
+
from playwright._impl._errors import Error as PlaywrightError
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
from openai import OpenAI
|
|
27
|
+
except ImportError:
|
|
28
|
+
OpenAI = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Top-level domains that frequently appear in abuse/phishing feeds; a match
# adds a fixed amount to the heuristic risk score in build_verdict().
# NOTE(review): static list — presumably curated by hand; consider refreshing
# from a threat-intel source.
SUSPICIOUS_TLDS = {
    "ru",
    "tk",
    "top",
    "xyz",
    "click",
    "work",
    "gq",
    "fit",
    "cf",
    "ml",
    "ga",
}
|
|
44
|
+
|
|
45
|
+
# Substrings commonly seen in credential-phishing URLs; build_verdict() checks
# them against both the host and the path+query of the final URL. Each hit
# adds 8 points, capped at 30.
PHISH_KEYWORDS = {
    "login",
    "signin",
    "verify",
    "secure",
    "account",
    "update",
    "password",
    "banking",
    "microsoft",
    "outlook",
    "paypal",
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class Verdict:
    """Result of a phishing-risk assessment for a single URL visit."""

    # Aggregate risk, clamped to 0-100 by the producers of this type.
    risk_score: int
    # One of "LOW", "MEDIUM", "HIGH".
    risk_level: str
    # Short human-readable explanations backing the score.
    reasons: list[str]
    # How the verdict was produced: "heuristic", "llm", "llm-openai", ...
    method: str
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def normalize_url(raw_url: str) -> str:
    """Trim whitespace and guarantee an explicit scheme on the URL.

    A bare host such as ``example.com`` is promoted to ``https://example.com``;
    URLs that already start with ``http://`` or ``https://`` pass through
    unchanged.
    """
    candidate = raw_url.strip()
    if candidate.startswith("http://") or candidate.startswith("https://"):
        return candidate
    return f"https://{candidate}"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def is_ip_host(host: str) -> bool:
    """Return True if *host* is a literal IP address (IPv4 or IPv6).

    The previous regex accepted any four 1-3 digit groups, so invalid hosts
    like ``999.999.999.999`` were flagged as raw IPs, and IPv6 literals were
    never detected. ``ipaddress.ip_address`` validates octet ranges and also
    recognizes IPv6 (``urlparse().hostname`` already strips the brackets).
    """
    try:
        ipaddress.ip_address(host)
    except ValueError:
        return False
    return True
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def build_verdict(final_url: str, page_title: str) -> Verdict:
    """Score a visited URL with the V1 static heuristics.

    Adds fixed point values for punycode hosts, raw-IP hosts, '@' in the URL,
    abused TLDs, phishing keywords, and brand-in-title / brand-not-in-host
    mismatches; the total is clamped to 0-100 and mapped to LOW / MEDIUM /
    HIGH. Always returns a Verdict with method="heuristic".
    """
    parts = urlparse(final_url)
    hostname = (parts.hostname or "").lower()
    searchable = f"{parts.path} {parts.query}".lower()

    findings: list[str] = []
    total = 0

    # Punycode can impersonate brands via lookalike glyphs.
    if "xn--" in hostname:
        total += 30
        findings.append("Host contains punycode (possible homograph attack).")

    if is_ip_host(hostname):
        total += 25
        findings.append("URL host is a raw IP address.")

    # user@host URLs can hide the true destination after the '@'.
    if "@" in final_url:
        total += 20
        findings.append("URL contains '@' which can obscure real destination.")

    suffix = hostname.rsplit(".", 1)[-1] if "." in hostname else ""
    if suffix in SUSPICIOUS_TLDS:
        total += 15
        findings.append(f"Top-level domain '.{suffix}' is commonly abused.")

    # Keyword hits count 8 points each, capped at 30.
    hits = [kw for kw in PHISH_KEYWORDS if kw in searchable or kw in hostname]
    if hits:
        total += min(30, 8 * len(hits))
        findings.append(f"Phishing-like keywords found: {', '.join(sorted(hits))}.")

    # Brand name in the title but not in the host suggests impersonation.
    brands = ("microsoft", "google", "paypal", "bank")
    lowered_title = page_title.lower()
    if any(b in lowered_title for b in brands) and not any(b in hostname for b in brands):
        total += 20
        findings.append("Title suggests trusted brand, but host does not match.")

    total = max(0, min(100, total))
    if total >= 70:
        tier = "HIGH"
    elif total >= 40:
        tier = "MEDIUM"
    else:
        tier = "LOW"

    if not findings:
        findings.append("No obvious phishing indicators found by V1 heuristics.")

    return Verdict(risk_score=total, risk_level=tier, reasons=findings, method="heuristic")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def strip_html_for_prompt(html: str, max_chars: int = 3000) -> str:
    """Reduce raw HTML to a compact plain-text excerpt for the LLM prompt.

    Drops <script> and <style> blocks, strips all remaining tags, collapses
    runs of whitespace to single spaces, and truncates to *max_chars*.
    """
    without_scripts = re.sub(r"<script[\s\S]*?</script>", " ", html, flags=re.IGNORECASE)
    without_styles = re.sub(r"<style[\s\S]*?</style>", " ", without_scripts, flags=re.IGNORECASE)
    tagless = re.sub(r"<[^>]+>", " ", without_styles)
    collapsed = re.sub(r"\s+", " ", tagless).strip()
    return collapsed[:max_chars]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def parse_llm_verdict(content: str) -> Verdict | None:
    """Parse an LLM reply into a Verdict.

    Tolerates replies wrapped in markdown fences or surrounded by chatter:
    first tries a fenced ```json block, then the outermost {...} span.
    Returns None only when no JSON object can be decoded at all; malformed
    fields (non-integer score, nonstandard level, non-list reasons) are
    coerced rather than rejected. The returned method is the generic "llm" —
    callers overwrite it with the concrete provider tag.
    """
    text = content.strip()
    # Prefer a fenced ```json ... ``` block when the model wrapped its answer.
    if "```" in text:
        match = re.search(r"```(?:json)?\s*(\{[\s\S]*\})\s*```", text, re.IGNORECASE)
        if match:
            text = match.group(1).strip()
    # Otherwise grab the outermost {...} span from any surrounding prose.
    if not text.startswith("{"):
        match = re.search(r"(\{[\s\S]*\})", text)
        if match:
            text = match.group(1).strip()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None

    # Defensive coercion: models sometimes emit the score as a string/float.
    try:
        score = int(data.get("risk_score", 0))
    except (TypeError, ValueError):
        score = 0
    level = str(data.get("risk_level", "LOW")).upper()
    reasons = data.get("reasons", [])
    if not isinstance(reasons, list):
        reasons = [str(reasons)]

    # Re-derive the level from the score when the model used a label outside
    # the expected LOW/MEDIUM/HIGH set (same thresholds as build_verdict).
    if level not in {"LOW", "MEDIUM", "HIGH"}:
        if score >= 70:
            level = "HIGH"
        elif score >= 40:
            level = "MEDIUM"
        else:
            level = "LOW"

    return Verdict(
        risk_score=max(0, min(100, score)),  # clamp to the 0-100 contract
        risk_level=level,
        reasons=[str(r) for r in reasons[:6]] or ["No reasons returned by LLM."],
        method="llm",
    )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def build_prompt(
    input_url: str,
    final_url: str,
    page_title: str,
    http_status: int | None,
    dom_html: str,
) -> str:
    """Assemble the analyst prompt sent to the LLM backends.

    The prompt carries the navigation facts plus a stripped text excerpt of
    the page, and instructs the model to answer with strict JSON only.
    """
    excerpt = strip_html_for_prompt(dom_html)
    instructions = (
        "You are a phishing detection analyst. "
        "Classify risk for this URL visit and return strict JSON only.\n\n"
    )
    context = (
        f"Input URL: {input_url}\n"
        f"Final URL: {final_url}\n"
        f"HTTP status: {http_status}\n"
        f"Page title: {page_title}\n"
        f"Page text excerpt: {excerpt}\n\n"
    )
    schema = (
        "Return JSON with keys:\n"
        "- risk_score (0-100 integer)\n"
        "- risk_level ('LOW'|'MEDIUM'|'HIGH')\n"
        "- reasons (array of short strings)\n"
    )
    return instructions + context + schema
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def build_openai_verdict(prompt: str, model: str) -> Verdict | None:
|
|
201
|
+
api_key = os.getenv("OPENAI_API_KEY", "").strip()
|
|
202
|
+
if not api_key or OpenAI is None:
|
|
203
|
+
return None
|
|
204
|
+
|
|
205
|
+
client = OpenAI(api_key=api_key)
|
|
206
|
+
try:
|
|
207
|
+
response = client.responses.create(
|
|
208
|
+
model=model,
|
|
209
|
+
input=prompt,
|
|
210
|
+
max_output_tokens=400,
|
|
211
|
+
)
|
|
212
|
+
raw = response.output_text.strip()
|
|
213
|
+
verdict = parse_llm_verdict(raw)
|
|
214
|
+
if verdict:
|
|
215
|
+
verdict.method = "llm-openai"
|
|
216
|
+
return verdict
|
|
217
|
+
except Exception:
|
|
218
|
+
return None
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def build_ollama_verdict(prompt: str, model: str) -> Verdict | None:
|
|
222
|
+
host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/")
|
|
223
|
+
timeout_sec = int(os.getenv("OLLAMA_TIMEOUT_SEC", "180"))
|
|
224
|
+
endpoint = f"{host}/api/generate"
|
|
225
|
+
payload = {
|
|
226
|
+
"model": model,
|
|
227
|
+
"prompt": prompt,
|
|
228
|
+
"format": "json",
|
|
229
|
+
"stream": False,
|
|
230
|
+
}
|
|
231
|
+
request = Request(
|
|
232
|
+
endpoint,
|
|
233
|
+
data=json.dumps(payload).encode("utf-8"),
|
|
234
|
+
headers={"Content-Type": "application/json"},
|
|
235
|
+
method="POST",
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
try:
|
|
239
|
+
with urlopen(request, timeout=timeout_sec) as resp:
|
|
240
|
+
body = resp.read().decode("utf-8")
|
|
241
|
+
data = json.loads(body)
|
|
242
|
+
raw = str(data.get("response", "")).strip()
|
|
243
|
+
verdict = parse_llm_verdict(raw)
|
|
244
|
+
if verdict:
|
|
245
|
+
verdict.method = "llm-ollama"
|
|
246
|
+
return verdict
|
|
247
|
+
except (URLError, HTTPError, TimeoutError, json.JSONDecodeError):
|
|
248
|
+
return None
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def build_llm_verdict(
    input_url: str,
    final_url: str,
    page_title: str,
    http_status: int | None,
    dom_html: str,
    llm_provider: str = "auto",
    llm_model: str | None = None,
) -> Verdict | None:
    """Route the verdict request to the configured LLM backend.

    Provider "openai" or "ollama" targets that backend directly; "auto"
    tries a local Ollama first and falls back to OpenAI. Returns None when
    no backend produced a usable verdict, letting the caller fall back to
    heuristics.
    """
    backend = llm_provider.lower()
    local_model = llm_model or os.getenv("TALON_LLM_MODEL", "gemma4")
    hosted_model = os.getenv("TALON_OPENAI_MODEL", "gpt-4o-mini")
    prompt = build_prompt(input_url, final_url, page_title, http_status, dom_html)

    if backend == "openai":
        return build_openai_verdict(prompt, llm_model or hosted_model)
    if backend == "ollama":
        return build_ollama_verdict(prompt, local_model)

    # auto: prefer local ollama first, then openai.
    return build_ollama_verdict(prompt, local_model) or build_openai_verdict(prompt, hosted_model)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def analyze_url(
    url: str,
    output_dir: Path,
    timeout_ms: int = 15000,
    use_llm: bool = True,
    llm_provider: str = "auto",
    llm_model: str | None = None,
) -> dict:
    """Detonate *url* in headless Chromium, capture evidence, and score it.

    Creates a timestamped run directory under *output_dir* holding a
    full-page screenshot, a DOM snapshot, and a JSON report. The verdict
    comes from an LLM backend when *use_llm* is True and a backend answers;
    otherwise the heuristic build_verdict() is used. A one-line summary is
    appended to logs/runs.jsonl.

    Returns the report dict, augmented with "report_path" and "run_dir".
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Run summaries always go to ./logs (cwd-relative), independent of output_dir.
    logs_dir = Path("logs")
    logs_dir.mkdir(parents=True, exist_ok=True)

    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    run_dir = output_dir / f"run_{ts}"
    run_dir.mkdir(parents=True, exist_ok=True)
    screenshot_path = run_dir / "screenshot.png"
    dom_path = run_dir / "dom.html"
    report_path = run_dir / "report.json"

    with sync_playwright() as p:
        try:
            browser = p.chromium.launch(headless=True)
        except PlaywrightError as launch_error:
            # Only the "browser binary missing" failure is recoverable here.
            if "Executable doesn't exist" not in str(launch_error):
                raise
            # First run (or clean machine): bootstrap Chromium automatically.
            import playwright.__main__ as playwright_cli

            # The Playwright CLI reads sys.argv directly, so swap it in
            # temporarily and restore it afterwards.
            original_argv = sys.argv[:]
            try:
                sys.argv = ["playwright", "install", "chromium"]
                playwright_cli.main()
            finally:
                sys.argv = original_argv
            browser = p.chromium.launch(headless=True)
        # ignore_https_errors: suspicious pages often sit behind broken TLS,
        # and we still want to capture them.
        context = browser.new_context(ignore_https_errors=True)
        page = context.new_page()

        response = page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
        # Brief settle time for late-loading content before capture.
        page.wait_for_timeout(1000)

        final_url = page.url
        title = page.title()
        # goto() can return None (e.g. navigation to the same document).
        status = response.status if response else None

        page.screenshot(path=str(screenshot_path), full_page=True)
        dom_html = page.content()
        dom_path.write_text(dom_html, encoding="utf-8")

        llm_verdict = None
        if use_llm:
            llm_verdict = build_llm_verdict(
                url,
                final_url,
                title,
                status,
                dom_html,
                llm_provider=llm_provider,
                llm_model=llm_model,
            )
        # Heuristic fallback when the LLM is disabled or returned nothing.
        verdict = llm_verdict or build_verdict(final_url, title)
        report = {
            "input_url": url,
            "final_url": final_url,
            "http_status": status,
            "page_title": title,
            "timestamp_utc": datetime.now(timezone.utc).isoformat(),
            "analysis_method": verdict.method,
            "verdict": asdict(verdict),
            "evidence": {
                "screenshot": str(screenshot_path),
                "dom_snapshot": str(dom_path),
            },
        }
        report_path.write_text(json.dumps(report, indent=2), encoding="utf-8")

        context.close()
        browser.close()

    # Added after report.json is written: the returned dict carries these
    # paths but the persisted file does not.
    report["report_path"] = str(report_path)
    report["run_dir"] = str(run_dir)

    run_log_entry = {
        "timestamp_utc": report["timestamp_utc"],
        "input_url": report["input_url"],
        "final_url": report["final_url"],
        "http_status": report["http_status"],
        "page_title": report["page_title"],
        "analysis_method": report["analysis_method"],
        "risk_score": report["verdict"]["risk_score"],
        "risk_level": report["verdict"]["risk_level"],
        "run_dir": report["run_dir"],
        "report_path": report["report_path"],
        "screenshot": report["evidence"]["screenshot"],
        "dom_snapshot": report["evidence"]["dom_snapshot"],
    }
    # Append-only JSONL index of all runs.
    with (logs_dir / "runs.jsonl").open("a", encoding="utf-8") as log_file:
        log_file.write(json.dumps(run_log_entry, ensure_ascii=True) + "\n")

    return report
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def parse_args() -> argparse.Namespace:
    """Build the Talon V1 command-line interface and parse sys.argv."""
    cli = argparse.ArgumentParser(
        description="Talon V1: analyze a URL with LLM + heuristic fallback."
    )
    cli.add_argument("url", help="URL to analyze")

    # Evidence capture and navigation options.
    cli.add_argument(
        "--output-dir",
        default="evidence",
        help="Directory where screenshot/DOM/report will be saved",
    )
    cli.add_argument(
        "--timeout-ms",
        type=int,
        default=15000,
        help="Navigation timeout in milliseconds",
    )

    # LLM options; environment variables supply the defaults.
    cli.add_argument(
        "--no-llm",
        action="store_true",
        help="Disable LLM analysis and use heuristics only",
    )
    cli.add_argument(
        "--llm-provider",
        choices=["auto", "ollama", "openai"],
        default=os.getenv("TALON_LLM_PROVIDER", "auto"),
        help="LLM backend provider (default: env TALON_LLM_PROVIDER or auto)",
    )
    cli.add_argument(
        "--llm-model",
        default=os.getenv("TALON_LLM_MODEL", "gemma4"),
        help="LLM model name (default: env TALON_LLM_MODEL or gemma4)",
    )
    return cli.parse_args()
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def main() -> None:
    """CLI entry point: normalize the URL, run the analysis, print a summary."""
    args = parse_args()
    target = normalize_url(args.url)
    report = analyze_url(
        target,
        Path(args.output_dir),
        timeout_ms=args.timeout_ms,
        use_llm=not args.no_llm,
        llm_provider=args.llm_provider,
        llm_model=args.llm_model,
    )

    verdict = report["verdict"]
    evidence = report["evidence"]
    print("=== TALON V1 REPORT ===")
    print(f"Input URL: {report['input_url']}")
    print(f"Final URL: {report['final_url']}")
    print(f"HTTP Status: {report['http_status']}")
    print(f"Page Title: {report['page_title']}")
    print(f"Risk: {verdict['risk_level']} ({verdict['risk_score']}/100)")
    print(f"Method: {report['analysis_method']}")
    for reason in verdict["reasons"]:
        print(f"- {reason}")
    print(f"Run Dir: {report['run_dir']}")
    print(f"Report JSON: {report['report_path']}")
    print(f"Screenshot: {evidence['screenshot']}")
    print(f"DOM: {evidence['dom_snapshot']}")


if __name__ == "__main__":
    main()
|