voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
monitor/alerts.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Alert delivery (webhook, Telegram bot, SMTP) and persisted monitor_alerts records.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import smtplib
|
|
13
|
+
from email.mime.text import MIMEText
|
|
14
|
+
from typing import Any, Optional
|
|
15
|
+
|
|
16
|
+
from monitor.diff import is_significant_change
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _summarize_job_result(job_result: dict) -> str:
|
|
22
|
+
parts: list[str] = []
|
|
23
|
+
for key in ("query", "url", "changed", "new_pages", "new_entities", "duplicate_pages_skipped"):
|
|
24
|
+
if key in job_result:
|
|
25
|
+
parts.append(f"{key}={job_result[key]!r}")
|
|
26
|
+
if not parts:
|
|
27
|
+
parts.append(str(job_result)[:500])
|
|
28
|
+
return "; ".join(parts)[:1500]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
async def send_webhook(url: str, payload: dict) -> bool:
    """
    POST *payload* as JSON to *url* with a 10 second total timeout.

    Returns True when the response status is in the 2xx range. Any exception
    raised along the way (including a failed ``aiohttp`` import) is logged and
    reported as False.
    """
    try:
        import aiohttp  # noqa: PLC0415

        json_headers = {"Content-Type": "application/json"}
        client = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10))
        async with client as session:
            async with session.post(url, json=payload, headers=json_headers) as resp:
                status = resp.status
                return status >= 200 and status < 300
    except Exception as exc:
        logger.error("send_webhook failed: %s", exc)
        return False
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
async def send_telegram_alert(chat_id: str, message: str) -> bool:
    """
    Send *message* to *chat_id* through the Telegram Bot API ``sendMessage`` call.

    The bot token is read from the ``TELEGRAM_BOT_TOKEN`` environment variable;
    when it is missing or blank nothing is sent and False is returned. The text
    is cut to 4096 characters. Returns True only for an HTTP 200 response; other
    statuses are logged with the first 200 characters of the response body, and
    any exception is logged and reported as False.
    """
    bot_token = os.getenv("TELEGRAM_BOT_TOKEN", "").strip()
    if bot_token == "":
        return False
    try:
        import aiohttp  # noqa: PLC0415

        endpoint = f"https://api.telegram.org/bot{bot_token}/sendMessage"
        request_body = {"chat_id": chat_id, "text": message[:4096]}
        client_timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            async with session.post(endpoint, json=request_body) as resp:
                delivered = resp.status == 200
                if not delivered:
                    detail = await resp.text()
                    logger.error("Telegram API %s: %s", resp.status, detail[:200])
                return delivered
    except Exception as exc:
        logger.error("send_telegram_alert failed: %s", exc)
        return False
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def send_email_alert(to: str, subject: str, body: str) -> bool:
    """
    Send a plain-text UTF-8 email through the SMTP server named by the environment.

    Configuration is read from ``SMTP_HOST``, ``SMTP_PORT`` (default 587),
    ``SMTP_USER`` and ``SMTP_PASS``. Without ``SMTP_HOST`` nothing is sent and
    False is returned. STARTTLS and login are attempted only when both a user
    and a password are configured; otherwise the message is submitted over the
    plain connection. The ``From`` header is the SMTP user, or
    ``voidaccess@localhost`` when no user is set.

    The blocking ``smtplib`` work runs in a worker thread via
    ``asyncio.to_thread``. Returns True after ``sendmail`` completes; any
    exception, including a malformed ``SMTP_PORT``, is logged and reported as
    False.
    """
    host = os.getenv("SMTP_HOST", "").strip()
    if not host:
        return False
    user = os.getenv("SMTP_USER", "").strip()
    password = os.getenv("SMTP_PASS", "").strip()

    def _send_sync(port: int) -> bool:
        msg = MIMEText(body, "plain", "utf-8")
        msg["Subject"] = subject
        msg["From"] = user or "voidaccess@localhost"
        msg["To"] = to
        with smtplib.SMTP(host, port, timeout=15) as smtp:
            smtp.ehlo()
            if user and password:
                smtp.starttls()
                # Re-identify after the TLS upgrade before authenticating.
                smtp.ehlo()
                smtp.login(user, password)
            smtp.sendmail(msg["From"], [to], msg.as_string())
        return True

    try:
        # Parsed inside the try so that a non-numeric SMTP_PORT is logged and
        # reported as a failed delivery instead of escaping as ValueError.
        port = int(os.getenv("SMTP_PORT", "587") or "587")
        return await asyncio.to_thread(_send_sync, port)
    except Exception as exc:
        logger.error("send_email_alert failed: %s", exc)
        return False
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _derive_severity(change_type: str, diff: dict) -> str:
|
|
100
|
+
"""Derive alert severity from change type and magnitude."""
|
|
101
|
+
entity_mag = int(
|
|
102
|
+
diff.get("new_entity_count", 0)
|
|
103
|
+
or diff.get("entity_count", 0)
|
|
104
|
+
or diff.get("new_entities", 0)
|
|
105
|
+
or len(diff.get("new_entities", []) if isinstance(diff.get("new_entities"), list) else [])
|
|
106
|
+
or 0
|
|
107
|
+
)
|
|
108
|
+
if change_type in ("new_entities",) and entity_mag >= 10:
|
|
109
|
+
return "critical"
|
|
110
|
+
if change_type in ("new_entities", "new_page", "first_result"):
|
|
111
|
+
return "warning"
|
|
112
|
+
if change_type in ("significant_change",):
|
|
113
|
+
return "warning"
|
|
114
|
+
if change_type == "content_change":
|
|
115
|
+
return "info"
|
|
116
|
+
return "info"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _count_entity_delta(diff: dict) -> int:
|
|
120
|
+
"""Extract entity count delta from diff result."""
|
|
121
|
+
v = (
|
|
122
|
+
diff.get("new_entity_count")
|
|
123
|
+
if diff.get("new_entity_count") is not None
|
|
124
|
+
else None
|
|
125
|
+
)
|
|
126
|
+
if v is not None:
|
|
127
|
+
return int(v)
|
|
128
|
+
v = diff.get("entity_count")
|
|
129
|
+
if v is not None:
|
|
130
|
+
return int(v)
|
|
131
|
+
ne = diff.get("new_entities")
|
|
132
|
+
if isinstance(ne, list):
|
|
133
|
+
return len(ne)
|
|
134
|
+
if isinstance(ne, int):
|
|
135
|
+
return ne
|
|
136
|
+
return 0
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _sanitize_diff(diff: dict) -> dict:
|
|
140
|
+
"""
|
|
141
|
+
Ensure diff data is JSON-serializable and not too large.
|
|
142
|
+
Truncate large text fields; strip angle-bracket tags from strings.
|
|
143
|
+
"""
|
|
144
|
+
sanitized: dict[str, Any] = {}
|
|
145
|
+
for k, v in diff.items():
|
|
146
|
+
if isinstance(v, str):
|
|
147
|
+
s = re.sub(r"<[^>]+>", "", v)
|
|
148
|
+
if len(s) > 500:
|
|
149
|
+
sanitized[k] = s[:500] + "..."
|
|
150
|
+
else:
|
|
151
|
+
sanitized[k] = s
|
|
152
|
+
elif isinstance(v, list) and len(v) > 50:
|
|
153
|
+
sanitized[k] = v[:50]
|
|
154
|
+
elif isinstance(v, (str, int, float, bool, list, dict)) or v is None:
|
|
155
|
+
sanitized[k] = v
|
|
156
|
+
return sanitized
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def build_alert_context(watch: dict, job_result: dict) -> Optional[dict[str, Any]]:
    """
    Decide whether *job_result* warrants an alert under the watch's policy.

    Returns a dict with ``change_type``, ``summary`` and ``diff_result`` when an
    alert is due, otherwise None. The policy is ``watch["alert_on"]``
    (``"new_results"`` when unset); an unrecognised policy never alerts.

    Keyword watches (``type`` is ``"keyword"`` or missing):
        ``new_results`` and ``any_change`` alert on any new page or entity;
        ``any_appearance`` alerts on new entities only.

    Any other watch type is treated as a URL watch and only considered when
    ``job_result["changed"]`` is truthy:
        ``new_results`` alerts on a first scrape or new entities;
        ``any_change`` alerts on a first scrape or a change ratio of at least 0.1;
        ``any_appearance`` alerts on new entities only.
    """
    policy = watch.get("alert_on") or "new_results"

    if watch.get("type", "keyword") == "keyword":
        pages = int(job_result.get("new_pages") or 0)
        entities = int(job_result.get("new_entities") or 0)

        if policy in ("new_results", "any_change"):
            wanted = pages > 0 or entities > 0
        elif policy == "any_appearance":
            wanted = entities > 0
        else:
            wanted = False
        if not wanted:
            return None

        # New entities take precedence over new pages when labelling the alert.
        kind = "new_page" if entities <= 0 and pages > 0 else "new_entities"
        return {
            "change_type": kind,
            "summary": f"{kind}: {pages} new page(s), {entities} new entities (query={job_result.get('query', '')!r})",
            "diff_result": {
                "query": job_result.get("query"),
                "new_pages": pages,
                "new_entities": entities,
                "entity_count": entities,
                "duplicate_pages_skipped": job_result.get("duplicate_pages_skipped"),
            },
        }

    # URL watch
    if not bool(job_result.get("changed")):
        return None

    entities = int(job_result.get("new_entities") or 0)
    ratio = float(job_result.get("change_ratio") or 0.0)
    first_scrape = bool(job_result.get("is_first_scrape"))
    significant = is_significant_change({"change_ratio": ratio}, threshold=0.1)

    if policy == "new_results":
        wanted = first_scrape or entities > 0
    elif policy == "any_change":
        wanted = significant or first_scrape
    elif policy == "any_appearance":
        wanted = entities > 0
    else:
        wanted = False
    if not wanted:
        return None

    # Label precedence: first scrape, then significant change, then new entities.
    if first_scrape:
        kind = "first_result"
    elif significant:
        kind = "significant_change"
    elif entities > 0:
        kind = "new_entities"
    else:
        kind = "content_change"

    headline = f"{kind}: {job_result.get('url', '')!r} — "
    metrics = f"entities={entities}, change_ratio={ratio:.3f}"
    return {
        "change_type": kind,
        "summary": headline + metrics,
        "diff_result": {
            "url": job_result.get("url"),
            "new_entities": entities,
            "entity_count": entities,
            "change_ratio": ratio,
            "lines_added": job_result.get("lines_added"),
            "lines_removed": job_result.get("lines_removed"),
            "diff_summary": job_result.get("diff_summary"),
            "is_first_scrape": first_scrape,
        },
    }
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
async def dispatch_alerts(watch: dict, job_result: dict) -> list[str]:
    """
    Send the alert to every external channel configured on *watch*, concurrently.

    A channel is used when its watch field (``webhook_url``,
    ``telegram_chat_id``, ``email``) is a string with non-whitespace content.
    The webhook receives the watch name, the raw job result and the alert text;
    Telegram receives the alert text; email receives the job result as indented
    JSON, capped at 20000 characters.

    Returns the names of the channels whose sender returned True, in the order
    webhook, telegram, email. Exceptions raised by a sender are logged.
    """
    watch_name = watch.get("name", "watch")
    headline = f"[VoidAccess Alert] {watch_name}: {_summarize_job_result(job_result)}"
    webhook_body = {
        "watch": watch_name,
        "job_result": job_result,
        "message": headline,
    }

    def _configured(field: str) -> str:
        # Stripped string value of a watch field, or "" when unset or not a string.
        raw = watch.get(field)
        return raw.strip() if isinstance(raw, str) else ""

    pending: list[tuple[str, Any]] = []

    webhook_target = _configured("webhook_url")
    if webhook_target:
        pending.append(("webhook", send_webhook(webhook_target, webhook_body)))

    chat_target = _configured("telegram_chat_id")
    if chat_target:
        pending.append(("telegram", send_telegram_alert(chat_target, headline)))

    mail_target = _configured("email")
    if mail_target:
        mail_subject = f"[VoidAccess Alert] {watch_name}"
        mail_body = json.dumps(job_result, indent=2, default=str)[:20000]
        pending.append(("email", send_email_alert(mail_target, mail_subject, mail_body)))

    if not pending:
        return []

    # return_exceptions=True keeps one failing channel from cancelling the others.
    outcomes = await asyncio.gather(*(job for _, job in pending), return_exceptions=True)

    succeeded: list[str] = []
    for (channel, _), outcome in zip(pending, outcomes):
        if outcome is True:
            succeeded.append(channel)
        elif isinstance(outcome, Exception):
            logger.error("alert channel %s failed: %s", channel, outcome)
    return succeeded
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _persist_alert_record(
    watch: dict,
    change_type: str,
    summary: str,
    diff_result: dict,
    delivered_channels: list[str],
) -> None:
    """
    Store one monitor alert through ``db.queries.create_monitor_alert``.

    Severity and entity delta are derived from *diff_result*, and the diff is
    passed through ``_sanitize_diff`` before being handed to the query helper.
    The database modules are imported at call time.
    """
    from db.queries import create_monitor_alert
    from db.session import get_session

    level = _derive_severity(change_type, diff_result)
    delta = _count_entity_delta(diff_result)
    with get_session() as db_session:
        record_fields = {
            "session": db_session,
            "monitor_name": str(watch.get("name", "watch")),
            "change_type": change_type,
            "summary": summary,
            "diff_data": _sanitize_diff(diff_result),
            "severity": level,
            "entity_count_delta": delta,
            "delivery_channels": delivered_channels,
        }
        create_monitor_alert(**record_fields)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
async def evaluate_and_dispatch_alerts(watch: dict, job_result: dict) -> None:
    """
    Alert on *job_result* when the watch policy calls for it.

    Does nothing when ``build_alert_context`` returns None. Otherwise external
    notifications are sent first and an alert record is persisted afterwards,
    listing the channels that delivered. A failure in either step is logged and
    does not propagate; a dispatch failure still leads to a persisted record
    with no delivered channels.
    """
    context = build_alert_context(watch, job_result)
    if context is None:
        return

    try:
        channels = await dispatch_alerts(watch, job_result)
    except Exception as exc:
        logger.error("dispatch_alerts failed for %s: %s", watch.get("name"), exc)
        channels = []

    try:
        stored_diff = context.get("diff_result") or {}
        _persist_alert_record(
            watch,
            context["change_type"],
            context["summary"],
            stored_diff,
            channels,
        )
    except Exception as exc:
        logger.warning(
            "Failed to persist alert for %s: %s", watch.get("name"), exc
        )
|
monitor/config.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Load and validate monitor watch definitions from monitors.yaml.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Optional
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
_ALERT_ON = frozenset({"new_results", "any_change", "any_appearance"})
|
|
15
|
+
_TYPES = frozenset({"keyword", "url"})
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _yaml_path() -> Path:
|
|
19
|
+
env_path = os.getenv("MONITORS_CONFIG_PATH")
|
|
20
|
+
if env_path:
|
|
21
|
+
return Path(env_path)
|
|
22
|
+
return Path(__file__).resolve().parent.parent / "data" / "monitors.yaml"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _as_float(val: Any, default: Optional[float] = None) -> Optional[float]:
|
|
26
|
+
try:
|
|
27
|
+
return float(val)
|
|
28
|
+
except (TypeError, ValueError):
|
|
29
|
+
return default
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _validate_and_normalize(raw: dict) -> dict | None:
    """
    Validate one raw watch mapping and return its normalized form.

    Requirements (a warning is logged and None returned when one fails):
        - ``name``: string with non-whitespace content.
        - ``type``: one of ``_TYPES``.
        - ``interval_hours``: finite number of at least 0.5.
        - ``alert_on``: one of ``_ALERT_ON``.
        - ``query`` (keyword watches) or ``url`` (url watches): string with
          non-whitespace content.

    The result contains the stripped name, the type, the interval as a float,
    the alert policy, ``enabled`` coerced to bool (default True), the
    ``webhook_url`` / ``telegram_chat_id`` / ``email`` values as given, and the
    stripped ``query`` or ``url``.
    """
    import math  # noqa: PLC0415

    name = raw.get("name")
    # Whitespace-only names would otherwise normalize to an empty string.
    if not isinstance(name, str) or not name.strip():
        logger.warning("Monitor entry skipped: missing or invalid 'name'")
        return None

    wtype = raw.get("type")
    # The isinstance guard keeps unhashable YAML values (lists, mappings) from
    # raising TypeError in the frozenset membership test.
    if not isinstance(wtype, str) or wtype not in _TYPES:
        logger.warning("Monitor %r skipped: invalid type %r", name, wtype)
        return None

    interval = _as_float(raw.get("interval_hours"))
    # NaN compares False against every bound, so it must be rejected explicitly;
    # an infinite interval is rejected as well.
    if interval is None or not math.isfinite(interval) or interval < 0.5:
        logger.warning("Monitor %r skipped: interval_hours must be >= 0.5", name)
        return None

    alert_on = raw.get("alert_on")
    if not isinstance(alert_on, str) or alert_on not in _ALERT_ON:
        logger.warning("Monitor %r skipped: invalid alert_on %r", name, alert_on)
        return None

    if wtype == "keyword":
        q = raw.get("query")
        if not isinstance(q, str) or not q.strip():
            logger.warning("Monitor %r skipped: keyword watch needs query", name)
            return None
    else:
        u = raw.get("url")
        if not isinstance(u, str) or not u.strip():
            logger.warning("Monitor %r skipped: url watch needs url", name)
            return None

    enabled = raw.get("enabled", True)
    if not isinstance(enabled, bool):
        enabled = bool(enabled)

    out: dict[str, Any] = {
        "name": name.strip(),
        "type": wtype,
        "interval_hours": interval,
        "alert_on": alert_on,
        "enabled": enabled,
        "webhook_url": raw.get("webhook_url"),
        "telegram_chat_id": raw.get("telegram_chat_id"),
        "email": raw.get("email"),
    }
    if wtype == "keyword":
        out["query"] = str(raw["query"]).strip()
    else:
        out["url"] = str(raw["url"]).strip()

    return out
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_watches() -> list[dict]:
    """
    Read the monitors YAML file and return its valid watch definitions.

    Returns [] when the file is missing, PyYAML cannot be imported, the file
    cannot be read or parsed, the document is not a non-empty mapping, or its
    ``watches`` entry is not a non-empty list. Entries that are not mappings or
    fail validation are skipped.
    """
    config_file = _yaml_path()
    if not config_file.is_file():
        return []

    try:
        import yaml  # noqa: PLC0415
    except ImportError:
        logger.warning("PyYAML not installed; no watches loaded")
        return []

    try:
        document = yaml.safe_load(config_file.read_text(encoding="utf-8"))
    except Exception as exc:
        logger.warning("Failed to read monitors.yaml: %s", exc)
        return []

    if not (isinstance(document, dict) and document):
        return []
    entries = document.get("watches")
    if not (isinstance(entries, list) and entries):
        return []

    watches: list[dict] = []
    for entry in entries:
        if isinstance(entry, dict):
            normalized = _validate_and_normalize(entry)
            if normalized:
                watches.append(normalized)
        else:
            logger.warning("Monitor entry skipped: not a mapping")
    return watches
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_watch_by_name(name: str) -> dict | None:
    """
    Return the first loaded watch whose ``name`` equals *name*, or None.

    The watch list is obtained from ``load_watches()`` on every call.
    """
    matching = (watch for watch in load_watches() if watch.get("name") == name)
    return next(matching, None)
|
monitor/diff.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Content change detection using difflib (unified diff + similarity ratio).
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import difflib
|
|
8
|
+
import hashlib
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def compute_diff(old_text: str, new_text: str) -> dict[str, Any]:
    """
    Compare two text blobs.

    Returns a dict with:
        changed: False when the texts are equal, True otherwise.
        content_hash_old / content_hash_new: SHA-256 hex digests of the UTF-8
            encoded texts.
        lines_added / lines_removed: number of ``+`` / ``-`` content lines in
            the unified diff.
        diff_summary: first 500 characters of the unified diff ("" when equal).
        change_ratio: ``1.0 - SequenceMatcher.ratio()`` over the raw texts;
            0.0 identical, 1.0 completely different.
    """
    old_bytes = old_text.encode("utf-8")
    new_bytes = new_text.encode("utf-8")
    content_hash_old = hashlib.sha256(old_bytes).hexdigest()
    content_hash_new = hashlib.sha256(new_bytes).hexdigest()

    if old_text == new_text:
        return {
            "changed": False,
            "content_hash_old": content_hash_old,
            "content_hash_new": content_hash_new,
            "lines_added": 0,
            "lines_removed": 0,
            "diff_summary": "",
            "change_ratio": 0.0,
        }

    old_lines = old_text.splitlines(keepends=True)
    new_lines = new_text.splitlines(keepends=True)
    diff_lines = list(
        difflib.unified_diff(
            old_lines,
            new_lines,
            fromfile="old",
            tofile="new",
            lineterm="",
        )
    )

    # The first two entries are the "--- old" / "+++ new" file headers. They are
    # skipped by position: matching on the "---" / "+++" prefix would also drop
    # real content lines that themselves begin with "--" or "++". Hunk headers
    # ("@@ ...") start with neither "+" nor "-", so they are never counted.
    lines_added = 0
    lines_removed = 0
    for line in diff_lines[2:]:
        if line.startswith("+"):
            lines_added += 1
        elif line.startswith("-"):
            lines_removed += 1

    matcher = difflib.SequenceMatcher(None, old_text, new_text)
    change_ratio = 1.0 - matcher.ratio()

    summary_src = "\n".join(diff_lines)
    diff_summary = summary_src[:500]

    return {
        "changed": True,
        "content_hash_old": content_hash_old,
        "content_hash_new": content_hash_new,
        "lines_added": lines_added,
        "lines_removed": lines_removed,
        "diff_summary": diff_summary,
        "change_ratio": float(change_ratio),
    }
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def is_significant_change(diff: dict, threshold: float = 0.1) -> bool:
    """
    Report whether ``diff["change_ratio"]`` is at least *threshold*.

    A missing ratio counts as 0.0. A ratio that cannot be converted to float,
    or a comparison that raises TypeError or ValueError, yields False.
    """
    try:
        ratio = float(diff.get("change_ratio", 0.0))
        verdict = ratio >= threshold
    except (TypeError, ValueError):
        verdict = False
    return verdict
|