voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/rss_scraper.py
ADDED
|
@@ -0,0 +1,576 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/rss_scraper.py — RSS/Atom feed scraper for VoidAccess.
|
|
3
|
+
|
|
4
|
+
Fetches recent articles from curated threat intelligence blogs and feeds
|
|
5
|
+
relevant to the investigation query. Runs over CLEARNET — these are public
|
|
6
|
+
security blogs that do not require Tor.
|
|
7
|
+
|
|
8
|
+
Feed results are cached per-URL for 1 hour (feeds update infrequently).
|
|
9
|
+
Articles are scored by relevance to the query and filtered by age (max 90 days).
|
|
10
|
+
|
|
11
|
+
Public API:
|
|
12
|
+
async def scrape_rss_feeds(
|
|
13
|
+
query: str,
|
|
14
|
+
refined_query: str = "",
|
|
15
|
+
max_results: int = MAX_TOTAL_ARTICLES,
|
|
16
|
+
) -> list[dict]
|
|
17
|
+
|
|
18
|
+
Returns page dicts compatible with the extraction pipeline:
|
|
19
|
+
{
|
|
20
|
+
"url": str,
|
|
21
|
+
"text_content": str,
|
|
22
|
+
"title": str,
|
|
23
|
+
"source_type": "rss_feed",
|
|
24
|
+
"source_name": str,
|
|
25
|
+
"feed_category": str,
|
|
26
|
+
"published_at": str,
|
|
27
|
+
"relevance": int,
|
|
28
|
+
"feed_weight": int,
|
|
29
|
+
"scraped_at": str,
|
|
30
|
+
"word_count": int,
|
|
31
|
+
}
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import asyncio
|
|
37
|
+
import aiohttp
|
|
38
|
+
import hashlib
|
|
39
|
+
import json
|
|
40
|
+
import logging
|
|
41
|
+
import os
|
|
42
|
+
import re
|
|
43
|
+
import time
|
|
44
|
+
import xml.etree.ElementTree as ET
|
|
45
|
+
from datetime import datetime, timezone
|
|
46
|
+
from pathlib import Path
|
|
47
|
+
from typing import Optional
|
|
48
|
+
|
|
49
|
+
from utils.content_safety import is_blocked_query, sanitize_content
|
|
50
|
+
|
|
51
|
+
logger = logging.getLogger(__name__)
|
|
52
|
+
|
|
53
|
+
# Directory holding one JSON file per cached feed.  The default keeps the
# historical location; set VOIDACCESS_RSS_CACHE_DIR to move the cache somewhere
# private or to a platform without /tmp.  An empty value falls back to the
# default rather than resolving to the current working directory.
CACHE_DIR = Path(os.getenv("VOIDACCESS_RSS_CACHE_DIR") or "/tmp/voidaccess_rss_cache")
CACHE_TTL_SECONDS = 3600  # 1 hour

MAX_ARTICLE_AGE_DAYS = 90  # articles older than this score 0
MAX_ARTICLES_PER_FEED = 3  # cap on articles returned from a single feed
MAX_TOTAL_ARTICLES = 20  # default cap on articles returned per query
MAX_ARTICLE_SIZE = 100 * 1024  # 100 KB
|
|
60
|
+
|
|
61
|
+
# Curated feed catalogue.  Every entry is a dict with the keys
#   name     - display name, copied into each result as "source_name"
#   url      - RSS/Atom endpoint (also the cache key)
#   category - copied into each result as "feed_category"
#   tags     - topic keywords; a search term equal to a tag adds to the score
#   weight   - multiplied with the relevance score when ranking results
# Rows are written as (name, url, category, weight, tags) to keep the table
# compact; the dict key order below is the order consumers have always seen.
RSS_FEEDS = [
    {
        "name": name,
        "url": url,
        "category": category,
        "tags": tags,
        "weight": weight,
    }
    for name, url, category, weight, tags in (
        (
            "Krebs on Security", "https://krebsonsecurity.com/feed/", "journalism", 10,
            ["breach", "fraud", "cybercrime", "ransomware", "dark web", "banking"],
        ),
        (
            "BleepingComputer", "https://www.bleepingcomputer.com/feed/", "journalism", 10,
            ["ransomware", "malware", "breach", "vulnerability", "darkweb", "leak"],
        ),
        (
            "The Record by Recorded Future", "https://therecord.media/feed", "journalism", 9,
            ["cybercrime", "espionage", "ransomware", "government", "critical infrastructure"],
        ),
        (
            "Dark Reading", "https://www.darkreading.com/rss.xml", "journalism", 8,
            ["vulnerability", "threat", "attack", "malware", "breach", "security"],
        ),
        (
            "SecurityWeek", "https://feeds.feedburner.com/Securityweek", "journalism", 8,
            ["vulnerability", "ransomware", "breach", "malware", "exploit"],
        ),
        (
            "Threatpost", "https://threatpost.com/feed/", "journalism", 7,
            ["vulnerability", "ransomware", "malware", "breach", "APT"],
        ),
        (
            "SANS Internet Storm Center", "https://isc.sans.edu/rssfeed_full.xml", "technical", 9,
            ["IOC", "malware", "exploit", "vulnerability", "incident"],
        ),
        (
            "Malwarebytes Labs", "https://www.malwarebytes.com/blog/feed/", "technical", 8,
            ["malware", "ransomware", "threat", "stealer", "trojan", "adware"],
        ),
        (
            "Cisco Talos Intelligence", "https://blog.talosintelligence.com/rss/", "technical", 10,
            ["malware", "IOC", "APT", "exploit", "vulnerability", "threat actor"],
        ),
        (
            "Sophos News", "https://news.sophos.com/en-us/feed/", "technical", 8,
            ["ransomware", "malware", "threat", "exploit", "attack"],
        ),
        (
            "Mandiant Blog", "https://www.mandiant.com/resources/blog/rss.xml", "threat_intel", 10,
            ["APT", "threat actor", "espionage", "malware", "incident response", "zero day"],
        ),
        (
            "CrowdStrike Blog", "https://www.crowdstrike.com/blog/feed/", "threat_intel", 10,
            ["APT", "threat actor", "ransomware", "malware", "eCrime", "adversary"],
        ),
        (
            "Secureworks CTU", "https://www.secureworks.com/rss?feed=blog", "threat_intel", 9,
            ["threat actor", "malware", "APT", "ransomware", "darkweb", "TTPs"],
        ),
        (
            "US-CERT Alerts", "https://www.cisa.gov/uscert/ncas/alerts.xml", "government", 10,
            ["vulnerability", "alert", "advisory", "critical infrastructure", "KEV"],
        ),
        (
            "CISA News", "https://www.cisa.gov/news.xml", "government", 9,
            ["vulnerability", "advisory", "ransomware", "critical infrastructure"],
        ),
        (
            "FBI Cyber Division News", "https://www.fbi.gov/feeds/fbi-in-the-news/rss.xml", "government", 9,
            ["cybercrime", "ransomware", "darkweb", "arrest", "seizure", "takedown"],
        ),
        (
            "Recorded Future Intelligence", "https://www.recordedfuture.com/feed", "threat_intel", 9,
            ["threat actor", "dark web", "IOC", "malware", "vulnerability", "APT"],
        ),
        (
            "Palo Alto Unit 42", "https://unit42.paloaltonetworks.com/feed/", "threat_intel", 10,
            ["malware", "APT", "threat actor", "ransomware", "phishing", "exploit"],
        ),
        (
            "Microsoft Security Blog", "https://www.microsoft.com/en-us/security/blog/feed/", "threat_intel", 9,
            ["APT", "ransomware", "vulnerability", "threat actor", "malware", "nation state"],
        ),
        (
            "Google Project Zero", "https://googleprojectzero.blogspot.com/feeds/posts/default", "technical", 9,
            ["zero day", "exploit", "vulnerability", "CVE", "browser", "kernel"],
        ),
    )
]

# Lower-case names looked for in the query text; every name found is added to
# the search terms used for relevance scoring.
_KNOWN_ACTORS = [
    "lockbit",
    "blackcat",
    "alphv",
    "cl0p",
    "clop",
    "play",
    "akira",
    "blackbasta",
    "black basta",
    "revil",
    "conti",
    "ryuk",
    "maze",
    "darkside",
    "hive",
    "ragnarlocker",
    "cobalt strike",
    "metasploit",
    "mimikatz",
    "beacon",
    "sliver",
    "havoc",
    "brute ratel",
    "covenant",
    "lazarus",
    "apt28",
    "apt29",
    "cozy bear",
    "fancy bear",
    "sandworm",
    "volt typhoon",
    "scattered spider",
    "lapsus",
]
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
class RSSCache:
    """Simple file-based cache for RSS feed article lists.

    Each feed URL maps to one JSON file in ``CACHE_DIR`` of the form
    ``{"cached_at": <epoch seconds>, "articles": [...]}``.  The cache is purely
    an optimisation: every failure (missing directory, unreadable or malformed
    file, failed write) is logged at debug level and treated as a cache miss.
    """

    def __init__(self):
        # A read-only or missing temp directory must not prevent scraping;
        # get() will simply miss and set() will log its own failure.
        try:
            CACHE_DIR.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            logger.debug("RSS cache directory unavailable: %s", e)

    def _cache_path(self, url: str) -> Path:
        """Return the cache file path for *url*."""
        # MD5 is used only to derive a fixed-length file name, not for security.
        key = hashlib.md5(url.encode()).hexdigest()
        return CACHE_DIR / f"{key}.json"

    def get(self, url: str) -> Optional[list]:
        """Return the cached article list for *url*, or ``None`` on a miss.

        A miss is reported when the file does not exist, cannot be read or
        decoded, does not have the expected shape, or is older than
        ``CACHE_TTL_SECONDS`` (in which case the stale file is removed).
        """
        path = self._cache_path(url)
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return None
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            logger.debug("RSS cache read failed: %s", e)
            return None

        if not isinstance(data, dict):
            return None
        cached_at = data.get("cached_at", 0)
        articles = data.get("articles")
        if not isinstance(cached_at, (int, float)) or not isinstance(articles, list):
            return None

        if time.time() - cached_at > CACHE_TTL_SECONDS:
            try:
                path.unlink(missing_ok=True)
            except OSError as e:
                logger.debug("RSS cache cleanup failed: %s", e)
            return None
        return articles

    def set(self, url: str, articles: list) -> None:
        """Store *articles* for *url*; failures are logged and ignored."""
        path = self._cache_path(url)
        # Write to a sibling temp file and rename it into place so a concurrent
        # reader never observes a half-written JSON document.
        tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
        try:
            payload = json.dumps({
                "cached_at": time.time(),
                "articles": articles,
            })
            tmp_path.write_text(payload, encoding="utf-8")
            os.replace(tmp_path, path)
        except (OSError, TypeError, ValueError) as e:
            logger.debug("RSS cache write failed: %s", e)
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass  # nothing more to clean up
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class RSSFeedScraper:
    """Fetches and parses RSS/Atom feeds from curated threat intelligence sources.

    Use as an async context manager: the aiohttp session is created in
    ``__aenter__`` and closed in ``__aexit__``.  Without an open session the
    network helpers return empty results instead of raising.
    """

    # Only the first N entries of a feed document are considered.
    _MAX_ENTRIES_PER_FEED = 20
    # Bytes requested per read while streaming an article body.
    _CHUNK_SIZE = 16 * 1024

    def __init__(self):
        self._session: Optional[aiohttp.ClientSession] = None
        self._cache = RSSCache()

    async def __aenter__(self):
        self._session = aiohttp.ClientSession(
            headers={
                "User-Agent": "Mozilla/5.0 (compatible; RSS-Reader/1.0; +https://github.com/voidaccess/voidaccess)",
                "Accept": (
                    "application/rss+xml, application/atom+xml, "
                    "application/xml, text/xml"
                ),
            },
            timeout=aiohttp.ClientTimeout(total=15),
        )
        return self

    async def __aexit__(self, *args):
        if self._session:
            await self._session.close()
            self._session = None

    async def fetch_relevant_articles(
        self,
        query: str,
        refined_query: str = "",
        max_results: int = MAX_TOTAL_ARTICLES,
    ) -> list[dict]:
        """
        Fetch articles from all feeds relevant to the query.

        All feeds in ``RSS_FEEDS`` are fetched concurrently; a feed that fails
        is logged and skipped.  Results are de-duplicated by URL, ordered by
        ``relevance * feed_weight`` (highest first) and truncated to
        *max_results*.

        Returns page dicts compatible with the extraction pipeline, or an
        empty list when ``is_blocked_query`` rejects the query.
        """
        blocked, _ = is_blocked_query(query)
        if blocked:
            logger.warning("RSS scraping blocked — prohibited query")
            return []

        search_terms = self._extract_search_terms(query, refined_query)

        logger.info(
            "RSS feeds: fetching for '%s' (%d feeds)",
            query[:50],
            len(RSS_FEEDS),
        )

        results = await asyncio.gather(
            *(self._fetch_feed(feed, search_terms) for feed in RSS_FEEDS),
            return_exceptions=True,
        )

        all_articles: list[dict] = []
        seen_urls: set[str] = set()

        for feed, result in zip(RSS_FEEDS, results):
            if not isinstance(result, list):
                # gather() hands back the exception object for a failed feed.
                logger.debug("RSS feed failed %s: %r", feed.get("name", "?"), result)
                continue
            for article in result:
                url = article.get("url", "")
                if url and url not in seen_urls:
                    seen_urls.add(url)
                    all_articles.append(article)

        all_articles.sort(
            key=lambda x: x.get("relevance", 0) * x.get("feed_weight", 1),
            reverse=True,
        )

        # A negative limit would otherwise slice from the end of the list.
        final = all_articles[:max(0, max_results)]
        logger.info("RSS feeds: %d relevant articles found", len(final))
        return final

    def _extract_search_terms(self, query: str, refined_query: str) -> list[str]:
        """Extract key terms for relevance matching.

        The result contains, lower-cased and de-duplicated: every word longer
        than three characters, each non-empty full query string, any CVE
        identifiers, and every ``_KNOWN_ACTORS`` name that appears as a whole
        word/phrase.  The list is sorted so the output is deterministic.
        """
        query = query or ""
        refined_query = refined_query or ""
        text = f"{query} {refined_query}".lower()

        terms = {w for w in re.split(r"\W+", text) if len(w) > 3}

        for phrase in (query, refined_query):
            phrase = phrase.strip().lower()
            if phrase:
                terms.add(phrase)

        # ``text`` is already lower-case and covers both query strings.
        terms.update(re.findall(r"cve-\d{4}-\d+", text))

        for actor in _KNOWN_ACTORS:
            # Whole-word match: a plain substring test would find "play" in
            # "display" or "hive" in "archive".
            if re.search(rf"(?<!\w){re.escape(actor)}(?!\w)", text):
                terms.add(actor)

        return sorted(terms)

    async def _fetch_feed(self, feed: dict, search_terms: list[str]) -> list[dict]:
        """Fetch one RSS feed and return relevant articles.

        The parsed article list comes from the cache when available, otherwise
        from the network (and is then cached when non-empty).  Articles are
        scored, the best-scoring ones are tried first, and at most
        ``MAX_ARTICLES_PER_FEED`` page dicts are returned.  An article is
        skipped when its text is shorter than 100 characters or when
        ``sanitize_content`` flags it or returns nothing.
        """
        feed_url = feed["url"]
        feed_name = feed["name"]

        raw_articles = self._cache.get(feed_url)
        if raw_articles is not None:
            logger.debug("RSS cache hit: %s", feed_name)
        else:
            raw_articles = await self._fetch_and_parse(feed_url, feed_name)
            if raw_articles:
                self._cache.set(feed_url, raw_articles)

        if not raw_articles:
            return []

        candidates: list[tuple[int, dict]] = []
        for article in raw_articles:
            if not isinstance(article, dict):
                continue  # tolerate a damaged cache entry
            relevance = self._score_article(article, search_terms, feed)
            if relevance > 0:
                candidates.append((relevance, article))
        # Try the best matches first so the per-feed cap keeps the most
        # relevant articles; the sort is stable, so feed order breaks ties.
        candidates.sort(key=lambda pair: pair[0], reverse=True)

        relevant: list[dict] = []
        for relevance, article in candidates:
            url = article.get("url", "")
            full_content = await self._fetch_article_content(url)
            # Fall back to the feed's own summary when the page is unavailable.
            content = full_content or article.get("summary", "")

            if not content or len(content.strip()) < 100:
                continue

            clean, flagged = sanitize_content(content)
            if flagged or not clean:
                continue

            relevant.append({
                "url": url,
                "text_content": clean,
                # ``or`` so an empty title also falls back to the feed name.
                "title": article.get("title") or feed_name,
                "source_type": "rss_feed",
                "source_name": feed_name,
                "feed_category": feed.get("category", "unknown"),
                "published_at": article.get("published", ""),
                "relevance": relevance,
                "feed_weight": feed.get("weight", 5),
                "scraped_at": datetime.now(timezone.utc).isoformat(),
                "word_count": len(clean.split()),
            })

            if len(relevant) >= MAX_ARTICLES_PER_FEED:
                break

        return relevant

    async def _fetch_and_parse(self, feed_url: str, feed_name: str) -> list[dict]:
        """Fetch and parse an RSS/Atom feed XML.

        Returns an empty list when no session is open, on a non-200 response,
        on timeout, or on any other fetch error.
        """
        if not self._session:
            return []
        try:
            async with self._session.get(feed_url, allow_redirects=True) as resp:
                if resp.status != 200:
                    return []
                content = await resp.text(encoding="utf-8", errors="ignore")
            return self._parse_feed(content, feed_url)
        except asyncio.TimeoutError:
            logger.debug("RSS timeout: %s", feed_name)
            return []
        except Exception as e:
            logger.debug("RSS fetch error %s: %s", feed_name, e)
            return []

    def _parse_feed(self, content: str, feed_url: str) -> list[dict]:
        """
        Parse RSS 2.0 or Atom feed XML.

        Returns list of article dicts: title, url, summary, published.
        Entries without a link are dropped.  Parse errors are logged and
        yield whatever was collected so far (normally an empty list).
        """
        # NOTE(review): feeds are third-party XML parsed with the stdlib
        # ElementTree parser; consider defusedxml if the feed list ever
        # becomes user-configurable.
        articles: list[dict] = []
        try:
            root = ET.fromstring(content.lstrip("\ufeff").lstrip())

            # Parse namespace-aware and look elements up in the root's own
            # namespace ("{uri}" prefix, empty for plain RSS 2.0).  Extension
            # elements such as media:title live in other namespaces and can
            # therefore never shadow the core ones.
            ns = root.tag[: root.tag.index("}") + 1] if root.tag.startswith("{") else ""

            def text_of(parent: ET.Element, *names: str) -> str:
                """Return the first non-empty text among the named children."""
                for name in names:
                    el = parent.find(f"{ns}{name}")
                    # Compare with None: an Element without children is falsy.
                    if el is not None:
                        value = "".join(el.itertext()).strip()
                        if value:
                            return value
                return ""

            limit = self._MAX_ENTRIES_PER_FEED
            if root.tag[len(ns):].lower() == "feed":
                # Atom: the article URL is the "alternate" (or rel-less) link.
                for entry in root.findall(f"{ns}entry")[:limit]:
                    url = ""
                    for link in entry.findall(f"{ns}link"):
                        if link.get("rel") in ("alternate", None):
                            url = (link.get("href") or "").strip()
                            break
                    if not url:
                        continue
                    articles.append({
                        "url": url,
                        "title": text_of(entry, "title"),
                        "summary": self._strip_html(text_of(entry, "summary", "content")),
                        "published": text_of(entry, "published", "updated"),
                    })
            else:
                channel = root.find(f"{ns}channel")
                if channel is None:
                    channel = root
                for item in channel.findall(f"{ns}item")[:limit]:
                    url = text_of(item, "link")
                    if not url:
                        continue
                    articles.append({
                        "url": url,
                        "title": text_of(item, "title"),
                        "summary": self._strip_html(text_of(item, "description")),
                        "published": text_of(item, "pubDate"),
                    })

        except ET.ParseError as e:
            logger.debug("RSS parse error %s: %s", feed_url, e)
        except Exception as e:
            logger.debug("RSS parse unexpected error: %s", e)

        return articles

    async def _fetch_article_content(self, url: str) -> Optional[str]:
        """Fetch and extract plain text from an article URL.

        At most ``MAX_ARTICLE_SIZE`` bytes of the response body are read.
        Returns ``None`` when there is no URL or session, on a non-200
        response, on any fetch error, or when the extracted text is 100
        characters or shorter.
        """
        if not url or not self._session:
            return None
        try:
            async with self._session.get(
                url,
                allow_redirects=True,
                timeout=aiohttp.ClientTimeout(total=10),
            ) as resp:
                if resp.status != 200:
                    return None
                # Stream the body and stop at the cap instead of downloading
                # an arbitrarily large page and truncating afterwards.
                body = bytearray()
                async for chunk in resp.content.iter_chunked(self._CHUNK_SIZE):
                    body.extend(chunk)
                    if len(body) >= MAX_ARTICLE_SIZE:
                        break
        except Exception as e:
            logger.debug("RSS article fetch error %s: %s", url, e)
            return None

        html = bytes(body[:MAX_ARTICLE_SIZE]).decode("utf-8", errors="ignore")
        text = self._extract_article_text(html)
        return text if len(text) > 100 else None

    def _extract_article_text(self, html: str) -> str:
        """Strip scripts, styles, comments and tags; decode entities; collapse whitespace."""
        # Local import: the parameter name shadows the stdlib module name.
        from html import unescape

        flags = re.DOTALL | re.IGNORECASE
        html = re.sub(r"<(script|style)\b[^>]*>.*?</\1\s*>", " ", html, flags=flags)
        html = re.sub(r"<!--.*?-->", " ", html, flags=re.DOTALL)
        text = re.sub(r"<[^>]+>", " ", html)
        # unescape() handles every named and numeric entity in a single pass;
        # &nbsp; becomes U+00A0, which \s collapses like any other whitespace.
        return re.sub(r"\s+", " ", unescape(text)).strip()

    def _strip_html(self, html: str) -> str:
        """Strip HTML tags from a string and collapse whitespace."""
        text = re.sub(r"<[^>]+>", " ", html)
        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _parse_published(pub_str: str) -> Optional[datetime]:
        """Parse a feed timestamp; return ``None`` when no parser accepts it.

        Tries RFC 2822 (RSS ``pubDate``), then ISO 8601 (Atom), then the
        optional ``dateutil`` package if it is installed.
        """
        from email.utils import parsedate_to_datetime

        pub_str = pub_str.strip()
        try:
            return parsedate_to_datetime(pub_str)
        except (TypeError, ValueError, IndexError):
            pass

        # fromisoformat() only accepts a trailing "Z" from Python 3.11 on.
        iso = pub_str[:-1] + "+00:00" if pub_str.endswith(("Z", "z")) else pub_str
        try:
            return datetime.fromisoformat(iso)
        except ValueError:
            pass

        try:
            import dateutil.parser
            return dateutil.parser.parse(pub_str)
        except Exception:
            return None

    def _score_article(
        self,
        article: dict,
        search_terms: list[str],
        feed: dict,
    ) -> int:
        """Score an article for ranking; a score of 0 excludes it.

        Recency: +5 (up to 7 days old), +3 (up to 30 days), +1 (older, or the
        date cannot be interpreted); articles older than
        ``MAX_ARTICLE_AGE_DAYS`` return 0 immediately.
        Terms longer than three characters: +5 if found in the title,
        otherwise +2 if found in the summary.
        Feed tags: +1 for each search term equal to one of the feed's tags.

        NOTE(review): any dated article within the age limit scores above 0
        even without a term match, so the score ranks rather than filters.
        """
        score = 0

        title = str(article.get("title") or "").lower()
        summary = str(article.get("summary") or "").lower()

        pub_str = article.get("published") or ""
        if pub_str:
            try:
                pub_dt = self._parse_published(str(pub_str))
                if pub_dt is None:
                    score += 1
                else:
                    if pub_dt.tzinfo is None:
                        # Treat naive timestamps as UTC so they can be compared.
                        pub_dt = pub_dt.replace(tzinfo=timezone.utc)
                    age_days = (datetime.now(timezone.utc) - pub_dt).days
                    if age_days > MAX_ARTICLE_AGE_DAYS:
                        return 0
                    elif age_days <= 7:
                        score += 5
                    elif age_days <= 30:
                        score += 3
                    else:
                        score += 1
            except Exception:
                # Date arithmetic failed; score it like an unparseable date.
                score += 1

        for term in search_terms:
            if len(term) > 3:
                if term in title:
                    score += 5
                elif term in summary:
                    score += 2

        feed_tags = {t.lower() for t in feed.get("tags", [])}
        score += sum(1 for term in search_terms if term in feed_tags)

        return score
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
async def scrape_rss_feeds(
    query: str,
    refined_query: str = "",
    max_results: int = MAX_TOTAL_ARTICLES,
) -> list[dict]:
    """
    Main entry point. Returns page dicts compatible with the extraction pipeline.

    Feeds are enabled by default.  Opt-out via RSS_FEEDS_ENABLED=false; the
    values "true", "1", "yes" and "on" (any case, surrounding whitespace
    ignored) keep them enabled, anything else disables them and an empty list
    is returned.
    """
    # Previously only the literal "true" enabled feeds, so RSS_FEEDS_ENABLED=1
    # or a value with stray whitespace silently switched the scraper off.
    enabled = os.getenv("RSS_FEEDS_ENABLED", "true").strip().lower()
    if enabled not in ("true", "1", "yes", "on"):
        logger.info("RSS feeds disabled")
        return []

    async with RSSFeedScraper() as scraper:
        return await scraper.fetch_relevant_articles(
            query=query,
            refined_query=refined_query,
            max_results=max_results,
        )
|