voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/rss_scraper.py ADDED
@@ -0,0 +1,576 @@
1
+ """
2
+ sources/rss_scraper.py — RSS/Atom feed scraper for VoidAccess.
3
+
4
+ Fetches recent articles from curated threat intelligence blogs and feeds
5
+ relevant to the investigation query. Runs over CLEARNET — these are public
6
+ security blogs that do not require Tor.
7
+
8
+ Feed results are cached per-URL for 1 hour (feeds update infrequently).
9
+ Articles are scored by relevance to the query and filtered by age (max 90 days).
10
+
11
+ Public API:
12
+ async def scrape_rss_feeds(
13
+ query: str,
14
+ refined_query: str = "",
15
+ max_results: int = MAX_TOTAL_ARTICLES,
16
+ ) -> list[dict]
17
+
18
+ Returns page dicts compatible with the extraction pipeline:
19
+ {
20
+ "url": str,
21
+ "text_content": str,
22
+ "title": str,
23
+ "source_type": "rss_feed",
24
+ "source_name": str,
25
+ "feed_category": str,
26
+ "published_at": str,
27
+ "relevance": int,
28
+ "feed_weight": int,
29
+ "scraped_at": str,
30
+ "word_count": int,
31
+ }
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ import asyncio
37
+ import aiohttp
38
+ import hashlib
39
+ import json
40
+ import logging
41
+ import os
42
+ import re
43
+ import time
44
+ import xml.etree.ElementTree as ET
45
+ from datetime import datetime, timezone
46
+ from pathlib import Path
47
+ from typing import Optional
48
+
49
+ from utils.content_safety import is_blocked_query, sanitize_content
50
+
51
+ logger = logging.getLogger(__name__)
52
+
53
# --- Feed cache -------------------------------------------------------------
# Directory holding one JSON file per cached feed URL.
CACHE_DIR = Path("/tmp/voidaccess_rss_cache")
# Lifetime of a cached feed, in seconds (one hour).
CACHE_TTL_SECONDS = 60 * 60

# --- Article limits ---------------------------------------------------------
# Articles older than this many days are excluded when scoring.
MAX_ARTICLE_AGE_DAYS = 90
# Cap on articles kept from any single feed.
MAX_ARTICLES_PER_FEED = 3
# Default cap on articles returned across all feeds.
MAX_TOTAL_ARTICLES = 20
# Fetched article HTML is truncated to this many characters (100 * 1024).
MAX_ARTICLE_SIZE = 1024 * 100
60
+
61
# Curated feed table. Each row is (name, url, category, tags, weight) and is
# expanded below into the dict shape the scraper reads via feed["name"],
# feed["url"], feed.get("category"), feed.get("tags") and feed.get("weight").
_FEED_FIELDS = ("name", "url", "category", "tags", "weight")

_FEED_ROWS = [
    # -- journalism --
    ("Krebs on Security", "https://krebsonsecurity.com/feed/", "journalism",
     ["breach", "fraud", "cybercrime", "ransomware", "dark web", "banking"], 10),
    ("BleepingComputer", "https://www.bleepingcomputer.com/feed/", "journalism",
     ["ransomware", "malware", "breach", "vulnerability", "darkweb", "leak"], 10),
    ("The Record by Recorded Future", "https://therecord.media/feed", "journalism",
     ["cybercrime", "espionage", "ransomware", "government", "critical infrastructure"], 9),
    ("Dark Reading", "https://www.darkreading.com/rss.xml", "journalism",
     ["vulnerability", "threat", "attack", "malware", "breach", "security"], 8),
    ("SecurityWeek", "https://feeds.feedburner.com/Securityweek", "journalism",
     ["vulnerability", "ransomware", "breach", "malware", "exploit"], 8),
    ("Threatpost", "https://threatpost.com/feed/", "journalism",
     ["vulnerability", "ransomware", "malware", "breach", "APT"], 7),
    # -- technical --
    ("SANS Internet Storm Center", "https://isc.sans.edu/rssfeed_full.xml", "technical",
     ["IOC", "malware", "exploit", "vulnerability", "incident"], 9),
    ("Malwarebytes Labs", "https://www.malwarebytes.com/blog/feed/", "technical",
     ["malware", "ransomware", "threat", "stealer", "trojan", "adware"], 8),
    ("Cisco Talos Intelligence", "https://blog.talosintelligence.com/rss/", "technical",
     ["malware", "IOC", "APT", "exploit", "vulnerability", "threat actor"], 10),
    ("Sophos News", "https://news.sophos.com/en-us/feed/", "technical",
     ["ransomware", "malware", "threat", "exploit", "attack"], 8),
    # -- threat intel --
    ("Mandiant Blog", "https://www.mandiant.com/resources/blog/rss.xml", "threat_intel",
     ["APT", "threat actor", "espionage", "malware", "incident response", "zero day"], 10),
    ("CrowdStrike Blog", "https://www.crowdstrike.com/blog/feed/", "threat_intel",
     ["APT", "threat actor", "ransomware", "malware", "eCrime", "adversary"], 10),
    ("Secureworks CTU", "https://www.secureworks.com/rss?feed=blog", "threat_intel",
     ["threat actor", "malware", "APT", "ransomware", "darkweb", "TTPs"], 9),
    # -- government --
    ("US-CERT Alerts", "https://www.cisa.gov/uscert/ncas/alerts.xml", "government",
     ["vulnerability", "alert", "advisory", "critical infrastructure", "KEV"], 10),
    ("CISA News", "https://www.cisa.gov/news.xml", "government",
     ["vulnerability", "advisory", "ransomware", "critical infrastructure"], 9),
    ("FBI Cyber Division News", "https://www.fbi.gov/feeds/fbi-in-the-news/rss.xml", "government",
     ["cybercrime", "ransomware", "darkweb", "arrest", "seizure", "takedown"], 9),
    # -- more threat intel / technical --
    ("Recorded Future Intelligence", "https://www.recordedfuture.com/feed", "threat_intel",
     ["threat actor", "dark web", "IOC", "malware", "vulnerability", "APT"], 9),
    ("Palo Alto Unit 42", "https://unit42.paloaltonetworks.com/feed/", "threat_intel",
     ["malware", "APT", "threat actor", "ransomware", "phishing", "exploit"], 10),
    ("Microsoft Security Blog", "https://www.microsoft.com/en-us/security/blog/feed/", "threat_intel",
     ["APT", "ransomware", "vulnerability", "threat actor", "malware", "nation state"], 9),
    ("Google Project Zero", "https://googleprojectzero.blogspot.com/feeds/posts/default", "technical",
     ["zero day", "exploit", "vulnerability", "CVE", "browser", "kernel"], 9),
]

RSS_FEEDS = [dict(zip(_FEED_FIELDS, row)) for row in _FEED_ROWS]
203
+
204
+ _KNOWN_ACTORS = [
205
+ "lockbit", "blackcat", "alphv", "cl0p", "clop", "play", "akira",
206
+ "blackbasta", "black basta", "revil", "conti", "ryuk", "maze",
207
+ "darkside", "hive", "ragnarlocker", "cobalt strike", "metasploit",
208
+ "mimikatz", "beacon", "sliver", "havoc", "brute ratel", "covenant",
209
+ "lazarus", "apt28", "apt29", "cozy bear", "fancy bear",
210
+ "sandworm", "volt typhoon", "scattered spider", "lapsus",
211
+ ]
212
+
213
+
214
class RSSCache:
    """Best-effort, file-based cache for parsed RSS feed article lists.

    Each feed URL maps to one JSON file, named after the MD5 hex digest of
    the URL, holding ``{"cached_at": <epoch seconds>, "articles": [...]}``.
    Every operation is best-effort: failures are logged at debug level and
    reported to the caller as a cache miss (``get``) or ignored (``set``).

    Args:
        cache_dir: Directory for cache files. Defaults to ``CACHE_DIR``.
        ttl_seconds: Maximum entry age in seconds. Defaults to
            ``CACHE_TTL_SECONDS``.
    """

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        ttl_seconds: Optional[float] = None,
    ):
        self._dir = Path(cache_dir) if cache_dir is not None else CACHE_DIR
        self._ttl = CACHE_TTL_SECONDS if ttl_seconds is None else ttl_seconds
        try:
            self._dir.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            # An unusable cache directory must not prevent scraping; later
            # get()/set() calls simply miss or fail quietly.
            logger.debug("RSS cache directory unavailable: %s", e)

    def _cache_path(self, url: str) -> Path:
        """Return the cache file path for *url*."""
        # MD5 is used only to derive a file name, not for security.
        key = hashlib.md5(url.encode()).hexdigest()
        return self._dir / f"{key}.json"

    def get(self, url: str) -> Optional[list]:
        """Return the cached article list for *url*, or ``None`` on a miss.

        A miss is reported when the file is absent, unreadable, malformed,
        or older than the TTL. Expired files are deleted.
        """
        path = self._cache_path(url)
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return None
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            logger.debug("RSS cache read failed: %s", e)
            return None

        if not isinstance(data, dict):
            return None
        cached_at = data.get("cached_at", 0)
        articles = data.get("articles", [])
        if not isinstance(cached_at, (int, float)) or not isinstance(articles, list):
            return None

        if time.time() - cached_at > self._ttl:
            try:
                path.unlink(missing_ok=True)
            except OSError as e:
                logger.debug("RSS cache cleanup failed: %s", e)
            return None
        return articles

    def set(self, url: str, articles: list) -> None:
        """Store *articles* for *url*, stamped with the current time."""
        path = self._cache_path(url)
        tmp_path = path.with_suffix(".tmp")
        try:
            payload = json.dumps({
                "cached_at": time.time(),
                "articles": articles,
            })
            # Write to a sibling file and rename so that a reader never sees
            # a partially written JSON document.
            tmp_path.write_text(payload, encoding="utf-8")
            tmp_path.replace(path)
        except (OSError, TypeError, ValueError) as e:
            logger.debug("RSS cache write failed: %s", e)
247
+
248
+
249
class RSSFeedScraper:
    """Fetch, parse and rank articles from the feeds listed in ``RSS_FEEDS``.

    Intended for use as an async context manager: ``__aenter__`` opens the
    shared ``aiohttp`` session and ``__aexit__`` closes it. Without an open
    session the network helpers return no data.
    """

    # Upper bound on entries read from a single feed document.
    _MAX_ENTRIES_PER_FEED = 20

    # xmlns / xmlns:prefix declarations, with either quote style.
    _XMLNS_DECL_RE = re.compile(
        r"""\s+xmlns(?::[a-zA-Z0-9_.-]+)?\s*=\s*(?:"[^"]*"|'[^']*')"""
    )
    # "<ns:tag" / "</ns:tag" element names; a literal "http" prefix is left alone.
    _NS_PREFIX_RE = re.compile(
        r"<(/?)(?![hH][tT][tT][pP]:)([a-zA-Z][a-zA-Z0-9_]*):([a-zA-Z][a-zA-Z0-9_]*)"
    )
    _SCRIPT_RE = re.compile(r"<script\b[^>]*>.*?</script\s*>", re.DOTALL | re.IGNORECASE)
    _STYLE_RE = re.compile(r"<style\b[^>]*>.*?</style\s*>", re.DOTALL | re.IGNORECASE)
    _TAG_RE = re.compile(r"<[^>]+>")
    _WHITESPACE_RE = re.compile(r"\s+")
    _CVE_RE = re.compile(r"CVE-\d{4}-\d+", re.IGNORECASE)

    def __init__(self):
        self._session: Optional[aiohttp.ClientSession] = None
        self._cache = RSSCache()

    async def __aenter__(self):
        self._session = aiohttp.ClientSession(
            headers={
                "User-Agent": "Mozilla/5.0 (compatible; RSS-Reader/1.0; +https://github.com/voidaccess/voidaccess)",
                "Accept": (
                    "application/rss+xml, application/atom+xml, "
                    "application/xml, text/xml"
                ),
            },
            timeout=aiohttp.ClientTimeout(total=15),
        )
        return self

    async def __aexit__(self, *args):
        if self._session:
            await self._session.close()
            # Drop the closed session so later calls take the "no session" path.
            self._session = None

    async def fetch_relevant_articles(
        self,
        query: str,
        refined_query: str = "",
        max_results: int = MAX_TOTAL_ARTICLES,
    ) -> list[dict]:
        """Fetch articles from every feed and return the best-ranked ones.

        Args:
            query: The investigation query. If ``is_blocked_query`` flags it,
                nothing is fetched.
            refined_query: Optional refined form of the query; it contributes
                additional search terms.
            max_results: Maximum number of articles returned.

        Returns:
            Page dicts (see ``_fetch_feed``), de-duplicated by URL and sorted
            by ``relevance * feed_weight`` in descending order.
        """
        blocked, _ = is_blocked_query(query)
        if blocked:
            logger.warning("RSS scraping blocked — prohibited query")
            return []

        search_terms = self._extract_search_terms(query, refined_query)

        logger.info(
            "RSS feeds: fetching for '%s' (%d feeds)",
            query[:50],
            len(RSS_FEEDS),
        )

        tasks = [self._fetch_feed(feed, search_terms) for feed in RSS_FEEDS]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        all_articles: list[dict] = []
        seen_urls: set[str] = set()

        for feed, result in zip(RSS_FEEDS, results):
            if not isinstance(result, list):
                # gather(return_exceptions=True) hands back the exception
                # object; one failing feed must not hide the others.
                logger.debug("RSS feed failed %s: %s", feed.get("name"), result)
                continue
            for article in result:
                url = article.get("url", "")
                if url and url not in seen_urls:
                    seen_urls.add(url)
                    all_articles.append(article)

        all_articles.sort(
            key=lambda x: x.get("relevance", 0) * x.get("feed_weight", 1),
            reverse=True,
        )

        final = all_articles[:max_results]
        logger.info("RSS feeds: %d relevant articles found", len(final))
        return final

    def _extract_search_terms(self, query: str, refined_query: str) -> list[str]:
        """Build the lower-cased term list used for relevance matching.

        The result contains every word longer than three characters, the
        full query and refined query, any CVE identifiers, and any entry of
        ``_KNOWN_ACTORS`` that occurs in the text. It is returned sorted and
        free of duplicates and empty strings.
        """
        query_lc = query.lower().strip()
        refined_lc = refined_query.lower().strip()
        text = f"{query_lc} {refined_lc}"

        terms = {word for word in re.split(r"\W+", text) if len(word) > 3}
        # Whole phrases are kept so that multi-word queries can match verbatim.
        terms.update(phrase for phrase in (query_lc, refined_lc) if phrase)
        terms.update(cve.lower() for cve in self._CVE_RE.findall(text))
        terms.update(actor for actor in _KNOWN_ACTORS if actor in text)
        return sorted(terms)

    async def _fetch_feed(self, feed: dict, search_terms: list[str]) -> list[dict]:
        """Return up to ``MAX_ARTICLES_PER_FEED`` page dicts for one feed.

        The parsed feed comes from the cache when available, otherwise from
        the network (and is then cached). Candidates are tried in descending
        score order. An article is skipped when it scores zero, has fewer
        than 100 characters of content, or is rejected by
        ``sanitize_content``.
        """
        feed_url = feed["url"]
        feed_name = feed["name"]

        cached = self._cache.get(feed_url)
        if cached is not None:
            logger.debug("RSS cache hit: %s", feed_name)
            raw_articles = cached
        else:
            raw_articles = await self._fetch_and_parse(feed_url, feed_name)
            if raw_articles:
                self._cache.set(feed_url, raw_articles)

        if not raw_articles:
            return []

        scored = []
        for article in raw_articles:
            relevance = self._score_article(article, search_terms, feed)
            if relevance > 0:
                scored.append((relevance, article))
        # Best candidates first; the sort is stable, so feed order breaks ties.
        scored.sort(key=lambda pair: pair[0], reverse=True)

        relevant: list[dict] = []
        for relevance, article in scored:
            full_content = await self._fetch_article_content(article.get("url", ""))
            content = full_content or article.get("summary", "")

            if not content or len(content.strip()) < 100:
                continue

            clean, flagged = sanitize_content(content)
            if flagged or not clean:
                continue

            relevant.append({
                "url": article.get("url", ""),
                # The parser always stores a "title" key, so fall back on an
                # empty value rather than on a missing key.
                "title": article.get("title") or feed_name,
                "text_content": clean,
                "source_type": "rss_feed",
                "source_name": feed_name,
                "feed_category": feed.get("category", "unknown"),
                "published_at": article.get("published", ""),
                "relevance": relevance,
                "feed_weight": feed.get("weight", 5),
                "scraped_at": datetime.now(timezone.utc).isoformat(),
                "word_count": len(clean.split()),
            })

            if len(relevant) >= MAX_ARTICLES_PER_FEED:
                break

        return relevant

    async def _fetch_and_parse(self, feed_url: str, feed_name: str) -> list[dict]:
        """Download one feed and parse it; return ``[]`` on any failure."""
        if not self._session:
            return []
        try:
            async with self._session.get(feed_url, allow_redirects=True) as resp:
                if resp.status != 200:
                    return []
                content = await resp.text(encoding="utf-8", errors="ignore")
            return self._parse_feed(content, feed_url)
        except asyncio.TimeoutError:
            logger.debug("RSS timeout: %s", feed_name)
            return []
        except Exception as e:
            # Network boundary: any client error means "no articles".
            logger.debug("RSS fetch error %s: %s", feed_name, e)
            return []

    def _parse_feed(self, content: str, feed_url: str) -> list[dict]:
        """Parse RSS 2.0 or Atom XML into article dicts.

        Returns:
            A list of dicts with the keys ``url``, ``title``, ``summary`` and
            ``published`` (all strings). Entries without a link are dropped.
            An empty list is returned when the document cannot be parsed.
        """
        # NOTE(review): ElementTree is not hardened against malicious XML.
        # Documents that declare their own entities are refused as a cheap
        # guard against entity-expansion payloads.
        if "<!ENTITY" in content:
            logger.debug("RSS parse skipped (entity declarations): %s", feed_url)
            return []
        try:
            # Drop namespace declarations and rewrite "ns:tag" to "ns_tag" so
            # plain tag names work with find()/findall().
            content = self._XMLNS_DECL_RE.sub("", content)
            content = self._NS_PREFIX_RE.sub(r"<\1\2_\3", content)

            root = ET.fromstring(content)
            if "feed" in root.tag.lower():
                return self._parse_atom_entries(root)
            return self._parse_rss_items(root)
        except ET.ParseError as e:
            logger.debug("RSS parse error %s: %s", feed_url, e)
        except Exception as e:
            logger.debug("RSS parse unexpected error: %s", e)
        return []

    def _parse_atom_entries(self, root) -> list[dict]:
        """Extract article dicts from the ``entry`` children of an Atom root."""
        articles: list[dict] = []
        for entry in root.findall("entry")[: self._MAX_ENTRIES_PER_FEED]:
            url = ""
            for link in entry.findall("link"):
                if link.get("rel") in ("alternate", None):
                    url = link.get("href", "")
                    break
            if not url:
                continue
            articles.append({
                "url": url,
                "title": self._element_text(entry.find("title")),
                "summary": self._strip_html(
                    self._element_text(self._first_child(entry, "summary", "content"))
                ),
                "published": self._element_text(
                    self._first_child(entry, "published", "updated")
                ),
            })
        return articles

    def _parse_rss_items(self, root) -> list[dict]:
        """Extract article dicts from the ``item`` elements of an RSS document."""
        channel = root.find("channel")
        if channel is None:
            channel = root
        # Fall back to items placed directly under the root element.
        items = channel.findall("item") or root.findall("item")

        articles: list[dict] = []
        for item in items[: self._MAX_ENTRIES_PER_FEED]:
            url = self._element_text(item.find("link"))
            if not url:
                continue
            articles.append({
                "url": url,
                "title": self._element_text(item.find("title")),
                "summary": self._strip_html(
                    self._element_text(
                        self._first_child(item, "description", "content_encoded")
                    )
                ),
                "published": self._element_text(
                    self._first_child(item, "pubDate", "dc_date")
                ),
            })
        return articles

    @staticmethod
    def _first_child(parent, *tags: str):
        """Return the first existing child of *parent* among *tags*, else ``None``."""
        for tag in tags:
            child = parent.find(tag)
            # Compare against None explicitly: an Element without children is
            # falsy, so "find(a) or find(b)" would skip a valid leaf element.
            if child is not None:
                return child
        return None

    @staticmethod
    def _element_text(element) -> str:
        """Return the stripped text of *element* (including nested text), or ``""``."""
        if element is None:
            return ""
        return "".join(element.itertext()).strip()

    async def _fetch_article_content(self, url: str) -> Optional[str]:
        """Fetch an article page and return its plain text.

        Returns ``None`` when there is no URL or session, the response is not
        HTTP 200, the request fails, or the extracted text is 100 characters
        or shorter.
        """
        if not url or not self._session:
            return None
        try:
            async with self._session.get(
                url,
                allow_redirects=True,
                timeout=aiohttp.ClientTimeout(total=10),
            ) as resp:
                if resp.status != 200:
                    return None
                html = await resp.text(encoding="utf-8", errors="ignore")
            if len(html) > MAX_ARTICLE_SIZE:
                html = html[:MAX_ARTICLE_SIZE]
            text = self._extract_article_text(html)
            return text if len(text) > 100 else None
        except Exception as e:
            # Best effort: the caller falls back to the feed summary.
            logger.debug("RSS article fetch error %s: %s", url, e)
            return None

    def _extract_article_text(self, html: str) -> str:
        """Remove scripts, styles and tags, decode entities, collapse whitespace."""
        text = self._SCRIPT_RE.sub(" ", html)
        text = self._STYLE_RE.sub(" ", text)
        text = self._TAG_RE.sub(" ", text)
        # Entities are decoded after tag removal so that escaped markup such
        # as "&lt;b&gt;" survives as literal text.
        text = self._unescape(text)
        return self._WHITESPACE_RE.sub(" ", text).strip()

    def _strip_html(self, html: str) -> str:
        """Remove HTML tags, decode entities and collapse whitespace."""
        text = self._TAG_RE.sub(" ", html)
        text = self._unescape(text)
        return self._WHITESPACE_RE.sub(" ", text).strip()

    @staticmethod
    def _unescape(text: str) -> str:
        """Decode HTML character references in a single pass."""
        # Imported here because callers use "html" as a parameter name.
        from html import unescape
        return unescape(text)

    @staticmethod
    def _parse_published(pub_str: str) -> Optional[datetime]:
        """Parse a feed date string into a timezone-aware ``datetime``.

        RFC 2822 dates are tried first, then ISO 8601, then ``dateutil`` when
        it is installed. Naive results are treated as UTC. Returns ``None``
        when no parser accepts the value.
        """
        from email.utils import parsedate_to_datetime

        def _iso(value: str) -> datetime:
            # Older fromisoformat() implementations reject a trailing "Z".
            if value.endswith(("Z", "z")):
                value = value[:-1] + "+00:00"
            return datetime.fromisoformat(value)

        def _dateutil(value: str) -> datetime:
            import dateutil.parser
            return dateutil.parser.parse(value)

        value = pub_str.strip()
        for parser in (parsedate_to_datetime, _iso, _dateutil):
            try:
                parsed = parser(value)
            except (ImportError, TypeError, ValueError, IndexError, OverflowError):
                continue
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed
        return None

    def _score_article(
        self,
        article: dict,
        search_terms: list[str],
        feed: dict,
    ) -> int:
        """Score an article's relevance to the search terms (0 = exclude).

        Scoring:
            * +5 for each term (longer than 3 characters) found in the title,
              otherwise +2 if it is found in the summary.
            * +1 for each term that equals one of the feed's tags.
            * Recency bonus: +5 up to 7 days old, +3 up to 30 days, +1 up to
              ``MAX_ARTICLE_AGE_DAYS``; +1 when the date cannot be parsed.

        Returns 0 when the article is older than ``MAX_ARTICLE_AGE_DAYS`` or
        when no term matched the title, summary or feed tags. The recency
        bonus alone never makes an article relevant.
        """
        title = (article.get("title") or "").lower()
        summary = (article.get("summary") or "").lower()

        recency = 0
        pub_str = article.get("published") or ""
        if pub_str:
            pub_dt = self._parse_published(pub_str)
            if pub_dt is None:
                recency = 1
            else:
                age_days = (datetime.now(timezone.utc) - pub_dt).days
                if age_days > MAX_ARTICLE_AGE_DAYS:
                    return 0
                if age_days <= 7:
                    recency = 5
                elif age_days <= 30:
                    recency = 3
                else:
                    recency = 1

        match_score = 0
        for term in search_terms:
            if len(term) <= 3:
                continue
            if term in title:
                match_score += 5
            elif term in summary:
                match_score += 2

        feed_tags = {tag.lower() for tag in feed.get("tags", [])}
        match_score += sum(1 for term in search_terms if term in feed_tags)

        if match_score == 0:
            return 0
        return recency + match_score
556
+
557
+
558
async def scrape_rss_feeds(
    query: str,
    refined_query: str = "",
    max_results: int = MAX_TOTAL_ARTICLES,
) -> list[dict]:
    """Scrape the curated RSS feeds for articles relevant to *query*.

    Main entry point of the module. Scraping is on by default and is skipped
    when the ``RSS_FEEDS_ENABLED`` environment variable is set to anything
    other than ``true``, ``1``, ``yes`` or ``on`` (case-insensitive,
    surrounding whitespace ignored), e.g. ``RSS_FEEDS_ENABLED=false``.

    Args:
        query: The investigation query.
        refined_query: Optional refined form of the query.
        max_results: Maximum number of articles to return.

    Returns:
        Page dicts produced by ``RSSFeedScraper.fetch_relevant_articles``,
        or an empty list when scraping is disabled.
    """
    flag = os.getenv("RSS_FEEDS_ENABLED", "true").strip().lower()
    # Accept the usual truthy spellings; a bare "!= 'true'" check would treat
    # RSS_FEEDS_ENABLED=1 as a request to disable the scraper.
    if flag not in ("true", "1", "yes", "on"):
        logger.info("RSS feeds disabled")
        return []

    async with RSSFeedScraper() as scraper:
        return await scraper.fetch_relevant_articles(
            query=query,
            refined_query=refined_query,
            max_results=max_results,
        )