voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,521 @@
1
+ """
2
+ sources/ip_reputation.py — IP reputation enrichment.
3
+
4
+ Checks extracted IP addresses against four sources:
5
+ - Feodo Tracker (abuse.ch): confirmed C2 IPs for banking trojans/ransomware loaders
6
+ - C2IntelFeeds (montysecurity/C2-Tracker): framework-specific C2 IPs
7
+ - AbuseIPDB: community abuse reports (requires ABUSEIPDB_API_KEY)
8
+ - GreyNoise: scanner classification (requires GREYNOISE_API_KEY)
9
+
10
+ GreyNoise "benign" IPs (known legitimate scanners) are SUPPRESSED from results.
11
+ Feodo Tracker and C2IntelFeeds are fully public and need no API key; the AbuseIPDB and GreyNoise checks are skipped when their API keys are not set.
12
+
13
+ Public interface
14
+ ----------------
15
+ async load_feodo_feed() → dict[ip, malware_family]
16
+ async load_c2_feeds() → dict[framework, set[ip]]
17
+ async check_ip_reputation(ip, base_conf) → dict with suppress/tags/threat_confidence
18
+ async enrich_ip_entities(extraction_results, investigation_id) → (results, stats)
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import asyncio
24
+ import csv
25
+ import ipaddress
26
+ import json
27
+ import logging
28
+ import os
29
+ import time
30
+ from typing import Any
31
+
32
+ import aiohttp
33
+
34
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Upper bound on the number of unique IPs checked per enrichment run.
MAX_IPS = 50

# Feodo Tracker (abuse.ch) IP blocklist in CSV form.
FEODO_CSV_URL = "https://feodotracker.abuse.ch/downloads/ipblocklist.csv"

# Framework slug -> raw text feed in the montysecurity/C2-Tracker repository.
# Insertion order is the order in which the feeds are fetched and logged.
C2_FEED_URLS: dict[str, str] = dict(
    cobalt_strike="https://raw.githubusercontent.com/montysecurity/C2-Tracker/main/data/Cobalt%20Strike%20C2%20IPs.txt",
    sliver="https://raw.githubusercontent.com/montysecurity/C2-Tracker/main/data/Sliver%20C2%20IPs.txt",
    metasploit="https://raw.githubusercontent.com/montysecurity/C2-Tracker/main/data/Metasploit%20Framework%20C2%20IPs.txt",
    brute_ratel="https://raw.githubusercontent.com/montysecurity/C2-Tracker/main/data/Brute%20Ratel%20C4%20IPs.txt",
    posh_c2="https://raw.githubusercontent.com/montysecurity/C2-Tracker/main/data/Posh%20C2%20IPs.txt",
    havoc="https://raw.githubusercontent.com/montysecurity/C2-Tracker/main/data/Havoc%20C2%20IPs.txt",
)

# In-memory feed caches (module-level singletons, refreshed on TTL expiry).
# Each entry holds the parsed feed under "ips" and the epoch time of the last
# load under "loaded_at"; 0.0 means the feed has not been loaded yet.
_feed_cache: dict[str, dict] = {
    feed_name: {"ips": {}, "loaded_at": 0.0}
    for feed_name in ("feodo", "c2feeds")
}
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Helpers
58
+ # ---------------------------------------------------------------------------
59
+
60
def _feed_ttl_seconds() -> float:
    """Return the feed cache lifetime in seconds.

    The lifetime is read from the ``C2_FEED_CACHE_TTL`` environment variable,
    interpreted as a number of hours. It falls back to 24 hours when the
    variable is unset or cannot be parsed as a float.
    """
    raw_hours = os.getenv("C2_FEED_CACHE_TTL", "24")
    try:
        ttl_hours = float(raw_hours)
    except ValueError:
        ttl_hours = 24.0
    return ttl_hours * 3600.0
66
+
67
+
68
def is_private_ip(ip: str) -> bool:
    """Tell whether *ip* is a non-routable / special-purpose address.

    Returns True when the address is private, loopback, reserved, link-local,
    multicast or unspecified. Surrounding whitespace is ignored. A string that
    does not parse as an IPv4/IPv6 address yields False.
    """
    try:
        parsed = ipaddress.ip_address(ip.strip())
    except ValueError:
        return False

    special_flags = (
        parsed.is_private,
        parsed.is_loopback,
        parsed.is_reserved,
        parsed.is_link_local,
        parsed.is_multicast,
        parsed.is_unspecified,
    )
    return any(special_flags)
82
+
83
+
84
def _parse_feodo_csv(csv_text: str) -> dict[str, str]:
    """Turn the Feodo Tracker ipblocklist.csv text into {ip: malware_family}.

    Blank lines and lines starting with ``#`` are dropped; the first remaining
    line is used as the CSV header. The IP is read from the ``dst_ip`` column
    (or ``ip_address``), the family from ``malware`` (or ``malware_family``).
    Rows without an IP are skipped and a missing family is stored as
    ``"unknown"``. A parse error is logged and whatever was parsed so far is
    returned.
    """
    blocklist: dict[str, str] = {}

    data_lines = []
    for raw_line in csv_text.splitlines():
        stripped = raw_line.strip()
        if stripped and not stripped.startswith("#"):
            data_lines.append(raw_line)

    if not data_lines:
        return blocklist

    try:
        for record in csv.DictReader(data_lines):
            address = (record.get("dst_ip") or record.get("ip_address") or "").strip()
            family = (record.get("malware") or record.get("malware_family") or "").strip()
            if not address:
                continue
            blocklist[address] = family if family else "unknown"
    except Exception as exc:
        logger.warning("ip_reputation: Feodo CSV parse error: %s", exc)

    return blocklist
104
+
105
+
106
def _parse_c2_txt(text: str) -> set[str]:
    """Parse a plain-text C2 IP list (one entry per line, optional comments).

    Blank lines and lines starting with ``#`` are ignored. For every other
    line a CIDR suffix (``/32``) and a port suffix (``1.2.3.4:8080``) are
    removed before the address is collected.

    IPv6 entries are kept intact: a bare IPv6 address is recognised by parsing
    it, and the bracketed ``[addr]:port`` form is unwrapped. Splitting such
    entries on the first ``:`` would reduce them to a meaningless fragment.

    Returns the set of address strings exactly as written in the feed (no
    canonicalisation is applied).
    """
    ips: set[str] = set()
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue

        # Drop the CIDR prefix length first; it can never contain a colon.
        host = entry.split("/", 1)[0].strip()

        if host.startswith("[") and "]" in host:
            # Bracketed IPv6 with optional port: [2001:db8::1]:443
            host = host[1:host.index("]")].strip()
        elif host.count(":") > 1:
            # Several colons: keep the token only if it really is IPv6,
            # otherwise fall back to the "text before the first colon" rule.
            try:
                ipaddress.IPv6Address(host)
            except ValueError:
                host = host.split(":", 1)[0].strip()
        else:
            # Zero or one colon: plain host or host:port.
            host = host.split(":", 1)[0].strip()

        if host:
            ips.add(host)
    return ips
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Feed loaders (cached, refreshed on TTL expiry)
122
+ # ---------------------------------------------------------------------------
123
+
124
async def load_feodo_feed() -> dict[str, str]:
    """Fetch and cache Feodo Tracker blocklist. Returns {ip: malware_family}.

    The cached mapping is served while it is younger than the configured TTL.
    Freshness is decided by the load timestamp alone, so a successfully
    downloaded but empty blocklist is cached as well instead of being
    re-downloaded on every call.

    On a non-200 response or any fetch/parse exception the problem is logged
    and a copy of the previously cached data (possibly empty) is returned; the
    cache timestamp is left untouched so the next call retries.
    """
    cache = _feed_cache["feodo"]
    loaded_at = cache["loaded_at"]
    # loaded_at stays 0.0 until the first successful download.
    if loaded_at > 0 and time.time() - loaded_at < _feed_ttl_seconds():
        return cache["ips"]  # type: ignore[return-value]

    logger.info("ip_reputation: Refreshing Feodo Tracker feed")
    try:
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(FEODO_CSV_URL) as resp:
                if resp.status != 200:
                    logger.warning("ip_reputation: Feodo returned HTTP %s", resp.status)
                    return dict(cache["ips"])
                text = await resp.text()

        parsed = _parse_feodo_csv(text)
        cache["ips"] = parsed
        cache["loaded_at"] = time.time()
        logger.info("ip_reputation: Feodo Tracker: %d C2 IPs loaded", len(parsed))
        return parsed
    except Exception as exc:
        # Best effort: enrichment must not fail because a feed is unreachable.
        logger.warning("ip_reputation: Feodo fetch failed: %s", exc)
        return dict(cache["ips"])
148
+
149
+
150
async def load_c2_feeds() -> dict[str, set[str]]:
    """Fetch and cache all C2IntelFeeds. Returns {framework: set_of_ips}.

    Cached data is served while it is younger than the configured TTL.
    Otherwise every feed in ``C2_FEED_URLS`` is downloaded concurrently.

    A feed that fails to download (non-200 status or exception) keeps its
    previously cached IP set rather than being replaced by an empty one, and
    the cache timestamp is only advanced when at least one feed was fetched
    successfully. A transient outage therefore neither wipes known-good data
    nor gets cached for a whole TTL period.

    The returned dict always contains one entry per configured framework.
    """
    cache = _feed_cache["c2feeds"]
    if time.time() - cache["loaded_at"] < _feed_ttl_seconds() and cache["ips"]:
        return cache["ips"]  # type: ignore[return-value]

    logger.info("ip_reputation: Refreshing C2IntelFeeds")

    async def _fetch_one(framework: str, url: str) -> tuple:
        """Return (framework, parsed IP set), or (framework, None) on failure."""
        try:
            timeout = aiohttp.ClientTimeout(total=20)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url) as resp:
                    if resp.status != 200:
                        logger.debug("ip_reputation: C2Feed %s → HTTP %s", framework, resp.status)
                        return framework, None
                    text = await resp.text()
            return framework, _parse_c2_txt(text)
        except Exception as exc:
            logger.debug("ip_reputation: C2Feed %s fetch failed: %s", framework, exc)
            return framework, None

    fetched = await asyncio.gather(*[_fetch_one(fw, url) for fw, url in C2_FEED_URLS.items()])

    previous: dict = cache["ips"]
    results: dict[str, set[str]] = {}
    any_success = False
    for framework, ips in fetched:
        if ips is None:
            # Fetch failed: fall back to the last known-good set for this feed.
            ips = set(previous.get(framework, ()))
        else:
            any_success = True
        results[framework] = ips
        logger.info("ip_reputation: C2Feed %-14s %d IPs", framework, len(ips))

    cache["ips"] = results
    if any_success:
        # Leave the timestamp stale after a total failure so the next call retries.
        cache["loaded_at"] = time.time()
    return results
182
+
183
+
184
+ # ---------------------------------------------------------------------------
185
+ # External API checks
186
+ # ---------------------------------------------------------------------------
187
+
188
async def _check_abuseipdb(ip: str, api_key: str) -> dict:
    """Look up *ip* via the AbuseIPDB v2 ``/check`` endpoint.

    Reports from the last 90 days are requested. Returns the decoded JSON
    body on HTTP 200; any other status or any exception is logged at debug
    level and yields an empty dict.
    """
    endpoint = "https://api.abuseipdb.com/api/v2/check"
    request_headers = {"Key": api_key, "Accept": "application/json"}
    query = {"ipAddress": ip, "maxAgeInDays": 90}
    try:
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=15)) as http:
            async with http.get(endpoint, headers=request_headers, params=query) as reply:
                if reply.status == 200:
                    return await reply.json()
                logger.debug("ip_reputation: AbuseIPDB → HTTP %s for %s", reply.status, ip)
                return {}
    except Exception as failure:
        logger.debug("ip_reputation: AbuseIPDB check failed for %s: %s", ip, failure)
        return {}
207
+
208
+
209
async def _check_greynoise(ip: str, api_key: str) -> dict:
    """Look up *ip* via the GreyNoise community endpoint.

    Returns the decoded JSON body on HTTP 200. A 404 is reported as
    ``{"classification": "unknown"}``. Any other status or any exception is
    logged at debug level and yields an empty dict.
    """
    endpoint = f"https://api.greynoise.io/v3/community/{ip}"
    try:
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=15)) as http:
            async with http.get(endpoint, headers={"key": api_key}) as reply:
                status = reply.status
                if status == 404:
                    return {"classification": "unknown"}
                if status == 200:
                    return await reply.json()
                logger.debug("ip_reputation: GreyNoise → HTTP %s for %s", status, ip)
                return {}
    except Exception as failure:
        logger.debug("ip_reputation: GreyNoise check failed for %s: %s", ip, failure)
        return {}
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+ # Core reputation check
232
+ # ---------------------------------------------------------------------------
233
+
234
async def check_ip_reputation(
    ip: str,
    base_confidence: float = 1.0,
) -> dict[str, Any]:
    """
    Run all four reputation checks for a single IP address.

    Private / special-purpose addresses (per ``is_private_ip``) are returned
    with the default result and no feed or API is consulted. The AbuseIPDB
    and GreyNoise lookups only run when ``ABUSEIPDB_API_KEY`` /
    ``GREYNOISE_API_KEY`` are set.

    A GreyNoise classification of ``"benign"`` sets ``suppress`` and returns
    immediately, leaving ``threat_confidence`` at *base_confidence*.

    Unexpected API payloads (``"data": null``, a non-dict body, a non-numeric
    score) are treated as "no information" instead of raising, so a malformed
    response cannot discard Feodo / C2 feed hits already found for this IP.

    Returns a dict with keys:
    ip, feodo_hit, feodo_malware, c2feed_hit, c2feed_framework,
    abuseipdb_score, abuseipdb_categories, greynoise_classification,
    suppress, tags, threat_confidence
    """
    result: dict[str, Any] = {
        "ip": ip,
        "feodo_hit": False,
        "feodo_malware": None,
        "c2feed_hit": False,
        "c2feed_framework": None,
        "abuseipdb_score": None,
        "abuseipdb_categories": [],
        "greynoise_classification": None,
        "suppress": False,
        "tags": [],
        "threat_confidence": base_confidence,
    }

    if is_private_ip(ip):
        return result

    abuseipdb_key = (os.getenv("ABUSEIPDB_API_KEY") or "").strip()
    greynoise_key = (os.getenv("GREYNOISE_API_KEY") or "").strip()

    # Load local feeds (both are cached — near-instant after first load)
    feodo_data, c2feeds_data = await asyncio.gather(
        load_feodo_feed(),
        load_c2_feeds(),
    )

    # --- Feodo Tracker check ---
    if ip in feodo_data:
        malware = feodo_data[ip]
        result["feodo_hit"] = True
        result["feodo_malware"] = malware
        result["tags"].append("confirmed_c2")
        if malware and malware.lower() != "unknown":
            slug = malware.lower().replace(" ", "_").replace("-", "_")
            result["tags"].append(f"confirmed_c2_{slug}")

    # --- C2IntelFeeds check ---
    # Only the first framework whose feed lists the IP is recorded.
    for framework, ips in c2feeds_data.items():
        if ip in ips:
            result["c2feed_hit"] = True
            result["c2feed_framework"] = framework
            if "confirmed_c2" not in result["tags"]:
                result["tags"].append("confirmed_c2")
            result["tags"].append(f"confirmed_c2_{framework}")
            break

    # --- AbuseIPDB check ---
    if abuseipdb_key:
        abuse_resp = await _check_abuseipdb(ip, abuseipdb_key)
        # "data" may be absent, null or not an object; treat all as no data.
        data = abuse_resp.get("data") if isinstance(abuse_resp, dict) else None
        if isinstance(data, dict):
            score = data.get("abuseConfidenceScore")
            if isinstance(score, bool) or not isinstance(score, (int, float)):
                # Non-numeric scores cannot be compared against thresholds.
                score = None
            result["abuseipdb_score"] = score
            # usageType is a string; categories come from individual reports
            usage = data.get("usageType")
            result["abuseipdb_categories"] = [usage] if usage else []
            if score is not None and score > 50:
                result["tags"].append("abuse_confirmed")
    else:
        logger.debug("ip_reputation: AbuseIPDB skipped — no API key")

    # --- GreyNoise check ---
    if greynoise_key:
        gn_resp = await _check_greynoise(ip, greynoise_key)
        if gn_resp and isinstance(gn_resp, dict):
            classification = gn_resp.get("classification", "unknown")
            result["greynoise_classification"] = classification

            if classification == "benign":
                # NOTE(review): this also suppresses IPs that hit Feodo or a
                # C2 feed above — confirm that is the intended precedence.
                result["suppress"] = True
                logger.info("IP %s suppressed — GreyNoise benign scanner", ip)
                return result

            if classification == "malicious":
                result["tags"].append("greynoise_malicious")
                gn_tags = gn_resp.get("tags")
                if isinstance(gn_tags, (list, tuple)):
                    for gn_tag in gn_tags:
                        slug = str(gn_tag).lower().replace(" ", "_")
                        result["tags"].append(f"greynoise_{slug}")
    else:
        logger.debug("ip_reputation: GreyNoise skipped — no API key")

    # --- Threat confidence calculation ---
    # Each corroborating source adds a fixed boost; the total is capped at 1.0.
    conf = base_confidence
    if result["feodo_hit"]:
        conf += 0.15
    if result["c2feed_hit"]:
        conf += 0.15
    score = result["abuseipdb_score"]
    if score is not None:
        if score > 80:
            conf += 0.10
        elif score >= 50:
            # NOTE(review): a score of exactly 50 gets this boost but not the
            # "abuse_confirmed" tag (which requires > 50) — confirm intent.
            conf += 0.05
    if "greynoise_malicious" in result["tags"]:
        conf += 0.10
    result["threat_confidence"] = min(conf, 1.0)

    return result
344
+
345
+
346
+ # ---------------------------------------------------------------------------
347
+ # DB helpers (sync — called via asyncio.to_thread or direct from sync context)
348
+ # ---------------------------------------------------------------------------
349
+
350
def _suppress_entities_in_db(suppressed_ips: set[str], investigation_id: Any) -> None:
    """Detach suppressed IP entities from one investigation in the database.

    Does nothing when ``DATABASE_URL`` is unset or *suppressed_ips* is empty.
    Otherwise it looks up the ``IP_ADDRESS`` entities whose value is in
    *suppressed_ips*, deletes their link rows for *investigation_id*, and
    clears ``investigation_id`` on those entities that belong to it. Any
    exception is logged as a warning and swallowed.
    """
    if not (os.getenv("DATABASE_URL") and suppressed_ips):
        return
    try:
        from db.session import get_session
        from db.models import Entity, InvestigationEntityLink

        with get_session() as db:
            matching_rows = db.query(Entity.id).filter(
                Entity.entity_type == "IP_ADDRESS",
                Entity.value.in_(suppressed_ips),
            ).all()
            ip_entity_ids = [row[0] for row in matching_rows]
            if not ip_entity_ids:
                return

            # Remove the investigation <-> entity link rows.
            link_query = db.query(InvestigationEntityLink).filter(
                InvestigationEntityLink.investigation_id == investigation_id,
                InvestigationEntityLink.entity_id.in_(ip_entity_ids),
            )
            link_query.delete(synchronize_session=False)

            # Clear the direct investigation reference on the entities.
            owned_query = db.query(Entity).filter(
                Entity.investigation_id == investigation_id,
                Entity.id.in_(ip_entity_ids),
            )
            owned_query.update({"investigation_id": None}, synchronize_session=False)

            db.commit()
            logger.info(
                "ip_reputation: Suppressed %d IP(s) from investigation %s",
                len(ip_entity_ids),
                investigation_id,
            )
    except Exception as exc:
        logger.warning("ip_reputation: DB suppression failed: %s", exc)
387
+
388
+
389
def _update_entity_reputations(
    updates: list[tuple[str, float, list[str]]],
) -> None:
    """
    Update confidence and corroborating_sources for non-suppressed IP entities.

    *updates* is a list of (ip_value, new_confidence, tags).

    Does nothing when ``DATABASE_URL`` is unset or *updates* is empty. For
    each IP the first matching ``IP_ADDRESS`` entity is updated: confidence
    is only ever raised, and tags are merged into the JSON list stored in
    ``corroborating_sources`` without duplicates.

    A row whose stored ``corroborating_sources`` is not a valid JSON list is
    left untouched for tags (its confidence is still updated) so that one bad
    row cannot abort the whole batch before the commit. Any other exception
    is logged as a warning and swallowed.
    """
    if not os.getenv("DATABASE_URL") or not updates:
        return
    try:
        from db.session import get_session
        from db.models import Entity

        with get_session() as session:
            for ip_val, confidence, tags in updates:
                db_entity = session.query(Entity).filter(
                    Entity.entity_type == "IP_ADDRESS",
                    Entity.value == ip_val,
                ).first()
                if db_entity is None:
                    continue
                if confidence > (db_entity.confidence or 0.0):
                    db_entity.confidence = confidence
                if not tags:
                    continue
                try:
                    existing = json.loads(db_entity.corroborating_sources or "[]")
                except (TypeError, ValueError) as exc:
                    logger.debug(
                        "ip_reputation: unreadable corroborating_sources for %s: %s",
                        ip_val, exc,
                    )
                    continue
                if not isinstance(existing, list):
                    # Unexpected shape; do not overwrite data we cannot merge into.
                    continue
                for tag in tags:
                    if tag not in existing:
                        existing.append(tag)
                db_entity.corroborating_sources = json.dumps(existing)
            session.commit()
    except Exception as exc:
        logger.warning("ip_reputation: DB reputation update failed: %s", exc)
422
+
423
+
424
+ # ---------------------------------------------------------------------------
425
+ # Pipeline integration
426
+ # ---------------------------------------------------------------------------
427
+
428
async def enrich_ip_entities(
    extraction_results: list,
    investigation_id: Any,
) -> tuple[list, dict]:
    """
    Post-extraction IP reputation enrichment step.

    - Collects IP_ADDRESS entities from *extraction_results*
    - Limits to MAX_IPS unique IPs per investigation
    - Warms the feed caches once, then checks all IPs concurrently
    - Suppresses benign scanner IPs (GreyNoise benign): removes from results + DB
    - Updates confidence and corroborating_sources for remaining IPs

    An entity whose ``confidence`` is missing or ``None`` is treated as 1.0,
    and a result whose ``entities`` is ``None`` is treated as having none.
    A check that raises for one IP is logged and that IP is skipped.

    Returns (filtered_extraction_results, stats_dict). *extraction_results*
    is modified in place when IPs are suppressed.
    """
    # Collect unique IPs → confidence of the first entity seen for that IP
    seen: dict[str, float] = {}
    for exr in extraction_results:
        for entity in getattr(exr, "entities", None) or []:
            if getattr(entity, "entity_type", "") != "IP_ADDRESS":
                continue
            ip = entity.value
            if ip in seen:
                continue
            confidence = getattr(entity, "confidence", None)
            # None would break the arithmetic and comparisons further down.
            seen[ip] = 1.0 if confidence is None else confidence

    unique_ips = list(seen.keys())
    if not unique_ips:
        return extraction_results, {"ip_reputation": "ok_0_ips"}

    if len(unique_ips) > MAX_IPS:
        logger.info(
            "ip_reputation: capping to %d of %d unique IPs",
            MAX_IPS, len(unique_ips),
        )
        unique_ips = unique_ips[:MAX_IPS]

    logger.info("ip_reputation: checking %d unique IP(s)", len(unique_ips))

    # Warm the feed caches once before fanning out. Otherwise, on a cold
    # cache, every concurrent check would start its own download of each feed.
    if any(isinstance(ip, str) and not is_private_ip(ip) for ip in unique_ips):
        await asyncio.gather(
            load_feodo_feed(),
            load_c2_feeds(),
            return_exceptions=True,
        )

    # Run all checks concurrently
    rep_list = await asyncio.gather(
        *[check_ip_reputation(ip, base_confidence=seen[ip]) for ip in unique_ips],
        return_exceptions=True,
    )

    suppressed_ips: set[str] = set()
    db_updates: list[tuple[str, float, list[str]]] = []
    stats = {
        "checked": len(unique_ips),
        "suppressed": 0,
        "c2_confirmed": 0,
        "abuse_confirmed": 0,
    }

    for ip, rep in zip(unique_ips, rep_list):
        if isinstance(rep, Exception):
            logger.debug("ip_reputation: check raised for %s: %s", ip, rep)
            continue
        if rep["suppress"]:
            suppressed_ips.add(ip)
            stats["suppressed"] += 1
            continue
        if rep["c2feed_hit"] or rep["feodo_hit"]:
            stats["c2_confirmed"] += 1
        if (rep["abuseipdb_score"] or 0) > 50:
            stats["abuse_confirmed"] += 1
        if rep["tags"] or rep["threat_confidence"] > seen[ip]:
            db_updates.append((ip, rep["threat_confidence"], rep["tags"]))

    # Apply suppression to in-memory extraction results
    if suppressed_ips:
        for exr in extraction_results:
            exr.entities = [
                e for e in getattr(exr, "entities", None) or []
                if not (
                    getattr(e, "entity_type", "") == "IP_ADDRESS"
                    and e.value in suppressed_ips
                )
            ]
            exr.entity_count = len(exr.entities)
        # DB helpers are synchronous; keep them off the event loop.
        await asyncio.to_thread(_suppress_entities_in_db, suppressed_ips, investigation_id)

    # Update DB for non-suppressed IPs
    if db_updates:
        await asyncio.to_thread(_update_entity_reputations, db_updates)

    checked = stats["checked"]
    sup = stats["suppressed"]
    status = f"ok_{checked}_ips" + (f"_{sup}_suppressed" if sup else "")

    logger.info(
        "ip_reputation: done — %d checked, %d suppressed, %d C2, %d abuse",
        checked, sup, stats["c2_confirmed"], stats["abuse_confirmed"],
    )

    return extraction_results, {"ip_reputation": status, **stats}