voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
monitor/jobs.py ADDED
@@ -0,0 +1,247 @@
1
+ """
2
+ Scheduled monitor jobs (keyword search pipeline and URL change detection).
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ from datetime import datetime, timezone
9
+ from typing import Any, TYPE_CHECKING
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ if TYPE_CHECKING:
14
+ import graph
15
+ import scraper.scrape as scrape
16
+ import search.search as search
17
+ import vector
18
+ from extractor import extract_entities_from_page, extract_entities_from_pages
19
+ from monitor import _db
20
+
21
+
22
def _utc_iso() -> str:
    """Return the current time in UTC as an ISO 8601 string (with UTC offset)."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
24
+
25
+
26
async def run_keyword_watch(watch: dict, llm=None) -> dict[str, Any]:
    """
    Full pipeline: search → scrape → dedup → extract → graph rebuild.

    Args:
        watch: Watch configuration. The ``name`` and ``query`` keys are read;
            both default to ``""`` when absent.
        llm: Optional LLM handle forwarded to entity extraction. LLM-based
            extraction is requested only when this is not ``None``.

    Returns:
        A summary dict with the keys ``name``, ``query``, ``new_pages``,
        ``new_entities``, ``duplicate_pages_skipped``, ``errors`` and
        ``timestamp``. If the search or scrape step raises, the counters are
        zero and ``errors`` contains the exception text.
    """
    import scraper.scrape as scrape
    import search.search as search
    import vector
    from extractor import extract_entities_from_pages

    name = watch.get("name", "")
    query = watch.get("query", "")
    errors: list[str] = []
    new_pages: list[dict] = []
    duplicate_pages_skipped = 0
    new_entities_total = 0

    def _summary(error_list: list[str]) -> dict[str, Any]:
        # Single definition of the result schema so the early-exit paths and
        # the success path cannot drift apart. Reads the counters' current
        # values at call time.
        return {
            "name": name,
            "query": query,
            "new_pages": len(new_pages),
            "new_entities": new_entities_total,
            "duplicate_pages_skipped": duplicate_pages_skipped,
            "errors": error_list,
            "timestamp": _utc_iso(),
        }

    # NOTE(review): get_search_results is called synchronously from a
    # coroutine; if it performs network I/O it blocks the event loop — confirm.
    try:
        raw_results = search.get_search_results(query)
    except Exception as exc:
        logger.error("search failed: %s", exc)
        return _summary([str(exc)])

    # Results without a link cannot be scraped, so they are dropped here.
    urls_data = [
        {"link": r["link"], "title": r.get("title", "")}
        for r in raw_results
        if r.get("link")
    ]

    try:
        scraped = await scrape.scrape_multiple(urls_data)
    except Exception as exc:
        logger.error("scrape failed: %s", exc)
        return _summary([str(exc)])

    for url, text in scraped.items():
        try:
            if vector.is_duplicate(text):
                duplicate_pages_skipped += 1
                continue
        except Exception as exc:
            # Best effort: when the dedup check itself fails the page is
            # treated as new rather than being dropped.
            logger.warning("is_duplicate check failed for %s: %s", url, exc)
        try:
            vector.upsert_page(
                url,
                text,
                metadata={"watch_name": name, "watch_type": "keyword"},
            )
        except Exception as exc:
            # A failed vector-store write does not stop entity extraction.
            logger.warning("upsert_page failed for %s: %s", url, exc)
        new_pages.append({"url": url, "text": text, "content": text})

    if new_pages:
        try:
            results = await extract_entities_from_pages(
                new_pages,
                investigation_id=None,
                llm=llm,
                run_llm_extraction=llm is not None,
            )
            for er in results:
                new_entities_total += int(er.entity_count)
                # Tolerate a result whose error list is None/empty.
                errors.extend(er.errors or [])
        except Exception as exc:
            logger.error("extract_entities_from_pages failed: %s", exc)
            errors.append(str(exc))

    try:
        import graph
        graph.build_graph_from_db()
    except Exception as exc:
        logger.warning("build_graph_from_db: %s", exc)
        errors.append(f"graph: {exc}")

    return _summary(errors)
126
+
127
+
128
async def run_url_watch(watch: dict) -> dict[str, Any]:
    """Scrape one URL, diff against DB-backed previous content, extract if changed.

    Args:
        watch: Watch configuration. The ``name`` and ``url`` keys are read;
            both default to ``""`` when absent.

    Returns:
        A result dict with the keys ``name``, ``url``, ``changed``,
        ``diff_summary``, ``new_entities``, ``change_ratio``, ``lines_added``,
        ``lines_removed``, ``is_first_scrape`` and ``timestamp``. The same
        keys are present when the scrape raises; in that case ``changed`` is
        ``False`` and the numeric fields are zero.
    """
    import scraper.scrape as scrape
    import vector
    from extractor import extract_entities_from_page
    from monitor import _db
    from monitor.diff import compute_diff

    name = watch.get("name", "")
    url = watch.get("url", "")
    old_content = _db.get_last_cleaned_text_for_url(url)
    # "First scrape" means no non-blank previous content was stored.
    is_first_scrape = not (old_content or "").strip()

    try:
        scraped = await scrape.scrape_multiple([{"link": url, "title": ""}])
    except Exception as exc:
        logger.error("url watch scrape failed: %s", exc)
        # Same schema as the success path so consumers can index any key
        # without special-casing failed runs.
        return {
            "name": name,
            "url": url,
            "changed": False,
            "diff_summary": "",
            "new_entities": 0,
            "change_ratio": 0.0,
            "lines_added": 0,
            "lines_removed": 0,
            "is_first_scrape": is_first_scrape,
            "timestamp": _utc_iso(),
        }

    new_content = scraped.get(url, "")
    diff = compute_diff(old_content, new_content)
    changed = bool(diff.get("changed"))
    diff_summary = str(diff.get("diff_summary", ""))

    new_entities = 0
    if changed:
        try:
            vector.upsert_page(
                url,
                new_content,
                metadata={"watch_name": name, "watch_type": "url"},
            )
        except Exception as exc:
            # A failed vector-store write does not stop entity extraction.
            logger.warning("upsert_page failed: %s", exc)
        try:
            er = await extract_entities_from_page(
                new_content,
                url,
                page_id=None,
                investigation_id=None,
                llm=None,
                run_llm_extraction=False,
            )
            new_entities = int(er.entity_count)
        except Exception as exc:
            logger.error("extract_entities_from_page failed: %s", exc)

    # Record the hash of the content that was just compared.
    fp = str(diff.get("content_hash_new", ""))
    _db.update_source_watch_fingerprint(url, fp)

    return {
        "name": name,
        "url": url,
        "changed": changed,
        "diff_summary": diff_summary,
        "new_entities": new_entities,
        # "or 0" keeps the coercions safe if the diff carries explicit None.
        "change_ratio": float(diff.get("change_ratio") or 0.0),
        "lines_added": int(diff.get("lines_added") or 0),
        "lines_removed": int(diff.get("lines_removed") or 0),
        "is_first_scrape": is_first_scrape,
        "timestamp": _utc_iso(),
    }
197
+
198
+
199
async def refresh_seed_data() -> None:
    """
    Weekly job: refresh historical seed data from live APIs.

    Fetches up to 500 ThreatFox and 500 MalwareBazaar records and hands them
    to the seed importers inside one DB session.
    Intended schedule: every Sunday at 03:00 UTC.

    Any exception (including a failed import of the helper modules) is logged
    and swallowed so that a failed refresh never propagates to the scheduler.
    """
    # NOTE(review): progress messages are logged at WARNING level, presumably
    # so they appear under the default log configuration — confirm.
    logger.warning("Starting weekly seed data refresh...")

    try:
        from sources.enrichment import (
            fetch_threatfox, fetch_malwarebazaar
        )
        from scripts.import_seed import (
            import_threatfox_iocs, import_malwarebazaar
        )
        from db.session import get_session

        tf_results = await fetch_threatfox("", limit=500)
        mb_results = await fetch_malwarebazaar("", limit=500)

        with get_session() as session:
            import_threatfox_iocs(session, tf_results)
            import_malwarebazaar(session, mb_results)

        logger.warning("Weekly seed refresh complete")
    except Exception as e:
        # Lazy %-formatting, consistent with the other log calls in this module.
        logger.error("Weekly seed refresh failed: %s", e)
226
+
227
+
228
async def validate_seeds_job() -> None:
    """
    Weekly job: check which curated .onion seeds are still reachable over Tor.
    Updates status in data/onion_seeds.json. Concurrency is kept low so
    the validation pass doesn't saturate the Tor circuit.

    Any exception is logged and swallowed so that a failed validation pass
    never propagates to the scheduler.
    """
    logger.warning("Starting weekly seed validation...")
    try:
        from sources.seed_manager import get_seed_manager

        seed_manager = get_seed_manager()
        results = await seed_manager.validate_seeds(concurrency=3)
        # Missing counters are reported as 0 rather than failing the log call.
        logger.warning(
            "Seed validation complete: %d/%d active, %d unreachable",
            results.get("active", 0),
            results.get("checked", 0),
            results.get("dead", 0),
        )
    except Exception as e:
        # Lazy %-formatting, consistent with the other log calls in this module.
        logger.error("Seed validation failed: %s", e)
monitor/scheduler.py ADDED
@@ -0,0 +1,184 @@
1
+ """
2
+ APScheduler-based background runner for monitor watches.
3
+ Uses AsyncIOScheduler to properly integrate with the asyncio event loop.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
import asyncio
import logging
from datetime import datetime, timezone
from typing import Any, Callable, Coroutine

from monitor import jobs
from monitor.alerts import evaluate_and_dispatch_alerts
from monitor.config import load_watches
from utils.async_utils import run_async
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def _wrap_keyword(watch: dict, llm) -> Callable[[], Coroutine[Any, Any, None]]:
    """
    Create an async job function for keyword watches.

    Returns a zero-argument coroutine *function* (not a coroutine object).
    Each call produces a fresh coroutine that runs the keyword watch and then
    evaluates and dispatches alerts for its result.
    """
    async def _run_watch() -> None:
        result = await jobs.run_keyword_watch(watch, llm=llm)
        await evaluate_and_dispatch_alerts(watch, result)

    return _run_watch
31
+
32
+
33
def _wrap_url(watch: dict) -> Callable[[], Coroutine[Any, Any, None]]:
    """
    Create an async job function for URL watches.

    Returns a zero-argument coroutine *function* (not a coroutine object).
    Each call produces a fresh coroutine that runs the URL watch and then
    evaluates and dispatches alerts for its result.
    """
    async def _run_watch() -> None:
        result = await jobs.run_url_watch(watch)
        await evaluate_and_dispatch_alerts(watch, result)

    return _run_watch
43
+
44
+
45
def _wrap_seed_refresh() -> Callable[[], Coroutine[Any, Any, None]]:
    """Create an async job function for seed data refresh.

    Returns a zero-argument coroutine function (not a coroutine object) that
    awaits ``jobs.refresh_seed_data()``.
    """
    async def _run_refresh() -> None:
        await jobs.refresh_seed_data()

    return _run_refresh
51
+
52
+
53
def _wrap_seed_validation() -> Callable[[], Coroutine[Any, Any, None]]:
    """Create an async job function for .onion seed reachability validation.

    Returns a zero-argument coroutine function (not a coroutine object) that
    awaits ``jobs.validate_seeds_job()``.
    """
    async def _run_validation() -> None:
        await jobs.validate_seeds_job()

    return _run_validation
59
+
60
+
61
def start_scheduler(llm=None, event_loop: asyncio.AbstractEventLoop | None = None):
    """
    Register interval jobs for each enabled watch. Returns AsyncIOScheduler or None.

    Besides one interval job per enabled watch, two weekly cron jobs are
    registered: seed validation (Sunday 02:00 UTC) and seed data refresh
    (Sunday 03:00 UTC).

    Args:
        llm: Optional LLM instance for keyword watches
        event_loop: Optional event loop to use. If not provided, attempts to get the running loop.

    Returns:
        The started scheduler, or ``None`` when APScheduler is not installed
        or the scheduler fails to start.
    """
    try:
        from apscheduler.schedulers.asyncio import AsyncIOScheduler  # noqa: PLC0415
        from apscheduler.triggers.interval import IntervalTrigger  # noqa: PLC0415
        from apscheduler.triggers.cron import CronTrigger  # noqa: PLC0415
    except ImportError:
        logger.warning("APScheduler not installed; scheduler disabled")
        return None

    if event_loop is None:
        try:
            event_loop = asyncio.get_running_loop()
            logger.debug("Using existing event loop for scheduler")
        except RuntimeError:
            logger.debug("No running event loop, creating new one")
            event_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(event_loop)

    watches = [w for w in load_watches() if w.get("enabled", True)]
    scheduler = AsyncIOScheduler(event_loop=event_loop)
    registered = 0  # jobs actually accepted by the scheduler

    for w in watches:
        # Used only for the error message if the watch turns out malformed.
        wid = w.get("name", "<unnamed>")
        try:
            # Reading the required keys inside the try keeps one malformed
            # watch (missing name/interval, non-numeric interval) from
            # aborting registration of every other watch.
            wid = w["name"]
            trigger = IntervalTrigger(hours=float(w["interval_hours"]))
            # Any type other than "keyword" is scheduled as a URL watch.
            if w.get("type") == "keyword":
                func = _wrap_keyword(w, llm)
            else:
                func = _wrap_url(w)
            scheduler.add_job(
                func,
                trigger=trigger,
                id=wid,
                replace_existing=True,
                max_instances=1,
                coalesce=True,
            )
        except Exception as exc:
            logger.error("Failed to add job %r: %s", wid, exc)
        else:
            registered += 1

    # The weekly jobs pin the timezone explicitly; without it the cron fields
    # are evaluated in the scheduler's timezone rather than UTC.
    try:
        scheduler.add_job(
            _wrap_seed_refresh(),
            trigger=CronTrigger(day_of_week="sun", hour=3, minute=0, timezone="UTC"),
            id="weekly_seed_refresh",
            replace_existing=True,
        )
    except Exception as exc:
        logger.error("Failed to add weekly_seed_refresh job: %s", exc)
    else:
        registered += 1

    try:
        scheduler.add_job(
            _wrap_seed_validation(),
            trigger=CronTrigger(day_of_week="sun", hour=2, minute=0, timezone="UTC"),
            id="seed_validation",
            replace_existing=True,
        )
    except Exception as exc:
        logger.error("Failed to add seed_validation job: %s", exc)
    else:
        registered += 1

    try:
        scheduler.start()
    except Exception as exc:
        logger.error("Scheduler start failed: %s", exc)
        return None

    # Report only jobs that were registered successfully.
    logger.info("AsyncIOScheduler started with %d jobs", registered)
    return scheduler
137
+
138
+
139
def stop_scheduler(scheduler) -> None:
    """Shut down *scheduler* with ``wait=True``; do nothing when it is ``None``.

    A failure during shutdown is logged as a warning and not re-raised.
    """
    if scheduler is not None:
        try:
            scheduler.shutdown(wait=True)
        except Exception as exc:
            logger.warning("scheduler shutdown: %s", exc)
146
+
147
+
148
def get_job_status(scheduler) -> list[dict]:
    """Return {name, next_run_time, last_run_time} for each job.

    Both time fields are read defensively and are ``None`` when the job
    object does not carry the attribute (for example a job that is still
    pending has no ``next_run_time`` yet), so one such job no longer aborts
    the whole listing.

    Returns an empty list when *scheduler* is ``None``. If enumerating the
    jobs raises, the error is logged and the entries collected so far are
    returned.
    """
    if scheduler is None:
        return []
    out: list[dict] = []
    try:
        for job in scheduler.get_jobs():
            out.append(
                {
                    "name": job.id,
                    "next_run_time": getattr(job, "next_run_time", None),
                    "last_run_time": getattr(job, "last_run_time", None),
                }
            )
    except Exception as exc:
        logger.warning("get_job_status: %s", exc)
    return out
167
+
168
+
169
def trigger_job_now(scheduler, watch_name: str) -> bool:
    """Run the watch job as soon as possible (reschedule to now).

    Returns ``True`` when the job exists and its next run time was set to the
    current UTC time; ``False`` when there is no scheduler, no job with that
    id, or the scheduler raised (the error is logged as a warning).
    """
    if scheduler is None:
        return False
    try:
        if scheduler.get_job(watch_name) is None:
            return False
        run_at = datetime.now(timezone.utc)
        scheduler.modify_job(watch_name, next_run_time=run_at)
    except Exception as exc:
        logger.warning("trigger_job_now: %s", exc)
        return False
    return True
scraper/__init__.py ADDED
File without changes