voidaccess 1.4.7__tar.gz → 1.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {voidaccess-1.4.7/voidaccess.egg-info → voidaccess-1.6.0}/PKG-INFO +108 -7
- {voidaccess-1.4.7 → voidaccess-1.6.0}/README.md +107 -6
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/main.py +155 -11
- voidaccess-1.6.0/api/routes/actors.py +735 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/admin.py +124 -1
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/export.py +235 -6
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/investigations.py +1176 -271
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/settings.py +34 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/config.py +23 -2
- voidaccess-1.6.0/db/migrations/versions/0021_add_search_engine_stats.py +34 -0
- voidaccess-1.6.0/db/migrations/versions/0022_add_actor_profiles.py +158 -0
- voidaccess-1.6.0/db/migrations/versions/0023_add_investigation_metadata.py +60 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/models.py +183 -0
- voidaccess-1.6.0/db/search_engine_stats.py +315 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/session.py +23 -4
- voidaccess-1.6.0/export/__init__.py +58 -0
- voidaccess-1.6.0/export/ioc_package.py +1055 -0
- voidaccess-1.6.0/export/snort_export.py +551 -0
- voidaccess-1.6.0/export/yara_export.py +664 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/extractor/llm_extract.py +50 -38
- voidaccess-1.6.0/extractor/normalizer.py +1245 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/extractor/pipeline.py +200 -6
- voidaccess-1.6.0/extractor/regex_patterns.py +2326 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/fingerprint/profiler.py +26 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/builder.py +262 -22
- {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/model.py +34 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/jobs.py +9 -15
- {voidaccess-1.4.7 → voidaccess-1.6.0}/pyproject.toml +1 -1
- {voidaccess-1.4.7 → voidaccess-1.6.0}/scraper/scrape.py +153 -2
- {voidaccess-1.4.7 → voidaccess-1.6.0}/search/__init__.py +118 -24
- voidaccess-1.6.0/search/circuit_breaker.py +79 -0
- voidaccess-1.6.0/search/query_builder.py +48 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/search/search.py +119 -26
- voidaccess-1.6.0/sources/actor_profiles.py +1684 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/dns_enrichment.py +98 -3
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/domain_reputation.py +68 -4
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/email_reputation.py +60 -3
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/hash_reputation.py +110 -5
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/ip_reputation.py +57 -3
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/paste_scraper.py +23 -30
- voidaccess-1.6.0/sources/proxy_client.py +622 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/rss_scraper.py +72 -23
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/seed_manager.py +203 -8
- voidaccess-1.6.0/tests/test_cli_proxy_config.py +697 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_db.py +2 -1
- voidaccess-1.6.0/tests/test_filter_e2e_manual.py +50 -0
- voidaccess-1.6.0/tests/test_filter_parser_manual.py +60 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_pagination.py +7 -6
- voidaccess-1.6.0/tests/test_paste_scraper.py +569 -0
- voidaccess-1.6.0/tests/test_proxy_client.py +1135 -0
- voidaccess-1.6.0/tests/test_regex_patterns.py +2341 -0
- voidaccess-1.6.0/tests/test_rss_scraper.py +675 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_settings.py +3 -4
- voidaccess-1.6.0/tests/test_snort_export.py +431 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_vector.py +3 -1
- voidaccess-1.6.0/tests/test_yara_export.py +353 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/content_safety.py +59 -2
- voidaccess-1.6.0/utils/enrichment_cache.py +676 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/user_keys.py +10 -7
- {voidaccess-1.4.7 → voidaccess-1.6.0}/vector/embedder.py +5 -1
- voidaccess-1.6.0/vector/model_singleton.py +87 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/vector/store.py +1 -1
- voidaccess-1.6.0/voidaccess/config.py +14 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess/llm.py +222 -33
- {voidaccess-1.4.7 → voidaccess-1.6.0/voidaccess.egg-info}/PKG-INFO +108 -7
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/SOURCES.txt +21 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/__init__.py +1 -1
- voidaccess-1.6.0/voidaccess_cli/adapters/sqlite.py +860 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/browser.py +203 -71
- voidaccess-1.6.0/voidaccess_cli/commands/actors.py +479 -0
- voidaccess-1.6.0/voidaccess_cli/commands/configure.py +408 -0
- voidaccess-1.6.0/voidaccess_cli/commands/export.py +352 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/commands/investigate.py +420 -15
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/commands/show.py +98 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/config.py +70 -0
- voidaccess-1.6.0/voidaccess_cli/main.py +488 -0
- voidaccess-1.4.7/export/__init__.py +0 -34
- voidaccess-1.4.7/extractor/normalizer.py +0 -638
- voidaccess-1.4.7/extractor/regex_patterns.py +0 -325
- voidaccess-1.4.7/search/circuit_breaker.py +0 -247
- voidaccess-1.4.7/tests/test_paste_scraper.py +0 -245
- voidaccess-1.4.7/tests/test_rss_scraper.py +0 -359
- voidaccess-1.4.7/vector/model_singleton.py +0 -49
- voidaccess-1.4.7/voidaccess_cli/adapters/sqlite.py +0 -329
- voidaccess-1.4.7/voidaccess_cli/commands/configure.py +0 -182
- voidaccess-1.4.7/voidaccess_cli/commands/export.py +0 -162
- voidaccess-1.4.7/voidaccess_cli/main.py +0 -191
- {voidaccess-1.4.7 → voidaccess-1.6.0}/LICENSE +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/analysis/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/analysis/opsec.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/analysis/patterns.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/analysis/temporal.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/auth.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/auth.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/entities.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/monitors.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/search.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/auth/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/auth/token_blacklist.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/dedup.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/frontier.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/spider.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/utils.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/env.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0001_initial_schema.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0002_add_investigation_status_column.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0002_add_missing_tables.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0003_add_canonical_value_and_entity_links.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0004_add_page_posted_at.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0005_add_extraction_method.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0006_add_monitor_alerts.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0007_add_actor_style_profiles.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0008_add_users_table.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0009_add_investigation_id_to_relationships.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0010_add_composite_index_entity_relationships.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0011_add_page_extraction_cache.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0013_add_graph_status.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0015_add_progress_fields.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0016_backfill_graph_status.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0017_add_user_api_keys.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0018_add_user_id_to_investigations.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0019_add_content_safety_log.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0020_add_entity_source_tracking.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/db/queries.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/export/misp.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/export/sigma.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/export/stix.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/extractor/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/extractor/ner.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/fingerprint/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/fingerprint/stylometry.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/export.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/queries.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/visualize.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/i18n/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/i18n/detect.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/i18n/query_expand.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/i18n/translate.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/_db.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/alerts.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/config.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/diff.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/scheduler.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/scraper/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/scraper/scrape_js.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/setup.cfg +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/blockchain.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/cache.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/cisa.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/engines.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/enrichment.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/github_scraper.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/gitlab_scraper.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/historical_intel.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/pastes.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/seeds.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/shodan.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/telegram.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/virustotal.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_analysis_opsec.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_analysis_stylometry.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_analysis_temporal.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_api.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_api_monitors.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_blockchain.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_config.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_crawler.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_dns_enrichment.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_domain_reputation.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_email_reputation.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_fingerprint.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_github_scraper.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_gitlab_scraper.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_graph.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_hash_reputation.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_i18n.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_ip_reputation.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_llm.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_llm_utils.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_model_singleton.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_monitor.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_scrape_js.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_sources.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_sources_enrichment_new.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/async_utils.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/defang.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/encryption.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/ioc_freshness.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/vector/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/vector/search.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess/llm_utils.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/dependency_links.txt +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/entry_points.txt +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/requires.txt +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/top_level.txt +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/adapters/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/commands/__init__.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/commands/enrich.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/display.py +0 -0
- {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/tor_detect.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: voidaccess
|
|
3
|
-
Version: 1.4.7
|
|
3
|
+
Version: 1.6.0
|
|
4
4
|
Summary: Dark web OSINT CLI — automated threat intelligence from query to report
|
|
5
5
|
Author: VoidAccess
|
|
6
6
|
License-Expression: MIT
|
|
@@ -65,6 +65,35 @@ Commercial threat intelligence platforms often charge prohibitive annual fees fo
|
|
|
65
65
|
|
|
66
66
|
---
|
|
67
67
|
|
|
68
|
+
## What's New in v1.6.0
|
|
69
|
+
|
|
70
|
+
- **Optional clearnet scraping proxy (ScrapingAnt)** — paste sites and RSS feeds can now be routed through ScrapingAnt. Affects clearnet scraping only; Tor, `.onion`, GitHub, and GitLab traffic are never affected.
|
|
71
|
+
- **Two mutually exclusive transports** — pick one, not both:
|
|
72
|
+
- **REST API transport** — `VOIDACCESS_USE_PROXIES=true` (legacy v1.5.0 toggle) routes requests through ScrapingAnt's Web Scraping API.
|
|
73
|
+
- **Proxy Mode transport** — `VOIDACCESS_USE_PROXY=true` routes requests through ScrapingAnt's HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
|
|
74
|
+
- Per the [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): "The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint" — so the two are alternate transports to the same backend, never chained.
|
|
75
|
+
- **`SCRAPINGANT_PROXY_TYPE`** — `residential` (default) or `datacenter`; per docs this is passed as a `proxy_type=` parameter in the Proxy Mode username string (which is built at connection time as `"scrapingant&browser=false&proxy_type=..."`). NOT a separate hostname.
|
|
76
|
+
- **Single credential** — `SCRAPINGANT_API_KEY` is the only real credential; the Proxy Mode username is a literal constant per docs. No per-customer username field, no second key.
|
|
77
|
+
- **New CLI surfaces** — `voidaccess configure proxy` now prompts for key + type in one uninterrupted block, plus `--enable-proxy / --disable-proxy` for non-interactive Proxy Mode toggling and `--show` for masked state inspection.
|
|
78
|
+
- **55 → 74 proxy-config tests** — covers both transports, the `apply_env()` independent-toggle guarantee, the single-transport selection logic, and the masked `--show` output.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## What's New in v1.5.0
|
|
83
|
+
|
|
84
|
+
- 37 new entity types across crypto, credentials, messaging, and network/forensic indicators.
|
|
85
|
+
- YARA, Snort, Suricata, and IOC package ZIP exports.
|
|
86
|
+
- Persistent actor profiles with aliases, infrastructure, notes, and timelines.
|
|
87
|
+
- Cross-alias resolution using shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation.
|
|
88
|
+
- Backend graph community detection and path-between-nodes queries.
|
|
89
|
+
- CLI graph browser path finder and frontend Find Path highlighting.
|
|
90
|
+
- Per-phase pipeline timeouts for enrichment, graph, summary, finalize, and parallel sources.
|
|
91
|
+
- `sources_used` and `infrastructure_clusters` persist in investigation metadata.
|
|
92
|
+
- Cross-run enrichment cache with Redis, SQLite, and memory backends.
|
|
93
|
+
- Auto-discovery and weekly validation of `.onion` seeds.
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
68
97
|
## Quick Start
|
|
69
98
|
|
|
70
99
|
### Option A - CLI (no Docker, 30 seconds)
|
|
@@ -102,11 +131,28 @@ The Docker stack includes PostgreSQL, Tor, FastAPI, and Next.js.
|
|
|
102
131
|
|---|---|
|
|
103
132
|
| `voidaccess investigate` | Run an investigation |
|
|
104
133
|
| `voidaccess show` | Interactive entity browser |
|
|
105
|
-
| `voidaccess export` | Export STIX/MISP/Sigma/CSV/MD |
|
|
134
|
+
| `voidaccess export` | Export STIX/MISP/Sigma/YARA/Snort/Suricata/package/CSV/MD/JSON |
|
|
135
|
+
| `voidaccess package <file>` | Export an IOC ZIP bundle |
|
|
106
136
|
| `voidaccess enrich` | Re-enrich saved results |
|
|
107
137
|
| `voidaccess list` | List saved investigations |
|
|
108
|
-
| `voidaccess status` | Config
|
|
138
|
+
| `voidaccess status` | Config, API key, cache, engine, and seed status |
|
|
139
|
+
| `voidaccess actors` | List persistent actor profiles |
|
|
140
|
+
| `voidaccess actor <handle>` | Show an actor profile with aliases, infrastructure, notes, and history |
|
|
141
|
+
| `voidaccess actor <handle> --timeline` | Show an actor activity timeline |
|
|
142
|
+
| `voidaccess actor <handle> --note "text"` | Append an analyst note to an actor profile |
|
|
143
|
+
| `voidaccess timeline <handle>` | Shortcut for `voidaccess actor <handle> --timeline` |
|
|
109
144
|
| `voidaccess configure` | Setup wizard |
|
|
145
|
+
| `voidaccess configure proxy` | ScrapingAnt key, pool type, and transport toggles. Flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (masked state) |
|
|
146
|
+
|
|
147
|
+
Export examples:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
voidaccess package investigation.json
|
|
151
|
+
voidaccess export investigation.json --format yara
|
|
152
|
+
voidaccess export investigation.json --format snort
|
|
153
|
+
voidaccess export investigation.json --format suricata
|
|
154
|
+
voidaccess status --seeds
|
|
155
|
+
```
|
|
110
156
|
|
|
111
157
|
### CLI vs Docker
|
|
112
158
|
|
|
@@ -167,14 +213,17 @@ VoidAccess handles the complexity of dark web research through a rigorous sequen
|
|
|
167
213
|
|
|
168
214
|
## What It Extracts
|
|
169
215
|
|
|
170
|
-
The extraction pipeline identifies
|
|
216
|
+
The extraction pipeline identifies 55+ entity types:
|
|
171
217
|
|
|
172
218
|
| Category | Examples |
|
|
173
219
|
|---|---|
|
|
174
|
-
| **Cryptocurrency** | Bitcoin, Ethereum, Monero
|
|
220
|
+
| **Cryptocurrency** | Bitcoin, Ethereum, Monero, Litecoin, Zcash, Dogecoin, XRP, Solana, Tron, Bitcoin Cash, Dash, ENS |
|
|
175
221
|
| **Network Indicators** | IPv4 addresses, .onion URLs, domains, email addresses, PGP keys |
|
|
176
222
|
| **File Indicators** | MD5, SHA1, SHA256 hashes |
|
|
177
|
-
| **
|
|
223
|
+
| **Credentials** | AWS keys, GitHub tokens, Slack tokens, Discord tokens, JWTs, Google API keys, Stripe keys, generic API keys, stealer log entries |
|
|
224
|
+
| **Messaging Handles** | Telegram, Discord, XMPP, Tox, Session, Matrix, Wire, ICQ, Wickr |
|
|
225
|
+
| **Network/Forensic** | IPv6, MAC addresses, IPFS CIDs, combo-list entries, YARA rules, MITRE tactics, Exploit-DB IDs, Nuclei templates, seed phrases |
|
|
226
|
+
| **Vulnerabilities** | CVE numbers, MITRE ATT&CK techniques and tactics |
|
|
178
227
|
| **Threat Actors** | Actor handles, malware families, ransomware group names |
|
|
179
228
|
| **Paste Sites** | Pastebin, Ghostbin, Rentry, and similar links |
|
|
180
229
|
| **People/Orgs** | Named persons, organization names, locations |
|
|
@@ -214,7 +263,19 @@ Export formats:
|
|
|
214
263
|
- **STIX 2.1** — bundles with indicators, threat actors, malware objects
|
|
215
264
|
- **MISP JSON** — events with galaxies for direct import
|
|
216
265
|
- **Sigma rules** — auto-generated detection rules from extracted IOCs
|
|
217
|
-
- **
|
|
266
|
+
- **YARA rules** - generated rules for malware, credentials, infrastructure, and IOC strings
|
|
267
|
+
- **Snort rules** - network detection rules for IPs, domains, URLs, and selected IOC content
|
|
268
|
+
- **Suricata rules** - Suricata-compatible network rules with the same IOC coverage as Snort
|
|
269
|
+
- **IOC package ZIP** - 21-file bundle containing text IOC lists, STIX, MISP, Sigma, YARA, Snort, Suricata, summary, and CSV
|
|
270
|
+
- **CSV** - flat entity dumps for spreadsheet analysis
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
274
|
+
## Actor Intelligence
|
|
275
|
+
|
|
276
|
+
VoidAccess v1.5.0 persists actor profiles across investigations in `actor_profiles`, with linked aliases and infrastructure in `actor_aliases` and `actor_infrastructure`. Profiles are populated from threat actor, ransomware group, and handle entities, then enriched with co-occurring infrastructure and timeline events.
|
|
277
|
+
|
|
278
|
+
Cross-alias resolution scores five signals: shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation. Use `voidaccess actors` to list profiles, `voidaccess actor <handle>` for the full profile, `voidaccess actor <handle> --timeline` for chronology, and `voidaccess actor <handle> --note "text"` for analyst notes.
|
|
218
279
|
|
|
219
280
|
---
|
|
220
281
|
|
|
@@ -253,6 +314,46 @@ All enrichment sources that require a key degrade gracefully when the key is abs
|
|
|
253
314
|
| `GITLAB_TOKEN` | Raises GitLab scraping from 15 to 60 req/min | Free | [gitlab.com/profile/personal_access_tokens](https://gitlab.com/-/profile/personal_access_tokens) |
|
|
254
315
|
| `BLOCKCYPHER_TOKEN` | BTC/ETH wallet balance and transaction graph | Yes | [blockcypher.com](https://www.blockcypher.com) |
|
|
255
316
|
| `ETHERSCAN_API_KEY` | ETH wallet lookups | Yes | [etherscan.io/apis](https://etherscan.io/apis) |
|
|
317
|
+
| `SCRAPINGANT_API_KEY` + `VOIDACCESS_USE_PROXIES=true` or `VOIDACCESS_USE_PROXY=true` | Optional clearnet proxy for paste sites + RSS feeds (see below) | Yes (free tier) | [scrapingant.com](https://scrapingant.com/?ref=mzliyzh) |
|
|
318
|
+
|
|
319
|
+
### Optional: Clearnet Scraping Proxy (ScrapingAnt)
|
|
320
|
+
|
|
321
|
+
When a third-party clearnet site rate-limits or blocks VoidAccess's outbound IP, every paste site fetch and every RSS feed fetch in the same investigation can fail. The optional **ScrapingAnt** integration routes those specific requests through ScrapingAnt — either its Web Scraping API or its Proxy Mode HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
|
|
322
|
+
|
|
323
|
+
**What it covers** — paste sites (Pastebin, dpaste, paste.ee, Rentry) and the 20 curated RSS security feeds (Krebs on Security, BleepingComputer, Talos, Mandiant, CrowdStrike, Unit 42, CISA, and others). Nothing else.
|
|
324
|
+
|
|
325
|
+
**What it does not cover** — Tor traffic, dark web scraping, and `.onion` fetches are **completely unaffected** by this setting regardless of how it is configured. The proxy only sees the two clearnet sources named above.
|
|
326
|
+
|
|
327
|
+
**GitHub and GitLab scraping are also unaffected** — and intentionally so. Both of those scrapers carry authentication tokens (`GITHUB_TOKEN`, `GITLAB_TOKEN`) in their requests. Forwarding those tokens through a third-party proxy would expose them to that third party, which is unacceptable from a security standpoint. Both scrapers always go direct to the GitHub/GitLab API regardless of the proxy setting. This is a permanent design constraint, not something the proxy toggle can override.
|
|
328
|
+
|
|
329
|
+
**It's entirely optional.** VoidAccess behaves identically without it — paste sites and RSS feeds are simply fetched directly, exactly as they were in every prior release. Add the key only if you see upstream rate-limiting or blocks affecting those two sources.
|
|
330
|
+
|
|
331
|
+
#### Two mutually exclusive transports (v1.6.0)
|
|
332
|
+
|
|
333
|
+
Per the [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): *"The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint."* Therefore the two transports below are **alternate transports to the same backend service** — pick ONE per request, never both:
|
|
334
|
+
|
|
335
|
+
| Transport | Env var | Required config | What it does |
|
|
336
|
+
|---|---|---|---|
|
|
337
|
+
| **REST API** | `VOIDACCESS_USE_PROXIES=true` | `SCRAPINGANT_API_KEY` | POSTs the target URL to `api.scrapingant.com/v2/general` and returns the response body. Legacy v1.5.0 toggle. |
|
|
338
|
+
| **Proxy Mode** | `VOIDACCESS_USE_PROXY=true` | `SCRAPINGANT_API_KEY` (only) | Routes the request as HTTP CONNECT through `proxy.scrapingant.com:8080` with username string built at connection time per docs: `scrapingant&browser=false&proxy_type=residential\|datacenter`. |
|
|
339
|
+
|
|
340
|
+
The Proxy Mode transport also reads `SCRAPINGANT_PROXY_TYPE` to pick the pool: `residential` (default; harder to detect, slightly higher latency) or `datacenter` (faster, cheaper, easier to fingerprint).
|
|
341
|
+
|
|
342
|
+
**Missing credentials leave both transports inactive.** Setting either transport env var to `true` without `SCRAPINGANT_API_KEY` is a no-op for that transport. No errors, no surprises.
|
|
343
|
+
|
|
344
|
+
**If both transport env vars are set, Proxy Mode wins** with a one-shot info log at runtime — there is no chained mode (Proxy Mode is documented as "the same functionality" as the REST API, so stacking them would double-charge without adding capability).
|
|
345
|
+
|
|
346
|
+
**How to turn it on** — all five surfaces, covering either transport:
|
|
347
|
+
|
|
348
|
+
| Surface | How |
|
|
349
|
+
|---|---|
|
|
350
|
+
| CLI configure wizard | `voidaccess configure` then `voidaccess configure keys` — paste sites and RSS feeds will be flagged with their honest "never Tor" description before any field is asked for. The interactive prompt covers the key, pool type, and asks about each transport separately. |
|
|
351
|
+
| `voidaccess configure proxy` (subcommand) | Interactive prompt for key + pool type. Non-interactive flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (prints masked key `abcd…5678`, pool type, and both transport states). |
|
|
352
|
+
| `setup.sh` during Docker install | Group F in the Enrichment Keys step; prompts for key + pool type, asks about each transport toggle separately. |
|
|
353
|
+
| `--use-proxies` flag (single run) | `voidaccess investigate "query" --use-proxies` — sets `VOIDACCESS_USE_PROXIES=true` (REST API transport) for the current process only, leaves the on-disk config untouched. |
|
|
354
|
+
| Docker / web settings page | Settings → API Keys → ScrapingAnt. Stored encrypted at rest via the existing per-user `UserApiKey` mechanism (Fernet AES-128). |
|
|
355
|
+
|
|
356
|
+
**Referral signup:** [https://scrapingant.com/?ref=mzliyzh](https://scrapingant.com/?ref=mzliyzh) (referral bonus applied on first paid plan; a free tier is available for low-volume use).
|
|
256
357
|
|
|
257
358
|
---
|
|
258
359
|
|
|
@@ -19,6 +19,35 @@ Commercial threat intelligence platforms often charge prohibitive annual fees fo
|
|
|
19
19
|
|
|
20
20
|
---
|
|
21
21
|
|
|
22
|
+
## What's New in v1.6.0
|
|
23
|
+
|
|
24
|
+
- **Optional clearnet scraping proxy (ScrapingAnt)** — paste sites and RSS feeds can now be routed through ScrapingAnt. Affects clearnet scraping only; Tor, `.onion`, GitHub, and GitLab traffic are never affected.
|
|
25
|
+
- **Two mutually exclusive transports** — pick one, not both:
|
|
26
|
+
- **REST API transport** — `VOIDACCESS_USE_PROXIES=true` (legacy v1.5.0 toggle) routes requests through ScrapingAnt's Web Scraping API.
|
|
27
|
+
- **Proxy Mode transport** — `VOIDACCESS_USE_PROXY=true` routes requests through ScrapingAnt's HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
|
|
28
|
+
- Per the [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): "The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint" — so the two are alternate transports to the same backend, never chained.
|
|
29
|
+
- **`SCRAPINGANT_PROXY_TYPE`** — `residential` (default) or `datacenter`; per docs this is passed as a `proxy_type=` parameter in the Proxy Mode username string (which is built at connection time as `"scrapingant&browser=false&proxy_type=..."`). NOT a separate hostname.
|
|
30
|
+
- **Single credential** — `SCRAPINGANT_API_KEY` is the only real credential; the Proxy Mode username is a literal constant per docs. No per-customer username field, no second key.
|
|
31
|
+
- **New CLI surfaces** — `voidaccess configure proxy` now prompts for key + type in one uninterrupted block, plus `--enable-proxy / --disable-proxy` for non-interactive Proxy Mode toggling and `--show` for masked state inspection.
|
|
32
|
+
- **55 → 74 proxy-config tests** — covers both transports, the `apply_env()` independent-toggle guarantee, the single-transport selection logic, and the masked `--show` output.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## What's New in v1.5.0
|
|
37
|
+
|
|
38
|
+
- 37 new entity types across crypto, credentials, messaging, and network/forensic indicators.
|
|
39
|
+
- YARA, Snort, Suricata, and IOC package ZIP exports.
|
|
40
|
+
- Persistent actor profiles with aliases, infrastructure, notes, and timelines.
|
|
41
|
+
- Cross-alias resolution using shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation.
|
|
42
|
+
- Backend graph community detection and path-between-nodes queries.
|
|
43
|
+
- CLI graph browser path finder and frontend Find Path highlighting.
|
|
44
|
+
- Per-phase pipeline timeouts for enrichment, graph, summary, finalize, and parallel sources.
|
|
45
|
+
- `sources_used` and `infrastructure_clusters` persist in investigation metadata.
|
|
46
|
+
- Cross-run enrichment cache with Redis, SQLite, and memory backends.
|
|
47
|
+
- Auto-discovery and weekly validation of `.onion` seeds.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
22
51
|
## Quick Start
|
|
23
52
|
|
|
24
53
|
### Option A - CLI (no Docker, 30 seconds)
|
|
@@ -56,11 +85,28 @@ The Docker stack includes PostgreSQL, Tor, FastAPI, and Next.js.
|
|
|
56
85
|
|---|---|
|
|
57
86
|
| `voidaccess investigate` | Run an investigation |
|
|
58
87
|
| `voidaccess show` | Interactive entity browser |
|
|
59
|
-
| `voidaccess export` | Export STIX/MISP/Sigma/CSV/MD |
|
|
88
|
+
| `voidaccess export` | Export STIX/MISP/Sigma/YARA/Snort/Suricata/package/CSV/MD/JSON |
|
|
89
|
+
| `voidaccess package <file>` | Export an IOC ZIP bundle |
|
|
60
90
|
| `voidaccess enrich` | Re-enrich saved results |
|
|
61
91
|
| `voidaccess list` | List saved investigations |
|
|
62
|
-
| `voidaccess status` | Config
|
|
92
|
+
| `voidaccess status` | Config, API key, cache, engine, and seed status |
|
|
93
|
+
| `voidaccess actors` | List persistent actor profiles |
|
|
94
|
+
| `voidaccess actor <handle>` | Show an actor profile with aliases, infrastructure, notes, and history |
|
|
95
|
+
| `voidaccess actor <handle> --timeline` | Show an actor activity timeline |
|
|
96
|
+
| `voidaccess actor <handle> --note "text"` | Append an analyst note to an actor profile |
|
|
97
|
+
| `voidaccess timeline <handle>` | Shortcut for `voidaccess actor <handle> --timeline` |
|
|
63
98
|
| `voidaccess configure` | Setup wizard |
|
|
99
|
+
| `voidaccess configure proxy` | ScrapingAnt key, pool type, and transport toggles. Flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (masked state) |
|
|
100
|
+
|
|
101
|
+
Export examples:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
voidaccess package investigation.json
|
|
105
|
+
voidaccess export investigation.json --format yara
|
|
106
|
+
voidaccess export investigation.json --format snort
|
|
107
|
+
voidaccess export investigation.json --format suricata
|
|
108
|
+
voidaccess status --seeds
|
|
109
|
+
```
|
|
64
110
|
|
|
65
111
|
### CLI vs Docker
|
|
66
112
|
|
|
@@ -121,14 +167,17 @@ VoidAccess handles the complexity of dark web research through a rigorous sequen
|
|
|
121
167
|
|
|
122
168
|
## What It Extracts
|
|
123
169
|
|
|
124
|
-
The extraction pipeline identifies
|
|
170
|
+
The extraction pipeline identifies 55+ entity types:
|
|
125
171
|
|
|
126
172
|
| Category | Examples |
|
|
127
173
|
|---|---|
|
|
128
|
-
| **Cryptocurrency** | Bitcoin, Ethereum, Monero
|
|
174
|
+
| **Cryptocurrency** | Bitcoin, Ethereum, Monero, Litecoin, Zcash, Dogecoin, XRP, Solana, Tron, Bitcoin Cash, Dash, ENS |
|
|
129
175
|
| **Network Indicators** | IPv4 addresses, .onion URLs, domains, email addresses, PGP keys |
|
|
130
176
|
| **File Indicators** | MD5, SHA1, SHA256 hashes |
|
|
131
|
-
| **
|
|
177
|
+
| **Credentials** | AWS keys, GitHub tokens, Slack tokens, Discord tokens, JWTs, Google API keys, Stripe keys, generic API keys, stealer log entries |
|
|
178
|
+
| **Messaging Handles** | Telegram, Discord, XMPP, Tox, Session, Matrix, Wire, ICQ, Wickr |
|
|
179
|
+
| **Network/Forensic** | IPv6, MAC addresses, IPFS CIDs, combo-list entries, YARA rules, MITRE tactics, Exploit-DB IDs, Nuclei templates, seed phrases |
|
|
180
|
+
| **Vulnerabilities** | CVE numbers, MITRE ATT&CK techniques and tactics |
|
|
132
181
|
| **Threat Actors** | Actor handles, malware families, ransomware group names |
|
|
133
182
|
| **Paste Sites** | Pastebin, Ghostbin, Rentry, and similar links |
|
|
134
183
|
| **People/Orgs** | Named persons, organization names, locations |
|
|
@@ -168,7 +217,19 @@ Export formats:
|
|
|
168
217
|
- **STIX 2.1** — bundles with indicators, threat actors, malware objects
|
|
169
218
|
- **MISP JSON** — events with galaxies for direct import
|
|
170
219
|
- **Sigma rules** — auto-generated detection rules from extracted IOCs
|
|
171
|
-
- **
|
|
220
|
+
- **YARA rules** - generated rules for malware, credentials, infrastructure, and IOC strings
|
|
221
|
+
- **Snort rules** - network detection rules for IPs, domains, URLs, and selected IOC content
|
|
222
|
+
- **Suricata rules** - Suricata-compatible network rules with the same IOC coverage as Snort
|
|
223
|
+
- **IOC package ZIP** - 21-file bundle containing text IOC lists, STIX, MISP, Sigma, YARA, Snort, Suricata, summary, and CSV
|
|
224
|
+
- **CSV** - flat entity dumps for spreadsheet analysis
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## Actor Intelligence
|
|
229
|
+
|
|
230
|
+
VoidAccess v1.5.0 persists actor profiles across investigations in `actor_profiles`, with linked aliases and infrastructure in `actor_aliases` and `actor_infrastructure`. Profiles are populated from threat actor, ransomware group, and handle entities, then enriched with co-occurring infrastructure and timeline events.
|
|
231
|
+
|
|
232
|
+
Cross-alias resolution scores five signals: shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation. Use `voidaccess actors` to list profiles, `voidaccess actor <handle>` for the full profile, `voidaccess actor <handle> --timeline` for chronology, and `voidaccess actor <handle> --note "text"` for analyst notes.
|
|
172
233
|
|
|
173
234
|
---
|
|
174
235
|
|
|
@@ -207,6 +268,46 @@ All enrichment sources that require a key degrade gracefully when the key is abs
|
|
|
207
268
|
| `GITLAB_TOKEN` | Raises GitLab scraping from 15 to 60 req/min | Free | [gitlab.com/profile/personal_access_tokens](https://gitlab.com/-/profile/personal_access_tokens) |
|
|
208
269
|
| `BLOCKCYPHER_TOKEN` | BTC/ETH wallet balance and transaction graph | Yes | [blockcypher.com](https://www.blockcypher.com) |
|
|
209
270
|
| `ETHERSCAN_API_KEY` | ETH wallet lookups | Yes | [etherscan.io/apis](https://etherscan.io/apis) |
|
|
271
|
+
| `SCRAPINGANT_API_KEY` + `VOIDACCESS_USE_PROXIES=true` | Optional clearnet proxy for paste sites + RSS feeds (see below) | Yes (free tier) | [scrapingant.com](https://scrapingant.com/?ref=mzliyzh) |
|
|
272
|
+
|
|
273
|
+
### Optional: Clearnet Scraping Proxy (ScrapingAnt)
|
|
274
|
+
|
|
275
|
+
When a third-party clearnet site rate-limits or blocks VoidAccess's outbound IP, every paste site fetch and every RSS feed fetch in the same investigation can fail. The optional **ScrapingAnt** integration routes those specific requests through ScrapingAnt — either its Web Scraping API or its Proxy Mode HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
|
|
276
|
+
|
|
277
|
+
**What it covers** — paste sites (Pastebin, dpaste, paste.ee, Rentry) and the 20 curated RSS security feeds (Krebs on Security, BleepingComputer, Talos, Mandiant, CrowdStrike, Unit 42, CISA, and others). Nothing else.
|
|
278
|
+
|
|
279
|
+
**What it does not cover** — Tor traffic, dark web scraping, and `.onion` fetches are **completely unaffected** by this setting regardless of how it is configured. The proxy only sees the two clearnet sources named above.
|
|
280
|
+
|
|
281
|
+
**GitHub and GitLab scraping are also unaffected** — and intentionally so. Both of those scrapers carry authentication tokens (`GITHUB_TOKEN`, `GITLAB_TOKEN`) in their requests. Forwarding those tokens through a third-party proxy would expose them to that third party, which is unacceptable from a security standpoint. Both scrapers always go direct to the GitHub/GitLab API regardless of the proxy setting. This is a permanent design constraint, not something the proxy toggle can override.
|
|
282
|
+
|
|
283
|
+
**It's entirely optional.** VoidAccess behaves identically without it — paste sites and RSS feeds are simply fetched directly, exactly as they were in every prior release. Add the key only if you see upstream rate-limiting or blocks affecting those two sources.
|
|
284
|
+
|
|
285
|
+
#### Two mutually exclusive transports (v1.6.0)
|
|
286
|
+
|
|
287
|
+
Per the [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): *"The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint."* Therefore the two transports below are **alternate transports to the same backend service** — pick ONE per request, never both:
|
|
288
|
+
|
|
289
|
+
| Transport | Env var | Required config | What it does |
|
|
290
|
+
|---|---|---|---|
|
|
291
|
+
| **REST API** | `VOIDACCESS_USE_PROXIES=true` | `SCRAPINGANT_API_KEY` | POSTs the target URL to `api.scrapingant.com/v2/general` and returns the response body. Legacy v1.5.0 toggle. |
|
|
292
|
+
| **Proxy Mode** | `VOIDACCESS_USE_PROXY=true` | `SCRAPINGANT_API_KEY` (only) | Routes the request as HTTP CONNECT through `proxy.scrapingant.com:8080` with username string built at connection time per docs: `scrapingant&browser=false&proxy_type=residential\|datacenter`. |
|
|
293
|
+
|
|
294
|
+
The Proxy Mode transport also reads `SCRAPINGANT_PROXY_TYPE` to pick the pool: `residential` (default; harder to detect, slightly higher latency) or `datacenter` (faster, cheaper, easier to fingerprint).
|
|
295
|
+
|
|
296
|
+
**Missing credentials leave both transports inactive.** Setting either transport env var to `true` without `SCRAPINGANT_API_KEY` is a no-op for that transport. No errors, no surprises.
|
|
297
|
+
|
|
298
|
+
**If both transport env vars are set, Proxy Mode wins** with a one-shot info log at runtime — there is no chained mode (Proxy Mode is documented as "the same functionality" as the REST API, so stacking them would double-charge without adding capability).
|
|
299
|
+
|
|
300
|
+
**How to turn it on** — all five surfaces, covering either transport:
|
|
301
|
+
|
|
302
|
+
| Surface | How |
|
|
303
|
+
|---|---|
|
|
304
|
+
| CLI configure wizard | `voidaccess configure` then `voidaccess configure keys` — paste sites and RSS feeds will be flagged with their honest "never Tor" description before any field is asked for. The interactive prompt covers the key, pool type, and asks about each transport separately. |
|
|
305
|
+
| `voidaccess configure proxy` (subcommand) | Interactive prompt for key + pool type. Non-interactive flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (prints masked key `abcd…5678`, pool type, and both transport states). |
|
|
306
|
+
| `setup.sh` during Docker install | Group F in the Enrichment Keys step; prompts for key + pool type, asks about each transport toggle separately. |
|
|
307
|
+
| `--use-proxies` flag (single run) | `voidaccess investigate "query" --use-proxies` — sets `VOIDACCESS_USE_PROXIES=true` (REST API transport) for the current process only, leaves the on-disk config untouched. |
|
|
308
|
+
| Docker / web settings page | Settings → API Keys → ScrapingAnt. Stored encrypted at rest via the existing per-user `UserApiKey` mechanism (Fernet AES-128). |
|
|
309
|
+
|
|
310
|
+
**Referral signup:** [https://scrapingant.com/?ref=mzliyzh](https://scrapingant.com/?ref=mzliyzh) (referral bonus applied on first paid plan; a free tier is available for low-volume use).
|
|
210
311
|
|
|
211
312
|
---
|
|
212
313
|
|
|
@@ -14,7 +14,8 @@ import asyncio
|
|
|
14
14
|
import logging
|
|
15
15
|
import os
|
|
16
16
|
from contextlib import asynccontextmanager
|
|
17
|
-
from
|
|
17
|
+
from datetime import datetime, timedelta, timezone
|
|
18
|
+
from typing import Callable, Optional
|
|
18
19
|
|
|
19
20
|
from fastapi import FastAPI, Depends, Request
|
|
20
21
|
from fastapi.exceptions import RequestValidationError
|
|
@@ -24,7 +25,7 @@ from slowapi import Limiter
|
|
|
24
25
|
from slowapi.errors import RateLimitExceeded
|
|
25
26
|
from slowapi.util import get_remote_address
|
|
26
27
|
|
|
27
|
-
from api.routes import entities, export, investigations, monitors, search, auth, admin, settings
|
|
28
|
+
from api.routes import entities, export, investigations, monitors, search, auth, admin, settings, actors
|
|
28
29
|
from api.auth import get_current_user
|
|
29
30
|
from monitor.scheduler import start_scheduler
|
|
30
31
|
|
|
@@ -118,18 +119,19 @@ async def lifespan(app: FastAPI):
|
|
|
118
119
|
except Exception as e:
|
|
119
120
|
logger.warning(f"Seed database load failed (non-fatal): {e}")
|
|
120
121
|
|
|
121
|
-
# Recover stranded processing investigations
|
|
122
|
+
# Recover stranded processing investigations (Phase 6.3 startup sweep)
|
|
123
|
+
# On startup: every investigation left in 'processing' by a previous
|
|
124
|
+
# process is marked failed — the pipeline tasks that owned them are
|
|
125
|
+
# gone. A periodic sweep (every 5 min) handles investigations that
|
|
126
|
+
# get stuck while the server is alive.
|
|
122
127
|
try:
|
|
123
128
|
if os.getenv("DATABASE_URL"):
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
+
swept = await _sweep_stuck_investigations(cutoff_minutes=None)
|
|
130
|
+
if swept:
|
|
131
|
+
logger.warning(
|
|
132
|
+
"Recovered %d stranded investigations (marked as failed).",
|
|
133
|
+
swept,
|
|
129
134
|
)
|
|
130
|
-
if stranded_count > 0:
|
|
131
|
-
session.commit()
|
|
132
|
-
logger.warning(f"Recovered {stranded_count} stranded investigations (marked as failed).")
|
|
133
135
|
except Exception as e:
|
|
134
136
|
logger.warning(f"Failed to recover stranded investigations: {e}")
|
|
135
137
|
|
|
@@ -145,9 +147,31 @@ async def lifespan(app: FastAPI):
|
|
|
145
147
|
logger.error(f"APScheduler failed to start: {e}")
|
|
146
148
|
scheduler = None
|
|
147
149
|
|
|
150
|
+
# Start periodic stuck-investigation sweeper (Phase 6.3). Runs every
|
|
151
|
+
# 5 minutes and marks investigations stuck in 'processing' for more
|
|
152
|
+
# than INVESTIGATION_HARD_TIMEOUT_MINUTES as 'failed'. Cancelled on
|
|
153
|
+
# shutdown below.
|
|
154
|
+
_periodic_sweep_task: Optional[asyncio.Task] = None
|
|
155
|
+
if os.getenv("DATABASE_URL"):
|
|
156
|
+
try:
|
|
157
|
+
_periodic_sweep_task = asyncio.create_task(
|
|
158
|
+
_periodic_stuck_sweep(),
|
|
159
|
+
name="voidaccess-stuck-investigation-sweeper",
|
|
160
|
+
)
|
|
161
|
+
logger.info("Periodic stuck-investigation sweeper started (every 5 min).")
|
|
162
|
+
except Exception as e:
|
|
163
|
+
logger.warning(f"Failed to start periodic sweeper: {e}")
|
|
164
|
+
|
|
148
165
|
yield
|
|
149
166
|
|
|
150
167
|
# --- Shutdown ---
|
|
168
|
+
if _periodic_sweep_task is not None and not _periodic_sweep_task.done():
|
|
169
|
+
_periodic_sweep_task.cancel()
|
|
170
|
+
try:
|
|
171
|
+
await _periodic_sweep_task
|
|
172
|
+
except (asyncio.CancelledError, Exception):
|
|
173
|
+
pass
|
|
174
|
+
|
|
151
175
|
if scheduler and scheduler.running:
|
|
152
176
|
scheduler.shutdown(wait=False)
|
|
153
177
|
logger.warning("APScheduler stopped")
|
|
@@ -170,6 +194,120 @@ async def lifespan(app: FastAPI):
|
|
|
170
194
|
pass
|
|
171
195
|
|
|
172
196
|
|
|
197
|
+
# ---------------------------------------------------------------------------
|
|
198
|
+
# Stuck-investigation sweeper (Phase 6.3)
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
# FastAPI BackgroundTasks runs in the same process as the HTTP handler.
|
|
201
|
+
# If the worker crashes mid-investigation, the row stays at status='processing'
|
|
202
|
+
# forever. This sweeper marks them 'failed' on two schedules:
|
|
203
|
+
#
|
|
204
|
+
# 1. Startup — cutoff_minutes=None → every 'processing' row is swept
|
|
205
|
+
# (the prior process is gone, no legitimate owner).
|
|
206
|
+
# 2. Periodic — every 5 minutes, cutoff = INVESTIGATION_HARD_TIMEOUT_MINUTES
|
|
207
|
+
# (configurable via env). Defends against in-process hangs.
|
|
208
|
+
#
|
|
209
|
+
# The sweep only ever UPDATES status; it never deletes rows.
|
|
210
|
+
|
|
211
|
+
def _env_int(name: str, default: int, minimum: int = 1) -> int:
    """Read an integer setting from the environment, tolerating bad values.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, blank, not an
            integer, or below ``minimum``.
        minimum: Smallest accepted value.

    Returns:
        The parsed integer, or ``default`` when the value is unusable. A
        warning is logged whenever a set-but-unusable value is ignored, so a
        typo in the environment does not prevent the module from importing.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw.strip())
    except ValueError:
        logging.getLogger(__name__).warning(
            "Ignoring non-integer %s=%r; using default %d", name, raw, default
        )
        return default
    if value < minimum:
        logging.getLogger(__name__).warning(
            "Ignoring %s=%d (must be >= %d); using default %d",
            name, value, minimum, default,
        )
        return default
    return value


# Hard timeout after which an investigation is considered permanently stuck.
# Default 30 min — generous enough to cover the slowest legitimate run
# (parallel_sources 300s + enrichment 120s + graph 60s + summary 90s + finalize
# 30s ≈ 10 min on a healthy host; 30 min is 3x that to absorb transient
# network slowness without false positives).
INVESTIGATION_HARD_TIMEOUT_MINUTES = _env_int(
    "VOIDACCESS_INVESTIGATION_HARD_TIMEOUT_MINUTES", 30
)

# Periodic sweep interval. 5 min is a good default — catches stuck rows
# quickly without flooding the DB. Values below 1 are rejected: the sweeper
# sleeps for this long between passes, so 0 would make it loop without pause.
SWEEP_INTERVAL_SECONDS = _env_int("VOIDACCESS_SWEEP_INTERVAL_SECONDS", 300)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
async def _sweep_stuck_investigations(cutoff_minutes: Optional[int] = 30) -> int:
    """Mark investigations stuck in 'processing' as 'failed'.

    Candidate rows are read in one short-lived session and updated in a
    second one. The UPDATE re-checks ``status == "processing"`` so that an
    investigation which finishes between the read and the write is not
    overwritten with 'failed' (which would also replace its summary).

    Args:
        cutoff_minutes: Only sweep rows whose ``created_at`` is older than
            this many minutes.
            ``None`` → startup mode: sweep *all* processing rows (the prior
            process is gone, no legitimate owner remains).
            ``int`` → periodic mode: sweep only rows older than the cutoff.

    Returns:
        The number of rows actually marked failed. Returns 0 when
        ``DATABASE_URL`` is unset, when no rows match, or when any error
        occurs (the error is logged) — never raises.
    """
    if not os.getenv("DATABASE_URL"):
        return 0
    try:
        from sqlalchemy import update

        from db.models import Investigation
        from db.session import get_session

        # NOTE(review): the session calls below look synchronous; if so they
        # run on the event-loop thread for the duration of the query.
        with get_session() as session:
            query = session.query(Investigation).filter(
                Investigation.status == "processing"
            )
            if cutoff_minutes is not None:
                cutoff_dt = datetime.now(timezone.utc) - timedelta(
                    minutes=cutoff_minutes
                )
                query = query.filter(Investigation.created_at < cutoff_dt)
            candidate_ids = [inv.id for inv in query.all()]

        if not candidate_ids:
            return 0

        sweep_reason = (
            "Server restarted mid-investigation"
            if cutoff_minutes is None
            else f"Investigation timed out after {cutoff_minutes} min — "
            "server may have restarted or pipeline may be hung"
        )

        with get_session() as session:
            result = session.execute(
                update(Investigation)
                .where(Investigation.id.in_(candidate_ids))
                # Guard against the read/write gap: rows that left
                # 'processing' after the read above must not be touched.
                .where(Investigation.status == "processing")
                .values(
                    status="failed",
                    summary=sweep_reason,
                )
            )
            rowcount = getattr(result, "rowcount", None)
            session.commit()

        # Some drivers report -1 (or nothing) for rowcount; fall back to the
        # candidate count in that case.
        if isinstance(rowcount, int) and rowcount >= 0:
            swept = rowcount
        else:
            swept = len(candidate_ids)

        if swept == len(candidate_ids):
            for inv_id in candidate_ids:
                logger.warning("Swept stuck investigation: %s", inv_id)
        elif swept:
            logger.warning(
                "Swept %d of %d stuck-investigation candidates; the others "
                "left 'processing' before the update (candidates: %s)",
                swept, len(candidate_ids), candidate_ids,
            )
        logger.info("Swept %d stuck investigations (cutoff=%s)", swept, cutoff_minutes)
        return swept
    except Exception as exc:
        logger.warning("Swept-investigation sweep failed: %s", exc)
        return 0
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
async def _periodic_stuck_sweep() -> None:
    """Sweep stuck investigations on a fixed interval until cancelled.

    Each cycle sleeps for SWEEP_INTERVAL_SECONDS and then runs one sweep with
    INVESTIGATION_HARD_TIMEOUT_MINUTES as the cutoff. A sweep that fails is
    logged and the loop carries on. Cancellation is the only stop signal; it
    is absorbed here (logged, then a normal return) rather than re-raised.
    """

    async def _run_one_sweep() -> None:
        # Cancellation has to escape so the outer handler can end the loop;
        # any other failure is logged and swallowed.
        try:
            await _sweep_stuck_investigations(
                cutoff_minutes=INVESTIGATION_HARD_TIMEOUT_MINUTES,
            )
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            logger.warning("Periodic stuck-investigation sweep iteration failed: %s", exc)

    try:
        while True:
            await asyncio.sleep(SWEEP_INTERVAL_SECONDS)
            await _run_one_sweep()
    except asyncio.CancelledError:
        logger.info("Periodic stuck-investigation sweep cancelled.")
|
|
309
|
+
|
|
310
|
+
|
|
173
311
|
# ---------------------------------------------------------------------------
|
|
174
312
|
# App setup
|
|
175
313
|
# ---------------------------------------------------------------------------
|
|
@@ -294,6 +432,12 @@ app.include_router(
|
|
|
294
432
|
tags=["monitors"],
|
|
295
433
|
dependencies=[Depends(get_current_user)],
|
|
296
434
|
)
|
|
435
|
+
app.include_router(
|
|
436
|
+
actors.router,
|
|
437
|
+
prefix="/actors",
|
|
438
|
+
tags=["actors"],
|
|
439
|
+
dependencies=[Depends(get_current_user)],
|
|
440
|
+
)
|
|
297
441
|
app.include_router(
|
|
298
442
|
admin.router,
|
|
299
443
|
prefix="/admin",
|