voidaccess 1.6.0__tar.gz → 1.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. voidaccess-1.6.0/README.md → voidaccess-1.6.2/PKG-INFO +542 -506
  2. voidaccess-1.6.0/PKG-INFO → voidaccess-1.6.2/README.md +492 -548
  3. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/settings.py +30 -7
  4. {voidaccess-1.6.0 → voidaccess-1.6.2}/config.py +5 -2
  5. {voidaccess-1.6.0 → voidaccess-1.6.2}/extractor/llm_extract.py +68 -3
  6. {voidaccess-1.6.0 → voidaccess-1.6.2}/pyproject.toml +19 -2
  7. voidaccess-1.6.2/sources/data/__init__.py +10 -0
  8. voidaccess-1.6.2/sources/data/onion_seeds.json +272 -0
  9. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/proxy_client.py +125 -57
  10. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/rss_scraper.py +100 -4
  11. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/seed_manager.py +106 -11
  12. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_cli_proxy_config.py +262 -67
  13. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_proxy_client.py +107 -58
  14. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess.egg-info/PKG-INFO +29 -39
  15. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess.egg-info/SOURCES.txt +2 -0
  16. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/__init__.py +1 -1
  17. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/commands/configure.py +54 -18
  18. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/commands/investigate.py +90 -10
  19. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/config.py +11 -6
  20. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/display.py +96 -3
  21. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/main.py +47 -18
  22. {voidaccess-1.6.0 → voidaccess-1.6.2}/LICENSE +0 -0
  23. {voidaccess-1.6.0 → voidaccess-1.6.2}/analysis/__init__.py +0 -0
  24. {voidaccess-1.6.0 → voidaccess-1.6.2}/analysis/opsec.py +0 -0
  25. {voidaccess-1.6.0 → voidaccess-1.6.2}/analysis/patterns.py +0 -0
  26. {voidaccess-1.6.0 → voidaccess-1.6.2}/analysis/temporal.py +0 -0
  27. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/__init__.py +0 -0
  28. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/auth.py +0 -0
  29. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/main.py +0 -0
  30. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/__init__.py +0 -0
  31. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/actors.py +0 -0
  32. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/admin.py +0 -0
  33. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/auth.py +0 -0
  34. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/entities.py +0 -0
  35. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/export.py +0 -0
  36. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/investigations.py +0 -0
  37. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/monitors.py +0 -0
  38. {voidaccess-1.6.0 → voidaccess-1.6.2}/api/routes/search.py +0 -0
  39. {voidaccess-1.6.0 → voidaccess-1.6.2}/auth/__init__.py +0 -0
  40. {voidaccess-1.6.0 → voidaccess-1.6.2}/auth/token_blacklist.py +0 -0
  41. {voidaccess-1.6.0 → voidaccess-1.6.2}/crawler/__init__.py +0 -0
  42. {voidaccess-1.6.0 → voidaccess-1.6.2}/crawler/dedup.py +0 -0
  43. {voidaccess-1.6.0 → voidaccess-1.6.2}/crawler/frontier.py +0 -0
  44. {voidaccess-1.6.0 → voidaccess-1.6.2}/crawler/spider.py +0 -0
  45. {voidaccess-1.6.0 → voidaccess-1.6.2}/crawler/utils.py +0 -0
  46. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/__init__.py +0 -0
  47. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/__init__.py +0 -0
  48. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/env.py +0 -0
  49. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0001_initial_schema.py +0 -0
  50. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0002_add_investigation_status_column.py +0 -0
  51. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0002_add_missing_tables.py +0 -0
  52. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0003_add_canonical_value_and_entity_links.py +0 -0
  53. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0004_add_page_posted_at.py +0 -0
  54. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0005_add_extraction_method.py +0 -0
  55. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0006_add_monitor_alerts.py +0 -0
  56. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0007_add_actor_style_profiles.py +0 -0
  57. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0008_add_users_table.py +0 -0
  58. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0009_add_investigation_id_to_relationships.py +0 -0
  59. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0010_add_composite_index_entity_relationships.py +0 -0
  60. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0011_add_page_extraction_cache.py +0 -0
  61. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0013_add_graph_status.py +0 -0
  62. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0015_add_progress_fields.py +0 -0
  63. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0016_backfill_graph_status.py +0 -0
  64. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0017_add_user_api_keys.py +0 -0
  65. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0018_add_user_id_to_investigations.py +0 -0
  66. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0019_add_content_safety_log.py +0 -0
  67. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0020_add_entity_source_tracking.py +0 -0
  68. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0021_add_search_engine_stats.py +0 -0
  69. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0022_add_actor_profiles.py +0 -0
  70. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/migrations/versions/0023_add_investigation_metadata.py +0 -0
  71. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/models.py +0 -0
  72. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/queries.py +0 -0
  73. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/search_engine_stats.py +0 -0
  74. {voidaccess-1.6.0 → voidaccess-1.6.2}/db/session.py +0 -0
  75. {voidaccess-1.6.0 → voidaccess-1.6.2}/export/__init__.py +0 -0
  76. {voidaccess-1.6.0 → voidaccess-1.6.2}/export/ioc_package.py +0 -0
  77. {voidaccess-1.6.0 → voidaccess-1.6.2}/export/misp.py +0 -0
  78. {voidaccess-1.6.0 → voidaccess-1.6.2}/export/sigma.py +0 -0
  79. {voidaccess-1.6.0 → voidaccess-1.6.2}/export/snort_export.py +0 -0
  80. {voidaccess-1.6.0 → voidaccess-1.6.2}/export/stix.py +0 -0
  81. {voidaccess-1.6.0 → voidaccess-1.6.2}/export/yara_export.py +0 -0
  82. {voidaccess-1.6.0 → voidaccess-1.6.2}/extractor/__init__.py +0 -0
  83. {voidaccess-1.6.0 → voidaccess-1.6.2}/extractor/ner.py +0 -0
  84. {voidaccess-1.6.0 → voidaccess-1.6.2}/extractor/normalizer.py +0 -0
  85. {voidaccess-1.6.0 → voidaccess-1.6.2}/extractor/pipeline.py +0 -0
  86. {voidaccess-1.6.0 → voidaccess-1.6.2}/extractor/regex_patterns.py +0 -0
  87. {voidaccess-1.6.0 → voidaccess-1.6.2}/fingerprint/__init__.py +0 -0
  88. {voidaccess-1.6.0 → voidaccess-1.6.2}/fingerprint/profiler.py +0 -0
  89. {voidaccess-1.6.0 → voidaccess-1.6.2}/fingerprint/stylometry.py +0 -0
  90. {voidaccess-1.6.0 → voidaccess-1.6.2}/graph/__init__.py +0 -0
  91. {voidaccess-1.6.0 → voidaccess-1.6.2}/graph/builder.py +0 -0
  92. {voidaccess-1.6.0 → voidaccess-1.6.2}/graph/export.py +0 -0
  93. {voidaccess-1.6.0 → voidaccess-1.6.2}/graph/model.py +0 -0
  94. {voidaccess-1.6.0 → voidaccess-1.6.2}/graph/queries.py +0 -0
  95. {voidaccess-1.6.0 → voidaccess-1.6.2}/graph/visualize.py +0 -0
  96. {voidaccess-1.6.0 → voidaccess-1.6.2}/i18n/__init__.py +0 -0
  97. {voidaccess-1.6.0 → voidaccess-1.6.2}/i18n/detect.py +0 -0
  98. {voidaccess-1.6.0 → voidaccess-1.6.2}/i18n/query_expand.py +0 -0
  99. {voidaccess-1.6.0 → voidaccess-1.6.2}/i18n/translate.py +0 -0
  100. {voidaccess-1.6.0 → voidaccess-1.6.2}/monitor/__init__.py +0 -0
  101. {voidaccess-1.6.0 → voidaccess-1.6.2}/monitor/_db.py +0 -0
  102. {voidaccess-1.6.0 → voidaccess-1.6.2}/monitor/alerts.py +0 -0
  103. {voidaccess-1.6.0 → voidaccess-1.6.2}/monitor/config.py +0 -0
  104. {voidaccess-1.6.0 → voidaccess-1.6.2}/monitor/diff.py +0 -0
  105. {voidaccess-1.6.0 → voidaccess-1.6.2}/monitor/jobs.py +0 -0
  106. {voidaccess-1.6.0 → voidaccess-1.6.2}/monitor/scheduler.py +0 -0
  107. {voidaccess-1.6.0 → voidaccess-1.6.2}/scraper/__init__.py +0 -0
  108. {voidaccess-1.6.0 → voidaccess-1.6.2}/scraper/scrape.py +0 -0
  109. {voidaccess-1.6.0 → voidaccess-1.6.2}/scraper/scrape_js.py +0 -0
  110. {voidaccess-1.6.0 → voidaccess-1.6.2}/search/__init__.py +0 -0
  111. {voidaccess-1.6.0 → voidaccess-1.6.2}/search/circuit_breaker.py +0 -0
  112. {voidaccess-1.6.0 → voidaccess-1.6.2}/search/query_builder.py +0 -0
  113. {voidaccess-1.6.0 → voidaccess-1.6.2}/search/search.py +0 -0
  114. {voidaccess-1.6.0 → voidaccess-1.6.2}/setup.cfg +0 -0
  115. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/__init__.py +0 -0
  116. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/actor_profiles.py +0 -0
  117. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/blockchain.py +0 -0
  118. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/cache.py +0 -0
  119. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/cisa.py +0 -0
  120. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/dns_enrichment.py +0 -0
  121. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/domain_reputation.py +0 -0
  122. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/email_reputation.py +0 -0
  123. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/engines.py +0 -0
  124. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/enrichment.py +0 -0
  125. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/github_scraper.py +0 -0
  126. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/gitlab_scraper.py +0 -0
  127. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/hash_reputation.py +0 -0
  128. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/historical_intel.py +0 -0
  129. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/ip_reputation.py +0 -0
  130. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/paste_scraper.py +0 -0
  131. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/pastes.py +0 -0
  132. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/seeds.py +0 -0
  133. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/shodan.py +0 -0
  134. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/telegram.py +0 -0
  135. {voidaccess-1.6.0 → voidaccess-1.6.2}/sources/virustotal.py +0 -0
  136. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_analysis_opsec.py +0 -0
  137. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_analysis_stylometry.py +0 -0
  138. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_analysis_temporal.py +0 -0
  139. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_api.py +0 -0
  140. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_api_monitors.py +0 -0
  141. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_blockchain.py +0 -0
  142. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_config.py +0 -0
  143. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_crawler.py +0 -0
  144. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_db.py +0 -0
  145. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_dns_enrichment.py +0 -0
  146. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_domain_reputation.py +0 -0
  147. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_email_reputation.py +0 -0
  148. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_filter_e2e_manual.py +0 -0
  149. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_filter_parser_manual.py +0 -0
  150. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_fingerprint.py +0 -0
  151. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_github_scraper.py +0 -0
  152. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_gitlab_scraper.py +0 -0
  153. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_graph.py +0 -0
  154. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_hash_reputation.py +0 -0
  155. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_i18n.py +0 -0
  156. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_ip_reputation.py +0 -0
  157. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_llm.py +0 -0
  158. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_llm_utils.py +0 -0
  159. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_model_singleton.py +0 -0
  160. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_monitor.py +0 -0
  161. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_pagination.py +0 -0
  162. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_paste_scraper.py +0 -0
  163. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_regex_patterns.py +0 -0
  164. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_rss_scraper.py +0 -0
  165. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_scrape_js.py +0 -0
  166. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_settings.py +0 -0
  167. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_snort_export.py +0 -0
  168. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_sources.py +0 -0
  169. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_sources_enrichment_new.py +0 -0
  170. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_vector.py +0 -0
  171. {voidaccess-1.6.0 → voidaccess-1.6.2}/tests/test_yara_export.py +0 -0
  172. {voidaccess-1.6.0 → voidaccess-1.6.2}/utils/__init__.py +0 -0
  173. {voidaccess-1.6.0 → voidaccess-1.6.2}/utils/async_utils.py +0 -0
  174. {voidaccess-1.6.0 → voidaccess-1.6.2}/utils/content_safety.py +0 -0
  175. {voidaccess-1.6.0 → voidaccess-1.6.2}/utils/defang.py +0 -0
  176. {voidaccess-1.6.0 → voidaccess-1.6.2}/utils/encryption.py +0 -0
  177. {voidaccess-1.6.0 → voidaccess-1.6.2}/utils/enrichment_cache.py +0 -0
  178. {voidaccess-1.6.0 → voidaccess-1.6.2}/utils/ioc_freshness.py +0 -0
  179. {voidaccess-1.6.0 → voidaccess-1.6.2}/utils/user_keys.py +0 -0
  180. {voidaccess-1.6.0 → voidaccess-1.6.2}/vector/__init__.py +0 -0
  181. {voidaccess-1.6.0 → voidaccess-1.6.2}/vector/embedder.py +0 -0
  182. {voidaccess-1.6.0 → voidaccess-1.6.2}/vector/model_singleton.py +0 -0
  183. {voidaccess-1.6.0 → voidaccess-1.6.2}/vector/search.py +0 -0
  184. {voidaccess-1.6.0 → voidaccess-1.6.2}/vector/store.py +0 -0
  185. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess/__init__.py +0 -0
  186. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess/config.py +0 -0
  187. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess/llm.py +0 -0
  188. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess/llm_utils.py +0 -0
  189. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess.egg-info/dependency_links.txt +0 -0
  190. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess.egg-info/entry_points.txt +0 -0
  191. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess.egg-info/requires.txt +0 -0
  192. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess.egg-info/top_level.txt +0 -0
  193. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/adapters/__init__.py +0 -0
  194. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/adapters/sqlite.py +0 -0
  195. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/browser.py +0 -0
  196. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/commands/__init__.py +0 -0
  197. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/commands/actors.py +0 -0
  198. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/commands/enrich.py +0 -0
  199. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/commands/export.py +0 -0
  200. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/commands/show.py +0 -0
  201. {voidaccess-1.6.0 → voidaccess-1.6.2}/voidaccess_cli/tor_detect.py +0 -0
@@ -1,506 +1,542 @@
1
- <div align="center">
2
- <img src="./public/logo_circle.png" width="160" alt="VoidAccess Logo">
3
- <h1>VoidAccess</h1>
4
- <p><strong>A self-hosted OSINT platform for dark web threat intelligence.</strong></p>
5
- <p>Automate the entire investigation workflow from query refinement to relationship mapping in 13 autonomous pipeline steps.</p>
6
- </div>
7
-
8
- ---
9
-
10
- ## The OSINT Powerhouse
11
-
12
- Commercial threat intelligence platforms often charge prohibitive annual fees for capabilities that can be run on private hardware. **VoidAccess** democratizes high-end dark web intelligence by providing an automated, end-to-end workflow:
13
-
14
- - **Query Refinement**: Intelligent search term optimization using LLMs.
15
- - **Multilingual Search**: Deep-web fan-out across English, Russian, and Chinese engines.
16
- - **Entity Extraction**: Autonomous identification of wallets, IOCs, PGP keys, and more.
17
- - **Relationship Mapping**: Dynamic graph generation from extracted data co-occurrence.
18
- - **Structured Export**: STIX 2.1, MISP, Sigma, and CSV support.
19
-
20
- ---
21
-
22
- ## What's New in v1.6.0
23
-
24
- - **Optional clearnet scraping proxy (ScrapingAnt)** — paste sites and RSS feeds can now be routed through ScrapingAnt. Affects clearnet scraping only; Tor, `.onion`, GitHub, and GitLab traffic are never affected.
25
- - **Two mutually exclusive transports** — pick one, not both:
26
- - **REST API transport** — `VOIDACCESS_USE_PROXIES=true` (legacy v1.5.0 toggle) routes requests through ScrapingAnt's Web Scraping API.
27
- - **Proxy Mode transport** — `VOIDACCESS_USE_PROXY=true` routes requests through ScrapingAnt's HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
28
- - Per [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): "Proxy Mode is a light front-end for the scraping API and has all the same functionality and performance" — so the two are alternate transports to the same backend, never chained.
29
- - **`SCRAPINGANT_PROXY_TYPE`** — `residential` (default) or `datacenter`; per docs this is passed as a `proxy_type=` parameter in the Proxy Mode username string (which is built at connection time as `"scrapingant&browser=false&proxy_type=..."`). NOT a separate hostname.
30
- - **Single credential** — `SCRAPINGANT_API_KEY` is the only real credential; the Proxy Mode username is a literal constant per docs. No per-customer username field, no second key.
31
- - **New CLI surfaces** — `voidaccess configure proxy` now prompts for key + type in one uninterrupted block, plus `--enable-proxy / --disable-proxy` for non-interactive Proxy Mode toggling and `--show` for masked state inspection.
32
- - **55 → 74 proxy-config tests** — covers both transports, the `apply_env()` independent-toggle guarantee, the single-transport selection logic, and the masked `--show` output.
33
-
34
- ---
35
-
36
- ## What's New in v1.5.0
37
-
38
- - 37 new entity types across crypto, credentials, messaging, and network/forensic indicators.
39
- - YARA, Snort, Suricata, and IOC package ZIP exports.
40
- - Persistent actor profiles with aliases, infrastructure, notes, and timelines.
41
- - Cross-alias resolution using shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation.
42
- - Backend graph community detection and path-between-nodes queries.
43
- - CLI graph browser path finder and frontend Find Path highlighting.
44
- - Per-phase pipeline timeouts for enrichment, graph, summary, finalize, and parallel sources.
45
- - `sources_used` and `infrastructure_clusters` persist in investigation metadata.
46
- - Cross-run enrichment cache with Redis, SQLite, and memory backends.
47
- - Auto-discovery and weekly validation of `.onion` seeds.
48
-
49
- ---
50
-
51
- ## Quick Start
52
-
53
- ### Option A - CLI (no Docker, 30 seconds)
54
-
55
- ```bash
56
- pip install voidaccess
57
- voidaccess configure
58
- voidaccess investigate "LockBit ransomware"
59
- ```
60
-
61
- <div align="center">
62
- <img src="./public/cli_investigation_gif.gif" alt="VoidAccess CLI investigation walkthrough" width="900">
63
- </div>
64
-
65
- Requires local Tor for dark web sources:
66
-
67
- - https://torproject.org
68
- - Use `--no-tor` for clearnet-only investigations
69
-
70
- The CLI stores config in `~/.voidaccess/config.json` and writes results to `~/.voidaccess/results/`.
71
-
72
- ### Option B - Docker (full stack, 5 minutes)
73
-
74
- ```bash
75
- git clone https://github.com/KatrielMoses/voidaccess
76
- cd voidaccess
77
- bash setup.sh
78
- ```
79
-
80
- The Docker stack includes PostgreSQL, Tor, FastAPI, and Next.js.
81
-
82
- ### CLI Commands
83
-
84
- | Command | Description |
85
- |---|---|
86
- | `voidaccess investigate` | Run an investigation |
87
- | `voidaccess show` | Interactive entity browser |
88
- | `voidaccess export` | Export STIX/MISP/Sigma/YARA/Snort/Suricata/package/CSV/MD/JSON |
89
- | `voidaccess package <file>` | Export an IOC ZIP bundle |
90
- | `voidaccess enrich` | Re-enrich saved results |
91
- | `voidaccess list` | List saved investigations |
92
- | `voidaccess status` | Config, API key, cache, engine, and seed status |
93
- | `voidaccess actors` | List persistent actor profiles |
94
- | `voidaccess actor <handle>` | Show an actor profile with aliases, infrastructure, notes, and history |
95
- | `voidaccess actor <handle> --timeline` | Show an actor activity timeline |
96
- | `voidaccess actor <handle> --note "text"` | Append an analyst note to an actor profile |
97
- | `voidaccess timeline <handle>` | Shortcut for `voidaccess actor <handle> --timeline` |
98
- | `voidaccess configure` | Setup wizard |
99
- | `voidaccess configure proxy` | ScrapingAnt API key, proxy type, and routing toggles. Flags: `--enable / --disable` (API gate), `--enable-proxy / --disable-proxy` (proxy gate), `--show` (masked state) |
100
-
101
- Export examples:
102
-
103
- ```bash
104
- voidaccess package investigation.json
105
- voidaccess export investigation.json --format yara
106
- voidaccess export investigation.json --format snort
107
- voidaccess export investigation.json --format suricata
108
- voidaccess status --seeds
109
- ```
110
-
111
- ### CLI vs Docker
112
-
113
- | Feature | CLI | Docker |
114
- |---|---|---|
115
- | Install time | 30 seconds | 5 minutes |
116
- | Dark web scraping | Requires local Tor | Built-in |
117
- | Graph visualization | Terminal TUI | sigma.js |
118
- | Monitoring/alerts | No | Yes |
119
- | Multi-user | No | Yes |
120
- | Persistence | SQLite (`~/.voidaccess`) | PostgreSQL |
121
-
122
- ---
123
-
124
- ## Visual Walkthrough
125
-
126
- ### 1. Intuitive Dashboard
127
- Start investigations with a clean, dark-themed interface designed for high-stakes research.
128
- ![Homepage](./public/homepage.png)
129
-
130
- ### 2. Intelligent Scoping
131
- Refine queries and select investigation depth with precision.
132
- ![Topic Selection](./public/topic_selection.png)
133
-
134
- ### 3. Real-time Pipeline Tracking
135
- Monitor the 13-step autonomous pipeline as it crawls and extracts intelligence.
136
- ![Loading](./public/loading.png)
137
-
138
- ### 4. Interactive Graph Intelligence
139
- Explore connections between entities, onion sites, and threat actors in a dynamic, high-contrast graph.
140
- ![Node Selection](./public/node_selection.png)
141
-
142
- ### 5. Comprehensive Intel Reports
143
- Get structured summaries and actionable artifacts once the scan completes.
144
- ![Scan Completed](./public/scan_completed.png)
145
-
146
- ---
147
-
148
- ## How It Works (The 13-Step Pipeline)
149
-
150
- VoidAccess handles the complexity of dark web research through a rigorous sequence:
151
-
152
- 1. **LLM Query Refinement**: Optimizes search terms for .onion engine indexing.
153
- 2. **Parallel Collection**: Queries 16+ Tor search engines simultaneously with paste sites (Pastebin, dpaste, paste.ee), GitHub, GitLab, and curated RSS security feeds.
154
- 3. **Intelligence Filtering**: LLM filters noise, keeping only relevant intelligence pages.
155
- 4. **Multi-Source Enrichment**: Pulls from AlienVault OTX, abuse.ch, ransomware.live, CISA KEV, Shodan, GreyNoise, AbuseIPDB, Feodo Tracker, C2IntelFeeds, and more — running in parallel with collection.
156
- 5. **Recursive .onion Discovery**: Discovers hidden links via seed URL crawling.
157
- 6. **Vector Cache Check**: Avoids redundant scraping for recently visited pages (24h TTL).
158
- 7. **Tor-Routed Scraping**: Safely fetches page content with a 1MB safety cap.
159
- 8. **Persistence**: Stores new content in the local vector cache.
160
- 9. **Intelligence Merging**: Combines scraped and enriched data for processing.
161
- 10. **Advanced Extraction**: Regex, NER, and LLM-based entity identification.
162
- 11. **Historical Cross-Referencing**: Validates data against seed datasets.
163
- 12. **Graph Construction**: Builds relationship nodes based on co-occurrence.
164
- 13. **Final Intelligence Summary**: LLM generates a structured technical briefing.
165
-
166
- ---
167
-
168
- ## What It Extracts
169
-
170
- The extraction pipeline identifies 55+ entity types:
171
-
172
- | Category | Examples |
173
- |---|---|
174
- | **Cryptocurrency** | Bitcoin, Ethereum, Monero, Litecoin, Zcash, Dogecoin, XRP, Solana, Tron, Bitcoin Cash, Dash, ENS |
175
- | **Network Indicators** | IPv4 addresses, .onion URLs, domains, email addresses, PGP keys |
176
- | **File Indicators** | MD5, SHA1, SHA256 hashes |
177
- | **Credentials** | AWS keys, GitHub tokens, Slack tokens, Discord tokens, JWTs, Google API keys, Stripe keys, generic API keys, stealer log entries |
178
- | **Messaging Handles** | Telegram, Discord, XMPP, Tox, Session, Matrix, Wire, ICQ, Wickr |
179
- | **Network/Forensic** | IPv6, MAC addresses, IPFS CIDs, combo-list entries, YARA rules, MITRE tactics, Exploit-DB IDs, Nuclei templates, seed phrases |
180
- | **Vulnerabilities** | CVE numbers, MITRE ATT&CK techniques and tactics |
181
- | **Threat Actors** | Actor handles, malware families, ransomware group names |
182
- | **Paste Sites** | Pastebin, Ghostbin, Rentry, and similar links |
183
- | **People/Orgs** | Named persons, organization names, locations |
184
-
185
- Parallel collection sources (run alongside Tor search):
186
-
187
- - **Paste sites** — Pastebin, dpaste, paste.ee, Rentry
188
- - **GitHub** — code search and repository READMEs
189
- - **GitLab** — code search and project pages
190
- - **RSS security feeds** — 20 curated feeds (Krebs, BleepingComputer, Talos, Mandiant, CrowdStrike, Unit 42, CISA, and more)
191
- - **Curated .onion seed catalogue** — 31 vetted seeds across 8 categories, scored per query
192
-
193
- Enrichment and quality sources (19 total):
194
-
195
- - **AlienVault OTX** — threat pulses and malware families
196
- - **MalwareBazaar** — malware samples and signatures
197
- - **ThreatFox** — recent IOC feed
198
- - **URLhaus** — malicious URL database
199
- - **ransomware.live** — ransomware group tracking and leak-site seeds
200
- - **CISA KEV** — known exploited vulnerabilities catalog
201
- - **Shodan InternetDB** — passive vulnerability signatures
202
- - **VirusTotal** — file hash AV detection ratio (API key required)
203
- - **GreyNoise** — suppresses known benign scanner IPs from results (API key required)
204
- - **AbuseIPDB** — community IP abuse reports; 1,000 checks/day free
205
- - **Feodo Tracker + C2IntelFeeds** — confirmed C2 IPs for 6 major frameworks; no key required
206
- - **crt.sh** — certificate transparency logs; subdomain enumeration; free
207
- - **URLScan.io** — live domain scan data and malicious verdicts
208
- - **Wayback Machine** — historical domain snapshots for taken-down infrastructure
209
- - **Hybrid Analysis** — behavioral sandbox verdict and AV detection ratio for file hashes
210
- - **HaveIBeenPwned** — breach history for email addresses (paid API key)
211
- - **EmailRep** — email reputation scoring and disposable detection
212
- - **CIRCL PDNS + RDAP** — passive DNS history and WHOIS registration data; free
213
- - **BlockCypher + Etherscan** — blockchain wallet balance and transaction graph
214
-
215
- Export formats:
216
-
217
- - **STIX 2.1** - bundles with indicators, threat actors, malware objects
218
- - **MISP JSON** - events with galaxies for direct import
219
- - **Sigma rules** - auto-generated detection rules from extracted IOCs
220
- - **YARA rules** - generated rules for malware, credentials, infrastructure, and IOC strings
221
- - **Snort rules** - network detection rules for IPs, domains, URLs, and selected IOC content
222
- - **Suricata rules** - Suricata-compatible network rules with the same IOC coverage as Snort
223
- - **IOC package ZIP** - 21-file bundle containing text IOC lists, STIX, MISP, Sigma, YARA, Snort, Suricata, summary, and CSV
224
- - **CSV** - flat entity dumps for spreadsheet analysis
225
-
226
- ---
227
-
228
- ## Actor Intelligence
229
-
230
- VoidAccess v1.5.0 persists actor profiles across investigations in `actor_profiles`, with linked aliases and infrastructure in `actor_aliases` and `actor_infrastructure`. Profiles are populated from threat actor, ransomware group, and handle entities, then enriched with co-occurring infrastructure and timeline events.
231
-
232
- Cross-alias resolution scores five signals: shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation. Use `voidaccess actors` to list profiles, `voidaccess actor <handle>` for the full profile, `voidaccess actor <handle> --timeline` for chronology, and `voidaccess actor <handle> --note "text"` for analyst notes.
233
-
234
- ---
235
-
236
- ## LLM & Enrichment Ecosystem
237
-
238
- ### Supported LLM Providers
239
-
240
- | Provider | Models | Notes |
241
- |---|---|---|
242
- | **OpenRouter** | DeepSeek, Llama 3.3, Claude Haiku | Recommended default; free models available |
243
- | **Groq** | Llama 3.3, Llama 3.1 | Fast inference; free tier |
244
- | **OpenAI** | GPT-4o Mini | API key required |
245
- | **Anthropic** | Claude Haiku | Haiku is the tested default; other models work via manual override. |
246
- | **Google Gemini** | Gemini 1.5 Flash, 2.5 Pro | Free tier via AI Studio |
247
- | **Ollama** | Any local model | Air-gapped; no API key needed |
248
-
249
- The default is **DeepSeek via OpenRouter** — fast and strong on technical security content. With free-tier LLMs (Groq free, OpenRouter free models, or Ollama) the cost is **$0**. With paid models like DeepSeek via OpenRouter it is **under $0.50 per investigation**. For fully air-gapped deployments, Ollama runs entirely locally.
250
-
251
- ### Optional Enrichment API Keys
252
-
253
- All enrichment sources that require a key degrade gracefully when the key is absent — they are skipped without failing the investigation. Keys marked "free" require registration but have no cost.
254
-
255
- | Key | What it does | Free | Sign up |
256
- |---|---|---|---|
257
- | `OTX_API_KEY` | AlienVault OTX threat pulses | Yes | [otx.alienvault.com](https://otx.alienvault.com) |
258
- | `VT_API_KEY` | VirusTotal file hash AV detections | Yes (4 req/min) | [virustotal.com](https://www.virustotal.com) |
259
- | `ABUSECH_API_KEY` | MalwareBazaar, ThreatFox, URLhaus rate limits | Yes | [abuse.ch](https://abuse.ch) |
260
- | `ABUSEIPDB_API_KEY` | Community IP abuse reports; 1,000 checks/day | Yes | [abuseipdb.com/register](https://www.abuseipdb.com/register) |
261
- | `GREYNOISE_API_KEY` | Suppresses known scanner/researcher IPs | Free tier available | [greynoise.io/pricing](https://www.greynoise.io/pricing) |
262
- | `URLSCAN_API_KEY` | Higher rate limits for URLScan.io domain scans | Yes (public results without key) | [urlscan.io/user/signup](https://urlscan.io/user/signup) |
263
- | `HYBRID_ANALYSIS_API_KEY` | Behavioral sandbox analysis for file hashes | Yes | [hybrid-analysis.com/signup](https://www.hybrid-analysis.com/signup) |
264
- | `HIBP_API_KEY` | Email breach history — the most valuable email enrichment | No ($3.50/month) | [haveibeenpwned.com/API/Key](https://haveibeenpwned.com/API/Key) |
265
- | `EMAILREP_API_KEY` | Email reputation scoring; increases rate limits | Yes (reduced rate without key) | [emailrep.io/key](https://emailrep.io/key) |
266
- | `SECURITYTRAILS_API_KEY` | Richer DNS history for domains | Yes (50 queries/month) | [securitytrails.com/corp/api](https://securitytrails.com/corp/api) |
267
- | `GITHUB_TOKEN` | Raises GitHub scraping from 10 to 30 req/min | Free | [github.com/settings/tokens](https://github.com/settings/tokens) |
268
- | `GITLAB_TOKEN` | Raises GitLab scraping from 15 to 60 req/min | Free | [gitlab.com/profile/personal_access_tokens](https://gitlab.com/-/profile/personal_access_tokens) |
269
- | `BLOCKCYPHER_TOKEN` | BTC/ETH wallet balance and transaction graph | Yes | [blockcypher.com](https://www.blockcypher.com) |
270
- | `ETHERSCAN_API_KEY` | ETH wallet lookups | Yes | [etherscan.io/apis](https://etherscan.io/apis) |
271
- | `SCRAPINGANT_API_KEY` + `VOIDACCESS_USE_PROXIES=true` | Optional clearnet proxy for paste sites + RSS feeds (see below) | Yes (free tier) | [scrapingant.com](https://scrapingant.com/?ref=mzliyzh) |
272
-
273
- ### Optional: Clearnet Scraping Proxy (ScrapingAnt)
274
-
275
- When a third-party clearnet site rate-limits or blocks VoidAccess's outbound IP, every paste site fetch and every RSS feed fetch in the same investigation can fail. The optional **ScrapingAnt** integration routes those specific requests through ScrapingAnt — either its Web Scraping API or its Proxy Mode HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
276
-
277
- **What it covers** — paste sites (Pastebin, dpaste, paste.ee, Rentry) and the 20 curated RSS security feeds (Krebs on Security, BleepingComputer, Talos, Mandiant, CrowdStrike, Unit 42, CISA, and others). Nothing else.
278
-
279
- **What it does not cover** — Tor traffic, dark web scraping, and `.onion` fetches are **completely unaffected** by this setting regardless of how it is configured. The proxy only sees the two clearnet sources named above.
280
-
281
- **GitHub and GitLab scraping are also unaffected** — and intentionally so. Both of those scrapers carry authentication tokens (`GITHUB_TOKEN`, `GITLAB_TOKEN`) in their requests. Forwarding those tokens through a third-party proxy would expose them to that third party, which is unacceptable from a security standpoint. Both scrapers always go direct to the GitHub/GitLab API regardless of the proxy setting. This is a permanent design constraint, not something the proxy toggle can override.
282
-
283
- **It's entirely optional.** VoidAccess behaves identically without it — paste sites and RSS feeds are simply fetched directly, exactly as they were in every prior release. Add the key only if you see upstream rate-limiting or blocks affecting those two sources.
284
-
285
- #### Two mutually exclusive transports (v1.6.0)
286
-
287
- Per the [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): *"The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint."* Therefore the two transports below are **alternate transports to the same backend service** — pick ONE per request, never both:
288
-
289
- | Transport | Env var | Required config | What it does |
290
- |---|---|---|---|
291
- | **REST API** | `VOIDACCESS_USE_PROXIES=true` | `SCRAPINGANT_API_KEY` | POSTs the target URL to `api.scrapingant.com/v2/general` and returns the response body. Legacy v1.5.0 toggle. |
292
- | **Proxy Mode** | `VOIDACCESS_USE_PROXY=true` | `SCRAPINGANT_API_KEY` (only) | Routes the request as HTTP CONNECT through `proxy.scrapingant.com:8080` with username string built at connection time per docs: `scrapingant&browser=false&proxy_type=residential\|datacenter`. |
293
-
294
- The Proxy Mode transport also reads `SCRAPINGANT_PROXY_TYPE` to pick the pool: `residential` (default; harder to detect, slightly higher latency) or `datacenter` (faster, cheaper, easier to fingerprint).
295
-
296
- **Missing credentials leave both transports inactive.** Setting either transport env var to `true` without `SCRAPINGANT_API_KEY` is a no-op for that transport. No errors, no surprises.
297
-
298
- **If both transport env vars are set, Proxy Mode wins** with a one-shot info log at runtime — there is no chained mode (Proxy Mode is documented as "the same functionality" as the REST API, so stacking them would double-charge without adding capability).
299
-
300
- **How to turn it on** — all four surfaces, covering either transport:
301
-
302
- | Surface | How |
303
- |---|---|
304
- | CLI configure wizard | `voidaccess configure` then `voidaccess configure keys` — paste sites and RSS feeds will be flagged with their honest "never Tor" description before any field is asked for. The interactive prompt covers the key, pool type, and asks about each transport separately. |
305
- | `voidaccess configure proxy` (subcommand) | Interactive prompt for key + pool type. Non-interactive flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (prints masked key `abcd…5678`, pool type, and both transport states). |
306
- | `setup.sh` during Docker install | Group F in the Enrichment Keys step; prompts for key + pool type, asks about each transport toggle separately. |
307
- | `--use-proxies` flag (single run) | `voidaccess investigate "query" --use-proxies` sets `VOIDACCESS_USE_PROXIES=true` (REST API transport) for the current process only, leaves the on-disk config untouched. |
308
- | Docker / web settings page | Settings → API Keys → ScrapingAnt. Stored encrypted at rest via the existing per-user `UserApiKey` mechanism (Fernet AES-128). |
309
-
310
- **Referral signup:** [https://scrapingant.com/?ref=mzliyzh](https://scrapingant.com/?ref=mzliyzh) (referral bonus applied on first paid plan; a free tier is available for low-volume use).
311
-
312
- ---
313
-
314
- ## Cost Comparison
315
-
316
- | Platform | Annual Cost | Self-Hosted | Open Source |
317
- |---|---|---|---|
318
- | Recorded Future | ~$25,000 | No | No |
319
- | DarkOwl | ~$15,000 | No | No |
320
- | Flare | ~$8,000 | No | No |
321
- | **VoidAccess** | **Free** | **Yes** | **Yes** |
322
-
323
- Free with Groq, OpenRouter free models, or Ollama. Under $0.50 per investigation with paid models like DeepSeek.
324
-
325
- ---
326
-
327
- ## Recent Updates
328
-
329
- - **10 new enrichment sources**: GreyNoise (scanner suppression), AbuseIPDB, Feodo Tracker, C2IntelFeeds, crt.sh, URLScan.io, Wayback Machine, Hybrid Analysis, HaveIBeenPwned, EmailRep
330
- - **4 new clearnet collection sources**: paste sites, GitHub code search, GitLab code search, and 20 curated RSS security feeds
331
- - **Curated .onion seed list** — 31 seeds across 8 categories, relevance-scored per query
332
- - **CIRCL passive DNS + RDAP WHOIS** — infrastructure cluster detection for IPs and domains
333
- - **Investigation cancellation** — cancel a running pipeline at any checkpoint; partial results are preserved
334
- - **Sources panel** — per-investigation breakdown of which sources ran and what each returned
335
- - **Infrastructure clusters panel** — groups IPs and domains sharing ASN, CIDR block, or WHOIS registrant
336
- - **Entity quality badges** — C2, Malicious, Breached, Disposable, Archived, Taken Down, AV ratio
337
- - **GreyNoise suppression** — known benign scanner IPs are filtered from entity results automatically
338
- - **MALWARE_FAMILY auto-creation** — from confirmed family names returned by hash enrichment
339
-
340
- ---
341
-
342
- ## Docker Setup
343
-
344
- ### Prerequisites
345
- - Docker and Docker Compose
346
- - Python 3 (recommended — used by setup.sh for secret generation; Linux/macOS fall back to /dev/urandom if absent, Windows setup.bat may require it)
347
- - One LLM API key — or Ollama for fully local operation (free)
348
-
349
- **Free LLM options (no credit card required):**
350
- - [Groq](https://console.groq.com) — fast, free tier, Llama 3.3 70B
351
- - [OpenRouter](https://openrouter.ai) — free models including DeepSeek and Llama 3.3
352
- - [Google AI Studio](https://aistudio.google.com) — Gemini free tier
353
- - [Ollama](https://ollama.ai) — fully local, no internet required
354
-
355
- ### Installation
356
-
357
- **macOS / Linux / WSL:**
358
- ```bash
359
- bash setup.sh
360
- ```
361
-
362
- **Windows (native):**
363
- ```bat
364
- setup.bat
365
- ```
366
-
367
- The interactive wizard creates `.env`, generates `JWT_SECRET` and `POSTGRES_PASSWORD`, prompts for your LLM provider (one of: Groq, OpenRouter, Anthropic, OpenAI, Google Gemini, or Ollama), optionally collects threat-intel keys (`OTX_API_KEY`, `VT_API_KEY`), optionally enables Redis, sets the admin password, and starts the Docker stack.
368
-
369
- <div align="center">
370
- <img src="./public/setup_gif.gif" width="100%" alt="Setup walkthrough">
371
- </div>
372
-
373
- ### Starting and Stopping
374
-
375
- **macOS / Linux / WSL:**
376
- ```bash
377
- ./start.sh # build and start all services
378
- ./stop.sh # stop all services
379
- ```
380
-
381
- **Windows (native):**
382
- ```bat
383
- start.bat :: build and start all services
384
- stop.bat :: stop all services
385
- ```
386
-
387
- Once running, open **http://localhost:3001** in your browser.
388
-
389
- <div align="center">
390
- <img src="./public/start_gif.gif" width="100%" alt="Starting VoidAccess">
391
- </div>
392
-
393
- ### Getting a JWT (API access)
394
-
395
- `setup.sh` creates a default admin account at `admin@voidaccess.tech` with the password you provided during the wizard.
396
-
397
- ```bash
398
- curl -X POST http://localhost:8000/auth/login \
399
- -H "Content-Type: application/json" \
400
- -d '{"email": "admin@voidaccess.tech", "password": "yourpassword"}'
401
- ```
402
-
403
- Use the returned token in an `Authorization: Bearer <token>` header for API requests.
404
-
405
- ### Running your first investigation (API)
406
-
407
- ```bash
408
- curl -X POST http://localhost:8000/investigations \
409
- -H "Authorization: Bearer <your_jwt>" \
410
- -H "Content-Type: application/json" \
411
- -d '{"query": "LockBit ransomware infrastructure 2024"}'
412
- ```
413
-
414
- The investigation starts in `pending`, moves to `processing`, and completes in 3–5 minutes with a summary, extracted entities, relationship graph, and export-ready artifacts.
415
-
416
- ---
417
-
418
- ## Architecture
419
-
420
- Four Docker services:
421
-
422
- | Service | Technology | Port |
423
- |---|---|---|
424
- | **postgres** | PostgreSQL 16 | 5433 |
425
- | **tor** | Tor SOCKS5 proxy | 9050 |
426
- | **fastapi** | Python 3.11, FastAPI, SQLAlchemy | 8000 |
427
- | **nextjs** | Next.js 14, TypeScript, Tailwind | 3001 |
428
-
429
- The FastAPI backend runs a 13-step pipeline triggered by `POST /investigations`. Every external call has `try/except` with graceful fallback — the pipeline never hard-crashes. API docs are available at **http://localhost:8000/docs** when running.
430
-
431
- ### Source Tree
432
-
433
- ```
434
- voidaccess/
435
- ├── analysis/ # Temporal patterns, OPSEC failure detection, anomaly scoring
436
- ├── api/ # FastAPI routes; investigation pipeline orchestrator
437
- ├── auth/ # JWT authentication and user management
438
- ├── crawler/ # Recursive .onion link discovery spider
439
- ├── db/ # SQLAlchemy ORM models and Alembic migrations
440
- ├── docs/ # Contributing, security, and usage policy documents
441
- ├── export/ # STIX 2.1, MISP, Sigma, and CSV artifact generation
442
- ├── extractor/ # Regex → NER → LLM entity extraction pipeline
443
- ├── fingerprint/ # Stylometry vectors and actor style profiling
444
- ├── graph/ # NetworkX MultiDiGraph builder and pyvis visualization
445
- ├── i18n/ # Language detection, translation, multilingual query expansion
446
- ├── infra/ # Docker Compose, Tor config, Postgres init
447
- ├── monitor/ # APScheduler watches, change diffing, Telegram/SMTP alerts
448
- ├── public/ # Logo, walkthrough screenshots, demo media
449
- ├── scraper/ # Async aiohttp and Playwright scrapers over Tor
450
- ├── scripts/ # Seed imports and operational utilities
451
- ├── search/ # 16+ .onion search engine fan-out with circuit breaker
452
- ├── sources/ # DarkSearch, Telegram, paste sites, threat-intel feeds
453
- ├── tests/ # Pytest suite (one test file per module)
454
- ├── utils/ # Async helpers, content safety, encryption, defang
455
- ├── vector/ # ChromaDB cache with sentence-transformer embeddings
456
- ├── voidaccess/ # LangChain LLM wrappers and provider registry
457
- └── web/ # Next.js 14 + TypeScript + Tailwind frontend
458
- ```
459
-
460
- > **Note on `voidaccess/voidaccess/`** — the nested directory holds the core LLM utilities (`llm.py`, `llm_utils.py`) and is imported at runtime by the API routes (`from voidaccess.llm import ...`). The nested naming reflects the original package structure from the project's pre-API baseline.
461
-
462
- ---
463
-
464
- ## Troubleshooting
465
-
466
- **Services won't start:**
467
- ```bash
468
- docker compose -f infra/docker-compose.yml --project-directory . ps
469
- docker compose -f infra/docker-compose.yml --project-directory . logs -f
470
- ```
471
-
472
- **Port conflicts** (3001 or 8000 already in use):
473
- - macOS/Linux: `lsof -i :3001` to find what's using it
474
- - Windows: `netstat -ano | findstr :3001`
475
-
476
- **Tor not connecting:** The Tor service takes 30–60 seconds to bootstrap on first start. Check health with `./check_health.sh`. This script verifies Tor proxy connectivity, LLM provider reachability, and dark web search engine availability.
477
-
478
- **No .env file:** Run `bash setup.sh` (macOS/Linux/WSL) or `setup.bat` (Windows) before starting.
479
-
480
- **Docker build takes a long time:** First build downloads ~3GB of layers. Subsequent builds use the Docker layer cache and are much faster.
481
-
482
- ---
483
-
484
- ## Content Safety
485
-
486
- Every investigation runs through mandatory content safety filters before results reach the UI or appear in the graph. CSAM, gore, snuff content, and other prohibited material are blocked at the query stage, URL validation, content scanning, and post-extraction entity filtering. These filters are mandatory and cannot be disabled.
487
-
488
- ---
489
-
490
- ## Acceptable Use
491
-
492
- VoidAccess is for authorized security research, threat intelligence gathering, and law enforcement purposes only. Users are responsible for ensuring compliance with all local laws and ethical standards. See [docs/USAGE_POLICY.md](docs/USAGE_POLICY.md) for the full policy.
493
-
494
- ---
495
-
496
- ## Contributing
497
-
498
- Contributions are welcome. See [docs/CONTRIBUTING.md](docs/CONTRIBUTING.md) for setup instructions, code standards, and the PR process. Please read [docs/CODE_OF_CONDUCT.md](docs/CODE_OF_CONDUCT.md) before participating.
499
-
500
- To report a security vulnerability, see [docs/SECURITY.md](docs/SECURITY.md).
501
-
502
- ---
503
-
504
- ## License
505
-
506
- MIT License. See [LICENSE](LICENSE) for details.
1
+ Metadata-Version: 2.4
2
+ Name: voidaccess
3
+ Version: 1.6.2
4
+ Summary: Dark web OSINT CLI — automated threat intelligence from query to report
5
+ Author: VoidAccess
6
+ License-Expression: MIT
7
+ Keywords: osint,darkweb,threat-intelligence,tor,cli
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Information Technology
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Security
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: typer>=0.12
20
+ Requires-Dist: rich>=13
21
+ Requires-Dist: textual>=0.60
22
+ Requires-Dist: aiohttp>=3.9
23
+ Requires-Dist: aiohttp-socks>=0.8
24
+ Requires-Dist: sqlalchemy>=2.0
25
+ Requires-Dist: aiosqlite>=0.20
26
+ Requires-Dist: langchain>=0.2
27
+ Requires-Dist: langchain-openai>=0.1
28
+ Requires-Dist: langchain-anthropic>=0.1
29
+ Requires-Dist: langchain-google-genai>=1.0
30
+ Requires-Dist: langchain-groq>=0.1
31
+ Requires-Dist: langchain-ollama>=0.1
32
+ Requires-Dist: python-dotenv>=1.0
33
+ Requires-Dist: httpx>=0.27
34
+ Requires-Dist: spacy>=3.7
35
+ Requires-Dist: beautifulsoup4>=4.12
36
+ Requires-Dist: feedparser>=6.0
37
+ Requires-Dist: python-dateutil>=2.9
38
+ Requires-Dist: trafilatura>=1.6
39
+ Requires-Dist: requests>=2.31
40
+ Requires-Dist: python-socks>=2.4
41
+ Requires-Dist: tldextract>=5.1
42
+ Provides-Extra: dev
43
+ Requires-Dist: pytest; extra == "dev"
44
+ Requires-Dist: pytest-asyncio; extra == "dev"
45
+ Dynamic: license-file
46
+
47
+ <div align="center">
48
+ <img src="./public/logo_circle.png" width="160" alt="VoidAccess Logo">
49
+ <h1>VoidAccess</h1>
50
+ <p><strong>A self-hosted OSINT platform for dark web threat intelligence.</strong></p>
51
+ <p>Automate the entire investigation workflow from query refinement to relationship mapping in 13 autonomous pipeline steps.</p>
52
+ </div>
53
+
54
+ ---
55
+
56
+ ## The OSINT Powerhouse
57
+
58
+ Commercial threat intelligence platforms often charge prohibitive annual fees for capabilities that can be run on private hardware. **VoidAccess** democratizes high-end dark web intelligence by providing an automated, end-to-end workflow:
59
+
60
+ - **Query Refinement**: Intelligent search term optimization using LLMs.
61
+ - **Multilingual Search**: Deep-web fan-out across English, Russian, and Chinese engines.
62
+ - **Entity Extraction**: Autonomous identification of wallets, IOCs, PGP keys, and more.
63
+ - **Relationship Mapping**: Dynamic graph generation from extracted data co-occurrence.
64
+ - **Structured Export**: STIX 2.1, MISP, Sigma, and CSV support.
65
+
66
+ ---
67
+
68
+ ## What's New in v1.6.0
69
+
70
+ - **Optional clearnet ScrapingAnt integration** — paste sites and RSS feeds can now be routed through ScrapingAnt. Affects clearnet scraping only; Tor, `.onion`, GitHub, and GitLab traffic are never affected.
71
+ - **Three independent products** — Web Scraping API, Residential Proxy transport, and Datacenter Proxy transport are documented separately so the API key is not conflated with the proxy credentials.
72
+ - **One-shot CLI flags** — `voidaccess investigate --use-scraping-api` and `voidaccess investigate --use-proxies` select the REST API or proxy transport for a single run.
73
+ - **`SCRAPINGANT_PROXY_TYPE`** — `residential` (default) or `datacenter`; this controls the proxy pool used by the proxy transports and is not a separate hostname.
74
+ - **New CLI surfaces** — `voidaccess configure proxy` now prompts for the ScrapingAnt keys and transport toggles in a clearly separated flow, plus `--enable-proxy / --disable-proxy` for non-interactive proxy transport toggling and `--show` for masked state inspection.
75
+ - **55 → 74 proxy-config tests** — covers both transports, the `apply_env()` independent-toggle guarantee, the transport-selection logic, and the masked `--show` output.
76
+
77
+ ---
78
+
79
+ ## What's New in v1.5.0
80
+
81
+ - 37 new entity types across crypto, credentials, messaging, and network/forensic indicators.
82
+ - YARA, Snort, Suricata, and IOC package ZIP exports.
83
+ - Persistent actor profiles with aliases, infrastructure, notes, and timelines.
84
+ - Cross-alias resolution using shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation.
85
+ - Backend graph community detection and path-between-nodes queries.
86
+ - CLI graph browser path finder and frontend Find Path highlighting.
87
+ - Per-phase pipeline timeouts for enrichment, graph, summary, finalize, and parallel sources.
88
+ - `sources_used` and `infrastructure_clusters` persist in investigation metadata.
89
+ - Cross-run enrichment cache with Redis, SQLite, and memory backends.
90
+ - Auto-discovery and weekly validation of `.onion` seeds.
91
+
92
+ ---
93
+
94
+ ## Quick Start
95
+
96
+ ### Option A - CLI (no Docker, 30 seconds)
97
+
98
+ ```bash
99
+ pip install voidaccess
100
+ voidaccess configure
101
+ voidaccess investigate "LockBit ransomware"
102
+ ```
103
+
104
+ <div align="center">
105
+ <img src="./public/cli_investigation_gif.gif" alt="VoidAccess CLI investigation walkthrough" width="900">
106
+ </div>
107
+
108
+ Requires local Tor for dark web sources:
109
+
110
+ - https://torproject.org
111
+ - Use `--no-tor` for clearnet-only investigations
112
+
113
+ The CLI stores config in `~/.voidaccess/config.json` and writes results to `~/.voidaccess/results/`.
114
+
115
+ ### Option B - Docker (full stack, 5 minutes)
116
+
117
+ ```bash
118
+ git clone https://github.com/KatrielMoses/voidaccess
119
+ cd voidaccess
120
+ bash setup.sh
121
+ ```
122
+
123
+ The Docker stack includes PostgreSQL, Tor, FastAPI, and Next.js.
124
+
125
+ ### CLI Commands
126
+
127
+ | Command | Description |
128
+ |---|---|
129
+ | `voidaccess investigate` | Run an investigation |
130
+ | `voidaccess show` | Interactive entity browser |
131
+ | `voidaccess export` | Export STIX/MISP/Sigma/YARA/Snort/Suricata/package/CSV/MD/JSON |
132
+ | `voidaccess package <file>` | Export an IOC ZIP bundle |
133
+ | `voidaccess enrich` | Re-enrich saved results |
134
+ | `voidaccess list` | List saved investigations |
135
+ | `voidaccess status` | Config, API key, cache, engine, and seed status |
136
+ | `voidaccess actors` | List persistent actor profiles |
137
+ | `voidaccess actor <handle>` | Show an actor profile with aliases, infrastructure, notes, and history |
138
+ | `voidaccess actor <handle> --timeline` | Show an actor activity timeline |
139
+ | `voidaccess actor <handle> --note "text"` | Append an analyst note to an actor profile |
140
+ | `voidaccess timeline <handle>` | Shortcut for `voidaccess actor <handle> --timeline` |
141
+ | `voidaccess configure` | Setup wizard |
142
+ | `voidaccess configure proxy` | ScrapingAnt key, username, type, and routing toggles. Flags: `--enable / --disable` (API gate), `--enable-proxy / --disable-proxy` (proxy gate), `--show` (masked state) |
143
+
144
+ Export examples:
145
+
146
+ ```bash
147
+ voidaccess package investigation.json
148
+ voidaccess export investigation.json --format yara
149
+ voidaccess export investigation.json --format snort
150
+ voidaccess export investigation.json --format suricata
151
+ voidaccess status --seeds
152
+ ```
153
+
154
+ ### CLI vs Docker
155
+
156
+ | Feature | CLI | Docker |
157
+ |---|---|---|
158
+ | Install time | 30 seconds | 5 minutes |
159
+ | Dark web scraping | Requires local Tor | Built-in |
160
+ | Graph visualization | Terminal TUI | sigma.js |
161
+ | Monitoring/alerts | No | Yes |
162
+ | Multi-user | No | Yes |
163
+ | Persistence | SQLite (`~/.voidaccess`) | PostgreSQL |
164
+
165
+ ---
166
+
167
+ ## Visual Walkthrough
168
+
169
+ ### 1. Intuitive Dashboard
170
+ Start investigations with a clean, dark-themed interface designed for high-stakes research.
171
+ ![Homepage](./public/homepage.png)
172
+
173
+ ### 2. Intelligent Scoping
174
+ Refine queries and select investigation depth with precision.
175
+ ![Topic Selection](./public/topic_selection.png)
176
+
177
+ ### 3. Real-time Pipeline Tracking
178
+ Monitor the 13-step autonomous pipeline as it crawls and extracts intelligence.
179
+ ![Loading](./public/loading.png)
180
+
181
+ ### 4. Interactive Graph Intelligence
182
+ Explore connections between entities, onion sites, and threat actors in a dynamic, high-contrast graph.
183
+ ![Node Selection](./public/node_selection.png)
184
+
185
+ ### 5. Comprehensive Intel Reports
186
+ Get structured summaries and actionable artifacts once the scan completes.
187
+ ![Scan Completed](./public/scan_completed.png)
188
+
189
+ ---
190
+
191
+ ## How It Works (The 13-Step Pipeline)
192
+
193
+ VoidAccess handles the complexity of dark web research through a rigorous sequence:
194
+
195
+ 1. **LLM Query Refinement**: Optimizes search terms for .onion engine indexing.
196
+ 2. **Parallel Collection**: Queries 16+ Tor search engines simultaneously with paste sites (Pastebin, dpaste, paste.ee), GitHub, GitLab, and curated RSS security feeds.
197
+ 3. **Intelligence Filtering**: LLM filters noise, keeping only relevant intelligence pages.
198
+ 4. **Multi-Source Enrichment**: Pulls from AlienVault OTX, abuse.ch, ransomware.live, CISA KEV, Shodan, GreyNoise, AbuseIPDB, Feodo Tracker, C2IntelFeeds, and more — running in parallel with collection.
199
+ 5. **Recursive .onion Discovery**: Discovers hidden links via seed URL crawling.
200
+ 6. **Vector Cache Check**: Avoids redundant scraping for recently visited pages (24h TTL).
201
+ 7. **Tor-Routed Scraping**: Safely fetches page content with a 1MB safety cap.
202
+ 8. **Persistence**: Stores new content in the local vector cache.
203
+ 9. **Intelligence Merging**: Combines scraped and enriched data for processing.
204
+ 10. **Advanced Extraction**: Regex, NER, and LLM-based entity identification.
205
+ 11. **Historical Cross-Referencing**: Validates data against seed datasets.
206
+ 12. **Graph Construction**: Builds relationship nodes based on co-occurrence.
207
+ 13. **Final Intelligence Summary**: LLM generates a structured technical briefing.
208
+
209
+ ---
210
+
211
+ ## What It Extracts
212
+
213
+ The extraction pipeline identifies 55+ entity types:
214
+
215
+ | Category | Examples |
216
+ |---|---|
217
+ | **Cryptocurrency** | Bitcoin, Ethereum, Monero, Litecoin, Zcash, Dogecoin, XRP, Solana, Tron, Bitcoin Cash, Dash, ENS |
218
+ | **Network Indicators** | IPv4 addresses, .onion URLs, domains, email addresses, PGP keys |
219
+ | **File Indicators** | MD5, SHA1, SHA256 hashes |
220
+ | **Credentials** | AWS keys, GitHub tokens, Slack tokens, Discord tokens, JWTs, Google API keys, Stripe keys, generic API keys, stealer log entries |
221
+ | **Messaging Handles** | Telegram, Discord, XMPP, Tox, Session, Matrix, Wire, ICQ, Wickr |
222
+ | **Network/Forensic** | IPv6, MAC addresses, IPFS CIDs, combo-list entries, YARA rules, MITRE tactics, Exploit-DB IDs, Nuclei templates, seed phrases |
223
+ | **Vulnerabilities** | CVE numbers, MITRE ATT&CK techniques and tactics |
224
+ | **Threat Actors** | Actor handles, malware families, ransomware group names |
225
+ | **Paste Sites** | Pastebin, Ghostbin, Rentry, and similar links |
226
+ | **People/Orgs** | Named persons, organization names, locations |
227
+
228
+ Parallel collection sources (run alongside Tor search):
229
+
230
+ - **Paste sites** — Pastebin, dpaste, paste.ee, Rentry
231
+ - **GitHub** — code search and repository READMEs
232
+ - **GitLab** — code search and project pages
233
+ - **RSS security feeds** — 20 curated feeds (Krebs, BleepingComputer, Talos, Mandiant, CrowdStrike, Unit 42, CISA, and more)
234
+ - **Curated .onion seed catalogue** — 31 vetted seeds across 8 categories, scored per query
235
+
236
+ Enrichment and quality sources (19 total):
237
+
238
+ - **AlienVault OTX** — threat pulses and malware families
239
+ - **MalwareBazaar** — malware samples and signatures
240
+ - **ThreatFox** — recent IOC feed
241
+ - **URLhaus** — malicious URL database
242
+ - **ransomware.live** — ransomware group tracking and leak-site seeds
243
+ - **CISA KEV** — known exploited vulnerabilities catalog
244
+ - **Shodan InternetDB** — passive vulnerability signatures
245
+ - **VirusTotal** — file hash AV detection ratio (API key required)
246
+ - **GreyNoise** — suppresses known benign scanner IPs from results (API key required)
247
+ - **AbuseIPDB** — community IP abuse reports; 1,000 checks/day free
248
+ - **Feodo Tracker + C2IntelFeeds** — confirmed C2 IPs for 6 major frameworks; no key required
249
+ - **crt.sh** — certificate transparency logs; subdomain enumeration; free
250
+ - **URLScan.io** — live domain scan data and malicious verdicts
251
+ - **Wayback Machine** — historical domain snapshots for taken-down infrastructure
252
+ - **Hybrid Analysis** — behavioral sandbox verdict and AV detection ratio for file hashes
253
+ - **HaveIBeenPwned** — breach history for email addresses (paid API key)
254
+ - **EmailRep** — email reputation scoring and disposable detection
255
+ - **CIRCL PDNS + RDAP** — passive DNS history and WHOIS registration data; free
256
+ - **BlockCypher + Etherscan** — blockchain wallet balance and transaction graph
257
+
258
+ Export formats:
259
+
260
+ - **STIX 2.1** — bundles with indicators, threat actors, malware objects
261
+ - **MISP JSON** — events with galaxies for direct import
262
+ - **Sigma rules** — auto-generated detection rules from extracted IOCs
263
+ - **YARA rules** - generated rules for malware, credentials, infrastructure, and IOC strings
264
+ - **Snort rules** - network detection rules for IPs, domains, URLs, and selected IOC content
265
+ - **Suricata rules** - Suricata-compatible network rules with the same IOC coverage as Snort
266
+ - **IOC package ZIP** - 21-file bundle containing text IOC lists, STIX, MISP, Sigma, YARA, Snort, Suricata, summary, and CSV
267
+ - **CSV** - flat entity dumps for spreadsheet analysis
268
+
269
+ ---
270
+
271
+ ## Actor Intelligence
272
+
273
+ VoidAccess v1.5.0 persists actor profiles across investigations in `actor_profiles`, with linked aliases and infrastructure in `actor_aliases` and `actor_infrastructure`. Profiles are populated from threat actor, ransomware group, and handle entities, then enriched with co-occurring infrastructure and timeline events.
274
+
275
+ Cross-alias resolution scores five signals: shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation. Use `voidaccess actors` to list profiles, `voidaccess actor <handle>` for the full profile, `voidaccess actor <handle> --timeline` for chronology, and `voidaccess actor <handle> --note "text"` for analyst notes.
276
+
277
+ ---
278
+
279
+ ## LLM & Enrichment Ecosystem
280
+
281
+ ### Supported LLM Providers
282
+
283
+ | Provider | Models | Notes |
284
+ |---|---|---|
285
+ | **OpenRouter** | DeepSeek, Llama 3.3, Claude Haiku | Recommended default; free models available |
286
+ | **Groq** | Llama 3.3, Llama 3.1 | Fast inference; free tier |
287
+ | **OpenAI** | GPT-4o Mini | API key required |
288
+ | **Anthropic** | Claude Haiku | Haiku is the tested default; other models work via manual override. |
289
+ | **Google Gemini** | Gemini 1.5 Flash, 2.5 Pro | Free tier via AI Studio |
290
+ | **Ollama** | Any local model | Air-gapped; no API key needed |
291
+
292
+ The default is **DeepSeek via OpenRouter** — fast and strong on technical security content. With free-tier LLMs (Groq free, OpenRouter free models, or Ollama) the cost is **$0**. With paid models like DeepSeek via OpenRouter it is **under $0.50 per investigation**. For fully air-gapped deployments, Ollama runs entirely locally.
293
+
294
+ ### Optional Enrichment API Keys
295
+
296
+ All enrichment sources that require a key degrade gracefully when the key is absent — they are skipped without failing the investigation. Keys marked "free" require registration but have no cost.
297
+
298
+ | Key | What it does | Free | Sign up |
299
+ |---|---|---|---|
300
+ | `OTX_API_KEY` | AlienVault OTX threat pulses | Yes | [otx.alienvault.com](https://otx.alienvault.com) |
301
+ | `VT_API_KEY` | VirusTotal file hash AV detections | Yes (4 req/min) | [virustotal.com](https://www.virustotal.com) |
302
+ | `ABUSECH_API_KEY` | MalwareBazaar, ThreatFox, URLhaus rate limits | Yes | [abuse.ch](https://abuse.ch) |
303
+ | `ABUSEIPDB_API_KEY` | Community IP abuse reports; 1,000 checks/day | Yes | [abuseipdb.com/register](https://www.abuseipdb.com/register) |
304
+ | `GREYNOISE_API_KEY` | Suppresses known scanner/researcher IPs | Free tier available | [greynoise.io/pricing](https://www.greynoise.io/pricing) |
305
+ | `URLSCAN_API_KEY` | Higher rate limits for URLScan.io domain scans | Yes (public results without key) | [urlscan.io/user/signup](https://urlscan.io/user/signup) |
306
+ | `HYBRID_ANALYSIS_API_KEY` | Behavioral sandbox analysis for file hashes | Yes | [hybrid-analysis.com/signup](https://www.hybrid-analysis.com/signup) |
307
+ | `HIBP_API_KEY` | Email breach history — the most valuable email enrichment | No ($3.50/month) | [haveibeenpwned.com/API/Key](https://haveibeenpwned.com/API/Key) |
308
+ | `EMAILREP_API_KEY` | Email reputation scoring; increases rate limits | Yes (reduced rate without key) | [emailrep.io/key](https://emailrep.io/key) |
309
+ | `SECURITYTRAILS_API_KEY` | Richer DNS history for domains | Yes (50 queries/month) | [securitytrails.com/corp/api](https://securitytrails.com/corp/api) |
310
+ | `GITHUB_TOKEN` | Raises GitHub scraping from 10 to 30 req/min | Free | [github.com/settings/tokens](https://github.com/settings/tokens) |
311
+ | `GITLAB_TOKEN` | Raises GitLab scraping from 15 to 60 req/min | Free | [gitlab.com/-/profile/personal_access_tokens](https://gitlab.com/-/profile/personal_access_tokens) |
312
+ | `BLOCKCYPHER_TOKEN` | BTC/ETH wallet balance and transaction graph | Yes | [blockcypher.com](https://www.blockcypher.com) |
313
+ | `ETHERSCAN_API_KEY` | ETH wallet lookups | Yes | [etherscan.io/apis](https://etherscan.io/apis) |
314
+ | `SCRAPINGANT_API_KEY` + `VOIDACCESS_USE_PROXIES=true` | Optional clearnet proxy for paste sites + RSS feeds (see below) | Yes (free tier) | [scrapingant.com](https://scrapingant.com/?ref=mzliyzh) |
315
+
316
+ ### Optional: Clearnet Scraping Proxy (ScrapingAnt)
317
+
318
+ When a third-party clearnet site rate-limits or blocks VoidAccess's outbound IP, paste-site and RSS-feed fetches can be routed through ScrapingAnt. This integration is optional and affects only clearnet scraping.
319
+
320
+ | Product | What it is | Credentials | Persistent config | One-shot CLI flag | Notes |
321
+ |---|---|---|---|---|---|
322
+ | Web Scraping API | REST transport for paste sites and RSS feeds | `SCRAPINGANT_API_KEY` | `VOIDACCESS_USE_PROXIES=true` | `--use-scraping-api` | POSTs the target URL to `https://api.scrapingant.com/v2/general` and is billed in request credits. |
323
+ | Residential Proxy transport | HTTP CONNECT proxy for paste sites and RSS feeds | `SCRAPINGANT_PROXY_USERNAME` + `SCRAPINGANT_PROXY_PASSWORD` | `VOIDACCESS_USE_PROXY=true` | `--use-proxies` | Uses `residential.scrapingant.com:8080` for HTTP proxying or `:443` for HTTPS tunneling and is billed by traffic volume. |
324
+ | Datacenter Proxy transport | Same proxy flow as Residential, but a different proxy pool | `SCRAPINGANT_PROXY_USERNAME` + `SCRAPINGANT_PROXY_PASSWORD` | `SCRAPINGANT_PROXY_TYPE=datacenter` plus `VOIDACCESS_USE_PROXY=true` | `--use-proxies` | Configuration exists, but live verification is still open; do not assume access is provisioned on every account. |
325
+
326
+ The proxy transports use `SCRAPINGANT_PROXY_TYPE=residential` or `SCRAPINGANT_PROXY_TYPE=datacenter` to choose the pool. The REST API transport does not use the proxy username/password pair.
327
+
328
+ The integration is strictly scoped:
329
+
330
+ - Tor and `.onion` traffic are completely unaffected in every configuration.
331
+ - GitHub and GitLab scraping are permanently excluded because their requests carry `GITHUB_TOKEN` and `GITLAB_TOKEN`.
332
+ - If both the REST API and proxy transport are enabled, the proxy transport wins for that request and VoidAccess logs the choice once.
333
+ - There is no chained mode. Each request uses exactly one transport, and failures fall back to direct fetches.
334
+
335
+ How to turn it on:
336
+
337
+ - `voidaccess configure proxy` for the persistent config path.
338
+ - `voidaccess investigate "query" --use-scraping-api` for a one-shot REST API run.
339
+ - `voidaccess investigate "query" --use-proxies` for a one-shot proxy run.
340
+ - `setup.sh` and the Docker / web settings path provide the same persistent storage surfaces as the rest of the project.
341
+
342
+ Referral signup: [https://scrapingant.com/?ref=mzliyzh](https://scrapingant.com/?ref=mzliyzh)
343
+
344
+ ---
345
+
346
+ ## Cost Comparison
347
+
348
+ | Platform | Annual Cost | Self-Hosted | Open Source |
349
+ |---|---|---|---|
350
+ | Recorded Future | ~$25,000 | No | No |
351
+ | DarkOwl | ~$15,000 | No | No |
352
+ | Flare | ~$8,000 | No | No |
353
+ | **VoidAccess** | **Free** | **Yes** | **Yes** |
354
+
355
+ Free with Groq, OpenRouter free models, or Ollama. Under $0.50 per investigation with paid models like DeepSeek.
356
+
357
+ ---
358
+
359
+ ## Recent Updates
360
+
361
+ - **10 new enrichment sources**: GreyNoise (scanner suppression), AbuseIPDB, Feodo Tracker, C2IntelFeeds, crt.sh, URLScan.io, Wayback Machine, Hybrid Analysis, HaveIBeenPwned, EmailRep
362
+ - **4 new clearnet collection sources**: paste sites, GitHub code search, GitLab code search, and 20 curated RSS security feeds
363
+ - **Curated .onion seed list** — 31 seeds across 8 categories, relevance-scored per query
364
+ - **CIRCL passive DNS + RDAP WHOIS** — infrastructure cluster detection for IPs and domains
365
+ - **Investigation cancellation** — cancel a running pipeline at any checkpoint; partial results are preserved
366
+ - **Sources panel** — per-investigation breakdown of which sources ran and what each returned
367
+ - **Infrastructure clusters panel** — groups IPs and domains sharing ASN, CIDR block, or WHOIS registrant
368
+ - **Entity quality badges** — C2, Malicious, Breached, Disposable, Archived, Taken Down, AV ratio
369
+ - **GreyNoise suppression** — known benign scanner IPs are filtered from entity results automatically
370
+ - **MALWARE_FAMILY auto-creation** from confirmed family names returned by hash enrichment
371
+
372
+ ---
373
+
374
+ ## Docker Setup
375
+
376
+ ### Prerequisites
377
+ - Docker and Docker Compose
378
+ - Python 3 (recommended — used by setup.sh for secret generation; Linux/macOS fall back to /dev/urandom if absent, Windows setup.bat may require it)
379
+ - One LLM API key — or Ollama for fully local operation (free)
380
+
381
+ **Free LLM options (no credit card required):**
382
+ - [Groq](https://console.groq.com) — fast, free tier, Llama 3.3 70B
383
+ - [OpenRouter](https://openrouter.ai) — free models including DeepSeek and Llama 3.3
384
+ - [Google AI Studio](https://aistudio.google.com) — Gemini free tier
385
+ - [Ollama](https://ollama.ai) — fully local, no internet required
386
+
387
+ ### Installation
388
+
389
+ **macOS / Linux / WSL:**
390
+ ```bash
391
+ bash setup.sh
392
+ ```
393
+
394
+ **Windows (native):**
395
+ ```bat
396
+ setup.bat
397
+ ```
398
+
399
+ The interactive wizard creates `.env`, generates `JWT_SECRET` and `POSTGRES_PASSWORD`, prompts for your LLM provider (one of: Groq, OpenRouter, Anthropic, OpenAI, Google Gemini, or Ollama), optionally collects threat-intel keys (`OTX_API_KEY`, `VT_API_KEY`), optionally enables Redis, sets the admin password, and starts the Docker stack.
400
+
401
+ <div align="center">
402
+ <img src="./public/setup_gif.gif" width="100%" alt="Setup walkthrough">
403
+ </div>
404
+
405
+ ### Starting and Stopping
406
+
407
+ **macOS / Linux / WSL:**
408
+ ```bash
409
+ ./start.sh # build and start all services
410
+ ./stop.sh # stop all services
411
+ ```
412
+
413
+ **Windows (native):**
414
+ ```bat
415
+ start.bat :: build and start all services
416
+ stop.bat :: stop all services
417
+ ```
418
+
419
+ Once running, open **http://localhost:3001** in your browser.
420
+
421
+ <div align="center">
422
+ <img src="./public/start_gif.gif" width="100%" alt="Starting VoidAccess">
423
+ </div>
424
+
425
+ ### Getting a JWT (API access)
426
+
427
+ `setup.sh` creates a default admin account at `admin@voidaccess.tech` with the password you provided during the wizard.
428
+
429
+ ```bash
430
+ curl -X POST http://localhost:8000/auth/login \
431
+ -H "Content-Type: application/json" \
432
+ -d '{"email": "admin@voidaccess.tech", "password": "yourpassword"}'
433
+ ```
434
+
435
+ Use the returned token in an `Authorization: Bearer <token>` header for API requests.
436
+
437
+ ### Running your first investigation (API)
438
+
439
+ ```bash
440
+ curl -X POST http://localhost:8000/investigations \
441
+ -H "Authorization: Bearer <your_jwt>" \
442
+ -H "Content-Type: application/json" \
443
+ -d '{"query": "LockBit ransomware infrastructure 2024"}'
444
+ ```
445
+
446
+ The investigation starts in `pending`, moves to `processing`, and completes in 3–5 minutes with a summary, extracted entities, relationship graph, and export-ready artifacts.
447
+
448
+ ---
449
+
450
+ ## Architecture
451
+
452
+ Four Docker services:
453
+
454
+ | Service | Technology | Port |
455
+ |---|---|---|
456
+ | **postgres** | PostgreSQL 16 | 5433 |
457
+ | **tor** | Tor SOCKS5 proxy | 9050 |
458
+ | **fastapi** | Python 3.11, FastAPI, SQLAlchemy | 8000 |
459
+ | **nextjs** | Next.js 14, TypeScript, Tailwind | 3001 |
460
+
461
+ The FastAPI backend runs a 13-step pipeline triggered by `POST /investigations`. Every external call has `try/except` with graceful fallback — the pipeline never hard-crashes. API docs are available at **http://localhost:8000/docs** when running.
462
+
463
+ ### Source Tree
464
+
465
+ ```
466
+ voidaccess/
467
+ ├── analysis/ # Temporal patterns, OPSEC failure detection, anomaly scoring
468
+ ├── api/ # FastAPI routes; investigation pipeline orchestrator
469
+ ├── auth/ # JWT authentication and user management
470
+ ├── crawler/ # Recursive .onion link discovery spider
471
+ ├── db/ # SQLAlchemy ORM models and Alembic migrations
472
+ ├── docs/ # Contributing, security, and usage policy documents
473
+ ├── export/ # STIX 2.1, MISP, Sigma, and CSV artifact generation
474
+ ├── extractor/ # Regex → NER → LLM entity extraction pipeline
475
+ ├── fingerprint/ # Stylometry vectors and actor style profiling
476
+ ├── graph/ # NetworkX MultiDiGraph builder and pyvis visualization
477
+ ├── i18n/ # Language detection, translation, multilingual query expansion
478
+ ├── infra/ # Docker Compose, Tor config, Postgres init
479
+ ├── monitor/ # APScheduler watches, change diffing, Telegram/SMTP alerts
480
+ ├── public/ # Logo, walkthrough screenshots, demo media
481
+ ├── scraper/ # Async aiohttp and Playwright scrapers over Tor
482
+ ├── scripts/ # Seed imports and operational utilities
483
+ ├── search/ # 16+ .onion search engine fan-out with circuit breaker
484
+ ├── sources/ # DarkSearch, Telegram, paste sites, threat-intel feeds
485
+ ├── tests/ # Pytest suite (one test file per module)
486
+ ├── utils/ # Async helpers, content safety, encryption, defang
487
+ ├── vector/ # ChromaDB cache with sentence-transformer embeddings
488
+ ├── voidaccess/ # LangChain LLM wrappers and provider registry
489
+ └── web/ # Next.js 14 + TypeScript + Tailwind frontend
490
+ ```
491
+
492
+ > **Note on `voidaccess/voidaccess/`** — the nested directory holds the core LLM utilities (`llm.py`, `llm_utils.py`) and is imported at runtime by the API routes (`from voidaccess.llm import ...`). The nested naming reflects the original package structure from the project's pre-API baseline.
493
+
494
+ ---
495
+
496
+ ## Troubleshooting
497
+
498
+ **Services won't start:**
499
+ ```bash
500
+ docker compose -f infra/docker-compose.yml --project-directory . ps
501
+ docker compose -f infra/docker-compose.yml --project-directory . logs -f
502
+ ```
503
+
504
+ **Port conflicts** (3001 or 8000 already in use):
505
+ - macOS/Linux: `lsof -i :3001` to find what's using it
506
+ - Windows: `netstat -ano | findstr :3001`
507
+
508
+ **Tor not connecting:** The Tor service takes 30–60 seconds to bootstrap on first start. Check health with `./check_health.sh`. This script verifies Tor proxy connectivity, LLM provider reachability, and dark web search engine availability.
509
+
510
+ **No .env file:** Run `bash setup.sh` (macOS/Linux/WSL) or `setup.bat` (Windows) before starting.
511
+
512
+ **Docker build takes a long time:** First build downloads ~3GB of layers. Subsequent builds use the Docker layer cache and are much faster.
513
+
514
+ ---
515
+
516
+ ## Content Safety
517
+
518
+ Every investigation runs through mandatory content safety filters before results reach the UI or appear in the graph. CSAM, gore, snuff content, and other prohibited material are blocked at the query stage, URL validation, content scanning, and post-extraction entity filtering. These filters are mandatory and cannot be disabled.
519
+
520
+ ---
521
+
522
+ ## Acceptable Use
523
+
524
+ VoidAccess is for authorized security research, threat intelligence gathering, and law enforcement purposes only. Users are responsible for ensuring compliance with all local laws and ethical standards. See [docs/USAGE_POLICY.md](docs/USAGE_POLICY.md) for the full policy.
525
+
526
+ ---
527
+
528
+ ## Contributing
529
+
530
+ Contributions are welcome. See [docs/CONTRIBUTING.md](docs/CONTRIBUTING.md) for setup instructions, code standards, and the PR process. Please read [docs/CODE_OF_CONDUCT.md](docs/CODE_OF_CONDUCT.md) before participating.
531
+
532
+ To report a security vulnerability, see [docs/SECURITY.md](docs/SECURITY.md).
533
+
534
+ ---
535
+
536
+ ## License
537
+
538
+ MIT License. See [LICENSE](LICENSE) for details.
539
+
540
+
541
+
542
+